| 12345678910111213141516171819202122232425 |
- from datetime import timedelta
- from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT
- __all__ = ["default_pg_timeout", "default_pg_nccl_timeout"]
- # Default process group wide timeout, if applicable.
- # This only applies to the non-nccl backends
- # To make an attempt at backwards compatibility with THD, we use an
- # extraordinarily high default timeout, given that THD did not have timeouts.
- default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT
- # Separate timeout for PGNCCL mainly because it's always been that way in the C++ layer, but until recently
- # there was one default that applied across all backends in the python layer.
- # Later, we could consider merging them back together at the c++ layer if we can align on a same value.
- # (only if TORCH_NCCL_BLOCKING_WAIT or TORCH_NCCL_ASYNC_ERROR_HANDLING is set to 1).
- try:
- from torch._C._distributed_c10d import _DEFAULT_PG_NCCL_TIMEOUT
- default_pg_nccl_timeout: timedelta | None = _DEFAULT_PG_NCCL_TIMEOUT
- except ImportError:
- # if C++ NCCL support is not compiled, we don't have access to the default nccl value.
- # if anyone is actually trying to use nccl in this state, it should error.
- default_pg_nccl_timeout = None
|