| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320 |
- # mypy: allow-untyped-defs
- r"""This package adds support for device memory management implemented in CUDA."""
- import collections
- import contextlib
- import ctypes
- import pickle
- import sys
- import warnings
- from inspect import signature
- from typing import Any, Literal, TYPE_CHECKING
- from typing_extensions import deprecated
- import torch
- from torch import _C
- from torch._utils import _augment_memory_snapshot_stack_traces, _dummy_type
- from . import (
- _get_amdsmi_device_index,
- _get_device_index,
- _get_nvml_device_index,
- _lazy_init,
- is_initialized,
- )
- from ._memory_viz import memory as _memory, segments as _segments
- if TYPE_CHECKING:
- from torch.types import Device
# Public API of this module: the names exported by a star-import.
__all__ = [
    # Raw access to the caching allocator
    "caching_allocator_alloc",
    "caching_allocator_delete",
    "caching_allocator_enable",
    # Per-process memory cap and cache release
    "get_per_process_memory_fraction",
    "set_per_process_memory_fraction",
    "empty_cache",
    # Device allocator statistics
    "memory_stats",
    "memory_stats_as_nested_dict",
    "reset_accumulated_memory_stats",
    "reset_peak_memory_stats",
    "reset_max_memory_allocated",
    "reset_max_memory_cached",
    # Host (pinned) allocator statistics
    "host_memory_stats",
    "host_memory_stats_as_nested_dict",
    "reset_accumulated_host_memory_stats",
    "reset_peak_host_memory_stats",
    # Single-number accessors built on memory_stats()
    "memory_allocated",
    "max_memory_allocated",
    "memory_reserved",
    "max_memory_reserved",
    "memory_cached",
    "max_memory_cached",
    # Snapshots and human-readable reports
    "memory_snapshot",
    "memory_summary",
    "list_gpu_processes",
    "mem_get_info",
    # Allocator backends and memory pools
    "get_allocator_backend",
    "CUDAPluggableAllocator",
    "change_current_allocator",
    "MemPool",
    "use_mem_pool",
]
# ``torch._C`` may lack the CUDA allocator / memory-pool bindings (presumably
# when torch was built without CUDA support). Placeholders produced by
# ``_dummy_type`` are installed under the same names so that the
# ``from torch._C import ...`` statement below still succeeds.
if not hasattr(torch._C, "_cuda_CUDAAllocator"):
    # Define dummy base classes
    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")

if not hasattr(torch._C, "_MemPool"):
    # Define dummy base classes
    # NOTE(review): only ``_MemPool`` is probed; the pool begin/end/release
    # entry points are assumed to be present or absent together with it.
    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
        "_cuda_beginAllocateToPool"
    )
    torch._C.__dict__["_cuda_beginAllocateCurrentThreadToPool"] = _dummy_type(
        "_cuda_beginAllocateCurrentThreadToPool"
    )
    torch._C.__dict__["_cuda_endAllocateToPool"] = _dummy_type(
        "_cuda_endAllocateToPool"
    )
    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")

# Must come after the fallbacks above: these names are either the real
# bindings or the placeholders just installed.
from torch._C import (  # noqa: F401
    _cuda_beginAllocateCurrentThreadToPool,
    _cuda_beginAllocateToPool,
    _cuda_CUDAAllocator,
    _cuda_endAllocateToPool,
    _cuda_releasePool,
    _MemPool,
)
def _host_allocator():
    r"""Return the object produced by ``torch._C._cuda_cudaHostAllocator()``.

    ``_lazy_init()`` is run first so the binding is never called before the
    CUDA state has been set up.
    """
    _lazy_init()
    allocator = torch._C._cuda_cudaHostAllocator()
    return allocator
@contextlib.contextmanager
def _free_mutex():
    r"""Context manager that brackets its body with the ``torch._C`` mutex calls.

    ``torch._C._cuda_lock_mutex()`` is called on entry and
    ``torch._C._cuda_unlock_mutex()`` on exit, including when the body raises.
    """
    # Acquire outside the ``try`` so a failed lock is never "unlocked".
    torch._C._cuda_lock_mutex()
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike.
        torch._C._cuda_unlock_mutex()
def caching_allocator_alloc(size, device: "Device" = None, stream=None):
    r"""Allocate raw memory through the CUDA memory allocator.

    The allocation is tied to one device and one stream. This entry point is
    meant for interoperability with other frameworks; memory obtained here is
    released with :func:`~torch.cuda.caching_allocator_delete`.

    Args:
        size (int): number of bytes to be allocated.
        device (torch.device or int, optional): selected device. ``None``
            selects the default CUDA device.
        stream (torch.cuda.Stream or int, optional): selected stream. ``None``
            selects the default stream of the chosen device.

    Returns:
        Whatever ``torch._C._cuda_cudaCachingAllocator_raw_alloc`` returns for
        the request.

    Raises:
        TypeError: if ``stream`` is neither a ``torch.cuda.Stream`` nor an
            ``int``.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    device_index = _get_device_index(
        torch.cuda.current_device() if device is None else device
    )

    if stream is None:
        stream = torch.cuda.current_stream(device_index)

    # Reduce a Stream object to the raw handle the binding expects.
    raw_stream = (
        stream.cuda_stream
        if isinstance(stream, torch.cuda.streams.Stream)
        else stream
    )

    if isinstance(raw_stream, int):
        # Allocate with the target device made current.
        with torch.cuda.device(device_index):
            return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, raw_stream)

    raise TypeError(
        "Invalid type for stream argument, must be "
        "`torch.cuda.Stream` or `int` representing a pointer "
        "to a existing stream"
    )
def caching_allocator_delete(mem_ptr):
    r"""Free memory that was obtained from the CUDA memory allocator.

    This is the counterpart of :func:`~torch.cuda.caching_allocator_alloc`.
    The device and stream belonging to the allocation are tracked inside the
    allocator, so only the address is needed.

    Args:
        mem_ptr (int): memory address to be freed by the allocator.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    raw_delete = torch._C._cuda_cudaCachingAllocator_raw_delete
    raw_delete(mem_ptr)
def caching_allocator_enable(value: bool = True) -> None:
    r"""Switch the CUDA memory allocator on or off (it is on by default).

    The call is a no-op while CUDA has not been initialized.

    Args:
        value (bool, optional): ``True`` to enable, ``False`` to disable.
    """
    if not is_initialized():
        return
    torch._C._cuda_cudaCachingAllocator_enable(value)
def set_per_process_memory_fraction(fraction, device: "Device" = None) -> None:
    r"""Set memory fraction for a process.

    The fraction limits how much memory the caching allocator may allocate on
    a CUDA device. The allowed amount equals the total visible memory
    multiplied by ``fraction``. Trying to allocate more than that in the
    process raises an out-of-memory error in the allocator.

    Args:
        fraction (float): Range: 0~1. Allowed memory equals
            ``total_memory * fraction``.
        device (torch.device or int, optional): selected device. If it is
            ``None`` the default CUDA device is used.

    Raises:
        TypeError: if ``fraction`` is not a ``float``.
        ValueError: if ``fraction`` is NaN or lies outside ``[0, 1]``.

    .. note::
        In general, the total available free memory is less than the total capacity.
    """
    # Validate the arguments before touching CUDA so that bad input fails
    # fast and never triggers device initialization.
    if not isinstance(fraction, float):
        raise TypeError("Invalid type for fraction argument, must be `float`")
    # A chained comparison is False for NaN, whereas the pair of tests
    # ``fraction < 0 or fraction > 1`` would let NaN through.
    if not 0 <= fraction <= 1:
        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")

    _lazy_init()
    if device is None:
        device = torch.cuda.current_device()
    device = _get_device_index(device)

    torch._C._cuda_setMemoryFraction(fraction, device)
def get_per_process_memory_fraction(device: "Device" = None) -> float:
    r"""Return the memory fraction configured for this process.

    Args:
        device (torch.device or int, optional): selected device. ``None``
            selects the default CUDA device.

    Returns:
        memory fraction, in range 0~1. Allowed memory equals
        ``total_memory * fraction``.
    """
    _lazy_init()
    target = torch.cuda.current_device() if device is None else device
    return torch._C._cuda_getMemoryFraction(_get_device_index(target))
def empty_cache() -> None:
    r"""Hand all unoccupied cached memory held by the caching allocator back.

    The released memory becomes usable by other GPU applications and visible
    in `nvidia-smi`. Nothing happens while CUDA has not been initialized.

    .. note::
        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
        memory available for PyTorch. However, it may help reduce fragmentation
        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
        more details about GPU memory management.
    """
    if not is_initialized():
        return
    torch._C._cuda_emptyCache()
def memory_stats(device: "Device" = None) -> dict[str, Any]:
    r"""Return a dictionary of CUDA memory allocator statistics for a given device.

    The return value of this function is a dictionary of statistics, each of
    which is a non-negative integer. It is the nested dictionary from
    :func:`~torch.cuda.memory_stats_as_nested_dict` flattened into
    dot-separated keys and sorted by key.

    Core statistics:

    - ``"allocation.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of allocation requests received by the memory allocator.
    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of allocated memory.
    - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of reserved segments from ``cudaMalloc()``.
    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of reserved memory.
    - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of active memory blocks.
    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of active memory.
    - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of inactive, non-releasable memory blocks.
    - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of inactive, non-releasable memory.

    For these core statistics, values are broken down as follows.

    Pool type:

    - ``all``: combined statistics across all memory pools.
    - ``large_pool``: statistics for the large allocation pool
      (as of June 2025, for size >= 1MB allocations).
    - ``small_pool``: statistics for the small allocation pool
      (as of June 2025, for size < 1MB allocations).

    Metric type:

    - ``current``: current value of this metric.
    - ``peak``: maximum value of this metric.
    - ``allocated``: historical total increase in this metric.
    - ``freed``: historical total decrease in this metric.

    In addition to the core statistics, we also provide some simple event
    counters:

    - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
      result in a cache flush and retry.
    - ``"num_ooms"``: number of out-of-memory errors thrown.
    - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls.
    - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both
      cuMemMap and cudaMalloc.
    - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap
      and cudaFree.

    The caching allocator can be configured via ENV to not split blocks larger than a
    defined size (see Memory Management section of the Cuda Semantics documentation).
    This helps avoid memory fragmentation but may have a performance
    penalty. Additional outputs to assist with tuning and evaluating impact:

    - ``"max_split_size"``: blocks above this size will not be split.
    - ``"oversize_allocations.{current,peak,allocated,freed}"``:
      number of over-size allocation requests received by the memory allocator.
    - ``"oversize_segments.{current,peak,allocated,freed}"``:
      number of over-size reserved segments from ``cudaMalloc()``.

    The caching allocator can be configured via ENV to round memory allocations in order
    to reduce fragmentation. Sometimes the overhead from rounding can be higher than
    the fragmentation it helps reduce. The following stat can be used to check if
    rounding adds too much overhead:

    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      memory requested by client code, compare this with allocated_bytes to check if
      allocation rounding adds too much overhead.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    Returns:
        An ordered mapping from flattened key to value. It is empty when
        :func:`~torch.cuda.memory_stats_as_nested_dict` returns an empty
        dictionary.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.

    .. note::
        With :ref:`backend:cudaMallocAsync<cuda-memory-envvars>`, some stats are not
        meaningful, and are always reported as zero.
    """

    def _flatten(prefix, obj):
        # Leaves become (dotted_key, value) pairs; dicts are descended into.
        if not isinstance(obj, dict):
            yield prefix, obj
            return
        if prefix:
            prefix += "."
        for key, value in obj.items():
            yield from _flatten(prefix + key, value)

    stats = memory_stats_as_nested_dict(device=device)
    return collections.OrderedDict(sorted(_flatten("", stats)))
def memory_stats_as_nested_dict(device: "Device" = None) -> dict[str, Any]:
    r"""Return the statistics of :func:`~torch.cuda.memory_stats` as a nested dictionary.

    An empty dictionary is returned while CUDA has not been initialized.

    Args:
        device (torch.device or int, optional): selected device; ``None`` is
            forwarded to ``_get_device_index(..., optional=True)``.
    """
    if is_initialized():
        index = _get_device_index(device, optional=True)
        return torch._C._cuda_memoryStats(index)
    return {}
def reset_accumulated_memory_stats(device: "Device" = None) -> None:
    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.

    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats are the
    `"allocated"` and `"freed"` keys in each individual stat dict, plus
    `"num_alloc_retries"` and `"num_ooms"`.

    Args:
        device (torch.device or int, optional): selected device. The current
            device, given by :func:`~torch.cuda.current_device`, is used
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    index = _get_device_index(device, optional=True)
    return torch._C._cuda_resetAccumulatedMemoryStats(index)
def reset_peak_memory_stats(device: "Device" = None) -> None:
    r"""Reset the "peak" stats tracked by the CUDA memory allocator.

    See :func:`~torch.cuda.memory_stats` for details. Peak stats are the
    `"peak"` key in each individual stat dict.

    Args:
        device (torch.device or int, optional): selected device. The current
            device, given by :func:`~torch.cuda.current_device`, is used
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    index = _get_device_index(device, optional=True)
    return torch._C._cuda_resetPeakMemoryStats(index)
def host_memory_stats() -> dict[str, Any]:
    r"""Return a dictionary of pinned (host) allocator statistics.

    The result is the nested dictionary from
    :func:`~torch.cuda.host_memory_stats_as_nested_dict` flattened into
    dot-separated keys and sorted by key.

    Core statistics (host pinned allocator):

    - ``"allocations.{current,peak,allocated,freed}"``:
      pinned blocks owned by the allocator (active + cached). Grows when a new
      block is created via CUDA and shrinks when cached blocks are returned.
    - ``"allocated_bytes.{current,peak,allocated,freed}"``:
      bytes of pinned blocks owned by the allocator (active + cached), using
      the rounded block size requested from CUDA.
    - ``"active_requests.{current,peak,allocated,freed}"``:
      blocks currently checked out to callers (increments on handout, decrements
      when the block becomes reusable after stream deps finish).
    - ``"active_bytes.{current,peak,allocated,freed}"``:
      bytes corresponding to active blocks.

    Metric type:

    - ``current``: current value.
    - ``peak``: maximum value.
    - ``allocated``: historical total increase.
    - ``freed``: historical total decrease.

    Event/timing counters:

    - ``"num_host_alloc"`` / ``"num_host_free"``: blocks created to grow the
      pool / cached blocks returned to CUDA (matches allocations allocated/freed).
    - ``"host_alloc_time.{total,max,min,count,avg}"``: time in CUDA alloc calls
      when growing the pool (microseconds).
    - ``"host_free_time.{total,max,min,count,avg}"``: time in CUDA free calls
      when cached blocks are returned (microseconds).

    Block sizes are rounded up to the next power of two before calling CUDA, so
    byte stats reflect the rounded size. Peak values are aggregated per bucket
    and are a best-effort approximation of the true peak.
    """
    flat = []
    # Explicit work list instead of recursion; traversal order is irrelevant
    # because the pairs are sorted before being returned.
    pending = [("", host_memory_stats_as_nested_dict())]
    while pending:
        name, node = pending.pop()
        if isinstance(node, dict):
            dotted = name + "." if len(name) > 0 else name
            pending.extend((dotted + key, child) for key, child in node.items())
        else:
            flat.append((name, node))
    flat.sort()
    return collections.OrderedDict(flat)
def host_memory_stats_as_nested_dict() -> dict[str, Any]:
    r"""Return the statistics of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.

    An empty dictionary is returned while CUDA has not been initialized.
    """
    if is_initialized():
        return torch._C._cuda_hostMemoryStats()
    return {}
def reset_accumulated_host_memory_stats() -> None:
    r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator.

    See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats are
    the `"allocated"` and `"freed"` keys in each individual stat dict.
    """
    reset = torch._C._cuda_resetAccumulatedHostMemoryStats
    return reset()
def reset_peak_host_memory_stats() -> None:
    r"""Reset the "peak" stats tracked by the host memory allocator.

    See :func:`~torch.cuda.host_memory_stats` for details. Peak stats are the
    `"peak"` key in each individual stat dict.
    """
    reset = torch._C._cuda_resetPeakHostMemoryStats
    return reset()
def reset_max_memory_allocated(device: "Device" = None) -> None:
    r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device.

    See :func:`~torch.cuda.max_memory_allocated` for details.

    Args:
        device (torch.device or int, optional): selected device. The current
            device, given by :func:`~torch.cuda.current_device`, is used
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    notice = (
        "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
        "which resets /all/ peak memory stats."
    )
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(notice, FutureWarning, stacklevel=2)
    return reset_peak_memory_stats(device=device)
def reset_max_memory_cached(device: "Device" = None) -> None:
    r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.

    See :func:`~torch.cuda.max_memory_cached` for details.

    Args:
        device (torch.device or int, optional): selected device. The current
            device, given by :func:`~torch.cuda.current_device`, is used
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    notice = (
        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
        "which resets /all/ peak memory stats."
    )
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(notice, FutureWarning, stacklevel=2)
    return reset_peak_memory_stats(device=device)
def memory_allocated(device: "Device" = None) -> int:
    r"""Return the current GPU memory occupied by tensors in bytes for a given device.

    Reads ``"allocated_bytes.all.current"`` from :func:`~torch.cuda.memory_stats`
    and yields ``0`` when that key is absent.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        This is likely less than the amount shown in `nvidia-smi` since some
        unused memory can be held by the caching allocator and some context
        needs to be created on GPU. See :ref:`cuda-memory-management` for more
        details about GPU memory management.
    """
    stats = memory_stats(device=device)
    return stats.get("allocated_bytes.all.current", 0)
def max_memory_allocated(device: "Device" = None) -> int:
    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.

    By default this is the peak allocated memory since the beginning of the
    program. :func:`~torch.cuda.reset_peak_memory_stats` moves the starting
    point for this metric, so the two functions together can measure, for
    example, the peak allocated memory of each iteration in a training loop.

    Reads ``"allocated_bytes.all.peak"`` from :func:`~torch.cuda.memory_stats`
    and yields ``0`` when that key is absent.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    stats = memory_stats(device=device)
    return stats.get("allocated_bytes.all.peak", 0)
def memory_reserved(device: "Device" = None) -> int:
    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.

    Reads ``"reserved_bytes.all.current"`` from :func:`~torch.cuda.memory_stats`
    and yields ``0`` when that key is absent.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    stats = memory_stats(device=device)
    return stats.get("reserved_bytes.all.current", 0)
def max_memory_reserved(device: "Device" = None) -> int:
    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.

    By default this is the peak cached memory since the beginning of the
    program. :func:`~torch.cuda.reset_peak_memory_stats` moves the starting
    point for this metric, so the two functions together can measure, for
    example, the peak cached memory of each iteration in a training loop.

    Reads ``"reserved_bytes.all.peak"`` from :func:`~torch.cuda.memory_stats`
    and yields ``0`` when that key is absent.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    stats = memory_stats(device=device)
    return stats.get("reserved_bytes.all.peak", 0)
@deprecated(
    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
    category=FutureWarning,
)
def memory_cached(device: "Device" = None) -> int:
    r"""Deprecated alias; use :func:`~torch.cuda.memory_reserved` instead."""
    reserved = memory_reserved(device=device)
    return reserved
@deprecated(
    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
    category=FutureWarning,
)
def max_memory_cached(device: "Device" = None) -> int:
    r"""Deprecated alias; use :func:`~torch.cuda.max_memory_reserved` instead."""
    peak_reserved = max_memory_reserved(device=device)
    return peak_reserved
def memory_snapshot(mempool_id=None, include_traces=True):
    r"""Return a snapshot of the CUDA memory allocator state across all devices.

    Making sense of the output requires familiarity with the memory allocator
    internals.

    Args:
        mempool_id: Optional memory pool ID to get snapshot for a specific pool.
            Its first two items are forwarded; ``None`` forwards ``(0, 0)``.
        include_traces: Whether to include trace entries in the snapshot.
            If True (default), all trace entries are included.
            If False, no trace entries are included (lightweight/fast snapshot).

    Returns:
        The ``"segments"`` entry of the snapshot produced by
        ``torch._C._cuda_memorySnapshot``.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    if mempool_id is None:
        pool_key = (0, 0)
    else:
        pool_key = (mempool_id[0], mempool_id[1])

    # pyrefly: ignore [bad-argument-type]
    snapshot = torch._C._cuda_memorySnapshot((*pool_key, include_traces))
    return snapshot["segments"]
def memory_summary(device: "Device" = None, abbreviated: bool = False) -> str:
    r"""Return a human-readable printout of the current memory allocator statistics for a given device.

    This can be useful to display periodically during training, or when
    handling out-of-memory exceptions.

    Statistics that :func:`~torch.cuda.memory_stats` does not report (it
    returns an empty mapping while CUDA is uninitialized) are shown as ``0``
    instead of raising ``KeyError``, matching what
    :func:`~torch.cuda.memory_allocated` and its siblings do.

    Args:
        device (torch.device or int, optional): selected device. Returns
            printout for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).
        abbreviated (bool, optional): whether to return an abbreviated summary
            (default: False). The abbreviated form omits the per-pool rows.

    Returns:
        A table in which every line is 75 characters wide between the
        enclosing ``|`` characters.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    device = _get_device_index(device, optional=True)
    stats = memory_stats(device=device)

    def _stat(key):
        # Missing statistics are reported as zero rather than failing.
        return stats.get(key, 0)

    def _format_size(sz, pref_sz):
        # Every unit is three characters wide so each cell is exactly ten.
        prefixes = ["B  ", "KiB", "MiB", "GiB", "TiB", "PiB"]
        prefix = prefixes[0]
        for new_prefix in prefixes[1:]:
            # The unit is chosen from ``pref_sz`` so a whole row group shares it.
            if pref_sz < 768 * 1024:
                break
            prefix = new_prefix
            sz //= 1024
            pref_sz /= 1024
        return f"{sz:6d} {prefix}"

    def _format_count(cnt, pref_cnt):
        prefixes = [" ", "K", "M"]
        prefix = prefixes[0]
        for new_prefix in prefixes[1:]:
            if pref_cnt < 750 * 1000:
                break
            prefix = new_prefix
            cnt //= 1000
            pref_cnt /= 1000
        return f"{cnt:7d} {prefix} "

    def _row(label, values, pref_values, formatter):
        # 1 + 21 + 4 * (3 + 10) + 1 == 75 columns.
        cells = " | ".join(
            # pyrefly: ignore [bad-argument-type]
            formatter(value, pref)
            for value, pref in zip(values, pref_values)
        )
        return f" {label:<21} | {cells} "

    stat_columns = ("current", "peak", "allocated", "freed")
    border = "=" * 75
    rule = "-" * 75

    lines = [
        border,
        " {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ",
        rule,
        # Padded on both sides so that the line is 75 columns like the rules.
        "  {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d}  ",
        border,
        # Column headers use the same widths as the data rows.
        f" {'Metric':^21} | {'Cur Usage':<10} | {'Peak Usage':<10} | "
        f"{'Tot Alloc':<10} | {'Tot Freed':<10} ",
    ]

    pooled_metrics = [
        ("allocated_bytes", "Allocated memory", _format_size),
        ("active_bytes", "Active memory", _format_size),
        ("requested_bytes", "Requested memory", _format_size),
        ("reserved_bytes", "GPU reserved memory", _format_size),
        ("inactive_split_bytes", "Non-releasable memory", _format_size),
        ("allocation", "Allocations", _format_count),
        ("active", "Active allocs", _format_count),
        ("segment", "GPU reserved segments", _format_count),
        ("inactive_split", "Non-releasable allocs", _format_count),
    ]
    for metric_key, metric_name, formatter in pooled_metrics:
        lines.append(rule)
        submetrics = [("all", metric_name)]
        if not abbreviated:
            submetrics.append(("large_pool", " from large pool"))
            submetrics.append(("small_pool", " from small pool"))

        # The first ("all") row fixes the unit used by the pool rows below it.
        pref_values = None
        for submetric_key, submetric_name in submetrics:
            prefix = f"{metric_key}.{submetric_key}."
            values = [_stat(prefix + column) for column in stat_columns]
            if pref_values is None:
                pref_values = values
            lines.append(_row(submetric_name, values, pref_values, formatter))

    oversize_metrics = [
        ("oversize_allocations", "Oversize allocations", _format_count),
        ("oversize_segments", "Oversize GPU segments", _format_count),
    ]
    for metric_key, metric_name, formatter in oversize_metrics:
        lines.append(rule)
        values = [_stat(f"{metric_key}.{column}") for column in stat_columns]
        # Each oversize value picks its own unit.
        lines.append(_row(metric_name, values, values, formatter))

    lines.append(border)

    # Defaults keep the header placeholders resolvable when stats is empty.
    fmt_dict = {"_": "", "device": device, "num_ooms": 0, "num_alloc_retries": 0}
    for k, v in stats.items():
        fmt_dict[k.replace(".", "-")] = v
    return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
def list_gpu_processes(device: "Device" = None) -> str:
    r"""Return a human-readable printout of the running processes and their GPU memory use for a given device.

    This can be useful to display periodically during training, or when
    handling out-of-memory exceptions.

    On non-HIP builds the process list comes from ``pynvml``; on HIP builds it
    comes from ``amdsmi``. If the required module is missing or its driver
    cannot be initialized, an explanatory message is returned instead of a
    process list.

    Args:
        device (torch.device or int, optional): selected device. Returns
            printout for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    Returns:
        A ``GPU:<index>`` header followed by one line per process, or by
        ``no processes are running``. A process whose memory use NVML cannot
        report gets a line saying so.
    """
    if not torch.version.hip:
        try:
            import pynvml  # type: ignore[import]
        except ModuleNotFoundError:
            return "pynvml module not found, please install nvidia-ml-py"
        # pyrefly: ignore [import-error, missing-import, missing-module-attribute]
        from pynvml import NVMLError_DriverNotLoaded

        try:
            pynvml.nvmlInit()
        except NVMLError_DriverNotLoaded:
            return "cuda driver can't be loaded, is cuda enabled?"

        device = _get_nvml_device_index(device)
        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    else:
        try:
            import amdsmi  # type: ignore[import]
        except ModuleNotFoundError:
            return "amdsmi module not found, please install amdsmi"
        try:
            amdsmi.amdsmi_init()  # type: ignore[attr-defined]
        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
            return "amdsmi driver can't be loaded, is ROCm installed?"
        device = _get_amdsmi_device_index(device)

        try:
            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
            return "amdsmi cannot list processes from other users"

    lines = []
    lines.append(f"GPU:{device}")
    if len(procs) == 0:
        lines.append("no processes are running")
    for p in procs:
        if not torch.version.hip:
            pid = p.pid
            used = p.usedGpuMemory
            if used is None:
                # pynvml maps NVML's "value not available" to None; dividing
                # it would raise TypeError and lose the whole report.
                lines.append(f"process {pid:>10d} GPU memory usage not available")
                continue
            mem = used / (1024 * 1024)
        else:
            try:
                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
            except AttributeError:
                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
                proc_info = p
            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
            pid = proc_info["pid"]
        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
    return "\n".join(lines)
def mem_get_info(device: "Device" = None) -> tuple[int, int]:
    r"""Query cudaMemGetInfo for the global free and total GPU memory of a device.

    Args:
        device (torch.device or int or str, optional): device to query. The
            current device, given by :func:`~torch.cuda.current_device`, is
            used if :attr:`device` is ``None`` (default) or if the device
            index is not specified.

    Returns:
        tuple[int, int]: the free and total memory values returned by
        ``cudaMemGetInfo`` for the resolved device index.

    .. note::
        See :ref:`cuda-memory-management` for more
        details about GPU memory management.
    """
    target = torch.cuda.current_device() if device is None else device
    # optional=True tolerates `torch.device('cuda')`, whose `.index` is None.
    index = _get_device_index(target, optional=True)
    runtime = torch.cuda.cudart()
    return runtime.cudaMemGetInfo(index)
def _record_memory_history_legacy(
    enabled: bool,
    record_context=True,
    trace_alloc_max_entries=1,
    trace_alloc_record_context=False,
    device: "Device" = None,
    record_context_cpp=False,
    clear_history=False,
    compile_context=False,
    global_record_annotations=False,
    skip_actions=None,
):
    """Forward the legacy (boolean ``enabled``) recording options to the C++ binding.

    Every argument except ``device`` is passed positionally, in declaration
    order, to ``_C._cuda_record_memory_history_legacy``.

    Args:
        enabled (bool): first argument handed to the binding.
        device: accepted in the signature but not forwarded by this function.
        skip_actions: list handed to the binding as its last argument; ``None``
            is replaced by an empty list.
    """
    actions_to_skip = [] if skip_actions is None else skip_actions
    _C._cuda_record_memory_history_legacy(  # type: ignore[call-arg]
        enabled,
        record_context,
        # pyrefly: ignore [bad-argument-type]
        trace_alloc_max_entries,
        trace_alloc_record_context,
        record_context_cpp,
        clear_history,
        compile_context,
        global_record_annotations,
        # pyrefly: ignore [bad-argument-count]
        actions_to_skip,
    )
def _record_memory_history(
    enabled: Literal["state", "all"] | None = "all", *args, **kwargs
) -> None:
    """Turn recording of stack traces for memory allocations on or off.

    With recording enabled you can tell what allocated any piece of memory in
    :func:`torch.cuda.memory._snapshot()`.

    In addition to keeping stack traces with each current allocation and free,
    this will also enable recording of a history of all alloc/free events.

    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
    and the tools in `_memory_viz.py` to visualize snapshots.

    A ``bool`` value for ``enabled`` selects the legacy interface
    (``_record_memory_history_legacy``); any other value is handled by
    ``_record_memory_history_impl``, whose parameters are described below.
    All remaining positional and keyword arguments are forwarded unchanged.

    Buffer behavior
    ---------------

    This will store up to `max_entries` instances of `TraceEntry` when enabled.
    Python trace collection defaults to `sys.maxsize`, meaning long-running
    or indefinitely running jobs should set a reasonable limit to avoid excessive
    memory use. Expect each entry to be several KB.

    Longer running workflows or those with smaller `max_entries` values will only
    store the last accumulated `max_entries` entries, meaning new entries overwrite
    older entries.

    C++ implementation for reference to ring buffer implementation:

    .. code-block:: cpp

        if (record_history) {
          if (alloc_trace->size() < alloc_trace_max_entries_) {
            alloc_trace->emplace_back(te);
          } else {
            (*alloc_trace)[alloc_trace_next++] = te;
            if (alloc_trace_next == alloc_trace_max_entries_) {
              alloc_trace_next = 0;
            }
          }
        }

    Latency impact
    --------------

    The Python trace collection is fast (2us per trace), so you may consider
    enabling this on production jobs if you anticipate ever having to debug
    memory issues.

    C++ trace collection is also fast (~50ns/frame), which for many typical programs
    works out to ~2us per trace, but can vary depending on stack depth.

    Args:
        enabled (Literal[None, "state", "all"], optional):
            `None`, disable recording memory history.
            `"state"`, keep information for currently allocated memory.
            `"all"`, additionally keep a history of all alloc/free calls.
            Defaults to "all".
        context (Literal[None, "state", "alloc", "all"], optional):
            `None`, Do not record any tracebacks.
            `"state"`, Record tracebacks for currently allocated memory.
            `"alloc"`, additionally keep tracebacks for alloc calls.
            `"all"`, additionally keep tracebacks for free calls.
            Defaults to "all".
        stacks (Literal["python", "all"], optional):
            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
            `"all"`, additionally include C++ frames
            Defaults to "all".
        max_entries (int, optional): Keep a maximum of `max_entries`
            alloc/free events in the recorded history.
        clear_history (bool, optional): Clear history when enabling, defaults to False.
        skip_actions (list[str], optional): List of action types to skip when recording
            memory history. This can be used to reduce memory overhead by excluding
            certain types of events from being recorded. Valid action types are:

            - `"alloc"`: Memory allocation events
            - `"free_requested"`: Free requests (memory marked for freeing)
            - `"free_completed"`: Completed free operations (memory actually freed)
            - `"segment_alloc"`: Segment allocation from cudaMalloc
            - `"segment_free"`: Segment freed back to CUDA via cudaFree
            - `"oom"`: Out-of-memory exceptions
            - `"snapshot"`: Memory snapshot generation events

            For example, to skip recording free_requested events:
            `skip_actions=["free_requested"]`
            Defaults to None (record all actions).
    """
    # A boolean first argument is the hallmark of the legacy calling convention.
    recorder = (
        _record_memory_history_legacy
        if isinstance(enabled, bool)
        else _record_memory_history_impl
    )
    return recorder(enabled, *args, **kwargs)
def _record_memory_history_impl(
    enabled: str | None = "all",
    context: str | None = "all",
    stacks: str = "all",
    max_entries: int = sys.maxsize,
    device: "Device" = None,
    clear_history: bool = False,
    compile_context: bool = False,
    global_record_annotations: bool = False,
    skip_actions: list[str] | None = None,
):
    """Forward the keyword-style recording options to the C++ binding.

    Every argument except ``device`` is passed positionally, in declaration
    order, to ``_C._cuda_record_memory_history``.

    Args:
        device: accepted in the signature but not forwarded by this function.
        skip_actions: list handed to the binding as its last argument; ``None``
            is replaced by an empty list.
    """
    actions_to_skip = [] if skip_actions is None else skip_actions
    _C._cuda_record_memory_history(  # type: ignore[call-arg]
        enabled,
        context,
        stacks,
        max_entries,
        clear_history,
        compile_context,
        global_record_annotations,
        # pyrefly: ignore [bad-argument-count]
        actions_to_skip,
    )


# The public wrapper is declared with ``*args, **kwargs``; publish the
# implementation's signature on it so introspection reports real parameters.
_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
def _snapshot(device: "Device" = None, augment_with_fx_traces=False):
    """Capture the CUDA memory state at the time of the call.

    The state is represented as a dictionary with the following structure.

    .. code-block:: python

        class Snapshot(TypedDict):
            segments: List[Segment]
            device_traces: List[List[TraceEntry]]


        class Segment(TypedDict):
            # Segments are memory returned from a cudaMalloc call.
            # The size of reserved memory is the sum of all Segments.
            # Segments are cached and reused for future allocations.
            # If the reuse is smaller than the segment, the segment
            # is split into more than one Block.
            # empty_cache() frees Segments that are entirely inactive.
            address: int
            total_size: int  # cudaMalloc'd size of segment
            stream: int
            segment_type: Literal["small", "large"]  # 'large' (>1MB)
            allocated_size: int  # size of memory in use
            active_size: int  # size of memory in use or in active_awaiting_free state
            blocks: List[Block]


        class Block(TypedDict):
            # A piece of memory returned from the allocator, or
            # current cached but inactive.
            size: int
            requested_size: int  # size requested during malloc, may be smaller than
            # size due to rounding
            address: int
            state: Literal[
                "active_allocated",  # used by a tensor
                "active_awaiting_free",  # waiting for another stream to finish using
                # this, then it will become free
                "inactive",
            ]  # free for reuse
            frames: List[Frame]  # stack trace from where the allocation occurred


        class Frame(TypedDict):
            filename: str
            line: int
            name: str
            # Optional FX debug fields (present when augment_with_fx_traces=True
            # and the frame corresponds to FX-generated code)
            fx_node_op: str  # FX node operation type (e.g., 'call_function', 'output')
            fx_node_name: str  # FX node name (e.g., 'linear', 'relu_1')
            fx_original_trace: str  # Original model source code stack trace


        class TraceEntry(TypedDict):
            # When `torch.cuda.memory._record_memory_history()` is enabled,
            # the snapshot will contain TraceEntry objects that record each
            # action the allocator took.
            action: Literal[
                "alloc",  # memory allocated
                "free_requested",  # the allocator received a call to free memory
                "free_completed",  # the memory that was requested to be freed is now
                # able to be used in future allocation calls
                "segment_alloc",  # the caching allocator ask cudaMalloc for more memory
                # and added it as a segment in its cache
                "segment_free",  # the caching allocator called cudaFree to return memory
                # to cuda possibly trying free up memory to
                # allocate more segments or because empty_caches was called
                "oom",  # the allocator threw an OOM exception. 'size' is
                # the requested number of bytes that did not succeed
                "snapshot",  # the allocator generated a memory snapshot
                # useful to correlate a previously taken
                # snapshot with this trace
            ]
            addr: int  # not present for OOM
            frames: List[Frame]
            size: int
            stream: int
            device_free: int  # only present for OOM, the amount of
            # memory cuda still reports to be free

    Args:
        device: Accepted in the signature. NOTE(review): this function does not
            read it -- the binding is always invoked as
            ``_C._cuda_memorySnapshot(None)``; confirm whether per-device
            capture is intended.
        augment_with_fx_traces: If True, augment stack trace frames with FX debug information
            that maps generated FX code back to original model source code.
            This adds fx_node_op, fx_node_name, fx_original_trace, and
            fx_node_info fields to Frame objects. Default: False.

    Returns:
        The Snapshot dictionary object
    """
    snapshot = _C._cuda_memorySnapshot(None)
    if not augment_with_fx_traces:
        return snapshot
    return _augment_memory_snapshot_stack_traces(snapshot)  # type: ignore[assignment, arg-type]
def _dump_snapshot(filename="dump_snapshot.pickle", augment_with_fx_traces=False):
    """
    Pickle the `torch.cuda.memory._snapshot()` dictionary into a file.

    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz

    Snapshot file sizes scale with `max_entries` and stack trace depth per entry,
    with several KB per entry. These can easily be in the GB range for longer running
    workflows with large `max_entries`.

    Args:
        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
        augment_with_fx_traces (bool, optional): If True, augment the snapshot with FX debug information
                                                  before dumping. This maps generated FX code stack traces
                                                  back to original model source code. Defaults to False.
    """
    # Take the snapshot first so a failure here never leaves a file behind.
    payload = _snapshot(augment_with_fx_traces=augment_with_fx_traces)
    with open(filename, "wb") as sink:
        pickle.dump(payload, sink)
def _set_memory_metadata(metadata: str):
    """
    Attach a custom metadata string to all subsequent CUDA memory allocations.

    The metadata is recorded in the memory snapshot for every allocation made
    after this call, until the metadata is cleared or changed.

    Args:
        metadata (str): Custom metadata string to attach to allocations.
                        Pass an empty string to clear the metadata.
    """
    # pyrefly: ignore [missing-attribute]
    apply_metadata = torch._C._cuda_setMemoryMetadata
    apply_metadata(metadata)
def _get_memory_metadata() -> str:
    """
    Read back the custom metadata currently being attached to CUDA memory allocations.

    Returns:
        str: The current metadata string, or empty string if no metadata is set.
    """
    # pyrefly: ignore [missing-attribute]
    current_metadata = torch._C._cuda_getMemoryMetadata()
    return current_metadata
def _save_segment_usage(filename="output.svg", snapshot=None):
    """Write the output of ``_segments`` for a memory snapshot to a text file.

    Args:
        filename: path of the file to (over)write. Defaults to "output.svg".
        snapshot: snapshot to render; when ``None`` a fresh one is taken
            with ``_snapshot()``.
    """
    source = _snapshot() if snapshot is None else snapshot
    with open(filename, "w") as out:
        rendered = _segments(source)
        out.write(rendered)
def _save_memory_usage(filename="output.svg", snapshot=None):
    """Write the output of ``_memory`` for a memory snapshot to a text file.

    Args:
        filename: path of the file to (over)write. Defaults to "output.svg".
        snapshot: snapshot to render; when ``None`` a fresh one is taken
            with ``_snapshot()``.
    """
    source = _snapshot() if snapshot is None else snapshot
    with open(filename, "w") as out:
        rendered = _memory(source)
        out.write(rendered)
@deprecated(
    "torch.cuda._set_allocator_settings is deprecated. Use torch._C._accelerator_setAllocatorSettings instead.",
    category=FutureWarning,
)
def _set_allocator_settings(env: str):
    """Deprecated shim that forwards ``env`` to ``torch._C._accelerator_setAllocatorSettings``.

    Args:
        env (str): allocator settings string, passed through unchanged.

    Returns:
        Whatever the underlying binding returns.
    """
    # pyrefly: ignore [missing-attribute]
    apply_settings = torch._C._accelerator_setAllocatorSettings
    return apply_settings(env)
def get_allocator_backend() -> str:
    r"""Return a string naming the active allocator backend as set by
    ``PYTORCH_ALLOC_CONF``. Currently available backends are
    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
    (CUDA's built-in asynchronous allocator).

    .. note::
        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
    """
    backend_name = torch._C._cuda_getAllocatorBackend()
    return backend_name
class _CUDAAllocator:
    r"""Thin holder around an internal CUDA memory allocator object."""

    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
        # Kept on a private attribute; callers read it back via ``allocator()``.
        self._allocator = allocator

    def allocator(self):
        """Return the allocator object stored on this wrapper."""
        return self._allocator
class CUDAPluggableAllocator(_CUDAAllocator):
    r"""CUDA memory allocator loaded from a so file."""

    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
        r"""Load allocator functions from a compiled `.so` file using ctypes.

        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.

        Args:
            path_to_so_file(str): Path in the filesystem to the `.so` file containing
                the allocator functions
            alloc_fn_name(str): Name of the function to perform the memory allocation
                in the so file. The signature must be:
                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
            free_fn_name(str): Name of the function to perform the memory release
                in the so file. The signature must be:
                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);

        Raises:
            AssertionError: if either resolved function address is ``None``.

        .. warning::
            This is currently supported only in unix OSs

        .. note::
            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
        """
        library = ctypes.CDLL(path_to_so_file)

        def _symbol_address(symbol_name):
            # Look the symbol up in the loaded library and view it as a raw address.
            symbol = getattr(library, symbol_name)
            return ctypes.cast(symbol, ctypes.c_void_p).value

        # Resolve both symbols before validating either one.
        alloc_fn = _symbol_address(alloc_fn_name)
        free_fn = _symbol_address(free_fn_name)

        if alloc_fn is None:
            raise AssertionError(f"alloc_fn '{alloc_fn_name}' is None")
        if free_fn is None:
            raise AssertionError(f"free_fn '{free_fn_name}' is None")

        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
def change_current_allocator(allocator: _CUDAAllocator) -> None:
    r"""Make the provided allocator the currently used memory allocator.

    If the current allocator has already been used/initialized, this function will error.

    Args:
        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.

    .. note::
        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
    """
    # Unwrap the Python-side holder; the binding receives the inner object.
    wrapped = allocator.allocator()
    torch._C._cuda_changeCurrentAllocator(wrapped)
def _get_current_allocator() -> _CUDAAllocator:
    r"""Return the allocator currently in use, wrapped in a ``_CUDAAllocator``.

    .. note::
        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
    """
    active = torch._C._cuda_getAllocator()
    return _CUDAAllocator(active)
class MemPool(_MemPool):
    r"""MemPool represents a pool of memory in a caching allocator. Currently,
    it's just the ID of the pool object maintained in the CUDACachingAllocator.

    Args:
        allocator(torch._C._cuda_CUDAAllocator, optional): a
            torch._C._cuda_CUDAAllocator object that can be used to
            define how memory gets allocated in the pool. If :attr:`allocator`
            is ``None`` (default), memory allocation follows the default/
            current configuration of the CUDACachingAllocator.
        use_on_oom(bool): a bool that indicates if this pool can be used
            as a last resort if a memory allocation outside of the pool fails due
            to Out Of Memory. This is False by default.
        no_split(bool): a bool that indicates if this pool should not split a segment.
            This is False by default.
    """

    def __init__(
        self,
        allocator: _cuda_CUDAAllocator | None = None,
        use_on_oom: bool = False,
        no_split: bool = False,
    ):
        # The literal ``True`` fills the base initializer's second positional
        # slot. NOTE(review): presumably marks the pool as user-created --
        # confirm against the ``_MemPool`` binding.
        # pyrefly: ignore [bad-argument-count]
        super().__init__(allocator, True, use_on_oom, no_split)

    @property
    def id(self) -> tuple[int, int]:
        r"""Returns the ID of this pool as a tuple of two ints."""
        pool_id = super().id
        return pool_id

    def use_count(self) -> int:  # pylint: disable=useless-parent-delegation
        r"""Returns the reference count of this pool."""
        count = super().use_count()
        return count

    def snapshot(self, include_traces=True):
        r"""Return a snapshot of the CUDA memory allocator pool state across all
        devices.

        Interpreting the output of this function requires familiarity with the
        memory allocator internals.

        Args:
            include_traces: Whether to include trace entries in the snapshot.
                If True (default), all trace entries are included.
                If False, no trace entries are included (lightweight/fast snapshot).

        .. note::
            See :ref:`cuda-memory-management` for more details about GPU memory
            management.
        """
        return torch.cuda.memory_snapshot(self.id, include_traces=include_traces)
@contextlib.contextmanager
def use_mem_pool(pool: MemPool, device: "Device" = None):
    r"""Context manager that routes allocations to the given pool.

    Args:
        pool(torch.cuda.MemPool): a MemPool object to be made active so that
            allocations route to this pool.
        device (torch.device or int, optional): selected device. Uses MemPool on
            the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        This context manager makes only current thread's allocations route to
        the given pool. If a new thread is spawned inside the context manager
        (e.g. by calling backward) the allocations in that thread will not
        route to the given pool.
    """
    if device is None:
        device_index = torch.cuda.current_device()
    else:
        device_index = _get_device_index(device)

    _cuda_beginAllocateCurrentThreadToPool(device_index, pool.id)
    try:
        yield
    finally:
        # Runs whether or not the managed body raised: end the routing first,
        # then release the pool for this device.
        _cuda_endAllocateToPool(device_index, pool.id)
        _cuda_releasePool(device_index, pool.id)
|