| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- import torch
- from torch._C import dtype
- __all__ = ["GPULimits"]
- class GPULimits:
- r"""Utility class that provides the theoretical limits of Nvidia GPU devices. The
- limits don't take into account thermal throttling (assume that the GPU run at its
- peak rated frequency). This is because user hardware configuration may influence
- power behavior.
- """
- def __init__(self, target_device: torch.device):
- # The device properties object is obtained by calling 'cudaGetDeviceProperties' CUDA
- # runtime function. We need the total memory bus width and the memory clock rate to
- # calculate the memory bandwidth.
- self.device_properties = torch.cuda.get_device_properties(target_device)
- # The compute capability is needed to determine the number of FLOPs per cycle per SM
- self.compute_capability = int(
- f"{self.device_properties.major}{self.device_properties.minor}"
- )
- # FLOPs per cycle information derived from Table 2 in:
- # https://resources.nvidia.com/en-us-hopper-architecture/nvidia-h100-tensor-c
- # Returns the number of FMA instructions retired per cycle per SM for a given
- # data type, when tensor cores are NOT used
- def get_fma_per_cycle_per_sm_cuda_cores(self, data_type: dtype) -> int:
- hardcoded_device_values = {
- # Ampere Architecture
- "fp16_80": 256,
- "fp32_80": 64,
- "fp64_80": 32,
- # Hopper Architecture
- "fp16_90": 64,
- "fp32_90": 128,
- "fp64_90": 64,
- # Blackwell Architecture
- "fp16_100": 256,
- "fp32_100": 128,
- "fp64_100": 64,
- }
- dict_key = ""
- if data_type is torch.float16:
- dict_key = f"fp16_{self.compute_capability}"
- elif data_type is torch.float32:
- dict_key = f"fp32_{self.compute_capability}"
- elif data_type is torch.float64:
- dict_key = f"fp64_{self.compute_capability}"
- else:
- dict_key = "unknown"
- if dict_key not in hardcoded_device_values:
- raise RuntimeError(
- f"No data for sm_{self.compute_capability} and {data_type}."
- )
- return hardcoded_device_values[dict_key]
- # Returns the number of FMA instructions retired per cycle per SM for a given
- # data type, when tensor cores ARE used
- def get_fma_per_cycle_per_sm_tensor_cores(self, data_type: dtype) -> int:
- hardcoded_device_values = {
- # Ampere Architecture
- "int8_80": 2048,
- "fp16_80": 1024,
- "fp32_80": 512,
- "fp64_80": 64,
- # Hopper Architecture
- "int8_90": 4096,
- "fp8_90": 4096,
- "fp16_90": 2048,
- "fp32_90": 1024,
- "fp64_90": 128,
- # Blackwell Architecture
- "int8_100": 8192,
- "fp8_100": 8192,
- "fp16_100": 4096,
- "fp32_100": 2048,
- }
- dict_key = ""
- if data_type is torch.float16:
- dict_key = f"fp16_{self.compute_capability}"
- elif data_type is torch.bfloat16:
- # FP16 and BF16 are equivalent in terms of FLOPs per cycle per SM
- dict_key = f"fp16_{self.compute_capability}"
- elif data_type is torch.float32:
- dict_key = f"fp32_{self.compute_capability}"
- elif data_type is torch.int8:
- dict_key = f"int8_{self.compute_capability}"
- elif data_type is torch.float64:
- dict_key = f"fp64_{self.compute_capability}"
- else:
- dict_key = "unknown"
- if dict_key not in hardcoded_device_values:
- raise RuntimeError(
- f"No data for sm_{self.compute_capability} and {data_type}."
- )
- return hardcoded_device_values[dict_key]
- def get_tflops_per_second(
- self, data_type: dtype, use_tensor_cores: bool = True
- ) -> float:
- num_sms = self.device_properties.multi_processor_count
- clock_rate = self.device_properties.clock_rate # KHz
- fma_per_cycle = 0
- if use_tensor_cores:
- fma_per_cycle = self.get_fma_per_cycle_per_sm_tensor_cores(data_type)
- else:
- fma_per_cycle = self.get_fma_per_cycle_per_sm_cuda_cores(data_type)
- # 1 FMA counts as 2 floating point operations
- # Clock rate is in KHz
- tflops_per_second = num_sms * fma_per_cycle * 2 * clock_rate / 1e9
- return tflops_per_second
- def get_memory_bandwidth_Bps(self) -> int:
- # DRAM devices are Double-Data which means they provide an output at both fronts of
- # a clock beat
- bus_bytes_per_cycle = int(2 * self.device_properties.memory_bus_width / 8)
- mem_clock_rate_Hz = self.device_properties.memory_clock_rate * 1000
- bytes_per_second = bus_bytes_per_cycle * mem_clock_rate_Hz
- return bytes_per_second
- def get_shared_memory_bandwidth_Bps(self) -> int:
- # Each warp can LD or ST 32 x 4 bytes per cycle. To calculate the
- # device's throughput we need to multiply with frequency and number of SMs.
- num_sms = self.device_properties.multi_processor_count
- bytes_per_cycle_per_sm = 128
- bytes_per_cycle_per_device = num_sms * bytes_per_cycle_per_sm
- bytes_per_second = (
- bytes_per_cycle_per_device * self.device_properties.clock_rate * 1000
- )
- return bytes_per_second
|