# k8s_utils.py (3.8 KB)
import logging

from ray._private.utils import get_num_cpus

  3. logger = logging.getLogger(__name__)
  4. CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
  5. CPU_USAGE_PATH_V2 = "/sys/fs/cgroup/cpu.stat"
  6. PROC_STAT_PATH = "/proc/stat"
  7. container_num_cpus = None
  8. host_num_cpus = None
  9. last_cpu_usage = None
  10. last_system_usage = None
  11. def cpu_percent():
  12. """Estimate CPU usage percent for Ray pod managed by Kubernetes
  13. Operator.
  14. Computed by the following steps
  15. (1) Replicate the logic used by 'docker stats' cli command.
  16. See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
  17. (2) Divide by the number of CPUs available to the container, so that
  18. e.g. full capacity use of 2 CPUs will read as 100%,
  19. rather than 200%.
  20. Step (1) above works by
  21. dividing delta in cpu usage by
  22. delta in total host cpu usage, averaged over host's cpus.
  23. Since deltas are not initially available, return 0.0 on first call.
  24. """ # noqa
  25. global last_system_usage
  26. global last_cpu_usage
  27. try:
  28. cpu_usage = _cpu_usage()
  29. system_usage = _system_usage()
  30. # Return 0.0 on first call.
  31. if last_system_usage is None:
  32. cpu_percent = 0.0
  33. else:
  34. cpu_delta = cpu_usage - last_cpu_usage
  35. # "System time passed." (Typically close to clock time.)
  36. system_delta = (system_usage - last_system_usage) / _host_num_cpus()
  37. quotient = cpu_delta / system_delta
  38. cpu_percent = round(quotient * 100 / get_num_cpus(), 1)
  39. last_system_usage = system_usage
  40. last_cpu_usage = cpu_usage
  41. # Computed percentage might be slightly above 100%.
  42. return min(cpu_percent, 100.0)
  43. except Exception:
  44. logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
  45. return 0.0
  46. def _cpu_usage():
  47. """Compute total cpu usage of the container in nanoseconds
  48. by reading from cpuacct in cgroups v1 or cpu.stat in cgroups v2."""
  49. try:
  50. # cgroups v1
  51. return int(open(CPU_USAGE_PATH).read())
  52. except FileNotFoundError:
  53. # cgroups v2
  54. cpu_stat_text = open(CPU_USAGE_PATH_V2).read()
  55. # e.g. "usage_usec 16089294616"
  56. cpu_stat_first_line = cpu_stat_text.split("\n")[0]
  57. # get the second word of the first line, cast as an integer
  58. # this is the CPU usage is microseconds
  59. cpu_usec = int(cpu_stat_first_line.split()[1])
  60. # Convert to nanoseconds and return.
  61. return cpu_usec * 1000
  62. def _system_usage():
  63. """
  64. Computes total CPU usage of the host in nanoseconds.
  65. Logic taken from here:
  66. https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31
  67. See also the /proc/stat entry here:
  68. https://man7.org/linux/man-pages/man5/proc.5.html
  69. """ # noqa
  70. cpu_summary_str = open(PROC_STAT_PATH).read().split("\n")[0]
  71. parts = cpu_summary_str.split()
  72. assert parts[0] == "cpu"
  73. usage_data = parts[1:8]
  74. total_clock_ticks = sum(int(entry) for entry in usage_data)
  75. # 100 clock ticks per second, 10^9 ns per second
  76. usage_ns = total_clock_ticks * 10**7
  77. return usage_ns
  78. def _host_num_cpus():
  79. """Number of physical CPUs, obtained by parsing /proc/stat."""
  80. global host_num_cpus
  81. if host_num_cpus is None:
  82. proc_stat_lines = open(PROC_STAT_PATH).read().split("\n")
  83. split_proc_stat_lines = [line.split() for line in proc_stat_lines]
  84. cpu_lines = [
  85. split_line
  86. for split_line in split_proc_stat_lines
  87. if len(split_line) > 0 and "cpu" in split_line[0]
  88. ]
  89. # Number of lines starting with a word including 'cpu', subtracting
  90. # 1 for the first summary line.
  91. host_num_cpus = len(cpu_lines) - 1
  92. return host_num_cpus