nsight.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. import asyncio
  2. import copy
  3. import logging
  4. import os
  5. import subprocess
  6. import sys
  7. from pathlib import Path
  8. from typing import Dict, List, Optional, Tuple
  9. from ray._common.utils import (
  10. try_to_create_directory,
  11. )
  12. from ray._private.runtime_env.context import RuntimeEnvContext
  13. from ray._private.runtime_env.plugin import RuntimeEnvPlugin
  14. from ray.exceptions import RuntimeEnvSetupError
# Module-level logger; used as the default ``logger`` argument of the
# plugin's ``create`` and ``modify_context`` methods below.
default_logger = logging.getLogger(__name__)

# Nsight options used when runtime_env={"_nsight": "default"}
# Keys are nsys option names without leading dashes; parse_nsight_config()
# renders one-character keys as "-k value" and longer keys as "--key=value".
# NOTE(review): "%p" is presumably expanded by nsys to the profiled process id,
# and the embedded single quotes presumably keep the value quoted once the
# command is joined into a single py_executable string -- confirm against the
# nsys CLI documentation.
NSIGHT_DEFAULT_CONFIG = {
    "t": "cuda,cudnn,cublas,nvtx",
    "o": "'worker_process_%p'",
    "stop-on-exit": "true",
}
  22. def parse_nsight_config(nsight_config: Dict[str, str]) -> List[str]:
  23. """
  24. Function to convert dictionary of nsight options into
  25. nsight command line
  26. The function returns:
  27. - List[str]: nsys profile cmd line split into list of str
  28. """
  29. nsight_cmd = ["nsys", "profile"]
  30. for option, option_val in nsight_config.items():
  31. # option standard based on
  32. # https://www.gnu.org/software/libc/manual/html_node/Argument-Syntax.html
  33. if len(option) > 1:
  34. nsight_cmd.append(f"--{option}={option_val}")
  35. else:
  36. nsight_cmd += [f"-{option}", option_val]
  37. return nsight_cmd
  38. class NsightPlugin(RuntimeEnvPlugin):
  39. name = "_nsight"
  40. def __init__(self, resources_dir: str):
  41. self.nsight_cmd = []
  42. # replace this with better way to get logs dir
  43. session_dir, runtime_dir = os.path.split(resources_dir)
  44. self._nsight_dir = Path(session_dir) / "logs" / "nsight"
  45. try_to_create_directory(self._nsight_dir)
  46. async def _check_nsight_script(
  47. self, nsight_config: Dict[str, str]
  48. ) -> Tuple[bool, str]:
  49. """
  50. Function to validate if nsight_config is a valid nsight profile options
  51. Args:
  52. nsight_config: dictionary mapping nsight option to it's value
  53. Returns:
  54. a tuple consists of a boolean indicating if the nsight_config
  55. is valid option and an error message if the nsight_config is invalid
  56. """
  57. # use empty as nsight report test filename
  58. nsight_config_copy = copy.deepcopy(nsight_config)
  59. nsight_config_copy["o"] = str(Path(self._nsight_dir) / "empty")
  60. nsight_cmd = parse_nsight_config(nsight_config_copy)
  61. try:
  62. nsight_cmd = nsight_cmd + [sys.executable, "-c", '""']
  63. process = await asyncio.create_subprocess_exec(
  64. *nsight_cmd,
  65. stdout=subprocess.PIPE,
  66. stderr=subprocess.PIPE,
  67. )
  68. stdout, stderr = await process.communicate()
  69. error_msg = stderr.strip() if stderr.strip() != "" else stdout.strip()
  70. # cleanup test.nsys-rep file
  71. clean_up_cmd = ["rm", f"{nsight_config_copy['o']}.nsys-rep"]
  72. cleanup_process = await asyncio.create_subprocess_exec(
  73. *clean_up_cmd,
  74. stdout=subprocess.PIPE,
  75. stderr=subprocess.PIPE,
  76. )
  77. _, _ = await cleanup_process.communicate()
  78. if process.returncode == 0:
  79. return True, None
  80. else:
  81. return False, error_msg
  82. except FileNotFoundError:
  83. return False, ("nsight is not installed")
  84. async def create(
  85. self,
  86. uri: Optional[str],
  87. runtime_env: "RuntimeEnv", # noqa: F821
  88. context: RuntimeEnvContext,
  89. logger: logging.Logger = default_logger,
  90. ) -> int:
  91. nsight_config = runtime_env.nsight()
  92. if not nsight_config:
  93. return 0
  94. if nsight_config and sys.platform != "linux":
  95. raise RuntimeEnvSetupError(
  96. "Nsight CLI is only available in Linux.\n"
  97. "More information can be found in "
  98. "https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html"
  99. )
  100. if isinstance(nsight_config, str):
  101. if nsight_config == "default":
  102. nsight_config = NSIGHT_DEFAULT_CONFIG
  103. else:
  104. raise RuntimeEnvSetupError(
  105. f"Unsupported nsight config: {nsight_config}. "
  106. "The supported config is 'default' or "
  107. "Dictionary of nsight options"
  108. )
  109. is_valid_nsight_cmd, error_msg = await self._check_nsight_script(nsight_config)
  110. if not is_valid_nsight_cmd:
  111. logger.warning(error_msg)
  112. raise RuntimeEnvSetupError(
  113. "nsight profile failed to run with the following "
  114. f"error message:\n {error_msg}"
  115. )
  116. # add set output path to logs dir
  117. nsight_config["o"] = str(
  118. Path(self._nsight_dir) / nsight_config.get("o", NSIGHT_DEFAULT_CONFIG["o"])
  119. )
  120. self.nsight_cmd = parse_nsight_config(nsight_config)
  121. return 0
  122. def modify_context(
  123. self,
  124. uris: List[str],
  125. runtime_env: "RuntimeEnv", # noqa: F821
  126. context: RuntimeEnvContext,
  127. logger: Optional[logging.Logger] = default_logger,
  128. ):
  129. logger.info("Running nsight profiler")
  130. context.py_executable = " ".join(self.nsight_cmd) + " python"