packaging.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133
  1. import asyncio
  2. import hashlib
  3. import logging
  4. import os
  5. import shutil
  6. import sys
  7. import time
  8. from pathlib import Path
  9. from tempfile import TemporaryDirectory
  10. from typing import Callable, List, Optional, Tuple
  11. from urllib.parse import urlparse
  12. from zipfile import ZipFile
  13. from filelock import FileLock
  14. from ray._private.path_utils import is_path
  15. from ray._private.ray_constants import (
  16. GRPC_CPP_MAX_MESSAGE_SIZE,
  17. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT,
  18. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR,
  19. )
  20. from ray._private.runtime_env.conda_utils import exec_cmd_stream_to_logger
  21. from ray._private.runtime_env.protocol import Protocol
  22. from ray._private.thirdparty.pathspec import PathSpec
  23. from ray._raylet import GcsClient
  24. from ray.experimental.internal_kv import (
  25. _internal_kv_exists,
  26. _internal_kv_put,
  27. _pin_runtime_env_uri,
  28. )
  29. from ray.util.annotations import DeveloperAPI
  30. default_logger = logging.getLogger(__name__)
  31. # If an individual file is beyond this size, print a warning.
  32. FILE_SIZE_WARNING = 10 * 1024 * 1024 # 10MiB
  33. # The size is bounded by the max gRPC message size.
  34. # Keep in sync with max_grpc_message_size in ray_config_def.h.
  35. GCS_STORAGE_MAX_SIZE = int(
  36. os.environ.get("RAY_max_grpc_message_size", GRPC_CPP_MAX_MESSAGE_SIZE)
  37. )
  38. RAY_PKG_PREFIX = "_ray_pkg_"
  39. RAY_RUNTIME_ENV_FAIL_UPLOAD_FOR_TESTING_ENV_VAR = (
  40. "RAY_RUNTIME_ENV_FAIL_UPLOAD_FOR_TESTING"
  41. )
  42. RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR = (
  43. "RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING"
  44. )
  45. # The name of the hidden top-level directory that appears when files are
  46. # zipped on MacOS.
  47. MAC_OS_ZIP_HIDDEN_DIR_NAME = "__MACOSX"
  48. def _mib_string(num_bytes: float) -> str:
  49. size_mib = float(num_bytes / 1024**2)
  50. return f"{size_mib:.2f}MiB"
  51. def _to_extended_length_path(path: str) -> str:
  52. r"""Convert paths to extended-length format if needed on Windows
  53. if needed. Paths on other platforms are returned unchanged.
  54. Extended-length paths (\\?\) support paths up to 32,767 characters on Windows
  55. instead of 260. Extended-length paths must be normalized (i.e., no "." or ".."
  56. components) so this function normalizes the path before applying the prefix.
  57. Args:
  58. path: The path to convert.
  59. Returns:
  60. The path with extended-length prefixed path on Windows, unchanged on other platforms.
  61. """
  62. # Ensure we always work with strings, not Path objects
  63. path = str(path)
  64. if sys.platform != "win32":
  65. return path
  66. # Convert to absolute path and fully normalize to remove any . or .. components
  67. # This is critical because extended-length paths disable Windows path normalization
  68. abs_path = os.path.normpath(os.path.abspath(path))
  69. # Extended-length path prefix
  70. extended_prefix = "\\\\?\\"
  71. # Already in extended format
  72. if abs_path.startswith(extended_prefix):
  73. return abs_path
  74. # UNC paths need special handling: \\server\share -> \\?\UNC\server\share
  75. if abs_path.startswith("\\\\"):
  76. return extended_prefix + "UNC" + abs_path[1:]
  77. # Local paths: C:\path -> \\?\C:\path
  78. return extended_prefix + abs_path
  79. class _AsyncFileLock:
  80. """Asyncio version used to prevent blocking event loop."""
  81. def __init__(self, lock_file: str):
  82. self.file = FileLock(lock_file)
  83. async def __aenter__(self):
  84. while True:
  85. try:
  86. self.file.acquire(timeout=0)
  87. return
  88. except TimeoutError:
  89. await asyncio.sleep(0.1)
  90. async def __aexit__(self, exc_type, exc, tb):
  91. self.file.release()
  92. def _xor_bytes(left: bytes, right: bytes) -> bytes:
  93. if left and right:
  94. return bytes(a ^ b for (a, b) in zip(left, right))
  95. return left or right
  96. def _dir_travel(
  97. path: Path,
  98. excludes: List[Callable],
  99. handler: Callable,
  100. include_gitignore: bool,
  101. logger: Optional[logging.Logger] = default_logger,
  102. ):
  103. """Travels the path recursively, calling the handler on each subpath.
  104. Respects excludes, which will be called to check if this path is skipped.
  105. """
  106. new_excludes = get_excludes_from_ignore_files(
  107. path, include_gitignore=include_gitignore, logger=logger
  108. )
  109. excludes.extend(new_excludes)
  110. skip = any(e(path) for e in excludes)
  111. if not skip:
  112. try:
  113. handler(path)
  114. except Exception as e:
  115. logger.error(f"Issue with path: {path}")
  116. raise e
  117. if path.is_dir():
  118. for sub_path in path.iterdir():
  119. _dir_travel(
  120. sub_path,
  121. excludes,
  122. handler,
  123. include_gitignore=include_gitignore,
  124. logger=logger,
  125. )
  126. for _ in range(len(new_excludes)):
  127. excludes.pop()
  128. def _hash_file_content_or_directory_name(
  129. filepath: Path,
  130. relative_path: Path,
  131. logger: Optional[logging.Logger] = default_logger,
  132. ) -> bytes:
  133. """Helper function to create hash of a single file or directory.
  134. This function hashes the path of the file or directory,
  135. and if it's a file, then it hashes its content too.
  136. """
  137. BUF_SIZE = 4096 * 1024
  138. sha1 = hashlib.sha1()
  139. sha1.update(str(filepath.relative_to(relative_path)).encode())
  140. if not filepath.is_dir():
  141. try:
  142. f = filepath.open("rb")
  143. except Exception as e:
  144. logger.debug(
  145. f"Skipping contents of file {filepath} when calculating package hash "
  146. f"because the file couldn't be opened: {e}"
  147. )
  148. else:
  149. try:
  150. data = f.read(BUF_SIZE)
  151. while len(data) != 0:
  152. sha1.update(data)
  153. data = f.read(BUF_SIZE)
  154. finally:
  155. f.close()
  156. return sha1.digest()
  157. def _hash_file(
  158. filepath: Path,
  159. relative_path: Path,
  160. logger: Optional[logging.Logger] = default_logger,
  161. ) -> bytes:
  162. """Helper function to create hash of a single file.
  163. It hashes the path of the file and its content to create a hash value.
  164. """
  165. file_hash = _hash_file_content_or_directory_name(
  166. filepath, relative_path, logger=logger
  167. )
  168. return _xor_bytes(file_hash, b"0" * 8)
  169. def _hash_directory(
  170. root: Path,
  171. relative_path: Path,
  172. excludes: Optional[Callable],
  173. include_gitignore: bool,
  174. logger: Optional[logging.Logger] = default_logger,
  175. ) -> bytes:
  176. """Helper function to create hash of a directory.
  177. It'll go through all the files in the directory and xor
  178. hash(file_name, file_content) to create a hash value.
  179. """
  180. hash_val = b"0" * 8
  181. def handler(path: Path):
  182. file_hash = _hash_file_content_or_directory_name(
  183. path, relative_path, logger=logger
  184. )
  185. nonlocal hash_val
  186. hash_val = _xor_bytes(hash_val, file_hash)
  187. excludes = [] if excludes is None else [excludes]
  188. _dir_travel(
  189. root, excludes, handler, include_gitignore=include_gitignore, logger=logger
  190. )
  191. return hash_val
  192. def parse_path(pkg_path: str) -> None:
  193. """Parse the path to check it is well-formed and exists."""
  194. path = Path(pkg_path)
  195. try:
  196. path.resolve(strict=True)
  197. except OSError:
  198. raise ValueError(f"{path} is not a valid path.")
  199. def parse_uri(pkg_uri: str) -> Tuple[Protocol, str]:
  200. """
  201. Parse package uri into protocol and package name based on its format.
  202. Note that the output of this function is not for handling actual IO, it's
  203. only for setting up local directory folders by using package name as path.
  204. >>> parse_uri("https://test.com/file.zip")
  205. (<Protocol.HTTPS: 'https'>, 'https_test_com_file.zip')
  206. >>> parse_uri("https://test.com/file.whl")
  207. (<Protocol.HTTPS: 'https'>, 'file.whl')
  208. """
  209. if is_path(pkg_uri):
  210. raise ValueError(f"Expected URI but received path {pkg_uri}")
  211. uri = urlparse(pkg_uri)
  212. try:
  213. protocol = Protocol(uri.scheme)
  214. except ValueError as e:
  215. raise ValueError(
  216. f'Invalid protocol for runtime_env URI "{pkg_uri}". '
  217. f"Supported protocols: {Protocol._member_names_}. Original error: {e}"
  218. )
  219. if protocol in Protocol.remote_protocols():
  220. if uri.path.endswith(".whl"):
  221. # Don't modify the .whl filename. See
  222. # https://peps.python.org/pep-0427/#file-name-convention
  223. # for more information.
  224. package_name = uri.path.split("/")[-1]
  225. else:
  226. package_name = f"{protocol.value}_{uri.netloc}{uri.path}"
  227. disallowed_chars = ["/", ":", "@", "+", " ", "(", ")"]
  228. for disallowed_char in disallowed_chars:
  229. package_name = package_name.replace(disallowed_char, "_")
  230. # Remove all periods except the last, which is part of the
  231. # file extension
  232. package_name = package_name.replace(".", "_", package_name.count(".") - 1)
  233. else:
  234. package_name = uri.netloc
  235. return (protocol, package_name)
  236. def is_zip_uri(uri: str) -> bool:
  237. try:
  238. protocol, path = parse_uri(uri)
  239. except ValueError:
  240. return False
  241. return Path(path).suffix == ".zip"
  242. def is_whl_uri(uri: str) -> bool:
  243. try:
  244. _, path = parse_uri(uri)
  245. except ValueError:
  246. return False
  247. return Path(path).suffix == ".whl"
  248. def is_jar_uri(uri: str) -> bool:
  249. try:
  250. _, path = parse_uri(uri)
  251. except ValueError:
  252. return False
  253. return Path(path).suffix == ".jar"
  254. def _get_excludes(path: Path, excludes: List[str]) -> Callable:
  255. path = path.absolute()
  256. pathspec = PathSpec.from_lines("gitwildmatch", excludes)
  257. def match(p: Path):
  258. path_str = str(p.absolute().relative_to(path))
  259. return pathspec.match_file(path_str)
  260. return match
  261. def _get_ignore_file(path: Path, ignore_file: str) -> Optional[Callable]:
  262. """Returns a function that returns True if the path should be excluded.
  263. Returns None if there is no ignore_file in the path.
  264. Args:
  265. path: The path to the directory to check for an ignore file.
  266. ignore_file: The name of the ignore file.
  267. Returns:
  268. A function that returns True if the path should be excluded.
  269. """
  270. path = path.absolute()
  271. ignore_file = path / ignore_file
  272. if ignore_file.is_file():
  273. with ignore_file.open("r") as f:
  274. pathspec = PathSpec.from_lines("gitwildmatch", f.readlines())
  275. def match(p: Path):
  276. path_str = str(p.absolute().relative_to(path))
  277. return pathspec.match_file(path_str)
  278. return match
  279. else:
  280. return None
  281. def get_excludes_from_ignore_files(
  282. path: Path,
  283. include_gitignore: bool,
  284. logger: Optional[logging.Logger] = default_logger,
  285. ) -> List[Callable]:
  286. """Get exclusion functions from .gitignore and .rayignore files in the current path.
  287. Args:
  288. path: The path to check for ignore files.
  289. include_gitignore: Whether to respect .gitignore files.
  290. logger: Logger to use.
  291. Returns:
  292. List[Callable]: List of exclusion functions. Each function takes a Path
  293. and returns True if the path should be excluded based on the ignore
  294. patterns in the respective ignore file.
  295. """
  296. ignore_files = []
  297. to_ignore: List[Optional[Callable]] = []
  298. if include_gitignore:
  299. g = _get_ignore_file(path, ignore_file=".gitignore")
  300. if g is not None:
  301. to_ignore.append(g)
  302. ignore_files.append(path / ".gitignore")
  303. r = _get_ignore_file(path, ignore_file=".rayignore")
  304. if r is not None:
  305. to_ignore.append(r)
  306. ignore_files.append(path / ".rayignore")
  307. if ignore_files:
  308. logger.info(f"Ignoring upload to cluster for these files: {ignore_files}")
  309. return to_ignore
  310. def pin_runtime_env_uri(uri: str, *, expiration_s: Optional[int] = None) -> None:
  311. """Pin a reference to a runtime_env URI in the GCS on a timeout.
  312. This is used to avoid premature eviction in edge conditions for job
  313. reference counting. See https://github.com/ray-project/ray/pull/24719.
  314. Packages are uploaded to GCS in order to be downloaded by a runtime env plugin
  315. (e.g. working_dir, py_modules) after the job starts.
  316. This function adds a temporary reference to the package in the GCS to prevent
  317. it from being deleted before the job starts. (See #23423 for the bug where
  318. this happened.)
  319. If this reference didn't have an expiration, then if the script exited
  320. (e.g. via Ctrl-C) before the job started, the reference would never be
  321. removed, so the package would never be deleted.
  322. """
  323. if expiration_s is None:
  324. expiration_s = int(
  325. os.environ.get(
  326. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR,
  327. RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT,
  328. )
  329. )
  330. elif not isinstance(expiration_s, int):
  331. raise ValueError(f"expiration_s must be an int, got {type(expiration_s)}.")
  332. if expiration_s < 0:
  333. raise ValueError(f"expiration_s must be >= 0, got {expiration_s}.")
  334. elif expiration_s > 0:
  335. _pin_runtime_env_uri(uri, expiration_s=expiration_s)
  336. def _store_package_in_gcs(
  337. pkg_uri: str,
  338. data: bytes,
  339. logger: Optional[logging.Logger] = default_logger,
  340. ) -> int:
  341. """Stores package data in the Global Control Store (GCS).
  342. Args:
  343. pkg_uri: The GCS key to store the data in.
  344. data: The serialized package's bytes to store in the GCS.
  345. logger (Optional[logging.Logger]): The logger used by this function.
  346. Return:
  347. int: Size of data
  348. Raises:
  349. RuntimeError: If the upload to the GCS fails.
  350. ValueError: If the data's size exceeds GCS_STORAGE_MAX_SIZE.
  351. """
  352. file_size = len(data)
  353. size_str = _mib_string(file_size)
  354. if len(data) >= GCS_STORAGE_MAX_SIZE:
  355. raise ValueError(
  356. f"Package size ({size_str}) exceeds the maximum size of "
  357. f"{_mib_string(GCS_STORAGE_MAX_SIZE)}. To exclude large files, "
  358. "add them to '.gitignore' or '.rayignore' files, or "
  359. "use the 'excludes' option in the runtime_env, or provide a "
  360. "URI of a remote zip file. For more information, refer to "
  361. "https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#api-reference." # noqa
  362. )
  363. logger.info(f"Pushing file package '{pkg_uri}' ({size_str}) to Ray cluster...")
  364. try:
  365. if os.environ.get(RAY_RUNTIME_ENV_FAIL_UPLOAD_FOR_TESTING_ENV_VAR):
  366. raise RuntimeError(
  367. "Simulating failure to upload package for testing purposes."
  368. )
  369. _internal_kv_put(pkg_uri, data)
  370. except Exception as e:
  371. raise RuntimeError(
  372. "Failed to store package in the GCS.\n"
  373. f" - GCS URI: {pkg_uri}\n"
  374. f" - Package data ({size_str}): {data[:15]}...\n"
  375. ) from e
  376. logger.info(f"Successfully pushed file package '{pkg_uri}'.")
  377. return len(data)
  378. def _get_local_path(base_directory: str, pkg_uri: str) -> str:
  379. _, pkg_name = parse_uri(pkg_uri)
  380. return os.path.join(base_directory, pkg_name)
  381. def _zip_files(
  382. path_str: str,
  383. excludes: List[str],
  384. output_path: str,
  385. include_gitignore: bool,
  386. include_parent_dir: bool = False,
  387. logger: Optional[logging.Logger] = default_logger,
  388. ) -> None:
  389. """Zip the target file or directory and write it to the output_path.
  390. path_str: The file or directory to zip.
  391. excludes (List(str)): The directories or file to be excluded.
  392. output_path: The output path for the zip file.
  393. include_parent_dir: If true, includes the top-level directory as a
  394. directory inside the zip file.
  395. """
  396. pkg_file = Path(output_path).absolute()
  397. # Use extended-length paths on Windows to avoid MAX_PATH limitations
  398. extended_pkg_file = _to_extended_length_path(str(pkg_file))
  399. with ZipFile(extended_pkg_file, "w", strict_timestamps=False) as zip_handler:
  400. # Put all files in the directory into the zip file.
  401. file_path = Path(path_str).absolute()
  402. dir_path = file_path
  403. if file_path.is_file():
  404. dir_path = file_path.parent
  405. def handler(path: Path):
  406. # Pack this path if it's an empty directory or it's a file.
  407. if path.is_dir() and next(path.iterdir(), None) is None or path.is_file():
  408. file_size = path.stat().st_size
  409. if file_size >= FILE_SIZE_WARNING:
  410. logger.warning(
  411. f"File {path} is very large "
  412. f"({_mib_string(file_size)}). Consider adding this "
  413. "file to the 'excludes' list to skip uploading it: "
  414. "`ray.init(..., "
  415. f"runtime_env={{'excludes': ['{path}']}})`"
  416. )
  417. to_path = path.relative_to(dir_path)
  418. if include_parent_dir:
  419. to_path = dir_path.name / to_path
  420. zip_handler.write(path, to_path)
  421. excludes = [_get_excludes(file_path, excludes)]
  422. _dir_travel(
  423. file_path,
  424. excludes,
  425. handler,
  426. include_gitignore=include_gitignore,
  427. logger=logger,
  428. )
  429. def package_exists(pkg_uri: str) -> bool:
  430. """Check whether the package with given URI exists or not.
  431. Args:
  432. pkg_uri: The uri of the package
  433. Return:
  434. True for package existing and False for not.
  435. """
  436. protocol, pkg_name = parse_uri(pkg_uri)
  437. if protocol == Protocol.GCS:
  438. return _internal_kv_exists(pkg_uri)
  439. else:
  440. raise NotImplementedError(f"Protocol {protocol} is not supported")
  441. def get_uri_for_package(package: Path) -> str:
  442. """Get a content-addressable URI from a package's contents."""
  443. if package.suffix == ".whl":
  444. # Wheel file names include the Python package name, version
  445. # and tags, so it is already effectively content-addressed.
  446. return "{protocol}://{whl_filename}".format(
  447. protocol=Protocol.GCS.value, whl_filename=package.name
  448. )
  449. else:
  450. hash_val = hashlib.sha1(package.read_bytes()).hexdigest()
  451. return "{protocol}://{pkg_name}.zip".format(
  452. protocol=Protocol.GCS.value, pkg_name=RAY_PKG_PREFIX + hash_val
  453. )
  454. def get_uri_for_file(file: str) -> str:
  455. """Get a content-addressable URI from a file's content.
  456. This function generates the name of the package by the file.
  457. The final package name is _ray_pkg_<HASH_VAL>.zip of this package,
  458. where HASH_VAL is the hash value of the file.
  459. For example: _ray_pkg_029f88d5ecc55e1e4d64fc6e388fd103.zip
  460. Examples:
  461. >>> get_uri_for_file("/my_file.py") # doctest: +SKIP
  462. _ray_pkg_af2734982a741.zip
  463. Args:
  464. file: The file.
  465. Returns:
  466. URI (str)
  467. Raises:
  468. ValueError: If the file doesn't exist.
  469. """
  470. filepath = Path(file).absolute()
  471. if not filepath.exists() or not filepath.is_file():
  472. raise ValueError(f"File {filepath} must be an existing file")
  473. hash_val = _hash_file(filepath, filepath.parent)
  474. return "{protocol}://{pkg_name}.zip".format(
  475. protocol=Protocol.GCS.value, pkg_name=RAY_PKG_PREFIX + hash_val.hex()
  476. )
  477. def get_uri_for_directory(
  478. directory: str,
  479. include_gitignore: bool,
  480. excludes: Optional[List[str]] = None,
  481. ) -> str:
  482. """Get a content-addressable URI from a directory's contents.
  483. This function generates the name of the package by the directory.
  484. It'll go through all the files in the directory and hash the contents
  485. of the files to get the hash value of the package.
  486. The final package name is _ray_pkg_<HASH_VAL>.zip of this package.
  487. For example: _ray_pkg_029f88d5ecc55e1e4d64fc6e388fd103.zip
  488. Examples:
  489. >>> get_uri_for_directory("/my_directory") # doctest: +SKIP
  490. _ray_pkg_af2734982a741.zip
  491. Args:
  492. directory: The directory.
  493. include_gitignore: Whether to respect .gitignore files.
  494. excludes (list[str]): The dir or files that should be excluded.
  495. Returns:
  496. URI (str)
  497. Raises:
  498. ValueError: If the directory doesn't exist.
  499. """
  500. if excludes is None:
  501. excludes = []
  502. directory = Path(directory).absolute()
  503. if not directory.exists() or not directory.is_dir():
  504. raise ValueError(f"directory {directory} must be an existing directory")
  505. hash_val = _hash_directory(
  506. directory,
  507. directory,
  508. _get_excludes(directory, excludes),
  509. include_gitignore=include_gitignore,
  510. )
  511. return "{protocol}://{pkg_name}.zip".format(
  512. protocol=Protocol.GCS.value, pkg_name=RAY_PKG_PREFIX + hash_val.hex()
  513. )
  514. def upload_package_to_gcs(pkg_uri: str, pkg_bytes: bytes) -> None:
  515. """Upload a local package to GCS.
  516. Args:
  517. pkg_uri: The URI of the package, e.g. gcs://my_package.zip
  518. pkg_bytes: The data to be uploaded.
  519. Raises:
  520. RuntimeError: If the upload fails.
  521. ValueError: If the pkg_uri is a remote path or if the data's
  522. size exceeds GCS_STORAGE_MAX_SIZE.
  523. NotImplementedError: If the protocol of the URI is not supported.
  524. """
  525. protocol, pkg_name = parse_uri(pkg_uri)
  526. if protocol == Protocol.GCS:
  527. _store_package_in_gcs(pkg_uri, pkg_bytes)
  528. elif protocol in Protocol.remote_protocols():
  529. raise ValueError(
  530. "upload_package_to_gcs should not be called with a remote path."
  531. )
  532. else:
  533. raise NotImplementedError(f"Protocol {protocol} is not supported")
  534. def create_package(
  535. module_path: str,
  536. target_path: Path,
  537. include_gitignore: bool,
  538. include_parent_dir: bool = False,
  539. excludes: Optional[List[str]] = None,
  540. logger: Optional[logging.Logger] = default_logger,
  541. ):
  542. if excludes is None:
  543. excludes = []
  544. if logger is None:
  545. logger = default_logger
  546. if not target_path.exists():
  547. logger.info(f"Creating a file package for local module '{module_path}'.")
  548. _zip_files(
  549. module_path,
  550. excludes,
  551. str(target_path),
  552. include_gitignore=include_gitignore,
  553. include_parent_dir=include_parent_dir,
  554. logger=logger,
  555. )
  556. def upload_package_if_needed(
  557. pkg_uri: str,
  558. base_directory: str,
  559. module_path: str,
  560. include_gitignore: bool,
  561. include_parent_dir: bool = False,
  562. excludes: Optional[List[str]] = None,
  563. logger: Optional[logging.Logger] = default_logger,
  564. ) -> bool:
  565. """Upload the contents of the directory under the given URI.
  566. This will first create a temporary zip file under the passed
  567. base_directory.
  568. If the package already exists in storage, this is a no-op.
  569. Args:
  570. pkg_uri: URI of the package to upload.
  571. base_directory: Directory where package files are stored.
  572. module_path: The module to be uploaded, either a single .py file or a directory.
  573. include_parent_dir: If true, includes the top-level directory as a
  574. directory inside the zip file.
  575. excludes: List specifying files to exclude.
  576. include_gitignore: Whether to respect .gitignore files. Default is True.
  577. Raises:
  578. RuntimeError: If the upload fails.
  579. ValueError: If the pkg_uri is a remote path or if the data's
  580. size exceeds GCS_STORAGE_MAX_SIZE.
  581. NotImplementedError: If the protocol of the URI is not supported.
  582. """
  583. if excludes is None:
  584. excludes = []
  585. if logger is None:
  586. logger = default_logger
  587. pin_runtime_env_uri(pkg_uri)
  588. if package_exists(pkg_uri):
  589. return False
  590. package_file = Path(_get_local_path(base_directory, pkg_uri))
  591. # Make the temporary zip file name unique so that it doesn't conflict with
  592. # concurrent upload_package_if_needed calls with the same pkg_uri.
  593. # See https://github.com/ray-project/ray/issues/47471.
  594. package_file = package_file.with_name(
  595. f"{time.time_ns()}_{os.getpid()}_{package_file.name}"
  596. )
  597. create_package(
  598. module_path,
  599. package_file,
  600. include_gitignore=include_gitignore,
  601. include_parent_dir=include_parent_dir,
  602. excludes=excludes,
  603. )
  604. package_file_bytes = package_file.read_bytes()
  605. # Remove the local file to avoid accumulating temporary zip files.
  606. package_file.unlink()
  607. upload_package_to_gcs(pkg_uri, package_file_bytes)
  608. return True
  609. def get_local_dir_from_uri(uri: str, base_directory: str) -> Path:
  610. """Return the local directory corresponding to this URI."""
  611. pkg_file = Path(_get_local_path(base_directory, uri))
  612. local_dir = pkg_file.with_suffix("")
  613. return local_dir
  614. @DeveloperAPI
  615. async def download_and_unpack_package(
  616. pkg_uri: str,
  617. base_directory: str,
  618. gcs_client: Optional[GcsClient] = None,
  619. logger: Optional[logging.Logger] = default_logger,
  620. overwrite: bool = False,
  621. ) -> str:
  622. """Download the package corresponding to this URI and unpack it if zipped.
  623. Will be written to a file or directory named {base_directory}/{uri}.
  624. Returns the path to this file or directory.
  625. Args:
  626. pkg_uri: URI of the package to download.
  627. base_directory: Directory to use as the parent directory of the target
  628. directory for the unpacked files.
  629. gcs_client: Client to use for downloading from the GCS.
  630. logger: The logger to use.
  631. overwrite: If True, overwrite the existing package.
  632. Returns:
  633. Path to the local directory containing the unpacked package files.
  634. Raises:
  635. IOError: If the download fails.
  636. ImportError: If smart_open is not installed and a remote URI is used.
  637. NotImplementedError: If the protocol of the URI is not supported.
  638. ValueError: If the GCS client is not provided when downloading from GCS,
  639. or if package URI is invalid.
  640. """
  641. pkg_file = Path(_get_local_path(base_directory, pkg_uri))
  642. if pkg_file.suffix == "":
  643. raise ValueError(
  644. f"Invalid package URI: {pkg_uri}."
  645. "URI must have a file extension and the URI must be valid."
  646. )
  647. async with _AsyncFileLock(str(pkg_file) + ".lock"):
  648. if logger is None:
  649. logger = default_logger
  650. logger.debug(f"Fetching package for URI: {pkg_uri}")
  651. local_dir = get_local_dir_from_uri(pkg_uri, base_directory)
  652. assert local_dir != pkg_file, "Invalid pkg_file!"
  653. download_package: bool = True
  654. if local_dir.exists() and not overwrite:
  655. download_package = False
  656. assert local_dir.is_dir(), f"{local_dir} is not a directory"
  657. elif local_dir.exists():
  658. logger.info(f"Removing {local_dir} with pkg_file {pkg_file}")
  659. shutil.rmtree(local_dir)
  660. if download_package:
  661. protocol, _ = parse_uri(pkg_uri)
  662. logger.info(
  663. f"Downloading package from {pkg_uri} to {pkg_file} "
  664. f"with protocol {protocol}"
  665. )
  666. if protocol == Protocol.GCS:
  667. if gcs_client is None:
  668. raise ValueError(
  669. "GCS client must be provided to download from GCS."
  670. )
  671. # Download package from the GCS.
  672. code = await gcs_client.async_internal_kv_get(
  673. pkg_uri.encode(), namespace=None, timeout=None
  674. )
  675. if os.environ.get(RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR):
  676. code = None
  677. if code is None:
  678. raise IOError(
  679. f"Failed to download runtime_env file package {pkg_uri} "
  680. "from the GCS to the Ray worker node. The package may "
  681. "have prematurely been deleted from the GCS due to a "
  682. "long upload time or a problem with Ray. Try setting the "
  683. "environment variable "
  684. f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR} "
  685. " to a value larger than the upload time in seconds "
  686. "(the default is "
  687. f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT}). "
  688. "If this fails, try re-running "
  689. "after making any change to a file in the file package."
  690. )
  691. code = code or b""
  692. pkg_file.write_bytes(code)
  693. if is_zip_uri(pkg_uri):
  694. unzip_package(
  695. package_path=pkg_file,
  696. target_dir=local_dir,
  697. remove_top_level_directory=False,
  698. unlink_zip=True,
  699. logger=logger,
  700. )
  701. else:
  702. return str(pkg_file)
  703. elif protocol in Protocol.remote_protocols():
  704. protocol.download_remote_uri(source_uri=pkg_uri, dest_file=pkg_file)
  705. if pkg_file.suffix in [".zip", ".jar"]:
  706. unzip_package(
  707. package_path=pkg_file,
  708. target_dir=local_dir,
  709. remove_top_level_directory=True,
  710. unlink_zip=True,
  711. logger=logger,
  712. )
  713. elif pkg_file.suffix == ".whl":
  714. return str(pkg_file)
  715. else:
  716. raise NotImplementedError(
  717. f"Package format {pkg_file.suffix} is ",
  718. "not supported for remote protocols",
  719. )
  720. else:
  721. raise NotImplementedError(f"Protocol {protocol} is not supported")
  722. return str(local_dir)
  723. def get_top_level_dir_from_compressed_package(package_path: str):
  724. """
  725. If compressed package at package_path contains a single top-level
  726. directory, returns the name of the top-level directory. Otherwise,
  727. returns None.
  728. Ignores a second top-level directory if it is named __MACOSX.
  729. """
  730. package_zip = ZipFile(package_path, "r")
  731. top_level_directory = None
  732. def is_top_level_file(file_name):
  733. return "/" not in file_name
  734. def base_dir_name(file_name):
  735. return file_name.split("/")[0]
  736. for file_name in package_zip.namelist():
  737. if top_level_directory is None:
  738. # Cache the top_level_directory name when checking
  739. # the first file in the zipped package
  740. if is_top_level_file(file_name):
  741. return None
  742. else:
  743. # Top-level directory, or non-top-level file or directory
  744. dir_name = base_dir_name(file_name)
  745. if dir_name == MAC_OS_ZIP_HIDDEN_DIR_NAME:
  746. continue
  747. top_level_directory = dir_name
  748. else:
  749. # Confirm that all other files
  750. # belong to the same top_level_directory
  751. if is_top_level_file(file_name) or base_dir_name(file_name) not in [
  752. top_level_directory,
  753. MAC_OS_ZIP_HIDDEN_DIR_NAME,
  754. ]:
  755. return None
  756. return top_level_directory
  757. def remove_dir_from_filepaths(base_dir: str, rdir: str):
  758. """
  759. base_dir: String path of the directory containing rdir
  760. rdir: String path of directory relative to base_dir whose contents should
  761. be moved to its base_dir, its parent directory
  762. Removes rdir from the filepaths of all files and directories inside it.
  763. In other words, moves all the files inside rdir to the directory that
  764. contains rdir. Assumes base_dir's contents and rdir's contents have no
  765. name conflicts.
  766. """
  767. # Move rdir to a temporary directory, so its contents can be moved to
  768. # base_dir without any name conflicts
  769. with TemporaryDirectory() as tmp_dir:
  770. # Apply extended-length path to temp directory to handle long paths
  771. extended_tmp_dir = _to_extended_length_path(tmp_dir)
  772. # shutil.move() is used instead of os.rename() in case rdir and tmp_dir
  773. # are located on separate file systems
  774. shutil.move(os.path.join(base_dir, rdir), os.path.join(extended_tmp_dir, rdir))
  775. # Shift children out of rdir and into base_dir
  776. rdir_children = os.listdir(os.path.join(extended_tmp_dir, rdir))
  777. for child in rdir_children:
  778. shutil.move(
  779. os.path.join(extended_tmp_dir, rdir, child),
  780. os.path.join(base_dir, child),
  781. )
  782. def unzip_package(
  783. package_path: str,
  784. target_dir: str,
  785. remove_top_level_directory: bool,
  786. unlink_zip: bool,
  787. logger: Optional[logging.Logger] = default_logger,
  788. ) -> None:
  789. """
  790. Unzip the compressed package contained at package_path to target_dir.
  791. If remove_top_level_directory is True and the top level consists of a
  792. a single directory (or possibly also a second hidden directory named
  793. __MACOSX at the top level arising from macOS's zip command), the function
  794. will automatically remove the top-level directory and store the contents
  795. directly in target_dir.
  796. Otherwise, if remove_top_level_directory is False or if the top level
  797. consists of multiple files or directories (not counting __MACOS),
  798. the zip contents will be stored in target_dir.
  799. Args:
  800. package_path: String path of the compressed package to unzip.
  801. target_dir: String path of the directory to store the unzipped contents.
  802. remove_top_level_directory: Whether to remove the top-level directory
  803. from the zip contents.
  804. unlink_zip: Whether to unlink the zip file stored at package_path.
  805. logger: Optional logger to use for logging.
  806. """
  807. # Use extended-length paths on Windows to avoid MAX_PATH limitations
  808. extended_target_dir = _to_extended_length_path(target_dir)
  809. try:
  810. os.mkdir(extended_target_dir)
  811. except FileExistsError:
  812. logger.info(f"Directory at {target_dir} already exists")
  813. logger.debug(f"Unpacking {package_path} to {extended_target_dir}")
  814. with ZipFile(str(package_path), "r") as zip_ref:
  815. # ZipFile.extractall() doesn't support extended paths
  816. # on Windows, which are needed to handle paths longer than 260
  817. # characters, so we implement our own extraction logic here.
  818. for member in zip_ref.namelist():
  819. # Build the full extraction path with extended-length prefix
  820. member_path = os.path.join(extended_target_dir, member)
  821. member_path = _to_extended_length_path(member_path)
  822. # Ensure the resolved path is within target_dir to prevent
  823. # path traversal attacks (e.g., ../../../etc/malicious).
  824. # Use os.path.commonpath to verify both paths share the same root
  825. try:
  826. common = os.path.commonpath([extended_target_dir, member_path])
  827. if not common.startswith(extended_target_dir):
  828. logger.warning(f"Skipping unsafe path in zip: {member}")
  829. continue
  830. except ValueError:
  831. # Paths on different drives (Windows)
  832. logger.warning(f"Skipping path on different drive in zip: {member}")
  833. continue
  834. logger.debug(f"Extracting {member} to {member_path}")
  835. # Get ZipInfo for this member to access metadata
  836. zip_info = zip_ref.getinfo(member)
  837. # Create directories if this is a directory entry
  838. if member.endswith("/"):
  839. os.makedirs(member_path, exist_ok=True)
  840. else:
  841. # Ensure parent directory exists
  842. parent_dir = os.path.dirname(member_path)
  843. if parent_dir:
  844. os.makedirs(parent_dir, exist_ok=True)
  845. # Extract the file
  846. with zip_ref.open(member) as source, open(member_path, "wb") as target:
  847. shutil.copyfileobj(source, target)
  848. # Preserve file permissions from the zip archive
  849. # ZipInfo.external_attr contains Unix file mode in upper 16 bits
  850. if zip_info.external_attr:
  851. # Extract Unix file mode from external_attr
  852. mode = zip_info.external_attr >> 16
  853. if mode:
  854. os.chmod(member_path, mode)
  855. if remove_top_level_directory:
  856. top_level_directory = get_top_level_dir_from_compressed_package(package_path)
  857. if top_level_directory is not None:
  858. # Remove __MACOSX directory if it exists
  859. # Use extended path to handle long paths on Windows
  860. macos_dir = _to_extended_length_path(
  861. os.path.join(target_dir, MAC_OS_ZIP_HIDDEN_DIR_NAME)
  862. )
  863. if os.path.isdir(macos_dir):
  864. shutil.rmtree(macos_dir)
  865. # Use extended path for cleanup operations
  866. remove_dir_from_filepaths(extended_target_dir, top_level_directory)
  867. if unlink_zip:
  868. Path(package_path).unlink()
  869. def delete_package(pkg_uri: str, base_directory: str) -> Tuple[bool, int]:
  870. """Deletes a specific URI from the local filesystem.
  871. Args:
  872. pkg_uri: URI to delete.
  873. Returns:
  874. bool: True if the URI was successfully deleted, else False.
  875. """
  876. deleted = False
  877. path = Path(_get_local_path(base_directory, pkg_uri))
  878. with FileLock(str(path) + ".lock"):
  879. path = path.with_suffix("")
  880. if path.exists():
  881. if path.is_dir() and not path.is_symlink():
  882. shutil.rmtree(str(path))
  883. else:
  884. path.unlink()
  885. deleted = True
  886. return deleted
  887. async def install_wheel_package(
  888. wheel_uri: str,
  889. target_dir: str,
  890. logger: Optional[logging.Logger] = default_logger,
  891. ) -> None:
  892. """Install packages in the wheel URI, and then delete the local wheel file."""
  893. pip_install_cmd = [
  894. "pip",
  895. "install",
  896. wheel_uri,
  897. f"--target={target_dir}",
  898. ]
  899. logger.info("Running py_modules wheel install command: %s", str(pip_install_cmd))
  900. try:
  901. # TODO(architkulkarni): Use `await check_output_cmd` or similar.
  902. exit_code, output = exec_cmd_stream_to_logger(pip_install_cmd, logger)
  903. finally:
  904. wheel_uri_path = Path(wheel_uri)
  905. if wheel_uri_path.exists():
  906. if wheel_uri_path.is_dir():
  907. shutil.rmtree(wheel_uri)
  908. else:
  909. Path(wheel_uri).unlink()
  910. if exit_code != 0:
  911. if Path(target_dir).exists():
  912. shutil.rmtree(target_dir)
  913. raise RuntimeError(
  914. f"Failed to install py_modules wheel {wheel_uri}"
  915. f"to {target_dir}:\n{output}"
  916. )