state_cli.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328
  1. import json
  2. import logging
  3. from datetime import datetime
  4. from enum import Enum, unique
  5. from typing import Dict, List, Optional, Tuple
  6. import click
  7. import yaml
  8. import ray._private.services as services
  9. from ray._common.network_utils import parse_address
  10. from ray._private.thirdparty.tabulate.tabulate import tabulate
  11. from ray.util.annotations import PublicAPI
  12. from ray.util.state import (
  13. StateApiClient,
  14. get_log,
  15. list_logs,
  16. summarize_actors,
  17. summarize_objects,
  18. summarize_tasks,
  19. )
  20. from ray.util.state.common import (
  21. DEFAULT_LIMIT,
  22. DEFAULT_LOG_LIMIT,
  23. DEFAULT_RPC_TIMEOUT,
  24. GetApiOptions,
  25. ListApiOptions,
  26. PredicateType,
  27. StateResource,
  28. StateSchema,
  29. SupportedFilterType,
  30. resource_to_schema,
  31. )
  32. from ray.util.state.exception import RayStateApiException
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@unique
class AvailableFormat(Enum):
    """Output formats selectable through the `--format` option."""

    # DEFAULT is rendered as a table by `output_with_format`.
    DEFAULT = "default"
    JSON = "json"
    YAML = "yaml"
    TABLE = "table"
  40. def _parse_filter(filter: str) -> Tuple[str, PredicateType, SupportedFilterType]:
  41. """Parse the filter string to a tuple of key, preciate, and value."""
  42. # The function assumes there's going to be no key that includes "="" or "!=".
  43. # Since key is controlled by us, it should be trivial to keep the invariant.
  44. predicate = None
  45. # Tuple of [predicate_start, predicate_end).
  46. predicate_index = None
  47. # Find the first predicate match. This logic works because we assume the
  48. # key doesn't contain = or !=.
  49. for i in range(len(filter)):
  50. char = filter[i]
  51. if char == "=":
  52. predicate = "="
  53. predicate_index = (i, i + 1)
  54. break
  55. elif char == "!":
  56. if len(filter) <= i + 1:
  57. continue
  58. next_char = filter[i + 1]
  59. if next_char == "=":
  60. predicate = "!="
  61. predicate_index = (i, i + 2)
  62. break
  63. if not predicate or not predicate_index:
  64. raise ValueError(
  65. f"The format of a given filter {filter} is invalid: "
  66. "Cannot find the predicate. "
  67. "Please provide key=val or key!=val format string."
  68. )
  69. key, predicate, value = (
  70. filter[: predicate_index[0]],
  71. filter[predicate_index[0] : predicate_index[1]],
  72. filter[predicate_index[1] :],
  73. )
  74. assert predicate == "=" or predicate == "!="
  75. if len(key) == 0 or len(value) == 0:
  76. raise ValueError(
  77. f"The format of a given filter {filter} is invalid: "
  78. f"Cannot identify key {key} or value, {value}. "
  79. "Please provide key=val or key!=val format string."
  80. )
  81. return (key, predicate, value)
  82. def _get_available_formats() -> List[str]:
  83. """Return the available formats in a list of string"""
  84. return [format_enum.value for format_enum in AvailableFormat]
  85. def _get_available_resources(
  86. excluded: Optional[List[StateResource]] = None,
  87. ) -> List[str]:
  88. """Return the available resources in a list of string
  89. Args:
  90. excluded: List of resources that should be excluded
  91. """
  92. # All resource names use '_' rather than '-'. But users options have '-'
  93. return [
  94. e.value.replace("_", "-")
  95. for e in StateResource
  96. if excluded is None or e not in excluded
  97. ]
  98. def get_table_output(state_data: List, schema: StateSchema, detail: bool) -> str:
  99. """Display the table output.
  100. The table headers are ordered as the order defined in the dataclass of
  101. `StateSchema`. For example,
  102. @dataclass
  103. class A(StateSchema):
  104. a: str
  105. b: str
  106. c: str
  107. will create headers
  108. A B C
  109. -----
  110. Args:
  111. state_data: A list of state data.
  112. schema: The schema for the corresponding resource.
  113. Returns:
  114. The table formatted string.
  115. """
  116. time = datetime.now()
  117. header = "=" * 8 + f" List: {time} " + "=" * 8
  118. headers = []
  119. table = []
  120. cols = schema.list_columns(detail=detail)
  121. for data in state_data:
  122. for key, val in data.items():
  123. if isinstance(val, dict):
  124. data[key] = yaml.dump(val, indent=2)
  125. keys = set(data.keys())
  126. headers = []
  127. for col in cols:
  128. if col in keys:
  129. headers.append(col.upper())
  130. table.append([data[header.lower()] for header in headers])
  131. return f"""
  132. {header}
  133. Stats:
  134. ------------------------------
  135. Total: {len(state_data)}
  136. Table:
  137. ------------------------------
  138. {tabulate(table, headers=headers, showindex=True, tablefmt="plain", floatfmt=".3f")}
  139. """
  140. def output_with_format(
  141. state_data: List[Dict],
  142. *,
  143. schema: Optional[StateSchema],
  144. format: AvailableFormat = AvailableFormat.DEFAULT,
  145. detail: bool = False,
  146. ) -> str:
  147. # humanify all input state data
  148. if schema:
  149. state_data = [schema.humanify(state) for state in state_data]
  150. if format == AvailableFormat.DEFAULT:
  151. return get_table_output(state_data, schema, detail)
  152. if format == AvailableFormat.YAML:
  153. return yaml.dump(
  154. state_data,
  155. indent=4,
  156. explicit_start=True,
  157. # We want to keep the defined ordering of the states, thus sort_keys=False
  158. sort_keys=False,
  159. explicit_end=True,
  160. )
  161. elif format == AvailableFormat.JSON:
  162. return json.dumps(state_data)
  163. elif format == AvailableFormat.TABLE:
  164. return get_table_output(state_data, schema, detail)
  165. else:
  166. raise ValueError(
  167. f"Unexpected format: {format}. "
  168. f"Supported formatting: {_get_available_formats()}"
  169. )
  170. def format_summary_output(state_data: Dict, *, resource: StateResource) -> str:
  171. if len(state_data) == 0:
  172. return "No resource in the cluster"
  173. # Parse the data.
  174. cluster_data = state_data["cluster"]
  175. summaries = cluster_data["summary"]
  176. summary_by = cluster_data["summary_by"]
  177. del cluster_data["summary_by"]
  178. del cluster_data["summary"]
  179. cluster_info_table = yaml.dump(cluster_data, indent=2)
  180. # Create a table.
  181. table = []
  182. headers = []
  183. for summary in summaries.values():
  184. # Convert dict to yaml for better formatting.
  185. for key, val in summary.items():
  186. if isinstance(val, dict):
  187. summary[key] = yaml.dump(val, indent=2)
  188. headers = sorted([key.upper() for key in summary.keys()])
  189. table.append([summary[header.lower()] for header in headers])
  190. summary_table = tabulate(
  191. table, headers=headers, showindex=True, tablefmt="plain", numalign="left"
  192. )
  193. time = datetime.now()
  194. header = "=" * 8 + f" {resource.value.capitalize()} Summary: {time} " + "=" * 8
  195. return f"""
  196. {header}
  197. Stats:
  198. ------------------------------------
  199. {cluster_info_table}
  200. Table (group by {summary_by}):
  201. ------------------------------------
  202. {summary_table}
  203. """
  204. def format_object_summary_output(state_data: Dict) -> str:
  205. if len(state_data) == 0:
  206. return "No resource in the cluster"
  207. # Parse the data.
  208. cluster_data = state_data["cluster"]
  209. summaries = cluster_data["summary"]
  210. summary_by = cluster_data["summary_by"]
  211. del cluster_data["summary_by"]
  212. del cluster_data["summary"]
  213. cluster_info_table = yaml.dump(cluster_data, indent=2)
  214. # Create a table per callsite.
  215. tables = []
  216. for callsite, summary in summaries.items():
  217. # Convert dict to yaml for better formatting.
  218. for key, val in summary.items():
  219. if isinstance(val, dict):
  220. summary[key] = yaml.dump(val, indent=2)
  221. table = []
  222. headers = sorted([key.upper() for key in summary.keys()])
  223. table.append([summary[header.lower()] for header in headers])
  224. table_for_callsite = tabulate(
  225. table, headers=headers, showindex=True, numalign="left"
  226. )
  227. # Format callsite. | is a separator for ray callsite.
  228. formatted_callsite = callsite.replace("|", "\n|")
  229. tables.append(f"{formatted_callsite}\n{table_for_callsite}")
  230. time = datetime.now()
  231. header = "=" * 8 + f" Object Summary: {time} " + "=" * 8
  232. table_string = "\n\n\n\n".join(tables)
  233. return f"""
  234. {header}
  235. Stats:
  236. ------------------------------------
  237. {cluster_info_table}
  238. Table (group by {summary_by})
  239. ------------------------------------
  240. {table_string}
  241. """
  242. def format_get_api_output(
  243. state_data: Optional[StateSchema],
  244. id: str,
  245. *,
  246. schema: StateSchema,
  247. format: AvailableFormat = AvailableFormat.YAML,
  248. ) -> str:
  249. if not state_data or isinstance(state_data, list) and len(state_data) == 0:
  250. return f"Resource with id={id} not found in the cluster."
  251. if not isinstance(state_data, list):
  252. state_data = [state_data]
  253. state_data = [state.asdict() for state in state_data]
  254. return output_with_format(state_data, schema=schema, format=format, detail=True)
  255. def format_list_api_output(
  256. state_data: List[StateSchema],
  257. *,
  258. schema: StateSchema,
  259. format: AvailableFormat = AvailableFormat.DEFAULT,
  260. detail: bool = False,
  261. ) -> str:
  262. if len(state_data) == 0:
  263. return "No resource in the cluster"
  264. state_data = [state.asdict() for state in state_data]
  265. return output_with_format(state_data, schema=schema, format=format, detail=detail)
  266. def _should_explain(format: AvailableFormat) -> bool:
  267. # If the format is json or yaml, it should not print stats because
  268. # users don't want additional strings.
  269. return format == AvailableFormat.DEFAULT or format == AvailableFormat.TABLE
"""
Common Options for State API commands
"""
# `--timeout`: applied to the get, list and summary commands below.
timeout_option = click.option(
    "--timeout",
    default=DEFAULT_RPC_TIMEOUT,
    help=f"Timeout in seconds for the API requests. Default is {DEFAULT_RPC_TIMEOUT}",
)
# `--address`: optional; defaults to None.
address_option = click.option(
    "--address",
    default=None,
    help=(
        "The address of Ray API server. If not provided, it will be configured "
        "automatically from querying the GCS server."
    ),
)
  286. @click.command()
  287. @click.argument(
  288. "resource",
  289. # NOTE(rickyyx): We are not allowing query job with id, and runtime envs
  290. type=click.Choice(
  291. _get_available_resources(
  292. excluded=[StateResource.JOBS, StateResource.RUNTIME_ENVS]
  293. )
  294. ),
  295. )
  296. @click.argument(
  297. "id",
  298. type=str,
  299. required=False,
  300. )
  301. @address_option
  302. @timeout_option
  303. @PublicAPI(stability="stable")
  304. def ray_get(
  305. resource: str,
  306. id: str,
  307. address: Optional[str],
  308. timeout: float,
  309. ):
  310. """Get a state of a given resource by ID.
  311. We currently DO NOT support get by id for jobs and runtime-envs
  312. The output schema is defined at :ref:`State API Schema section. <state-api-schema>`
  313. For example, the output schema of `ray get tasks <task-id>` is
  314. :class:`~ray.util.state.common.TaskState`.
  315. Usage:
  316. Get an actor with actor id <actor-id>
  317. ```
  318. ray get actors <actor-id>
  319. ```
  320. Get a placement group information with <placement-group-id>
  321. ```
  322. ray get placement-groups <placement-group-id>
  323. ```
  324. The API queries one or more components from the cluster to obtain the data.
  325. The returned state snapshot could be stale, and it is not guaranteed to return
  326. the live data.
  327. Args:
  328. resource: The type of the resource to query.
  329. id: The id of the resource.
  330. Raises:
  331. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  332. if the CLI is failed to query the data.
  333. """ # noqa: E501
  334. if not id:
  335. raise click.BadParameter(
  336. f"Missing argument 'ID'. Do you mean 'ray list {resource}'?"
  337. )
  338. # All resource names use '_' rather than '-'. But users options have '-'
  339. resource = StateResource(resource.replace("-", "_"))
  340. # Create the State API server and put it into context
  341. logger.debug(f"Create StateApiClient to ray instance at: {address}...")
  342. client = StateApiClient(address=address)
  343. options = GetApiOptions(timeout=timeout)
  344. # If errors occur, exceptions will be thrown.
  345. try:
  346. data = client.get(
  347. resource=resource,
  348. id=id,
  349. options=options,
  350. _explain=_should_explain(AvailableFormat.YAML),
  351. )
  352. except RayStateApiException as e:
  353. raise click.UsageError(str(e))
  354. # Print data to console.
  355. print(
  356. format_get_api_output(
  357. state_data=data,
  358. id=id,
  359. schema=resource_to_schema(resource),
  360. format=AvailableFormat.YAML,
  361. )
  362. )
  363. @click.command()
  364. @click.argument(
  365. "resource",
  366. type=click.Choice(_get_available_resources()),
  367. )
  368. @click.option(
  369. "--format", default="default", type=click.Choice(_get_available_formats())
  370. )
  371. @click.option(
  372. "-f",
  373. "--filter",
  374. help=(
  375. "A key, predicate, and value to filter the result. "
  376. "E.g., --filter 'key=value' or --filter 'key!=value'. "
  377. "You can specify multiple --filter options. In this case all predicates "
  378. "are concatenated as AND. For example, --filter key=value --filter key2=value "
  379. "means (key==val) AND (key2==val2), "
  380. "String filter values are case-insensitive."
  381. ),
  382. multiple=True,
  383. )
  384. @click.option(
  385. "--limit",
  386. default=DEFAULT_LIMIT,
  387. type=int,
  388. help=("Maximum number of entries to return. 100 by default."),
  389. )
  390. @click.option(
  391. "--detail",
  392. help=(
  393. "If the flag is set, the output will contain data in more details. "
  394. "Note that the API could query more sources "
  395. "to obtain information in a greater detail."
  396. ),
  397. is_flag=True,
  398. default=False,
  399. )
  400. @timeout_option
  401. @address_option
  402. @PublicAPI(stability="stable")
  403. def ray_list(
  404. resource: str,
  405. format: str,
  406. filter: List[str],
  407. limit: int,
  408. detail: bool,
  409. timeout: float,
  410. address: str,
  411. ):
  412. """List all states of a given resource.
  413. Normally, summary APIs are recommended before listing all resources.
  414. The output schema is defined at :ref:`State API Schema section. <state-api-schema>`
  415. For example, the output schema of `ray list tasks` is
  416. :class:`~ray.util.state.common.TaskState`.
  417. Usage:
  418. List all actor information from the cluster.
  419. ```
  420. ray list actors
  421. ```
  422. List 50 actors from the cluster. The sorting order cannot be controlled.
  423. ```
  424. ray list actors --limit 50
  425. ```
  426. List 10 actors with state PENDING.
  427. ```
  428. ray list actors --limit 10 --filter "state=PENDING"
  429. ```
  430. List actors with yaml format.
  431. ```
  432. ray list actors --format yaml
  433. ```
  434. List actors with details. When --detail is specified, it might query
  435. more data sources to obtain data in details.
  436. ```
  437. ray list actors --detail
  438. ```
  439. The API queries one or more components from the cluster to obtain the data.
  440. The returned state snapshot could be stale, and it is not guaranteed to return
  441. the live data.
  442. The API can return partial or missing output upon the following scenarios.
  443. - When the API queries more than 1 component, if some of them fail,
  444. the API will return the partial result (with a suppressible warning).
  445. - When the API returns too many entries, the API
  446. will truncate the output. Currently, truncated data cannot be
  447. selected by users.
  448. Args:
  449. resource: The type of the resource to query.
  450. Raises:
  451. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  452. if the CLI is failed to query the data.
  453. Changes:
  454. - changed in version 2.7: --filter values are case-insensitive.
  455. """ # noqa: E501
  456. # All resource names use '_' rather than '-'. But users options have '-'
  457. resource = StateResource(resource.replace("-", "_"))
  458. format = AvailableFormat(format)
  459. # Create the State API server and put it into context
  460. client = StateApiClient(address=address)
  461. filter = [_parse_filter(f) for f in filter]
  462. options = ListApiOptions(
  463. limit=limit,
  464. timeout=timeout,
  465. filters=filter,
  466. detail=detail,
  467. )
  468. # If errors occur, exceptions will be thrown. Empty data indicate successful query.
  469. try:
  470. data = client.list(
  471. resource,
  472. options=options,
  473. raise_on_missing_output=False,
  474. _explain=_should_explain(format),
  475. )
  476. except RayStateApiException as e:
  477. raise click.UsageError(str(e))
  478. # If --detail is given, the default formatting is yaml.
  479. if detail and format == AvailableFormat.DEFAULT:
  480. format = AvailableFormat.YAML
  481. # Print data to console.
  482. print(
  483. format_list_api_output(
  484. state_data=data,
  485. schema=resource_to_schema(resource),
  486. format=format,
  487. detail=detail,
  488. )
  489. )
  490. @click.group("summary")
  491. @click.pass_context
  492. @PublicAPI(stability="stable")
  493. def summary_state_cli_group(ctx):
  494. """Return the summarized information of a given resource."""
  495. pass
  496. @summary_state_cli_group.command(name="tasks")
  497. @timeout_option
  498. @address_option
  499. @click.pass_context
  500. @PublicAPI(stability="stable")
  501. def task_summary(ctx, timeout: float, address: str):
  502. """Summarize the task state of the cluster.
  503. By default, the output contains the information grouped by
  504. task function names.
  505. The output schema is
  506. :class:`~ray.util.state.common.TaskSummaries`.
  507. Raises:
  508. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  509. if the CLI is failed to query the data.
  510. """ # noqa: E501
  511. print(
  512. format_summary_output(
  513. summarize_tasks(
  514. address=address,
  515. timeout=timeout,
  516. raise_on_missing_output=False,
  517. _explain=True,
  518. ),
  519. resource=StateResource.TASKS,
  520. )
  521. )
  522. @summary_state_cli_group.command(name="actors")
  523. @timeout_option
  524. @address_option
  525. @click.pass_context
  526. @PublicAPI(stability="stable")
  527. def actor_summary(ctx, timeout: float, address: str):
  528. """Summarize the actor state of the cluster.
  529. By default, the output contains the information grouped by
  530. actor class names.
  531. The output schema is
  532. :class:`ray.util.state.common.ActorSummaries
  533. <ray.util.state.common.ActorSummaries>`.
  534. Raises:
  535. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  536. if the CLI is failed to query the data.
  537. """ # noqa: E501
  538. print(
  539. format_summary_output(
  540. summarize_actors(
  541. address=address,
  542. timeout=timeout,
  543. raise_on_missing_output=False,
  544. _explain=True,
  545. ),
  546. resource=StateResource.ACTORS,
  547. )
  548. )
  549. @summary_state_cli_group.command(name="objects")
  550. @timeout_option
  551. @address_option
  552. @click.pass_context
  553. @PublicAPI(stability="stable")
  554. def object_summary(ctx, timeout: float, address: str):
  555. """Summarize the object state of the cluster.
  556. The API is recommended when debugging memory leaks.
  557. See :ref:`Debugging with Ray Memory <debug-with-ray-memory>` for more details.
  558. (Note that this command is almost equivalent to `ray memory`, but it returns
  559. easier-to-understand output).
  560. By default, the output contains the information grouped by
  561. object callsite. Note that the callsite is not collected and
  562. all data will be aggregated as "disable" callsite if the env var
  563. `RAY_record_ref_creation_sites` is not configured. To enable the
  564. callsite collection, set the following environment variable when
  565. starting Ray.
  566. Example:
  567. ```
  568. RAY_record_ref_creation_sites=1 ray start --head
  569. ```
  570. ```
  571. RAY_record_ref_creation_sites=1 ray_script.py
  572. ```
  573. The output schema is
  574. :class:`ray.util.state.common.ObjectSummaries
  575. <ray.util.state.common.ObjectSummaries>`.
  576. Raises:
  577. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  578. if the CLI is failed to query the data.
  579. """ # noqa: E501
  580. print(
  581. format_object_summary_output(
  582. summarize_objects(
  583. address=address,
  584. timeout=timeout,
  585. raise_on_missing_output=False,
  586. _explain=True,
  587. ),
  588. )
  589. )
  590. log_follow_option = click.option(
  591. "--follow",
  592. "-f",
  593. required=False,
  594. type=bool,
  595. is_flag=True,
  596. help="Streams the log file as it is updated instead of just tailing.",
  597. )
  598. log_tail_option = click.option(
  599. "--tail",
  600. required=False,
  601. type=int,
  602. default=DEFAULT_LOG_LIMIT,
  603. help="Number of lines to tail from log. Use -1 to fetch the whole file.",
  604. )
  605. log_interval_option = click.option(
  606. "--interval",
  607. required=False,
  608. type=float,
  609. default=None,
  610. help="The interval in secs to print new logs when `--follow` is specified.",
  611. hidden=True,
  612. )
  613. log_timeout_option = click.option(
  614. "--timeout",
  615. default=DEFAULT_RPC_TIMEOUT,
  616. help=(
  617. "Timeout in seconds for the API requests. "
  618. f"Default is {DEFAULT_RPC_TIMEOUT}. If --follow is specified, "
  619. "this option will be ignored."
  620. ),
  621. )
  622. log_node_ip_option = click.option(
  623. "-ip",
  624. "--node-ip",
  625. required=False,
  626. type=str,
  627. default=None,
  628. help="Filters the logs by this ip address",
  629. )
  630. log_node_id_option = click.option(
  631. "--node-id",
  632. "-id",
  633. required=False,
  634. type=str,
  635. default=None,
  636. help="Filters the logs by this NodeID",
  637. )
  638. log_suffix_option = click.option(
  639. "--err",
  640. is_flag=True,
  641. default=False,
  642. help=(
  643. "If supplied, querying stderr files for workers/actors, "
  644. "else defaults to stdout files."
  645. ),
  646. )
  647. log_encoding_option = click.option(
  648. "--encoding",
  649. required=False,
  650. default="utf-8",
  651. help=(
  652. "The encoding use to decode the log file. Accepts any encoding "
  653. "supported by Python's `codecs` module. Defaults to utf-8."
  654. ),
  655. )
  656. log_encoding_errors_option = click.option(
  657. "--encoding-errors",
  658. required=False,
  659. default="strict",
  660. help=(
  661. "The error handling scheme to use for decoding errors. "
  662. "Accepts any error handling scheme supported by Python's `codecs`"
  663. "module. Defaults to strict."
  664. ),
  665. )
  666. def _get_head_node_ip(address: Optional[str] = None):
  667. """Get the head node ip from the ray address if possible
  668. Args:
  669. address: ray cluster address, e.g. "auto", "localhost:6379"
  670. Raises:
  671. click.UsageError: if node ip could not be resolved
  672. """
  673. try:
  674. address = services.canonicalize_bootstrap_address_or_die(address)
  675. return parse_address(address)[0]
  676. except (ConnectionError, ValueError) as e:
  677. # Hide all the stack trace
  678. raise click.UsageError(str(e))
  679. def _print_log(
  680. address: Optional[str] = None,
  681. node_id: Optional[str] = None,
  682. node_ip: Optional[str] = None,
  683. filename: Optional[str] = None,
  684. actor_id: Optional[str] = None,
  685. pid: Optional[int] = None,
  686. follow: bool = False,
  687. tail: int = DEFAULT_LOG_LIMIT,
  688. timeout: int = DEFAULT_RPC_TIMEOUT,
  689. interval: Optional[float] = None,
  690. suffix: str = "out",
  691. encoding: str = "utf-8",
  692. encoding_errors: str = "strict",
  693. task_id: Optional[str] = None,
  694. attempt_number: int = 0,
  695. submission_id: Optional[str] = None,
  696. ):
  697. """Wrapper around `get_log()` that prints the preamble and the log lines"""
  698. if tail > 0:
  699. print(
  700. f"--- Log has been truncated to last {tail} lines."
  701. " Use `--tail` flag to toggle. Set to -1 for getting the entire file. ---\n"
  702. )
  703. if node_id is None and node_ip is None:
  704. # Auto detect node ip from the ray address when address neither is given
  705. node_ip = _get_head_node_ip(address)
  706. for chunk in get_log(
  707. address=address,
  708. node_id=node_id,
  709. node_ip=node_ip,
  710. filename=filename,
  711. actor_id=actor_id,
  712. tail=tail,
  713. pid=pid,
  714. follow=follow,
  715. _interval=interval,
  716. timeout=timeout,
  717. suffix=suffix,
  718. encoding=encoding,
  719. errors=encoding_errors,
  720. task_id=task_id,
  721. attempt_number=attempt_number,
  722. submission_id=submission_id,
  723. ):
  724. print(chunk, end="", flush=True)
  725. LOG_CLI_HELP_MSG = """
  726. Get logs based on filename (cluster) or resource identifiers (actor)
  727. Example:
  728. Get all the log files available on a node (ray address could be
  729. obtained from `ray start --head` or `ray.init()`).
  730. ```
  731. ray logs cluster
  732. ```
  733. [ray logs cluster] Print the last 500 lines of raylet.out on a head node.
  734. ```
  735. ray logs cluster raylet.out --tail 500
  736. ```
  737. Or simply, using `ray logs` as an alias for `ray logs cluster`:
  738. ```
  739. ray logs raylet.out --tail 500
  740. ```
  741. Print the last 500 lines of raylet.out on a worker node id A.
  742. ```
  743. ray logs raylet.out --tail 500 —-node-id A
  744. ```
  745. [ray logs actor] Follow the log file with an actor id ABC.
  746. ```
  747. ray logs actor --id ABC --follow
  748. ```
  749. [ray logs task] Get the std err generated by a task.
  750. Note: If a task is from a concurrent actor (i.e. an async actor or
  751. a threaded actor), the log of the tasks are expected to be interleaved.
  752. Please use `ray logs actor --id <actor_id>` for the entire actor log.
  753. ```
  754. ray logs task --id <TASK_ID> --err
  755. ```
  756. """
  757. class LogCommandGroup(click.Group):
  758. def resolve_command(self, ctx, args):
  759. """Try resolve the command line args assuming users omitted the subcommand.
  760. This overrides the default `resolve_command` for the parent class.
  761. This will allow command alias of `ray <glob>` to `ray cluster <glob>`.
  762. """
  763. ctx.resilient_parsing = True
  764. res = super().resolve_command(ctx, args)
  765. cmd_name, cmd, parsed_args = res
  766. if cmd is None:
  767. # It could have been `ray logs ...`, forward to `ray logs cluster ...`
  768. return super().resolve_command(ctx, ["cluster"] + args)
  769. return cmd_name, cmd, parsed_args
# Group that the log subcommands (e.g. "cluster") are registered on.
logs_state_cli_group = LogCommandGroup(help=LOG_CLI_HELP_MSG)
  771. @logs_state_cli_group.command(name="cluster")
  772. @click.argument(
  773. "glob_filter",
  774. required=False,
  775. default="*",
  776. )
  777. @address_option
  778. @log_node_id_option
  779. @log_node_ip_option
  780. @log_follow_option
  781. @log_tail_option
  782. @log_interval_option
  783. @log_timeout_option
  784. @log_encoding_option
  785. @log_encoding_errors_option
  786. @click.pass_context
  787. @PublicAPI(stability="stable")
  788. def log_cluster(
  789. ctx,
  790. glob_filter: str,
  791. address: Optional[str],
  792. node_id: Optional[str],
  793. node_ip: Optional[str],
  794. follow: bool,
  795. tail: int,
  796. interval: float,
  797. timeout: int,
  798. encoding: str,
  799. encoding_errors: str,
  800. ):
  801. """Get/List logs that matches the GLOB_FILTER in the cluster.
  802. By default, it prints a list of log files that match the filter.
  803. By default, it prints the head node logs.
  804. If there's only 1 match, it will print the log file.
  805. Example:
  806. Print the last 500 lines of raylet.out on a head node.
  807. ```
  808. ray logs [cluster] raylet.out --tail 500
  809. ```
  810. Print the last 500 lines of raylet.out on a worker node id A.
  811. ```
  812. ray logs [cluster] raylet.out --tail 500 —-node-id A
  813. ```
  814. Download the gcs_server.txt file to the local machine.
  815. ```
  816. ray logs [cluster] gcs_server.out --tail -1 > gcs_server.txt
  817. ```
  818. Follow the log files from the last 100 lines.
  819. ```
  820. ray logs [cluster] raylet.out --tail 100 -f
  821. ```
  822. Raises:
  823. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>` if the CLI
  824. is failed to query the data.
  825. """ # noqa: E501
  826. if node_id is None and node_ip is None:
  827. node_ip = _get_head_node_ip(address)
  828. logs = list_logs(
  829. address=address,
  830. node_id=node_id,
  831. node_ip=node_ip,
  832. glob_filter=glob_filter,
  833. timeout=timeout,
  834. )
  835. log_files_found = []
  836. for _, log_files in logs.items():
  837. for log_file in log_files:
  838. log_files_found.append(log_file)
  839. if len(log_files_found) != 1:
  840. # Print the list of log files found if no unique log found
  841. if node_id:
  842. print(f"Node ID: {node_id}")
  843. elif node_ip:
  844. print(f"Node IP: {node_ip}")
  845. print(output_with_format(logs, schema=None, format=AvailableFormat.YAML))
  846. return
  847. # If there's only 1 file, that means there's a unique match.
  848. filename = log_files_found[0]
  849. _print_log(
  850. address=address,
  851. node_id=node_id,
  852. node_ip=node_ip,
  853. filename=filename,
  854. tail=tail,
  855. follow=follow,
  856. interval=interval,
  857. timeout=timeout,
  858. encoding=encoding,
  859. encoding_errors=encoding_errors,
  860. )
  861. @logs_state_cli_group.command(name="actor")
  862. @click.option(
  863. "--id",
  864. "-a",
  865. required=False,
  866. type=str,
  867. default=None,
  868. help="Retrieves the logs corresponding to this ActorID.",
  869. )
  870. @click.option(
  871. "--pid",
  872. "-pid",
  873. required=False,
  874. type=str,
  875. default=None,
  876. help="Retrieves the logs from the actor with this pid.",
  877. )
  878. @address_option
  879. @log_node_id_option
  880. @log_node_ip_option
  881. @log_follow_option
  882. @log_tail_option
  883. @log_interval_option
  884. @log_timeout_option
  885. @log_suffix_option
  886. @click.pass_context
  887. @PublicAPI(stability="stable")
  888. def log_actor(
  889. ctx,
  890. id: Optional[str],
  891. pid: Optional[str],
  892. address: Optional[str],
  893. node_id: Optional[str],
  894. node_ip: Optional[str],
  895. follow: bool,
  896. tail: int,
  897. interval: float,
  898. timeout: int,
  899. err: bool,
  900. ):
  901. """Get/List logs associated with an actor.
  902. Example:
  903. Follow the log file with an actor id ABCDEFG.
  904. ```
  905. ray logs actor --id ABCDEFG --follow
  906. ```
  907. Get the actor log from pid 123, ip x.x.x.x
  908. Note that this goes well with the driver log of Ray which prints
  909. (ip=x.x.x.x, pid=123, class_name) logs.
  910. ```
  911. ray logs actor --pid=123 —ip=x.x.x.x
  912. ```
  913. Get the actor err log file.
  914. ```
  915. ray logs actor --id ABCDEFG --err
  916. ```
  917. Raises:
  918. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  919. if the CLI is failed to query the data.
  920. MissingParameter: if inputs are missing.
  921. """ # noqa: E501
  922. if pid is None and id is None:
  923. raise click.MissingParameter(
  924. message="At least one of `--pid` and `--id` has to be set",
  925. param_type="option",
  926. )
  927. _print_log(
  928. address=address,
  929. node_id=node_id,
  930. node_ip=node_ip,
  931. pid=pid,
  932. actor_id=id,
  933. tail=tail,
  934. follow=follow,
  935. interval=interval,
  936. timeout=timeout,
  937. suffix="err" if err else "out",
  938. )
  939. @logs_state_cli_group.command(name="worker")
  940. @click.option(
  941. "--pid",
  942. "-pid",
  943. # The only identifier supported for now, TODO(rickyx): add worker id support
  944. required=True,
  945. type=str,
  946. help="Retrieves the logs from the worker with this pid.",
  947. )
  948. @address_option
  949. @log_node_id_option
  950. @log_node_ip_option
  951. @log_follow_option
  952. @log_tail_option
  953. @log_interval_option
  954. @log_timeout_option
  955. @log_suffix_option
  956. @click.pass_context
  957. @PublicAPI(stability="stable")
  958. def log_worker(
  959. ctx,
  960. pid: Optional[str],
  961. address: Optional[str],
  962. node_id: Optional[str],
  963. node_ip: Optional[str],
  964. follow: bool,
  965. tail: int,
  966. interval: float,
  967. timeout: int,
  968. err: bool,
  969. ):
  970. """Get logs associated with a worker process.
  971. Example:
  972. Follow the log file from a worker process with pid=123
  973. ```
  974. ray logs worker --pid 123 --follow
  975. ```
  976. Get the stderr logs from a worker process.
  977. ```
  978. ray logs worker --pid 123 --err
  979. ```
  980. Raises:
  981. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  982. if the CLI is failed to query the data.
  983. MissingParameter: if inputs are missing.
  984. """ # noqa: E501
  985. _print_log(
  986. address=address,
  987. node_id=node_id,
  988. node_ip=node_ip,
  989. pid=pid,
  990. tail=tail,
  991. follow=follow,
  992. interval=interval,
  993. timeout=timeout,
  994. suffix="err" if err else "out",
  995. )
  996. @logs_state_cli_group.command(name="job")
  997. @click.option(
  998. "--id",
  999. "submission_id",
  1000. required=True,
  1001. type=str,
  1002. help=(
  1003. "Retrieves the logs from a submission job with submission id,"
  1004. "i.e. raysubmit_XXX"
  1005. ),
  1006. )
  1007. @address_option
  1008. @log_follow_option
  1009. @log_tail_option
  1010. @log_interval_option
  1011. @log_timeout_option
  1012. @click.pass_context
  1013. @PublicAPI(stability="stable")
  1014. def log_job(
  1015. ctx,
  1016. submission_id: Optional[str],
  1017. address: Optional[str],
  1018. follow: bool,
  1019. tail: int,
  1020. interval: float,
  1021. timeout: int,
  1022. ):
  1023. """Get logs associated with a submission job.
  1024. Example:
  1025. Follow the log file from a submission job with submission id raysumbit_xxx.
  1026. ```
  1027. ray logs job --id raysubmit_xxx
  1028. ```
  1029. Follow the submission job log.
  1030. ```
  1031. ray logs jobs --id raysubmit_xxx --follow
  1032. ```
  1033. Raises:
  1034. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  1035. if the CLI is failed to query the data.
  1036. MissingParameter: if inputs are missing.
  1037. """ # noqa: E501
  1038. _print_log(
  1039. address=address,
  1040. tail=tail,
  1041. follow=follow,
  1042. interval=interval,
  1043. timeout=timeout,
  1044. submission_id=submission_id,
  1045. )
  1046. @logs_state_cli_group.command(name="task")
  1047. @click.option(
  1048. "--id",
  1049. "task_id",
  1050. required=True,
  1051. type=str,
  1052. help="Retrieves the logs from the task with this task id.",
  1053. )
  1054. @click.option(
  1055. "--attempt-number",
  1056. "-a",
  1057. required=False,
  1058. type=int,
  1059. default=0,
  1060. help="Retrieves the logs from the attempt, default to 0",
  1061. )
  1062. @address_option
  1063. @log_follow_option
  1064. @log_interval_option
  1065. @log_tail_option
  1066. @log_timeout_option
  1067. @log_suffix_option
  1068. @click.pass_context
  1069. @PublicAPI(stability="stable")
  1070. def log_task(
  1071. ctx,
  1072. task_id: Optional[str],
  1073. attempt_number: int,
  1074. address: Optional[str],
  1075. follow: bool,
  1076. interval: float,
  1077. tail: int,
  1078. timeout: int,
  1079. err: bool,
  1080. ):
  1081. """Get logs associated with a task.
  1082. Example:
  1083. Follow the log file from a task with task id = ABCDEFG
  1084. ```
  1085. ray logs tasks --id ABCDEFG --follow
  1086. ```
  1087. Get the log from a retry attempt 1 from a task.
  1088. ```
  1089. ray logs tasks --id ABCDEFG -a 1
  1090. ```
  1091. Note: If a task is from a concurrent actor (i.e. an async actor or
  1092. a threaded actor), the log of the tasks are expected to be interleaved.
  1093. Please use `ray logs actor --id <actor_id>` for the entire actor log.
  1094. Raises:
  1095. :class:`RayStateApiException <ray.util.state.exception.RayStateApiException>`
  1096. if the CLI is failed to query the data.
  1097. MissingParameter: if inputs are missing.
  1098. """ # noqa: E501
  1099. _print_log(
  1100. address=address,
  1101. task_id=task_id,
  1102. attempt_number=attempt_number,
  1103. follow=follow,
  1104. tail=tail,
  1105. interval=interval,
  1106. timeout=timeout,
  1107. suffix="err" if err else "out",
  1108. )