schema.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. from enum import Enum
  2. from typing import Dict, List, Optional
  3. from ray._common.pydantic_compat import BaseModel, Field
  4. from ray.dashboard.modules.job.pydantic_models import JobDetails
  5. from ray.util.annotations import DeveloperAPI
  6. MAX_ERROR_STACK_TRACE_LENGTH = 50000
  7. @DeveloperAPI
  8. class RunStatusEnum(str, Enum):
  9. """Enumeration for the status of a train run."""
  10. # (Deprecated) Replaced by RUNNING.
  11. # The train run has started
  12. STARTED = "STARTED"
  13. # The train run is running
  14. RUNNING = "RUNNING"
  15. # The train run was terminated as expected
  16. FINISHED = "FINISHED"
  17. # The train run was terminated early due to errors in the training function
  18. ERRORED = "ERRORED"
  19. # The train run was terminated early due to system errors or controller errors
  20. ABORTED = "ABORTED"
  21. @DeveloperAPI
  22. class ActorStatusEnum(str, Enum):
  23. DEAD = "DEAD"
  24. ALIVE = "ALIVE"
  25. @DeveloperAPI
  26. class TrainWorkerInfo(BaseModel):
  27. """Metadata of a Ray Train worker."""
  28. actor_id: str = Field(description="Actor ID of the worker.")
  29. world_rank: int = Field(description="World rank of the worker.")
  30. local_rank: int = Field(description="Local rank of the worker.")
  31. node_rank: int = Field(description="Node rank of the worker.")
  32. node_id: str = Field(description="ID of the node that the worker is running on.")
  33. node_ip: str = Field(
  34. description="IP address of the node that the worker is running on."
  35. )
  36. pid: int = Field(description="Process ID of the worker.")
  37. gpu_ids: List[int] = Field(
  38. description="A list of GPU ids allocated to that worker."
  39. )
  40. status: ActorStatusEnum = Field(
  41. description="The status of the train worker actor. It can be ALIVE or DEAD."
  42. )
  43. resources: Dict[str, float] = Field(
  44. description="The resources allocated to the worker."
  45. )
  46. @DeveloperAPI
  47. class MemoryInfo(BaseModel):
  48. rss: int
  49. vms: int
  50. pfaults: Optional[int]
  51. pageins: Optional[int]
  52. @DeveloperAPI
  53. class ProcessStats(BaseModel):
  54. cpuPercent: float
  55. # total memory, free memory, memory used ratio
  56. mem: Optional[List[int]]
  57. memoryInfo: MemoryInfo
  58. class ProcessGPUUsage(BaseModel):
  59. # This gpu usage stats from a process
  60. pid: int
  61. gpuMemoryUsage: int
  62. @DeveloperAPI
  63. class GPUStats(BaseModel):
  64. uuid: str
  65. index: int
  66. name: str
  67. utilizationGpu: Optional[float]
  68. memoryUsed: float
  69. memoryTotal: float
  70. processInfo: ProcessGPUUsage
  71. @DeveloperAPI
  72. class TrainWorkerInfoWithDetails(TrainWorkerInfo):
  73. """Metadata of a Ray Train worker."""
  74. processStats: Optional[ProcessStats] = Field(
  75. None, description="Process stats of the worker."
  76. )
  77. gpus: List[GPUStats] = Field(
  78. default_factory=list,
  79. description=(
  80. "GPU stats of the worker. "
  81. "Only returns GPUs that are attached to the worker process."
  82. ),
  83. )
  84. @DeveloperAPI
  85. class TrainDatasetInfo(BaseModel):
  86. name: str = Field(
  87. description="The key of the dataset dict specified in Ray Train Trainer."
  88. )
  89. dataset_uuid: str = Field(description="The uuid of the dataset.")
  90. dataset_name: Optional[str] = Field(description="The name of the dataset.")
  91. @DeveloperAPI
  92. class TrainRunInfo(BaseModel):
  93. """Metadata for a Ray Train run and information about its workers."""
  94. name: str = Field(description="The name of the Train run.")
  95. id: str = Field(description="The unique identifier for each Train run.")
  96. job_id: str = Field(description="The Ray Job ID.")
  97. controller_actor_id: str = Field(description="Actor Id of the Train controller.")
  98. workers: List[TrainWorkerInfo] = Field(
  99. description="A List of Train workers sorted by global ranks."
  100. )
  101. datasets: List[TrainDatasetInfo] = Field(
  102. description="A List of dataset info for this Train run."
  103. )
  104. run_status: RunStatusEnum = Field(
  105. description="The current status of the train run. It can be one of the "
  106. "following: RUNNING, FINISHED, ERRORED, or ABORTED."
  107. )
  108. status_detail: str = Field(
  109. description="Detailed information about the current run status, "
  110. "such as error messages."
  111. )
  112. start_time_ms: int = Field(
  113. description="The UNIX timestamp of the start time of this Train run."
  114. )
  115. end_time_ms: Optional[int] = Field(
  116. description="The UNIX timestamp of the end time of this Train run. "
  117. "If null, the Train run has not ended yet."
  118. )
  119. resources: List[Dict[str, float]] = Field(
  120. description="The resources allocated to the worker."
  121. )
  122. @DeveloperAPI
  123. class TrainRunInfoWithDetails(TrainRunInfo):
  124. """Metadata for a Ray Train run and information about its workers."""
  125. workers: List[TrainWorkerInfoWithDetails] = Field(
  126. description="A List of Train workers sorted by global ranks."
  127. )
  128. job_details: Optional[JobDetails] = Field(
  129. None, description="Details of the job that started this Train run."
  130. )
  131. @DeveloperAPI
  132. class TrainRunsResponse(BaseModel):
  133. train_runs: List[TrainRunInfoWithDetails]