runs.py 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750
  1. """W&B Public API for Runs.
  2. This module provides classes for interacting with W&B runs and their associated
  3. data.
  4. Example:
  5. ```python
  6. from wandb.apis.public import Api
  7. # Get runs matching filters
  8. runs = Api().runs(
  9. path="entity/project", filters={"state": "finished", "config.batch_size": 32}
  10. )
  11. # Access run data
  12. for run in runs:
  13. print(f"Run: {run.name}")
  14. print(f"Config: {run.config}")
  15. print(f"Metrics: {run.summary}")
  16. # Get history with pandas
  17. history_df = run.history(keys=["loss", "accuracy"], pandas=True)
  18. # Work with artifacts
  19. for artifact in run.logged_artifacts():
  20. print(f"Artifact: {artifact.name}")
  21. ```
  22. Note:
  23. This module is part of the W&B Public API and provides read/write access
  24. to run data. For logging new runs, use the wandb.init() function from
  25. the main wandb package.
  26. """
  27. from __future__ import annotations
  28. import json
  29. import os
  30. import pathlib
  31. import tempfile
  32. import time
  33. import urllib
  34. from collections.abc import Collection, Iterator, Mapping
  35. from typing import TYPE_CHECKING, Any, Literal
  36. from typing_extensions import override
  37. from wandb_gql import gql
  38. import wandb
  39. import wandb.apis.public.runhistory as runhistory
  40. from wandb import env, util
  41. from wandb._strutils import nameof
  42. from wandb.apis import public
  43. from wandb.apis._generated import GET_AGENT_RUNS_GQL
  44. from wandb.apis._generated.get_agent_runs import GetAgentRuns
  45. from wandb.apis.attrs import Attrs
  46. from wandb.apis.internal import Api as InternalApi
  47. from wandb.apis.normalize import normalize_exceptions
  48. from wandb.apis.paginator import SizedPaginator
  49. from wandb.apis.public.const import RETRY_TIMEDELTA
  50. from wandb.apis.public.service_api import ServiceApi
  51. from wandb.proto import wandb_api_pb2 as apb
  52. from wandb.sdk import wandb_setup
  53. from wandb.sdk.lib import ipython, json_util, runid
  54. from wandb.sdk.lib.paths import LogicalPath
  55. from wandb.sdk.lib.service.service_connection import WandbApiFailedError
  56. if TYPE_CHECKING:
  57. import pandas as pd
  58. import polars as pl
  59. from typing_extensions import Self
  60. from wandb_graphql.language.ast import Document
  61. from wandb.apis.public import RetryingClient
  62. from wandb.old.summary import HTTPSummary
# Config keys that belong to W&B itself rather than to the user. When a run's
# config is unpacked, entries under these keys are kept out of the
# user-facing config dict.
WANDB_INTERNAL_KEYS = {"_wandb", "wandb_version"}

# Full fragment: every run field this module requests, including the JSON
# payloads (config, systemMetrics, summaryMetrics) and historyKeys.
RUN_FRAGMENT = """fragment RunFragment on Run {
    id
    tags
    name
    displayName
    sweepName
    state
    config
    group
    jobType
    commit
    readOnly
    createdAt
    heartbeatAt
    description
    notes
    systemMetrics
    summaryMetrics
    historyLineCount
    user {
        name
        username
    }
    historyKeys
}"""

# Lightweight fragment for listing operations - excludes heavy fields.
# Compared with RUN_FRAGMENT it omits config, systemMetrics, summaryMetrics
# and historyKeys; everything else is identical.
LIGHTWEIGHT_RUN_FRAGMENT = """fragment LightweightRunFragment on Run {
    id
    tags
    name
    displayName
    sweepName
    state
    group
    jobType
    commit
    readOnly
    createdAt
    heartbeatAt
    description
    notes
    historyLineCount
    user {
        name
        username
    }
}"""

# Fragment name constants to avoid string parsing. Each must match the name
# declared inside the corresponding fragment string above, because queries
# spread the fragment by this name.
RUN_FRAGMENT_NAME = "RunFragment"
LIGHTWEIGHT_RUN_FRAGMENT_NAME = "LightweightRunFragment"
  114. def _create_runs_query(*, lazy: bool) -> gql:
  115. """Create GraphQL query for runs with appropriate fragment."""
  116. fragment = LIGHTWEIGHT_RUN_FRAGMENT if lazy else RUN_FRAGMENT
  117. fragment_name = LIGHTWEIGHT_RUN_FRAGMENT_NAME if lazy else RUN_FRAGMENT_NAME
  118. return gql(
  119. f"""#graphql
  120. query Runs($project: String!, $entity: String!, $cursor: String, $perPage: Int = 50, $order: String, $filters: JSONString) {{
  121. project(name: $project, entityName: $entity) {{
  122. internalId
  123. runCount(filters: $filters)
  124. readOnly
  125. runs(filters: $filters, after: $cursor, first: $perPage, order: $order) {{
  126. edges {{
  127. node {{
  128. projectId
  129. ...{fragment_name}
  130. }}
  131. cursor
  132. }}
  133. pageInfo {{
  134. endCursor
  135. hasNextPage
  136. }}
  137. }}
  138. }}
  139. }}
  140. {fragment}
  141. """
  142. )
  143. @normalize_exceptions
  144. def _convert_to_dict(value: Any) -> dict[str, Any]:
  145. """Converts a value to a dictionary.
  146. If the value is already a dictionary, the value is returned unchanged.
  147. If the value is a string, bytes, or bytearray, it is parsed as JSON.
  148. For any other type, a TypeError is raised.
  149. """
  150. if value is None:
  151. return {}
  152. if isinstance(value, dict):
  153. return value
  154. if isinstance(value, (str, bytes, bytearray)):
  155. try:
  156. return json.loads(value)
  157. except json.decoder.JSONDecodeError:
  158. # ignore invalid utf-8 or control characters
  159. return json.loads(value, strict=False)
  160. raise TypeError(f"Unable to convert {value} to a dict")
  161. class Runs(SizedPaginator["Run"]):
  162. """A lazy iterator of `Run` objects associated with a project and optional filter.
  163. Runs are retrieved in pages from the W&B server as needed.
  164. This is generally used indirectly using the `Api.runs` namespace.
  165. Args:
  166. client: (`wandb.apis.public.RetryingClient`) The API client to use
  167. for requests.
  168. entity: (str) The entity (username or team) that owns the project.
  169. project: (str) The name of the project to fetch runs from.
  170. filters: (Optional[Dict[str, Any]]) A dictionary of filters to apply
  171. to the runs query.
  172. order: (str) Order can be `created_at`, `heartbeat_at`, `config.*.value`, or `summary_metrics.*`.
  173. If you prepend order with a + order is ascending (default).
  174. If you prepend order with a - order is descending.
  175. The default order is run.created_at from oldest to newest.
  176. per_page: (int) The number of runs to fetch per request (default is 50).
  177. include_sweeps: (bool) Whether to include sweep information in the
  178. runs. Defaults to True.
  179. """
  180. def __init__(
  181. self,
  182. client: RetryingClient,
  183. entity: str,
  184. project: str,
  185. filters: dict[str, Any] | None = None,
  186. order: str = "+created_at",
  187. per_page: int = 50,
  188. include_sweeps: bool = True,
  189. lazy: bool = True,
  190. service_api: ServiceApi | None = None,
  191. ):
  192. if not order:
  193. order = "+created_at"
  194. self.QUERY = _create_runs_query(lazy=lazy)
  195. self.entity = entity
  196. self.project = project
  197. self._project_internal_id = None
  198. self.filters = filters or {}
  199. self.order = order
  200. self._sweeps: dict[str, public.Sweep] = {}
  201. self._include_sweeps = include_sweeps
  202. self._lazy = lazy
  203. self._service_api = service_api
  204. variables = {
  205. "project": self.project,
  206. "entity": self.entity,
  207. "order": self.order,
  208. "filters": json.dumps(self.filters),
  209. }
  210. super().__init__(client, variables, per_page)
  211. @property
  212. def _length(self) -> int:
  213. """Returns the total number of runs.
  214. <!-- lazydoc-ignore: internal -->
  215. """
  216. if not self.last_response:
  217. self._load_page()
  218. return self.last_response["project"]["runCount"]
  219. @property
  220. def more(self) -> bool:
  221. """Returns whether there are more runs to fetch.
  222. <!-- lazydoc-ignore: internal -->
  223. """
  224. if self.last_response:
  225. return bool(
  226. self.last_response["project"]["runs"]["pageInfo"]["hasNextPage"]
  227. )
  228. else:
  229. return True
  230. @property
  231. def cursor(self):
  232. """Returns the cursor position for pagination of runs results.
  233. <!-- lazydoc-ignore: internal -->
  234. """
  235. if self.last_response:
  236. return self.last_response["project"]["runs"]["edges"][-1]["cursor"]
  237. else:
  238. return None
  239. def convert_objects(self) -> list[Run]:
  240. """Converts GraphQL edges to Runs objects.
  241. <!-- lazydoc-ignore: internal -->
  242. """
  243. objs = []
  244. if self.last_response is None or self.last_response.get("project") is None:
  245. raise ValueError("Could not find project {}".format(self.project))
  246. for run_response in self.last_response["project"]["runs"]["edges"]:
  247. run = Run(
  248. self.client,
  249. self.entity,
  250. self.project,
  251. run_response["node"]["name"],
  252. run_response["node"],
  253. include_sweeps=self._include_sweeps,
  254. lazy=self._lazy,
  255. service_api=self._service_api,
  256. )
  257. objs.append(run)
  258. if self._include_sweeps and run.sweep_name:
  259. if run.sweep_name in self._sweeps:
  260. sweep = self._sweeps[run.sweep_name]
  261. else:
  262. sweep = public.Sweep.get(
  263. self.client,
  264. self.entity,
  265. self.project,
  266. run.sweep_name,
  267. withRuns=False,
  268. )
  269. self._sweeps[run.sweep_name] = sweep
  270. if sweep is None:
  271. continue
  272. run.sweep = sweep
  273. return objs
  274. @normalize_exceptions
  275. def histories(
  276. self,
  277. samples: int = 500,
  278. keys: list[str] | None = None,
  279. x_axis: str = "_step",
  280. format: Literal["default", "pandas", "polars"] = "default",
  281. stream: Literal["default", "system"] = "default",
  282. ) -> list[dict[str, Any]] | pd.DataFrame | pl.DataFrame:
  283. """Return sampled history metrics for all runs that fit the filters conditions.
  284. Args:
  285. samples: The number of samples to return per run
  286. keys: Only return metrics for specific keys
  287. x_axis: Use this metric as the xAxis defaults to _step
  288. format: Format to return data in, options are "default", "pandas",
  289. "polars"
  290. stream: "default" for metrics, "system" for machine metrics
  291. Returns:
  292. pandas.DataFrame: If `format="pandas"`, returns a `pandas.DataFrame`
  293. of history metrics.
  294. polars.DataFrame: If `format="polars"`, returns a `polars.DataFrame`
  295. of history metrics.
  296. list of dicts: If `format="default"`, returns a list of dicts
  297. containing history metrics with a `run_id` key.
  298. """
  299. if format not in ("default", "pandas", "polars"):
  300. raise ValueError(
  301. f"Invalid format: {format}. Must be one of 'default', 'pandas', 'polars'"
  302. )
  303. histories = []
  304. if format == "default":
  305. for run in self:
  306. history_data = run.history(
  307. samples=samples,
  308. keys=keys,
  309. x_axis=x_axis,
  310. pandas=False,
  311. stream=stream,
  312. )
  313. if not history_data:
  314. continue
  315. for entry in history_data:
  316. entry["run_id"] = run.id
  317. histories.extend(history_data)
  318. return histories
  319. if format == "pandas":
  320. pd = util.get_module(
  321. "pandas", required="Exporting pandas DataFrame requires pandas"
  322. )
  323. for run in self:
  324. history_data = run.history(
  325. samples=samples,
  326. keys=keys,
  327. x_axis=x_axis,
  328. pandas=False,
  329. stream=stream,
  330. )
  331. if not history_data:
  332. continue
  333. df = pd.DataFrame.from_records(history_data)
  334. df["run_id"] = run.id
  335. histories.append(df)
  336. if not histories:
  337. return pd.DataFrame()
  338. combined_df = pd.concat(histories)
  339. combined_df.reset_index(drop=True, inplace=True)
  340. # sort columns for consistency
  341. combined_df = combined_df[(sorted(combined_df.columns))]
  342. return combined_df
  343. if format == "polars":
  344. pl = util.get_module(
  345. "polars", required="Exporting polars DataFrame requires polars"
  346. )
  347. for run in self:
  348. history_data = run.history(
  349. samples=samples,
  350. keys=keys,
  351. x_axis=x_axis,
  352. pandas=False,
  353. stream=stream,
  354. )
  355. if not history_data:
  356. continue
  357. df = pl.from_records(history_data)
  358. df = df.with_columns(pl.lit(run.id).alias("run_id"))
  359. histories.append(df)
  360. if not histories:
  361. return pl.DataFrame()
  362. combined_df = pl.concat(histories, how="vertical")
  363. # sort columns for consistency
  364. combined_df = combined_df.select(sorted(combined_df.columns))
  365. return combined_df
  366. def __repr__(self) -> str:
  367. return f"<{nameof(type(self))} {self.entity}/{self.project}>"
  368. def upgrade_to_full(self) -> None:
  369. """Upgrade this Runs collection from lazy to full mode.
  370. This switches to fetching full run data and
  371. upgrades any already-loaded Run objects to have full data.
  372. Uses parallel loading for better performance when upgrading multiple runs.
  373. """
  374. if not self._lazy:
  375. return # Already in full mode
  376. # Switch to full mode
  377. self._lazy = False
  378. # Regenerate query with full fragment
  379. self.QUERY = _create_runs_query(lazy=False)
  380. # Upgrade any existing runs that have been loaded - use parallel loading for performance
  381. lazy_runs = [run for run in self.objects if run._lazy]
  382. if lazy_runs:
  383. from concurrent.futures import ThreadPoolExecutor
  384. # Limit workers to avoid overwhelming the server
  385. max_workers = min(len(lazy_runs), 10)
  386. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  387. futures = [executor.submit(run.load_full_data) for run in lazy_runs]
  388. # Wait for all to complete
  389. for future in futures:
  390. future.result()
  391. class AgentRuns(SizedPaginator["Run"]):
  392. """A lazy iterator of `Run` objects for a single sweep agent.
  393. <!-- lazydoc-ignore-class: internal -->
  394. """
  395. def __init__(
  396. self,
  397. client: RetryingClient,
  398. entity: str,
  399. project: str,
  400. sweep_id: str,
  401. agent_key: str,
  402. *,
  403. total_runs: int,
  404. order: str = "+created_at",
  405. per_page: int = 50,
  406. service_api: ServiceApi | None = None,
  407. ) -> None:
  408. self.QUERY = gql(GET_AGENT_RUNS_GQL)
  409. self.entity = entity
  410. self.project = project
  411. self._sweep_id = sweep_id
  412. self._agent_key = agent_key
  413. self.order = order
  414. self._sweeps: dict[str, public.Sweep] = {}
  415. self._service_api = service_api
  416. self._total_runs = total_runs
  417. self.per_page = per_page
  418. variables = {
  419. "project": self.project,
  420. "entity": self.entity,
  421. "order": self.order,
  422. "agentID": self._agent_key,
  423. "sweep": self._sweep_id,
  424. "after": None,
  425. "before": None,
  426. "first": self.per_page,
  427. "last": None,
  428. }
  429. super().__init__(client, variables, per_page)
  430. @override
  431. def update_variables(self) -> None:
  432. """Map paginator state to GetAgentRuns variables (after/first, not cursor/perPage)."""
  433. self.variables.update(
  434. {
  435. "first": self.per_page,
  436. "after": self.cursor,
  437. "before": None,
  438. "last": None,
  439. }
  440. )
  441. @property
  442. @override
  443. def _length(self) -> int:
  444. return self._total_runs
  445. def _parsed(self) -> GetAgentRuns:
  446. assert self.last_response is not None
  447. return GetAgentRuns.model_validate(self.last_response)
  448. def _agent_runs_connection(self):
  449. parsed = self._parsed()
  450. if not parsed.project:
  451. raise ValueError(f"Could not find project {self.project!r} for agent runs.")
  452. if not parsed.project.sweep:
  453. raise ValueError(f"Could not find sweep {self._sweep_id!r} for agent runs.")
  454. if not parsed.project.sweep.agent:
  455. raise ValueError(
  456. f"Could not find agent {self._agent_key!r} for agent runs."
  457. )
  458. return parsed.project.sweep.agent.runs
  459. @property
  460. @override
  461. def more(self) -> bool:
  462. return self.last_response is None or bool(
  463. self._agent_runs_connection().page_info.has_next_page
  464. )
  465. @property
  466. @override
  467. def cursor(self) -> str | None:
  468. if not self.last_response:
  469. return None
  470. edges = self._agent_runs_connection().edges
  471. return edges[-1].cursor if edges else None
  472. @override
  473. def convert_objects(self) -> list[Run]:
  474. """Convert the current GraphQL page into :class:`Run` instances for this agent."""
  475. objs = []
  476. for edge in self._agent_runs_connection().edges:
  477. node = edge.node.model_dump(by_alias=True)
  478. run = Run(
  479. self.client,
  480. self.entity,
  481. self.project,
  482. node["name"],
  483. node,
  484. include_sweeps=False,
  485. lazy=True,
  486. service_api=self._service_api,
  487. )
  488. objs.append(run)
  489. return objs
  490. @override
  491. def __repr__(self) -> str:
  492. return f"<{nameof(type(self))} {self.entity}/{self.project} agent={self._agent_key!r}>"
  493. class Run(Attrs):
  494. """A single run associated with an entity and project.
  495. Args:
  496. client: The W&B API client.
  497. entity: The entity associated with the run.
  498. project: The project associated with the run.
  499. run_id: The unique identifier for the run.
  500. attrs: The attributes of the run.
  501. include_sweeps: Whether to include sweeps in the run.
  502. Attributes:
  503. tags ([str]): a list of tags associated with the run
  504. url (str): the url of this run
  505. id (str): unique identifier for the run (defaults to eight characters)
  506. name (str): the name of the run
  507. state (str): one of: running, finished, crashed, killed, preempting, preempted
  508. config (dict): a dict of hyperparameters associated with the run
  509. created_at (str): ISO timestamp when the run was started
  510. system_metrics (dict): the latest system metrics recorded for the run
  511. summary (dict): A mutable dict-like property that holds the current summary.
  512. Calling update will persist any changes.
  513. project (str): the project associated with the run
  514. entity (str): the name of the entity associated with the run
  515. project_internal_id (int): the internal id of the project
  516. user (str): the name of the user who created the run
  517. path (str): Unique identifier [entity]/[project]/[run_id]
  518. notes (str): Notes about the run
  519. read_only (boolean): Whether the run is editable
  520. history_keys (str): History metric keys logged with `wandb.Run.log({"key": "value"})`
  521. metadata (str): Metadata about the run from wandb-metadata.json
  522. """
    def __init__(
        self,
        client: RetryingClient,
        entity: str,
        project: str,
        run_id: str,
        attrs: Mapping | None = None,
        include_sweeps: bool = True,
        lazy: bool = True,
        service_api: ServiceApi | None = None,
    ):
        """Initialize a Run object.

        Run is always initialized by calling api.runs() where api is an instance of
        wandb.Api.

        Args:
            client: The API client stored on the run for later requests.
            entity: The entity that owns the run.
            project: The project that contains the run.
            run_id: The run's identifier; stored under the `name` attribute.
            attrs: Attributes already fetched for the run. When omitted or
                empty, `load` is called with `force=True`.
            include_sweeps: Whether sweep information should be attached.
            lazy: Whether the run starts in lazy mode.
            service_api: Optional `ServiceApi` stored on the run.
        """
        # Treat a missing mapping as empty and hand the base class its own copy.
        _attrs = attrs or {}
        super().__init__(dict(_attrs))
        self.client = client
        self._entity = entity
        self.project = project
        self._files = {}
        # NOTE(review): presumably the configured W&B directory, falling back
        # to the system temp dir - confirm against env.get_dir.
        self._base_dir = env.get_dir(tempfile.gettempdir())
        # Goes through the `id` setter, which writes _attrs["name"].
        self.id = run_id
        self.sweep = None
        self._include_sweeps = include_sweeps
        self._lazy = lazy
        self._full_data_loaded = False  # Track if we've loaded full data
        # `self.path` is defined elsewhere in the class; it needs entity,
        # project and id, which is why this comes after those assignments.
        self.dir = os.path.join(self._base_dir, *self.path)
        # Best effort: any OSError (including "already exists") is ignored.
        try:
            os.makedirs(self.dir)
        except OSError:
            pass
        self._summary = None
        self._metadata: dict[str, Any] | None = None
        # "not found" is the placeholder state when attrs carry no state.
        self._state = _attrs.get("state", "not found")
        # Left as None here; nothing in the visible code assigns it.
        self.server_provides_internal_id_field: bool | None = None
        self._is_loaded: bool = False
        self._service_api: ServiceApi | None = service_api
        # Force a load only when no attributes were supplied.
        self.load(force=not _attrs)
    @property
    def state(self) -> str:
        """The state of the run, for example `running`, `finished` or `crashed`.

        The value is `"not found"` when the run was constructed from
        attributes that carry no `state` entry.
        """
        return self._state
    @property
    def entity(self) -> str:
        """The entity associated with the run, as passed to the constructor."""
        return self._entity
    @property
    def username(self) -> str:
        """Deprecated alias for `entity`.

        Emits a deprecation warning through `wandb.termwarn` on every access
        and then returns the same value as `entity`.
        """
        wandb.termwarn("Run.username is deprecated. Please use Run.entity instead.")
        return self._entity
    @property
    def storage_id(self) -> str:
        """The unique storage identifier for the run.

        Read from the `id` attribute; returns `None` if that attribute is
        absent.
        """
        # For compatibility with wandb.Run, which has storage IDs
        # in self.storage_id and names in self.id.
        return self._attrs.get("id")
  581. @property
  582. def id(self) -> str:
  583. """The unique identifier for the run."""
  584. return self._attrs.get("name")
  585. @id.setter
  586. def id(self, new_id: str) -> None:
  587. """Set the unique identifier for the run."""
  588. self._attrs["name"] = new_id
  589. @property
  590. def name(self) -> str | None:
  591. """The name of the run."""
  592. return self._attrs.get("displayName")
  593. @name.setter
  594. def name(self, new_name: str) -> None:
  595. """Set the name of the run."""
  596. self._attrs["displayName"] = new_name
  597. @classmethod
  598. def create(
  599. cls,
  600. api: public.Api,
  601. run_id: str | None = None,
  602. project: str | None = None,
  603. entity: str | None = None,
  604. state: Literal["running", "pending"] = "running",
  605. ) -> Self:
  606. """Create a run for the given project.
  607. For most use cases, use `wandb.init()`. `wandb.init()` provides more robust
  608. logic for creating and updating runs. `wandb.apis.public.Run.create`
  609. is intended for specific scenarios such as creating runs in
  610. a "pending" state for jobs that may be unschedulable
  611. (for example, in a Kubernetes cluster with insufficient GPUs or high
  612. contention). These pending runs can later be resumed and tracked by W&B.
  613. Runs created with this method have limited functionality. Calling
  614. `update()` on a run created this way may not work as expected.
  615. Args:
  616. api: The W&B API instance.
  617. run_id: Optional run ID. If not provided, a random ID will be generated.
  618. project: Optional project name. Defaults to the project in API settings
  619. or "uncategorized".
  620. entity: Optional entity (user or team) name.
  621. state: Initial state of the run. Use "pending" for runs that will be
  622. resumed later, or "running" for immediate execution.
  623. Returns:
  624. A Run object representing the created run.
  625. Example:
  626. Creating a pending run for later execution
  627. ```python
  628. import wandb
  629. api = wandb.Api()
  630. run_name = "my-pending-run"
  631. run = Run.create(
  632. api=api,
  633. project="project",
  634. entity="entity",
  635. state="pending",
  636. run_id=run_name,
  637. )
  638. ```
  639. """
  640. api._sentry.message("Invoking Run.create", level="info")
  641. run_id = run_id or runid.generate_id()
  642. project = project or api.settings.get("project") or "uncategorized"
  643. mutation = gql(
  644. """
  645. mutation UpsertBucket($project: String, $entity: String, $name: String!, $state: String) {
  646. upsertBucket(input: {modelName: $project, entityName: $entity, name: $name, state: $state}) {
  647. bucket {
  648. project {
  649. name
  650. entity { name }
  651. }
  652. id
  653. name
  654. }
  655. inserted
  656. }
  657. }
  658. """
  659. )
  660. variables = {
  661. "entity": entity,
  662. "project": project,
  663. "name": run_id,
  664. "state": state,
  665. }
  666. res = api.client.execute(mutation, variable_values=variables)
  667. res = res["upsertBucket"]["bucket"]
  668. return cls(
  669. api.client,
  670. res["project"]["entity"]["name"],
  671. res["project"]["name"],
  672. res["name"],
  673. {
  674. "id": res["id"],
  675. "config": "{}",
  676. "systemMetrics": "{}",
  677. "summaryMetrics": "{}",
  678. "tags": [],
  679. "description": None,
  680. "notes": None,
  681. "state": state,
  682. },
  683. lazy=False, # Created runs should have full data available immediately
  684. )
def _load_with_fragment(
    self, fragment: str, fragment_name: str, force: bool = False
) -> dict[str, Any]:
    """Load run data using the specified GraphQL fragment.

    Args:
        fragment: GraphQL fragment definition appended to the query document.
        fragment_name: Name of that fragment. It is spread inside the `run`
            selection and also selects the post-processing performed below.
        force: When true, query the backend even if attributes are already
            cached, and re-run the post-processing.

    Returns:
        The run's attribute dictionary (`self._attrs`).

    Raises:
        ValueError: If the response has no project or no run.
    """
    # `projectId` is always requested in addition to the fragment's fields.
    query = gql(
        f"""#graphql
        query Run($project: String!, $entity: String!, $name: String!) {{
            project(name: $project, entityName: $entity) {{
                run(name: $name) {{
                    projectId
                    ...{fragment_name}
                }}
            }}
        }}
        {fragment}
        """
    )
    # Only query the backend when forced or when nothing is cached yet.
    if force or not self._attrs:
        response = self._exec(query)
        if (
            response is None
            or response.get("project") is None
            or response["project"].get("run") is None
        ):
            raise ValueError("Could not find run {}".format(self))
        # Replace (not merge) the cached attributes with the fresh response.
        self._attrs = response["project"]["run"]

        self._state = self._attrs["state"]
        if self._attrs.get("user"):
            self.user = public.User(self.client, self._attrs["user"])

        if self._include_sweeps and self.sweep_name and not self.sweep:
            # There may be a lot of runs. Don't bother pulling them all
            # just for the sake of this one.
            self.sweep = public.Sweep.get(
                self.client,
                self.entity,
                self.project,
                self.sweep_name,
                withRuns=False,
            )

    if not self._is_loaded or force:
        # Record the internal project id whenever the attributes carry one,
        # regardless of which fragment was used; otherwise reset it to None.
        if "projectId" in self._attrs:
            self._project_internal_id = int(self._attrs["projectId"])
        else:
            self._project_internal_id = None

        # Post-process the attributes for the full fragment, or whenever any
        # of the heavy fields is actually present.
        if fragment_name == RUN_FRAGMENT_NAME or (
            "config" in self._attrs
            or "summaryMetrics" in self._attrs
            or "systemMetrics" in self._attrs
        ):
            self._load_from_attrs()
        # Only the lightweight fragment flips `_is_loaded`; the full fragment
        # leaves the flag untouched.
        if fragment_name == LIGHTWEIGHT_RUN_FRAGMENT_NAME:
            self._is_loaded = True

    return self._attrs
def _load_from_attrs(self) -> dict[str, Any]:
    """Normalize the cached attributes in `self._attrs` in place.

    Copies `state` into `self._state`, converts the `config`,
    `summaryMetrics` and `systemMetrics` fields with `_convert_to_dict` when
    they are present, resolves the sweep if requested, splits the config into
    user-facing and raw views, and builds `self.user` when a `user` entry
    exists.

    Returns:
        The (mutated) attribute dictionary.
    """
    # Snapshot before mutating: only persist config/rawconfig when the response
    # included a config field (lazy runs omit it until load_full_data()).
    had_config_field = "config" in self._attrs

    self._state = self._attrs.get("state", None)

    # Only convert fields that exist in _attrs.
    if had_config_field:
        self._attrs["config"] = _convert_to_dict(self._attrs.get("config"))
    if "summaryMetrics" in self._attrs:
        self._attrs["summaryMetrics"] = _convert_to_dict(
            self._attrs.get("summaryMetrics")
        )
    if "systemMetrics" in self._attrs:
        self._attrs["systemMetrics"] = _convert_to_dict(
            self._attrs.get("systemMetrics")
        )

    # Only look up the sweep when a sweep name is present in the attributes
    # and no sweep object has been attached yet.
    if self._include_sweeps and self._attrs.get("sweepName") and not self.sweep:
        # There may be a lot of runs. Don't bother pulling them all
        self.sweep = public.Sweep.get(
            self.client,
            self.entity,
            self.project,
            self._attrs["sweepName"],
            withRuns=False,
        )

    # Keys listed in WANDB_INTERNAL_KEYS go to the raw view only; all other
    # keys go to the user view. `{"value": ...}` wrappers are unwrapped.
    config_user, config_raw = {}, {}
    if self._attrs.get("config"):
        try:
            # config is already converted to dict by _convert_to_dict
            for key, value in self._attrs.get("config", {}).items():
                config = config_raw if key in WANDB_INTERNAL_KEYS else config_user
                if isinstance(value, dict) and "value" in value:
                    config[key] = value["value"]
                else:
                    config[key] = value
        except (TypeError, AttributeError):
            # Handle case where config is malformed or not a dict
            pass

    # NOTE(review): after this block `_attrs["config"]` holds only the
    # unwrapped user keys, so calling this method again on the same
    # attributes rebuilds `rawconfig` from those user keys alone. Confirm
    # callers always start from a fresh response.
    if had_config_field:
        config_raw.update(config_user)
        self._attrs["config"] = config_user
        self._attrs["rawconfig"] = config_raw

    if "user" in self._attrs:
        self.user = public.User(self.client, self._attrs["user"])

    return self._attrs
  787. def load(self, force: bool = False) -> dict[str, Any]:
  788. """Load run data using appropriate fragment based on lazy mode."""
  789. # Load any provided attrs
  790. if self._attrs:
  791. self._load_from_attrs()
  792. if self._lazy:
  793. return self._load_with_fragment(
  794. LIGHTWEIGHT_RUN_FRAGMENT, LIGHTWEIGHT_RUN_FRAGMENT_NAME, force
  795. )
  796. else:
  797. return self._load_with_fragment(RUN_FRAGMENT, RUN_FRAGMENT_NAME, force)
  798. @normalize_exceptions
  799. def wait_until_finished(self) -> None:
  800. """Check the state of the run until it is finished."""
  801. query = gql(
  802. """
  803. query RunState($project: String!, $entity: String!, $name: String!) {
  804. project(name: $project, entityName: $entity) {
  805. run(name: $name) {
  806. state
  807. }
  808. }
  809. }
  810. """
  811. )
  812. while True:
  813. res = self._exec(query)
  814. state = res["project"]["run"]["state"]
  815. if state in ["finished", "crashed", "failed"]:
  816. self._attrs["state"] = state
  817. self._state = state
  818. return
  819. time.sleep(5)
  820. @normalize_exceptions
  821. def update(self) -> None:
  822. """Persist changes to the run object to the wandb backend."""
  823. mutation = gql(
  824. """
  825. mutation UpsertBucket($id: String!, $description: String, $display_name: String, $notes: String, $tags: [String!], $config: JSONString!, $groupName: String, $jobType: String) {{
  826. upsertBucket(input: {{id: $id, description: $description, displayName: $display_name, notes: $notes, tags: $tags, config: $config, groupName: $groupName, jobType: $jobType}}) {{
  827. bucket {{
  828. ...RunFragment
  829. }}
  830. }}
  831. }}
  832. {}
  833. """.format(RUN_FRAGMENT)
  834. )
  835. _ = self._exec(
  836. mutation,
  837. id=self.storage_id,
  838. tags=self.tags,
  839. description=self.description,
  840. notes=self.notes,
  841. display_name=self.display_name,
  842. config=self.json_config,
  843. groupName=self.group,
  844. jobType=self.job_type,
  845. )
  846. self.summary.update()
  847. @normalize_exceptions
  848. def delete(self, delete_artifacts: bool = False) -> None:
  849. """Delete the given run from the wandb backend.
  850. Args:
  851. delete_artifacts (bool, optional): Whether to delete the artifacts
  852. associated with the run.
  853. """
  854. mutation = gql(
  855. """
  856. mutation DeleteRun(
  857. $id: ID!,
  858. {}
  859. ) {{
  860. deleteRun(input: {{
  861. id: $id,
  862. {}
  863. }}) {{
  864. clientMutationId
  865. }}
  866. }}
  867. """.format(
  868. "$deleteArtifacts: Boolean" if delete_artifacts else "",
  869. "deleteArtifacts: $deleteArtifacts" if delete_artifacts else "",
  870. )
  871. )
  872. self.client.execute(
  873. mutation,
  874. variable_values={
  875. "id": self.storage_id,
  876. "deleteArtifacts": delete_artifacts,
  877. },
  878. )
def save(self) -> None:
    """Persist changes to the run object to the W&B backend.

    Delegates to `update()`.
    """
    self.update()
  882. @normalize_exceptions
  883. def update_state(self, state: Literal["pending"]) -> bool:
  884. """Update the state of a run.
  885. Allows transitioning runs from 'failed' or 'crashed' to 'pending'.
  886. Args:
  887. state: The target run state. Only `"pending"` is supported.
  888. Returns:
  889. `True` if the state was successfully updated.
  890. Raises:
  891. `wandb.Error`: If the requested state transition is not allowed, or the server
  892. does not support this operation.
  893. """
  894. mutation = gql(
  895. """
  896. mutation UpdateRunState($input: UpdateRunStateInput!) {
  897. updateRunState(input: $input) {
  898. success
  899. }
  900. }
  901. """
  902. )
  903. try:
  904. result = self.client.execute(
  905. mutation,
  906. variable_values={
  907. "input": {
  908. "id": self.storage_id,
  909. "state": state,
  910. }
  911. },
  912. )
  913. except Exception as e:
  914. error_msg = str(e)
  915. if "UpdateRunStateInput" in error_msg or "updateRunState" in error_msg:
  916. raise wandb.Error(
  917. "The server does not support the update_state operation. "
  918. "Please ensure your W&B server is updated to a version that "
  919. "supports run state transitions."
  920. ) from e
  921. if "invalid state transition" in error_msg.lower():
  922. raise wandb.Error(
  923. f"Invalid state transition: cannot change run from '{self.state}' "
  924. f"to '{state}'. Only runs in 'failed' or 'crashed' state can be "
  925. "transitioned to 'pending'."
  926. ) from e
  927. raise
  928. if result.get("updateRunState", {}).get("success"):
  929. self._attrs["state"] = state
  930. self._state = state
  931. return True
  932. return False
  933. @property
  934. def json_config(self) -> str:
  935. """Return the run config as a JSON string.
  936. <!-- lazydoc-ignore: internal -->
  937. """
  938. config = {}
  939. if "_wandb" in self.rawconfig:
  940. config["_wandb"] = {"value": self.rawconfig["_wandb"], "desc": None}
  941. for k, v in self.config.items():
  942. config[k] = {"value": v, "desc": None}
  943. return json.dumps(config)
  944. def _exec(self, query: Document, **kwargs: Any) -> dict[str, Any]:
  945. """Execute a query against the cloud backend."""
  946. variables = {"entity": self.entity, "project": self.project, "name": self.id}
  947. variables.update(kwargs)
  948. return self.client.execute(query, variable_values=variables)
  949. def _sampled_history(
  950. self,
  951. keys: list[str],
  952. x_axis: str = "_step",
  953. samples: int = 500,
  954. ) -> list[dict[str, Any]]:
  955. spec = {"keys": [x_axis] + keys, "samples": samples}
  956. query = gql(
  957. """
  958. query RunSampledHistory($project: String!, $entity: String!, $name: String!, $specs: [JSONString!]!) {
  959. project(name: $project, entityName: $entity) {
  960. run(name: $name) { sampledHistory(specs: $specs) }
  961. }
  962. }
  963. """
  964. )
  965. response = self._exec(query, specs=[json.dumps(spec)])
  966. # sampledHistory returns one list per spec, we only send one spec
  967. return response["project"]["run"]["sampledHistory"][0]
  968. def _full_history(
  969. self,
  970. samples: int = 500,
  971. stream: Literal["default", "system"] = "default",
  972. ) -> list[dict[str, Any]]:
  973. node = "history" if stream == "default" else "events"
  974. query = gql(
  975. """
  976. query RunFullHistory($project: String!, $entity: String!, $name: String!, $samples: Int) {{
  977. project(name: $project, entityName: $entity) {{
  978. run(name: $name) {{ {}(samples: $samples) }}
  979. }}
  980. }}
  981. """.format(node)
  982. )
  983. response = self._exec(query, samples=samples)
  984. return [json.loads(line) for line in response["project"]["run"][node]]
  985. @normalize_exceptions
  986. def files(
  987. self,
  988. names: list[str] | None = None,
  989. pattern: str | None = None,
  990. per_page: int = 50,
  991. ) -> public.Files:
  992. """Returns a `Files` object for all files in the run which match the given criteria.
  993. You can specify a list of exact file names to match, or a pattern to match against.
  994. If both are provided, the pattern will be ignored.
  995. Args:
  996. names (list): names of the requested files, if empty returns all files
  997. pattern (str, optional): Pattern to match when returning files from W&B.
  998. This pattern uses mySQL's LIKE syntax,
  999. so matching all files that end with .json would be "%.json".
  1000. If both names and pattern are provided, a ValueError will be raised.
  1001. per_page (int): number of results per page.
  1002. Returns:
  1003. A `Files` object, which is an iterator over `File` objects.
  1004. """
  1005. return public.Files(
  1006. self.client,
  1007. self,
  1008. names or [],
  1009. pattern=pattern,
  1010. per_page=per_page,
  1011. )
  1012. @normalize_exceptions
  1013. def file(self, name: str) -> public.File:
  1014. """Return the path of a file with a given name in the artifact.
  1015. Args:
  1016. name (str): name of requested file.
  1017. Returns:
  1018. A `File` matching the name argument.
  1019. """
  1020. return public.Files(self.client, self, [name])[0]
  1021. @normalize_exceptions
  1022. def upload_file(self, path: str, root: str = ".") -> public.File:
  1023. """Upload a local file to W&B, associating it with this run.
  1024. Args:
  1025. path (str): Path to the file to upload. Can be absolute or relative.
  1026. root (str): The root path to save the file relative to. For example,
  1027. if you want to have the file saved in the run as "my_dir/file.txt"
  1028. and you're currently in "my_dir" you would set root to "../".
  1029. Defaults to current directory (".").
  1030. Returns:
  1031. A `File` object representing the uploaded file.
  1032. """
  1033. api = InternalApi(
  1034. default_settings={"entity": self.entity, "project": self.project},
  1035. retry_timedelta=RETRY_TIMEDELTA,
  1036. )
  1037. api.set_current_run_id(self.id)
  1038. root = os.path.abspath(root)
  1039. name = os.path.relpath(path, root)
  1040. upload_path = util.make_file_path_upload_safe(name)
  1041. with open(os.path.join(root, name), "rb") as f:
  1042. api.push({LogicalPath(upload_path): f})
  1043. return public.Files(self.client, self, [name])[0]
  1044. @normalize_exceptions
  1045. def history(
  1046. self,
  1047. samples: int = 500,
  1048. keys: list[str] | None = None,
  1049. x_axis: str = "_step",
  1050. pandas: bool = True,
  1051. stream: Literal["default", "system"] = "default",
  1052. ) -> list[dict[str, Any]] | pd.DataFrame:
  1053. """Return sampled history metrics for a run.
  1054. This is simpler and faster if you are ok with the history records being sampled.
  1055. Args:
  1056. samples : (int, optional) The number of samples to return
  1057. pandas : (bool, optional) Return a pandas dataframe
  1058. keys : (list, optional) Only return metrics for specific keys
  1059. x_axis : (str, optional) Use this metric as the xAxis defaults to _step
  1060. stream : (str, optional) "default" for metrics, "system" for machine metrics
  1061. Returns:
  1062. pandas.DataFrame: If pandas=True returns a `pandas.DataFrame` of history
  1063. metrics.
  1064. list of dicts: If pandas=False returns a list of dicts of history metrics.
  1065. """
  1066. if keys is not None and not isinstance(keys, list):
  1067. wandb.termerror("keys must be specified in a list")
  1068. return []
  1069. if keys is not None and len(keys) > 0 and not isinstance(keys[0], str):
  1070. wandb.termerror("keys argument must be a list of strings")
  1071. return []
  1072. if keys and stream != "default":
  1073. wandb.termerror("stream must be default when specifying keys")
  1074. return []
  1075. elif keys:
  1076. lines = self._sampled_history(keys=keys, x_axis=x_axis, samples=samples)
  1077. else:
  1078. lines = self._full_history(samples=samples, stream=stream)
  1079. if pandas:
  1080. pd = util.get_module("pandas")
  1081. if pd:
  1082. lines = pd.DataFrame.from_records(lines)
  1083. else:
  1084. wandb.termwarn("Unable to load pandas, call history with pandas=False")
  1085. return lines
  1086. @normalize_exceptions
  1087. def scan_history(
  1088. self,
  1089. keys: list[str] | None = None,
  1090. page_size: int = 1_000,
  1091. min_step: int | None = None,
  1092. max_step: int | None = None,
  1093. ) -> Iterator[dict[str, Any]]:
  1094. """Returns an iterable collection of all history records for a run.
  1095. Args:
  1096. keys ([str], optional): only fetch these keys, and only fetch rows that have all of keys defined.
  1097. page_size (int, optional): size of pages to fetch from the api.
  1098. min_step (int, optional): the minimum number of pages to scan at a time.
  1099. max_step (int, optional): the maximum number of pages to scan at a time.
  1100. Returns:
  1101. An iterable collection over history records (dict).
  1102. Example:
  1103. Export all the loss values for an example run
  1104. ```python
  1105. run = api.run("entity/project-name/run-id")
  1106. history = run.scan_history(keys=["Loss"])
  1107. losses = [row["Loss"] for row in history]
  1108. ```
  1109. """
  1110. if keys is not None and not isinstance(keys, list):
  1111. wandb.termerror("keys must be specified in a list")
  1112. return []
  1113. if keys is not None and len(keys) > 0 and not isinstance(keys[0], str):
  1114. wandb.termerror("keys argument must be a list of strings")
  1115. return []
  1116. last_step = self.lastHistoryStep
  1117. # set defaults for min/max step
  1118. if min_step is None:
  1119. min_step = 0
  1120. if max_step is None:
  1121. max_step = last_step + 1
  1122. # if the max step is past the actual last step, clamp it down
  1123. if max_step > last_step:
  1124. max_step = last_step + 1
  1125. if keys is None:
  1126. return public.HistoryScan(
  1127. run=self,
  1128. client=self.client,
  1129. page_size=page_size,
  1130. min_step=min_step,
  1131. max_step=max_step,
  1132. )
  1133. else:
  1134. return public.SampledHistoryScan(
  1135. run=self,
  1136. client=self.client,
  1137. keys=keys,
  1138. page_size=page_size,
  1139. min_step=min_step,
  1140. max_step=max_step,
  1141. )
  1142. @normalize_exceptions
  1143. def logged_artifacts(self, per_page: int = 100) -> public.RunArtifacts:
  1144. """Fetches all artifacts logged by this run.
  1145. Retrieves all output artifacts that were logged during the run. Returns a
  1146. paginated result that can be iterated over or collected into a single list.
  1147. Args:
  1148. per_page: Number of artifacts to fetch per API request.
  1149. Returns:
  1150. An iterable collection of all Artifact objects logged as outputs during this run.
  1151. Example:
  1152. ```python
  1153. import wandb
  1154. import tempfile
  1155. with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as tmp:
  1156. tmp.write("This is a test artifact")
  1157. tmp_path = tmp.name
  1158. run = wandb.init(project="artifact-example")
  1159. artifact = wandb.Artifact("test_artifact", type="dataset")
  1160. artifact.add_file(tmp_path)
  1161. run.log_artifact(artifact)
  1162. run.finish()
  1163. api = wandb.Api()
  1164. finished_run = api.run(f"{run.entity}/{run.project}/{run.id}")
  1165. for logged_artifact in finished_run.logged_artifacts():
  1166. print(logged_artifact.name)
  1167. ```
  1168. """
  1169. return public.RunArtifacts(self.client, self, mode="logged", per_page=per_page)
  1170. @normalize_exceptions
  1171. def used_artifacts(self, per_page: int = 100) -> public.RunArtifacts:
  1172. """Fetches artifacts explicitly used by this run.
  1173. Retrieves only the input artifacts that were explicitly declared as used
  1174. during the run, typically via `run.use_artifact()`. Returns a paginated
  1175. result that can be iterated over or collected into a single list.
  1176. Args:
  1177. per_page: Number of artifacts to fetch per API request.
  1178. Returns:
  1179. An iterable collection of Artifact objects explicitly used as inputs in this run.
  1180. Example:
  1181. ```python
  1182. import wandb
  1183. run = wandb.init(project="artifact-example")
  1184. run.use_artifact("test_artifact:latest")
  1185. run.finish()
  1186. api = wandb.Api()
  1187. finished_run = api.run(f"{run.entity}/{run.project}/{run.id}")
  1188. for used_artifact in finished_run.used_artifacts():
  1189. print(used_artifact.name)
  1190. test_artifact
  1191. ```
  1192. """
  1193. return public.RunArtifacts(self.client, self, mode="used", per_page=per_page)
  1194. @normalize_exceptions
  1195. def use_artifact(
  1196. self,
  1197. artifact: wandb.Artifact,
  1198. use_as: str | None = None,
  1199. ) -> wandb.Artifact:
  1200. """Declare an artifact as an input to a run.
  1201. Args:
  1202. artifact (`Artifact`): An artifact returned from
  1203. `wandb.Api().artifact(name)`
  1204. use_as (string, optional): A string identifying
  1205. how the artifact is used in the script. Used
  1206. to easily differentiate artifacts used in a
  1207. run, when using the beta wandb launch
  1208. feature's artifact swapping functionality.
  1209. Returns:
  1210. An `Artifact` object.
  1211. """
  1212. api = InternalApi(
  1213. default_settings={"entity": self.entity, "project": self.project},
  1214. retry_timedelta=RETRY_TIMEDELTA,
  1215. )
  1216. api.set_current_run_id(self.id)
  1217. if isinstance(artifact, wandb.Artifact) and not artifact.is_draft():
  1218. api.use_artifact(
  1219. artifact.id,
  1220. use_as=use_as or artifact.name,
  1221. artifact_entity_name=artifact.entity,
  1222. artifact_project_name=artifact.project,
  1223. )
  1224. return artifact
  1225. elif isinstance(artifact, wandb.Artifact) and artifact.is_draft():
  1226. raise ValueError(
  1227. "Only existing artifacts are accepted by this api. "
  1228. "Manually create one with `wandb artifact put`"
  1229. )
  1230. else:
  1231. raise ValueError("You must pass a wandb.Api().artifact() to use_artifact")
  1232. @normalize_exceptions
  1233. def log_artifact(
  1234. self,
  1235. artifact: wandb.Artifact,
  1236. aliases: Collection[str] | None = None,
  1237. tags: Collection[str] | None = None,
  1238. ) -> wandb.Artifact:
  1239. """Declare an artifact as output of a run.
  1240. Args:
  1241. artifact (`Artifact`): An artifact returned from
  1242. `wandb.Api().artifact(name)`.
  1243. aliases (list, optional): Aliases to apply to this artifact.
  1244. tags: (list, optional) Tags to apply to this artifact, if any.
  1245. Returns:
  1246. A `Artifact` object.
  1247. """
  1248. api = InternalApi(
  1249. default_settings={"entity": self.entity, "project": self.project},
  1250. retry_timedelta=RETRY_TIMEDELTA,
  1251. )
  1252. api.set_current_run_id(self.id)
  1253. if not isinstance(artifact, wandb.Artifact):
  1254. raise TypeError("You must pass a wandb.Api().artifact() to use_artifact")
  1255. if artifact.is_draft():
  1256. raise ValueError(
  1257. "Only existing artifacts are accepted by this api. "
  1258. "Manually create one with `wandb artifact put`"
  1259. )
  1260. if (
  1261. self.entity != artifact.source_entity
  1262. or self.project != artifact.source_project
  1263. ):
  1264. raise ValueError("A run can't log an artifact to a different project.")
  1265. artifact_collection_name = artifact.source_name.split(":")[0]
  1266. api.create_artifact(
  1267. artifact.type,
  1268. artifact_collection_name,
  1269. artifact.digest,
  1270. entity_name=self.entity,
  1271. project_name=self.project,
  1272. aliases=aliases,
  1273. tags=tags,
  1274. )
  1275. return artifact
  1276. def load_full_data(self, force: bool = False) -> dict[str, Any]:
  1277. """Load full run data including heavy fields like config, systemMetrics, summaryMetrics.
  1278. This method is useful when you initially used lazy=True for listing runs,
  1279. but need access to the full data for specific runs.
  1280. Args:
  1281. force: Force reload even if data is already loaded
  1282. Returns:
  1283. The loaded run attributes
  1284. """
  1285. if not self._lazy and not force:
  1286. # Already in full mode, no need to reload
  1287. return self._attrs
  1288. # Load full data and mark as loaded
  1289. result = self._load_with_fragment(RUN_FRAGMENT, RUN_FRAGMENT_NAME, force=True)
  1290. self._full_data_loaded = True
  1291. return result
  1292. @property
  1293. def config(self) -> dict[str, Any]:
  1294. """Get run config. Auto-loads full data if in lazy mode."""
  1295. if self._lazy and not self._full_data_loaded and "config" not in self._attrs:
  1296. self.load_full_data()
  1297. # Ensure config is always converted to dict (defensive against conversion issues)
  1298. config_value = self._attrs.get("config", {})
  1299. # _convert_to_dict handles dict inputs (noop) and converts str/bytes/bytearray to dict
  1300. config_value = _convert_to_dict(config_value)
  1301. self._attrs["config"] = config_value
  1302. return config_value
  1303. @property
  1304. def summary(self) -> HTTPSummary:
  1305. """Get run summary metrics. Auto-loads full data if in lazy mode."""
  1306. if (
  1307. self._lazy
  1308. and not self._full_data_loaded
  1309. and "summaryMetrics" not in self._attrs
  1310. ):
  1311. self.load_full_data()
  1312. if self._summary is None:
  1313. from wandb.old.summary import HTTPSummary
  1314. # TODO: fix the outdir issue
  1315. self._summary = HTTPSummary(self, self.client, summary=self.summary_metrics)
  1316. return self._summary
  1317. @property
  1318. def system_metrics(self) -> dict[str, Any]:
  1319. """Get run system metrics. Auto-loads full data if in lazy mode."""
  1320. if (
  1321. self._lazy
  1322. and not self._full_data_loaded
  1323. and "systemMetrics" not in self._attrs
  1324. ):
  1325. self.load_full_data()
  1326. # Ensure systemMetrics is always converted to dict (defensive against conversion issues)
  1327. system_metrics_value = self._attrs.get("systemMetrics", {})
  1328. # _convert_to_dict handles dict inputs (noop) and converts str/bytes/bytearray to dict
  1329. system_metrics_value = _convert_to_dict(system_metrics_value)
  1330. self._attrs["systemMetrics"] = system_metrics_value
  1331. return system_metrics_value
  1332. @property
  1333. def summary_metrics(self) -> dict[str, Any]:
  1334. """Get run summary metrics. Auto-loads full data if in lazy mode."""
  1335. if (
  1336. self._lazy
  1337. and not self._full_data_loaded
  1338. and "summaryMetrics" not in self._attrs
  1339. ):
  1340. self.load_full_data()
  1341. # Ensure summaryMetrics is always converted to dict (defensive against conversion issues)
  1342. summary_metrics_value = self._attrs.get("summaryMetrics", {})
  1343. # _convert_to_dict handles dict inputs (noop) and converts str/bytes/bytearray to dict
  1344. summary_metrics_value = _convert_to_dict(summary_metrics_value)
  1345. self._attrs["summaryMetrics"] = summary_metrics_value
  1346. return summary_metrics_value
  1347. @property
  1348. def rawconfig(self) -> dict[str, Any]:
  1349. """Get raw run config including internal keys. Auto-loads full data if in lazy mode."""
  1350. if self._lazy and not self._full_data_loaded and "rawconfig" not in self._attrs:
  1351. self.load_full_data()
  1352. return self._attrs.get("rawconfig", {})
  1353. @property
  1354. def sweep_name(self) -> str | None:
  1355. """Get sweep name. Always available since sweepName is in lightweight fragment."""
  1356. # sweepName is included in lightweight fragment, so no need to load full data
  1357. return self._attrs.get("sweepName")
  1358. @property
  1359. def path(self) -> list[str]:
  1360. """The path of the run. The path is a list containing the entity, project, and run_id."""
  1361. return [
  1362. urllib.parse.quote_plus(str(self.entity)),
  1363. urllib.parse.quote_plus(str(self.project)),
  1364. urllib.parse.quote_plus(str(self.id)),
  1365. ]
  1366. @property
  1367. def url(self) -> str:
  1368. """The URL of the run.
  1369. The run URL is generated from the entity, project, and run_id. For
  1370. SaaS users, it takes the form of `https://wandb.ai/entity/project/run_id`.
  1371. """
  1372. path = self.path
  1373. path.insert(2, "runs")
  1374. return self.client.app_url + "/".join(path)
  1375. @property
  1376. def metadata(self) -> dict[str, Any] | None:
  1377. """Metadata about the run from wandb-metadata.json.
  1378. Metadata includes the run's description, tags, start time, memory
  1379. usage and more.
  1380. """
  1381. if self._metadata is None:
  1382. try:
  1383. f = self.file("wandb-metadata.json")
  1384. session = self.client._client.transport.session
  1385. response = session.get(f.url, timeout=5)
  1386. response.raise_for_status()
  1387. contents = response.content
  1388. self._metadata = json_util.loads(contents)
  1389. except: # noqa: E722
  1390. # file doesn't exist, or can't be downloaded, or can't be parsed
  1391. pass
  1392. return self._metadata
  1393. @property
  1394. def lastHistoryStep(self) -> int: # noqa: N802
  1395. """Returns the last step logged in the run's history."""
  1396. query = gql(
  1397. """
  1398. query RunHistoryKeys($project: String!, $entity: String!, $name: String!) {
  1399. project(name: $project, entityName: $entity) {
  1400. run(name: $name) { historyKeys }
  1401. }
  1402. }
  1403. """
  1404. )
  1405. response = self._exec(query)
  1406. if (
  1407. response is None
  1408. or response.get("project") is None
  1409. or response["project"].get("run") is None
  1410. or response["project"]["run"].get("historyKeys") is None
  1411. ):
  1412. return -1
  1413. history_keys = response["project"]["run"]["historyKeys"]
  1414. return history_keys.get("lastStep", -1)
  1415. def to_html(self, height: int = 420, hidden: bool = False) -> str:
  1416. """Generate HTML containing an iframe displaying this run."""
  1417. url = self.url + "?jupyter=true"
  1418. style = f"border:none;width:100%;height:{height}px;"
  1419. prefix = ""
  1420. if hidden:
  1421. style += "display:none;"
  1422. prefix = ipython.toggle_button()
  1423. return prefix + f"<iframe src={url!r} style={style!r}></iframe>"
  1424. def _repr_html_(self) -> str:
  1425. if ipython.in_vscode_notebook():
  1426. import html
  1427. return html.escape(self._string_representation())
  1428. return self.to_html()
def __repr__(self) -> str:
    """Return the string representation built by `_string_representation()`."""
    return self._string_representation()
  1431. def _string_representation(self) -> str:
  1432. return f"<{nameof(type(self))} {'/'.join(self.path)} ({self.state})>"
  1433. def beta_scan_history(
  1434. self,
  1435. keys: list[str] | None = None,
  1436. page_size: int = 1_000,
  1437. min_step: int = 0,
  1438. max_step: int | None = None,
  1439. use_cache: bool = True,
  1440. ) -> public.BetaHistoryScan:
  1441. """Returns an iterable collection of all history records for a run.
  1442. This function is still in development and may not work as expected.
  1443. It uses wandb-core to read history from a run's exported
  1444. parquet history locally.
  1445. Args:
  1446. keys: list of metrics to read from the run's history.
  1447. if no keys are provided then all metrics will be returned.
  1448. page_size: the number of history records to read at a time.
  1449. min_step: The minimum step to start reading history from (inclusive).
  1450. max_step: The maximum step to read history up to (exclusive).
  1451. use_cache: When set to True, checks the WANDB_CACHE_DIR for a run history.
  1452. If the run history is not found in the cache, it will be downloaded from the server.
  1453. If set to False, the run history will be downloaded every time.
  1454. Returns:
  1455. A BetaHistoryScan object,
  1456. which can be iterator over to get history records.
  1457. """
  1458. if self._service_api is None:
  1459. settings = wandb_setup.singleton().settings.model_copy()
  1460. self._service_api = ServiceApi(settings=settings)
  1461. beta_history_scan = public.BetaHistoryScan(
  1462. service_api=self._service_api,
  1463. run=self,
  1464. min_step=min_step,
  1465. max_step=max_step or self.lastHistoryStep + 1,
  1466. keys=keys,
  1467. page_size=page_size,
  1468. use_cache=use_cache,
  1469. )
  1470. return beta_history_scan
  1471. def download_history_exports(
  1472. self,
  1473. download_dir: pathlib.Path | str,
  1474. require_complete_history: bool = True,
  1475. ) -> runhistory.DownloadHistoryResult:
  1476. """Download any parquet history files for the run to the provided directory.
  1477. Args:
  1478. download_dir: The directory to download the history files to.
  1479. require_complete_history: Whether to require the complete history to be downloaded.
  1480. If true, and the run contains data that has not been exported to parquet files yet,
  1481. an IncompleteRunHistoryError will be raised.
  1482. Returns:
  1483. A DownloadHistoryResult.
  1484. Raises:
  1485. IncompleteRunHistoryError: If require_complete_history is True
  1486. and the run contains data not yet exported to parquet files.
  1487. WandbApiFailedError: If the API request fails for reasons other than
  1488. incomplete history.
  1489. """
  1490. init_download_request = apb.DownloadRunHistoryInit(
  1491. entity=self.entity,
  1492. project=self.project,
  1493. run_id=self.id,
  1494. download_dir=str(download_dir),
  1495. require_complete_history=require_complete_history,
  1496. )
  1497. api_request = apb.ApiRequest(
  1498. read_run_history_request=apb.ReadRunHistoryRequest(
  1499. download_run_history_init=init_download_request,
  1500. )
  1501. )
  1502. if self._service_api is None:
  1503. settings = wandb_setup.singleton().settings.model_copy()
  1504. self._service_api = ServiceApi(settings=settings)
  1505. response: apb.ApiResponse
  1506. try:
  1507. response = self._service_api.send_api_request(api_request)
  1508. except WandbApiFailedError as e:
  1509. if (
  1510. e.response is not None
  1511. and e.response.error_type == apb.ErrorType.INCOMPLETE_RUN_HISTORY_ERROR
  1512. ):
  1513. raise runhistory.IncompleteRunHistoryError() from None
  1514. else:
  1515. raise WandbApiFailedError("Failed to download history") from e
  1516. contains_live_data = response.read_run_history_response.download_run_history_init.contains_live_data
  1517. request_id = (
  1518. response.read_run_history_response.download_run_history_init.request_id
  1519. )
  1520. return wandb_setup.singleton().asyncer.run(
  1521. lambda: runhistory.wait_for_download_with_progress(
  1522. self._service_api,
  1523. request_id,
  1524. contains_live_data,
  1525. )
  1526. )