| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391 |
"""W&B Public API for File objects.

This module provides classes for interacting with files stored in W&B.

Example:
```python
from wandb.apis.public import Api

# Get files from a specific run
run = Api().run("entity/project/run_id")
files = run.files()

# Work with files
for file in files:
    print(f"File: {file.name}")
    print(f"Size: {file.size} bytes")
    print(f"Type: {file.mimetype}")

    # Download file
    if file.size < 1000000:  # Less than 1MB
        file.download(root="./downloads")

    # Get S3 URI for large files
    if file.size >= 1000000:
        print(f"S3 URI: {file.path_uri}")
```

Note:
    This module is part of the W&B Public API and provides methods to access,
    download, and manage files stored in W&B. Files are typically associated
    with specific runs and can include model weights, datasets, visualizations,
    and other artifacts.
"""
- from __future__ import annotations
- import io
- import os
- from typing import TYPE_CHECKING, Any, Callable
- from wandb_gql import gql
- from wandb_gql.client import RetryError
- import wandb
- from wandb._strutils import nameof
- from wandb.apis.attrs import Attrs
- from wandb.apis.normalize import normalize_exceptions
- from wandb.apis.paginator import SizedPaginator
- from wandb.apis.public import utils
- from wandb.apis.public.const import RETRY_TIMEDELTA
- from wandb.apis.public.runs import Run
- from wandb.sdk.lib import retry
- from wandb.util import POW_2_BYTES, download_file_from_url, no_retry_auth, to_human_size
- if TYPE_CHECKING:
- from wandb_graphql.language.ast import Document
- from wandb.apis.public import Api, RetryingClient
# GraphQL fragment selecting one page of a run's files. It is interpolated
# into the `RunFiles` query, which must declare the variables used here:
# $fileNames, $fileCursor, $fileLimit, $pattern and $upload.
FILE_FRAGMENT = """fragment RunFilesFragment on Run {
    files(names: $fileNames, after: $fileCursor, first: $fileLimit, pattern: $pattern) {
        edges {
            node {
                id
                name
                url(upload: $upload)
                directUrl
                sizeBytes
                mimetype
                updatedAt
                md5
            }
            cursor
        }
        pageInfo {
            endCursor
            hasNextPage
        }
    }
}"""
class Files(SizedPaginator["File"]):
    """A lazy iterator over a collection of `File` objects.

    Access and manage files uploaded to W&B during a run. Handles pagination
    automatically when iterating through large collections of files.

    Example:
    ```python
    from wandb.apis.public.files import Files
    from wandb.apis.public.api import Api

    api = Api()

    # Example run object
    run = api.run("entity/project/run-id")

    # Create a Files object to iterate over files in the run
    files = Files(api.client, run)

    # Iterate over files
    for file in files:
        print(file.name)
        print(file.url)
        print(file.size)

        # Download the file
        file.download(root="download_directory", replace=True)
    ```
    """

    def _get_query(self) -> Document:
        """Build the GraphQL document that fetches one page of the run's files.

        The query text is fixed; it embeds `FILE_FRAGMENT` and is parsed with
        `gql` each time this method is called.
        """
        return gql(
            f"""
            query RunFiles($project: String!, $entity: String!, $name: String!, $fileCursor: String,
                $fileLimit: Int = 50, $fileNames: [String] = [], $upload: Boolean = false, $pattern: String) {{
                project(name: $project, entityName: $entity) {{
                    internalId
                    run(name: $name) {{
                        fileCount
                        ...RunFilesFragment
                    }}
                }}
            }}
            {FILE_FRAGMENT}
            """
        )

    def __init__(
        self,
        client: RetryingClient,
        run: Run,
        names: list[str] | None = None,
        per_page: int = 50,
        upload: bool = False,
        pattern: str | None = None,
    ):
        """Initialize a lazy iterator over a collection of `File` objects.

        Files are retrieved in pages from the W&B server as needed.

        Args:
            client: The client used to execute GraphQL queries against W&B
            run: The run object that contains the files
            names (list, optional): A list of file names to filter the files
            per_page (int, optional): The number of files to fetch per page
            upload (bool, optional): If `True`, fetch the upload URL for each file
            pattern (str, optional): Pattern to match when returning files from W&B
                This pattern uses MySQL's LIKE syntax,
                so matching all files that end with .json would be "%.json".
                If both names and pattern are provided, a ValueError will be raised.

        Raises:
            ValueError: If both `names` and `pattern` are provided.
        """
        if names and pattern:
            raise ValueError(
                "Querying for files by both names and pattern is not supported."
                " Please provide either a list of names or a pattern to match.",
            )

        self.run = run
        variables = {
            "project": run.project,
            "entity": run.entity,
            "name": run.id,
            "fileNames": names or [],
            "upload": upload,
            "pattern": pattern,
        }
        super().__init__(client, variables, per_page)

    def _update_response(self) -> None:
        """Execute the files query with the current variables and store the result."""
        self.last_response = self.client.execute(
            self._get_query(), variable_values=self.variables
        )

    @property
    def _length(self) -> int:
        """Returns total number of files.

        Loads the first page if no response has been fetched yet, then reads
        the run's `fileCount` from the last response.

        <!-- lazydoc-ignore: internal -->
        """
        if not self.last_response:
            self._load_page()

        return self.last_response["project"]["run"]["fileCount"]

    @property
    def more(self) -> bool:
        """Returns whether there are more files to fetch.

        Before any page has been fetched this is always `True`.

        <!-- lazydoc-ignore: internal -->
        """
        if self.last_response:
            return self.last_response["project"]["run"]["files"]["pageInfo"][
                "hasNextPage"
            ]
        else:
            return True

    @property
    def cursor(self) -> str | None:
        """Returns the cursor position for pagination of file results.

        Returns `None` before the first page has been fetched. Otherwise
        returns the cursor of the last edge in the most recent page.

        <!-- lazydoc-ignore: internal -->
        """
        if not self.last_response:
            return None

        files = self.last_response["project"]["run"]["files"]
        edges = files["edges"]
        if edges:
            return edges[-1]["cursor"]

        # A page with no edges (e.g. a run without files) has no last edge to
        # read from; use the page-level end cursor instead of raising
        # IndexError.
        return files["pageInfo"].get("endCursor")

    def update_variables(self) -> None:
        """Updates the GraphQL query variables for pagination.

        <!-- lazydoc-ignore: internal -->
        """
        self.variables.update({"fileLimit": self.per_page, "fileCursor": self.cursor})

    def convert_objects(self) -> list[File]:
        """Converts GraphQL edges to File objects.

        <!-- lazydoc-ignore: internal -->
        """
        return [
            File(self.client, r["node"], self.run)
            for r in self.last_response["project"]["run"]["files"]["edges"]
        ]

    def __repr__(self) -> str:
        return f"<{nameof(type(self))} {'/'.join(self.run.path)} ({len(self)})>"
class File(Attrs):
    """File saved to W&B.

    Represents a single file stored in W&B. Includes access to file metadata.
    Files are associated with a specific run and
    can include text files, model weights, datasets, visualizations, and other
    artifacts. You can download the file, delete the file, and access file
    properties.

    Specify one or more attributes in a dictionary to find a specific
    file logged to a specific run. You can search using the following keys:

    - id (str): The ID of the file
    - name (str): Name of the file
    - url (str): path to file
    - direct_url (str): path to file in the bucket
    - sizeBytes (int): size of file in bytes
    - md5 (str): md5 of file
    - mimetype (str): mimetype of file
    - updated_at (str): timestamp of last update
    - path_uri (str): path to file in the bucket, currently only available for S3 objects and reference files

    Args:
        client: The client used to execute GraphQL requests against W&B
        attrs (dict): A dictionary of attributes that define the file
        run: The run object that contains the file

    <!-- lazydoc-ignore-init: internal -->
    """

    def __init__(
        self,
        client: RetryingClient,
        attrs: dict[str, Any],
        run: Run | None = None,
    ):
        self.client = client
        self._attrs = attrs
        self.run = run
        # Built lazily on the first `download()` call; see
        # `_build_download_wrapper`.
        self._download_decorated: Callable[..., Any] | None = None
        super().__init__(dict(attrs))

    @property
    def size(self) -> int:
        """Returns the size of the file in bytes.

        Returns 0 when the `sizeBytes` attribute is `None`.
        """
        size_bytes = self._attrs["sizeBytes"]
        if size_bytes is not None:
            return int(size_bytes)
        return 0

    @property
    def path_uri(self) -> str:
        """Returns the URI path to the file in the storage bucket.

        Returns:
            str: The S3 URI (e.g., 's3://bucket/path/to/file') if the file is stored in S3,
                 the direct URL if it's a reference file, or an empty string if unavailable.
        """
        if not (direct_url := self._attrs.get("directUrl")):
            wandb.termwarn("Unable to find direct_url of file")
            return ""

        # For reference files, both the directUrl and the url are just the path to the file in the bucket
        if direct_url == self._attrs.get("url"):
            return direct_url

        try:
            return utils.parse_s3_url_to_s3_uri(direct_url)
        except ValueError:
            wandb.termwarn("path_uri is only available for files stored in S3")
            return ""

    def _build_download_wrapper(self) -> Callable[..., io.TextIOWrapper]:
        """Build the retrying download function used by `download()`.

        `requests` is imported here, inside the function, because its
        exception type is needed as an argument to the retry decorator.
        """
        import requests

        @retry.retriable(
            retry_timedelta=RETRY_TIMEDELTA,
            check_retry_fn=no_retry_auth,
            retryable_exceptions=(RetryError, requests.RequestException),
        )
        def _impl(
            root: str = ".",
            replace: bool = False,
            exist_ok: bool = False,
            api: Api | None = None,
        ) -> io.TextIOWrapper:
            if api is None:
                api = wandb.Api()

            path = os.path.join(root, self.name)
            if os.path.exists(path) and not replace:
                if exist_ok:
                    return open(path)
                raise ValueError(
                    "File already exists, pass replace=True to overwrite "
                    "or exist_ok=True to leave it as is and don't error."
                )

            download_file_from_url(path, self.url, api.api_key)
            return open(path)

        return _impl

    @normalize_exceptions
    def download(
        self,
        root: str = ".",
        replace: bool = False,
        exist_ok: bool = False,
        api: Api | None = None,
    ) -> io.TextIOWrapper:
        """Downloads a file previously saved by a run from the wandb server.

        Args:
            root: Local directory to save the file. Defaults to the
                current working directory (".").
            replace: If `True`, download will overwrite a local file
                if it exists. Defaults to `False`.
            exist_ok: If `True`, will not raise ValueError if file already
                exists and will not re-download unless replace=True.
                Defaults to `False`.
            api: If specified, the `Api` instance used to download the file.

        Returns:
            The downloaded (or already existing) local file, opened for
            reading in text mode.

        Raises:
            `ValueError` if file already exists, `replace=False` and
            `exist_ok=False`.
        """
        if self._download_decorated is None:
            self._download_decorated = self._build_download_wrapper()

        return self._download_decorated(root, replace, exist_ok, api)

    @normalize_exceptions
    def delete(self) -> None:
        """Delete the file from the W&B server."""
        # `run` is optional at construction time. Without it there is no
        # project ID to send, so pass null (the mutation declares
        # `$projectId: Int` as nullable) instead of failing with
        # AttributeError on `None`.
        project_id = None if self.run is None else self.run._project_internal_id

        variable_values = {
            "files": [self.id],
            "projectId": project_id,
        }

        mutation = gql("""
            mutation deleteFiles($files: [ID!]!, $projectId: Int) {
                deleteFiles(input: {
                    files: $files
                    projectId: $projectId
                }) {
                    success
                }
            }
            """)

        self.client.execute(
            mutation,
            variable_values=variable_values,
        )

    def __repr__(self) -> str:
        classname = nameof(type(self))
        size = to_human_size(self.size, units=POW_2_BYTES)
        return f"<{classname} {self.name} ({self.mimetype}) {size}>"
|