| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- # Copyright 2026 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Contains commands to interact with datasets on the Hugging Face Hub.
- Usage:
- # list datasets on the Hub
- hf datasets ls
- # list datasets with a search query
- hf datasets ls --search "code"
- # get info about a dataset
- hf datasets info HuggingFaceFW/fineweb
- """
- import enum
- from typing import Annotated, get_args
- import typer
- from huggingface_hub._dataset_viewer import execute_raw_sql_query
- from huggingface_hub.errors import CLIError, RepositoryNotFoundError, RevisionNotFoundError
- from huggingface_hub.hf_api import DatasetSort_T, ExpandDatasetProperty_T
- from ._cli_utils import (
- AuthorOpt,
- FilterOpt,
- FormatWithAutoOpt,
- LimitOpt,
- RevisionOpt,
- SearchOpt,
- TokenOpt,
- api_object_to_dict,
- get_hf_api,
- make_expand_properties_parser,
- typer_factory,
- )
- from ._output import OutputFormatWithAuto, out
# Sorted type arguments of `ExpandDatasetProperty_T`; used both in the
# `--expand` help text and as the input of the expand-properties parser.
_EXPAND_PROPERTIES = sorted(get_args(ExpandDatasetProperty_T))

# Type arguments of `DatasetSort_T`, kept in their declaration order.
_SORT_OPTIONS = get_args(DatasetSort_T)

# String-mixin enum whose member names and values are both the sort option.
DatasetSortEnum = enum.Enum(  # type: ignore[misc]
    "DatasetSortEnum",
    dict(zip(_SORT_OPTIONS, _SORT_OPTIONS)),
    type=str,
)

# Shared `--expand` option; the help text is assembled from adjacent literals
# so the runtime string is unchanged while each sentence fits on one line.
ExpandOpt = Annotated[
    str | None,
    typer.Option(
        help=(
            "Comma-separated properties to return. "
            "When used, only the listed properties (and id) are returned. "
            "Example: '--expand=downloads,likes,tags'. "
            f"Valid: {', '.join(_EXPAND_PROPERTIES)}."
        ),
        callback=make_expand_properties_parser(_EXPAND_PROPERTIES),
    ),
]

# Typer application that the `hf datasets ...` commands below register on.
datasets_cli = typer_factory(help="Interact with datasets on the Hub.")
@datasets_cli.command(
    "list | ls",
    examples=[
        "hf datasets ls",
        "hf datasets ls --sort downloads --limit 10",
        'hf datasets ls --search "code"',
    ],
)
def datasets_ls(
    search: SearchOpt = None,
    author: AuthorOpt = None,
    filter: FilterOpt = None,
    sort: Annotated[
        DatasetSortEnum | None,
        typer.Option(help="Sort results."),
    ] = None,
    limit: LimitOpt = 10,
    expand: ExpandOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """List datasets on the Hub."""
    # NOTE(review): `format` is not read in this body; presumably the option
    # itself configures `out` (e.g. through a callback) - confirm in _cli_utils.
    api = get_hf_api(token=token)

    # `list_datasets` is given the enum member's value, or None when no sort
    # was requested.
    if sort:
        sort_key = sort.value
    else:
        sort_key = None

    datasets = api.list_datasets(
        filter=filter,
        author=author,
        search=search,
        sort=sort_key,
        limit=limit,
        expand=expand,  # type: ignore
    )

    # Convert every returned object to a dict before rendering the table.
    rows = list(map(api_object_to_dict, datasets))
    out.table(rows)
@datasets_cli.command(
    "info",
    examples=[
        "hf datasets info HuggingFaceFW/fineweb",
        "hf datasets info my-dataset --expand downloads,likes,tags",
    ],
)
def datasets_info(
    dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
    revision: RevisionOpt = None,
    expand: ExpandOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Get info about a dataset on the Hub."""
    # NOTE(review): `format` is not read in this body; presumably the option
    # itself configures `out` - confirm in _cli_utils.
    client = get_hf_api(token=token)
    try:
        dataset = client.dataset_info(
            repo_id=dataset_id,
            revision=revision,
            expand=expand,  # type: ignore
        )
    except RepositoryNotFoundError as err:
        # Re-raise as CLIError, keeping the original exception as the cause.
        raise CLIError(f"Dataset '{dataset_id}' not found.") from err
    except RevisionNotFoundError as err:
        raise CLIError(f"Revision '{revision}' not found on '{dataset_id}'.") from err
    else:
        # Only reached when the lookup succeeded.
        out.dict(dataset)
@datasets_cli.command(
    "parquet",
    examples=[
        "hf datasets parquet cfahlgren1/hub-stats",
        "hf datasets parquet cfahlgren1/hub-stats --subset models",
        "hf datasets parquet cfahlgren1/hub-stats --split train",
        "hf datasets parquet cfahlgren1/hub-stats --format json",
    ],
)
def datasets_parquet(
    dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
    subset: Annotated[str | None, typer.Option("--subset", help="Filter parquet entries by subset/config.")] = None,
    split: Annotated[str | None, typer.Option(help="Filter parquet entries by split.")] = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """List parquet file URLs available for a dataset."""
    # NOTE(review): `format` is not read in this body; presumably the option
    # itself configures `out` - confirm in _cli_utils.
    client = get_hf_api(token=token)

    # `subset` is forwarded to the API call as `config`; `split` is applied
    # locally on the returned entries.
    rows = []
    for item in client.list_dataset_parquet_files(repo_id=dataset_id, config=subset):
        if split is None or item.split == split:
            rows.append(
                {
                    "subset": item.config,
                    "split": item.split,
                    "url": item.url,
                    "size": item.size,
                }
            )

    out.table(rows, headers=["subset", "split", "url", "size"], id_key="url")
@datasets_cli.command(
    "sql",
    examples=[
        "hf datasets sql \"SELECT COUNT(*) AS rows FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet')\"",
        "hf datasets sql \"SELECT * FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet') LIMIT 5\" --format json",
    ],
)
def datasets_sql(
    sql: Annotated[str, typer.Argument(help="Raw SQL query to execute.")],
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Execute a raw SQL query with DuckDB against dataset parquet URLs."""
    # NOTE(review): `format` is not read in this body; presumably the option
    # itself configures `out` - confirm in _cli_utils.
    try:
        rows = execute_raw_sql_query(sql_query=sql, token=token)
    except ImportError as err:
        # Surface the ImportError's own message as a CLI error, keeping the
        # original exception as the cause.
        raise CLIError(str(err)) from err
    else:
        out.table(rows)
|