datasets.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. # Copyright 2026 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Contains commands to interact with datasets on the Hugging Face Hub.
  15. Usage:
  16. # list datasets on the Hub
  17. hf datasets ls
  18. # list datasets with a search query
  19. hf datasets ls --search "code"
  20. # get info about a dataset
  21. hf datasets info HuggingFaceFW/fineweb
  22. """
  23. import enum
  24. from typing import Annotated, get_args
  25. import typer
  26. from huggingface_hub._dataset_viewer import execute_raw_sql_query
  27. from huggingface_hub.errors import CLIError, RepositoryNotFoundError, RevisionNotFoundError
  28. from huggingface_hub.hf_api import DatasetSort_T, ExpandDatasetProperty_T
  29. from ._cli_utils import (
  30. AuthorOpt,
  31. FilterOpt,
  32. FormatWithAutoOpt,
  33. LimitOpt,
  34. RevisionOpt,
  35. SearchOpt,
  36. TokenOpt,
  37. api_object_to_dict,
  38. get_hf_api,
  39. make_expand_properties_parser,
  40. typer_factory,
  41. )
  42. from ._output import OutputFormatWithAuto, out
  43. _EXPAND_PROPERTIES = sorted(get_args(ExpandDatasetProperty_T))
  44. _SORT_OPTIONS = get_args(DatasetSort_T)
  45. DatasetSortEnum = enum.Enum("DatasetSortEnum", {s: s for s in _SORT_OPTIONS}, type=str) # type: ignore[misc]
  46. ExpandOpt = Annotated[
  47. str | None,
  48. typer.Option(
  49. help=f"Comma-separated properties to return. When used, only the listed properties (and id) are returned. Example: '--expand=downloads,likes,tags'. Valid: {', '.join(_EXPAND_PROPERTIES)}.",
  50. callback=make_expand_properties_parser(_EXPAND_PROPERTIES),
  51. ),
  52. ]
  53. datasets_cli = typer_factory(help="Interact with datasets on the Hub.")
  54. @datasets_cli.command(
  55. "list | ls",
  56. examples=[
  57. "hf datasets ls",
  58. "hf datasets ls --sort downloads --limit 10",
  59. 'hf datasets ls --search "code"',
  60. ],
  61. )
  62. def datasets_ls(
  63. search: SearchOpt = None,
  64. author: AuthorOpt = None,
  65. filter: FilterOpt = None,
  66. sort: Annotated[
  67. DatasetSortEnum | None,
  68. typer.Option(help="Sort results."),
  69. ] = None,
  70. limit: LimitOpt = 10,
  71. expand: ExpandOpt = None,
  72. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  73. token: TokenOpt = None,
  74. ) -> None:
  75. """List datasets on the Hub."""
  76. api = get_hf_api(token=token)
  77. sort_key = sort.value if sort else None
  78. results = [
  79. api_object_to_dict(dataset_info)
  80. for dataset_info in api.list_datasets(
  81. filter=filter,
  82. author=author,
  83. search=search,
  84. sort=sort_key,
  85. limit=limit,
  86. expand=expand, # type: ignore
  87. )
  88. ]
  89. out.table(results)
  90. @datasets_cli.command(
  91. "info",
  92. examples=[
  93. "hf datasets info HuggingFaceFW/fineweb",
  94. "hf datasets info my-dataset --expand downloads,likes,tags",
  95. ],
  96. )
  97. def datasets_info(
  98. dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
  99. revision: RevisionOpt = None,
  100. expand: ExpandOpt = None,
  101. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  102. token: TokenOpt = None,
  103. ) -> None:
  104. """Get info about a dataset on the Hub."""
  105. api = get_hf_api(token=token)
  106. try:
  107. info = api.dataset_info(repo_id=dataset_id, revision=revision, expand=expand) # type: ignore
  108. except RepositoryNotFoundError as e:
  109. raise CLIError(f"Dataset '{dataset_id}' not found.") from e
  110. except RevisionNotFoundError as e:
  111. raise CLIError(f"Revision '{revision}' not found on '{dataset_id}'.") from e
  112. out.dict(info)
  113. @datasets_cli.command(
  114. "parquet",
  115. examples=[
  116. "hf datasets parquet cfahlgren1/hub-stats",
  117. "hf datasets parquet cfahlgren1/hub-stats --subset models",
  118. "hf datasets parquet cfahlgren1/hub-stats --split train",
  119. "hf datasets parquet cfahlgren1/hub-stats --format json",
  120. ],
  121. )
  122. def datasets_parquet(
  123. dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
  124. subset: Annotated[str | None, typer.Option("--subset", help="Filter parquet entries by subset/config.")] = None,
  125. split: Annotated[str | None, typer.Option(help="Filter parquet entries by split.")] = None,
  126. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  127. token: TokenOpt = None,
  128. ) -> None:
  129. """List parquet file URLs available for a dataset."""
  130. api = get_hf_api(token=token)
  131. entries = api.list_dataset_parquet_files(repo_id=dataset_id, config=subset)
  132. filtered = [entry for entry in entries if split is None or entry.split == split]
  133. results = [
  134. {"subset": entry.config, "split": entry.split, "url": entry.url, "size": entry.size} for entry in filtered
  135. ]
  136. out.table(results, headers=["subset", "split", "url", "size"], id_key="url")
  137. @datasets_cli.command(
  138. "sql",
  139. examples=[
  140. "hf datasets sql \"SELECT COUNT(*) AS rows FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet')\"",
  141. "hf datasets sql \"SELECT * FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet') LIMIT 5\" --format json",
  142. ],
  143. )
  144. def datasets_sql(
  145. sql: Annotated[str, typer.Argument(help="Raw SQL query to execute.")],
  146. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  147. token: TokenOpt = None,
  148. ) -> None:
  149. """Execute a raw SQL query with DuckDB against dataset parquet URLs."""
  150. try:
  151. result = execute_raw_sql_query(sql_query=sql, token=token)
  152. except ImportError as e:
  153. raise CLIError(str(e)) from e
  154. out.table(result)