# Source file: _eval_results.py (7.9 KB)
"""Evaluation results utilities for the `.eval_results/*.yaml` format.

See https://huggingface.co/docs/hub/eval-results for more details.
Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml.
"""
  5. from dataclasses import dataclass
  6. from typing import Any
  7. @dataclass
  8. class EvalResultEntry:
  9. """
  10. Evaluation result entry for the `.eval_results/*.yaml` format.
  11. Represents evaluation scores stored in model repos that automatically appear on
  12. the model page and the benchmark dataset's leaderboard.
  13. For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead.
  14. See https://huggingface.co/docs/hub/eval-results for more details.
  15. Args:
  16. dataset_id (`str`):
  17. Benchmark dataset ID from the Hub. Example: "cais/hle", "Idavidrein/gpqa".
  18. task_id (`str`):
  19. Task identifier within the benchmark. Example: "gpqa_diamond".
  20. value (`Any`):
  21. The metric value. Example: 20.90.
  22. dataset_revision (`str`, *optional*):
  23. Git SHA of the benchmark dataset.
  24. verify_token (`str`, *optional*):
  25. A signature that can be used to prove that evaluation is provably auditable and reproducible.
  26. date (`str`, *optional*):
  27. When the evaluation was run (ISO-8601 datetime). Defaults to git commit time.
  28. source_url (`str`, *optional*):
  29. Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro). Required if `source_name`, `source_user`, or `source_org` is provided.
  30. source_name (`str`, *optional*):
  31. Display name for the source. Example: "Eval Logs".
  32. source_user (`str`, *optional*):
  33. HF user name for attribution. Example: "celinah".
  34. source_org (`str`, *optional*):
  35. HF org name for attribution. Example: "cais".
  36. notes (`str`, *optional*):
  37. Details about the evaluation setup. Example: "tools", "no-tools", "chain-of-thought".
  38. Example:
  39. ```python
  40. >>> from huggingface_hub import EvalResultEntry
  41. >>> # Minimal example with required fields only
  42. >>> result = EvalResultEntry(
  43. ... dataset_id="Idavidrein/gpqa",
  44. ... task_id="gpqa_diamond",
  45. ... value=0.412,
  46. ... )
  47. >>> # Full example with all fields
  48. >>> result = EvalResultEntry(
  49. ... dataset_id="cais/hle",
  50. ... task_id="default",
  51. ... value=20.90,
  52. ... dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
  53. ... verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
  54. ... date="2025-01-15T10:30:00Z",
  55. ... source_url="https://huggingface.co/datasets/cais/hle",
  56. ... source_name="CAIS HLE",
  57. ... source_org="cais",
  58. ... notes="no-tools",
  59. ... )
  60. ```
  61. """
  62. dataset_id: str
  63. task_id: str
  64. value: Any
  65. dataset_revision: str | None = None
  66. verify_token: str | None = None
  67. date: str | None = None
  68. source_url: str | None = None
  69. source_name: str | None = None
  70. source_user: str | None = None
  71. source_org: str | None = None
  72. notes: str | None = None
  73. def __post_init__(self) -> None:
  74. if (
  75. self.source_name is not None or self.source_user is not None or self.source_org is not None
  76. ) and self.source_url is None:
  77. raise ValueError(
  78. "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
  79. )
  80. def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
  81. """Convert a list of [`EvalResultEntry`] objects to a YAML-serializable list of dicts.
  82. This produces the format expected in `.eval_results/*.yaml` files.
  83. Args:
  84. entries (`list[EvalResultEntry]`):
  85. List of evaluation result entries to serialize.
  86. Returns:
  87. `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML.
  88. Example:
  89. ```python
  90. >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
  91. >>> entries = [
  92. ... EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
  93. ... EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
  94. ... ]
  95. >>> yaml_data = eval_result_entries_to_yaml(entries)
  96. >>> yaml_data[0]
  97. {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}
  98. ```
  99. To upload eval results to the Hub:
  100. ```python
  101. >>> import yaml
  102. >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
  103. >>> entries = [
  104. ... EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
  105. ... ]
  106. >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
  107. >>> upload_file(
  108. ... path_or_fileobj=yaml_content.encode(),
  109. ... path_in_repo=".eval_results/hle.yaml",
  110. ... repo_id="your-username/your-model",
  111. ... )
  112. ```
  113. """
  114. result = []
  115. for entry in entries:
  116. # build the dataset object
  117. dataset: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
  118. if entry.dataset_revision is not None:
  119. dataset["revision"] = entry.dataset_revision
  120. data: dict[str, Any] = {"dataset": dataset, "value": entry.value}
  121. if entry.verify_token is not None:
  122. data["verifyToken"] = entry.verify_token
  123. if entry.date is not None:
  124. data["date"] = entry.date
  125. # build the source object
  126. if entry.source_url is not None:
  127. source: dict[str, Any] = {"url": entry.source_url}
  128. if entry.source_name is not None:
  129. source["name"] = entry.source_name
  130. if entry.source_user is not None:
  131. source["user"] = entry.source_user
  132. if entry.source_org is not None:
  133. source["org"] = entry.source_org
  134. data["source"] = source
  135. if entry.notes is not None:
  136. data["notes"] = entry.notes
  137. result.append(data)
  138. return result
  139. def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
  140. """Parse a list of dicts into [`EvalResultEntry`] objects.
  141. This parses the `.eval_results/*.yaml` format. For the legacy `model-index` format,
  142. use [`model_index_to_eval_results`] instead.
  143. Args:
  144. data (`list[dict[str, Any]]`):
  145. A list of dictionaries (e.g., parsed from YAML or API response).
  146. Returns:
  147. `list[EvalResultEntry]`: A list of evaluation result entry objects.
  148. Example:
  149. ```python
  150. >>> from huggingface_hub import parse_eval_result_entries
  151. >>> data = [
  152. ... {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
  153. ... {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
  154. ... ]
  155. >>> entries = parse_eval_result_entries(data)
  156. >>> entries[0].dataset_id
  157. 'cais/hle'
  158. >>> entries[0].value
  159. 20.9
  160. ```
  161. """
  162. entries = []
  163. for item in data:
  164. entry_data = item.get("data", item)
  165. dataset = entry_data.get("dataset", {})
  166. source = entry_data.get("source", {})
  167. entry = EvalResultEntry(
  168. dataset_id=dataset["id"],
  169. value=entry_data["value"],
  170. task_id=dataset["task_id"],
  171. dataset_revision=dataset.get("revision"),
  172. verify_token=entry_data.get("verifyToken"),
  173. date=entry_data.get("date"),
  174. source_url=source.get("url") if source else None,
  175. source_name=source.get("name") if source else None,
  176. source_user=source.get("user") if source else None,
  177. source_org=source.get("org") if source else None,
  178. notes=entry_data.get("notes"),
  179. )
  180. entries.append(entry)
  181. return entries