| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211 |
- """Evaluation results utilities for the `.eval_results/*.yaml` format.
- See https://huggingface.co/docs/hub/eval-results for more details.
- Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml.
- """
- from dataclasses import dataclass
- from typing import Any
- @dataclass
- class EvalResultEntry:
- """
- Evaluation result entry for the `.eval_results/*.yaml` format.
- Represents evaluation scores stored in model repos that automatically appear on
- the model page and the benchmark dataset's leaderboard.
- For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead.
- See https://huggingface.co/docs/hub/eval-results for more details.
- Args:
- dataset_id (`str`):
- Benchmark dataset ID from the Hub. Example: "cais/hle", "Idavidrein/gpqa".
- task_id (`str`):
- Task identifier within the benchmark. Example: "gpqa_diamond".
- value (`Any`):
- The metric value. Example: 20.90.
- dataset_revision (`str`, *optional*):
- Git SHA of the benchmark dataset.
- verify_token (`str`, *optional*):
- A signature that can be used to prove that evaluation is provably auditable and reproducible.
- date (`str`, *optional*):
- When the evaluation was run (ISO-8601 datetime). Defaults to git commit time.
- source_url (`str`, *optional*):
- Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro). Required if `source_name`, `source_user`, or `source_org` is provided.
- source_name (`str`, *optional*):
- Display name for the source. Example: "Eval Logs".
- source_user (`str`, *optional*):
- HF user name for attribution. Example: "celinah".
- source_org (`str`, *optional*):
- HF org name for attribution. Example: "cais".
- notes (`str`, *optional*):
- Details about the evaluation setup. Example: "tools", "no-tools", "chain-of-thought".
- Example:
- ```python
- >>> from huggingface_hub import EvalResultEntry
- >>> # Minimal example with required fields only
- >>> result = EvalResultEntry(
- ... dataset_id="Idavidrein/gpqa",
- ... task_id="gpqa_diamond",
- ... value=0.412,
- ... )
- >>> # Full example with all fields
- >>> result = EvalResultEntry(
- ... dataset_id="cais/hle",
- ... task_id="default",
- ... value=20.90,
- ... dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
- ... verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
- ... date="2025-01-15T10:30:00Z",
- ... source_url="https://huggingface.co/datasets/cais/hle",
- ... source_name="CAIS HLE",
- ... source_org="cais",
- ... notes="no-tools",
- ... )
- ```
- """
- dataset_id: str
- task_id: str
- value: Any
- dataset_revision: str | None = None
- verify_token: str | None = None
- date: str | None = None
- source_url: str | None = None
- source_name: str | None = None
- source_user: str | None = None
- source_org: str | None = None
- notes: str | None = None
- def __post_init__(self) -> None:
- if (
- self.source_name is not None or self.source_user is not None or self.source_org is not None
- ) and self.source_url is None:
- raise ValueError(
- "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
- )
def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
    """Serialize [`EvalResultEntry`] objects into a YAML-ready list of dicts.

    The output follows the layout expected in `.eval_results/*.yaml` files. Optional
    fields set to `None` are left out, and the nested `source` object is only
    emitted when the entry has a `source_url`.

    Args:
        entries (`list[EvalResultEntry]`):
            List of evaluation result entries to serialize.

    Returns:
        `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML.

    Example:
        ```python
        >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
        ... ]
        >>> yaml_data = eval_result_entries_to_yaml(entries)
        >>> yaml_data[0]
        {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}
        ```

    To upload eval results to the Hub:
        ```python
        >>> import yaml
        >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ... ]
        >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
        >>> upload_file(
        ...     path_or_fileobj=yaml_content.encode(),
        ...     path_in_repo=".eval_results/hle.yaml",
        ...     repo_id="your-username/your-model",
        ... )
        ```
    """

    def _serialize(entry) -> dict[str, Any]:
        # Nested `dataset` object: id and task are always present, revision is optional.
        dataset_obj: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
        if entry.dataset_revision is not None:
            dataset_obj["revision"] = entry.dataset_revision

        payload: dict[str, Any] = {"dataset": dataset_obj, "value": entry.value}

        # Optional top-level keys that come before `source` in the output.
        for key, val in (("verifyToken", entry.verify_token), ("date", entry.date)):
            if val is not None:
                payload[key] = val

        # Nested `source` object: emitted only when a URL exists.
        if entry.source_url is not None:
            source_obj: dict[str, Any] = {"url": entry.source_url}
            attribution = (
                ("name", entry.source_name),
                ("user", entry.source_user),
                ("org", entry.source_org),
            )
            source_obj.update((key, val) for key, val in attribution if val is not None)
            payload["source"] = source_obj

        if entry.notes is not None:
            payload["notes"] = entry.notes
        return payload

    return [_serialize(entry) for entry in entries]
def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
    """Build [`EvalResultEntry`] objects from a list of plain dicts.

    This reads the `.eval_results/*.yaml` format. For the legacy `model-index` format,
    use [`model_index_to_eval_results`] instead.

    If an item has a `"data"` key, the value under that key is parsed as the entry;
    otherwise the item itself is parsed.

    Args:
        data (`list[dict[str, Any]]`):
            A list of dictionaries (e.g., parsed from YAML or API response).

    Returns:
        `list[EvalResultEntry]`: A list of evaluation result entry objects.

    Raises:
        KeyError: If an item lacks the dataset `id`, the `value`, or the dataset `task_id`.

    Example:
        ```python
        >>> from huggingface_hub import parse_eval_result_entries
        >>> data = [
        ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
        ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
        ... ]
        >>> entries = parse_eval_result_entries(data)
        >>> entries[0].dataset_id
        'cais/hle'
        >>> entries[0].value
        20.9
        ```
    """
    parsed = []
    for raw in data:
        # Unwrap the optional "data" envelope.
        payload = raw.get("data", raw)
        dataset_obj = payload.get("dataset", {})
        # A missing, null or empty `source` yields no source fields at all.
        source_obj = payload.get("source", {}) or {}

        # Required keys are looked up in this order: id, value, task_id.
        parsed.append(
            EvalResultEntry(
                dataset_id=dataset_obj["id"],
                value=payload["value"],
                task_id=dataset_obj["task_id"],
                dataset_revision=dataset_obj.get("revision"),
                verify_token=payload.get("verifyToken"),
                date=payload.get("date"),
                source_url=source_obj.get("url"),
                source_name=source_obj.get("name"),
                source_user=source_obj.get("user"),
                source_org=source_obj.get("org"),
                notes=payload.get("notes"),
            )
        )
    return parsed
|