| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448 |
- """CLI commands for Hugging Face Inference Endpoints."""
- from typing import Annotated
- import typer
- from huggingface_hub._inference_endpoints import InferenceEndpointScalingMetric
- from huggingface_hub.errors import HfHubHTTPError
- from ._cli_utils import FormatWithAutoOpt, TokenOpt, get_hf_api, typer_factory
- from ._output import OutputFormatWithAuto, out
# Command group holding the endpoint commands defined in this module.
ie_cli = typer_factory(help="Manage Hugging Face Inference Endpoints.")

# Command group for the catalog commands; it is mounted on `ie_cli` under the
# name "catalog" further down in this module.
catalog_app = typer_factory(help="Interact with the Inference Endpoints catalog.")

# Shared parameter annotations, reused by the command signatures below so the
# help text for common parameters is declared only once.

# Endpoint name as a required positional argument.
NameArg = Annotated[str, typer.Argument(help="Endpoint name.")]

# Endpoint name as an optional `--name` style option.
NameOpt = Annotated[str | None, typer.Option(help="Endpoint name.")]

# Optional namespace option accepted by the endpoint commands.
NamespaceOpt = Annotated[
    str | None,
    typer.Option(
        help="The namespace associated with the Inference Endpoint. Defaults to the current user's namespace."
    ),
]
@ie_cli.command("list | ls", examples=["hf endpoints ls", "hf endpoints ls --namespace my-org"])
def ls(
    namespace: NamespaceOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Lists all Inference Endpoints for the given namespace."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.

    def _field(section, key):
        # Sections that are not dicts contribute an empty string.
        if isinstance(section, dict):
            return section.get(key, "")
        return ""

    def _to_row(raw):
        # Flatten one raw endpoint payload into the columns shown in the table.
        status_info = raw.get("status", {})
        model_info = raw.get("model", {})
        compute_info = raw.get("compute", {})
        provider_info = raw.get("provider", {})
        return {
            "name": raw.get("name", ""),
            "model": _field(model_info, "repository"),
            "status": _field(status_info, "state"),
            "task": _field(model_info, "task"),
            "framework": _field(model_info, "framework"),
            "instance": _field(compute_info, "instanceType"),
            "vendor": _field(provider_info, "vendor"),
            "region": _field(provider_info, "region"),
        }

    client = get_hf_api(token=token)
    try:
        endpoints = client.list_inference_endpoints(namespace=namespace, token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Listing failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    rows = [_to_row(item.raw) for item in endpoints]
    out.table(rows, id_key="name")
@ie_cli.command(name="deploy", examples=["hf endpoints deploy my-endpoint --repo gpt2 --framework pytorch ..."])
def deploy(
    name: NameArg,
    repo: Annotated[
        str,
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
        ),
    ],
    framework: Annotated[
        str,
        typer.Option(
            help="The machine learning framework used for the model (e.g. 'vllm').",
        ),
    ],
    accelerator: Annotated[
        str,
        typer.Option(
            help="The hardware accelerator to be used for inference (e.g. 'cpu').",
        ),
    ],
    instance_size: Annotated[
        str,
        typer.Option(
            help="The size or type of the instance to be used for hosting the model (e.g. 'x4').",
        ),
    ],
    instance_type: Annotated[
        str,
        typer.Option(
            help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').",
        ),
    ],
    region: Annotated[
        str,
        typer.Option(
            help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').",
        ),
    ],
    vendor: Annotated[
        str,
        typer.Option(
            help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').",
        ),
    ],
    *,
    namespace: NamespaceOpt = None,
    task: Annotated[
        str | None,
        typer.Option(
            help="The task on which to deploy the model (e.g. 'text-classification').",
        ),
    ] = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
    min_replica: Annotated[
        int,
        typer.Option(
            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
        ),
    ] = 1,
    max_replica: Annotated[
        int,
        typer.Option(
            help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
        ),
    ] = 1,
    scale_to_zero_timeout: Annotated[
        int | None,
        typer.Option(
            help="The duration in minutes before an inactive endpoint is scaled to zero.",
        ),
    ] = None,
    scaling_metric: Annotated[
        InferenceEndpointScalingMetric | None,
        typer.Option(
            help="The metric reference for scaling.",
        ),
    ] = None,
    scaling_threshold: Annotated[
        float | None,
        typer.Option(
            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
        ),
    ] = None,
) -> None:
    """Deploy an Inference Endpoint from a Hub repository."""
    # Every CLI parameter is forwarded unchanged to
    # `HfApi.create_inference_endpoint`; `repo` maps to its `repository` keyword.
    # On success the raw payload of the created endpoint is printed.
    #
    # Raises:
    #     typer.Exit: if the Hub answers with an HTTP error; the exit code is
    #         the HTTP status code of the failed response.
    #
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    api = get_hf_api(token=token)
    try:
        endpoint = api.create_inference_endpoint(
            name=name,
            repository=repo,
            framework=framework,
            accelerator=accelerator,
            instance_size=instance_size,
            instance_type=instance_type,
            region=region,
            vendor=vendor,
            namespace=namespace,
            task=task,
            token=token,
            min_replica=min_replica,
            max_replica=max_replica,
            scaling_metric=scaling_metric,
            scaling_threshold=scaling_threshold,
            scale_to_zero_timeout=scale_to_zero_timeout,
        )
    except HfHubHTTPError as error:
        # Handle HTTP failures the same way as the other commands in this
        # module: report through `out.error`, then exit with the status code.
        out.error(f"Deployment failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    out.dict(endpoint.raw)
@catalog_app.command(name="deploy", examples=["hf endpoints catalog deploy --repo meta-llama/Llama-3.2-1B-Instruct"])
def deploy_from_catalog(
    repo: Annotated[
        str,
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b')."
        ),
    ],
    name: NameOpt = None,
    accelerator: Annotated[
        str | None,
        typer.Option(help="The hardware accelerator to be used for inference (e.g. 'cpu', 'gpu', 'neuron')."),
    ] = None,
    namespace: NamespaceOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Deploy an Inference Endpoint from the Model Catalog."""
    # `repo` is passed to the API as `repo_id`; the other parameters keep their names.
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)
    try:
        created = client.create_inference_endpoint_from_catalog(
            repo_id=repo,
            name=name,
            accelerator=accelerator,
            namespace=namespace,
            token=token,
        )
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Deployment failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload of the newly created endpoint.
    out.dict(created.raw)
def list_catalog(
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """List available Catalog models."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)
    try:
        catalog_models = client.list_inference_catalog(token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Catalog fetch failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # The models are printed under a single "models" key.
    payload = {"models": catalog_models}
    out.dict(payload)


# The same function backs two commands: the visible `list | ls` on the catalog
# group and a hidden `list-catalog` registered directly on the endpoints group.
catalog_app.command(name="list | ls", examples=["hf endpoints catalog ls"])(list_catalog)
ie_cli.command(name="list-catalog", hidden=True)(list_catalog)

# Mount the catalog group on the endpoints group under the name "catalog".
ie_cli.add_typer(catalog_app, name="catalog")
@ie_cli.command(examples=["hf endpoints describe my-endpoint"])
def describe(
    name: NameArg,
    namespace: NamespaceOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Get information about an existing endpoint."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)
    try:
        found = client.get_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Fetch failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload of the fetched endpoint.
    out.dict(found.raw)
@ie_cli.command(examples=["hf endpoints update my-endpoint --min-replica 2"])
def update(
    name: NameArg,
    namespace: NamespaceOpt = None,
    repo: Annotated[
        str | None,
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b')."
        ),
    ] = None,
    accelerator: Annotated[
        str | None,
        typer.Option(help="The hardware accelerator to be used for inference (e.g. 'cpu')."),
    ] = None,
    instance_size: Annotated[
        str | None,
        typer.Option(help="The size or type of the instance to be used for hosting the model (e.g. 'x4')."),
    ] = None,
    instance_type: Annotated[
        str | None,
        typer.Option(
            help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl')."
        ),
    ] = None,
    framework: Annotated[
        str | None,
        typer.Option(help="The machine learning framework used for the model (e.g. 'custom')."),
    ] = None,
    revision: Annotated[
        str | None,
        typer.Option(
            help="The specific model revision to deploy on the Inference Endpoint (e.g. '6c0e6080953db56375760c0471a8c5f2929baf11')."
        ),
    ] = None,
    task: Annotated[
        str | None,
        typer.Option(help="The task on which to deploy the model (e.g. 'text-classification')."),
    ] = None,
    min_replica: Annotated[
        int | None,
        typer.Option(
            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint."
        ),
    ] = None,
    max_replica: Annotated[
        int | None,
        typer.Option(help="The maximum number of replicas (instances) to scale to for the Inference Endpoint."),
    ] = None,
    scale_to_zero_timeout: Annotated[
        int | None,
        typer.Option(help="The duration in minutes before an inactive endpoint is scaled to zero."),
    ] = None,
    scaling_metric: Annotated[
        InferenceEndpointScalingMetric | None,
        typer.Option(help="The metric reference for scaling."),
    ] = None,
    scaling_threshold: Annotated[
        float | None,
        typer.Option(
            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided."
        ),
    ] = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Update an existing endpoint."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)

    # Every option is forwarded as-is (including those left at None); `repo` is
    # passed to the API under the `repository` keyword.
    request = {
        "name": name,
        "namespace": namespace,
        "repository": repo,
        "framework": framework,
        "revision": revision,
        "task": task,
        "accelerator": accelerator,
        "instance_size": instance_size,
        "instance_type": instance_type,
        "min_replica": min_replica,
        "max_replica": max_replica,
        "scale_to_zero_timeout": scale_to_zero_timeout,
        "scaling_metric": scaling_metric,
        "scaling_threshold": scaling_threshold,
        "token": token,
    }
    try:
        updated = client.update_inference_endpoint(**request)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Update failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload of the updated endpoint.
    out.dict(updated.raw)
@ie_cli.command(examples=["hf endpoints delete my-endpoint"])
def delete(
    name: NameArg,
    namespace: NamespaceOpt = None,
    yes: Annotated[bool, typer.Option("--yes", help="Skip confirmation prompts.")] = False,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Delete an Inference Endpoint permanently."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.

    # Confirmation runs before the API client is created; `--yes` is forwarded to it.
    out.confirm(f"Delete endpoint '{name}'?", yes=yes)

    client = get_hf_api(token=token)
    try:
        client.delete_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Delete failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err
    else:
        out.result(f"Deleted '{name}'.", name=name)
@ie_cli.command(examples=["hf endpoints pause my-endpoint"])
def pause(
    name: NameArg,
    namespace: NamespaceOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Pause an Inference Endpoint."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)
    try:
        paused = client.pause_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Pause failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload returned by the pause call.
    out.dict(paused.raw)
@ie_cli.command(examples=["hf endpoints resume my-endpoint"])
def resume(
    name: NameArg,
    namespace: NamespaceOpt = None,
    fail_if_already_running: Annotated[
        bool,
        typer.Option(
            "--fail-if-already-running",
            help="If `True`, the method will raise an error if the Inference Endpoint is already running.",
        ),
    ] = False,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Resume an Inference Endpoint."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)

    # The CLI flag is the logical inverse of the API's `running_ok` keyword.
    tolerate_running = not fail_if_already_running
    try:
        resumed = client.resume_inference_endpoint(
            name=name,
            namespace=namespace,
            token=token,
            running_ok=tolerate_running,
        )
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Resume failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload returned by the resume call.
    out.dict(resumed.raw)
@ie_cli.command(examples=["hf endpoints scale-to-zero my-endpoint"])
def scale_to_zero(
    name: NameArg,
    namespace: NamespaceOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Scale an Inference Endpoint to zero."""
    # NOTE(review): `format` is not read in this body; presumably it is consumed
    # by the option machinery behind `FormatWithAutoOpt` - confirm.
    client = get_hf_api(token=token)
    try:
        scaled = client.scale_to_zero_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as http_err:
        # Report the failure, then exit using the HTTP status as the exit code.
        out.error(f"Scale To Zero failed: {http_err}")
        status_code = http_err.response.status_code
        raise typer.Exit(code=status_code) from http_err

    # Print the raw payload returned by the scale-to-zero call.
    out.dict(scaled.raw)
|