inference_endpoints.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. """CLI commands for Hugging Face Inference Endpoints."""
  2. from typing import Annotated
  3. import typer
  4. from huggingface_hub._inference_endpoints import InferenceEndpointScalingMetric
  5. from huggingface_hub.errors import HfHubHTTPError
  6. from ._cli_utils import FormatWithAutoOpt, TokenOpt, get_hf_api, typer_factory
  7. from ._output import OutputFormatWithAuto, out
  8. ie_cli = typer_factory(help="Manage Hugging Face Inference Endpoints.")
  9. catalog_app = typer_factory(help="Interact with the Inference Endpoints catalog.")
  10. NameArg = Annotated[
  11. str,
  12. typer.Argument(help="Endpoint name."),
  13. ]
  14. NameOpt = Annotated[
  15. str | None,
  16. typer.Option(help="Endpoint name."),
  17. ]
  18. NamespaceOpt = Annotated[
  19. str | None,
  20. typer.Option(
  21. help="The namespace associated with the Inference Endpoint. Defaults to the current user's namespace.",
  22. ),
  23. ]
  24. @ie_cli.command("list | ls", examples=["hf endpoints ls", "hf endpoints ls --namespace my-org"])
  25. def ls(
  26. namespace: NamespaceOpt = None,
  27. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  28. token: TokenOpt = None,
  29. ) -> None:
  30. """Lists all Inference Endpoints for the given namespace."""
  31. api = get_hf_api(token=token)
  32. try:
  33. endpoints = api.list_inference_endpoints(namespace=namespace, token=token)
  34. except HfHubHTTPError as error:
  35. out.error(f"Listing failed: {error}")
  36. raise typer.Exit(code=error.response.status_code) from error
  37. results = []
  38. for endpoint in endpoints:
  39. raw = endpoint.raw
  40. status = raw.get("status", {})
  41. model = raw.get("model", {})
  42. compute = raw.get("compute", {})
  43. provider = raw.get("provider", {})
  44. results.append(
  45. {
  46. "name": raw.get("name", ""),
  47. "model": model.get("repository", "") if isinstance(model, dict) else "",
  48. "status": status.get("state", "") if isinstance(status, dict) else "",
  49. "task": model.get("task", "") if isinstance(model, dict) else "",
  50. "framework": model.get("framework", "") if isinstance(model, dict) else "",
  51. "instance": compute.get("instanceType", "") if isinstance(compute, dict) else "",
  52. "vendor": provider.get("vendor", "") if isinstance(provider, dict) else "",
  53. "region": provider.get("region", "") if isinstance(provider, dict) else "",
  54. }
  55. )
  56. out.table(results, id_key="name")
  57. @ie_cli.command(name="deploy", examples=["hf endpoints deploy my-endpoint --repo gpt2 --framework pytorch ..."])
  58. def deploy(
  59. name: NameArg,
  60. repo: Annotated[
  61. str,
  62. typer.Option(
  63. help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
  64. ),
  65. ],
  66. framework: Annotated[
  67. str,
  68. typer.Option(
  69. help="The machine learning framework used for the model (e.g. 'vllm').",
  70. ),
  71. ],
  72. accelerator: Annotated[
  73. str,
  74. typer.Option(
  75. help="The hardware accelerator to be used for inference (e.g. 'cpu').",
  76. ),
  77. ],
  78. instance_size: Annotated[
  79. str,
  80. typer.Option(
  81. help="The size or type of the instance to be used for hosting the model (e.g. 'x4').",
  82. ),
  83. ],
  84. instance_type: Annotated[
  85. str,
  86. typer.Option(
  87. help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').",
  88. ),
  89. ],
  90. region: Annotated[
  91. str,
  92. typer.Option(
  93. help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').",
  94. ),
  95. ],
  96. vendor: Annotated[
  97. str,
  98. typer.Option(
  99. help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').",
  100. ),
  101. ],
  102. *,
  103. namespace: NamespaceOpt = None,
  104. task: Annotated[
  105. str | None,
  106. typer.Option(
  107. help="The task on which to deploy the model (e.g. 'text-classification').",
  108. ),
  109. ] = None,
  110. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  111. token: TokenOpt = None,
  112. min_replica: Annotated[
  113. int,
  114. typer.Option(
  115. help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
  116. ),
  117. ] = 1,
  118. max_replica: Annotated[
  119. int,
  120. typer.Option(
  121. help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
  122. ),
  123. ] = 1,
  124. scale_to_zero_timeout: Annotated[
  125. int | None,
  126. typer.Option(
  127. help="The duration in minutes before an inactive endpoint is scaled to zero.",
  128. ),
  129. ] = None,
  130. scaling_metric: Annotated[
  131. InferenceEndpointScalingMetric | None,
  132. typer.Option(
  133. help="The metric reference for scaling.",
  134. ),
  135. ] = None,
  136. scaling_threshold: Annotated[
  137. float | None,
  138. typer.Option(
  139. help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
  140. ),
  141. ] = None,
  142. ) -> None:
  143. """Deploy an Inference Endpoint from a Hub repository."""
  144. api = get_hf_api(token=token)
  145. endpoint = api.create_inference_endpoint(
  146. name=name,
  147. repository=repo,
  148. framework=framework,
  149. accelerator=accelerator,
  150. instance_size=instance_size,
  151. instance_type=instance_type,
  152. region=region,
  153. vendor=vendor,
  154. namespace=namespace,
  155. task=task,
  156. token=token,
  157. min_replica=min_replica,
  158. max_replica=max_replica,
  159. scaling_metric=scaling_metric,
  160. scaling_threshold=scaling_threshold,
  161. scale_to_zero_timeout=scale_to_zero_timeout,
  162. )
  163. out.dict(endpoint.raw)
  164. @catalog_app.command(name="deploy", examples=["hf endpoints catalog deploy --repo meta-llama/Llama-3.2-1B-Instruct"])
  165. def deploy_from_catalog(
  166. repo: Annotated[
  167. str,
  168. typer.Option(
  169. help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
  170. ),
  171. ],
  172. name: NameOpt = None,
  173. accelerator: Annotated[
  174. str | None,
  175. typer.Option(
  176. help="The hardware accelerator to be used for inference (e.g. 'cpu', 'gpu', 'neuron').",
  177. ),
  178. ] = None,
  179. namespace: NamespaceOpt = None,
  180. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  181. token: TokenOpt = None,
  182. ) -> None:
  183. """Deploy an Inference Endpoint from the Model Catalog."""
  184. api = get_hf_api(token=token)
  185. try:
  186. endpoint = api.create_inference_endpoint_from_catalog(
  187. repo_id=repo,
  188. name=name,
  189. accelerator=accelerator,
  190. namespace=namespace,
  191. token=token,
  192. )
  193. except HfHubHTTPError as error:
  194. out.error(f"Deployment failed: {error}")
  195. raise typer.Exit(code=error.response.status_code) from error
  196. out.dict(endpoint.raw)
  197. def list_catalog(
  198. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  199. token: TokenOpt = None,
  200. ) -> None:
  201. """List available Catalog models."""
  202. api = get_hf_api(token=token)
  203. try:
  204. models = api.list_inference_catalog(token=token)
  205. except HfHubHTTPError as error:
  206. out.error(f"Catalog fetch failed: {error}")
  207. raise typer.Exit(code=error.response.status_code) from error
  208. out.dict({"models": models})
  209. catalog_app.command(name="list | ls", examples=["hf endpoints catalog ls"])(list_catalog)
  210. ie_cli.command(name="list-catalog", hidden=True)(list_catalog)
  211. ie_cli.add_typer(catalog_app, name="catalog")
  212. @ie_cli.command(examples=["hf endpoints describe my-endpoint"])
  213. def describe(
  214. name: NameArg,
  215. namespace: NamespaceOpt = None,
  216. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  217. token: TokenOpt = None,
  218. ) -> None:
  219. """Get information about an existing endpoint."""
  220. api = get_hf_api(token=token)
  221. try:
  222. endpoint = api.get_inference_endpoint(name=name, namespace=namespace, token=token)
  223. except HfHubHTTPError as error:
  224. out.error(f"Fetch failed: {error}")
  225. raise typer.Exit(code=error.response.status_code) from error
  226. out.dict(endpoint.raw)
  227. @ie_cli.command(examples=["hf endpoints update my-endpoint --min-replica 2"])
  228. def update(
  229. name: NameArg,
  230. namespace: NamespaceOpt = None,
  231. repo: Annotated[
  232. str | None,
  233. typer.Option(
  234. help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
  235. ),
  236. ] = None,
  237. accelerator: Annotated[
  238. str | None,
  239. typer.Option(
  240. help="The hardware accelerator to be used for inference (e.g. 'cpu').",
  241. ),
  242. ] = None,
  243. instance_size: Annotated[
  244. str | None,
  245. typer.Option(
  246. help="The size or type of the instance to be used for hosting the model (e.g. 'x4').",
  247. ),
  248. ] = None,
  249. instance_type: Annotated[
  250. str | None,
  251. typer.Option(
  252. help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').",
  253. ),
  254. ] = None,
  255. framework: Annotated[
  256. str | None,
  257. typer.Option(
  258. help="The machine learning framework used for the model (e.g. 'custom').",
  259. ),
  260. ] = None,
  261. revision: Annotated[
  262. str | None,
  263. typer.Option(
  264. help="The specific model revision to deploy on the Inference Endpoint (e.g. '6c0e6080953db56375760c0471a8c5f2929baf11').",
  265. ),
  266. ] = None,
  267. task: Annotated[
  268. str | None,
  269. typer.Option(
  270. help="The task on which to deploy the model (e.g. 'text-classification').",
  271. ),
  272. ] = None,
  273. min_replica: Annotated[
  274. int | None,
  275. typer.Option(
  276. help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
  277. ),
  278. ] = None,
  279. max_replica: Annotated[
  280. int | None,
  281. typer.Option(
  282. help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
  283. ),
  284. ] = None,
  285. scale_to_zero_timeout: Annotated[
  286. int | None,
  287. typer.Option(
  288. help="The duration in minutes before an inactive endpoint is scaled to zero.",
  289. ),
  290. ] = None,
  291. scaling_metric: Annotated[
  292. InferenceEndpointScalingMetric | None,
  293. typer.Option(
  294. help="The metric reference for scaling.",
  295. ),
  296. ] = None,
  297. scaling_threshold: Annotated[
  298. float | None,
  299. typer.Option(
  300. help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
  301. ),
  302. ] = None,
  303. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  304. token: TokenOpt = None,
  305. ) -> None:
  306. """Update an existing endpoint."""
  307. api = get_hf_api(token=token)
  308. try:
  309. endpoint = api.update_inference_endpoint(
  310. name=name,
  311. namespace=namespace,
  312. repository=repo,
  313. framework=framework,
  314. revision=revision,
  315. task=task,
  316. accelerator=accelerator,
  317. instance_size=instance_size,
  318. instance_type=instance_type,
  319. min_replica=min_replica,
  320. max_replica=max_replica,
  321. scale_to_zero_timeout=scale_to_zero_timeout,
  322. scaling_metric=scaling_metric,
  323. scaling_threshold=scaling_threshold,
  324. token=token,
  325. )
  326. except HfHubHTTPError as error:
  327. out.error(f"Update failed: {error}")
  328. raise typer.Exit(code=error.response.status_code) from error
  329. out.dict(endpoint.raw)
  330. @ie_cli.command(examples=["hf endpoints delete my-endpoint"])
  331. def delete(
  332. name: NameArg,
  333. namespace: NamespaceOpt = None,
  334. yes: Annotated[
  335. bool,
  336. typer.Option("--yes", help="Skip confirmation prompts."),
  337. ] = False,
  338. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  339. token: TokenOpt = None,
  340. ) -> None:
  341. """Delete an Inference Endpoint permanently."""
  342. out.confirm(f"Delete endpoint '{name}'?", yes=yes)
  343. api = get_hf_api(token=token)
  344. try:
  345. api.delete_inference_endpoint(name=name, namespace=namespace, token=token)
  346. except HfHubHTTPError as error:
  347. out.error(f"Delete failed: {error}")
  348. raise typer.Exit(code=error.response.status_code) from error
  349. out.result(f"Deleted '{name}'.", name=name)
  350. @ie_cli.command(examples=["hf endpoints pause my-endpoint"])
  351. def pause(
  352. name: NameArg,
  353. namespace: NamespaceOpt = None,
  354. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  355. token: TokenOpt = None,
  356. ) -> None:
  357. """Pause an Inference Endpoint."""
  358. api = get_hf_api(token=token)
  359. try:
  360. endpoint = api.pause_inference_endpoint(name=name, namespace=namespace, token=token)
  361. except HfHubHTTPError as error:
  362. out.error(f"Pause failed: {error}")
  363. raise typer.Exit(code=error.response.status_code) from error
  364. out.dict(endpoint.raw)
  365. @ie_cli.command(examples=["hf endpoints resume my-endpoint"])
  366. def resume(
  367. name: NameArg,
  368. namespace: NamespaceOpt = None,
  369. fail_if_already_running: Annotated[
  370. bool,
  371. typer.Option(
  372. "--fail-if-already-running",
  373. help="If `True`, the method will raise an error if the Inference Endpoint is already running.",
  374. ),
  375. ] = False,
  376. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  377. token: TokenOpt = None,
  378. ) -> None:
  379. """Resume an Inference Endpoint."""
  380. api = get_hf_api(token=token)
  381. try:
  382. endpoint = api.resume_inference_endpoint(
  383. name=name,
  384. namespace=namespace,
  385. token=token,
  386. running_ok=not fail_if_already_running,
  387. )
  388. except HfHubHTTPError as error:
  389. out.error(f"Resume failed: {error}")
  390. raise typer.Exit(code=error.response.status_code) from error
  391. out.dict(endpoint.raw)
  392. @ie_cli.command(examples=["hf endpoints scale-to-zero my-endpoint"])
  393. def scale_to_zero(
  394. name: NameArg,
  395. namespace: NamespaceOpt = None,
  396. format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
  397. token: TokenOpt = None,
  398. ) -> None:
  399. """Scale an Inference Endpoint to zero."""
  400. api = get_hf_api(token=token)
  401. try:
  402. endpoint = api.scale_to_zero_inference_endpoint(name=name, namespace=namespace, token=token)
  403. except HfHubHTTPError as error:
  404. out.error(f"Scale To Zero failed: {error}")
  405. raise typer.Exit(code=error.response.status_code) from error
  406. out.dict(endpoint.raw)