ingress.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. from ray.llm._internal.serve.core.ingress.ingress import (
  2. OpenAiIngress as _OpenAiIngress,
  3. make_fastapi_ingress,
  4. )
  5. from ray.util.annotations import PublicAPI
  6. @PublicAPI(stability="alpha")
  7. class OpenAiIngress(_OpenAiIngress):
  8. """The implementation of the OpenAI compatible model router.
  9. This deployment creates the following endpoints:
  10. - /v1/chat/completions: Chat interface (OpenAI-style)
  11. - /v1/completions: Text completion
  12. - /v1/models: List available models
  13. - /v1/models/{model}: Model information
  14. - /v1/embeddings: Text embeddings
  15. - /v1/audio/transcriptions: Audio transcription
  16. - /v1/score: Text scoring
  17. Examples:
  18. .. testcode::
  19. :skipif: True
  20. from ray import serve
  21. from ray.serve.llm import LLMConfig
  22. from ray.serve.llm.deployment import LLMServer
  23. from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress
  24. llm_config1 = LLMConfig(
  25. model_loading_config=dict(
  26. model_id="qwen-0.5b",
  27. model_source="Qwen/Qwen2.5-0.5B-Instruct",
  28. ),
  29. deployment_config=dict(
  30. autoscaling_config=dict(
  31. min_replicas=1, max_replicas=2,
  32. )
  33. ),
  34. accelerator_type="A10G",
  35. )
  36. llm_config2 = LLMConfig(
  37. model_loading_config=dict(
  38. model_id="qwen-1.5b",
  39. model_source="Qwen/Qwen2.5-1.5B-Instruct",
  40. ),
  41. deployment_config=dict(
  42. autoscaling_config=dict(
  43. min_replicas=1, max_replicas=2,
  44. )
  45. ),
  46. accelerator_type="A10G",
  47. )
  48. # deployment #1
  49. server_options1 = LLMServer.get_deployment_options(llm_config1)
  50. server_deployment1 = serve.deployment(LLMServer).options(
  51. **server_options1).bind(llm_config1)
  52. # deployment #2
  53. server_options2 = LLMServer.get_deployment_options(llm_config2)
  54. server_deployment2 = serve.deployment(LLMServer).options(
  55. **server_options2).bind(llm_config2)
  56. # ingress
  57. ingress_options = OpenAiIngress.get_deployment_options(
  58. llm_configs=[llm_config1, llm_config2])
  59. ingress_cls = make_fastapi_ingress(OpenAiIngress)
  60. ingress_deployment = serve.deployment(ingress_cls).options(
  61. **ingress_options).bind([server_deployment1, server_deployment2])
  62. # run
  63. serve.run(ingress_deployment, blocking=True)
  64. """
  65. pass
  66. __all__ = ["OpenAiIngress", "make_fastapi_ingress"]