| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- from ray.llm._internal.serve.core.ingress.ingress import (
- OpenAiIngress as _OpenAiIngress,
- make_fastapi_ingress,
- )
- from ray.util.annotations import PublicAPI
@PublicAPI(stability="alpha")
class OpenAiIngress(_OpenAiIngress):
    """OpenAI-compatible model router (public, alpha-stability API).

    This class is a thin public subclass of the internal ``_OpenAiIngress``
    and defines no members of its own.

    The deployment serves these endpoints:

        - /v1/chat/completions: Chat interface (OpenAI-style)
        - /v1/completions: Text completion
        - /v1/models: List available models
        - /v1/models/{model}: Model information
        - /v1/embeddings: Text embeddings
        - /v1/audio/transcriptions: Audio transcription
        - /v1/score: Text scoring

    Examples:
        Route requests to two ``LLMServer`` deployments behind one ingress.

        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer
            from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # One LLMServer deployment per model config.
            server_options1 = LLMServer.get_deployment_options(llm_config1)
            server_deployment1 = (
                serve.deployment(LLMServer)
                .options(**server_options1)
                .bind(llm_config1)
            )

            server_options2 = LLMServer.get_deployment_options(llm_config2)
            server_deployment2 = (
                serve.deployment(LLMServer)
                .options(**server_options2)
                .bind(llm_config2)
            )

            # Ingress deployment that fronts both servers.
            ingress_options = OpenAiIngress.get_deployment_options(
                llm_configs=[llm_config1, llm_config2]
            )
            ingress_cls = make_fastapi_ingress(OpenAiIngress)
            ingress_deployment = (
                serve.deployment(ingress_cls)
                .options(**ingress_options)
                .bind([server_deployment1, server_deployment2])
            )

            # Start serving; blocks until interrupted.
            serve.run(ingress_deployment, blocking=True)
    """
# Names exported by ``from <this module> import *``.
__all__ = [
    "OpenAiIngress",
    "make_fastapi_ingress",
]
|