yichael
/
image-match


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
							from ray.llm._internal.serve.core.ingress.ingress import (
    OpenAiIngress as _OpenAiIngress,
    make_fastapi_ingress,
)
from ray.util.annotations import PublicAPI


@PublicAPI(stability="alpha")
class OpenAiIngress(_OpenAiIngress):
    """Public entry point for the OpenAI-compatible model router.

    This class adds no methods or attributes of its own; everything is
    inherited from the internal ``_OpenAiIngress`` implementation, which is
    re-exposed here under the public name.

    Endpoints created by this deployment:
      - /v1/chat/completions: Chat interface (OpenAI-style)
      - /v1/completions: Text completion
      - /v1/models: List available models
      - /v1/models/{model}: Model information
      - /v1/embeddings: Text embeddings
      - /v1/audio/transcriptions: Audio transcription
      - /v1/score: Text scoring

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer
            from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress

            # Two model configurations served behind a single ingress.
            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(min_replicas=1, max_replicas=2),
                ),
                accelerator_type="A10G",
            )
            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(min_replicas=1, max_replicas=2),
                ),
                accelerator_type="A10G",
            )

            # One LLMServer deployment per model configuration.
            server_options1 = LLMServer.get_deployment_options(llm_config1)
            server_deployment1 = (
                serve.deployment(LLMServer)
                .options(**server_options1)
                .bind(llm_config1)
            )

            server_options2 = LLMServer.get_deployment_options(llm_config2)
            server_deployment2 = (
                serve.deployment(LLMServer)
                .options(**server_options2)
                .bind(llm_config2)
            )

            # Ingress deployment that fronts both servers.
            ingress_options = OpenAiIngress.get_deployment_options(
                llm_configs=[llm_config1, llm_config2],
            )
            ingress_cls = make_fastapi_ingress(OpenAiIngress)
            ingress_deployment = (
                serve.deployment(ingress_cls)
                .options(**ingress_options)
                .bind([server_deployment1, server_deployment2])
            )

            # Start serving; blocks until interrupted.
            serve.run(ingress_deployment, blocking=True)
    """


# Names exported by ``from <this module> import *``: the public ingress class
# and the FastAPI ingress factory imported at the top of the file.
__all__ = [
    "OpenAiIngress",
    "make_fastapi_ingress",
]