# yichael/image-match
							from ray.llm._internal.serve.core.server.llm_server import (
    LLMServer as InternalLLMServer,
)
from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import (
    DPServer as _DPServer,
)
from ray.llm._internal.serve.serving_patterns.prefill_decode.pd_server import (
    PDProxyServer as _PDProxyServer,
)
from ray.util.annotations import PublicAPI

#############
# Deployments
#############


@PublicAPI(stability="alpha")
class LLMServer(InternalLLMServer):
    """The implementation of the vLLM engine deployment.

    To build a Deployment object you should use `build_llm_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `serve.deployment` function.

    This public class adds no behavior of its own; everything is inherited
    from the internal ``LLMServer`` implementation it subclasses.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            serve_options = LLMServer.get_deployment_options(llm_config)
            llm_app = serve.deployment(LLMServer).options(
                **serve_options).bind(llm_config)

            model_handle = serve.run(llm_app)

            # Query the model via `chat` api
            from ray.serve.llm.openai_api_models import ChatCompletionRequest
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!"
                    }
                ]
            )

            # Call through a streaming handle with `.remote()` and iterate
            # over the results.
            responses = model_handle.options(stream=True).chat.remote(request)
            for response in responses:
                print(response)
    """


@PublicAPI(stability="alpha")
class PDProxyServer(_PDProxyServer):
    """Proxy deployment for prefill-decode disaggregated serving.

    In a prefill-decode disaggregated system this server sits in front of
    the two stages. A chat or completions request is first forwarded to the
    prefill server with max_tokens=1; the metadata returned by that call is
    then passed on to the decode server.

    This public class is a thin subclass: it defines nothing itself and
    inherits all behavior from the internal proxy implementation.

    Args:
        prefill_server: Deployment handle of the prefill server.
        decode_server: Deployment handle of the decode server.
    """


@PublicAPI(stability="alpha")
class DPServer(_DPServer):
    """LLM server for data parallel deployments.

    Serves the data parallel attention (DP Attention) deployment paradigm:
    attention layers are replicated while the MoE layers are sharded. This
    paradigm is typically used for models like DeepSeek-V3.

    The recommended way to obtain a Deployment object is the
    `build_dp_deployment` function. For finer control over the deployment
    class, the lower level `serve.deployment` function can be applied to
    this class directly.

    This public class is a thin subclass: it defines nothing itself and
    inherits all behavior from the internal implementation.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_dp_deployment

            # Describe the model and how it is parallelized
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                engine_kwargs=dict(
                    data_parallel_size=2,
                    tensor_parallel_size=1,
                ),
                experimental_configs=dict(
                    dp_size_per_node=2,
                ),
                accelerator_type="A10G",
            )

            # Turn the config into a Serve application
            dp_app = build_dp_deployment(llm_config)

            # Launch it and keep the returned handle
            model_handle = serve.run(dp_app)
    """


# Names exported as the public API of this module.
__all__ = [
    "LLMServer",
    "PDProxyServer",
    "DPServer",
]