| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- from ray.llm._internal.serve.core.server.llm_server import (
- LLMServer as InternalLLMServer,
- )
- from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import (
- DPServer as _DPServer,
- )
- from ray.llm._internal.serve.serving_patterns.prefill_decode.pd_server import (
- PDProxyServer as _PDProxyServer,
- )
- from ray.util.annotations import PublicAPI
- #############
- # Deployments
- #############
@PublicAPI(stability="alpha")
class LLMServer(InternalLLMServer):
    """The implementation of the vLLM engine deployment.

    To build a Deployment object you should use `build_llm_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `serve.deployment` function.

    Examples:
        .. testcode::
            :skipif: True

            import ray
            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            serve_options = LLMServer.get_deployment_options(llm_config)
            llm_app = serve.deployment(LLMServer).options(
                **serve_options).bind(llm_config)

            model_handle = serve.run(llm_app)

            # Query the model via `chat` api
            from ray.serve.llm.openai_api_models import ChatCompletionRequest
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!"
                    }
                ]
            )
            response = ray.get(model_handle.chat(request))
            print(response)
    """
@PublicAPI(stability="alpha")
class PDProxyServer(_PDProxyServer):
    """Proxy server used in a prefill-decode disaggregated system.

    For chat and completions requests, the proxy first forwards the request
    to the prefill server with max_tokens=1, and then passes the metadata
    returned by that call on to the decode server.

    Args:
        prefill_server: The prefill server deployment handle.
        decode_server: The decode server deployment handle.
    """
@PublicAPI(stability="alpha")
class DPServer(_DPServer):
    """Data Parallel LLM Server.

    Serves the data parallel attention (DP Attention) deployment paradigm,
    in which the attention layers are replicated while the MoE layers are
    sharded. Models like DeepSeek-V3 typically use DP Attention.

    Use the `build_dp_deployment` function to build a Deployment object.
    For more control over the deployment class, the lower level
    `serve.deployment` function is exposed as well.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_dp_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                engine_kwargs=dict(
                    data_parallel_size=2,
                    tensor_parallel_size=1,
                ),
                experimental_configs=dict(
                    dp_size_per_node=2,
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            dp_app = build_dp_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(dp_app)
    """
# Names exported as the public API of this module.
__all__ = [
    "LLMServer",
    "PDProxyServer",
    "DPServer",
]
|