deployment.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. from ray.llm._internal.serve.core.server.llm_server import (
  2. LLMServer as InternalLLMServer,
  3. )
  4. from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import (
  5. DPServer as _DPServer,
  6. )
  7. from ray.llm._internal.serve.serving_patterns.prefill_decode.pd_server import (
  8. PDProxyServer as _PDProxyServer,
  9. )
  10. from ray.util.annotations import PublicAPI
  11. #############
  12. # Deployments
  13. #############
  14. @PublicAPI(stability="alpha")
  15. class LLMServer(InternalLLMServer):
  16. """The implementation of the vLLM engine deployment.
  17. To build a Deployment object you should use `build_llm_deployment` function.
  18. We also expose a lower level API for more control over the deployment class
  19. through `serve.deployment` function.
  20. Examples:
  21. .. testcode::
  22. :skipif: True
  23. from ray import serve
  24. from ray.serve.llm import LLMConfig
  25. from ray.serve.llm.deployment import LLMServer
  26. # Configure the model
  27. llm_config = LLMConfig(
  28. model_loading_config=dict(
  29. served_model_name="llama-3.1-8b",
  30. model_source="meta-llama/Llama-3.1-8b-instruct",
  31. ),
  32. deployment_config=dict(
  33. autoscaling_config=dict(
  34. min_replicas=1,
  35. max_replicas=8,
  36. )
  37. ),
  38. )
  39. # Build the deployment directly
  40. serve_options = LLMServer.get_deployment_options(llm_config)
  41. llm_app = serve.deployment(LLMServer).options(
  42. **serve_options).bind(llm_config)
  43. model_handle = serve.run(llm_app)
  44. # Query the model via `chat` api
  45. from ray.serve.llm.openai_api_models import ChatCompletionRequest
  46. request = ChatCompletionRequest(
  47. model="llama-3.1-8b",
  48. messages=[
  49. {
  50. "role": "user",
  51. "content": "Hello, world!"
  52. }
  53. ]
  54. )
  55. response = ray.get(model_handle.chat(request))
  56. print(response)
  57. """
  58. pass
  59. @PublicAPI(stability="alpha")
  60. class PDProxyServer(_PDProxyServer):
  61. """A proxy server for prefill-decode disaggregation.
  62. This server acts as a proxy in a prefill-decode disaggregated system.
  63. For chat and completions, proxy sends the request to the prefill server
  64. with max_tokens=1 and then sends the returned metadata to the decode server.
  65. Args:
  66. prefill_server: The prefill server deployment handle.
  67. decode_server: The decode server deployment handle.
  68. """
  69. pass
  70. @PublicAPI(stability="alpha")
  71. class DPServer(_DPServer):
  72. """Data Parallel LLM Server.
  73. This class is used to serve data parallel attention (DP Attention)
  74. deployment paradigm, where the attention layers are replicated and
  75. the MoE layers are sharded. DP Attention is typically used for models
  76. like DeepSeek-V3.
  77. To build a Deployment object you should use `build_dp_deployment` function.
  78. We also expose a lower level API for more control over the deployment class
  79. through `serve.deployment` function.
  80. Examples:
  81. .. testcode::
  82. :skipif: True
  83. from ray import serve
  84. from ray.serve.llm import LLMConfig, build_dp_deployment
  85. # Configure the model
  86. llm_config = LLMConfig(
  87. model_loading_config=dict(
  88. model_id="Qwen/Qwen2.5-0.5B-Instruct",
  89. ),
  90. engine_kwargs=dict(
  91. data_parallel_size=2,
  92. tensor_parallel_size=1,
  93. ),
  94. experimental_configs=dict(
  95. dp_size_per_node=2,
  96. ),
  97. accelerator_type="A10G",
  98. )
  99. # Build the deployment
  100. dp_app = build_dp_deployment(llm_config)
  101. # Deploy the application
  102. model_handle = serve.run(dp_app)
  103. """
  104. pass
  105. __all__ = ["LLMServer", "PDProxyServer", "DPServer"]