__init__.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. from typing import TYPE_CHECKING, Optional, Type
  2. from ray._common.deprecation import Deprecated
  3. from ray.llm._internal.serve.core.configs.llm_config import (
  4. CloudMirrorConfig as _CloudMirrorConfig,
  5. LLMConfig as _LLMConfig,
  6. LoraConfig as _LoraConfig,
  7. ModelLoadingConfig as _ModelLoadingConfig,
  8. )
  9. from ray.llm._internal.serve.core.ingress.builder import (
  10. LLMServingArgs as _LLMServingArgs,
  11. )
  12. from ray.llm._internal.serve.core.ingress.ingress import (
  13. OpenAiIngress as _OpenAiIngress,
  14. )
  15. # For backward compatibility
  16. from ray.llm._internal.serve.core.server.llm_server import (
  17. LLMServer as _LLMServer,
  18. )
  19. from ray.util.annotations import PublicAPI
  20. if TYPE_CHECKING:
  21. from ray.serve.deployment import Application
  22. ##########
  23. # Models
  24. ##########
  25. @PublicAPI(stability="alpha")
  26. class LLMConfig(_LLMConfig):
  27. """The configuration for starting an LLM deployment."""
  28. pass
  29. @PublicAPI(stability="alpha")
  30. class LLMServingArgs(_LLMServingArgs):
  31. """The configuration for starting an LLM deployment application."""
  32. pass
  33. @PublicAPI(stability="alpha")
  34. class ModelLoadingConfig(_ModelLoadingConfig):
  35. """The configuration for loading an LLM model."""
  36. pass
  37. @PublicAPI(stability="alpha")
  38. class CloudMirrorConfig(_CloudMirrorConfig):
  39. """The configuration for mirroring an LLM model from cloud storage."""
  40. pass
  41. @PublicAPI(stability="alpha")
  42. class LoraConfig(_LoraConfig):
  43. """The configuration for loading an LLM model with LoRA."""
  44. pass
  45. #############
  46. # Deployments
  47. #############
  48. @Deprecated(
  49. old="ray.serve.llm.LLMServer", new="ray.serve.llm.deployment.LLMServer", error=False
  50. )
  51. class LLMServer(_LLMServer):
  52. pass
  53. @Deprecated(
  54. old="ray.serve.llm.LLMRouter",
  55. new="ray.serve.llm.ingress.OpenAIIngress",
  56. error=False,
  57. )
  58. class LLMRouter(_OpenAiIngress):
  59. pass
  60. ##########
  61. # Builders
  62. ##########
  63. @PublicAPI(stability="alpha")
  64. def build_llm_deployment(
  65. llm_config: "LLMConfig",
  66. *,
  67. name_prefix: Optional[str] = None,
  68. bind_kwargs: Optional[dict] = None,
  69. override_serve_options: Optional[dict] = None,
  70. deployment_cls: Optional[Type[LLMServer]] = None,
  71. ) -> "Application":
  72. """Helper to build a single vllm deployment from the given llm config.
  73. Examples:
  74. .. testcode::
  75. :skipif: True
  76. from ray import serve
  77. from ray.serve.llm import LLMConfig, build_llm_deployment
  78. # Configure the model
  79. llm_config = LLMConfig(
  80. model_loading_config=dict(
  81. model_id="llama-3.1-8b",
  82. model_source="meta-llama/Llama-3.1-8b-instruct",
  83. ),
  84. deployment_config=dict(
  85. autoscaling_config=dict(
  86. min_replicas=1,
  87. max_replicas=2,
  88. )
  89. ),
  90. accelerator_type="A10G",
  91. )
  92. # Build the deployment
  93. llm_app = build_llm_deployment(llm_config)
  94. # Deploy the application
  95. model_handle = serve.run(llm_app)
  96. # Querying the model handle
  97. import asyncio
  98. model_handle = model_handle.options(stream=True)
  99. async def query_model(model_handle):
  100. from ray.serve.llm.openai_api_models import ChatCompletionRequest
  101. request = ChatCompletionRequest(
  102. model="qwen-0.5b",
  103. messages=[
  104. {
  105. "role": "user",
  106. "content": "Hello, world!"
  107. }
  108. ]
  109. )
  110. resp = model_handle.chat.remote(request)
  111. async for message in resp:
  112. print("message: ", message)
  113. asyncio.run(query_model(model_handle))
  114. Args:
  115. llm_config: The llm config to build vllm deployment.
  116. name_prefix: Optional prefix to be used for the deployment name.
  117. bind_kwargs: Optional kwargs to pass to the deployment.
  118. override_serve_options: Optional serve options to override the original serve options based on the llm_config.
  119. deployment_cls: Optional deployment class to use.
  120. Returns:
  121. The configured Ray Serve Application for vllm deployment.
  122. """
  123. from ray.llm._internal.serve.core.server.builder import (
  124. build_llm_deployment,
  125. )
  126. return build_llm_deployment(
  127. llm_config=llm_config,
  128. name_prefix=name_prefix,
  129. bind_kwargs=bind_kwargs,
  130. override_serve_options=override_serve_options,
  131. deployment_cls=deployment_cls,
  132. )
  133. @PublicAPI(stability="alpha")
  134. def build_openai_app(llm_serving_args: dict) -> "Application":
  135. """Helper to build an OpenAI compatible app with the llm deployment setup from
  136. the given llm serving args. This is the main entry point for users to create a
  137. Serve application serving LLMs.
  138. Examples:
  139. .. code-block:: python
  140. :caption: Example usage in code.
  141. from ray import serve
  142. from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app
  143. llm_config1 = LLMConfig(
  144. model_loading_config=dict(
  145. model_id="qwen-0.5b",
  146. model_source="Qwen/Qwen2.5-0.5B-Instruct",
  147. ),
  148. deployment_config=dict(
  149. autoscaling_config=dict(
  150. min_replicas=1, max_replicas=2,
  151. )
  152. ),
  153. accelerator_type="A10G",
  154. )
  155. llm_config2 = LLMConfig(
  156. model_loading_config=dict(
  157. model_id="qwen-1.5b",
  158. model_source="Qwen/Qwen2.5-1.5B-Instruct",
  159. ),
  160. deployment_config=dict(
  161. autoscaling_config=dict(
  162. min_replicas=1, max_replicas=2,
  163. )
  164. ),
  165. accelerator_type="A10G",
  166. )
  167. # Deploy the application
  168. llm_app = build_openai_app(
  169. LLMServingArgs(
  170. llm_configs=[
  171. llm_config1,
  172. llm_config2,
  173. ]
  174. )
  175. )
  176. serve.run(llm_app)
  177. # Querying the model via openai client
  178. from openai import OpenAI
  179. # Initialize client
  180. client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
  181. # Basic completion
  182. response = client.chat.completions.create(
  183. model="qwen-0.5b",
  184. messages=[{"role": "user", "content": "Hello!"}]
  185. )
  186. .. code-block:: yaml
  187. :caption: Example usage in YAML.
  188. # config.yaml
  189. applications:
  190. - args:
  191. llm_configs:
  192. - model_loading_config:
  193. model_id: qwen-0.5b
  194. model_source: Qwen/Qwen2.5-0.5B-Instruct
  195. accelerator_type: A10G
  196. deployment_config:
  197. autoscaling_config:
  198. min_replicas: 1
  199. max_replicas: 2
  200. - model_loading_config:
  201. model_id: qwen-1.5b
  202. model_source: Qwen/Qwen2.5-1.5B-Instruct
  203. accelerator_type: A10G
  204. deployment_config:
  205. autoscaling_config:
  206. min_replicas: 1
  207. max_replicas: 2
  208. import_path: ray.serve.llm:build_openai_app
  209. name: llm_app
  210. route_prefix: "/"
  211. Args:
  212. llm_serving_args: A dict that conforms to the LLMServingArgs pydantic model.
  213. Returns:
  214. The configured Ray Serve Application router.
  215. """
  216. from ray.llm._internal.serve.core.ingress.builder import (
  217. build_openai_app,
  218. )
  219. return build_openai_app(builder_config=llm_serving_args)
  220. @PublicAPI(stability="alpha")
  221. def build_pd_openai_app(pd_serving_args: dict) -> "Application":
  222. """Build a deployable application utilizing P/D disaggregation.
  223. Examples:
  224. .. code-block:: python
  225. :caption: Example usage in code.
  226. from ray import serve
  227. from ray.serve.llm import LLMConfig, build_pd_openai_app
  228. config = LLMConfig(
  229. model_loading_config=dict(
  230. model_id="qwen-0.5b",
  231. model_source="Qwen/Qwen2.5-0.5B-Instruct",
  232. ),
  233. deployment_config=dict(
  234. autoscaling_config=dict(
  235. min_replicas=1, max_replicas=2,
  236. )
  237. ),
  238. accelerator_type="A10G",
  239. )
  240. # Deploy the application
  241. llm_app = build_pd_openai_app(
  242. dict(
  243. prefill_config=config,
  244. decode_config=config,
  245. )
  246. )
  247. serve.run(llm_app)
  248. # Querying the model via openai client
  249. from openai import OpenAI
  250. # Initialize client
  251. client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
  252. # Basic completion
  253. response = client.chat.completions.create(
  254. model="qwen-0.5b",
  255. messages=[{"role": "user", "content": "Hello!"}]
  256. )
  257. .. code-block:: yaml
  258. :caption: Example usage in YAML.
  259. # config.yaml
  260. applications:
  261. - args:
  262. prefill_config:
  263. model_loading_config:
  264. model_id: qwen-0.5b
  265. model_source: Qwen/Qwen2.5-0.5B-Instruct
  266. accelerator_type: A10G
  267. deployment_config:
  268. autoscaling_config:
  269. min_replicas: 1
  270. max_replicas: 2
  271. decode_config:
  272. model_loading_config:
  273. model_id: qwen-1.5b
  274. model_source: Qwen/Qwen2.5-1.5B-Instruct
  275. accelerator_type: A10G
  276. deployment_config:
  277. autoscaling_config:
  278. min_replicas: 1
  279. max_replicas: 2
  280. import_path: ray.serve.llm:build_pd_openai_app
  281. name: llm_app
  282. route_prefix: "/"
  283. Args:
  284. pd_serving_args: The dictionary containing prefill and decode configs. See PDServingArgs for more details.
  285. Returns:
  286. The configured Ray Serve Application router.
  287. """
  288. from ray.llm._internal.serve.serving_patterns.prefill_decode.builder import (
  289. build_pd_openai_app,
  290. )
  291. return build_pd_openai_app(pd_serving_args=pd_serving_args)
  292. @PublicAPI(stability="alpha")
  293. def build_dp_deployment(
  294. llm_config: "LLMConfig",
  295. *,
  296. name_prefix: Optional[str] = None,
  297. override_serve_options: Optional[dict] = None,
  298. ) -> "Application":
  299. """Build a data parallel attention LLM deployment.
  300. Args:
  301. llm_config: The LLM configuration.
  302. name_prefix: The prefix to add to the deployment name.
  303. override_serve_options: The optional serve options to override the
  304. default options.
  305. Returns:
  306. The Ray Serve Application for the data parallel attention LLM deployment.
  307. """
  308. from ray.llm._internal.serve.serving_patterns.data_parallel.builder import (
  309. build_dp_deployment,
  310. )
  311. return build_dp_deployment(
  312. llm_config=llm_config,
  313. name_prefix=name_prefix,
  314. override_serve_options=override_serve_options,
  315. )
  316. @PublicAPI(stability="alpha")
  317. def build_dp_openai_app(dp_serving_args: dict) -> "Application":
  318. """Build an OpenAI compatible app with the DP attention deployment
  319. setup from the given builder configuration.
  320. Args:
  321. dp_serving_args: The configuration for the builder. It has to conform
  322. to the DPOpenAiServingArgs pydantic model.
  323. Returns:
  324. The configured Ray Serve Application.
  325. """
  326. from ray.llm._internal.serve.serving_patterns.data_parallel.builder import (
  327. build_dp_openai_app,
  328. )
  329. return build_dp_openai_app(builder_config=dp_serving_args)
  330. __all__ = [
  331. "LLMConfig",
  332. "LLMServingArgs",
  333. "ModelLoadingConfig",
  334. "CloudMirrorConfig",
  335. "LoraConfig",
  336. "build_llm_deployment",
  337. "build_openai_app",
  338. "build_pd_openai_app",
  339. "build_dp_deployment",
  340. "build_dp_openai_app",
  341. "LLMServer",
  342. "LLMRouter",
  343. ]