| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433 |
- from typing import TYPE_CHECKING, Optional, Type
- from ray._common.deprecation import Deprecated
- from ray.llm._internal.serve.core.configs.llm_config import (
- CloudMirrorConfig as _CloudMirrorConfig,
- LLMConfig as _LLMConfig,
- LoraConfig as _LoraConfig,
- ModelLoadingConfig as _ModelLoadingConfig,
- )
- from ray.llm._internal.serve.core.ingress.builder import (
- LLMServingArgs as _LLMServingArgs,
- )
- from ray.llm._internal.serve.core.ingress.ingress import (
- OpenAiIngress as _OpenAiIngress,
- )
- # For backward compatibility
- from ray.llm._internal.serve.core.server.llm_server import (
- LLMServer as _LLMServer,
- )
- from ray.util.annotations import PublicAPI
- if TYPE_CHECKING:
- from ray.serve.deployment import Application
- ##########
- # Models
- ##########
@PublicAPI(stability="alpha")
class LLMConfig(_LLMConfig):
    """The configuration for starting an LLM deployment."""

    # Intentionally empty: this subclass adds no fields or behavior of its
    # own. It only exposes the internal ``_LLMConfig`` under a public name
    # carrying the ``PublicAPI`` annotation.
@PublicAPI(stability="alpha")
class LLMServingArgs(_LLMServingArgs):
    """The configuration for starting an LLM deployment application."""

    # Intentionally empty: this subclass adds no fields or behavior of its
    # own. It only exposes the internal ``_LLMServingArgs`` under a public
    # name carrying the ``PublicAPI`` annotation.
@PublicAPI(stability="alpha")
class ModelLoadingConfig(_ModelLoadingConfig):
    """The configuration for loading an LLM model."""

    # Intentionally empty: this subclass adds no fields or behavior of its
    # own. It only exposes the internal ``_ModelLoadingConfig`` under a
    # public name carrying the ``PublicAPI`` annotation.
@PublicAPI(stability="alpha")
class CloudMirrorConfig(_CloudMirrorConfig):
    """The configuration for mirroring an LLM model from cloud storage."""

    # Intentionally empty: this subclass adds no fields or behavior of its
    # own. It only exposes the internal ``_CloudMirrorConfig`` under a
    # public name carrying the ``PublicAPI`` annotation.
@PublicAPI(stability="alpha")
class LoraConfig(_LoraConfig):
    """The configuration for loading an LLM model with LoRA."""

    # Intentionally empty: this subclass adds no fields or behavior of its
    # own. It only exposes the internal ``_LoraConfig`` under a public name
    # carrying the ``PublicAPI`` annotation.
- #############
- # Deployments
- #############
@Deprecated(
    old="ray.serve.llm.LLMServer",
    new="ray.serve.llm.deployment.LLMServer",
    error=False,
)
class LLMServer(_LLMServer):
    """Deprecated alias of the internal ``LLMServer``.

    The subclass adds nothing to its base; it exists so that the old
    ``ray.serve.llm.LLMServer`` name keeps resolving. The deprecation notice
    points callers at ``ray.serve.llm.deployment.LLMServer``.
    """
@Deprecated(
    old="ray.serve.llm.LLMRouter",
    # NOTE(review): the base class used below is imported as
    # ``_OpenAiIngress`` (lowercase "i" in "Ai"); confirm that the casing of
    # this replacement path matches the public class name.
    new="ray.serve.llm.ingress.OpenAIIngress",
    error=False,
)
class LLMRouter(_OpenAiIngress):
    """Deprecated alias that subclasses the internal OpenAI ingress class.

    The subclass adds nothing to its base; it exists so that the old
    ``ray.serve.llm.LLMRouter`` name keeps resolving. Callers are directed to
    the replacement named in the deprecation notice.
    """
- ##########
- # Builders
- ##########
@PublicAPI(stability="alpha")
def build_llm_deployment(
    llm_config: "LLMConfig",
    *,
    name_prefix: Optional[str] = None,
    bind_kwargs: Optional[dict] = None,
    override_serve_options: Optional[dict] = None,
    deployment_cls: Optional[Type[LLMServer]] = None,
) -> "Application":
    """Helper to build a single vllm deployment from the given llm config.

    This is a thin public wrapper: every argument is forwarded unchanged to
    the internal ``build_llm_deployment`` builder.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_llm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            llm_app = build_llm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(llm_app)

            # Querying the model handle
            import asyncio
            model_handle = model_handle.options(stream=True)

            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                # The requested model matches the ``model_id`` configured above.
                request = ChatCompletionRequest(
                    model="llama-3.1-8b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))

    Args:
        llm_config: The llm config to build vllm deployment.
        name_prefix: Optional prefix to be used for the deployment name.
        bind_kwargs: Optional kwargs to pass to the deployment.
        override_serve_options: Optional serve options to override the
            original serve options based on the llm_config.
        deployment_cls: Optional deployment class to use.

    Returns:
        The configured Ray Serve Application for vllm deployment.
    """
    # Imported at call time, and aliased so the internal builder does not
    # shadow this public wrapper's own name inside the function body.
    from ray.llm._internal.serve.core.server.builder import (
        build_llm_deployment as _build_llm_deployment,
    )

    return _build_llm_deployment(
        llm_config=llm_config,
        name_prefix=name_prefix,
        bind_kwargs=bind_kwargs,
        override_serve_options=override_serve_options,
        deployment_cls=deployment_cls,
    )
@PublicAPI(stability="alpha")
def build_openai_app(llm_serving_args: dict) -> "Application":
    """Build an OpenAI compatible app from the given llm serving args.

    The app is created with the llm deployment setup described by
    ``llm_serving_args``. This is the main entry point for users to create a
    Serve application serving LLMs. The argument is handed to the internal
    ``build_openai_app`` builder as its ``builder_config``.

    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_openai_app(
                LLMServingArgs(
                    llm_configs=[
                        llm_config1,
                        llm_config2,
                    ]
                )
            )
            serve.run(llm_app)

            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                llm_configs:
                    - model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                    - model_loading_config:
                        model_id: qwen-1.5b
                        model_source: Qwen/Qwen2.5-1.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
              import_path: ray.serve.llm:build_openai_app
              name: llm_app
              route_prefix: "/"

    Args:
        llm_serving_args: A dict that conforms to the LLMServingArgs pydantic
            model. (The Python example above passes an ``LLMServingArgs``
            instance directly.)

    Returns:
        The configured Ray Serve Application router.
    """
    # Imported at call time, and aliased so the internal builder does not
    # shadow this public wrapper's own name inside the function body.
    from ray.llm._internal.serve.core.ingress.builder import (
        build_openai_app as _build_openai_app,
    )

    openai_app = _build_openai_app(builder_config=llm_serving_args)
    return openai_app
@PublicAPI(stability="alpha")
def build_pd_openai_app(pd_serving_args: dict) -> "Application":
    """Build a deployable application utilizing P/D disaggregation.

    The serving args carry a prefill config and a decode config. They are
    forwarded unchanged to the internal prefill/decode builder.

    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, build_pd_openai_app

            config = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_pd_openai_app(
                dict(
                    prefill_config=config,
                    decode_config=config,
                )
            )

            serve.run(llm_app)

            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                prefill_config:
                  model_loading_config:
                    model_id: qwen-0.5b
                    model_source: Qwen/Qwen2.5-0.5B-Instruct
                  accelerator_type: A10G
                  deployment_config:
                    autoscaling_config:
                      min_replicas: 1
                      max_replicas: 2
                decode_config:
                  model_loading_config:
                    model_id: qwen-1.5b
                    model_source: Qwen/Qwen2.5-1.5B-Instruct
                  accelerator_type: A10G
                  deployment_config:
                    autoscaling_config:
                      min_replicas: 1
                      max_replicas: 2
              import_path: ray.serve.llm:build_pd_openai_app
              name: llm_app
              route_prefix: "/"

    Args:
        pd_serving_args: The dictionary containing prefill and decode configs.
            See PDServingArgs for more details.

    Returns:
        The configured Ray Serve Application router.
    """
    # Imported at call time, and aliased so the internal builder does not
    # shadow this public wrapper's own name inside the function body.
    from ray.llm._internal.serve.serving_patterns.prefill_decode.builder import (
        build_pd_openai_app as _build_pd_openai_app,
    )

    pd_app = _build_pd_openai_app(pd_serving_args=pd_serving_args)
    return pd_app
@PublicAPI(stability="alpha")
def build_dp_deployment(
    llm_config: "LLMConfig",
    *,
    name_prefix: Optional[str] = None,
    override_serve_options: Optional[dict] = None,
) -> "Application":
    """Build a data parallel attention LLM deployment.

    Thin public wrapper: all three arguments are forwarded by keyword to the
    internal data-parallel ``build_dp_deployment`` builder.

    Args:
        llm_config: The LLM configuration.
        name_prefix: The prefix to add to the deployment name.
        override_serve_options: The optional serve options to override the
            default options.

    Returns:
        The Ray Serve Application for the data parallel attention LLM
        deployment.
    """
    # Imported at call time, and aliased so the internal builder does not
    # shadow this public wrapper's own name inside the function body.
    from ray.llm._internal.serve.serving_patterns.data_parallel.builder import (
        build_dp_deployment as _build_dp_deployment,
    )

    forwarded = dict(
        llm_config=llm_config,
        name_prefix=name_prefix,
        override_serve_options=override_serve_options,
    )
    return _build_dp_deployment(**forwarded)
@PublicAPI(stability="alpha")
def build_dp_openai_app(dp_serving_args: dict) -> "Application":
    """Build an OpenAI compatible app with the DP attention deployment setup.

    The setup is taken from the given builder configuration, which is passed
    to the internal data-parallel ``build_dp_openai_app`` builder as its
    ``builder_config``.

    Args:
        dp_serving_args: The configuration for the builder. It has to conform
            to the DPOpenAiServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application.
    """
    # Imported at call time, and aliased so the internal builder does not
    # shadow this public wrapper's own name inside the function body.
    from ray.llm._internal.serve.serving_patterns.data_parallel.builder import (
        build_dp_openai_app as _build_dp_openai_app,
    )

    dp_app = _build_dp_openai_app(builder_config=dp_serving_args)
    return dp_app
# Names exported by ``from <this module> import *``. The order is unchanged.
__all__ = [
    # Configuration models
    "LLMConfig",
    "LLMServingArgs",
    "ModelLoadingConfig",
    "CloudMirrorConfig",
    "LoraConfig",
    # Application / deployment builders
    "build_llm_deployment",
    "build_openai_app",
    "build_pd_openai_app",
    "build_dp_deployment",
    "build_dp_openai_app",
    # Deprecated aliases, still exported for backward compatibility
    "LLMServer",
    "LLMRouter",
]
|