request_router.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. from ray.llm._internal.serve.routing_policies.prefix_aware.prefix_aware_router import (
  2. PrefixCacheAffinityRouter as _PrefixCacheAffinityRouter,
  3. )
  4. from ray.util.annotations import PublicAPI
  5. @PublicAPI(stability="alpha")
  6. class PrefixCacheAffinityRouter(_PrefixCacheAffinityRouter):
  7. """A request router that is aware of the KV cache.
  8. This router optimizes request routing by considering KV cache locality,
  9. directing requests with similar prefixes to the same replica to improve
  10. cache hit rates.
  11. The internal policy is this (it may change in the future):
  12. 1. Mixes between three strategies to balance prefix cache hit rate and load
  13. balancing:
  14. - When load is balanced (queue length difference < threshold), it
  15. selects replicas with the highest prefix match rate for the input text
  16. - When load is balanced but match rate is below 10%, it falls back to
  17. the smallest tenants (i.e. the replica with the least kv cache)
  18. - When load is imbalanced, it uses the default Power of Two selection
  19. 2. Maintains a prefix tree to track which replicas have processed similar
  20. inputs:
  21. - Inserts prompt text into the prefix tree after routing
  22. - Uses this history to inform future routing decisions
  23. Parameters:
  24. imbalanced_threshold: The threshold for considering the load imbalanced.
  25. match_rate_threshold: The threshold for considering the match rate.
  26. do_eviction: Whether to do eviction.
  27. eviction_threshold_chars: Number of characters in the tree to trigger
  28. eviction.
  29. eviction_target_chars: Number of characters in the tree to target for
  30. eviction.
  31. eviction_interval_secs: How often (in seconds) to run the eviction
  32. policy.
  33. """
  34. pass