File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 3838from apache_beam .options .pipeline_options import SetupOptions
3939from apache_beam .runners .runner import PipelineResult
4040
41- # Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the engine
42- # warms the sampler with many dummy sequences unless max_num_seqs is reduced, and
43- # the default gpu_memory_utilization can leave no free VRAM for that step.
41+ # Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the
42+ # engine warms the sampler with many dummy sequences unless max_num_seqs is
43+ # reduced, and the default gpu_memory_utilization can leave no free VRAM for
44+ # that step.
4445_DEFAULT_VLLM_MAX_NUM_SEQS = 32
4546_DEFAULT_VLLM_GPU_MEMORY_UTILIZATION = 0.72
4647
@@ -141,7 +142,7 @@ def parse_known_args(argv):
141142
142143
143144def build_vllm_server_kwargs (known_args ) -> dict [str , str ]:
144- """Returns CLI flags for ``VLLMCompletionsModelHandler(..., vllm_server_kwargs=...) ``."""
145+ """Returns vllm_server_kwargs for ``VLLMCompletionsModelHandler``."""
145146 return {
146147 'max-num-seqs' : str (known_args .vllm_max_num_seqs ),
147148 'gpu-memory-utilization' : str (known_args .vllm_gpu_memory_utilization ),
Original file line number Diff line number Diff line change @@ -201,9 +201,10 @@ def __init__(
201201 `python -m vllm.entrypoints.openai.api_server <beam provided args>
202202 <vllm_server_kwargs>`. For example, you could pass
203203 `{'echo': 'true'}` to prepend new messages with the previous message.
204- On ~16GB GPUs, pass lower ``max-num-seqs`` and ``gpu-memory-utilization``
205- values (see ``apache_beam.examples.inference.vllm_text_completion``).
206- For a list of possible kwargs, see
204+ On ~16GB GPUs, pass lower ``max-num-seqs`` and
205+ ``gpu-memory-utilization`` values (see
206+ ``apache_beam.examples.inference.vllm_text_completion``). For a list of
207+ possible kwargs, see
207208 https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
208209 min_batch_size: optional. the minimum batch size to use when batching
209210 inputs.
You can’t perform that action at this time.
0 commit comments