Skip to content

Commit bb43833

Browse files
authored
fix vllm lint (#38051)
1 parent db7c539 commit bb43833

2 files changed

Lines changed: 9 additions & 7 deletions

File tree

sdks/python/apache_beam/examples/inference/vllm_text_completion.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@
3838
from apache_beam.options.pipeline_options import SetupOptions
3939
from apache_beam.runners.runner import PipelineResult
4040

41-
# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the engine
42-
# warms the sampler with many dummy sequences unless max_num_seqs is reduced, and
43-
# the default gpu_memory_utilization can leave no free VRAM for that step.
41+
# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the
42+
# engine warms the sampler with many dummy sequences unless max_num_seqs is
43+
# reduced, and the default gpu_memory_utilization can leave no free VRAM for
44+
# that step.
4445
_DEFAULT_VLLM_MAX_NUM_SEQS = 32
4546
_DEFAULT_VLLM_GPU_MEMORY_UTILIZATION = 0.72
4647

@@ -141,7 +142,7 @@ def parse_known_args(argv):
141142

142143

143144
def build_vllm_server_kwargs(known_args) -> dict[str, str]:
144-
"""Returns CLI flags for ``VLLMCompletionsModelHandler(..., vllm_server_kwargs=...)``."""
145+
"""Returns vllm_server_kwargs for ``VLLMCompletionsModelHandler``."""
145146
return {
146147
'max-num-seqs': str(known_args.vllm_max_num_seqs),
147148
'gpu-memory-utilization': str(known_args.vllm_gpu_memory_utilization),

sdks/python/apache_beam/ml/inference/vllm_inference.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,10 @@ def __init__(
201201
`python -m vllm.entrypoints.openai.api_server <beam provided args>
202202
<vllm_server_kwargs>`. For example, you could pass
203203
`{'echo': 'true'}` to prepend new messages with the previous message.
204-
On ~16GB GPUs, pass lower ``max-num-seqs`` and ``gpu-memory-utilization``
205-
values (see ``apache_beam.examples.inference.vllm_text_completion``).
206-
For a list of possible kwargs, see
204+
On ~16GB GPUs, pass lower ``max-num-seqs`` and
205+
``gpu-memory-utilization`` values (see
206+
``apache_beam.examples.inference.vllm_text_completion``). For a list of
207+
possible kwargs, see
207208
https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
208209
min_batch_size: optional. the minimum batch size to use when batching
209210
inputs.

0 commit comments

Comments
 (0)