Skip to content
Open
4 changes: 2 additions & 2 deletions .github/trigger_files/beam_PostCommit_Python.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"comment": "Modify this file in a trivial way to cause this test suite to run.",
"pr": "38069",
"modification": 49
"pr": "38701",
"modification": 51
}
24 changes: 21 additions & 3 deletions sdks/python/apache_beam/examples/inference/vllm_text_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,20 @@ def parse_known_args(argv):
'Passed to the vLLM OpenAI server as --gpu-memory-utilization '
'(fraction of total GPU memory for KV cache). Lower this if the '
'engine fails to start with CUDA out of memory.'))
parser.add_argument(
'--use_dynamo',
dest='use_dynamo',
action='store_true',
help=(
'Use embedded NVIDIA Dynamo as the vLLM engine. Requires '
'ai-dynamo[vllm] and the etcd binary in the runtime environment. '
'See VLLMCompletionsModelHandler for limitations of embedded mode.'))
parser.add_argument(
'--max_tokens',
dest='max_tokens',
type=int,
default=16,
help='Maximum number of tokens to generate for each example.')
return parser.parse_known_args(argv)


Expand Down Expand Up @@ -178,22 +192,26 @@ def run(
build_vllm_server_kwargs(known_args))

model_handler = VLLMCompletionsModelHandler(
model_name=known_args.model, vllm_server_kwargs=effective_vllm_kwargs)
model_name=known_args.model,
vllm_server_kwargs=effective_vllm_kwargs,
use_dynamo=known_args.use_dynamo)
input_examples = COMPLETION_EXAMPLES

if known_args.chat:
model_handler = VLLMChatModelHandler(
model_name=known_args.model,
chat_template_path=known_args.chat_template,
vllm_server_kwargs=dict(effective_vllm_kwargs))
vllm_server_kwargs=dict(effective_vllm_kwargs),
use_dynamo=known_args.use_dynamo)
input_examples = CHAT_EXAMPLES

pipeline = test_pipeline
if not test_pipeline:
pipeline = beam.Pipeline(options=pipeline_options)

examples = pipeline | "Create examples" >> beam.Create(input_examples)
predictions = examples | "RunInference" >> RunInference(model_handler)
predictions = examples | "RunInference" >> RunInference(
model_handler, inference_args={'max_tokens': known_args.max_tokens})
process_output = predictions | "Process Predictions" >> beam.ParDo(
PostProcessor())
_ = process_output | "WriteOutput" >> beam.io.WriteToText(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,22 @@ RUN python3 --version
RUN apt-get install -y curl
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 && pip install --upgrade pip

RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.58.1
RUN pip install openai vllm
RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.71.0
RUN pip install --no-cache-dir openai vllm ai-dynamo[vllm]

RUN apt install libcairo2-dev pkg-config python3-dev -y
RUN pip install pycairo

# etcd binary required by embedded NVIDIA Dynamo for runtime discovery.
ENV ETCD_VERSION=v3.5.13
RUN curl -L https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-amd64.tar.gz -o /tmp/etcd.tar.gz && \
tar xzf /tmp/etcd.tar.gz -C /tmp && \
mv /tmp/etcd-${ETCD_VERSION}-linux-amd64/etcd /usr/local/bin/etcd && \
chmod +x /usr/local/bin/etcd && \
rm -rf /tmp/etcd*

# Copy the Apache Beam worker dependencies from the Beam Python 3.12 SDK image.
COPY --from=apache/beam_python3.12_sdk:2.58.1 /opt/apache/beam /opt/apache/beam
COPY --from=apache/beam_python3.12_sdk:2.71.0 /opt/apache/beam /opt/apache/beam

# Set the entrypoint to Apache Beam SDK worker launcher.
ENTRYPOINT [ "/opt/apache/beam/boot" ]
Loading
Loading