From 6717ed931e0cffa52eb2c54619cc0ed7acdb2ea8 Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Sat, 21 Jun 2025 11:03:04 +0800 Subject: [PATCH 1/5] [doc] Fold long code blocks to improve readability Signed-off-by: reidliu41 --- docs/ci/update_pytorch_version.md | 16 +- docs/cli/README.md | 35 +++- docs/configuration/conserving_memory.md | 40 +++++ docs/configuration/env_vars.md | 5 + docs/configuration/model_resolution.md | 5 + docs/configuration/optimization.md | 35 ++++ docs/configuration/serve_args.md | 5 + docs/contributing/README.md | 10 ++ docs/contributing/model/basic.md | 15 ++ docs/contributing/model/multimodal.md | 155 ++++++++++++++++ docs/contributing/model/registration.md | 10 ++ docs/contributing/profiling.md | 55 ++++++ docs/deployment/docker.md | 25 +++ docs/deployment/frameworks/cerebrium.md | 20 +++ docs/deployment/frameworks/dify.md | 5 + docs/deployment/frameworks/dstack.md | 15 ++ docs/deployment/frameworks/haystack.md | 10 +- docs/deployment/frameworks/litellm.md | 10 ++ docs/deployment/frameworks/lws.md | 20 +++ docs/deployment/frameworks/open-webui.md | 5 + docs/deployment/frameworks/skypilot.md | 5 + docs/deployment/frameworks/streamlit.md | 5 + docs/deployment/integrations/llamastack.md | 10 ++ .../integrations/production-stack.md | 20 ++- docs/deployment/k8s.md | 50 ++++++ docs/deployment/nginx.md | 55 ++++++ docs/design/arch_overview.md | 10 ++ docs/design/kernel/paged_attention.md | 70 ++++++++ docs/design/plugin_system.md | 5 + docs/features/lora.md | 55 +++++- docs/features/multimodal_inputs.md | 50 ++++++ docs/features/reasoning_outputs.md | 30 ++++ docs/features/spec_decode.md | 30 ++++ docs/features/structured_outputs.md | 50 +++++- docs/features/tool_calling.md | 25 ++- .../installation/aws_neuron.md | 15 ++ docs/getting_started/installation/cpu.md | 22 +++ .../installation/cpu/build.inc.md | 15 ++ .../installation/cpu/s390x.inc.md | 15 ++ .../installation/google_tpu.md | 15 ++ .../installation/gpu/cuda.inc.md | 85 +++++++++ .../installation/gpu/rocm.inc.md | 50 ++++++ .../installation/gpu/xpu.inc.md | 15 ++ .../installation/intel_gaudi.md | 45 ++++- docs/getting_started/quickstart.md | 40 +++++ .../models/extensions/runai_model_streamer.md | 25 +++ docs/models/generative_models.md | 25 +++ docs/models/pooling_models.md | 20 ++- docs/models/supported_models.md | 69 +++++++- docs/serving/distributed_serving.md | 10 ++ docs/serving/integrations/langchain.md | 5 + docs/serving/integrations/llamaindex.md | 5 + docs/serving/openai_compatible_server.md | 165 ++++++++++++++++-- docs/usage/metrics.md | 15 ++ docs/usage/troubleshooting.md | 45 +++++ docs/usage/usage_stats.md | 15 ++ 56 files changed, 1647 insertions(+), 35 deletions(-) diff --git a/docs/ci/update_pytorch_version.md b/docs/ci/update_pytorch_version.md index 2ad3430a4de..199f7395c8c 100644 --- a/docs/ci/update_pytorch_version.md +++ b/docs/ci/update_pytorch_version.md @@ -91,12 +91,17 @@ source to unblock the update process. ### FlashInfer Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): -``` +
+<summary>Commands</summary>
+
+```bash
 export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
 export FLASHINFER_ENABLE_SM90=1
 uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
 ```
+
+</details>
+
 One caveat is that building FlashInfer from source adds approximately 30
 minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a public location for immediate installation, such as https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl. For future releases, contact the PyTorch release
 team if you want to get the package published there.

 ### xFormers

 Similar to FlashInfer, here is how to build and install xFormers from source:

-```
+<details>
+<summary>Commands</summary>
+
+```bash
 export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
 ```
+
+</details>
+
 ### Mamba

-```
+```bash
 uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 ```

diff --git a/docs/cli/README.md b/docs/cli/README.md
index df700fb743c..14031c5b43a 100644
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@@ -16,7 +16,8 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}

 Start the vLLM OpenAI Compatible API server.

-Examples:
+<details>
+Examples ```bash # Start with a model @@ -39,11 +40,14 @@ vllm serve --help=max-num-seqs vllm serve --help=max ``` +
+ ## chat Generate chat completions via the running API server. -Examples: +
+Examples ```bash # Directly connect to localhost API without arguments @@ -56,11 +60,14 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm chat --quick "hi" ``` +
+ ## complete Generate text completions based on the given prompt via the running API server. -Examples: +
+Examples ```bash # Directly connect to localhost API without arguments @@ -73,6 +80,8 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm complete --quick "The future of AI is" ``` +
+ ## bench Run benchmark tests for latency online serving throughput and offline inference throughput. @@ -89,7 +98,8 @@ vllm bench {latency, serve, throughput} Benchmark the latency of a single batch of requests. -Example: +
+Example ```bash vllm bench latency \ @@ -100,11 +110,14 @@ vllm bench latency \ --load-format dummy ``` +
+ ### serve Benchmark the online serving throughput. -Example: +
+Example ```bash vllm bench serve \ @@ -116,11 +129,14 @@ vllm bench serve \ --num-prompts 5 ``` +
+ ### throughput Benchmark offline inference throughput. -Example: +
+Example ```bash vllm bench throughput \ @@ -131,6 +147,8 @@ vllm bench throughput \ --load-format dummy ``` +
+ ## collect-env Start collecting environment information. @@ -143,7 +161,8 @@ vllm collect-env Run batch prompts and write results to file. -Examples: +
+Examples ```bash # Running with a local file @@ -159,6 +178,8 @@ vllm run-batch \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` +
+ ## More Help For detailed options of any subcommand, use: diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index a1283a503a6..00add26b2e1 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -8,6 +8,9 @@ Tensor parallelism (`tensor_parallel_size` option) can be used to split the mode The following code splits the model across 2 GPUs. +
+Code + ```python from vllm import LLM @@ -15,6 +18,8 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` +
+ !!! warning To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. @@ -40,6 +45,9 @@ Dynamic quantization is also supported via the `quantization` option -- see [her You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` option). +
+<summary>Code</summary>
+
 ```python
 from vllm import LLM
@@ -48,6 +56,8 @@ llm = LLM(model="adept/fuyu-8b",
           max_num_seqs=2)
 ```
+
+ ## Reduce CUDA Graphs By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. @@ -57,6 +67,9 @@ By default, we optimize model inference using CUDA graphs which take up extra me You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: +
+Code + ```python from vllm import LLM from vllm.config import CompilationConfig, CompilationLevel @@ -71,8 +84,13 @@ llm = LLM( ) ``` +
+ You can disable graph capturing completely via the `enforce_eager` flag: +
+Code + ```python from vllm import LLM @@ -80,6 +98,8 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) ``` +
+ ## Adjust cache size If you run out of CPU RAM, try the following options: @@ -91,6 +111,9 @@ If you run out of CPU RAM, try the following options: You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: +
+Code + ```python from vllm import LLM @@ -99,9 +122,14 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1}) ``` +
+ You can go a step further and disable unused modalities completely by setting its limit to zero. For example, if your application only accepts image input, there is no need to allocate any memory for videos. +
+Code + ```python from vllm import LLM @@ -110,8 +138,13 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0}) ``` +
+ You can even run a multi-modal model for text-only inference: +
+Code + ```python from vllm import LLM @@ -120,6 +153,8 @@ llm = LLM(model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0}) ``` +
+ ## Multi-modal processor arguments For certain models, you can adjust the multi-modal processor arguments to @@ -127,6 +162,9 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory. Here are some examples: +
+Code + ```python from vllm import LLM @@ -142,3 +180,5 @@ llm = LLM(model="OpenGVLab/InternVL2-2B", "max_dynamic_patch": 4, # Default is 12 }) ``` + +
diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index f6d548a19d9..09aa4a595a8 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,6 +7,11 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). +
+Code + ```python --8<-- "vllm/envs.py:env-vars-definition" ``` + +
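
As a rough usage sketch, these variables are typically exported in the shell or set in Python before the engine is created (the specific variables below are just examples; consult the generated list above for the full set and exact semantics):

```python
import os

# Illustrative only: make sure the variables are set before vLLM reads them.
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")
```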
diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index 8757c257d3e..fcf1ccb64a2 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -11,6 +11,9 @@ Nevertheless, our model resolution may fail for the following reasons: To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. For example: +
+Code + ```python from vllm import LLM @@ -20,4 +23,6 @@ model = LLM( ) ``` +
+ Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 811925c19e6..689f9770539 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -48,6 +48,9 @@ You can tune the performance by adjusting `max_num_batched_tokens`: - For optimal throughput, we recommend setting `max_num_batched_tokens > 8096` especially for smaller models on large GPUs. - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes). +
+Code + ```python from vllm import LLM @@ -55,6 +58,8 @@ from vllm import LLM llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_num_batched_tokens=16384) ``` +
+ See related papers for more details ( or ). ## Parallelism Strategies @@ -70,6 +75,9 @@ Tensor parallelism shards model parameters across multiple GPUs within each mode - When the model is too large to fit on a single GPU - When you need to reduce memory pressure per GPU to allow more KV cache space for higher throughput +
+Code + ```python from vllm import LLM @@ -77,6 +85,8 @@ from vllm import LLM llm = LLM(model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4) ``` +
+ For models that are too large to fit on a single GPU (like 70B parameter models), tensor parallelism is essential. ### Pipeline Parallelism (PP) @@ -90,6 +100,9 @@ Pipeline parallelism distributes model layers across multiple GPUs. Each GPU pro Pipeline parallelism can be combined with tensor parallelism for very large models: +
+Code + ```python from vllm import LLM @@ -101,6 +114,8 @@ llm = LLM( ) ``` +
+ ### Expert Parallelism (EP) Expert parallelism is a specialized form of parallelism for Mixture of Experts (MoE) models, where different expert networks are distributed across GPUs. @@ -134,6 +149,9 @@ If you encounter out-of-memory issues, consider these strategies: You can reduce memory usage by limiting the context length and batch size: +
+Code + ```python from vllm import LLM @@ -144,10 +162,15 @@ llm = LLM( ) ``` +
+ ### Adjust CUDA Graph Compilation CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: +
+Code + ```python from vllm import LLM from vllm.config import CompilationConfig, CompilationLevel @@ -161,8 +184,13 @@ llm = LLM( ) ``` +
+ Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: +
+Code + ```python from vllm import LLM @@ -172,10 +200,15 @@ llm = LLM( ) ``` +
+ ### Multimodal Models For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: +
+Code + ```python from vllm import LLM @@ -185,3 +218,5 @@ llm = LLM( limit_mm_per_prompt={"image": 2} ) ``` + +
diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md index 16b4b29f45d..0a999a4e7b9 100644 --- a/docs/configuration/serve_args.md +++ b/docs/configuration/serve_args.md @@ -17,6 +17,9 @@ The argument names must be the long form of those outlined [above][serve-args]. For example: +
+Config + ```yaml # config.yaml @@ -26,6 +29,8 @@ port: 6379 uvicorn-log-level: "info" ``` +
+ To use the above config file: ```bash diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 10c50e00724..50cb3ac2a7d 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -77,12 +77,17 @@ mkdocs serve Example output: +
+Output + ```console INFO - Documentation built in 106.83 seconds INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' INFO - [22:02:02] Serving on http://127.0.0.1:8000/ ``` +
+ #### View in Your Browser Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:. @@ -93,6 +98,9 @@ For additional features and advanced configurations, refer to the official [MkDo ## Testing +
+Commands + ```bash pip install -r requirements/dev.txt @@ -113,6 +121,8 @@ pytest tests/ pytest -s -v tests/test_logger.py ``` +
+ !!! tip Since the ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index 0c0ba337925..998b86efcbc 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -27,6 +27,9 @@ All vLLM modules within the model must include a `prefix` argument in their cons The initialization code should look like this: +
+Code + ```python from torch import nn from vllm.config import VllmConfig @@ -55,10 +58,15 @@ class MyModelForCausalLM(nn.Module): self.model = MyModel(vllm_config, prefix=f"{prefix}.model") ``` +
+ ### Computation Code - Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model. +
+Code + ```python class MyModel(nn.Module): ... @@ -67,8 +75,13 @@ class MyModel(nn.Module): ... ``` +
+ - Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. +
+Code + ```python def forward( self, @@ -78,6 +91,8 @@ def forward( ... ``` +
+ !!! note Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index bed6d4e653d..17c75270be3 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -25,6 +25,9 @@ Further update the model as follows: - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. +
+ Code + ```python class YourModelForImage2Seq(nn.Module): ... @@ -48,11 +51,16 @@ Further update the model as follows: return vision_embeddings ``` +
+ !!! important The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. +
+ Code + ```python from .utils import merge_multimodal_embeddings @@ -79,8 +87,13 @@ Further update the model as follows: return inputs_embeds ``` +
+ - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. +
+ Code + ```python class YourModelForImage2Seq(nn.Module): ... @@ -90,6 +103,8 @@ Further update the model as follows: return self.language_model ``` +
+ - Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. ```diff @@ -115,11 +130,16 @@ to return the maximum number of input items for each modality supported by the m For example, if the model supports any number of images but only one video per prompt: +
+Code + ```python def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} ``` +
+ ## 3. Specify dummy inputs Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for @@ -135,6 +155,9 @@ Assuming that the memory usage increases with the number of tokens, the dummy in Looking at the code of HF's `LlavaForConditionalGeneration`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 n_image_tokens = (input_ids == self.config.image_token_index).sum().item() @@ -154,9 +177,14 @@ Assuming that the memory usage increases with the number of tokens, the dummy in inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) ``` +
+ The number of placeholder feature tokens per image is `image_features.shape[1]`. `image_features` is calculated inside the `get_image_features` method: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) @@ -172,12 +200,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return image_features ``` +
+ We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention mechanism doesn't change the sequence length of the output hidden states. +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) @@ -191,8 +224,13 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ) ``` +
+ To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 target_dtype = self.patch_embedding.weight.dtype @@ -208,16 +246,26 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return embeddings ``` +
+ We can infer that `embeddings.shape[1] == self.num_positions`, where +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 ``` +
+ Overall, the number of placeholder feature tokens for an image can be calculated as: +
+ Code + ```python def get_num_image_tokens( self, @@ -238,9 +286,14 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return num_image_tokens ``` +
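
    As a quick sanity check, here is the arithmetic with the vision settings used by
    [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
    (CLIP ViT-L/14 at 336x336; treat these values as assumptions to verify against the model config):

    ```python
    image_size, patch_size = 336, 14                 # assumed CLIP vision config values
    num_patches = (image_size // patch_size) ** 2    # 24 * 24 = 576
    num_positions = num_patches + 1                  # +1 for the class embedding = 577

    # With the default `vision_feature_select_strategy`, the class token is dropped,
    # leaving 576 placeholder feature tokens per image.
    num_image_tokens = num_positions - 1
    print(num_image_tokens)  # 576
    ```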
+ Notice that the number of image tokens doesn't depend on the image width and height. We can simply use a dummy `image_size` to calculate the multimodal profiling data: +
+ Code + ```python # NOTE: In actuality, this is usually implemented as part of the # model's subclass of `BaseProcessingInfo`, but we show it as is @@ -268,8 +321,13 @@ Assuming that the memory usage increases with the number of tokens, the dummy in } ``` +
+ For the text, we simply expand the multimodal image token from the model config to match the desired number of images. +
+ Code + ```python def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -280,10 +338,15 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return image_token * num_images ``` +
+ === "No input placeholders: Fuyu" Looking at the code of HF's `FuyuForCausalLM`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 if image_patches is not None and past_key_values is None: @@ -300,6 +363,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ) ``` +
+ The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. @@ -312,6 +377,9 @@ Assuming that the memory usage increases with the number of tokens, the dummy in In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, returning the dimensions after resizing (but before padding) as metadata. +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) @@ -346,8 +414,13 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ] ``` +
+ In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 model_image_input = self.image_processor.preprocess_with_tokenizer_info( @@ -382,8 +455,13 @@ Assuming that the memory usage increases with the number of tokens, the dummy in assert num_patches == patches.shape[0] ``` +
+ The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 patch_size = patch_size if patch_size is not None else self.patch_size @@ -399,9 +477,14 @@ Assuming that the memory usage increases with the number of tokens, the dummy in num_patches = num_patches_per_dim_h * num_patches_per_dim_w ``` +
+ These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. +
+ Code + ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() @@ -409,16 +492,26 @@ Assuming that the memory usage increases with the number of tokens, the dummy in height=image_processor.size["height"]) ``` +
+ Fuyu does not expect image placeholders in the inputs to HF processor, so the dummy prompt text is empty regardless of the number of images. +
+ Code + ```python def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: return "" ``` +
+ For the multimodal image profiling data, the logic is very similar to LLaVA: +
+ Code + ```python def get_dummy_mm_data( self, @@ -437,6 +530,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in } ``` +
+ ## 4. Specify processing details Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] @@ -455,6 +550,9 @@ return a schema of the tensors outputted by the HF processor that are related to The output of `CLIPImageProcessor` is a simple tensor with shape `(num_images, num_channels, image_height, image_width)`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 images = [ @@ -466,8 +564,13 @@ return a schema of the tensors outputted by the HF processor that are related to return BatchFeature(data=data, tensor_type=return_tensors) ``` +
+ So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: +
+ Code + ```python def _get_mm_fields_config( self, @@ -479,6 +582,8 @@ return a schema of the tensors outputted by the HF processor that are related to ) ``` +
+ !!! note Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. @@ -488,6 +593,9 @@ return a schema of the tensors outputted by the HF processor that are related to The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates the patches from each image belonging to an item in the batch: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 image_input_ids.append(tensor_of_image_ids) @@ -499,12 +607,17 @@ return a schema of the tensors outputted by the HF processor that are related to batch_image_patches.append(image_patches) ``` +
+ The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore `(1, num_images, num_patches, patch_width * patch_height * num_channels)`. In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: +
+ Code + ```python def _call_hf_processor( self, @@ -535,12 +648,17 @@ return a schema of the tensors outputted by the HF processor that are related to return processed_outputs ``` +
+ !!! note Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling for text-only inputs to prevent unnecessary warnings from HF processor. This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: +
+ Code + ```python def _get_mm_fields_config( self, @@ -550,6 +668,8 @@ return a schema of the tensors outputted by the HF processor that are related to return dict(image_patches=MultiModalFieldConfig.batched("image")) ``` +
+ ### Prompt updates Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to @@ -562,6 +682,9 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies Looking at HF's `LlavaProcessor`: +
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 prompt_strings = [] @@ -570,9 +693,14 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies prompt_strings.append(sample) ``` +
+ It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: +
+ Code + ```python def _get_prompt_updates( self, @@ -603,6 +731,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ] ``` +
+ === "Handling additional tokens: Fuyu" Recall the layout of feature tokens from Step 2: @@ -616,6 +746,9 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies We define a helper function to return `ncols` and `nrows` directly: +
+ Code + ```python def get_image_feature_grid_size( self, @@ -642,8 +775,13 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies return ncols, nrows ``` +
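
    As a concrete example, assuming the default processor configuration of a 1080x1920 target size
    with 30x30 patches (check `image_processor.size` and `image_processor.patch_size` for your
    checkpoint), an image that already matches the target size gives:

    ```python
    import math

    image_width, image_height = 1920, 1080   # assumed FuyuImageProcessor.size
    patch_width, patch_height = 30, 30       # assumed FuyuImageProcessor.patch_size

    ncols = math.ceil(image_width / patch_width)    # 64 `|SPEAKER|` tokens per row
    nrows = math.ceil(image_height / patch_height)  # 36 rows, each followed by `|NEWLINE|`
    print(ncols * nrows)                            # 2304 image patches in total
    ```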
+ Based on this, we can initially define our replacement tokens as: +
+ Code + ```python def get_replacement(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) @@ -659,9 +797,14 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows ``` +
+
+    </details>
+
     However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
     a BOS token (`<s>`) is also added to the prompt:
+
+    <details>
+ Code + ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 model_image_input = self.image_processor.preprocess_with_tokenizer_info( @@ -684,9 +827,14 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ) ``` +
+ To assign the vision embeddings to only the image tokens, instead of a string you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: +
+ Code + ```python hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id # `` @@ -709,9 +857,14 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ) ``` +
+ Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, we can search for it to conduct the replacement at the start of the string: +
+ Code + ```python def _get_prompt_updates( self, @@ -752,6 +905,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ] ``` +
+ ## 5. Register processor-related classes After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2), diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index a6dc1e32dfb..c2ee9ffe970 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -27,6 +27,9 @@ You can load an external model [using a plugin][plugin-system] without modifying To register the model, use the following code: +
+Code + ```python # The entrypoint of your plugin def register(): @@ -36,8 +39,13 @@ def register(): ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) ``` +
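
The `register()` entrypoint above is discovered through your package's entry points. A minimal
`setup.py` sketch (the package and plugin names here are placeholders) that exposes it under the
`vllm.general_plugins` group could look like this:

```python
from setuptools import setup

setup(
    name="vllm_add_dummy_model",        # placeholder package name
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        "vllm.general_plugins": [
            # "<plugin name> = <module>:<registration function>"
            "register_dummy_model = vllm_add_dummy_model:register",
        ],
    },
)
```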
+ If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: +
+Code + ```python # The entrypoint of your plugin def register(): @@ -49,6 +57,8 @@ def register(): ) ``` +
+ !!! important If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. Read more about that [here][supports-multimodal]. diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index be01b9b65f6..1c4dc116a9b 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -29,16 +29,26 @@ Refer to for an example #### OpenAI Server +
+Command + ```bash VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B ``` +
+ benchmark_serving.py: +
+<summary>Command</summary>
+
 ```bash
 python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
 ```
+
+ ## Profile with NVIDIA Nsight Systems Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events. @@ -46,6 +56,9 @@ Nsight systems is an advanced tool that exposes more profiling details, such as [Install nsight-systems](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html) using your package manager. The following block is an example for Ubuntu. +
+Command + ```bash apt update apt install -y --no-install-recommends gnupg @@ -55,6 +68,8 @@ apt update apt install nsight-systems-cli ``` +
+ ### Example commands and usage #### Offline Inference @@ -63,14 +78,22 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo The following is an example using the `benchmarks/benchmark_latency.py` script: +
+Command + ```bash nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8 ``` +
+ #### OpenAI Server To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed. +
+Command + ```bash # server nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct @@ -79,18 +102,30 @@ nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512 ``` +
+ In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run: +
+Command + ``` nsys sessions list ``` +
+ to get the session id in the form of `profile-XXXXX`, then run: +
+Command + ``` nsys stop --session=profile-XXXXX ``` +
+ to manually kill the profiler and generate your `nsys-rep` report. #### Analysis @@ -99,6 +134,9 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p CLI example: +
+Command + ```bash nsys stats report1.nsys-rep ... @@ -118,6 +156,8 @@ nsys stats report1.nsys-rep ... ``` +
+ GUI example: Screenshot 2025-03-05 at 11 48 42 AM @@ -136,6 +176,9 @@ The first helper is a Python decorator that can be used to profile a function. If a filename is specified, the profile will be saved to that file. If no filename is specified, profile data will be printed to stdout. +
+Code + ```python import vllm.utils @@ -145,11 +188,16 @@ def expensive_function(): pass ``` +
+ ### Example Usage - context manager The second helper is a context manager that can be used to profile a block of code. Similar to the decorator, the filename is optional. +
+Code + ```python import vllm.utils @@ -161,12 +209,19 @@ with vllm.utils.cprofile_context("another_function.prof"): another_function() ``` +
+ ### Analyzing Profile Results There are multiple tools available that can help analyze the profile results. One example is [snakeviz](https://jiffyclub.github.io/snakeviz/). +
+Command + ```bash pip install snakeviz snakeviz expensive_function.prof ``` + +
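
If you prefer to stay in the terminal, the standard-library `pstats` module can read the same
files. A small sketch, assuming the `expensive_function.prof` output from the decorator example
above:

```python
import pstats

stats = pstats.Stats("expensive_function.prof")
# Show the 20 entries with the highest cumulative time.
stats.sort_stats("cumulative").print_stats(20)
```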
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 93d9e80f5b0..5feac71d07d 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -10,6 +10,9 @@ title: Using Docker vLLM offers an official Docker image for deployment. The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). +
+Command + ```console docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -20,8 +23,13 @@ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +
+ This image can also be used with other container engines such as [Podman](https://podman.io/). +
+Command + ```console podman run --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -32,6 +40,8 @@ podman run --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +
+ You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`). !!! note @@ -71,6 +81,9 @@ You can add any other [engine-args][engine-args] you need after the image tag (` You can build and run vLLM from source via the provided . To build vLLM: +
+Command + ```console # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 DOCKER_BUILDKIT=1 docker build . \ @@ -79,6 +92,8 @@ DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile ``` +
+ !!! note By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` @@ -97,6 +112,9 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). +
+Command + ```console # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) python3 use_existing_torch.py @@ -111,6 +129,8 @@ DOCKER_BUILDKIT=1 docker build . \ --build-arg vllm_fa_cmake_gpu_arches="90-real" ``` +
+ !!! note If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution. @@ -126,6 +146,9 @@ DOCKER_BUILDKIT=1 docker build . \ To run vLLM with the custom-built Docker image: +
+Command + ```console docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -134,6 +157,8 @@ docker run --runtime nvidia --gpus all \ vllm/vllm-openai ``` +
+ The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). !!! note diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index 84cb2304fac..c6032facec2 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -24,6 +24,9 @@ cerebrium init vllm-project Next, to install the required packages, add the following to your cerebrium.toml: +
+Config + ```toml [cerebrium.deployment] docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" @@ -32,8 +35,13 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" vllm = "latest" ``` +
+ Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: +
+Code + ```python from vllm import LLM, SamplingParams @@ -54,6 +62,8 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): return {"results": results} ``` +
+ Then, run the following code to deploy it to the cloud: ```console @@ -62,6 +72,9 @@ cerebrium deploy If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`) +
+Command + ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ -H 'Content-Type: application/json' \ @@ -76,8 +89,13 @@ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ }' ``` +
+ You should get a response like: +
+Response + ```python { "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", @@ -105,4 +123,6 @@ You should get a response like: } ``` +
+ You now have an autoscaling endpoint where you only pay for the compute you use! diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 886484b5434..147e61211eb 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -24,6 +24,9 @@ vllm serve Qwen/Qwen1.5-7B-Chat - Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): +
+Commands + ```console git clone https://github.com/langgenius/dify.git cd dify @@ -32,6 +35,8 @@ cp .env.example .env docker compose up -d ``` +
+ - Open the browser to access `http://localhost/install`, config the basic login information and login. - In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 7de92855745..0601e57cafd 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -26,6 +26,9 @@ dstack init Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: +
+Config + ```yaml type: service @@ -44,8 +47,13 @@ model: name: NousResearch/Llama-2-7b-chat-hf ``` +
+ Then, run the following CLI for provisioning: +
+Command + ```console $ dstack run . -f serve.dstack.yml @@ -73,8 +81,13 @@ spicy-treefrog-1 provisioning completed (running) Service is published at ... ``` +
+ After the provisioning, you can interact with the model by using the OpenAI SDK: +
+Code + ```python from openai import OpenAI @@ -96,5 +109,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` +
+ !!! note dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 2eac4a5279f..f6f9727b745 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -27,6 +27,9 @@ vllm serve mistralai/Mistral-7B-Instruct-v0.1 - Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. +
+Code + ```python from haystack.components.generators.chat import OpenAIChatGenerator from haystack.dataclasses import ChatMessage @@ -49,7 +52,10 @@ print(response) print("-"*30) ``` -Output e.g.: +
+ +
+Output ```console ------------------------------ @@ -57,4 +63,6 @@ Output e.g.: ------------------------------ ``` +
+ For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md). diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index 3011cde8301..abd460b19be 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -34,6 +34,9 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - Call it with litellm: +
+Code + ```python import litellm @@ -50,6 +53,8 @@ response = litellm.completion( print(response) ``` +
+ ### Embeddings - Start the vLLM server with the supported embedding model, e.g. @@ -60,6 +65,9 @@ vllm serve BAAI/bge-base-en-v1.5 - Call it with litellm: +
+Code + ```python from litellm import embedding import os @@ -73,4 +81,6 @@ embedding = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello w print(embedding) ``` +
+ For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm). diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 18282a89ddf..42b980d1cd4 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -17,6 +17,9 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber Deploy the following yaml file `lws.yaml` +
+Config + ```yaml apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet @@ -111,6 +114,8 @@ spec: type: ClusterIP ``` +
+ ```bash kubectl apply -f lws.yaml ``` @@ -123,6 +128,9 @@ kubectl get pods Should get an output similar to this: +
+Output + ```bash NAME READY STATUS RESTARTS AGE vllm-0 1/1 Running 0 2s @@ -131,6 +139,8 @@ vllm-1 1/1 Running 0 2s vllm-1-1 1/1 Running 0 2s ``` +
+ Verify that the distributed tensor-parallel inference works: ```bash @@ -162,6 +172,9 @@ Forwarding from [::1]:8080 -> 8080 Open another terminal and send a request +
+Command + ```text curl http://localhost:8080/v1/completions \ -H "Content-Type: application/json" \ @@ -173,8 +186,13 @@ curl http://localhost:8080/v1/completions \ }' ``` +
+ The output should be similar to the following +
+Output + ```text { "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", @@ -197,3 +215,5 @@ The output should be similar to the following } } ``` + +
diff --git a/docs/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md index 1ab1931068f..7b98504ef8a 100644 --- a/docs/deployment/frameworks/open-webui.md +++ b/docs/deployment/frameworks/open-webui.md @@ -13,6 +13,9 @@ vllm serve qwen/Qwen1.5-0.5B-Chat 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): +
+Command + ```console docker run -d -p 3000:8080 \ --name open-webui \ @@ -22,6 +25,8 @@ docker run -d -p 3000:8080 \ ghcr.io/open-webui/open-webui:main ``` +
+ 1. Open it in the browser: On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index 9763745f237..d16bb23be92 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -24,6 +24,9 @@ sky check See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). +
+Config + ```yaml resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. @@ -67,6 +70,8 @@ run: | --stop-token-ids 128009,128001 ``` +
+ Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): ```console diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md index 33ed8c5f5b5..c1e3ab0c738 100644 --- a/docs/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -29,6 +29,9 @@ pip install streamlit openai - Start the streamlit web UI and start to chat: +
+Commands + ```console streamlit run streamlit_openai_chatbot_webserver.py @@ -40,4 +43,6 @@ VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug ``` +
+ ![](../../assets/deployment/streamlit-chat.png) diff --git a/docs/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md index 2ae600a423f..0f1a6ee054c 100644 --- a/docs/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -15,6 +15,9 @@ pip install llama-stack -q Then start Llama Stack server pointing to your vLLM server with the following configuration: +
+Config + ```yaml inference: - provider_id: vllm0 @@ -23,6 +26,8 @@ inference: url: http://127.0.0.1:8000 ``` +
+ Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. ## Inference via Embedded vLLM @@ -30,6 +35,9 @@ Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distri An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) is also available. This is a sample of configuration using that method: +
+Config + ```yaml inference - provider_type: vllm @@ -37,3 +45,5 @@ inference model: Llama3.1-8B-Instruct tensor_parallel_size: 4 ``` + +
diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md index 8288a4b6e6b..162330a15a3 100644 --- a/docs/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -60,7 +60,8 @@ And then you can send out a query to the OpenAI-compatible API to check the avai curl -o- http://localhost:30080/models ``` -Expected output: +
+Expected output ```json { @@ -77,8 +78,13 @@ Expected output: } ``` +
+ To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint: +
+Command + ```bash curl -X POST http://localhost:30080/completions \ -H "Content-Type: application/json" \ @@ -89,7 +95,10 @@ curl -X POST http://localhost:30080/completions \ }' ``` -Expected output: +
+ +
+Expected output ```json { @@ -107,6 +116,8 @@ Expected output: } ``` +
+ ### Uninstall To remove the deployment, run: @@ -121,6 +132,9 @@ sudo helm uninstall vllm The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above: +
+Yaml + ```yaml servingEngineSpec: runtimeClassName: "" @@ -139,6 +153,8 @@ servingEngineSpec: pvcStorage: "10Gi" ``` +
+ In this YAML configuration: * **`modelSpec`** includes: * `name`: A nickname that you prefer to call the model. diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 7430f99a539..6efef154c1e 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -29,6 +29,9 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: +
+Config + ```bash cat < + Next, start the vLLM server as a Kubernetes Deployment and Service: +
+Config + ```bash cat < + We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): +
+Logs + ```console kubectl logs -l app.kubernetes.io/name=vllm ... @@ -120,6 +133,8 @@ INFO: Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ``` +
+ ## Deployment with GPUs **Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). @@ -128,6 +143,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) PVC is used to store the model cache and it is optional, you can use hostPath or other storage options +
+ Config + ```yaml apiVersion: v1 kind: PersistentVolumeClaim @@ -144,8 +162,13 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) volumeMode: Filesystem ``` +
+ Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models +
+ Config + ```yaml apiVersion: v1 kind: Secret @@ -157,12 +180,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) token: "REPLACE_WITH_TOKEN" ``` +
+ Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. Here are two examples for using NVIDIA GPU and AMD GPU. NVIDIA GPU: +
+ Config + ```yaml apiVersion: apps/v1 kind: Deployment @@ -233,10 +261,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) periodSeconds: 5 ``` +
+ AMD GPU: You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. +
+ Config + ```yaml apiVersion: apps/v1 kind: Deployment @@ -305,12 +338,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) mountPath: /dev/shm ``` +
+ You can get the full example with steps and sample yaml files from . 2. Create a Kubernetes Service for vLLM Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: +
+ Config + ```yaml apiVersion: v1 kind: Service @@ -330,17 +368,27 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) type: ClusterIP ``` +
+ 3. Deploy and Test Apply the deployment and service configurations using `kubectl apply -f `: +
+ Command + ```console kubectl apply -f deployment.yaml kubectl apply -f service.yaml ``` +
+ To test the deployment, run the following `curl` command: +
+ Command + ```console curl http://mistral-7b.default.svc.cluster.local/v1/completions \ -H "Content-Type: application/json" \ @@ -352,6 +400,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) }' ``` +
+ If the service is correctly deployed, you should receive a response from the vLLM model. ## Troubleshooting diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index f0ff5c1d0e7..eb1aa23cd77 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -11,12 +11,20 @@ This document shows how to launch multiple vLLM serving containers and use Nginx This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. +
+Command + ```console export vllm_root=`pwd` ``` +
+ Create a file named `Dockerfile.nginx`: +
+Dockerfile + ```console FROM nginx:latest RUN rm /etc/nginx/conf.d/default.conf @@ -24,18 +32,28 @@ EXPOSE 80 CMD ["nginx", "-g", "daemon off;"] ``` +
+ Build the container: +
+Command + ```console docker build . -f Dockerfile.nginx --tag nginx-lb ``` +
+ [](){ #nginxloadbalancer-nginx-conf } ## Create Simple Nginx Config file Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. +
+Config + ```console upstream backend { least_conn; @@ -54,17 +72,27 @@ server { } ``` +
+ [](){ #nginxloadbalancer-nginx-vllm-container } ## Build vLLM Container +
+Command + ```console cd $vllm_root docker build -f docker/Dockerfile . --tag vllm ``` +
+ If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: +
+Command + ```console cd $vllm_root docker build \ @@ -74,14 +102,21 @@ docker build \ --build-arg https_proxy=$https_proxy ``` +
+ [](){ #nginxloadbalancer-nginx-docker-network } ## Create Docker Network +
+Command + ```console docker network create vllm_nginx ``` +
+ [](){ #nginxloadbalancer-nginx-launch-container } ## Launch vLLM Containers @@ -93,6 +128,9 @@ Notes: - The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. +
+Command + ```console mkdir -p ~/.cache/huggingface/hub/ hf_cache_dir=~/.cache/huggingface/ @@ -118,6 +156,8 @@ docker run \ --model meta-llama/Llama-2-7b-chat-hf ``` +
+ !!! note
    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
@@ -125,6 +165,9 @@
## Launch Nginx
+
+Command + ```console docker run \ -itd \ @@ -134,17 +177,29 @@ docker run \ --name nginx-lb nginx-lb:latest ``` +
+ [](){ #nginxloadbalancer-nginx-verify-nginx } ## Verify That vLLM Servers Are Ready +
+Command + ```console docker logs vllm0 | grep Uvicorn docker logs vllm1 | grep Uvicorn ``` +
+ Both outputs should look like this: +
+Output + ```console INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ``` + +
diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md index 14720a392aa..ca5b6530ac8 100644 --- a/docs/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -22,6 +22,9 @@ server. Here is a sample of `LLM` class usage: +
+Code + ```python from vllm import LLM, SamplingParams @@ -48,6 +51,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. The code for the `LLM` class can be found in . @@ -178,6 +183,9 @@ vision-language model. To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: +
+ Code + ```python class MyOldModel(nn.Module): def __init__( @@ -205,6 +213,8 @@ vision-language model. MyModel = MyOldModel ``` +
+ This way, the model can work with both old and new versions of vLLM. 3\. **Sharding and Quantization at Initialization**: Certain features require diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index 6ebe1ee48ac..d121435d187 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -39,6 +39,9 @@ refer to multi-dimensional arrays, but each thread only accesses the portion of data assigned to it. I have omitted all other runtime parameters here for simplicity. +
+Code + ```cpp template __device__ void paged_attention_kernel( @@ -51,6 +54,8 @@ __device__ void paged_attention_kernel( ) ``` +
+ There is also a list of template arguments above the function signature that are determined at compile time. `scalar_t` represents the data type of the query, key, and value data elements,
@@ -134,10 +139,15 @@ one query token data.
Within each warp, every thread group will fetch the same query token data, but will multiply it with different key token data.
+
+Code + ```cpp const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` +
+
![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" }
@@ -175,12 +185,17 @@ tokens are processed by the entire thread group after the kernel run. In this context, "handle" refers to performing the dot multiplication between query data and key data. +
+Code + ```cpp const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + kv_head_idx * kv_head_stride + physical_block_offset * x; ``` +
+ Unlike `q_ptr`, the `k_ptr` in each thread points to a different key token at each iteration. As shown above, `k_ptr` points to the key token data in `k_cache` at the assigned block,
@@ -235,6 +250,9 @@ point to different tokens and prepare the `k_vecs` in the inner for loop.
Finally, we perform the dot multiplication between the `q_vecs` and each `k_vecs`.
+
+Code + ```cpp q_vecs = ... for ... { @@ -247,6 +265,8 @@ for ... { } ``` +
+ As mentioned before, for each thread, it only fetches part of the query and key token data at a time. However, there will be a cross thread group reduction happen in the `Qk_dot<>::dot` . So `qk` @@ -287,6 +307,9 @@ store the normalized softmax result). Also we can compare and collect the `qk_max` for all `qk`s that are calculated by current thread group. +
+Code + ```cpp if (thread_group_offset == 0) { const bool mask = token_idx >= context_len; @@ -295,10 +318,15 @@ if (thread_group_offset == 0) { } ``` +
+ Please note that `logits` here is in shared memory, so each thread group will set the fields for its own assigned context tokens. Overall, the size of `logits` should be the number of context tokens. +
+Code + ```cpp for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); @@ -309,10 +337,15 @@ if (lane == 0) { } ``` +
+ Then we need to get the reduced `qk_max` across each warp. The main idea is to have the threads in a warp communicate with each other and obtain the final max `qk`. +
+Code + ```cpp for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); @@ -320,6 +353,8 @@ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { qk_max = VLLM_SHFL_SYNC(qk_max, 0); ``` +
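For intuition, the XOR-shuffle loop above implements a butterfly reduction: at each step every lane takes the max of its own value and the value held by a partner lane at a decreasing offset. The following is a plain-Python reference sketch of the same idea only, assuming a power-of-two group size.

```python
# Reference sketch of a butterfly max-reduction across a group of lanes.
def butterfly_max(values):
    vals = list(values)
    n = len(vals)          # assumed to be a power of two, e.g. a 32-lane warp
    offset = n // 2
    while offset >= 1:
        # each lane keeps the max of its value and its partner's value
        vals = [max(vals[i], vals[i ^ offset]) for i in range(n)]
        offset //= 2
    return vals            # every lane now holds the group-wide max

print(butterfly_max([3.0, 7.5, -1.0, 4.2]))  # -> [7.5, 7.5, 7.5, 7.5]
```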
+ Finally, we can get the reduced `qk_max` for the whole thread block by comparing the `qk_max` from all warps in this thread block. Then we need to broadcast the final result to each thread.
@@ -329,6 +364,9 @@ need to broadcast the final result to each thread.
Similar to `qk_max`, we need to get the reduced sum value from the entire thread block too.
+
+Code + ```cpp for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { float val = __expf(logits[i] - qk_max); @@ -339,12 +377,17 @@ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); ``` +
+ First, sum all the exp values from each thread group and, at the same time, convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. Note that the `qk_max` here is already the max `qk` across the whole thread block. We can then reduce `exp_sum` across the whole thread block just like `qk_max`. +
+Code + ```cpp const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { @@ -352,6 +395,8 @@ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { } ``` +
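For intuition, the normalization performed by the two snippets above can be written out in plain NumPy. This is a reference sketch only; the kernel spreads the same arithmetic across thread groups and warps and adds a small epsilon to the denominator.

```python
# Reference sketch of the numerically stable softmax applied to `logits`.
import numpy as np

qk = np.array([2.0, 0.5, -1.0, 3.0])  # example qk values for the context tokens
qk_max = qk.max()                     # block-wide reduced qk_max
exp_qk = np.exp(qk - qk_max)          # exp(qk - qk_max), written back into logits
exp_sum = exp_qk.sum()                # block-wide reduced exp_sum
logits = exp_qk * (1.0 / (exp_sum + 1e-6))  # normalized softmax result
print(logits)
```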
+ Finally, with the reduced `qk_max` and `exp_sum`, we can obtain the final normalized softmax result as `logits`. This `logits` variable will be used for dot multiplication with the value data in @@ -390,6 +435,9 @@ multiple inner iterations, each warp will process one block of value tokens. And with multiple outer iterations, the whole context value tokens are processed +
+Code + ```cpp float accs[NUM_ROWS_PER_THREAD]; for ... { // Iteration over different blocks. @@ -402,6 +450,8 @@ for ... { // Iteration over different blocks. } ``` +
+ As shown in the above pseudo code, in the outer loop, similar to `k_ptr`, `logits_vec` iterates over different blocks and reads `V_VEC_SIZE` elements from `logits`. In the inner loop, each @@ -430,6 +480,9 @@ Now, we need to perform reduction for `accs` within each warp. This process allows each thread to accumulate the `accs` for the assigned head positions of all tokens in one block. +
+Code + ```cpp for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { float acc = accs[i]; @@ -440,6 +493,8 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { } ``` +
+ Next, we perform reduction for `accs` across all warps, allowing each thread to have the accumulation of `accs` for the assigned head positions of all context tokens. Please note that each `accs` @@ -448,6 +503,9 @@ elements of the entire head for all context tokens. However, overall, all results for output have been calculated but are just stored in different thread register memory. +
+Code + ```cpp float* out_smem = reinterpret_cast(shared_mem); for (int i = NUM_WARPS; i > 1; i /= 2) { @@ -470,20 +528,30 @@ for (int i = NUM_WARPS; i > 1; i /= 2) { } ``` +
+ ## Output
Now we can write all of the calculated results from local register memory to the final output in global memory.
+
+Code + ```cpp scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE; ``` +
+ First, we need to define the `out_ptr` variable, which points to the start address of the assigned sequence and assigned head. +
+Code + ```cpp for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; @@ -493,6 +561,8 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { } ``` +
+ Finally, we need to iterate over different assigned head positions and write out the corresponding accumulated result based on the `out_ptr`. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 0764dfb6501..93a616c5273 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -13,6 +13,9 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: +
+Code + ```python # inside `setup.py` file from setuptools import setup @@ -36,6 +39,8 @@ def register(): ) ``` +
+ For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). Every plugin has three parts: diff --git a/docs/features/lora.md b/docs/features/lora.md index 04e92dbc459..207947032a6 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -10,14 +10,22 @@ LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vll Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save them locally with +
+Code + ```python from huggingface_hub import snapshot_download sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") ``` +
+ Then we instantiate the base model and pass in the `enable_lora=True` flag: +
+Code + ```python from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest @@ -25,10 +33,15 @@ from vllm.lora.request import LoRARequest llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) ``` +
+ We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and the third parameter is the path to the LoRA adapter. +
+Code + ```python sampling_params = SamplingParams( temperature=0, @@ -48,6 +61,8 @@ outputs = llm.generate( ) ``` +
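Because the adapter is supplied per request, prompts in the same batch can target different adapters (or no adapter at all). The sketch below assumes a second, hypothetical adapter path `other_lora_path`, and that your vLLM version accepts a list for `lora_request` with one entry per prompt; `llm`, `sampling_params`, and `sql_lora_path` are reused from the example above.

```python
# Sketch: per-prompt LoRA adapters in a single batch (other_lora_path is hypothetical).
from vllm.lora.request import LoRARequest

outputs = llm.generate(
    ["Write a SQL query for the users table.", "Summarize the following text."],
    sampling_params,
    lora_request=[
        LoRARequest("sql_adapter", 1, sql_lora_path),
        LoRARequest("other_adapter", 2, other_lora_path),
    ],
)
```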
+ Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
## Serving LoRA Adapters
@@ -55,12 +70,17 @@ LoRA-adapted models can also be served with the OpenAI-compatible vLLM server.
To do so, we use `--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
+
+Command + ```bash vllm serve meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` +
+ !!! note The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. @@ -68,6 +88,9 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.): +
+Command + ```bash curl localhost:8000/v1/models | jq . { @@ -87,12 +110,17 @@ curl localhost:8000/v1/models | jq . } ``` +
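The same listing is available from Python; a short sketch using the OpenAI client against the server started above:

```python
# Sketch: list the served models (the base model plus any LoRA adapters).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
for model in client.models.list():
    print(model.id)
```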
+ Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and `max_loras` is set high enough). The following is an example request +
+Command + ```bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ @@ -104,6 +132,8 @@ curl http://localhost:8000/v1/completions \ }' | jq ``` +
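The equivalent request through the OpenAI Python client is a small variation of the usual completion call; this sketch assumes the server and adapter name shown above.

```python
# Sketch: address the LoRA adapter by the name registered via --lora-modules.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="sql-lora",          # the adapter name acts as the model name
    prompt="San Francisco is a",
    max_tokens=7,
    temperature=0,
)
print(completion.choices[0].text)
```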
+ ## Dynamically serving LoRA Adapters In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. @@ -123,7 +153,8 @@ Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. -Example request to load a LoRA adapter: +
+Example request to load a LoRA adapter ```bash curl -X POST http://localhost:8000/v1/load_lora_adapter \ @@ -134,6 +165,8 @@ curl -X POST http://localhost:8000/v1/load_lora_adapter \ }' ``` +
+ Upon a successful request, the API will respond with a `200 OK` status code from `vllm serve`, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' added successfully`. If an error occurs, such as if the adapter cannot be found or loaded, an appropriate error message will be returned. @@ -144,7 +177,8 @@ with the name or ID of the adapter to be unloaded. Upon a successful request, the API responds with a `200 OK` status code from `vllm serve`, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' removed successfully`. -Example request to unload a LoRA adapter: +
+Example request to unload a LoRA adapter ```bash curl -X POST http://localhost:8000/v1/unload_lora_adapter \ @@ -154,6 +188,8 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` +
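Both endpoints can also be called from Python. The sketch below uses `requests` and the same placeholder adapter name and path as the curl examples above.

```python
# Sketch: dynamically load and then unload a LoRA adapter over HTTP.
import requests

base_url = "http://localhost:8000"

load = requests.post(
    f"{base_url}/v1/load_lora_adapter",
    json={"lora_name": "sql_adapter", "lora_path": "/path/to/sql-lora-adapter"},
)
print(load.status_code, load.text)

unload = requests.post(
    f"{base_url}/v1/unload_lora_adapter",
    json={"lora_name": "sql_adapter"},
)
print(unload.status_code, unload.text)
```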
+ ### Using Plugins Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. @@ -168,7 +204,8 @@ Alternatively, follow these example steps to implement your own plugin: 1. Implement the LoRAResolver interface. - Example of a simple S3 LoRAResolver implementation: +
+ Example of a simple S3 LoRAResolver implementation ```python import os @@ -199,8 +236,13 @@ Alternatively, follow these example steps to implement your own plugin: return lora_request ``` +
+ 2. Register `LoRAResolver` plugin. +
+ Code + ```python from vllm.lora.resolver import LoRAResolverRegistry @@ -208,6 +250,8 @@ Alternatively, follow these example steps to implement your own plugin: LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) ``` +
+ For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). ## New format for `--lora-modules` @@ -234,6 +278,9 @@ The new format of `--lora-modules` is mainly to support the display of parent mo - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `root` field points to the artifact location of the lora adapter. +
+Command output + ```bash $ curl http://localhost:8000/v1/models @@ -269,3 +316,5 @@ $ curl http://localhost:8000/v1/models ] } ``` + +
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index afb9a6d4df9..469587f2f6f 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -20,6 +20,9 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: +
+Code + ```python from vllm import LLM @@ -62,10 +65,15 @@ for o in outputs: print(generated_text) ``` +
+ Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: +
+Code + ```python from vllm import LLM @@ -95,10 +103,15 @@ for o in outputs: print(generated_text) ``` +
+ Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: +
+Code + ```python from vllm import LLM @@ -126,6 +139,8 @@ for o in outputs: print(generated_text) ``` +
+ ### Video Inputs You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary @@ -144,6 +159,9 @@ Full example: To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. +
+Code + ```python from vllm import LLM @@ -167,8 +185,13 @@ for o in outputs: print(generated_text) ``` +
+ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: +
+Code + ```python # Construct the prompt based on your model prompt = ... @@ -207,6 +230,8 @@ for o in outputs: print(generated_text) ``` +
+ ## Online Serving Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). @@ -235,6 +260,9 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ Then, you can use the OpenAI client as follows: +
+Code + ```python from openai import OpenAI @@ -281,6 +309,8 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` +
+ Full example: !!! tip @@ -311,6 +341,9 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model Then, you can use the OpenAI client as follows: +
+Code + ```python from openai import OpenAI @@ -350,6 +383,8 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` +
+ Full example: !!! note @@ -373,6 +408,9 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b Then, you can use the OpenAI client as follows: +
+Code + ```python import base64 import requests @@ -425,8 +463,13 @@ result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from input audio:", result) ``` +
+ Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: +
+Code + ```python chat_completion_from_url = client.chat.completions.create( messages=[{ @@ -452,6 +495,8 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` +
+ Full example: !!! note @@ -470,6 +515,9 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. The following example demonstrates how to pass image embeddings to the OpenAI server: +
+Code + ```python image_embedding = torch.load(...) grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct @@ -526,6 +574,8 @@ chat_completion = client.chat.completions.create( ) ``` +
+ !!! note Only one message can contain `{"type": "image_embeds"}`. If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 59ef10d9c96..22f3af7e864 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -33,6 +33,9 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ Next, make a request to the model that should return the reasoning content in the response. +
+Code + ```python from openai import OpenAI @@ -62,12 +65,17 @@ print("reasoning_content:", reasoning_content) print("content:", content) ``` +
+ The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. ## Streaming chat completions Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). +
+Output + ```json { "id": "chatcmpl-123", @@ -89,8 +97,13 @@ Streaming chat completions are also supported for reasoning models. The `reasoni } ``` +
+ The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example: +
+Code + ```python from openai import OpenAI @@ -140,12 +153,17 @@ for chunk in stream: print(content, end="", flush=True) ``` +
+ Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). ## Tool Calling The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. +
+Code + ```python from openai import OpenAI @@ -182,6 +200,8 @@ print(f"Function called: {tool_call.name}") print(f"Arguments: {tool_call.arguments}") ``` +
+ For more examples, please refer to . ## Limitations @@ -192,6 +212,9 @@ For more examples, please refer to . +
+Code + ```python # import the required packages @@ -246,8 +269,13 @@ class ExampleParser(ReasoningParser): """ ``` +
+ Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in . +
+Code + ```python @dataclass class DeepSeekReasoner(Reasoner): @@ -272,6 +300,8 @@ class DeepSeekReasoner(Reasoner): ... ``` +
+ The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case. Finally, you can enable reasoning for the model by using the `--reasoning-parser` flags. diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index 5080960f72d..e28a6036c90 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -18,6 +18,9 @@ Speculative decoding is a technique which improves inter-token latency in memory The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. +
+Code + ```python from vllm import LLM, SamplingParams @@ -42,8 +45,13 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ To perform the same in online mode, launch the server: +
+Command + ```bash python -m vllm.entrypoints.openai.api_server \ --host 0.0.0.0 \ @@ -55,11 +63,16 @@ python -m vllm.entrypoints.openai.api_server \ --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}' ``` +
+ !!! warning Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. Then use a client: +
+Code + ```python from openai import OpenAI @@ -94,11 +107,16 @@ else: print(completion) ``` +
+ ## Speculating by matching n-grams in the prompt The following code configures vLLM to use speculative decoding where proposals are generated by matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) +
+Code + ```python from vllm import LLM, SamplingParams @@ -124,6 +142,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ ## Speculating using MLP speculators The following code configures vLLM to use speculative decoding where proposals are generated by @@ -131,6 +151,9 @@ draft models that conditioning draft predictions on both context vectors and sam For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or [this technical report](https://arxiv.org/abs/2404.19124). +
+Code + ```python from vllm import LLM, SamplingParams @@ -155,6 +178,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ Note that these speculative models currently need to be run without tensor parallelism, although it is possible to run the main model using tensor parallelism (see example above). Since the speculative models are relatively small, we still see significant speedups. However, this @@ -177,6 +202,9 @@ A variety of speculative models of this type are available on HF hub: The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). +
+Code + ```python from vllm import LLM, SamplingParams @@ -203,6 +231,8 @@ for output in outputs: ``` +
+ A few important things to consider when using the EAGLE-based draft models: 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should diff --git a/docs/features/structured_outputs.md index 044c7966099..7279e559dc0 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -33,6 +33,9 @@ text. Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one: +
+Code + ```python from openai import OpenAI client = OpenAI( @@ -51,8 +54,13 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` +
+ The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: +
+Code + ```python completion = client.chat.completions.create( model=model, @@ -67,6 +75,8 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` +
+ One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. For this we can use the `guided_json` parameter in two different ways: @@ -75,6 +85,9 @@ For this we can use the `guided_json` parameter in two different ways: The next example shows how to use the `guided_json` parameter with a Pydantic model: +
+Code + ```python from pydantic import BaseModel from enum import Enum @@ -111,6 +124,8 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` +
+ !!! tip While not strictly necessary, normally it's better to indicate in the prompt the JSON schema and how the fields should be populated. This can improve the @@ -121,6 +136,9 @@ difficult to use, but it's really powerful. It allows us to define complete languages like SQL queries. It works by using a context-free EBNF grammar. As an example, we can use it to define a specific format of simplified SQL queries: +
+Code + ```python simplified_sql_grammar = """ root ::= select_statement @@ -149,6 +167,8 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` +
+ See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) ## Reasoning Outputs @@ -161,6 +181,9 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: +
+Code + ```python from pydantic import BaseModel @@ -190,6 +213,8 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content) print("content: ", completion.choices[0].message.content) ``` +
+ See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) ## Experimental Automatic Parsing (OpenAI API) @@ -202,6 +227,9 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3. Here is a simple example demonstrating how to get structured output using Pydantic models: +
+Code + ```python from pydantic import BaseModel from openai import OpenAI @@ -228,7 +256,10 @@ print("Name:", message.parsed.name) print("Age:", message.parsed.age) ``` -Output: +
+ +
+Output ```console ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) @@ -236,8 +267,13 @@ Name: Cameron Age: 28 ``` +
+ Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: +
+Code + ```python from typing import List from pydantic import BaseModel @@ -268,7 +304,10 @@ for i, step in enumerate(message.parsed.steps): print("Answer:", message.parsed.final_answer) ``` -Output: +
+ +
+Output ```console ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) @@ -278,6 +317,8 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` +
+ An example of using `structural_tag` can be found here: ## Offline Inference @@ -296,6 +337,9 @@ These parameters can be used in the same way as the parameters from the Online Serving examples above. One example for the usage of the `choice` parameter is shown below: +
+Code + ```python from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams @@ -311,4 +355,6 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` +
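The other guided decoding parameters work the same way offline. As a further sketch, the snippet below constrains generation with a small, illustrative JSON schema, reusing the `llm` instance from the example above.

```python
# Sketch: offline structured output constrained by a JSON schema (schema is illustrative).
from vllm import SamplingParams
from vllm.sampling_params import GuidedDecodingParams

person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}

guided_decoding_params = GuidedDecodingParams(json=person_schema)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
    prompts="Generate a JSON object for a person named Alice who is 30 years old.",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```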
+ See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 93ea164881c..5760b35ae17 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -6,6 +6,9 @@ vLLM currently supports named function calling, as well as the `auto`, `required Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8B model, so we need to use the llama3 tool calling chat template from the vLLM examples directory: +
+Command + ```bash vllm serve meta-llama/Llama-3.1-8B-Instruct \ --enable-auto-tool-choice \ @@ -13,8 +16,13 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \ --chat-template examples/tool_chat_template_llama3.1_json.jinja ``` +
+ Next, make a request to the model that should result in it using the available tools: +
+Code + ```python from openai import OpenAI import json @@ -54,7 +62,10 @@ print(f"Arguments: {tool_call.arguments}") print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` -Example output: +
+ +
+Example output ```text Function called: get_weather @@ -62,6 +73,8 @@ Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` +
+ This example demonstrates: * Setting up the server with tool calling enabled @@ -301,6 +314,9 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen Here is a summary of a plugin file: +
+Code + ```python # import the required packages @@ -345,11 +361,18 @@ class ExampleToolParser(ToolParser): ``` +
+ Then you can use this plugin in the command line like this. +
+Command + ```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ --chat-template \ ``` + +
diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md index 6b2efd85f06..5b29f1a849c 100644 --- a/docs/getting_started/installation/aws_neuron.md +++ b/docs/getting_started/installation/aws_neuron.md @@ -47,6 +47,9 @@ Currently, there are no pre-built Neuron wheels. To build and install vLLM from source, run: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -54,6 +57,8 @@ pip install -U -r requirements/neuron.txt VLLM_TARGET_DEVICE="neuron" pip install -e . ``` +
+ AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at , which contains several features in addition to what's available on vLLM V0. Please utilize the AWS Fork for the following features: @@ -66,6 +71,9 @@ Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs- To install the AWS Neuron fork, run the following: +
+Commands + ```console git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git cd upstreaming-to-vllm @@ -73,6 +81,8 @@ pip install -r requirements/neuron.txt VLLM_TARGET_DEVICE="neuron" pip install -e . ``` +
+ Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardware is not tested. ## Set up using Docker @@ -100,12 +110,17 @@ to perform most of the heavy lifting which includes PyTorch model initialization To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include +
+Config + ```console override_neuron_config={ "enable_bucketing":False, } ``` +
+ or when launching vLLM from the CLI, pass ```console diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 00bb5cae43f..29525246530 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -76,6 +76,9 @@ Currently, there are no pre-built CPU wheels. ### Build image from source +
+Commands + ```console $ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . @@ -92,6 +95,8 @@ $ docker run --rm \ other vLLM OpenAI server arguments ``` +
+ !!! tip For ARM or Apple silicon, use `docker/Dockerfile.arm` @@ -119,6 +124,9 @@ vLLM CPU backend supports the following vLLM features: - We highly recommend using TCMalloc for high-performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run: +
+Commands + ```console sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library find / -name *libtcmalloc* # find the dynamic link library path @@ -126,14 +134,21 @@ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD python examples/offline_inference/basic/basic.py # run vLLM ``` +
+ - When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserve CPU 30 and 31 for the framework and use CPU 0-29 for OpenMP: +
+Commands + ```console export VLLM_CPU_KVCACHE_SPACE=40 export VLLM_CPU_OMP_THREADS_BIND=0-29 vllm serve facebook/opt-125m ``` +
+ or using default auto thread binding: ```console @@ -142,8 +157,13 @@ export VLLM_CPU_NUM_OF_RESERVED_CPU=2 vllm serve facebook/opt-125m ``` +
+ - If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`, or to rely on the auto thread binding feature (enabled by default). On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: +
+Commands + ```console $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores @@ -171,6 +191,8 @@ $ export VLLM_CPU_OMP_THREADS_BIND=0-7 $ python examples/offline_inference/basic/basic.py ``` +
+ - If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access. ## Other considerations diff --git a/docs/getting_started/installation/cpu/build.inc.md index 7ddadccb1b4..85016010ba3 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,26 +1,41 @@ First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run: +
+Commands + ```console sudo apt-get update -y sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` +
+ Second, clone vLLM project: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git vllm_source cd vllm_source ``` +
+ Third, install Python packages for vLLM CPU backend building: +
+Commands + ```console pip install --upgrade pip pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` +
+ Finally, build and install vLLM CPU backend: ```console diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index 670485feefb..52ad6c1b9a6 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -26,6 +26,9 @@ Currently the CPU implementation for s390x architecture supports FP32 datatype o Install the following packages from the package manager before building the vLLM. For example on RHEL 9.4: +
+Command + ```console dnf install -y \ which procps findutils tar vim git gcc g++ make patch make cython zlib-devel \ @@ -33,18 +36,28 @@ dnf install -y \ openssl-devel openblas openblas-devel wget autoconf automake libtool cmake numactl-devel ``` +
+ Install rust>=1.80, which is needed to install the `outlines-core` and `uvloop` Python packages. +
+Command + ```console curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` +
+ Execute the following commands to build and install vLLM from the source. !!! tip Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. +
+Command + ```console sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds pip install -v \ @@ -55,6 +68,8 @@ Execute the following commands to build and install vLLM from the source. pip install dist/*.whl ``` +
+ # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md index 0cb10b8de83..bbc0759ee67 100644 --- a/docs/getting_started/installation/google_tpu.md +++ b/docs/getting_started/installation/google_tpu.md @@ -68,6 +68,9 @@ For more information about using TPUs with GKE, see: Create a TPU v5e with 4 TPU chips: +
+Commands + ```console gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --node-id TPU_NAME \ @@ -78,6 +81,8 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` +
+ | Parameter name | Description | |--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. | @@ -108,19 +113,29 @@ Currently, there are no pre-built TPU wheels. Install Miniconda: +
+Commands + ```bash wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh bash Miniconda3-latest-Linux-x86_64.sh source ~/.bashrc ``` +
+ Create and activate a Conda environment for vLLM: +
+Commands + ```bash conda create -n vllm python=3.10 -y conda activate vllm ``` +
+ Clone the vLLM repository and go to the vLLM directory: ```bash diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 4503bb44318..10001c2d09b 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -22,6 +22,9 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: +
+Commands + ```console # Install vLLM with CUDA 12.8. # If you are using pip. @@ -30,6 +33,8 @@ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 uv pip install vllm --torch-backend=auto ``` +
+ We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note @@ -37,6 +42,9 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: +
+Commands + ```console # Install vLLM with CUDA 11.8. export VLLM_VERSION=0.6.1.post1 @@ -44,6 +52,8 @@ export PYTHON_VERSION=312 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` +
+ [](){ #install-the-latest-code } #### Install the latest code @@ -52,37 +62,55 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` +
+Command + ```console pip install -U vllm \ --pre \ --extra-index-url https://wheels.vllm.ai/nightly ``` +
+ `--pre` is required for `pip` to consider pre-released versions. Another way to install the latest code is to use `uv`: +
+Command + ```console uv pip install -U vllm \ --torch-backend=auto \ --extra-index-url https://wheels.vllm.ai/nightly ``` +
+ ##### Install specific revisions using `pip` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: +
+Commands + ```console export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` +
+ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. ##### Install specific revisions using `uv` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: +
+Commands + ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch uv pip install vllm \ @@ -90,6 +118,8 @@ uv pip install vllm \ --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` +
+ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. # --8<-- [end:pre-built-wheels] @@ -99,12 +129,17 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm VLLM_USE_PRECOMPILED=1 pip install --editable . ``` +
+ This command will do the following: 1. Look for the current branch in your vLLM clone. @@ -118,12 +153,17 @@ This command will do the following: In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. +
+Commands + ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install --editable . ``` +
+ You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. !!! note @@ -134,12 +174,17 @@ You can find more information about vLLM's wheels in [install-the-latest-code][i If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm pip install -e . ``` +
+ !!! tip Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. @@ -160,6 +205,9 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -168,32 +216,47 @@ pip install -r requirements/build.txt pip install --no-build-isolation -e . ``` +
+ ##### Use the local cutlass for compilation Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` +
+ ##### Troubleshooting To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. For example: +
+Commands + ```console export MAX_JOBS=6 pip install -e . ``` +
+ This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. +
+Command + ```console # Use `--ipc=host` to make sure the shared memory is large enough. docker run \ @@ -203,31 +266,48 @@ docker run \ --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` +
+ If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: +
+Commands + ```console export CUDA_HOME=/usr/local/cuda export PATH="${CUDA_HOME}/bin:$PATH" ``` +
+ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: +
+Commands + ```console nvcc --version # verify that nvcc is in your PATH ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` +
+ #### Unsupported OS build vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: +
+Commands + ```console export VLLM_TARGET_DEVICE=empty pip install -e . ``` +
+ # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] @@ -238,11 +318,16 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i Another way to access the latest code is to use the docker images: +
+Commands + ```console export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` +
+ These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. The latest code can contain bugs and may not be stable. Please use it with caution. diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 8019fb50f4d..480f34d77c3 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -31,16 +31,24 @@ Currently, there are no pre-built ROCm wheels. Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example: +
+ Commands + ```console # Install PyTorch $ pip uninstall torch -y $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 ``` +
+ 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) +
+ Commands + ```console python3 -m pip install ninja cmake wheel pybind11 pip uninstall -y triton @@ -52,6 +60,8 @@ Currently, there are no pre-built ROCm wheels. cd ../.. ``` +
+ !!! note If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. @@ -62,6 +72,9 @@ Currently, there are no pre-built ROCm wheels. For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. +
+ Commands + ```console git clone https://github.com/ROCm/flash-attention.git cd flash-attention @@ -71,11 +84,16 @@ Currently, there are no pre-built ROCm wheels. cd .. ``` +
+ !!! note You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: +
+ Commands + ```console python3 -m pip uninstall -y aiter git clone --recursive https://github.com/ROCm/aiter.git @@ -85,11 +103,16 @@ Currently, there are no pre-built ROCm wheels. python3 setup.py develop ``` +
+ !!! note You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: +
+ Commands + ```bash pip install --upgrade pip @@ -109,6 +132,8 @@ Currently, there are no pre-built ROCm wheels. python3 setup.py develop ``` +
+ This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. !!! tip @@ -146,6 +171,9 @@ If you choose to build this rocm_base image yourself, the steps are as follows. It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: +
+Config + ```console { "features": { @@ -154,19 +182,29 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` +
+ To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: +
+Command + ```console DOCKER_BUILDKIT=1 docker build \ -f docker/Dockerfile.rocm_base \ -t rocm/vllm-dev:base . ``` +
+ #### Build an image with vLLM First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: +
+Config + ```console { "features": { @@ -175,6 +213,8 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` +
+ uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: @@ -191,6 +231,9 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: +
+Commands + ```console DOCKER_BUILDKIT=1 docker build \ --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ @@ -199,8 +242,13 @@ DOCKER_BUILDKIT=1 docker build \ . ``` +
+ To run the above docker image `vllm-rocm`, use the below command: +
+Command + ```console docker run -it \ --network=host \ @@ -215,6 +263,8 @@ docker run -it \ bash ``` +
+ Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. # --8<-- [end:build-image-from-source] diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md index 128fff164c3..7ab3a02481a 100644 --- a/docs/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -25,6 +25,9 @@ Currently, there are no pre-built XPU wheels. - First, install required driver and Intel OneAPI 2025.0 or later. - Second, install Python packages for vLLM XPU backend building: +
+Commands + ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -32,6 +35,8 @@ pip install --upgrade pip pip install -v -r requirements/xpu.txt ``` +
+ - Then, build and install vLLM XPU backend: ```console @@ -53,6 +58,9 @@ Currently, there are no pre-built XPU images. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] +
+Command + ```console $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . $ docker run -it \ @@ -63,11 +71,16 @@ $ docker run -it \ vllm-xpu-env ``` +
+
 # --8<-- [end:build-image-from-source]

 # --8<-- [start:supported-features]

 The XPU platform supports **tensor parallel** inference/serving, and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:

+
+Command + ```console python -m vllm.entrypoints.openai.api_server \ --model=facebook/opt-13b \ @@ -78,6 +91,8 @@ python -m vllm.entrypoints.openai.api_server \ -tp=8 ``` +
+ By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. # --8<-- [end:supported-features] diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md index f5970850aae..bf9d0477033 100644 --- a/docs/getting_started/installation/intel_gaudi.md +++ b/docs/getting_started/installation/intel_gaudi.md @@ -42,6 +42,9 @@ for more details. Use the following commands to run a Docker image: +
+Command + ```console docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest docker run \ @@ -55,6 +58,8 @@ docker run \ vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` +
+ ## Set up using Python ### Pre-built wheels @@ -65,6 +70,9 @@ Currently, there are no pre-built Intel Gaudi wheels. To build and install vLLM from source, run: +
+Command + ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -72,8 +80,13 @@ pip install -r requirements/hpu.txt python setup.py develop ``` +
+
 Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to the vLLM main repo. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:

+
+Command + ```console git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork @@ -82,6 +95,8 @@ pip install -r requirements/hpu.txt python setup.py develop ``` +
+ ## Set up using Docker ### Pre-built images @@ -90,6 +105,9 @@ Currently, there are no pre-built Intel Gaudi images. ### Build image from source +
+Command + ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . docker run \ @@ -102,6 +120,8 @@ docker run \ --rm vllm-hpu-env ``` +
+ !!! tip If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. @@ -191,6 +211,9 @@ In a dynamic inference serving scenario, there is a need to minimize the number Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: +
+Logs + ```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] @@ -198,9 +221,12 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` +
+
 `min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimal wastage, while allowing larger padding on larger batch sizes.

-Example (with ramp-up)
+
+Example (with ramp-up) ```text min = 2, step = 32, max = 64 @@ -209,7 +235,10 @@ min = 2, step = 32, max = 64 => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) ``` -Example (without ramp-up) +
+ +
+Example (without ramp-up) ```text min = 128, step = 128, max = 512 @@ -218,6 +247,8 @@ min = 128, step = 128, max = 512 => buckets = ramp_up + stable => (128, 256, 384, 512) ``` +
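The two examples above can be reproduced with a few lines of Python. This is only an illustrative sketch of the ramp-up and stable phases described here, not the actual bucket-generation code from the HPU model runner:

```python
def generate_buckets(min_value: int, step: int, max_value: int) -> list[int]:
    """Sketch of the bucketing scheme described above (illustration only)."""
    # Ramp-up phase: multiply `min` by consecutive powers of two until `step` is reached.
    ramp_up, value = [], min_value
    while value < step and value <= max_value:
        ramp_up.append(value)
        value *= 2
    # Stable phase: increase by `step` up to and including `max`.
    stable = list(range(step, max_value + 1, step))
    return ramp_up + stable

print(generate_buckets(2, 32, 64))      # [2, 4, 8, 16, 32, 64]
print(generate_buckets(128, 128, 512))  # [128, 256, 384, 512]
```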
+ In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. !!! warning @@ -232,6 +263,9 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: +
+Logs + ```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB @@ -246,6 +280,8 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` +
+ This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. !!! tip @@ -279,6 +315,9 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): +
+Logs + ```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] @@ -311,6 +350,8 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, alloca INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) ``` +
+ ### Recommended vLLM Parameters - We recommend running inference on Gaudi 2 with `block_size` of 128 diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 38fc9925eb5..68fb9953e2b 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -19,12 +19,17 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: +
+Commands + ```console uv venv --python 3.12 --seed source .venv/bin/activate uv pip install vllm --torch-backend=auto ``` +
+ `uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment: @@ -35,6 +40,9 @@ uv run --with vllm vllm --help You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment. +
+Commands + ```console conda create -n myenv python=3.12 -y conda activate myenv @@ -42,6 +50,8 @@ pip install --upgrade uv uv pip install vllm --torch-backend=auto ``` +
+ !!! note For more detail and non-CUDA platforms, please refer [here][installation-index] for specific instructions on how to install vLLM. @@ -67,6 +77,9 @@ The next section defines a list of input prompts and sampling parameters for tex However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. +
+Code + ```python prompts = [ "Hello, my name is", @@ -77,6 +90,8 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ``` +
+ The [LLM][vllm.LLM] class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here][supported-models]. ```python @@ -92,6 +107,9 @@ llm = LLM(model="facebook/opt-125m") Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. +
+Code + ```python outputs = llm.generate(prompts, sampling_params) @@ -101,6 +119,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ [](){ #quickstart-online } ## OpenAI-Compatible Server @@ -134,6 +154,9 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: +
+Command + ```console curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ @@ -145,8 +168,13 @@ curl http://localhost:8000/v1/completions \ }' ``` +
+ Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: +
+Code + ```python from openai import OpenAI @@ -162,6 +190,8 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` +
+ A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM @@ -170,6 +200,9 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: +
+Command + ```console curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -182,8 +215,13 @@ curl http://localhost:8000/v1/chat/completions \ }' ``` +
+ Alternatively, you can use the `openai` Python package: +
+Code + ```python from openai import OpenAI # Set OpenAI's API key and API base to use vLLM's API server. @@ -205,6 +243,8 @@ chat_response = client.chat.completions.create( print("Chat response:", chat_response) ``` +
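For token-by-token output, the same Chat Completions endpoint can be consumed in streaming mode. The following is a minimal sketch using the `openai` client against the server and model from the examples above:

```python
from openai import OpenAI

# Same API key and base URL settings as the previous example.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

stream = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ],
    stream=True,
)
for chunk in stream:
    # Each chunk carries a delta with the newly generated text (may be None).
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()
```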
+ ## On Attention Backends Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications. diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 6755b574ea6..251381b7ae3 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -29,6 +29,9 @@ vllm serve s3://core-llm/Llama-3-8b \ To run model from a S3 compatible object store run: +
+Commands + ```console RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \ AWS_EC2_METADATA_DISABLED=true \ @@ -37,6 +40,8 @@ vllm serve s3://core-llm/Llama-3-8b \ --load-format runai_streamer ``` +
+ ## Tunable parameters You can tune parameters using `--model-loader-extra-config`: @@ -44,21 +49,31 @@ You can tune parameters using `--model-loader-extra-config`: You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. For reading from S3, it will be the number of client instances the host is opening to the S3 server. +
+Command + ```console vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ --load-format runai_streamer \ --model-loader-extra-config '{"concurrency":16}' ``` +
+ You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). +
+Command + ```console vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ --load-format runai_streamer \ --model-loader-extra-config '{"memory_limit":5368709120}' ``` +
+ !!! note For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). @@ -72,21 +87,31 @@ vllm serve /path/to/sharded/model --load-format runai_streamer_sharded The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: +
+Command + ```console vllm serve /path/to/sharded/model \ --load-format runai_streamer_sharded \ --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` +
+ To create sharded model files, you can use the script provided in . This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: +
+Command + ```console vllm serve /path/to/sharded/model \ --load-format runai_streamer_sharded \ --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' ``` +
+ !!! note The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index e52c5ae01cb..b0864dc29e5 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -23,6 +23,9 @@ The [generate][vllm.LLM.generate] method is available to all generative models i It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), except that tokenization and detokenization are also performed automatically. +
+Code + ```python from vllm import LLM @@ -35,9 +38,14 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ You can optionally control the language generation by passing [SamplingParams][vllm.SamplingParams]. For example, you can use greedy sampling by setting `temperature=0`: +
+Code + ```python from vllm import LLM, SamplingParams @@ -51,6 +59,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ !!! important By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. @@ -62,6 +72,9 @@ A code example can be found here: +Code + ```python from vllm import LLM from vllm.sampling_params import BeamSearchParams @@ -75,6 +88,8 @@ for output in outputs: print(f"Generated text: {generated_text!r}") ``` +
+ ### `LLM.chat` The [chat][vllm.LLM.chat] method implements chat functionality on top of [generate][vllm.LLM.generate]. @@ -85,6 +100,9 @@ and automatically applies the model's [chat template](https://huggingface.co/doc In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. +
+Code + ```python from vllm import LLM @@ -115,11 +133,16 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +
+ A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: +
+Code + ```python from vllm.entrypoints.chat_utils import load_chat_template @@ -130,6 +153,8 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` +
+ ## Online Serving Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 89a128915a7..15a633c275e 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -98,6 +98,9 @@ It is designed for embedding models and cross encoder models. Embedding models u vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). +
+Code + ```python from vllm import LLM @@ -109,6 +112,8 @@ score = output.outputs.score print(f"Score: {score}") ``` +
+ A code example can be found here: ## Online Serving @@ -149,6 +154,9 @@ vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_ You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. +
+Code + ```python from vllm import LLM, PoolingParams @@ -160,6 +168,8 @@ outputs = model.embed(["Follow the white rabbit."], print(outputs[0].outputs) ``` +
+ A code example can be found here: ### Online Inference @@ -172,6 +182,9 @@ vllm serve jinaai/jina-embeddings-v3 --trust-remote-code You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. +
+Command + ```text curl http://127.0.0.1:8000/v1/embeddings \ -H 'accept: application/json' \ @@ -184,10 +197,15 @@ curl http://127.0.0.1:8000/v1/embeddings \ }' ``` -Expected output: +
+ +
+Expected output ```json {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` +
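The same request can also be issued through the official `openai` client, which accepts a `dimensions` argument on `embeddings.create`. This is a short sketch assuming the `jinaai/jina-embeddings-v3` server started above:

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")

resp = client.embeddings.create(
    model="jinaai/jina-embeddings-v3",
    input=["Follow the white rabbit."],
    dimensions=32,  # Matryoshka truncation, as in the curl example above
)
print(len(resp.data[0].embedding))  # 32
```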
+ A openai client example can be found here: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 60f7dacebfa..283f3059e66 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -25,12 +25,17 @@ vLLM also supports model implementations that are available in Transformers. Thi To check if the modeling backend is Transformers, you can simply do this: +
+Code + ```python from vllm import LLM llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` +
+ If it is `TransformersForCausalLM` then it means it's based on Transformers! !!! tip @@ -70,7 +75,10 @@ To make your model compatible with the Transformers backend, it needs: 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. `MyModel` must contain `_supports_attention_backend = True`. -```python title="modeling_my_model.py" +
+modeling_my_model.py + +```python from transformers import PreTrainedModel from torch import nn @@ -93,6 +101,8 @@ class MyModel(PreTrainedModel): _supports_attention_backend = True ``` +
+ Here is what happens in the background when this model is loaded: 1. The config is loaded. @@ -103,7 +113,10 @@ That's it! For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: -```python title="configuration_my_model.py" +
+configuration_my_model.py + +```python from transformers import PretrainedConfig @@ -123,6 +136,8 @@ class MyConfig(PretrainedConfig): } ``` +
+ - `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). - `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: * You only need to do this for layers which are not present on all pipeline stages @@ -145,6 +160,9 @@ The [Transformers backend][transformers-backend] enables you to run models direc !!! tip The easiest way to check if your model is really supported at runtime is to run the program below: +
+ Code + ```python from vllm import LLM @@ -159,6 +177,8 @@ The [Transformers backend][transformers-backend] enables you to run models direc print(output) ``` +
+ If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM. @@ -168,6 +188,9 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: +
+Commands + ```console # Download a model huggingface-cli download HuggingFaceH4/zephyr-7b-beta @@ -179,10 +202,15 @@ huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cach huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json ``` +
+ #### List the downloaded models Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: +
+Commands + ```console # List cached models huggingface-cli scan-cache @@ -194,10 +222,15 @@ huggingface-cli scan-cache -v huggingface-cli scan-cache --dir ~/.cache/huggingface/hub ``` +
+ #### Delete a cached model Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: +
+Commands + ```console # The `delete-cache` command requires extra dependencies to work with the TUI. # Please run `pip install huggingface_hub[cli]` to install them. @@ -224,19 +257,29 @@ Start deletion. Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. ``` +
+ #### Using a proxy Here are some tips for loading/downloading models from Hugging Face using a proxy: - Set the proxy globally for your session (or set it in the profile file): +
+Commands + ```shell export http_proxy=http://your.proxy.server:port export https_proxy=http://your.proxy.server:port ``` +
+ - Set the proxy for just the current command: +
+Commands + ```shell https_proxy=http://your.proxy.server:port huggingface-cli download @@ -244,8 +287,13 @@ https_proxy=http://your.proxy.server:port huggingface-cli download https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests ``` +
+ - Set the proxy in Python interpreter: +
+Code + ```python import os @@ -253,6 +301,8 @@ os.environ['http_proxy'] = 'http://your.proxy.server:port' os.environ['https_proxy'] = 'http://your.proxy.server:port' ``` +
+ ### ModelScope To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: @@ -263,6 +313,9 @@ export VLLM_USE_MODELSCOPE=True And use with `trust_remote_code=True`. +
+Code + ```python from vllm import LLM @@ -277,6 +330,8 @@ output = llm.encode("Hello, my name is") print(output) ``` +
+ [](){ #feature-status-legend } ## Feature Status Legend @@ -493,6 +548,9 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode Offline inference: +
+ Code + ```python from vllm import LLM @@ -502,6 +560,8 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode ) ``` +
+ Online serving: ```bash @@ -600,6 +660,9 @@ Specified using `--task generate`. For the best results, we recommend using the following dependency versions (tested on A10 and L40): +
+ Dependency versions + ```text # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) torch==2.5.1 @@ -622,6 +685,8 @@ Specified using `--task generate`. flash-attn>=2.5.6 # Not used in float32, but should be documented ``` +
+ **Note:** Make sure you understand the security implications of using outdated packages. !!! note diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 259af5cabcb..3d4dfa6f877 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -55,6 +55,9 @@ The first step, is to start containers and organize them into a cluster. We have Pick a node as the head node, and run the following command: +
+Command + ```console bash run_cluster.sh \ vllm/vllm-openai \ @@ -64,8 +67,13 @@ bash run_cluster.sh \ -e VLLM_HOST_IP=ip_of_this_node ``` +
+ On the rest of the worker nodes, run the following command: +
+Command + ```console bash run_cluster.sh \ vllm/vllm-openai \ @@ -75,6 +83,8 @@ bash run_cluster.sh \ -e VLLM_HOST_IP=ip_of_this_node ``` +
+ Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. !!! warning diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 14ea6a04434..16a50ffea14 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -13,6 +13,9 @@ pip install langchain langchain_community -q To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. +
+Code + ```python from langchain_community.llms import VLLM @@ -28,4 +31,6 @@ llm = VLLM(model="mosaicml/mpt-7b", print(llm("What is the capital of France ?")) ``` +
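If you are serving vLLM behind its OpenAI-compatible server instead of loading the model in-process, LangChain also ships an OpenAI-style wrapper. The sketch below assumes such a server is already running on `localhost:8000`; the `VLLMOpenAI` class and its parameters come from the LangChain integration referenced in the tutorial link below, and the model name is only an example:

```python
from langchain_community.llms import VLLMOpenAI

# Point LangChain at an already-running vLLM OpenAI-compatible server.
llm = VLLMOpenAI(
    openai_api_key="EMPTY",
    openai_api_base="http://localhost:8000/v1",
    model_name="mosaicml/mpt-7b",
)
print(llm.invoke("What is the capital of France ?"))
```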
+ Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. diff --git a/docs/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md index 251b7155c55..27b15a80a8a 100644 --- a/docs/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -13,6 +13,9 @@ pip install llama-index-llms-vllm -q To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. +
+Code + ```python from llama_index.llms.vllm import Vllm @@ -24,4 +27,6 @@ llm = Vllm( ) ``` +
+ Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 3002b2f92e4..8018d4b9fdf 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -15,6 +15,9 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). +
+Code + ```python from openai import OpenAI client = OpenAI( @@ -32,6 +35,8 @@ completion = client.chat.completions.create( print(completion.choices[0].message) ``` +
+ !!! tip vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. @@ -96,6 +101,9 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: +
+Code + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -105,6 +113,8 @@ completion = client.chat.completions.create( ) ``` +
+
 Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like

@@ -125,6 +135,9 @@
 vLLM supports a set of parameters that are not part of the OpenAI API.
 In order to use them, you can pass them as extra parameters in the OpenAI client,
 or merge them directly into the JSON payload if you are calling the HTTP API directly.

+
+Code + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -137,6 +150,8 @@ completion = client.chat.completions.create( ) ``` +
+ ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled @@ -147,6 +162,9 @@ with `--enable-request-id-headers`. > rather than within the vLLM layer for this reason. > See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. +
+Code + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -169,6 +187,8 @@ completion = client.completions.create( print(completion._request_id) ``` +
+ ## API Reference [](){ #completions-api } @@ -184,16 +204,26 @@ Code example: The following [sampling parameters][sampling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" ``` +
+ [](){ #chat-api } ### Chat API @@ -212,16 +242,26 @@ Code example: The following [sampling parameters][sampling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" ``` +
+ [](){ #embeddings-api } ### Embeddings API @@ -243,6 +283,9 @@ and passing a list of `messages` in the request. Refer to the examples below for To serve the model: +
+ Command + ```bash vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code \ @@ -250,6 +293,8 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_vlm2vec.jinja ``` +
+ !!! important Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. @@ -259,6 +304,9 @@ and passing a list of `messages` in the request. Refer to the examples below for Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: +
+ Code + ```python import requests @@ -283,10 +331,15 @@ and passing a list of `messages` in the request. Refer to the examples below for print("Embedding output:", response_json["data"][0]["embedding"]) ``` +
+ === "DSE-Qwen2-MRL" To serve the model: +
+ Command + ```bash vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code \ @@ -294,6 +347,8 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_dse_qwen2_vl.jinja ``` +
+ !!! important Like with VLM2Vec, we have to explicitly pass `--task embed`. @@ -310,22 +365,37 @@ Full example: +Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params" ``` + + The following extra parameters are supported by default: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" ``` +
+ For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" ``` +
+ [](){ #transcriptions-api } ### Transcriptions API @@ -343,16 +413,26 @@ Code example: The following [sampling parameters][sampling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" ``` +
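As with the other endpoints, the `openai` client can be used once a speech-to-text model is being served. The snippet below is only a sketch: the model name `openai/whisper-large-v3` and the audio file path are placeholders for whichever supported ASR model and input you are actually using:

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

with open("sample_audio.wav", "rb") as audio:  # placeholder audio file
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # placeholder: the ASR model being served
        file=audio,
        language="en",
    )
print(transcription.text)
```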
+ [](){ #tokenizer-api } ### Tokenizer API @@ -387,7 +467,8 @@ Code example: You can classify multiple texts by passing an array of strings: -Request: +
+Request ```bash curl -v "http://127.0.0.1:8000/classify" \ @@ -401,7 +482,10 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -Response: +
+ +
+Response ```bash { @@ -438,9 +522,12 @@ Response: } ``` +
+ You can also pass a string directly to the `input` field: -Request: +
+Request ```bash curl -v "http://127.0.0.1:8000/classify" \ @@ -451,7 +538,10 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -Response: +
+ +
+Response ```bash { @@ -479,20 +569,32 @@ Response: } ``` +
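The same request can of course be sent programmatically. Here is a small sketch using the `requests` library against the `/classify` endpoint shown above; the model name is assumed to match whichever classification model you are serving:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8000/classify",
    json={
        "model": "jason9693/Qwen2.5-1.5B-apeach",  # the classification model being served
        "input": "Loved the new cafe, the coffee was great.",
    },
)
response.raise_for_status()
# The response mirrors the JSON shown above, with per-input results under "data".
print(response.json())
```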
+ #### Extra parameters The following [pooling parameters][pooling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" ``` +
+ [](){ #score-api } ### Score API @@ -508,7 +610,8 @@ Code example: You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. -Request: +
+Request ```bash curl -X 'POST' \ @@ -523,7 +626,10 @@ curl -X 'POST' \ }' ``` -Response: +
+ +
+Response ```bash { @@ -542,13 +648,16 @@ Response: } ``` +
+ #### Batch inference You can pass a string to `text_1` and a list to `text_2`, forming multiple sentence pairs where each pair is built from `text_1` and a string in `text_2`. The total number of pairs is `len(text_2)`. -Request: +
+Request ```bash curl -X 'POST' \ @@ -565,7 +674,10 @@ curl -X 'POST' \ }' ``` -Response: +
+ +
+Response ```bash { @@ -589,6 +701,8 @@ Response: } ``` +
+ You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). The total number of pairs is `len(text_2)`. @@ -614,7 +728,8 @@ curl -X 'POST' \ }' ``` -Response: +
+Response ```bash { @@ -638,20 +753,32 @@ Response: } ``` +
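The list-to-list form maps naturally onto a small `requests` sketch. This assumes the `/score` endpoint and a cross-encoder such as `BAAI/bge-reranker-v2-m3`, as in the curl examples above:

```python
import requests

payload = {
    "model": "BAAI/bge-reranker-v2-m3",  # the cross-encoder model being served
    "text_1": ["What is the capital of Brazil?", "What is the capital of France?"],
    "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."],
}
response = requests.post("http://127.0.0.1:8000/score", json=payload)
response.raise_for_status()
# One score per (text_1[i], text_2[i]) pair, returned under "data".
for item in response.json()["data"]:
    print(item)
```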
+ #### Extra parameters The following [pooling parameters][pooling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" ``` +
+ [](){ #rerank-api } ### Re-rank API @@ -675,7 +802,8 @@ Code example: Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. Result documents will be sorted by relevance, and the `index` property can be used to determine original order. -Request: +
+Request ```bash curl -X 'POST' \ @@ -693,7 +821,10 @@ curl -X 'POST' \ }' ``` -Response: +
+ +
+Response ```bash { @@ -721,16 +852,28 @@ Response: } ``` +
+ #### Extra parameters The following [pooling parameters][pooling-params] are supported. +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params" ``` +
+ The following extra parameters are supported: +
+Code + ```python --8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" ``` + +
diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index 6603aa83b4a..15ee9e683b5 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -6,12 +6,20 @@ OpenAI compatible API server. You can start the server using Python, or using [Docker][deployment-docker]: +
+Command + ```console vllm serve unsloth/Llama-3.2-1B-Instruct ``` +
+ Then query the endpoint to get the latest metrics from the server: +
+Output + ```console $ curl http://0.0.0.0:8000/metrics @@ -29,12 +37,19 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I ... ``` +
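If you prefer to poll the endpoint programmatically, a small sketch with `requests` does the same as the curl command above:

```python
import requests

metrics = requests.get("http://0.0.0.0:8000/metrics").text
# Print only the vLLM-specific series, mirroring the excerpt above.
for line in metrics.splitlines():
    if line.startswith("vllm:"):
        print(line)
```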
+ The following metrics are exposed: +
+Code + ```python --8<-- "vllm/engine/metrics.py:metrics-definitions" ``` +
+ Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, and are then removed in version `X.Y+2`. diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index e9ab425a1d0..bafa7e26fdf 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -60,6 +60,9 @@ To identify the particular CUDA operation that causes the error, you can add `-- If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. +
+Code + ```python # Test PyTorch NCCL import torch @@ -123,6 +126,8 @@ dist.destroy_process_group(gloo_group) dist.destroy_process_group() ``` +
+ If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console @@ -155,6 +160,9 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b If you have seen a warning in your logs like this: +
+Logs + ```console WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting @@ -163,8 +171,13 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously for more information. ``` +
+ or an error from Python that looks like this: +
+Logs + ```console RuntimeError: An attempt has been made to start a new process before the @@ -185,17 +198,27 @@ RuntimeError: section in https://docs.python.org/3/library/multiprocessing.html ``` +
+ then you must update your Python code to guard usage of `vllm` behind a `if __name__ == '__main__':` block. For example, instead of this: +
+Code + ```python import vllm llm = vllm.LLM(...) ``` +
+ try this instead: +
+Code + ```python if __name__ == '__main__': import vllm @@ -203,10 +226,15 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` +
+ ## `torch.compile` Error vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: +
+Code + ```python import torch @@ -222,18 +250,25 @@ x = torch.randn(4, 4).cuda() print(f(x)) ``` +
+ If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. ## Model failed to be inspected If you see an error like: +
+Logs + ```text File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported raise ValueError( ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. ``` +
+ It means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated binaries in the vLLM build. Please read the logs carefully to determine the root cause of the error. @@ -242,6 +277,9 @@ Please read the logs carefully to determine the root cause of the error. If you see an error like: +
+Logs + ```text Traceback (most recent call last): ... @@ -250,14 +288,21 @@ Traceback (most recent call last): TypeError: 'NoneType' object is not iterable ``` +
+ or: +
+Logs + ```text File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported raise ValueError( ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] ``` +
+ But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model. ## Failed to infer device type diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md index 750cba7ed9c..ae6174369ef 100644 --- a/docs/usage/usage_stats.md +++ b/docs/usage/usage_stats.md @@ -10,6 +10,9 @@ The list of data collected by the latest version of vLLM can be found here: +Output + ```json { "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", @@ -41,19 +44,31 @@ Here is an example as of v0.4.0: } ``` + + You can preview the collected data by running the following command: +
+Command + ```bash tail ~/.config/vllm/usage_stats.json ``` +
+ ## Opting out You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: +
+Command + ```bash # Any of the following methods can disable usage stats collection export VLLM_NO_USAGE_STATS=1 export DO_NOT_TRACK=1 mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track ``` + +
From 68d5dd7e36f1c441b8ccf4a4787b9a467170beac Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Sat, 21 Jun 2025 12:22:42 +0800 Subject: [PATCH 2/5] remove some two lines situation and update some Signed-off-by: reidliu41 --- docs/ci/update_pytorch_version.md | 5 --- docs/contributing/model/multimodal.md | 10 ++++++ docs/contributing/profiling.md | 25 ------------- docs/deployment/frameworks/autogen.md | 10 ++++++ docs/deployment/frameworks/dstack.md | 5 +++ .../retrieval_augmented_generation.md | 10 ++++++ docs/deployment/frameworks/skypilot.md | 25 +++++++++++++ docs/deployment/nginx.md | 30 ---------------- docs/design/kernel/paged_attention.md | 5 --- docs/features/lora.md | 5 --- docs/features/quantization/auto_awq.md | 15 ++++++++ docs/features/quantization/bitblas.md | 10 ++++++ docs/features/quantization/bnb.md | 10 ++++++ docs/features/quantization/fp8.md | 30 ++++++++++++++++ docs/features/quantization/gguf.md | 10 ++++++ docs/features/quantization/gptqmodel.md | 10 ++++++ docs/features/quantization/int4.md | 25 +++++++++++++ docs/features/quantization/int8.md | 20 +++++++++++ docs/features/quantization/modelopt.md | 15 ++++++++ .../quantization/quantized_kvcache.md | 15 ++++++++ docs/features/quantization/quark.md | 35 +++++++++++++++++++ docs/features/quantization/torchao.md | 10 ++++++ .../installation/cpu/apple.inc.md | 10 ++++++ .../installation/cpu/build.inc.md | 5 --- .../installation/google_tpu.md | 5 --- .../installation/gpu/cuda.inc.md | 30 ---------------- docs/models/pooling_models.md | 15 ++++++++ docs/serving/distributed_serving.md | 15 ++++++++ docs/serving/openai_compatible_server.md | 5 +++ docs/usage/metrics.md | 5 --- docs/usage/troubleshooting.md | 10 ------ docs/usage/usage_stats.md | 5 --- 32 files changed, 310 insertions(+), 130 deletions(-) diff --git a/docs/ci/update_pytorch_version.md b/docs/ci/update_pytorch_version.md index 199f7395c8c..092e33a7bc7 100644 --- a/docs/ci/update_pytorch_version.md +++ b/docs/ci/update_pytorch_version.md @@ -110,16 +110,11 @@ team if you want to get the package published there. ### xFormers Similar to FlashInfer, here is how to build and install xFormers from source: -
-Commands - ```bash export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX' MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" ``` -
- ### Mamba ```bash diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 17c75270be3..9d15daa747e 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -12,6 +12,9 @@ Further update the model as follows: - Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example: +
+ Code + ```diff def forward( self, @@ -20,6 +23,8 @@ Further update the model as follows: + pixel_values: torch.Tensor, ) -> SamplerOutput: ``` + +
More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it. @@ -915,6 +920,9 @@ and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` to register them to the multi-modal registry: +
+Code + ```diff from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY @@ -925,6 +933,8 @@ to register them to the multi-modal registry: class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` +
+ ## Notes ### Inserting feature tokens without replacement diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 1c4dc116a9b..6441d3ab092 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -29,26 +29,16 @@ Refer to for an example #### OpenAI Server -
-Command - ```bash VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B ``` -
- benchmark_serving.py: -
-Commanad - ```bash python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 ``` -
- ## Profile with NVIDIA Nsight Systems Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events. @@ -78,15 +68,10 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo The following is an example using the `benchmarks/benchmark_latency.py` script: -
-Command - ```bash nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8 ``` -
- #### OpenAI Server To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed. @@ -106,26 +91,16 @@ python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3 In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run: -
-Command - ``` nsys sessions list ``` -
- to get the session id in the form of `profile-XXXXX`, then run: -
-Command - ``` nsys stop --session=profile-XXXXX ``` -
- to manually kill the profiler and generate your `nsys-rep` report. #### Analysis diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md index ad8c167659e..5b39ad46274 100644 --- a/docs/deployment/frameworks/autogen.md +++ b/docs/deployment/frameworks/autogen.md @@ -11,6 +11,9 @@ title: AutoGen - Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment +
+Commands + ```console pip install vllm @@ -19,6 +22,8 @@ pip install vllm pip install -U "autogen-agentchat" "autogen-ext[openai]" ``` +
+ ## Deploy - Start the vLLM server with the supported chat completion model, e.g. @@ -30,6 +35,9 @@ python -m vllm.entrypoints.openai.api_server \ - Call it with AutoGen: +
+Code + ```python import asyncio from autogen_core.models import UserMessage @@ -76,6 +84,8 @@ async def main() -> None: asyncio.run(main()) ``` +
+ For details, see the tutorial: - [Using vLLM in AutoGen](https://microsoft.github.io/autogen/0.2/docs/topics/non-openai-models/local-vllm/) diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 0601e57cafd..98ee73e976f 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -18,12 +18,17 @@ dstack server Next, to configure your dstack project, run: +
+Code + ```console mkdir -p vllm-dstack cd vllm-dstack dstack init ``` +
+ Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index cb26c8378de..dc1d3376dc9 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -15,6 +15,9 @@ Here are the integrations: - Setup vLLM and langchain environment +
+Command + ```console pip install -U vllm \ langchain_milvus langchain_openai \ @@ -22,6 +25,8 @@ pip install -U vllm \ langchain-text-splitters ``` +
+ ### Deploy - Start the vLLM server with the supported embedding model, e.g. @@ -52,6 +57,9 @@ python retrieval_augmented_generation_with_langchain.py - Setup vLLM and llamaindex environment +
+Command + ```console pip install vllm \ llama-index llama-index-readers-web \ @@ -60,6 +68,8 @@ pip install vllm \ llama-index-vector-stores-milvus \ ``` +
+ ### Deploy - Start the vLLM server with the supported embedding model, e.g. diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index d16bb23be92..17be9e09ccd 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -86,6 +86,9 @@ Check the output of the command. There will be a shareable gradio link (like the **Optional**: Serve the 70B model instead of the default 8B and use more GPU: +
+Command + ```console HF_TOKEN="your-huggingface-token" \ sky launch serving.yaml \ @@ -94,10 +97,15 @@ HF_TOKEN="your-huggingface-token" \ --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct ``` +
+ ## Scale up to multiple replicas SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. +
+Config + ```yaml service: replicas: 2 @@ -112,6 +120,8 @@ service: max_completion_tokens: 1 ``` +
+
Click to see the full recipe YAML @@ -163,12 +173,17 @@ run: | Start the serving the Llama-3 8B model on multiple replicas: +
+Command + ```console HF_TOKEN="your-huggingface-token" \ sky serve up -n vllm serving.yaml \ --env HF_TOKEN ``` +
+ Wait until the service is ready: ```console @@ -193,6 +208,9 @@ vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) R After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: +
+Commands + ```console ENDPOINT=$(sky serve status --endpoint 8081 vllm) curl -L http://$ENDPOINT/v1/chat/completions \ @@ -213,8 +231,13 @@ curl -L http://$ENDPOINT/v1/chat/completions \ }' ``` +
+ To enable autoscaling, you could replace the `replicas` with the following configs in `service`: +
+Config + ```yaml service: replica_policy: @@ -223,6 +246,8 @@ service: target_qps_per_replica: 2 ``` +
+
 This will scale the service up when the QPS exceeds 2 for each replica.

+
diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index eb1aa23cd77..23c2b0c24ae 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -11,15 +11,10 @@ This document shows how to launch multiple vLLM serving containers and use Nginx This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. -
-Command - ```console export vllm_root=`pwd` ``` -
- Create a file named `Dockerfile.nginx`:
@@ -36,15 +31,10 @@ CMD ["nginx", "-g", "daemon off;"] Build the container: -
-Command - ```console docker build . -f Dockerfile.nginx --tag nginx-lb ``` -
- [](){ #nginxloadbalancer-nginx-conf } ## Create Simple Nginx Config file @@ -78,16 +68,11 @@ server { ## Build vLLM Container -
-Command - ```console cd $vllm_root docker build -f docker/Dockerfile . --tag vllm ``` -
- If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
@@ -108,15 +93,10 @@ docker build \ ## Create Docker Network -
-Command - ```console docker network create vllm_nginx ``` -
- [](){ #nginxloadbalancer-nginx-launch-container } ## Launch vLLM Containers @@ -183,23 +163,13 @@ docker run \ ## Verify That vLLM Servers Are Ready -
-Command - ```console docker logs vllm0 | grep Uvicorn docker logs vllm1 | grep Uvicorn ``` -
- Both outputs should look like this: -
-Output - ```console INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ``` - -
diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index d121435d187..f24b9cf4994 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -139,15 +139,10 @@ one query token data. Within each warp, every thread group will fetch the same query token data, but will multiply it with different key token data. -
-Code - ```cpp const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` -
-
![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" }
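To make the pointer arithmetic above concrete, here is the equivalent index calculation in Python. The stride and index values are made up purely for illustration; the real values depend on the model configuration and the kernel launch.

```python
# Illustration only: the flat element offset that `q_ptr` ends up pointing at.
HEAD_SIZE = 128
NUM_HEADS = 12                     # assumed
q_stride = NUM_HEADS * HEAD_SIZE   # assumed: elements per query token

seq_idx = 3    # sequence handled by this thread block
head_idx = 5   # attention head handled by this thread block

q_offset = seq_idx * q_stride + head_idx * HEAD_SIZE
# q_ptr = q + q_offset; the HEAD_SIZE contiguous elements starting here are
# the query vector that the whole thread group shares.
print(q_offset)
```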
diff --git a/docs/features/lora.md b/docs/features/lora.md index 207947032a6..a60fa4a6ada 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -10,17 +10,12 @@ LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vll Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save them locally with -
-Code - ```python from huggingface_hub import snapshot_download sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") ``` -
- Then we instantiate the base model and pass in the `enable_lora=True` flag:
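That next step looks roughly like the sketch below, reusing `sql_lora_path` from the download snippet. The base model name here is only an assumption; use the base model your adapter was actually trained on.

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Assumed base model; replace with the one matching your adapter.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

sampling_params = SamplingParams(temperature=0, max_tokens=64)
outputs = llm.generate(
    ["Write a SQL query that lists all users older than 30."],
    sampling_params,
    # Adapter name, a unique integer ID, and the local path downloaded above.
    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
)
print(outputs[0].outputs[0].text)
```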
diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 4366a080f52..0698a07212a 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -15,6 +15,9 @@ pip install autoawq After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: +
+Code + ```python from awq import AutoAWQForCausalLM from transformers import AutoTokenizer @@ -39,16 +42,26 @@ tokenizer.save_pretrained(quant_path) print(f'Model is quantized and saved at "{quant_path}"') ``` +
+ To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: +
+Code + ```console python examples/offline_inference/llm_engine_example.py \ --model TheBloke/Llama-2-7b-Chat-AWQ \ --quantization awq ``` +
+ AWQ models are also supported directly through the LLM entrypoint: +
+Code + ```python from vllm import LLM, SamplingParams @@ -73,3 +86,5 @@ for output in outputs: generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` + +
diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 9001725d9c0..ddda72a6917 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -27,6 +27,9 @@ Usually, these repositories have a `quantize_config.json` file that includes a ` ## Read bitblas format checkpoint +
+Code + ```python from vllm import LLM import torch @@ -41,8 +44,13 @@ llm = LLM( ) ``` +
+ ## Read gptq format checkpoint +
+Code + ```python from vllm import LLM import torch @@ -57,3 +65,5 @@ llm = LLM( max_model_len=1024 ) ``` + +
diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index a8dc2476f30..20830fad3d5 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -22,6 +22,9 @@ And usually, these repositories have a config.json file that includes a quantiza For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument. +
+Code + ```python from vllm import LLM import torch @@ -34,10 +37,15 @@ llm = LLM( ) ``` +
+ ## Inflight quantization: load as 4bit quantization For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument. +
+Code + ```python from vllm import LLM import torch @@ -50,6 +58,8 @@ llm = LLM( ) ``` +
+ ## OpenAI Compatible Server Append the following to your model arguments for 4bit inflight quantization: diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 01d5d9da046..4d8afd234d7 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -39,6 +39,9 @@ The quantization process involves three main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: +
+Code + ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -49,6 +52,8 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` +
+ ### 2. Applying Quantization For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses: @@ -58,6 +63,9 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. +
+Code + ```python from llmcompressor.transformers import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier @@ -75,6 +83,8 @@ model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) ``` +
+ ### 3. Evaluating Accuracy Install `vllm` and `lm-evaluation-harness` for evaluation: @@ -85,6 +95,9 @@ pip install vllm lm-eval==0.4.4 Load and run the model in `vllm`: +
+Code + ```python from vllm import LLM model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") @@ -92,11 +105,16 @@ result = model.generate("Hello my name is") print(result[0].outputs[0].text) ``` +
+ Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): !!! note Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. +
+Commands + ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic $ lm_eval \ @@ -105,8 +123,13 @@ $ lm_eval \ --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 ``` +
+ Here's an example of the resulting scores: +
+Result + ```text |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| @@ -114,6 +137,8 @@ Here's an example of the resulting scores: | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| ``` +
+ ## Troubleshooting and Support If you encounter any issues or have feature requests, please open an issue on the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor/issues) GitHub repository. @@ -124,6 +149,9 @@ Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achi In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. +
+Code + ```python from vllm import LLM model = LLM("facebook/opt-125m", quantization="fp8") @@ -132,5 +160,7 @@ result = model.generate("Hello, my name is") print(result[0].outputs[0].text) ``` +
+ !!! warning Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 72f758f653a..4528049b61b 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -32,6 +32,9 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path +
+Command + ```console # If your model is not supported by huggingface, you can manually provide a huggingface-compatible config path vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ @@ -39,8 +42,13 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0 ``` +
+ You can also use the GGUF model directly through the LLM entrypoint: +
+Code + ```python from vllm import LLM, SamplingParams @@ -80,3 +88,5 @@ for output in outputs: generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` + +
diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 53e938d2cbd..4f558aefd3d 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -31,6 +31,9 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: +
+Code + ```python from datasets import load_dataset from gptqmodel import GPTQModel, QuantizeConfig @@ -54,6 +57,8 @@ model.quantize(calibration_dataset, batch_size=2) model.save(quant_path) ``` +
+ ## Running a quantized model with vLLM To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: @@ -67,6 +72,9 @@ python examples/offline_inference/llm_engine_example.py \ GPTQModel quantized models are also supported directly through the LLM entrypoint: +
+Code + ```python from vllm import LLM, SamplingParams @@ -96,3 +104,5 @@ for output in outputs: print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-"*50) ``` + +
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index b7d09206365..a6fa6a2ea88 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -37,6 +37,9 @@ The quantization process involves four main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: +
+Code + ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -47,12 +50,17 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` +
+ ### 2. Preparing Calibration Data When quantizing weights to INT4, you need sample data to estimate the weight updates and calibrated scales. It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: +
+Code + ```python from datasets import load_dataset @@ -72,10 +80,15 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) ``` +
+ ### 3. Applying Quantization Now, apply the quantization algorithms: +
+Code + ```python from llmcompressor.transformers import oneshot from llmcompressor.modifiers.quantization import GPTQModifier @@ -99,6 +112,8 @@ model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) ``` +
+ This process creates a W4A16 model with weights quantized to 4-bit integers. ### 4. Evaluating Accuracy @@ -112,6 +127,9 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") To evaluate accuracy, you can use `lm_eval`: +
+Commands + ```console $ lm_eval --model vllm \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \ @@ -121,6 +139,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` +
+ !!! note Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. @@ -137,6 +157,9 @@ $ lm_eval --model vllm \ The following is an example of an expanded quantization recipe you can tune to your own use case: +
+Code + ```python from compressed_tensors.quantization import ( QuantizationArgs, @@ -166,6 +189,8 @@ recipe = GPTQModifier( ) ``` +
+ ## Troubleshooting and Support If you encounter any issues or have feature requests, please open an issue on the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor/issues) GitHub repository. The full INT4 quantization example in `llm-compressor` is available [here](https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w4a16/llama3_example.py). diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 1d9fba9dc87..f7b9418764c 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -38,6 +38,9 @@ The quantization process involves four main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: +
+Code + ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -48,12 +51,17 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` +
+ ### 2. Preparing Calibration Data When quantizing activations to INT8, you need sample data to estimate the activation scales. It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: +
+Code + ```python from datasets import load_dataset @@ -73,10 +81,15 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) ``` +
+ ### 3. Applying Quantization Now, apply the quantization algorithms: +
+Code + ```python from llmcompressor.transformers import oneshot from llmcompressor.modifiers.quantization import GPTQModifier @@ -103,6 +116,8 @@ model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) ``` +
+ This process creates a W8A8 model with weights and activations quantized to 8-bit integers. ### 4. Evaluating Accuracy @@ -116,6 +131,9 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") To evaluate accuracy, you can use `lm_eval`: +
+Command + ```console $ lm_eval --model vllm \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ @@ -125,6 +143,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` +
+ !!! note Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 001d18657da..5e4c3c58a97 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -14,6 +14,9 @@ You can quantize HuggingFace models using the example scripts provided in the Te Below is an example showing how to quantize a model using modelopt's PTQ API: +
+Code + ```python import modelopt.torch.quantization as mtq from transformers import AutoModelForCausalLM @@ -33,8 +36,13 @@ def forward_loop(model): model = mtq.quantize(model, config, forward_loop) ``` +
+ After the model is quantized, you can export it to a quantized checkpoint using the export API: +
+Code + ```python import torch from modelopt.torch.export import export_hf_checkpoint @@ -46,8 +54,13 @@ with torch.inference_mode(): ) ``` +
+ The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: +
+Code + ```python from vllm import LLM, SamplingParams @@ -76,3 +89,5 @@ def main(): if __name__ == "__main__": main() ``` + +
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index e3ebd024bab..5b2dcaf9176 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -35,6 +35,9 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades Here is an example of how to enable FP8 quantization: +
+Code + ```python # To calculate kv cache scales on the fly enable the calculate_kv_scales # parameter @@ -50,6 +53,8 @@ out = llm.generate(prompt, sampling_params)[0].outputs[0].text print(out) ``` +
+ The `kv_cache_dtype` argument specifies the data type for KV cache storage: - `"auto"`: Uses the model's default "unquantized" data type - `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU) @@ -71,6 +76,9 @@ pip install llmcompressor Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): +
+Code + ```python from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -133,10 +141,15 @@ model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) ``` +
+ The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales. +
+Code + ```python from vllm import LLM, SamplingParams @@ -146,3 +159,5 @@ prompt = "London is the capital of" out = llm.generate(prompt, sampling_params)[0].outputs[0].text print(out) ``` + +
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 35e9dbe2609..79ab189c1f0 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -42,6 +42,9 @@ The Quark quantization process can be listed for 5 steps as below: Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) to fetch model and tokenizer. +
+Code + ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -57,12 +60,17 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN tokenizer.pad_token = tokenizer.eos_token ``` +
+ ### 2. Prepare the Calibration Dataloader Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) to load calibration data. For more details about how to use calibration datasets efficiently, please refer to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). +
+Code + ```python from datasets import load_dataset from torch.utils.data import DataLoader @@ -80,6 +88,8 @@ calib_dataloader = DataLoader(tokenized_outputs['input_ids'], batch_size=BATCH_SIZE, drop_last=True) ``` +
+ ### 3. Set the Quantization Configuration We need to set the quantization configuration, you can check @@ -94,6 +104,9 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. AutoSmoothQuant config file for Llama is `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. +
+Code + ```python from quark.torch.quantization import (Config, QuantizationConfig, FP8E4M3PerTensorSpec, @@ -131,6 +144,8 @@ quant_config = Config( algo_config=algo_config) ``` +
+ ### 4. Quantize the Model and Export Then we can apply the quantization. After quantizing, we need to freeze the @@ -139,6 +154,9 @@ HuggingFace `safetensors`, you can refer to [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) for more exporting format details. +
+Code + ```python import torch from quark.torch import ModelQuantizer, ModelExporter @@ -164,10 +182,15 @@ with torch.no_grad(): quant_config=quant_config, tokenizer=tokenizer) ``` +
+ ### 5. Evaluation in vLLM Now, you can load and run the Quark quantized model directly through the LLM entrypoint: +
+Code + ```python from vllm import LLM, SamplingParams @@ -197,14 +220,21 @@ for output in outputs: print("-" * 60) ``` +
+ Or, you can use `lm_eval` to evaluate accuracy: +
+Command + ```console $ lm_eval --model vllm \ --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \ --tasks gsm8k ``` +
+ ## Quark Quantization Script In addition to the example of Python API above, Quark also offers a [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html) @@ -212,6 +242,9 @@ to quantize large language models more conveniently. It supports quantizing mode of different quantization schemes and optimization algorithms. It can export the quantized model and run evaluation tasks on the fly. With the script, the example above can be: +
+Code + ```console python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ --output_dir /path/to/output \ @@ -222,3 +255,5 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ --model_export hf_format \ --tasks gsm8k ``` + +
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index a7a517af85a..75dd8829e42 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -4,6 +4,9 @@ TorchAO is an architecture optimization library for PyTorch, it provides high pe We recommend installing the latest torchao nightly with +
+Command + ```console # Install the latest TorchAO nightly build # Choose the CUDA version that matches your system (cu126, cu128, etc.) @@ -12,9 +15,14 @@ pip install \ --index-url https://download.pytorch.org/whl/nightly/cu126 ``` +
+ ## Quantizing HuggingFace Models You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: +
+Code + ```Python import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -37,4 +45,6 @@ tokenizer.push_to_hub(hub_repo) quantized_model.push_to_hub(hub_repo, safe_serialization=False) ``` +
+ Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 7a91e3ce5e5..41c32202225 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -25,6 +25,9 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. +
+Command + ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -32,6 +35,8 @@ pip install -r requirements/cpu.txt pip install -e . ``` +
+ !!! note On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. @@ -40,6 +45,9 @@ pip install -e . If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). +
+Error + ```text [...] fatal error: 'map' file not found 1 | #include @@ -53,6 +61,8 @@ If the build has error like the following snippet where standard C++ headers can 1 error generated. ``` +
+ # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 85016010ba3..08cc94bc6e2 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -13,16 +13,11 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave / Second, clone vLLM project: -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git vllm_source cd vllm_source ``` -
- Third, install Python packages for vLLM CPU backend building:
diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md index bbc0759ee67..49d848106c9 100644 --- a/docs/getting_started/installation/google_tpu.md +++ b/docs/getting_started/installation/google_tpu.md @@ -126,16 +126,11 @@ source ~/.bashrc Create and activate a Conda environment for vLLM: -
-Commands - ```bash conda create -n vllm python=3.10 -y conda activate vllm ``` -
- Clone the vLLM repository and go to the vLLM directory: ```bash diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 10001c2d09b..e4dcdaaecda 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -92,16 +92,11 @@ uv pip install -U vllm \ If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: -
-Commands - ```console export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` -
- Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. ##### Install specific revisions using `uv` @@ -239,16 +234,11 @@ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. For example: -
-Commands - ```console export MAX_JOBS=6 pip install -e . ``` -
- This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. A side effect is a much slower build process. @@ -270,44 +260,29 @@ docker run \ If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: -
-Commands - ```console export CUDA_HOME=/usr/local/cuda export PATH="${CUDA_HOME}/bin:$PATH" ``` -
- Here is a sanity check to verify that the CUDA Toolkit is correctly installed: -
-Commands - ```console nvcc --version # verify that nvcc is in your PATH ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` -
- #### Unsupported OS build vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: -
-Commands - ```console export VLLM_TARGET_DEVICE=empty pip install -e . ``` -
- # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] @@ -318,16 +293,11 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i Another way to access the latest code is to use the docker images: -
-Commands - ```console export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` -
- These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. The latest code can contain bugs and may not be stable. Please use it with caution. diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 15a633c275e..c3a144c1de7 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,6 +45,9 @@ See [configuration][configuration] for a list of options when initializing the m The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. +
+Code + ```python from vllm import LLM @@ -55,11 +58,16 @@ data = output.outputs.data print(f"Data: {data!r}") ``` +
+ ### `LLM.embed` The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. It is primarily designed for embedding models. +
+Code + ```python from vllm import LLM @@ -70,6 +78,8 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` +
+ A code example can be found here: ### `LLM.classify` @@ -77,6 +87,9 @@ A code example can be found here: +Code + ```python from vllm import LLM @@ -87,6 +100,8 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` +
+ A code example can be found here: ### `LLM.score` diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 3d4dfa6f877..1e04f29f663 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -26,12 +26,17 @@ Multiprocessing will be used by default when not running in a Ray placement grou To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: +
+Code + ```python from vllm import LLM llm = LLM("facebook/opt-13b", tensor_parallel_size=4) output = llm.generate("San Francisco is a") ``` +
+ To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console @@ -41,12 +46,17 @@ To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when sta You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: +
+Command + ```console vllm serve gpt2 \ --tensor-parallel-size 4 \ --pipeline-parallel-size 2 ``` +
+ ## Running vLLM on multiple nodes If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. @@ -97,12 +107,17 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: +
+Command + ```console vllm serve /path/to/the/model/in/the/container \ --tensor-parallel-size 8 \ --pipeline-parallel-size 2 ``` +
+ You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 8018d4b9fdf..e7770ef9d3d 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -7,12 +7,17 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https:// In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) +
+Command + ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ --dtype auto \ --api-key token-abc123 ``` +
+ To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index 15ee9e683b5..11fec27d689 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -6,15 +6,10 @@ OpenAI compatible API server. You can start the server using Python, or using [Docker][deployment-docker]: -
-Command - ```console vllm serve unsloth/Llama-3.2-1B-Instruct ``` -
- Then query the endpoint to get the latest metrics from the server:
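A quick way to do that from Python instead of `curl`, assuming the server started above is listening on the default port 8000:

```python
import requests

metrics = requests.get("http://localhost:8000/metrics", timeout=5)
metrics.raise_for_status()

# Print the first few Prometheus series as a smoke test.
for line in metrics.text.splitlines()[:15]:
    print(line)
```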
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index bafa7e26fdf..f69baa3db3c 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -203,22 +203,14 @@ RuntimeError: then you must update your Python code to guard usage of `vllm` behind a `if __name__ == '__main__':` block. For example, instead of this: -
-Code - ```python import vllm llm = vllm.LLM(...) ``` -
- try this instead: -
-Code - ```python if __name__ == '__main__': import vllm @@ -226,8 +218,6 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` -
- ## `torch.compile` Error vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md index ae6174369ef..1ab3085d939 100644 --- a/docs/usage/usage_stats.md +++ b/docs/usage/usage_stats.md @@ -48,15 +48,10 @@ Here is an example as of v0.4.0: You can preview the collected data by running the following command: -
-Command - ```bash tail ~/.config/vllm/usage_stats.json ``` -
- ## Opting out You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: From fe9e576dd5fab2adc43e77a26f283b3d6cd156ea Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Sat, 21 Jun 2025 12:52:30 +0800 Subject: [PATCH 3/5] recheck 2~3 lines again Signed-off-by: reidliu41 --- docs/ci/update_pytorch_version.md | 5 ---- docs/contributing/model/multimodal.md | 10 ------- docs/contributing/profiling.md | 5 ---- docs/deployment/frameworks/skypilot.md | 5 ---- docs/design/kernel/paged_attention.md | 5 ---- docs/features/quantization/gguf.md | 5 ++++ .../installation/aws_neuron.md | 5 ---- docs/getting_started/installation/cpu.md | 7 ----- .../installation/cpu/build.inc.md | 10 ------- .../installation/cpu/s390x.inc.md | 5 ---- .../installation/google_tpu.md | 5 ---- .../installation/gpu/cuda.inc.md | 30 ------------------- .../installation/gpu/rocm.inc.md | 10 ------- docs/getting_started/quickstart.md | 5 ---- .../models/extensions/runai_model_streamer.md | 20 ------------- docs/models/supported_models.md | 20 ------------- docs/serving/distributed_serving.md | 15 ---------- docs/serving/openai_compatible_server.md | 5 ---- docs/usage/troubleshooting.md | 15 ---------- 19 files changed, 5 insertions(+), 182 deletions(-) diff --git a/docs/ci/update_pytorch_version.md b/docs/ci/update_pytorch_version.md index 092e33a7bc7..69fdc82ef97 100644 --- a/docs/ci/update_pytorch_version.md +++ b/docs/ci/update_pytorch_version.md @@ -91,17 +91,12 @@ source to unblock the update process. ### FlashInfer Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): -
-Commands - ```bash export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' export FLASHINFER_ENABLE_SM90=1 uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" ``` -
- One caveat is that building FlashInfer from source adds approximately 30 minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a public location for immediate installation, such as https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl. For future releases, contact the PyTorch release diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 9d15daa747e..d8850ce0252 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -135,16 +135,11 @@ to return the maximum number of input items for each modality supported by the m For example, if the model supports any number of images but only one video per prompt: -
-Code - ```python def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} ``` -
- ## 3. Specify dummy inputs Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for @@ -502,16 +497,11 @@ Assuming that the memory usage increases with the number of tokens, the dummy in Fuyu does not expect image placeholders in the inputs to HF processor, so the dummy prompt text is empty regardless of the number of images. -
- Code - ```python def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: return "" ``` -
- For the multimodal image profiling data, the logic is very similar to LLaVA:
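As a sketch of that LLaVA-style logic, the image profiling hook typically requests the largest image the model supports for each counted item. The helper names used here (`get_image_size_with_most_features`, `_get_dummy_images`) are assumptions based on the base-class pattern used in this guide and may differ between vLLM versions.

```python
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    # Profile with the largest supported image so enough memory is reserved.
    target_width, target_height = \
        self.info.get_image_size_with_most_features()

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
```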
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 6441d3ab092..41361cb49af 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -191,12 +191,7 @@ with vllm.utils.cprofile_context("another_function.prof"): There are multiple tools available that can help analyze the profile results. One example is [snakeviz](https://jiffyclub.github.io/snakeviz/). -
-Command - ```bash pip install snakeviz snakeviz expensive_function.prof ``` - -
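If you prefer to stay in the terminal, the standard-library `pstats` module can read the same `.prof` files; a minimal sketch:

```python
import pstats

stats = pstats.Stats("expensive_function.prof")
stats.sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time
```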
diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index 17be9e09ccd..f4574cf1cb4 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -173,17 +173,12 @@ run: | Start serving the Llama-3 8B model on multiple replicas: -
-Command - ```console HF_TOKEN="your-huggingface-token" \ sky serve up -n vllm serving.yaml \ --env HF_TOKEN ``` -
- Wait until the service is ready: ```console diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index f24b9cf4994..1a99288b357 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -180,17 +180,12 @@ tokens are processed by the entire thread group after the kernel run. In this context, "handle" refers to performing the dot multiplication between query data and key data. -
-Code - ```cpp const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + kv_head_idx * kv_head_stride + physical_block_offset * x; ``` -
- Unlike `q_ptr`, `k_ptr` in each thread will point to a different key token at different iterations. As shown above, `k_ptr` points to key token data based on `k_cache` at the assigned block, diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 4528049b61b..7389710d823 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -20,6 +20,9 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference with 2 GPUs: +
+Command + ```console # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ @@ -27,6 +30,8 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ --tensor-parallel-size 2 ``` +
+ !!! warning We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with a large vocabulary size. diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md index 5b29f1a849c..dc28327ecaa 100644 --- a/docs/getting_started/installation/aws_neuron.md +++ b/docs/getting_started/installation/aws_neuron.md @@ -110,17 +110,12 @@ to perform most of the heavy lifting which includes PyTorch model initialization To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include -
-Config - ```console override_neuron_config={ "enable_bucketing":False, } ``` -
- or when launching vLLM from the CLI, pass ```console diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 29525246530..e620b056530 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -138,17 +138,12 @@ python examples/offline_inference/basic/basic.py # run vLLM - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: -
-Commands - ```console export VLLM_CPU_KVCACHE_SPACE=40 export VLLM_CPU_OMP_THREADS_BIND=0-29 vllm serve facebook/opt-125m ``` -
- or using default auto thread binding: ```console @@ -157,8 +152,6 @@ export VLLM_CPU_NUM_OF_RESERVED_CPU=2 vllm serve facebook/opt-125m ``` -
- - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 08cc94bc6e2..7ddadccb1b4 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,16 +1,11 @@ First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: -
-Commands - ```console sudo apt-get update -y sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` -
- Second, clone vLLM project: ```console @@ -20,17 +15,12 @@ cd vllm_source Third, install Python packages for vLLM CPU backend building: -
-Commands - ```console pip install --upgrade pip pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` -
- Finally, build and install vLLM CPU backend: ```console diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index 52ad6c1b9a6..6bb1818c29d 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -40,16 +40,11 @@ dnf install -y \ Install rust>=1.80 which is needed for `outlines-core` and `uvloop` python packages installation. -
-Command - ```console curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` -
- Execute the following commands to build and install vLLM from the source. !!! tip diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md index 49d848106c9..9b5007bef14 100644 --- a/docs/getting_started/installation/google_tpu.md +++ b/docs/getting_started/installation/google_tpu.md @@ -113,17 +113,12 @@ Currently, there are no pre-built TPU wheels. Install Miniconda: -
-Commands - ```bash wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh bash Miniconda3-latest-Linux-x86_64.sh source ~/.bashrc ``` -
- Create and activate a Conda environment for vLLM: ```bash diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index e4dcdaaecda..c036bd0ab57 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -62,32 +62,22 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` -
-Command - ```console pip install -U vllm \ --pre \ --extra-index-url https://wheels.vllm.ai/nightly ``` -
- `--pre` is required for `pip` to consider pre-released versions. Another way to install the latest code is to use `uv`: -
-Command - ```console uv pip install -U vllm \ --torch-backend=auto \ --extra-index-url https://wheels.vllm.ai/nightly ``` -
- ##### Install specific revisions using `pip` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: @@ -124,17 +114,12 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git cd vllm VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -
- This command will do the following: 1. Look for the current branch in your vLLM clone. @@ -148,17 +133,12 @@ This command will do the following: In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. -
-Commands - ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install --editable . ``` -
- You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. !!! note @@ -169,17 +149,12 @@ You can find more information about vLLM's wheels in [install-the-latest-code][i If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git cd vllm pip install -e . ``` -
- !!! tip Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. @@ -218,17 +193,12 @@ pip install --no-build-isolation -e . Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git cd vllm VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` -
- ##### Troubleshooting To avoid your system being overloaded, you can limit the number of compilation jobs diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 480f34d77c3..aee471f49c6 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -31,17 +31,12 @@ Currently, there are no pre-built ROCm wheels. Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example: -
- Commands - ```console # Install PyTorch $ pip uninstall torch -y $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 ``` -
- 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) @@ -186,17 +181,12 @@ It is important that the user kicks off the docker build using buildkit. Either To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: -
-Command - ```console DOCKER_BUILDKIT=1 docker build \ -f docker/Dockerfile.rocm_base \ -t rocm/vllm-dev:base . ``` -
- #### Build an image with vLLM First, build a docker image from and launch a docker container from the image. diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 68fb9953e2b..adcdb89525c 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -19,17 +19,12 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: -
-Commands - ```console uv venv --python 3.12 --seed source .venv/bin/activate uv pip install vllm --torch-backend=auto ``` -
- `uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment: diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 251381b7ae3..8ee5933b9a2 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -49,31 +49,21 @@ You can tune parameters using `--model-loader-extra-config`: You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. For reading from S3, it will be the number of client instances the host is opening to the S3 server. -
-Command - ```console vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ --load-format runai_streamer \ --model-loader-extra-config '{"concurrency":16}' ``` -
- You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). -
-Command - ```console vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ --load-format runai_streamer \ --model-loader-extra-config '{"memory_limit":5368709120}' ``` -
- !!! note For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). @@ -87,31 +77,21 @@ vllm serve /path/to/sharded/model --load-format runai_streamer_sharded The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: -
-Command - ```console vllm serve /path/to/sharded/model \ --load-format runai_streamer_sharded \ --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` -
- To create sharded model files, you can use the script provided in . This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: -
-Command - ```console vllm serve /path/to/sharded/model \ --load-format runai_streamer_sharded \ --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' ``` -
- !!! note The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 283f3059e66..427ca5d99ee 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -25,17 +25,12 @@ vLLM also supports model implementations that are available in Transformers. Thi To check if the modeling backend is Transformers, you can simply do this: -
-Code - ```python from vllm import LLM llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -
- If it is `TransformersForCausalLM` then it means it's based on Transformers! !!! tip @@ -265,21 +260,13 @@ Here are some tips for loading/downloading models from Hugging Face using a prox - Set the proxy globally for your session (or set it in the profile file): -
-Commands - ```shell export http_proxy=http://your.proxy.server:port export https_proxy=http://your.proxy.server:port ``` -
- - Set the proxy for just the current command: -
-Commands - ```shell https_proxy=http://your.proxy.server:port huggingface-cli download @@ -287,13 +274,8 @@ https_proxy=http://your.proxy.server:port huggingface-cli download https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests ``` -
- - Set the proxy in Python interpreter: -
-Code - ```python import os @@ -301,8 +283,6 @@ os.environ['http_proxy'] = 'http://your.proxy.server:port' os.environ['https_proxy'] = 'http://your.proxy.server:port' ``` -
- ### ModelScope To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 1e04f29f663..3d4dfa6f877 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -26,17 +26,12 @@ Multiprocessing will be used by default when not running in a Ray placement grou To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: -
-Code - ```python from vllm import LLM llm = LLM("facebook/opt-13b", tensor_parallel_size=4) output = llm.generate("San Francisco is a") ``` -
- To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console @@ -46,17 +41,12 @@ To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when sta You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: -
-Command - ```console vllm serve gpt2 \ --tensor-parallel-size 4 \ --pipeline-parallel-size 2 ``` -
- ## Running vLLM on multiple nodes If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. @@ -107,17 +97,12 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: -
-Command - ```console vllm serve /path/to/the/model/in/the/container \ --tensor-parallel-size 8 \ --pipeline-parallel-size 2 ``` -
- You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index e7770ef9d3d..8018d4b9fdf 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -7,17 +7,12 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https:// In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) -
-Command - ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ --dtype auto \ --api-key token-abc123 ``` -
- To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
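The example script itself sits outside the hunk shown here. A minimal sketch of such a client script, assuming the server started above is reachable at `http://localhost:8000/v1` with the API key `token-abc123`:

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server started above.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```
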
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index f69baa3db3c..5df70548482 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -248,17 +248,12 @@ If it raises errors from `torch/_inductor` directory, usually it means you have If you see an error like: -
-Logs - ```text File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported raise ValueError( ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. ``` -
- It means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated binaries in the vLLM build. Please read the logs carefully to determine the root cause of the error. @@ -267,9 +262,6 @@ Please read the logs carefully to determine the root cause of the error. If you see an error like: -
-Logs - ```text Traceback (most recent call last): ... @@ -278,21 +270,14 @@ Traceback (most recent call last): TypeError: 'NoneType' object is not iterable ``` -
- or: -
-Logs - ```text File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported raise ValueError( ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] ``` -
- But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model. ## Failed to infer device type From eba4c4ef138bf22b95ad9ddf9402cf917dd109a7 Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Sun, 22 Jun 2025 00:36:55 +0800 Subject: [PATCH 4/5] leave more than 10 lines Signed-off-by: reidliu41 --- docs/cli/README.md | 23 ---- docs/configuration/conserving_memory.md | 30 ----- docs/configuration/model_resolution.md | 5 - docs/configuration/optimization.md | 30 ----- docs/configuration/serve_args.md | 5 - docs/contributing/model/basic.md | 10 -- docs/contributing/model/multimodal.md | 54 -------- docs/contributing/model/registration.md | 10 -- docs/contributing/profiling.md | 20 --- docs/deployment/docker.md | 18 --- docs/deployment/frameworks/cerebrium.md | 5 - docs/deployment/frameworks/lws.md | 10 -- docs/deployment/frameworks/open-webui.md | 5 - .../retrieval_augmented_generation.md | 10 -- docs/deployment/frameworks/skypilot.md | 15 +-- docs/deployment/frameworks/streamlit.md | 5 - docs/deployment/integrations/llamastack.md | 10 -- .../integrations/production-stack.md | 5 - docs/deployment/k8s.md | 20 --- docs/deployment/nginx.md | 15 --- docs/design/kernel/paged_attention.md | 55 --------- docs/features/lora.md | 30 +---- docs/features/quantization/auto_awq.md | 5 - docs/features/quantization/bnb.md | 10 -- docs/features/quantization/fp8.md | 25 ---- docs/features/quantization/gguf.md | 10 -- docs/features/quantization/int4.md | 10 -- docs/features/quantization/int8.md | 10 -- docs/features/quantization/modelopt.md | 5 - .../quantization/quantized_kvcache.md | 5 - docs/features/quantization/quark.md | 10 -- docs/features/quantization/torchao.md | 5 - docs/features/structured_outputs.md | 5 - docs/features/tool_calling.md | 15 +-- .../installation/aws_neuron.md | 10 -- docs/getting_started/installation/cpu.md | 5 - .../installation/cpu/apple.inc.md | 10 -- .../installation/cpu/s390x.inc.md | 10 -- .../installation/gpu/cuda.inc.md | 25 ---- .../installation/gpu/rocm.inc.md | 30 ----- .../installation/gpu/xpu.inc.md | 15 --- .../installation/intel_gaudi.md | 35 +----- docs/getting_started/quickstart.md | 25 ---- .../models/extensions/runai_model_streamer.md | 5 - docs/models/generative_models.md | 20 --- docs/models/pooling_models.md | 33 +---- docs/models/supported_models.md | 25 ---- docs/serving/distributed_serving.md | 10 -- docs/serving/integrations/llamaindex.md | 5 - docs/serving/openai_compatible_server.md | 115 +----------------- docs/usage/troubleshooting.md | 5 - docs/usage/usage_stats.md | 5 - 52 files changed, 11 insertions(+), 882 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index 14031c5b43a..a381d2f23b1 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -46,9 +46,6 @@ vllm serve --help=max Generate chat completions via the running API server. -
-Examples - ```bash # Directly connect to localhost API without arguments vllm chat @@ -60,15 +57,10 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm chat --quick "hi" ``` -
- ## complete Generate text completions based on the given prompt via the running API server. -
-Examples - ```bash # Directly connect to localhost API without arguments vllm complete @@ -98,9 +90,6 @@ vllm bench {latency, serve, throughput} Benchmark the latency of a single batch of requests. -
-Example - ```bash vllm bench latency \ --model meta-llama/Llama-3.2-1B-Instruct \ @@ -110,15 +99,10 @@ vllm bench latency \ --load-format dummy ``` -
- ### serve Benchmark the online serving throughput. -
-Example - ```bash vllm bench serve \ --model meta-llama/Llama-3.2-1B-Instruct \ @@ -129,15 +113,10 @@ vllm bench serve \ --num-prompts 5 ``` -
- ### throughput Benchmark offline inference throughput. -
-Example - ```bash vllm bench throughput \ --model meta-llama/Llama-3.2-1B-Instruct \ @@ -147,8 +126,6 @@ vllm bench throughput \ --load-format dummy ``` -
- ## collect-env Start collecting environment information. diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 00add26b2e1..10469462fb3 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -8,9 +8,6 @@ Tensor parallelism (`tensor_parallel_size` option) can be used to split the mode The following code splits the model across 2 GPUs. -
-Code - ```python from vllm import LLM @@ -18,8 +15,6 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` -
- !!! warning To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. @@ -45,9 +40,6 @@ Dynamic quantization is also supported via the `quantization` option -- see [her You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` option). -
-Command - ```python from vllm import LLM @@ -56,8 +48,6 @@ llm = LLM(model="adept/fuyu-8b", max_num_seqs=2) ``` -
- ## Reduce CUDA Graphs By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. @@ -88,9 +78,6 @@ llm = LLM( You can disable graph capturing completely via the `enforce_eager` flag: -
-Code - ```python from vllm import LLM @@ -98,8 +85,6 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) ``` -
- ## Adjust cache size If you run out of CPU RAM, try the following options: @@ -111,9 +96,6 @@ If you run out of CPU RAM, try the following options: You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: -
-Code - ```python from vllm import LLM @@ -122,14 +104,9 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1}) ``` -
-
 
 You can go a step further and disable unused modalities completely by setting their limit to zero.
 For example, if your application only accepts image input, there is no need to allocate any memory for videos.
 
-
-Code - ```python from vllm import LLM @@ -138,13 +115,8 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0}) ``` -
- You can even run a multi-modal model for text-only inference: -
-Code - ```python from vllm import LLM @@ -153,8 +125,6 @@ llm = LLM(model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0}) ``` -
- ## Multi-modal processor arguments For certain models, you can adjust the multi-modal processor arguments to diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index fcf1ccb64a2..8757c257d3e 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -11,9 +11,6 @@ Nevertheless, our model resolution may fail for the following reasons: To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. For example: -
-Code - ```python from vllm import LLM @@ -23,6 +20,4 @@ model = LLM( ) ``` -
- Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 689f9770539..09d1598c67e 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -48,9 +48,6 @@ You can tune the performance by adjusting `max_num_batched_tokens`: - For optimal throughput, we recommend setting `max_num_batched_tokens > 8096` especially for smaller models on large GPUs. - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes). -
-Code - ```python from vllm import LLM @@ -58,8 +55,6 @@ from vllm import LLM llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_num_batched_tokens=16384) ``` -
-
 
 See related papers for more details.
 
 ## Parallelism Strategies
@@ -75,9 +70,6 @@ Tensor parallelism shards model parameters across multiple GPUs within each mode
 
 - When the model is too large to fit on a single GPU
 - When you need to reduce memory pressure per GPU to allow more KV cache space for higher throughput
 
-
-Code - ```python from vllm import LLM @@ -85,8 +77,6 @@ from vllm import LLM llm = LLM(model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4) ``` -
- For models that are too large to fit on a single GPU (like 70B parameter models), tensor parallelism is essential. ### Pipeline Parallelism (PP) @@ -100,9 +90,6 @@ Pipeline parallelism distributes model layers across multiple GPUs. Each GPU pro Pipeline parallelism can be combined with tensor parallelism for very large models: -
-Code - ```python from vllm import LLM @@ -114,8 +101,6 @@ llm = LLM( ) ``` -
- ### Expert Parallelism (EP) Expert parallelism is a specialized form of parallelism for Mixture of Experts (MoE) models, where different expert networks are distributed across GPUs. @@ -149,9 +134,6 @@ If you encounter out-of-memory issues, consider these strategies: You can reduce memory usage by limiting the context length and batch size: -
-Code - ```python from vllm import LLM @@ -162,8 +144,6 @@ llm = LLM( ) ``` -
- ### Adjust CUDA Graph Compilation CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: @@ -188,9 +168,6 @@ llm = LLM( Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: -
-Code - ```python from vllm import LLM @@ -200,15 +177,10 @@ llm = LLM( ) ``` -
- ### Multimodal Models For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: -
-Code - ```python from vllm import LLM @@ -218,5 +190,3 @@ llm = LLM( limit_mm_per_prompt={"image": 2} ) ``` - -
diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md index 0a999a4e7b9..16b4b29f45d 100644 --- a/docs/configuration/serve_args.md +++ b/docs/configuration/serve_args.md @@ -17,9 +17,6 @@ The argument names must be the long form of those outlined [above][serve-args]. For example: -
-Config - ```yaml # config.yaml @@ -29,8 +26,6 @@ port: 6379 uvicorn-log-level: "info" ``` -
- To use the above config file: ```bash diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index 998b86efcbc..bdc241f8b25 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -64,9 +64,6 @@ class MyModelForCausalLM(nn.Module): - Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model. -
-Code - ```python class MyModel(nn.Module): ... @@ -75,13 +72,8 @@ class MyModel(nn.Module): ... ``` -
- - Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. -
-Code - ```python def forward( self, @@ -91,8 +83,6 @@ def forward( ... ``` -
- !!! note Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index d8850ce0252..cee50e97789 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -12,9 +12,6 @@ Further update the model as follows: - Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example: -
- Code - ```diff def forward( self, @@ -23,8 +20,6 @@ Further update the model as follows: + pixel_values: torch.Tensor, ) -> SamplerOutput: ``` - -
More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it. @@ -96,9 +91,6 @@ Further update the model as follows: - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. -
- Code - ```python class YourModelForImage2Seq(nn.Module): ... @@ -108,8 +100,6 @@ Further update the model as follows: return self.language_model ``` -
- - Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. ```diff @@ -250,17 +240,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in We can infer that `embeddings.shape[1] == self.num_positions`, where -
- Code - ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 ``` -
- Overall, the number of placeholder feature tokens for an image can be calculated as:
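The helper that performs this calculation is outside the hunk shown here. As a simplified sketch only, based on the CLIP relationship quoted above (the function name and signature are illustrative, and the real implementation also adjusts for the vision feature-selection strategy, which may drop the CLS position):

```python
def get_num_image_tokens(image_size: int, patch_size: int, keep_cls_token: bool = True) -> int:
    # One feature token per (patch_size x patch_size) tile of the resized image,
    # plus one extra position for the CLS token when it is kept.
    num_patches = (image_size // patch_size) ** 2
    return num_patches + 1 if keep_cls_token else num_patches

# For a CLIP ViT with image_size=336 and patch_size=14 this gives 577 positions.
print(get_num_image_tokens(336, 14))  # 577
```
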
@@ -325,9 +310,6 @@ Assuming that the memory usage increases with the number of tokens, the dummy in For the text, we simply expand the multimodal image token from the model config to match the desired number of images. -
- Code - ```python def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -338,8 +320,6 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return image_token * num_images ``` -
- === "No input placeholders: Fuyu" Looking at the code of HF's `FuyuForCausalLM`: @@ -482,9 +462,6 @@ Assuming that the memory usage increases with the number of tokens, the dummy in These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. -
- Code - ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() @@ -492,8 +469,6 @@ Assuming that the memory usage increases with the number of tokens, the dummy in height=image_processor.size["height"]) ``` -
- Fuyu does not expect image placeholders in the inputs to HF processor, so the dummy prompt text is empty regardless of the number of images. @@ -545,8 +520,6 @@ return a schema of the tensors outputted by the HF processor that are related to The output of `CLIPImageProcessor` is a simple tensor with shape `(num_images, num_channels, image_height, image_width)`: -
- Code ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 @@ -559,13 +532,8 @@ return a schema of the tensors outputted by the HF processor that are related to return BatchFeature(data=data, tensor_type=return_tensors) ``` -
- So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: -
- Code - ```python def _get_mm_fields_config( self, @@ -577,8 +545,6 @@ return a schema of the tensors outputted by the HF processor that are related to ) ``` -
-
 
 !!! note
     Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
 
@@ -588,9 +554,6 @@ return a schema of the tensors outputted by the HF processor that are related to
 
     The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates the patches from each image belonging to an item in the batch:
 
-
- Code - ```python # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 image_input_ids.append(tensor_of_image_ids) @@ -602,8 +565,6 @@ return a schema of the tensors outputted by the HF processor that are related to batch_image_patches.append(image_patches) ``` -
- The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore `(1, num_images, num_patches, patch_width * patch_height * num_channels)`. @@ -651,9 +612,6 @@ return a schema of the tensors outputted by the HF processor that are related to This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: -
- Code - ```python def _get_mm_fields_config( self, @@ -663,8 +621,6 @@ return a schema of the tensors outputted by the HF processor that are related to return dict(image_patches=MultiModalFieldConfig.batched("image")) ``` -
- ### Prompt updates Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to @@ -677,9 +633,6 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies Looking at HF's `LlavaProcessor`: -
- Code - ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 prompt_strings = [] @@ -688,8 +641,6 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies prompt_strings.append(sample) ``` -
- It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: @@ -910,9 +861,6 @@ and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` to register them to the multi-modal registry: -
-Code - ```diff from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY @@ -923,8 +871,6 @@ to register them to the multi-modal registry: class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` -
- ## Notes ### Inserting feature tokens without replacement diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index c2ee9ffe970..a6dc1e32dfb 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -27,9 +27,6 @@ You can load an external model [using a plugin][plugin-system] without modifying To register the model, use the following code: -
-Code - ```python # The entrypoint of your plugin def register(): @@ -39,13 +36,8 @@ def register(): ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) ``` -
- If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: -
-Code - ```python # The entrypoint of your plugin def register(): @@ -57,8 +49,6 @@ def register(): ) ``` -
- !!! important If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. Read more about that [here][supports-multimodal]. diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 41361cb49af..d9aab069fc4 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -46,9 +46,6 @@ Nsight systems is an advanced tool that exposes more profiling details, such as [Install nsight-systems](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html) using your package manager. The following block is an example for Ubuntu. -
-Command - ```bash apt update apt install -y --no-install-recommends gnupg @@ -58,8 +55,6 @@ apt update apt install nsight-systems-cli ``` -
-
 
 ### Example commands and usage
 
 #### Offline Inference
@@ -76,9 +71,6 @@ nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace
 
 To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference; however, you must specify the `--delay XX --duration YY` parameters according to the needs of your benchmark. Once the duration has elapsed, the server will be killed.
 
-
-Command - ```bash # server nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct @@ -87,8 +79,6 @@ nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512 ``` -
- In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run: ``` @@ -151,9 +141,6 @@ The first helper is a Python decorator that can be used to profile a function. If a filename is specified, the profile will be saved to that file. If no filename is specified, profile data will be printed to stdout. -
-Code - ```python import vllm.utils @@ -163,16 +150,11 @@ def expensive_function(): pass ``` -
- ### Example Usage - context manager The second helper is a context manager that can be used to profile a block of code. Similar to the decorator, the filename is optional. -
-Code - ```python import vllm.utils @@ -184,8 +166,6 @@ with vllm.utils.cprofile_context("another_function.prof"): another_function() ``` -
- ### Analyzing Profile Results There are multiple tools available that can help analyze the profile results. diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 5feac71d07d..895c4c47e91 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -10,9 +10,6 @@ title: Using Docker vLLM offers an official Docker image for deployment. The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). -
-Command - ```console docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -27,9 +24,6 @@ docker run --runtime nvidia --gpus all \ This image can also be used with other container engines such as [Podman](https://podman.io/). -
-Command - ```console podman run --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -40,8 +34,6 @@ podman run --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` -
-
 
 You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`).
 
 !!! note
@@ -81,9 +73,6 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
 
 You can build and run vLLM from source via the provided Dockerfile. To build vLLM:
 
-
-Command - ```console # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 DOCKER_BUILDKIT=1 docker build . \ @@ -92,8 +81,6 @@ DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile ``` -
- !!! note By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` @@ -146,9 +133,6 @@ DOCKER_BUILDKIT=1 docker build . \ To run vLLM with the custom-built Docker image: -
-Command - ```console docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -157,8 +141,6 @@ docker run --runtime nvidia --gpus all \ vllm/vllm-openai ``` -
- The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). !!! note diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index c6032facec2..5ae5f5a1f2e 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -24,9 +24,6 @@ cerebrium init vllm-project Next, to install the required packages, add the following to your cerebrium.toml: -
-Config - ```toml [cerebrium.deployment] docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" @@ -35,8 +32,6 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" vllm = "latest" ``` -
-
 
 Next, let us add the code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example). Add the following code to your `main.py`:
 
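The contents of `main.py` fall outside the hunk shown here. As a rough sketch only, using vLLM's offline `LLM` API (the `run` entrypoint name and its parameters are assumptions for illustration, not the exact example from the docs):

```python
from vllm import LLM, SamplingParams

# Load the model once at startup so it is reused across requests.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")

def run(prompt: str, temperature: float = 0.8, top_p: float = 0.95):
    # Generate a single completion for the incoming prompt.
    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
    outputs = llm.generate([prompt], sampling_params)
    return {"result": outputs[0].outputs[0].text}
```
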
diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 42b980d1cd4..934250b3df9 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -128,9 +128,6 @@ kubectl get pods Should get an output similar to this: -
-Output - ```bash NAME READY STATUS RESTARTS AGE vllm-0 1/1 Running 0 2s @@ -139,8 +136,6 @@ vllm-1 1/1 Running 0 2s vllm-1-1 1/1 Running 0 2s ``` -
- Verify that the distributed tensor-parallel inference works: ```bash @@ -172,9 +167,6 @@ Forwarding from [::1]:8080 -> 8080 Open another terminal and send a request -
-Command - ```text curl http://localhost:8080/v1/completions \ -H "Content-Type: application/json" \ @@ -186,8 +178,6 @@ curl http://localhost:8080/v1/completions \ }' ``` -
- The output should be similar to the following
diff --git a/docs/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md index 7b98504ef8a..1ab1931068f 100644 --- a/docs/deployment/frameworks/open-webui.md +++ b/docs/deployment/frameworks/open-webui.md @@ -13,9 +13,6 @@ vllm serve qwen/Qwen1.5-0.5B-Chat 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): -
-Command - ```console docker run -d -p 3000:8080 \ --name open-webui \ @@ -25,8 +22,6 @@ docker run -d -p 3000:8080 \ ghcr.io/open-webui/open-webui:main ``` -
- 1. Open it in the browser: On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index dc1d3376dc9..cb26c8378de 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -15,9 +15,6 @@ Here are the integrations: - Setup vLLM and langchain environment -
-Command - ```console pip install -U vllm \ langchain_milvus langchain_openai \ @@ -25,8 +22,6 @@ pip install -U vllm \ langchain-text-splitters ``` -
- ### Deploy - Start the vLLM server with the supported embedding model, e.g. @@ -57,9 +52,6 @@ python retrieval_augmented_generation_with_langchain.py - Setup vLLM and llamaindex environment -
-Command - ```console pip install vllm \ llama-index llama-index-readers-web \ @@ -68,8 +60,6 @@ pip install vllm \ llama-index-vector-stores-milvus \ ``` -
- ### Deploy - Start the vLLM server with the supported embedding model, e.g. diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index f4574cf1cb4..d67eb0e756e 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -86,9 +86,6 @@ Check the output of the command. There will be a shareable gradio link (like the **Optional**: Serve the 70B model instead of the default 8B and use more GPU: -
-Command - ```console HF_TOKEN="your-huggingface-token" \ sky launch serving.yaml \ @@ -97,8 +94,6 @@ HF_TOKEN="your-huggingface-token" \ --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct ``` -
- ## Scale up to multiple replicas SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. @@ -185,8 +180,7 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -
-Example outputs: +Example outputs: ```console Services @@ -199,8 +193,6 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -
- After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
@@ -230,9 +222,6 @@ curl -L http://$ENDPOINT/v1/chat/completions \ To enable autoscaling, you could replace the `replicas` with the following configs in `service`: -
-Config - ```yaml service: replica_policy: @@ -241,8 +230,6 @@ service: target_qps_per_replica: 2 ``` -
-
 
 This will scale the service up when the QPS exceeds 2 for each replica.
 
diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md index c1e3ab0c738..33ed8c5f5b5 100644 --- a/docs/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -29,9 +29,6 @@ pip install streamlit openai - Start the streamlit web UI and start to chat: -
-Commands - ```console streamlit run streamlit_openai_chatbot_webserver.py @@ -43,6 +40,4 @@ VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug ``` -
- ![](../../assets/deployment/streamlit-chat.png) diff --git a/docs/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md index 0f1a6ee054c..2ae600a423f 100644 --- a/docs/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -15,9 +15,6 @@ pip install llama-stack -q Then start Llama Stack server pointing to your vLLM server with the following configuration: -
-Config - ```yaml inference: - provider_id: vllm0 @@ -26,8 +23,6 @@ inference: url: http://127.0.0.1:8000 ``` -
- Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. ## Inference via Embedded vLLM @@ -35,9 +30,6 @@ Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distri An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) is also available. This is a sample of configuration using that method: -
-Config - ```yaml inference - provider_type: vllm @@ -45,5 +37,3 @@ inference model: Llama3.1-8B-Instruct tensor_parallel_size: 4 ``` - -
diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md index 162330a15a3..b55fb5f0be6 100644 --- a/docs/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -82,9 +82,6 @@ curl -o- http://localhost:30080/models To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint: -
-Command - ```bash curl -X POST http://localhost:30080/completions \ -H "Content-Type: application/json" \ @@ -95,8 +92,6 @@ curl -X POST http://localhost:30080/completions \ }' ``` -
-
Expected output diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 6efef154c1e..f38afe99b64 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -121,9 +121,6 @@ EOF We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): -
-Logs - ```console kubectl logs -l app.kubernetes.io/name=vllm ... @@ -133,8 +130,6 @@ INFO: Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ``` -
-
 
 ## Deployment with GPUs
 
 **Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
@@ -166,9 +161,6 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
 
     Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
 
-
- Config - ```yaml apiVersion: v1 kind: Secret @@ -179,8 +171,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) stringData: token: "REPLACE_WITH_TOKEN" ``` - -
Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. @@ -374,21 +364,13 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) Apply the deployment and service configurations using `kubectl apply -f `: -
- Command - ```console kubectl apply -f deployment.yaml kubectl apply -f service.yaml ``` -
- To test the deployment, run the following `curl` command: -
- Command - ```console curl http://mistral-7b.default.svc.cluster.local/v1/completions \ -H "Content-Type: application/json" \ @@ -400,8 +382,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) }' ``` -
- If the service is correctly deployed, you should receive a response from the vLLM model. ## Troubleshooting diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index 23c2b0c24ae..89a7f3bd300 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -17,9 +17,6 @@ export vllm_root=`pwd` Create a file named `Dockerfile.nginx`: -
-Dockerfile - ```console FROM nginx:latest RUN rm /etc/nginx/conf.d/default.conf @@ -27,8 +24,6 @@ EXPOSE 80 CMD ["nginx", "-g", "daemon off;"] ``` -
- Build the container: ```console @@ -75,9 +70,6 @@ docker build -f docker/Dockerfile . --tag vllm If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: -
-Command - ```console cd $vllm_root docker build \ @@ -87,8 +79,6 @@ docker build \ --build-arg https_proxy=$https_proxy ``` -
- [](){ #nginxloadbalancer-nginx-docker-network } ## Create Docker Network @@ -145,9 +135,6 @@ docker run \ ## Launch Nginx -
-Command - ```console docker run \ -itd \ @@ -157,8 +144,6 @@ docker run \ --name nginx-lb nginx-lb:latest ``` -
- [](){ #nginxloadbalancer-nginx-verify-nginx } ## Verify That vLLM Servers Are Ready diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index 1a99288b357..bf736b7a984 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -39,9 +39,6 @@ refer to multi-dimensional arrays, but each thread only accesses the portion of data assigned to it. I have omitted all other runtime parameters here for simplicity. -
-Code - ```cpp template __device__ void paged_attention_kernel( @@ -54,8 +51,6 @@ __device__ void paged_attention_kernel( ) ``` -
-
 
 There is also a list of template arguments above the function
 signature that are determined during compilation time. `scalar_t`
 represents the data type of the query, key, and value data elements,
@@ -240,9 +235,6 @@ point to different tokens and prepare the `k_vecs` in the inner for loop.
 
 Finally, we perform the dot multiplication between the `q_vecs` and each
 `k_vecs`.
 
-
-Code - ```cpp q_vecs = ... for ... { @@ -255,8 +247,6 @@ for ... { } ``` -
-
 
 As mentioned before, for each thread, it only fetches part of
 the query and key token data at a time. However, there will be a cross
 thread group reduction happening in `Qk_dot<>::dot`. So `qk`
@@ -297,9 +287,6 @@ store the normalized softmax result). Also we can compare and collect
 the `qk_max` for all `qk`s that are calculated by the current
 thread group.
 
-
-Code - ```cpp if (thread_group_offset == 0) { const bool mask = token_idx >= context_len; @@ -308,15 +295,10 @@ if (thread_group_offset == 0) { } ``` -
-
 
 Please note that the `logits` here is in shared memory, so each
 thread group will set the fields for its own assigned context tokens.
 Overall, the size of logits should be the number of context tokens.
 
-
-Code - ```cpp for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); @@ -327,15 +309,10 @@ if (lane == 0) { } ``` -
-
 
 Then we need to get the reduced `qk_max` across each warp. The main idea
 is to make the threads in a warp communicate with each other and get the
 final max `qk`.
 
-
-Code - ```cpp for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); @@ -343,8 +320,6 @@ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { qk_max = VLLM_SHFL_SYNC(qk_max, 0); ``` -
-
 
 Finally, we can get the reduced `qk_max` from the whole thread block by
 comparing the `qk_max` from all warps in this thread block. Then we need
 to broadcast the final result to each thread.
 
@@ -354,9 +329,6 @@ need to broadcast the final result to each thread.
 
 Similar to `qk_max`, we need to get the reduced sum value from the entire
 thread block too.
 
-
-Code - ```cpp for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { float val = __expf(logits[i] - qk_max); @@ -367,17 +339,12 @@ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); ``` -
-
 
 First, sum all exp values from each thread group and, meanwhile,
 convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. Please
 note that the `qk_max` here is already the max `qk` across the whole thread
 block. Then we can do the reduction for `exp_sum` across the whole thread
 block, just like for `qk_max`.
 
-
-Code - ```cpp const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { @@ -385,8 +352,6 @@ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { } ``` -
- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain the final normalized softmax result as `logits`. This `logits` variable will be used for dot multiplication with the value data in @@ -425,9 +390,6 @@ multiple inner iterations, each warp will process one block of value tokens. And with multiple outer iterations, the whole context value tokens are processed -
-Code - ```cpp float accs[NUM_ROWS_PER_THREAD]; for ... { // Iteration over different blocks. @@ -440,8 +402,6 @@ for ... { // Iteration over different blocks. } ``` -
- As shown in the above pseudo code, in the outer loop, similar to `k_ptr`, `logits_vec` iterates over different blocks and reads `V_VEC_SIZE` elements from `logits`. In the inner loop, each @@ -470,9 +430,6 @@ Now, we need to perform reduction for `accs` within each warp. This process allows each thread to accumulate the `accs` for the assigned head positions of all tokens in one block. -
-Code - ```cpp for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { float acc = accs[i]; @@ -483,8 +440,6 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { } ``` -
-
 
 Next, we perform reduction for `accs` across all warps, allowing each
 thread to have the accumulation of `accs` for the assigned head
 positions of all context tokens. Please note that each `accs`
@@ -525,23 +480,15 @@ for (int i = NUM_WARPS; i > 1; i /= 2) {
 
 Now we can write all of the calculated results from local register memory to
 the final output global memory.
 
-
-Code - ```cpp scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE; ``` -
- First, we need to define the `out_ptr` variable, which points to the start address of the assigned sequence and assigned head. -
-Code - ```cpp for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; @@ -551,8 +498,6 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { } ``` -
- Finally, we need to iterate over different assigned head positions and write out the corresponding accumulated result based on the `out_ptr`. diff --git a/docs/features/lora.md b/docs/features/lora.md index a60fa4a6ada..eae9ef3fc53 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -18,9 +18,6 @@ sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") Then we instantiate the base model and pass in the `enable_lora=True` flag: -
-Code - ```python from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest @@ -28,8 +25,6 @@ from vllm.lora.request import LoRARequest llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) ``` -
- We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and the third parameter is the path to the LoRA adapter. @@ -65,17 +60,12 @@ Check out for an exa LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use `--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server: -
-Command - ```bash vllm serve meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -
- !!! note The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. @@ -113,9 +103,6 @@ LoRA adapter requests if they were provided and `max_loras` is set high enough). The following is an example request -
-Command - ```bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ @@ -127,8 +114,6 @@ curl http://localhost:8000/v1/completions \ }' | jq ``` -
- ## Dynamically serving LoRA Adapters In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. @@ -148,8 +133,7 @@ Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. -
-Example request to load a LoRA adapter +Example request to load a LoRA adapter: ```bash curl -X POST http://localhost:8000/v1/load_lora_adapter \ @@ -160,8 +144,6 @@ curl -X POST http://localhost:8000/v1/load_lora_adapter \ }' ``` -
- Upon a successful request, the API will respond with a `200 OK` status code from `vllm serve`, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' added successfully`. If an error occurs, such as if the adapter cannot be found or loaded, an appropriate error message will be returned. @@ -172,8 +154,7 @@ with the name or ID of the adapter to be unloaded. Upon a successful request, the API responds with a `200 OK` status code from `vllm serve`, and `curl` returns the response body: `Success: LoRA adapter 'sql_adapter' removed successfully`. -
-Example request to unload a LoRA adapter +Example request to unload a LoRA adapter: ```bash curl -X POST http://localhost:8000/v1/unload_lora_adapter \ @@ -183,8 +164,6 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` -
- ### Using Plugins Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. @@ -235,9 +214,6 @@ Alternatively, follow these example steps to implement your own plugin: 2. Register `LoRAResolver` plugin. -
- Code - ```python from vllm.lora.resolver import LoRAResolverRegistry @@ -245,8 +221,6 @@ Alternatively, follow these example steps to implement your own plugin: LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) ``` -
- For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). ## New format for `--lora-modules` diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 0698a07212a..92f02fb91ba 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -46,17 +46,12 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: -
-Code - ```console python examples/offline_inference/llm_engine_example.py \ --model TheBloke/Llama-2-7b-Chat-AWQ \ --quantization awq ``` -
- AWQ models are also supported directly through the LLM entrypoint:
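The LLM-entrypoint snippet itself is outside the hunk shown here. A minimal sketch, reusing the `TheBloke/Llama-2-7b-Chat-AWQ` checkpoint and the `awq` quantization option mentioned above (the prompt and sampling settings are placeholders):

```python
from vllm import LLM, SamplingParams

# Load the pre-quantized AWQ checkpoint; the quantization method can also be
# forced explicitly with quantization="awq".
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")

outputs = llm.generate(["What is AWQ quantization?"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```
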
diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 20830fad3d5..a8dc2476f30 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -22,9 +22,6 @@ And usually, these repositories have a config.json file that includes a quantiza For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument. -
-Code - ```python from vllm import LLM import torch @@ -37,15 +34,10 @@ llm = LLM( ) ``` -
- ## Inflight quantization: load as 4bit quantization For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument. -
-Code - ```python from vllm import LLM import torch @@ -58,8 +50,6 @@ llm = LLM( ) ``` -
- ## OpenAI Compatible Server Append the following to your model arguments for 4bit inflight quantization: diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 4d8afd234d7..3f405db0acc 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -39,9 +39,6 @@ The quantization process involves three main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: -
-Code - ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -52,8 +49,6 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` -
- ### 2. Applying Quantization For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses: @@ -95,9 +90,6 @@ pip install vllm lm-eval==0.4.4 Load and run the model in `vllm`: -
-Code - ```python from vllm import LLM model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") @@ -105,16 +97,11 @@ result = model.generate("Hello my name is") print(result[0].outputs[0].text) ``` -
- Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): !!! note Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -
-Commands - ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic $ lm_eval \ @@ -123,13 +110,8 @@ $ lm_eval \ --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 ``` -
- Here's an example of the resulting scores: -
-Result - ```text |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| @@ -137,8 +119,6 @@ Here's an example of the resulting scores: | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| ``` -
- ## Troubleshooting and Support If you encounter any issues or have feature requests, please open an issue on the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor/issues) GitHub repository. @@ -149,9 +129,6 @@ Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achi In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. -
-Code - ```python from vllm import LLM model = LLM("facebook/opt-125m", quantization="fp8") @@ -160,7 +137,5 @@ result = model.generate("Hello, my name is") print(result[0].outputs[0].text) ``` -
- !!! warning Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 7389710d823..fe4422eb841 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -20,9 +20,6 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: -
-Command
-
 ```console
 # We recommend using the tokenizer from the base model to avoid slow and buggy tokenizer conversion.
 vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --tensor-parallel-size 2
 ```
 
-
-
 
 !!! warning
     We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary.
 
 GGUF assumes that Hugging Face can convert the metadata to a config file. If Hugging Face doesn't support your model, you can manually create a compatible config and pass it via `--hf-config-path`:
 
-
-Command
-
 ```console
 # If your model is not supported by Hugging Face, you can manually provide a Hugging Face-compatible config path
 vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --hf-config-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
-
- You can also use the GGUF model directly through the LLM entrypoint:
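The snippet itself is outside the hunk shown here. A minimal sketch, reusing the GGUF file and the base-model tokenizer from the commands above (the prompt is a placeholder):

```python
from vllm import LLM, SamplingParams

# Use the base model's tokenizer, as recommended above, rather than
# converting the tokenizer embedded in the GGUF file.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

outputs = llm.generate(["The capital of France is"], SamplingParams(temperature=0))
print(outputs[0].outputs[0].text)
```
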
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index a6fa6a2ea88..cf8ff3dd9f0 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -37,9 +37,6 @@ The quantization process involves four main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: -
-Code - ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -50,8 +47,6 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` -
- ### 2. Preparing Calibration Data When quantizing weights to INT4, you need sample data to estimate the weight updates and calibrated scales. @@ -127,9 +122,6 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") To evaluate accuracy, you can use `lm_eval`: -
-Commands - ```console $ lm_eval --model vllm \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \ @@ -139,8 +131,6 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -
- !!! note Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index f7b9418764c..cdc06b1aed9 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -38,9 +38,6 @@ The quantization process involves four main steps: Load your model and tokenizer using the standard `transformers` AutoModel classes: -
-Code - ```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -51,8 +48,6 @@ model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` -
- ### 2. Preparing Calibration Data When quantizing activations to INT8, you need sample data to estimate the activation scales. @@ -131,9 +126,6 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") To evaluate accuracy, you can use `lm_eval`: -
-Command - ```console $ lm_eval --model vllm \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ @@ -143,8 +135,6 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -
- !!! note Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 5e4c3c58a97..5282746d5e6 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -40,9 +40,6 @@ model = mtq.quantize(model, config, forward_loop) After the model is quantized, you can export it to a quantized checkpoint using the export API: -
-Code - ```python import torch from modelopt.torch.export import export_hf_checkpoint @@ -54,8 +51,6 @@ with torch.inference_mode(): ) ``` -
- The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
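That deployment snippet is outside the hunk shown here. A minimal sketch of the idea, loading the checkpoint named above with vLLM's `LLM` API (the prompt and sampling settings are placeholders):

```python
from vllm import LLM, SamplingParams

# Load the ModelOpt-exported FP8 checkpoint like any other Hugging Face model.
llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```
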
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index 5b2dcaf9176..5e104367c8c 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -147,9 +147,6 @@ The above script will create a folder in your current directory containing your When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales. -
-Code - ```python from vllm import LLM, SamplingParams @@ -159,5 +156,3 @@ prompt = "London is the capital of" out = llm.generate(prompt, sampling_params)[0].outputs[0].text print(out) ``` - -
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 79ab189c1f0..1d24f07d594 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -224,17 +224,12 @@ for output in outputs: Or, you can use `lm_eval` to evaluate accuracy: -
-Command - ```console $ lm_eval --model vllm \ --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \ --tasks gsm8k ``` -
- ## Quark Quantization Script In addition to the example of Python API above, Quark also offers a [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html) @@ -242,9 +237,6 @@ to quantize large language models more conveniently. It supports quantizing mode of different quantization schemes and optimization algorithms. It can export the quantized model and run evaluation tasks on the fly. With the script, the example above can be: -
-Code - ```console python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ --output_dir /path/to/output \ @@ -255,5 +247,3 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ --model_export hf_format \ --tasks gsm8k ``` - -
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index 75dd8829e42..d8907f427c8 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -4,9 +4,6 @@ TorchAO is an architecture optimization library for PyTorch, it provides high pe We recommend installing the latest torchao nightly with -
-Command - ```console # Install the latest TorchAO nightly build # Choose the CUDA version that matches your system (cu126, cu128, etc.) @@ -15,8 +12,6 @@ pip install \ --index-url https://download.pytorch.org/whl/nightly/cu126 ``` -
- ## Quantizing HuggingFace Models You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 7279e559dc0..22c2d9f6129 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -258,17 +258,12 @@ print("Age:", message.parsed.age)
-<details>
-<summary>Output</summary>
-
 ```console
 ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
 Name: Cameron
 Age: 28
 ```
 
-</details>
-
 Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
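A sketch of what such a nested-model request can look like — the `Step`/`MathSolution` classes, model name, and server address are assumptions, but the `client.beta.chat.completions.parse` call mirrors the simpler example above:

```python
from pydantic import BaseModel
from openai import OpenAI

# Assumed local vLLM server; the API key is ignored unless the server requires one.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

class Step(BaseModel):
    explanation: str
    output: str

class MathSolution(BaseModel):
    steps: list[Step]
    final_answer: str

completion = client.beta.chat.completions.parse(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "Solve the equation and show every step."},
        {"role": "user", "content": "8x + 7 = -23"},
    ],
    response_format=MathSolution,
)

solution = completion.choices[0].message.parsed
for step in solution.steps:
    print(step.explanation, "->", step.output)
print("Final answer:", solution.final_answer)
```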
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 5760b35ae17..66db22847b4 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -6,9 +6,6 @@ vLLM currently supports named function calling, as well as the `auto`, `required Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8B model, so we need to use the llama3 tool calling chat template from the vLLM examples directory: -
-<summary>Command</summary>
-
 ```bash
 vllm serve meta-llama/Llama-3.1-8B-Instruct \
     --enable-auto-tool-choice \
@@ -16,8 +13,6 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
     --chat-template examples/tool_chat_template_llama3.1_json.jinja
 ```
 
-</details>
-
 Next, make a request to the model that should result in it using the available tools:
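A sketch of such a request with the `openai` client; the `get_weather` tool schema and model name are assumptions chosen to match the example output shown below:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Hypothetical tool definition; any JSON-schema function description works.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g. 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location", "unit"],
        },
    },
}]

response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
)

tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```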
@@ -64,8 +59,7 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
 
-<details>
-<summary>Example output</summary>
+Example output:
 
 ```text
 Function called: get_weather
@@ -73,8 +67,6 @@ Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"}
 Result: Getting the weather for San Francisco, CA in fahrenheit...
 ```
 
-</details>
-
 This example demonstrates:
 
 * Setting up the server with tool calling enabled
@@ -365,14 +357,9 @@ class ExampleToolParser(ToolParser):
 
 Then you can use this plugin in the command line like this.
 
-<details>
-Command - ```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ --chat-template \ ``` - -
diff --git a/docs/getting_started/installation/aws_neuron.md b/docs/getting_started/installation/aws_neuron.md index dc28327ecaa..6b2efd85f06 100644 --- a/docs/getting_started/installation/aws_neuron.md +++ b/docs/getting_started/installation/aws_neuron.md @@ -47,9 +47,6 @@ Currently, there are no pre-built Neuron wheels. To build and install vLLM from source, run: -
-<summary>Commands</summary>
-
 ```console
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
@@ -57,8 +54,6 @@ pip install -U -r requirements/neuron.txt
 VLLM_TARGET_DEVICE="neuron" pip install -e .
 ```
 
-</details>
- AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at , which contains several features in addition to what's available on vLLM V0. Please utilize the AWS Fork for the following features: @@ -71,9 +66,6 @@ Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs- To install the AWS Neuron fork, run the following: -
-Commands - ```console git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git cd upstreaming-to-vllm @@ -81,8 +73,6 @@ pip install -r requirements/neuron.txt VLLM_TARGET_DEVICE="neuron" pip install -e . ``` -
- Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested. ## Set up using Docker diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index e620b056530..aab3f891fa1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -124,9 +124,6 @@ vLLM CPU backend supports the following vLLM features: - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: -
-<summary>Commands</summary>
-
 ```console
 sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 find / -name *libtcmalloc* # find the dynamic link library path
@@ -134,8 +131,6 @@ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD
 python examples/offline_inference/basic/basic.py # run vLLM
 ```
 
-</details>
- - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 41c32202225..7a91e3ce5e5 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -25,9 +25,6 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. -
-Command - ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -35,8 +32,6 @@ pip install -r requirements/cpu.txt pip install -e . ``` -
- !!! note On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. @@ -45,9 +40,6 @@ pip install -e . If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). -
-Error - ```text [...] fatal error: 'map' file not found 1 | #include @@ -61,8 +53,6 @@ If the build has error like the following snippet where standard C++ headers can 1 error generated. ``` -
- # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index 6bb1818c29d..670485feefb 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -26,9 +26,6 @@ Currently the CPU implementation for s390x architecture supports FP32 datatype o Install the following packages from the package manager before building the vLLM. For example on RHEL 9.4: -
-Command - ```console dnf install -y \ which procps findutils tar vim git gcc g++ make patch make cython zlib-devel \ @@ -36,8 +33,6 @@ dnf install -y \ openssl-devel openblas openblas-devel wget autoconf automake libtool cmake numactl-devel ``` -
- Install rust>=1.80 which is needed for `outlines-core` and `uvloop` python packages installation. ```console @@ -50,9 +45,6 @@ Execute the following commands to build and install vLLM from the source. !!! tip Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. -
-Command - ```console sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds pip install -v \ @@ -63,8 +55,6 @@ Execute the following commands to build and install vLLM from the source. pip install dist/*.whl ``` -
- # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index c036bd0ab57..4503bb44318 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -22,9 +22,6 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: -
-<summary>Commands</summary>
-
 ```console
 # Install vLLM with CUDA 12.8.
 # If you are using pip.
 pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
@@ -33,8 +30,6 @@ uv pip install vllm --torch-backend=auto
 ```
 
-</details>
- We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note @@ -42,9 +37,6 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: -
-Commands - ```console # Install vLLM with CUDA 11.8. export VLLM_VERSION=0.6.1.post1 @@ -52,8 +44,6 @@ export PYTHON_VERSION=312 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` -
- [](){ #install-the-latest-code } #### Install the latest code @@ -93,9 +83,6 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: -
-Commands - ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch uv pip install vllm \ @@ -103,8 +90,6 @@ uv pip install vllm \ --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` -
- The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. # --8<-- [end:pre-built-wheels] @@ -175,9 +160,6 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -186,8 +168,6 @@ pip install -r requirements/build.txt pip install --no-build-isolation -e . ``` -
- ##### Use the local cutlass for compilation Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. @@ -214,9 +194,6 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. -
-Command - ```console # Use `--ipc=host` to make sure the shared memory is large enough. docker run \ @@ -226,8 +203,6 @@ docker run \ --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` -
- If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index aee471f49c6..c0bfa6823a0 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -41,9 +41,6 @@ Currently, there are no pre-built ROCm wheels. Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) -
- Commands - ```console python3 -m pip install ninja cmake wheel pybind11 pip uninstall -y triton @@ -55,8 +52,6 @@ Currently, there are no pre-built ROCm wheels. cd ../.. ``` -
- !!! note If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. @@ -67,9 +62,6 @@ Currently, there are no pre-built ROCm wheels. For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. -
- Commands - ```console git clone https://github.com/ROCm/flash-attention.git cd flash-attention @@ -79,16 +71,11 @@ Currently, there are no pre-built ROCm wheels. cd .. ``` -
- !!! note You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: -
- Commands - ```console python3 -m pip uninstall -y aiter git clone --recursive https://github.com/ROCm/aiter.git @@ -98,8 +85,6 @@ Currently, there are no pre-built ROCm wheels. python3 setup.py develop ``` -
- !!! note You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. @@ -166,9 +151,6 @@ If you choose to build this rocm_base image yourself, the steps are as follows. It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: -
-Config - ```console { "features": { @@ -177,8 +159,6 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` -
- To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console @@ -192,9 +172,6 @@ DOCKER_BUILDKIT=1 docker build \ First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: -
-Config - ```console { "features": { @@ -203,8 +180,6 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` -
- uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: @@ -221,9 +196,6 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: -
-Commands - ```console DOCKER_BUILDKIT=1 docker build \ --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ @@ -232,8 +204,6 @@ DOCKER_BUILDKIT=1 docker build \ . ``` -
- To run the above docker image `vllm-rocm`, use the below command:
diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md index 7ab3a02481a..128fff164c3 100644 --- a/docs/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -25,9 +25,6 @@ Currently, there are no pre-built XPU wheels. - First, install required driver and Intel OneAPI 2025.0 or later. - Second, install Python packages for vLLM XPU backend building: -
-Commands - ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -35,8 +32,6 @@ pip install --upgrade pip pip install -v -r requirements/xpu.txt ``` -
- - Then, build and install vLLM XPU backend: ```console @@ -58,9 +53,6 @@ Currently, there are no pre-built XPU images. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] -
-Command - ```console $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . $ docker run -it \ @@ -71,16 +63,11 @@ $ docker run -it \ vllm-xpu-env ``` -
- # --8<-- [end:build-image-from-source] # --8<-- [start:supported-features] XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following: -
-Command - ```console python -m vllm.entrypoints.openai.api_server \ --model=facebook/opt-13b \ @@ -91,8 +78,6 @@ python -m vllm.entrypoints.openai.api_server \ -tp=8 ``` -
- By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. # --8<-- [end:supported-features] diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md index bf9d0477033..c1987300f8d 100644 --- a/docs/getting_started/installation/intel_gaudi.md +++ b/docs/getting_started/installation/intel_gaudi.md @@ -42,9 +42,6 @@ for more details. Use the following commands to run a Docker image: -
-Command - ```console docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest docker run \ @@ -58,8 +55,6 @@ docker run \ vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -
- ## Set up using Python ### Pre-built wheels @@ -70,9 +65,6 @@ Currently, there are no pre-built Intel Gaudi wheels. To build and install vLLM from source, run: -
-Command - ```console git clone https://github.com/vllm-project/vllm.git cd vllm @@ -80,13 +72,8 @@ pip install -r requirements/hpu.txt python setup.py develop ``` -
- Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: -
-Command - ```console git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork @@ -95,8 +82,6 @@ pip install -r requirements/hpu.txt python setup.py develop ``` -
- ## Set up using Docker ### Pre-built images @@ -105,9 +90,6 @@ Currently, there are no pre-built Intel Gaudi images. ### Build image from source -
-Command - ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . docker run \ @@ -120,8 +102,6 @@ docker run \ --rm vllm-hpu-env ``` -
- !!! tip If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. @@ -211,9 +191,6 @@ In a dynamic inference serving scenario, there is a need to minimize the number Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -
-Logs - ```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] @@ -221,12 +198,9 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -
-
 `min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
 
-<details>
-<summary>Example (with ramp-up)</summary>
+Example (with ramp-up):
 
 ```text
 min = 2, step = 32, max = 64
@@ -235,10 +209,7 @@ min = 2, step = 32, max = 64
 => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
 ```
 
-</details>
-
-<details>
-<summary>Example (without ramp-up)</summary>
+Example (without ramp-up):
 
 ```text
 min = 128, step = 128, max = 512
@@ -247,8 +218,6 @@ min = 128, step = 128, max = 512
 => buckets = ramp_up + stable => (128, 256, 384, 512)
 ```
 
-</details>
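A small helper that mirrors the ramp-up/stable rule described above — an illustrative re-implementation, not the actual HPU model-runner code:

```python
def generate_buckets(min_value: int, step: int, max_value: int) -> list:
    """Ramp-up: powers-of-two multiples of `min` below `step`; stable: multiples of `step` up to `max`."""
    ramp_up = []
    value = min_value
    while value < step and value <= max_value:
        ramp_up.append(value)
        value *= 2
    stable = list(range(step, max_value + 1, step))
    return ramp_up + stable

print(generate_buckets(2, 32, 64))      # [2, 4, 8, 16, 32, 64]
print(generate_buckets(128, 128, 512))  # [128, 256, 384, 512]
```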
- In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. !!! warning diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index adcdb89525c..afc7aea46c6 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -35,9 +35,6 @@ uv run --with vllm vllm --help You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment. -
-Commands - ```console conda create -n myenv python=3.12 -y conda activate myenv @@ -45,8 +42,6 @@ pip install --upgrade uv uv pip install vllm --torch-backend=auto ``` -
- !!! note For more detail and non-CUDA platforms, please refer [here][installation-index] for specific instructions on how to install vLLM. @@ -72,9 +67,6 @@ The next section defines a list of input prompts and sampling parameters for tex However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. -
-Code - ```python prompts = [ "Hello, my name is", @@ -85,8 +77,6 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ``` -
- The [LLM][vllm.LLM] class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here][supported-models]. ```python @@ -102,9 +92,6 @@ llm = LLM(model="facebook/opt-125m") Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. -
-Code - ```python outputs = llm.generate(prompts, sampling_params) @@ -114,8 +101,6 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -
-
 [](){ #quickstart-online }
 
 ## OpenAI-Compatible Server
 
@@ -149,9 +134,6 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY`
 
 Once your server is started, you can query the model with input prompts:
 
-<details>
-<summary>Command</summary>
-
 ```console
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
@@ -163,8 +145,6 @@ curl http://localhost:8000/v1/completions \
     }'
 ```
 
-</details>
-
 Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
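A sketch of that `openai`-client query; the model name is an assumption and must match whatever was passed to `vllm serve`:

```python
from openai import OpenAI

# The API key is unused unless the server was started with --api-key.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
    max_tokens=32,
    temperature=0,
)
print(completion.choices[0].text)
```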
@@ -195,9 +175,6 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter
 
 You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
 
-<details>
-<summary>Command</summary>
-
 ```console
 curl http://localhost:8000/v1/chat/completions \
     -H "Content-Type: application/json" \
@@ -210,8 +187,6 @@ curl http://localhost:8000/v1/chat/completions \
     }'
 ```
 
-</details>
-
 Alternatively, you can use the `openai` Python package:
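A chat-style sketch along the same lines (again, the model name is an assumption):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a joke."},
    ],
)
print(chat_response.choices[0].message.content)
```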
diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 8ee5933b9a2..6755b574ea6 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -29,9 +29,6 @@ vllm serve s3://core-llm/Llama-3-8b \ To run model from a S3 compatible object store run: -
-Commands - ```console RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \ AWS_EC2_METADATA_DISABLED=true \ @@ -40,8 +37,6 @@ vllm serve s3://core-llm/Llama-3-8b \ --load-format runai_streamer ``` -
- ## Tunable parameters You can tune parameters using `--model-loader-extra-config`: diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index b0864dc29e5..5ffec85f653 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -23,9 +23,6 @@ The [generate][vllm.LLM.generate] method is available to all generative models i It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), except that tokenization and detokenization are also performed automatically. -
-Code - ```python from vllm import LLM @@ -38,14 +35,9 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -
- You can optionally control the language generation by passing [SamplingParams][vllm.SamplingParams]. For example, you can use greedy sampling by setting `temperature=0`: -
-Code - ```python from vllm import LLM, SamplingParams @@ -59,8 +51,6 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -
- !!! important By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. @@ -72,9 +62,6 @@ A code example can be found here: -Code - ```python from vllm import LLM from vllm.sampling_params import BeamSearchParams @@ -88,8 +75,6 @@ for output in outputs: print(f"Generated text: {generated_text!r}") ``` -
- ### `LLM.chat` The [chat][vllm.LLM.chat] method implements chat functionality on top of [generate][vllm.LLM.generate]. @@ -140,9 +125,6 @@ A code example can be found here: -Code - ```python from vllm.entrypoints.chat_utils import load_chat_template @@ -153,8 +135,6 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` -
- ## Online Serving Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index c3a144c1de7..0acb4d55214 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,9 +45,6 @@ See [configuration][configuration] for a list of options when initializing the m The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. -
-Code - ```python from vllm import LLM @@ -58,16 +55,11 @@ data = output.outputs.data print(f"Data: {data!r}") ``` -
- ### `LLM.embed` The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. It is primarily designed for embedding models. -
-Code - ```python from vllm import LLM @@ -78,8 +70,6 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -
- A code example can be found here: ### `LLM.classify` @@ -87,9 +77,6 @@ A code example can be found here: -Code - ```python from vllm import LLM @@ -100,8 +87,6 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -
- A code example can be found here: ### `LLM.score` @@ -113,9 +98,6 @@ It is designed for embedding models and cross encoder models. Embedding models u vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -
-Code - ```python from vllm import LLM @@ -127,8 +109,6 @@ score = output.outputs.score print(f"Score: {score}") ``` -
- A code example can be found here: ## Online Serving @@ -169,9 +149,6 @@ vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_ You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. -
-Code - ```python from vllm import LLM, PoolingParams @@ -183,8 +160,6 @@ outputs = model.embed(["Follow the white rabbit."], print(outputs[0].outputs) ``` -
- A code example can be found here: ### Online Inference @@ -197,9 +172,6 @@ vllm serve jinaai/jina-embeddings-v3 --trust-remote-code You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. -
-Command - ```text curl http://127.0.0.1:8000/v1/embeddings \ -H 'accept: application/json' \ @@ -212,10 +184,7 @@ curl http://127.0.0.1:8000/v1/embeddings \ }' ``` -
- -
-Expected output +Expected output: ```json {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 427ca5d99ee..0efcabae7ca 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -155,9 +155,6 @@ The [Transformers backend][transformers-backend] enables you to run models direc !!! tip The easiest way to check if your model is really supported at runtime is to run the program below: -
- Code - ```python from vllm import LLM @@ -172,8 +169,6 @@ The [Transformers backend][transformers-backend] enables you to run models direc print(output) ``` -
- If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM. @@ -183,9 +178,6 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: -
-Commands - ```console # Download a model huggingface-cli download HuggingFaceH4/zephyr-7b-beta @@ -197,15 +189,10 @@ huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cach huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json ``` -
- #### List the downloaded models Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: -
-Commands - ```console # List cached models huggingface-cli scan-cache @@ -217,8 +204,6 @@ huggingface-cli scan-cache -v huggingface-cli scan-cache --dir ~/.cache/huggingface/hub ``` -
- #### Delete a cached model Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: @@ -293,9 +278,6 @@ export VLLM_USE_MODELSCOPE=True And use with `trust_remote_code=True`. -
-Code - ```python from vllm import LLM @@ -310,8 +292,6 @@ output = llm.encode("Hello, my name is") print(output) ``` -
- [](){ #feature-status-legend } ## Feature Status Legend @@ -528,9 +508,6 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode Offline inference: -
- Code - ```python from vllm import LLM @@ -540,8 +517,6 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode ) ``` -
- Online serving: ```bash diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 3d4dfa6f877..259af5cabcb 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -55,9 +55,6 @@ The first step, is to start containers and organize them into a cluster. We have Pick a node as the head node, and run the following command: -
-Command - ```console bash run_cluster.sh \ vllm/vllm-openai \ @@ -67,13 +64,8 @@ bash run_cluster.sh \ -e VLLM_HOST_IP=ip_of_this_node ``` -
- On the rest of the worker nodes, run the following command: -
-Command - ```console bash run_cluster.sh \ vllm/vllm-openai \ @@ -83,8 +75,6 @@ bash run_cluster.sh \ -e VLLM_HOST_IP=ip_of_this_node ``` -
- Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. !!! warning diff --git a/docs/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md index 27b15a80a8a..251b7155c55 100644 --- a/docs/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -13,9 +13,6 @@ pip install llama-index-llms-vllm -q To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. -
-Code - ```python from llama_index.llms.vllm import Vllm @@ -27,6 +24,4 @@ llm = Vllm( ) ``` -
- Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 8018d4b9fdf..e95f84404f0 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -101,9 +101,6 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: -
-Code - ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -113,8 +110,6 @@ completion = client.chat.completions.create( ) ``` -
- Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like @@ -135,9 +130,6 @@ vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. Or directly merge them into the JSON payload if you are using HTTP call directly. -
-Code - ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -150,8 +142,6 @@ completion = client.chat.completions.create( ) ``` -
- ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled @@ -204,26 +194,16 @@ Code example: The following [sampling parameters][sampling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" ``` -
- [](){ #chat-api } ### Chat API @@ -242,26 +222,16 @@ Code example: The following [sampling parameters][sampling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" ``` -
- [](){ #embeddings-api } ### Embeddings API @@ -283,9 +253,6 @@ and passing a list of `messages` in the request. Refer to the examples below for To serve the model: -
- Command - ```bash vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code \ @@ -293,8 +260,6 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_vlm2vec.jinja ``` -
- !!! important Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. @@ -337,9 +302,6 @@ and passing a list of `messages` in the request. Refer to the examples below for To serve the model: -
- Command - ```bash vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code \ @@ -347,8 +309,6 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_dse_qwen2_vl.jinja ``` -
- !!! important Like with VLM2Vec, we have to explicitly pass `--task embed`. @@ -365,37 +325,22 @@ Full example: -Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params" ``` -
- The following extra parameters are supported by default: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" ``` -
- For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" ``` -
- [](){ #transcriptions-api } ### Transcriptions API @@ -413,26 +358,16 @@ Code example: The following [sampling parameters][sampling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" ``` -
- [](){ #tokenizer-api } ### Tokenizer API @@ -467,9 +402,6 @@ Code example: You can classify multiple texts by passing an array of strings: -
-Request - ```bash curl -v "http://127.0.0.1:8000/classify" \ -H "Content-Type: application/json" \ @@ -482,8 +414,6 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -
-
Response @@ -526,9 +456,6 @@ curl -v "http://127.0.0.1:8000/classify" \ You can also pass a string directly to the `input` field: -
-Request - ```bash curl -v "http://127.0.0.1:8000/classify" \ -H "Content-Type: application/json" \ @@ -538,8 +465,6 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -
-
Response @@ -575,26 +500,16 @@ curl -v "http://127.0.0.1:8000/classify" \ The following [pooling parameters][pooling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" ``` -
- [](){ #score-api } ### Score API @@ -610,9 +525,6 @@ Code example: You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. -
-Request - ```bash curl -X 'POST' \ 'http://127.0.0.1:8000/score' \ @@ -626,8 +538,6 @@ curl -X 'POST' \ }' ``` -
-
Response @@ -707,7 +617,8 @@ You can pass a list to both `text_1` and `text_2`, forming multiple sentence pai where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). The total number of pairs is `len(text_2)`. -Request: +
+Request ```bash curl -X 'POST' \ @@ -728,6 +639,8 @@ curl -X 'POST' \ }' ``` +
+
Response @@ -759,26 +672,16 @@ curl -X 'POST' \ The following [pooling parameters][pooling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" ``` -
- [](){ #rerank-api } ### Re-rank API @@ -858,22 +761,12 @@ curl -X 'POST' \ The following [pooling parameters][pooling-params] are supported. -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params" ``` -
- The following extra parameters are supported: -
-Code - ```python --8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" ``` - -
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 5df70548482..f2b3cfb6995 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -160,9 +160,6 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b If you have seen a warning in your logs like this: -
-Logs - ```console WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting @@ -171,8 +168,6 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously for more information. ``` -
- or an error from Python that looks like this:
diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md index 1ab3085d939..4d7b70e6f42 100644 --- a/docs/usage/usage_stats.md +++ b/docs/usage/usage_stats.md @@ -56,14 +56,9 @@ tail ~/.config/vllm/usage_stats.json You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: -
-<summary>Command</summary>
-
 ```bash
 # Any of the following methods can disable usage stats collection
 export VLLM_NO_USAGE_STATS=1
 export DO_NOT_TRACK=1
 mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
 ```
 
-</details>
-
From ea3b9e062f292a1140d4e87a8c856154612a9bf1 Mon Sep 17 00:00:00 2001 From: reidliu41 Date: Mon, 23 Jun 2025 07:08:35 +0800 Subject: [PATCH 5/5] use MkDocs collapsible blocks Signed-off-by: reidliu41 --- docs/cli/README.md | 35 +- docs/configuration/conserving_memory.md | 64 +- docs/configuration/env_vars.md | 11 +- docs/configuration/optimization.md | 5 - docs/contributing/README.md | 38 +- docs/contributing/model/basic.md | 61 +- docs/contributing/model/multimodal.md | 870 ++++++++---------- docs/contributing/profiling.md | 45 +- docs/deployment/docker.md | 33 +- docs/deployment/frameworks/autogen.md | 102 +- docs/deployment/frameworks/cerebrium.md | 125 ++- docs/deployment/frameworks/dify.md | 5 - docs/deployment/frameworks/dstack.md | 144 ++- docs/deployment/frameworks/haystack.md | 54 +- docs/deployment/frameworks/litellm.md | 34 +- docs/deployment/frameworks/lws.md | 238 +++-- docs/deployment/frameworks/skypilot.md | 416 ++++----- .../integrations/production-stack.md | 93 +- docs/deployment/k8s.md | 166 ++-- docs/deployment/nginx.md | 92 +- docs/design/arch_overview.md | 104 +-- docs/design/kernel/paged_attention.md | 43 +- docs/design/plugin_system.md | 51 +- docs/features/lora.md | 210 ++--- docs/features/multimodal_inputs.md | 718 +++++++-------- docs/features/quantization/auto_awq.md | 96 +- docs/features/quantization/bitblas.md | 40 +- docs/features/quantization/fp8.md | 31 +- docs/features/quantization/gguf.md | 85 +- docs/features/quantization/gptqmodel.md | 88 +- docs/features/quantization/int4.md | 139 ++- docs/features/quantization/int8.md | 88 +- docs/features/quantization/modelopt.md | 80 +- .../quantization/quantized_kvcache.md | 154 ++-- docs/features/quantization/quark.md | 245 +++-- docs/features/quantization/torchao.md | 49 +- docs/features/reasoning_outputs.md | 432 +++++---- docs/features/spec_decode.md | 282 +++--- docs/features/structured_outputs.md | 411 ++++----- docs/features/tool_calling.md | 172 ++-- docs/getting_started/installation/cpu.md | 92 +- .../installation/google_tpu.md | 5 - .../installation/gpu/rocm.inc.md | 66 +- .../installation/intel_gaudi.md | 102 +- docs/getting_started/quickstart.md | 78 +- docs/models/generative_models.md | 65 +- docs/models/pooling_models.md | 2 - docs/models/supported_models.md | 49 +- docs/serving/integrations/langchain.md | 35 +- docs/serving/openai_compatible_server.md | 590 ++++++------ docs/usage/metrics.md | 50 +- docs/usage/troubleshooting.md | 181 ++-- docs/usage/usage_stats.md | 67 +- 53 files changed, 3531 insertions(+), 4000 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index a381d2f23b1..b2587a5e7cd 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -16,31 +16,28 @@ vllm {chat,complete,serve,bench,collect-env,run-batch} Start the vLLM OpenAI Compatible API server. -
-Examples - -```bash -# Start with a model -vllm serve meta-llama/Llama-2-7b-hf +??? Examples -# Specify the port -vllm serve meta-llama/Llama-2-7b-hf --port 8100 + ```bash + # Start with a model + vllm serve meta-llama/Llama-2-7b-hf -# Check with --help for more options -# To list all groups -vllm serve --help=listgroup + # Specify the port + vllm serve meta-llama/Llama-2-7b-hf --port 8100 -# To view a argument group -vllm serve --help=ModelConfig + # Check with --help for more options + # To list all groups + vllm serve --help=listgroup -# To view a single argument -vllm serve --help=max-num-seqs + # To view a argument group + vllm serve --help=ModelConfig -# To search by keyword -vllm serve --help=max -``` + # To view a single argument + vllm serve --help=max-num-seqs -
+ # To search by keyword + vllm serve --help=max + ``` ## chat diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 10469462fb3..e2303067e3e 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -57,24 +57,21 @@ By default, we optimize model inference using CUDA graphs which take up extra me You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: -
-Code - -```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - # By default, it goes up to max_num_seqs - cudagraph_capture_sizes=[1, 2, 4, 8, 16], - ), -) -``` - -
+??? Code + + ```python + from vllm import LLM + from vllm.config import CompilationConfig, CompilationLevel + + llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), + ) + ``` You can disable graph capturing completely via the `enforce_eager` flag: @@ -132,23 +129,20 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory. Here are some examples: -
-Code +??? Code -```python -from vllm import LLM + ```python + from vllm import LLM -# Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - -# Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) -``` + # Available for Qwen2-VL series models + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) -
+ # Available for InternVL series models + llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) + ``` diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 09aa4a595a8..c875931c305 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,11 +7,8 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -
-Code +??? Code -```python ---8<-- "vllm/envs.py:env-vars-definition" -``` - -
+ ```python + --8<-- "vllm/envs.py:env-vars-definition" + ``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 09d1598c67e..811925c19e6 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -148,9 +148,6 @@ llm = LLM( CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: -
-<summary>Code</summary>
-
 ```python
 from vllm import LLM
 from vllm.config import CompilationConfig, CompilationLevel
@@ -164,8 +161,6 @@ llm = LLM(
 )
 ```
 
-</details>
- Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: ```python diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 50cb3ac2a7d..e977ec3d2f7 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -77,17 +77,12 @@ mkdocs serve Example output: -
-Output - ```console INFO - Documentation built in 106.83 seconds INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' INFO - [22:02:02] Serving on http://127.0.0.1:8000/ ``` -
- #### View in Your Browser Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:. @@ -98,30 +93,27 @@ For additional features and advanced configurations, refer to the official [MkDo ## Testing -
-Commands +??? note "Commands" -```bash -pip install -r requirements/dev.txt + ```bash + pip install -r requirements/dev.txt -# Linting, formatting and static type checking -pre-commit install --hook-type pre-commit --hook-type commit-msg + # Linting, formatting and static type checking + pre-commit install --hook-type pre-commit --hook-type commit-msg -# You can manually run pre-commit with -pre-commit run --all-files + # You can manually run pre-commit with + pre-commit run --all-files -# To manually run something from CI that does not run -# locally by default, you can run: -pre-commit run mypy-3.9 --hook-stage manual --all-files + # To manually run something from CI that does not run + # locally by default, you can run: + pre-commit run mypy-3.9 --hook-stage manual --all-files -# Unit tests -pytest tests/ - -# Run tests for a single test file with detailed output -pytest -s -v tests/test_logger.py -``` + # Unit tests + pytest tests/ -
+ # Run tests for a single test file with detailed output + pytest -s -v tests/test_logger.py + ``` !!! tip Since the ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index bdc241f8b25..644d21482ef 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -27,38 +27,35 @@ All vLLM modules within the model must include a `prefix` argument in their cons The initialization code should look like this: -
-Code - -```python -from torch import nn -from vllm.config import VllmConfig -from vllm.attention import Attention - -class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - -class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - -class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - -class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") -``` - -
+??? Code + + ```python + from torch import nn + from vllm.config import VllmConfig + from vllm.attention import Attention + + class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + + class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + + class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + + class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") + ``` ### Computation Code diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index cee50e97789..6ff2abbae63 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -25,69 +25,63 @@ Further update the model as follows: - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. -
- Code + ??? Code - ```python - class YourModelForImage2Seq(nn.Module): - ... + ```python + class YourModelForImage2Seq(nn.Module): + ... - def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - assert self.vision_encoder is not None - image_features = self.vision_encoder(image_input) - return self.multi_modal_projector(image_features) + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - # Validate the multimodal input keyword arguments - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - - # Run multimodal inputs through encoder and projector - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - ``` + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None -
+ # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` !!! important The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. -
- Code + ??? Code - ```python - from .utils import merge_multimodal_embeddings + ```python + from .utils import merge_multimodal_embeddings - class YourModelForImage2Seq(nn.Module): - ... + class YourModelForImage2Seq(nn.Module): + ... - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - - # `get_input_embeddings` should already be implemented for the language - # model as one of the requirements of basic vLLM model implementation. - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_index) - - return inputs_embeds - ``` + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: -
+ # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. @@ -145,52 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in Looking at the code of HF's `LlavaForConditionalGeneration`: -
- Code + ??? Code - ```python - # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 - n_image_tokens = (input_ids == self.config.image_token_index).sum().item() - n_image_features = image_features.shape[0] * image_features.shape[1] + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) - ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - ``` - -
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + ``` The number of placeholder feature tokens per image is `image_features.shape[1]`. `image_features` is calculated inside the `get_image_features` method: -
- Code + ??? Code - ```python - # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - - selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = self.multi_modal_projector(selected_image_feature) - return image_features - ``` + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) -
+ selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + ``` We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). @@ -198,9 +186,6 @@ Assuming that the memory usage increases with the number of tokens, the dummy in The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention mechanism doesn't change the sequence length of the output hidden states. -
- Code - ```python # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) @@ -214,29 +199,24 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ) ``` -
- To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: -
- Code + ??? Code - ```python - # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - else: - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - ``` + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) -
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + ``` We can infer that `embeddings.shape[1] == self.num_positions`, where @@ -248,65 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in Overall, the number of placeholder feature tokens for an image can be calculated as: -
- Code - - ```python - def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self.get_hf_config() - hf_processor = self.get_hf_processor() + ??? Code - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size + ```python + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() - num_image_tokens = (image_size // patch_size) ** 2 + 1 - if hf_processor.vision_feature_select_strategy == "default": - num_image_tokens -= 1 + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size - return num_image_tokens - ``` + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 -
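    As a quick sanity check, the same arithmetic can be run standalone. The concrete numbers below are assumptions taken from the `llava-hf/llava-1.5-7b-hf` vision config (CLIP ViT-L/14 at 336x336); they are not read from vLLM here:

    ```python
    # Standalone sketch of the formula above. The concrete values are assumptions
    # taken from the llava-hf/llava-1.5-7b-hf vision config.
    image_size = 336   # hf_config.vision_config.image_size
    patch_size = 14    # hf_config.vision_config.patch_size
    vision_feature_select_strategy = "default"

    num_image_tokens = (image_size // patch_size) ** 2 + 1  # 24 * 24 patches + 1 CLS token
    if vision_feature_select_strategy == "default":
        num_image_tokens -= 1  # the CLS token is dropped

    print(num_image_tokens)  # 576
    ```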
+ return num_image_tokens + ``` Notice that the number of image tokens doesn't depend on the image width and height. We can simply use a dummy `image_size` to calculate the multimodal profiling data: -
- Code + ??? Code - ```python - # NOTE: In actuality, this is usually implemented as part of the - # model's subclass of `BaseProcessingInfo`, but we show it as is - # here for simplicity. - def get_image_size_with_most_features(self) -> ImageSize: - hf_config = self.get_hf_config() - width = height = hf_config.image_size - return ImageSize(width=width, height=height) + ```python + # NOTE: In actuality, this is usually implemented as part of the + # model's subclass of `BaseProcessingInfo`, but we show it as is + # here for simplicity. + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = \ - self.info.get_image_size_with_most_features() - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - ``` - -
+ def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` For the text, we simply expand the multimodal image token from the model config to match the desired number of images. @@ -324,26 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in Looking at the code of HF's `FuyuForCausalLM`: -
- Code - - ```python - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 - if image_patches is not None and past_key_values is None: - patch_embeddings = [ - self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) - .squeeze(0) - .to(inputs_embeds.device) - for patch in image_patches - ] - inputs_embeds = self.gather_continuous_embeddings( - word_embeddings=inputs_embeds, - continuous_embeddings=patch_embeddings, - image_patch_input_indices=image_patches_indices, - ) - ``` + ??? Code -
+ ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 + if image_patches is not None and past_key_values is None: + patch_embeddings = [ + self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) + .squeeze(0) + .to(inputs_embeds.device) + for patch in image_patches + ] + inputs_embeds = self.gather_continuous_embeddings( + word_embeddings=inputs_embeds, + continuous_embeddings=patch_embeddings, + image_patch_input_indices=image_patches_indices, + ) + ``` The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. @@ -357,107 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, returning the dimensions after resizing (but before padding) as metadata. -
- Code + ??? Code - ```python - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 - image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) - batch_images = image_encoding["images"] - image_unpadded_heights = image_encoding["image_unpadded_heights"] - image_unpadded_widths = image_encoding["image_unpadded_widths"] - - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L - if do_resize: - batch_images = [ - [self.resize(image, size=size, input_data_format=input_data_format) for image in images] - for images in batch_images - ] - - image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] - image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] - image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] - - if do_pad: - batch_images = [ - [ - self.pad_image( - image, - size=size, - mode=padding_mode, - constant_values=padding_value, - input_data_format=input_data_format, - ) - for image in images + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 + image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) + batch_images = image_encoding["images"] + image_unpadded_heights = image_encoding["image_unpadded_heights"] + image_unpadded_widths = image_encoding["image_unpadded_widths"] + + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L + if do_resize: + batch_images = [ + [self.resize(image, size=size, input_data_format=input_data_format) for image in images] + for images in batch_images ] - for images in batch_images - ] - ``` -
+ image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] + image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] + image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] + + if do_pad: + batch_images = [ + [ + self.pad_image( + image, + size=size, + mode=padding_mode, + constant_values=padding_value, + input_data_format=input_data_format, + ) + for image in images + ] + for images in batch_images + ] + ``` In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: -
- Code - - ```python - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 - model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, - ) - - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 - image_height, image_width = image.shape[1], image.shape[2] - if variable_sized: # variable_sized=True - new_h = min( - image_height, - math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, - ) - new_w = min( - image_width, - math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, + ??? Code + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, ) - image = image[:, :new_h, :new_w] - image_height, image_width = new_h, new_w - num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) - tensor_of_image_ids = torch.full( - [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device - ) - patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) - assert num_patches == patches.shape[0] - ``` + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 + image_height, image_width = image.shape[1], image.shape[2] + if variable_sized: # variable_sized=True + new_h = min( + image_height, + math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, + ) + new_w = min( + image_width, + math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, + ) + image = image[:, :new_h, :new_w] + image_height, image_width = new_h, new_w -
+ num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) + tensor_of_image_ids = torch.full( + [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device + ) + patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) + assert num_patches == patches.shape[0] + ``` The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: -
- Code + ??? Code - ```python - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 - patch_size = patch_size if patch_size is not None else self.patch_size - patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] - - if image_height % patch_height != 0: - raise ValueError(f"{image_height=} must be divisible by {patch_height}") - if image_width % patch_width != 0: - raise ValueError(f"{image_width=} must be divisible by {patch_width}") - - num_patches_per_dim_h = image_height // patch_height - num_patches_per_dim_w = image_width // patch_width - num_patches = num_patches_per_dim_h * num_patches_per_dim_w - ``` + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 + patch_size = patch_size if patch_size is not None else self.patch_size + patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] -
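    As a rough sketch of the numbers involved, assuming the default `adept/fuyu-8b` processor configuration (a 1080x1920 padded canvas and 30x30 patches), the largest possible image yields:

    ```python
    # Standalone sketch of the patch count for the largest possible input, assuming
    # the default adept/fuyu-8b processor config (size 1080x1920, 30x30 patches).
    image_height, image_width = 1080, 1920   # image_processor.size
    patch_height, patch_width = 30, 30       # image_processor.patch_size

    num_patches_per_dim_h = image_height // patch_height  # 36
    num_patches_per_dim_w = image_width // patch_width    # 64
    num_patches = num_patches_per_dim_h * num_patches_per_dim_w

    print(num_patches)  # 2304 |SPEAKER| placeholders for a full-size image
    ```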
+ if image_height % patch_height != 0: + raise ValueError(f"{image_height=} must be divisible by {patch_height}") + if image_width % patch_width != 0: + raise ValueError(f"{image_width=} must be divisible by {patch_width}") + + num_patches_per_dim_h = image_height // patch_height + num_patches_per_dim_w = image_width // patch_width + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + ``` These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. @@ -479,28 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in For the multimodal image profiling data, the logic is very similar to LLaVA: -
- Code - - ```python - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - target_width, target_height = \ - self.info.get_image_size_with_most_features() - num_images = mm_counts.get("image", 0) - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - ``` + ??? Code -
+ ```python + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` ## 4. Specify processing details @@ -571,40 +530,37 @@ return a schema of the tensors outputted by the HF processor that are related to In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: -
- Code - - ```python - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - ) -> BatchFeature: - processed_outputs = super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) + ??? Code - image_patches = processed_outputs.get("image_patches") - if image_patches is not None: - images = mm_data["images"] - assert isinstance(images, list) + ```python + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) - # Original output: (1, num_images, Pn, Px * Py * C) - # New output: (num_images, Pn, Px * Py * C) - assert (isinstance(image_patches, list) - and len(image_patches) == 1) - assert (isinstance(image_patches[0], torch.Tensor) - and len(image_patches[0]) == len(images)) + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) - processed_outputs["image_patches"] = image_patches[0] + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) - return processed_outputs - ``` + processed_outputs["image_patches"] = image_patches[0] -
+ return processed_outputs + ``` !!! note Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling @@ -644,40 +600,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: -
- Code - - ```python - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index + ??? Code - def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - - image_size = images.get_image_size(item_idx) - num_image_tokens = self.info.get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - - return [image_token_id] * num_image_tokens + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement, - ), - ] - ``` + return [image_token_id] * num_image_tokens -
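    To picture what this update does at the token level, here is a small illustrative sketch (plain Python, not the vLLM implementation); the image token id is an assumption based on the LLaVA-1.5 HF config:

    ```python
    # Illustrative sketch only (not vLLM internals): every image placeholder token
    # in the tokenized prompt is expanded into `num_image_tokens` copies of itself.
    IMAGE_TOKEN_ID = 32000  # assumed value of hf_config.image_token_index for LLaVA-1.5

    def expand_image_tokens(token_ids: list[int], num_image_tokens: int) -> list[int]:
        expanded: list[int] = []
        for tok in token_ids:
            expanded.extend([tok] * num_image_tokens if tok == IMAGE_TOKEN_ID else [tok])
        return expanded

    prompt = [101, IMAGE_TOKEN_ID, 102]           # dummy text token ids around one image
    print(len(expand_image_tokens(prompt, 576)))  # 578 = 2 text tokens + 576 placeholders
    ```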
+ return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + ``` === "Handling additional tokens: Fuyu" @@ -692,140 +645,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies We define a helper function to return `ncols` and `nrows` directly: -
- Code - - ```python - def get_image_feature_grid_size( - self, - *, - image_width: int, - image_height: int, - ) -> tuple[int, int]: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return ncols, nrows - ``` + ??? Code -
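    For intuition, the helper can be exercised with plain numbers. The target and patch sizes below are assumptions based on the default Fuyu image processor configuration, and the input size is arbitrary:

    ```python
    import math

    # Illustrative numbers only; the target and patch sizes are assumptions based on
    # the default Fuyu image processor configuration.
    target_width, target_height = 1920, 1080
    patch_width, patch_height = 30, 30

    image_width, image_height = 2560, 1440  # an input larger than the target

    if not (image_width <= target_width and image_height <= target_height):
        scale = min(target_height / image_height, target_width / image_width)  # 0.75
        image_width, image_height = int(image_width * scale), int(image_height * scale)

    ncols = math.ceil(image_width / patch_width)    # ceil(1920 / 30) = 64
    nrows = math.ceil(image_height / patch_height)  # ceil(1080 / 30) = 36
    print(ncols, nrows)  # 64 36
    ```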
+ ```python + def get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + image_processor = self.get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] + patch_width = image_processor.patch_size["width"] + patch_height = image_processor.patch_size["height"] + + if not (image_width <= target_width and image_height <= target_height): + height_scale_factor = target_height / image_height + width_scale_factor = target_width / image_width + optimal_scale_factor = min(height_scale_factor, width_scale_factor) + + image_height = int(image_height * optimal_scale_factor) + image_width = int(image_width * optimal_scale_factor) + + ncols = math.ceil(image_width / patch_width) + nrows = math.ceil(image_height / patch_height) + return ncols, nrows + ``` Based on this, we can initially define our replacement tokens as: -
- Code - - ```python - def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) + ??? Code - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) + ```python + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) - # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` - # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` - return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows - ``` + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) -
+ # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` + # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` + return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + ``` However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, a BOS token (``) is also added to the promopt: -
- Code - - ```python - # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 - model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, - ) - prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( - tokenizer=self.tokenizer, - prompts=prompts, - scale_factors=scale_factors, - max_tokens_to_generate=self.max_tokens_to_generate, - max_position_embeddings=self.max_position_embeddings, - add_BOS=True, - add_beginning_of_answer_token=True, - ) - ``` - -
- - To assign the vision embeddings to only the image tokens, instead of a string - you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: - -
- Code - - ```python - hf_config = self.info.get_hf_config() - bos_token_id = hf_config.bos_token_id # `` - assert isinstance(bos_token_id, int) - - def get_replacement_fuyu(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, + ??? Code + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows - - return PromptUpdateDetails.select_token_id( - image_tokens + [bos_token_id], - embed_token_id=_IMAGE_TOKEN_ID, + prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( + tokenizer=self.tokenizer, + prompts=prompts, + scale_factors=scale_factors, + max_tokens_to_generate=self.max_tokens_to_generate, + max_position_embeddings=self.max_position_embeddings, + add_BOS=True, + add_beginning_of_answer_token=True, ) - ``` - -
+ ``` - Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, - we can search for it to conduct the replacement at the start of the string: + To assign the vision embeddings to only the image tokens, instead of a string + you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: -
- Code + ??? Code - ```python - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> Sequence[PromptUpdate]: + ```python hf_config = self.info.get_hf_config() - bos_token_id = hf_config.bos_token_id + bos_token_id = hf_config.bos_token_id # `` assert isinstance(bos_token_id, int) - tokenizer = self.info.get_tokenizer() - eot_token_id = tokenizer.bos_token_id - assert isinstance(eot_token_id, int) - def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) @@ -841,17 +744,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_tokens + [bos_token_id], embed_token_id=_IMAGE_TOKEN_ID, ) + ``` - return [ - PromptReplacement( - modality="image", - target=[eot_token_id], - replacement=get_replacement_fuyu, - ) - ] - ``` + Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, + we can search for it to conduct the replacement at the start of the string: + + ??? Code + + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) + + tokenizer = self.info.get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows -
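    Putting the pieces together, the replacement for one image is the `ncols x nrows` grid of `|SPEAKER|` tokens, a `|NEWLINE|` token after each row, and the BOS token at the end, with only the `|SPEAKER|` positions selected for vision embeddings. A tiny illustrative sketch (the token ids are placeholders, not real vocabulary ids):

    ```python
    # Illustrative sketch only: the token layout produced for an ncols x nrows grid.
    # The ids below are placeholders, not the real vocabulary ids.
    IMAGE_TOKEN_ID = 1    # stands in for |SPEAKER|
    NEWLINE_TOKEN_ID = 2  # stands in for |NEWLINE|
    BOS_TOKEN_ID = 3      # stands in for the BOS token

    def fuyu_replacement(ncols: int, nrows: int) -> list[int]:
        image_tokens = ([IMAGE_TOKEN_ID] * ncols + [NEWLINE_TOKEN_ID]) * nrows
        return image_tokens + [BOS_TOKEN_ID]

    print(fuyu_replacement(ncols=3, nrows=2))
    # [1, 1, 1, 2, 1, 1, 1, 2, 3] -- only the 1s receive vision embeddings
    ```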
+ return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, + ) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + ``` ## 5. Register processor-related classes diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index d9aab069fc4..6d6366741aa 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -97,31 +97,26 @@ to manually kill the profiler and generate your `nsys-rep` report. You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started). -CLI example: - -
-Command - -```bash -nsys stats report1.nsys-rep -... - ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): - - Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name - -------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- - 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… - 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… - 12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off… - 9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_… - 5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel, (bool)1>(T1 *, cons… - 4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… - 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in… - 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… -... -``` - -
+??? CLI example + + ```bash + nsys stats report1.nsys-rep + ... + ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- + 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… + 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… + 12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off… + 9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_… + 5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel, (bool)1>(T1 *, cons… + 4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… + 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, in… + 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… + ... + ``` GUI example: diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 895c4c47e91..eb84db7871e 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -20,8 +20,6 @@ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` -
- This image can also be used with other container engines such as [Podman](https://podman.io/). ```console @@ -99,24 +97,21 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -
-Command - -```console -# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) -python3 use_existing_torch.py -DOCKER_BUILDKIT=1 docker build . \ - --file docker/Dockerfile \ - --target vllm-openai \ - --platform "linux/arm64" \ - -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" -``` +??? Command -
+ ```console + # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) + python3 use_existing_torch.py + DOCKER_BUILDKIT=1 docker build . \ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" + ``` !!! note If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution. diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md index 5b39ad46274..295664daead 100644 --- a/docs/deployment/frameworks/autogen.md +++ b/docs/deployment/frameworks/autogen.md @@ -11,9 +11,6 @@ title: AutoGen - Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment -
-Commands - ```console pip install vllm @@ -22,8 +19,6 @@ pip install vllm pip install -U "autogen-agentchat" "autogen-ext[openai]" ``` -
- ## Deploy - Start the vLLM server with the supported chat completion model, e.g. @@ -35,56 +30,53 @@ python -m vllm.entrypoints.openai.api_server \ - Call it with AutoGen: -
-Code - -```python -import asyncio -from autogen_core.models import UserMessage -from autogen_ext.models.openai import OpenAIChatCompletionClient -from autogen_core.models import ModelFamily - - -async def main() -> None: - # Create a model client - model_client = OpenAIChatCompletionClient( - model="mistralai/Mistral-7B-Instruct-v0.2", - base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1", - api_key="EMPTY", - model_info={ - "vision": False, - "function_calling": False, - "json_output": False, - "family": ModelFamily.MISTRAL, - "structured_output": True, - }, - ) - - messages = [UserMessage(content="Write a very short story about a dragon.", source="user")] - - # Create a stream. - stream = model_client.create_stream(messages=messages) - - # Iterate over the stream and print the responses. - print("Streamed responses:") - async for response in stream: - if isinstance(response, str): - # A partial response is a string. - print(response, flush=True, end="") - else: - # The last response is a CreateResult object with the complete message. - print("\n\n------------\n") - print("The complete response:", flush=True) - print(response.content, flush=True) - - # Close the client when done. - await model_client.close() - - -asyncio.run(main()) -``` - -
+??? Code + + ```python + import asyncio + from autogen_core.models import UserMessage + from autogen_ext.models.openai import OpenAIChatCompletionClient + from autogen_core.models import ModelFamily + + + async def main() -> None: + # Create a model client + model_client = OpenAIChatCompletionClient( + model="mistralai/Mistral-7B-Instruct-v0.2", + base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1", + api_key="EMPTY", + model_info={ + "vision": False, + "function_calling": False, + "json_output": False, + "family": ModelFamily.MISTRAL, + "structured_output": True, + }, + ) + + messages = [UserMessage(content="Write a very short story about a dragon.", source="user")] + + # Create a stream. + stream = model_client.create_stream(messages=messages) + + # Iterate over the stream and print the responses. + print("Streamed responses:") + async for response in stream: + if isinstance(response, str): + # A partial response is a string. + print(response, flush=True, end="") + else: + # The last response is a CreateResult object with the complete message. + print("\n\n------------\n") + print("The complete response:", flush=True) + print(response.content, flush=True) + + # Close the client when done. + await model_client.close() + + + asyncio.run(main()) + ``` For details, see the tutorial: diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index 5ae5f5a1f2e..8e096f26db7 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -34,30 +34,27 @@ vllm = "latest" Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: -
-Code +??? Code -```python -from vllm import LLM, SamplingParams + ```python + from vllm import LLM, SamplingParams -llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") -def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): - sampling_params = SamplingParams(temperature=temperature, top_p=top_p) - outputs = llm.generate(prompts, sampling_params) + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - results = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - results.append({"prompt": prompt, "generated_text": generated_text}) + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) - return {"results": results} -``` - -
+ return {"results": results} + ``` Then, run the following code to deploy it to the cloud: @@ -67,57 +64,51 @@ cerebrium deploy If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`) -
-Command - -```python -curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ - -H 'Content-Type: application/json' \ - -H 'Authorization: ' \ - --data '{ - "prompts": [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" - ] - }' -``` - -
+??? Command + + ```python + curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' + ``` You should get a response like: -
-Response - -```python -{ - "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", - "result": { - "result": [ - { - "prompt": "Hello, my name is", - "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" - }, - { - "prompt": "The president of the United States is", - "generated_text": " elected every four years. This is a democratic system.\n\n5. What" - }, - { - "prompt": "The capital of France is", - "generated_text": " Paris.\n" - }, - { - "prompt": "The future of AI is", - "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." - } - ] - }, - "run_time_ms": 152.53663063049316 -} -``` - -
+??? Response + + ```python + { + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 + } + ``` You now have an autoscaling endpoint where you only pay for the compute you use! diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 147e61211eb..886484b5434 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -24,9 +24,6 @@ vllm serve Qwen/Qwen1.5-7B-Chat - Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): -
-Commands - ```console git clone https://github.com/langgenius/dify.git cd dify @@ -35,8 +32,6 @@ cp .env.example .env docker compose up -d ``` -
- Open the browser to access `http://localhost/install`, configure the basic login information and log in.
- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md
index 98ee73e976f..0b91fc88ce3 100644
--- a/docs/deployment/frameworks/dstack.md
+++ b/docs/deployment/frameworks/dstack.md
@@ -18,103 +18,89 @@ dstack server

Next, to configure your dstack project, run:

-
-Code - ```console mkdir -p vllm-dstack cd vllm-dstack dstack init ``` -
Next, to provision a VM instance with an LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:

-
-Config - -```yaml -type: service - -python: "3.11" -env: - - MODEL=NousResearch/Llama-2-7b-chat-hf -port: 8000 -resources: - gpu: 24GB -commands: - - pip install vllm - - vllm serve $MODEL --port 8000 -model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf -``` - -
+??? Config + + ```yaml + type: service + + python: "3.11" + env: + - MODEL=NousResearch/Llama-2-7b-chat-hf + port: 8000 + resources: + gpu: 24GB + commands: + - pip install vllm + - vllm serve $MODEL --port 8000 + model: + format: openai + type: chat + name: NousResearch/Llama-2-7b-chat-hf + ``` Then, run the following CLI for provisioning: -
-Command - -```console -$ dstack run . -f serve.dstack.yml - -⠸ Getting run plan... - Configuration serve.dstack.yml - Project deep-diver-main - User deep-diver - Min resources 2..xCPU, 8GB.., 1xGPU (24GB) - Max price - - Max duration - - Spot policy auto - Retry policy no - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - ... - Shown 3 of 193 offers, $5.876 max - -Continue? [y/n]: y -⠙ Submitting run... -⠏ Launching spicy-treefrog-1 (pulling) -spicy-treefrog-1 provisioning completed (running) -Service is published at ... -``` - -
+??? Command + + ```console + $ dstack run . -f serve.dstack.yml + + ⠸ Getting run plan... + Configuration serve.dstack.yml + Project deep-diver-main + User deep-diver + Min resources 2..xCPU, 8GB.., 1xGPU (24GB) + Max price - + Max duration - + Spot policy auto + Retry policy no + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + ... + Shown 3 of 193 offers, $5.876 max + + Continue? [y/n]: y + ⠙ Submitting run... + ⠏ Launching spicy-treefrog-1 (pulling) + spicy-treefrog-1 provisioning completed (running) + Service is published at ... + ``` After the provisioning, you can interact with the model by using the OpenAI SDK: -
-Code +??? Code -```python -from openai import OpenAI + ```python + from openai import OpenAI -client = OpenAI( - base_url="https://gateway.", - api_key="" -) + client = OpenAI( + base_url="https://gateway.", + api_key="" + ) -completion = client.chat.completions.create( - model="NousResearch/Llama-2-7b-chat-hf", - messages=[ - { - "role": "user", - "content": "Compose a poem that explains the concept of recursion in programming.", - } - ] -) - -print(completion.choices[0].message.content) -``` + completion = client.chat.completions.create( + model="NousResearch/Llama-2-7b-chat-hf", + messages=[ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming.", + } + ] + ) -
+ print(completion.choices[0].message.content) + ``` !!! note dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index f6f9727b745..04d9eba3065 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -27,35 +27,29 @@ vllm serve mistralai/Mistral-7B-Instruct-v0.1 - Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. -
-Code - -```python -from haystack.components.generators.chat import OpenAIChatGenerator -from haystack.dataclasses import ChatMessage -from haystack.utils import Secret - -generator = OpenAIChatGenerator( - # for compatibility with the OpenAI API, a placeholder api_key is needed - api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), - model="mistralai/Mistral-7B-Instruct-v0.1", - api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", - generation_kwargs = {"max_tokens": 512} -) - -response = generator.run( - messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")] -) - -print("-"*30) -print(response) -print("-"*30) -``` - -
- -
-Output +??? Code + + ```python + from haystack.components.generators.chat import OpenAIChatGenerator + from haystack.dataclasses import ChatMessage + from haystack.utils import Secret + + generator = OpenAIChatGenerator( + # for compatibility with the OpenAI API, a placeholder api_key is needed + api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), + model="mistralai/Mistral-7B-Instruct-v0.1", + api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", + generation_kwargs = {"max_tokens": 512} + ) + + response = generator.run( + messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")] + ) + + print("-"*30) + print(response) + print("-"*30) + ``` ```console ------------------------------ @@ -63,6 +57,4 @@ print("-"*30) ------------------------------ ``` -
- For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md). diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index abd460b19be..8498feaa297 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -34,26 +34,23 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - Call it with litellm: -
-Code +??? Code -```python -import litellm + ```python + import litellm -messages = [{ "content": "Hello, how are you?","role": "user"}] + messages = [{ "content": "Hello, how are you?","role": "user"}] -# hosted_vllm is prefix key word and necessary -response = litellm.completion( - model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name - messages=messages, - api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", - temperature=0.2, - max_tokens=80) - -print(response) -``` + # hosted_vllm is prefix key word and necessary + response = litellm.completion( + model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name + messages=messages, + api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", + temperature=0.2, + max_tokens=80) -
+ print(response) + ``` ### Embeddings @@ -65,9 +62,6 @@ vllm serve BAAI/bge-base-en-v1.5 - Call it with litellm: -
-Code - ```python from litellm import embedding import os @@ -81,6 +75,4 @@ embedding = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello w print(embedding) ``` -
- For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm). diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 934250b3df9..9df95287690 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -17,104 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber Deploy the following yaml file `lws.yaml` -
-Config - -```yaml -apiVersion: leaderworkerset.x-k8s.io/v1 -kind: LeaderWorkerSet -metadata: - name: vllm -spec: - replicas: 2 - leaderWorkerTemplate: - size: 2 - restartPolicy: RecreateGroupOnPodRestart - leaderTemplate: - metadata: - labels: - role: leader - spec: - containers: - - name: vllm-leader - image: docker.io/vllm/vllm-openai:latest - env: - - name: HUGGING_FACE_HUB_TOKEN - value: - command: - - sh - - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); - python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" - resources: - limits: - nvidia.com/gpu: "8" - memory: 1124Gi - ephemeral-storage: 800Gi - requests: - ephemeral-storage: 800Gi - cpu: 125 - ports: - - containerPort: 8080 - readinessProbe: - tcpSocket: - port: 8080 - initialDelaySeconds: 15 - periodSeconds: 10 - volumeMounts: - - mountPath: /dev/shm - name: dshm - volumes: - - name: dshm - emptyDir: - medium: Memory - sizeLimit: 15Gi - workerTemplate: - spec: - containers: - - name: vllm-worker - image: docker.io/vllm/vllm-openai:latest - command: - - sh - - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" - resources: - limits: - nvidia.com/gpu: "8" - memory: 1124Gi - ephemeral-storage: 800Gi - requests: - ephemeral-storage: 800Gi - cpu: 125 - env: - - name: HUGGING_FACE_HUB_TOKEN - value: - volumeMounts: - - mountPath: /dev/shm - name: dshm - volumes: - - name: dshm - emptyDir: - medium: Memory - sizeLimit: 15Gi ---- -apiVersion: v1 -kind: Service -metadata: - name: vllm-leader -spec: - ports: - - name: http - port: 8080 - protocol: TCP - targetPort: 8080 - selector: - leaderworkerset.sigs.k8s.io/name: vllm - role: leader - type: ClusterIP -``` - -
+??? Yaml + + ```yaml + apiVersion: leaderworkerset.x-k8s.io/v1 + kind: LeaderWorkerSet + metadata: + name: vllm + spec: + replicas: 2 + leaderWorkerTemplate: + size: 2 + restartPolicy: RecreateGroupOnPodRestart + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - name: vllm-leader + image: docker.io/vllm/vllm-openai:latest + env: + - name: HUGGING_FACE_HUB_TOKEN + value: + command: + - sh + - -c + - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); + python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + ports: + - containerPort: 8080 + readinessProbe: + tcpSocket: + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + workerTemplate: + spec: + containers: + - name: vllm-worker + image: docker.io/vllm/vllm-openai:latest + command: + - sh + - -c + - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + env: + - name: HUGGING_FACE_HUB_TOKEN + value: + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + --- + apiVersion: v1 + kind: Service + metadata: + name: vllm-leader + spec: + ports: + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + leaderworkerset.sigs.k8s.io/name: vllm + role: leader + type: ClusterIP + ``` ```bash kubectl apply -f lws.yaml @@ -180,30 +177,27 @@ curl http://localhost:8080/v1/completions \ The output should be similar to the following -
-Output +??? Output -```text -{ - "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", - "object": "text_completion", - "created": 1715138766, - "model": "meta-llama/Meta-Llama-3.1-405B-Instruct", - "choices": [ + ```text { - "index": 0, - "text": " top destination for foodies, with", - "logprobs": null, - "finish_reason": "length", - "stop_reason": null + "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", + "object": "text_completion", + "created": 1715138766, + "model": "meta-llama/Meta-Llama-3.1-405B-Instruct", + "choices": [ + { + "index": 0, + "text": " top destination for foodies, with", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null + } + ], + "usage": { + "prompt_tokens": 5, + "total_tokens": 12, + "completion_tokens": 7 + } } - ], - "usage": { - "prompt_tokens": 5, - "total_tokens": 12, - "completion_tokens": 7 - } -} -``` - -
+ ``` diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index d67eb0e756e..b649312971b 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -24,53 +24,50 @@ sky check See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). -
-Config - -```yaml -resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - -envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - -setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - -run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 -``` - -
+??? Yaml + + ```yaml + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 + ``` Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): @@ -98,73 +95,67 @@ HF_TOKEN="your-huggingface-token" \ SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. -
-Config - -```yaml -service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 -``` - -
- -
-Click to see the full recipe YAML - -```yaml -service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? +??? Yaml + + ```yaml + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? max_completion_tokens: 1 + ``` -resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - -envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - -setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - -run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log -``` - -
+??? Yaml + + ```yaml + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + ``` Start the serving the Llama-3 8B model on multiple replicas: @@ -195,30 +186,27 @@ vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) R After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: -
-Commands - -```console -ENDPOINT=$(sky serve status --endpoint 8081 vllm) -curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' -``` - -
+??? Commands + + ```bash + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + ``` To enable autoscaling, you could replace the `replicas` with the following configs in `service`: @@ -232,57 +220,54 @@ service: This will scale the service up to when the QPS exceeds 2 for each replica. -
-Click to see the full recipe YAML - -```yaml -service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - -resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - -envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - -setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - -run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log -``` - -
+??? Yaml + + ```yaml + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + ``` To update the service with the new config: @@ -300,38 +285,35 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. -
-Click to see the full GUI YAML +??? Yaml -```yaml -envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - -resources: - cpus: 2 - -setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - # Install Gradio for web UI. - pip install gradio openai - -run: | - conda activate vllm - export PATH=$PATH:/sbin - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log -``` + ```yaml + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. -
+ resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + ``` 1. Start the chat web UI: diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md index b55fb5f0be6..2b1cc6f6fee 100644 --- a/docs/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -60,25 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai curl -o- http://localhost:30080/models ``` -
-Expected output +??? Output -```json -{ - "object": "list", - "data": [ + ```json { - "id": "facebook/opt-125m", - "object": "model", - "created": 1737428424, - "owned_by": "vllm", - "root": null + "object": "list", + "data": [ + { + "id": "facebook/opt-125m", + "object": "model", + "created": 1737428424, + "owned_by": "vllm", + "root": null + } + ] } - ] -} -``` - -
+ ``` To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint: @@ -92,26 +89,23 @@ curl -X POST http://localhost:30080/completions \ }' ``` -
-Expected output +??? Output -```json -{ - "id": "completion-id", - "object": "text_completion", - "created": 1737428424, - "model": "facebook/opt-125m", - "choices": [ + ```json { - "text": " there was a brave knight who...", - "index": 0, - "finish_reason": "length" + "id": "completion-id", + "object": "text_completion", + "created": 1737428424, + "model": "facebook/opt-125m", + "choices": [ + { + "text": " there was a brave knight who...", + "index": 0, + "finish_reason": "length" + } + ] } - ] -} -``` - -
+ ``` ### Uninstall @@ -127,28 +121,25 @@ sudo helm uninstall vllm The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above: -
-Yaml +??? Yaml -```yaml -servingEngineSpec: - runtimeClassName: "" - modelSpec: - - name: "opt125m" - repository: "vllm/vllm-openai" - tag: "latest" - modelURL: "facebook/opt-125m" + ```yaml + servingEngineSpec: + runtimeClassName: "" + modelSpec: + - name: "opt125m" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 1 - requestCPU: 6 - requestMemory: "16Gi" - requestGPU: 1 - - pvcStorage: "10Gi" -``` + requestCPU: 6 + requestMemory: "16Gi" + requestGPU: 1 -
+ pvcStorage: "10Gi" + ``` In this YAML configuration: * **`modelSpec`** includes: diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index f38afe99b64..13225ba208f 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -29,95 +29,89 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: -
-Config - -```bash -cat < + ```bash + cat < -Config - -```bash -cat < + persistentVolumeClaim: + claimName: vllm-models + --- + apiVersion: v1 + kind: Service + metadata: + name: vllm-server + spec: + selector: + app.kubernetes.io/name: vllm + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + type: ClusterIP + EOF + ``` We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): @@ -139,7 +133,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
- Config + Yaml ```yaml apiVersion: v1 @@ -179,7 +173,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) NVIDIA GPU:
- Config + Yaml ```yaml apiVersion: apps/v1 @@ -258,7 +252,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
- Config + Yaml ```yaml apiVersion: apps/v1 @@ -337,7 +331,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
- Config + Yaml ```yaml apiVersion: v1 diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index 89a7f3bd300..752be76b386 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -36,28 +36,25 @@ docker build . -f Dockerfile.nginx --tag nginx-lb Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. -
-Config +??? Config -```console -upstream backend { - least_conn; - server vllm0:8000 max_fails=3 fail_timeout=10000s; - server vllm1:8000 max_fails=3 fail_timeout=10000s; -} -server { - listen 80; - location / { - proxy_pass http://backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; + ```console + upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; } -} -``` - -
+ server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + ``` [](){ #nginxloadbalancer-nginx-vllm-container } @@ -98,35 +95,32 @@ Notes: - The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. -
-Command - -```console -mkdir -p ~/.cache/huggingface/hub/ -hf_cache_dir=~/.cache/huggingface/ -docker run \ - -itd \ - --ipc host \ - --network vllm_nginx \ - --gpus device=0 \ - --shm-size=10.24gb \ - -v $hf_cache_dir:/root/.cache/huggingface/ \ - -p 8081:8000 \ - --name vllm0 vllm \ - --model meta-llama/Llama-2-7b-chat-hf -docker run \ - -itd \ - --ipc host \ - --network vllm_nginx \ - --gpus device=1 \ - --shm-size=10.24gb \ - -v $hf_cache_dir:/root/.cache/huggingface/ \ - -p 8082:8000 \ - --name vllm1 vllm \ - --model meta-llama/Llama-2-7b-chat-hf -``` - -
+??? Commands + + ```console + mkdir -p ~/.cache/huggingface/hub/ + hf_cache_dir=~/.cache/huggingface/ + docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=0 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8081:8000 \ + --name vllm0 vllm \ + --model meta-llama/Llama-2-7b-chat-hf + docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=1 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8082:8000 \ + --name vllm1 vllm \ + --model meta-llama/Llama-2-7b-chat-hf + ``` !!! note If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md index ca5b6530ac8..9bfdab17007 100644 --- a/docs/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -22,36 +22,33 @@ server. Here is a sample of `LLM` class usage: -
-Code +??? Code -```python -from vllm import LLM, SamplingParams - -# Define a list of input prompts -prompts = [ - "Hello, my name is", - "The capital of France is", - "The largest ocean is", -] + ```python + from vllm import LLM, SamplingParams -# Define sampling parameters -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Define a list of input prompts + prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", + ] -# Initialize the LLM engine with the OPT-125M model -llm = LLM(model="facebook/opt-125m") + # Define sampling parameters + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Generate outputs for the input prompts -outputs = llm.generate(prompts, sampling_params) + # Initialize the LLM engine with the OPT-125M model + llm = LLM(model="facebook/opt-125m") -# Print the generated outputs -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` + # Generate outputs for the input prompts + outputs = llm.generate(prompts, sampling_params) -
+ # Print the generated outputs + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. @@ -183,37 +180,34 @@ vision-language model. To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: -
- Code - - ```python - class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - - from vllm.config import VllmConfig - class MyNewModel(MyOldModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - - if __version__ >= "0.6.4": - MyModel = MyNewModel - else: - MyModel = MyOldModel - ``` - -
+ ??? Code + + ```python + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + ``` This way, the model can work with both old and new versions of vLLM. diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index bf736b7a984..ff135a73196 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -448,32 +448,29 @@ elements of the entire head for all context tokens. However, overall, all results for output have been calculated but are just stored in different thread register memory. -
-Code +??? Code -```cpp -float* out_smem = reinterpret_cast(shared_mem); -for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. - const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ```cpp + float* out_smem = reinterpret_cast(shared_mem); + for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. ... - accs[i] += src[row_idx]; + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. } - - // Write out the accs. -} -``` - -
+ ``` ## Output diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 93a616c5273..944f0e680de 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -13,33 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: -
-Code - -```python -# inside `setup.py` file -from setuptools import setup - -setup(name='vllm_add_dummy_model', - version='0.1', - packages=['vllm_add_dummy_model'], - entry_points={ - 'vllm.general_plugins': - ["register_dummy_model = vllm_add_dummy_model:register"] - }) - -# inside `vllm_add_dummy_model.py` file -def register(): - from vllm import ModelRegistry - - if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model( - "MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava", - ) -``` - -
+??? Code + + ```python + # inside `setup.py` file + from setuptools import setup + + setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + + # inside `vllm_add_dummy_model.py` file + def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model( + "MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava", + ) + ``` For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). diff --git a/docs/features/lora.md b/docs/features/lora.md index eae9ef3fc53..4ccc3290e56 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -29,29 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and the third parameter is the path to the LoRA adapter. -
-Code +??? Code -```python -sampling_params = SamplingParams( - temperature=0, - max_tokens=256, - stop=["[/assistant]"] -) - -prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", -] - -outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) -) -``` + ```python + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", + ] -
+ outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + ) + ``` Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. @@ -73,29 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.): -
-Command +??? Command -```bash -curl localhost:8000/v1/models | jq . -{ - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - ... - }, - { - "id": "sql-lora", - "object": "model", - ... - } - ] -} -``` - -
+ ```bash + curl localhost:8000/v1/models | jq . + { + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] + } + ``` Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other @@ -178,39 +172,36 @@ Alternatively, follow these example steps to implement your own plugin: 1. Implement the LoRAResolver interface. -
- Example of a simple S3 LoRAResolver implementation - - ```python - import os - import s3fs - from vllm.lora.request import LoRARequest - from vllm.lora.resolver import LoRAResolver - - class S3LoRAResolver(LoRAResolver): - def __init__(self): - self.s3 = s3fs.S3FileSystem() - self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") - self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") - - async def resolve_lora(self, base_model_name, lora_name): - s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) - local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) - - # Download the LoRA from S3 to the local path - await self.s3._get( - s3_path, local_path, recursive=True, maxdepth=1 - ) - - lora_request = LoRARequest( - lora_name=lora_name, - lora_path=local_path, - lora_int_id=abs(hash(lora_name)) - ) - return lora_request - ``` - -
+ ??? Example of a simple S3 LoRAResolver implementation + + ```python + import os + import s3fs + from vllm.lora.request import LoRARequest + from vllm.lora.resolver import LoRAResolver + + class S3LoRAResolver(LoRAResolver): + def __init__(self): + self.s3 = s3fs.S3FileSystem() + self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") + self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") + + async def resolve_lora(self, base_model_name, lora_name): + s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + + # Download the LoRA from S3 to the local path + await self.s3._get( + s3_path, local_path, recursive=True, maxdepth=1 + ) + + lora_request = LoRARequest( + lora_name=lora_name, + lora_path=local_path, + lora_int_id=abs(hash(lora_name)) + ) + return lora_request + ``` 2. Register `LoRAResolver` plugin. @@ -247,43 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `root` field points to the artifact location of the lora adapter. -
-Command output +??? Command output -```bash -$ curl http://localhost:8000/v1/models - -{ - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", - "parent": null, - "permission": [ + ```bash + $ curl http://localhost:8000/v1/models + + { + "object": "list", + "data": [ { - ..... - } - ] - }, - { - "id": "sql-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - "parent": meta-llama/Llama-2-7b-hf, - "permission": [ + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", + "parent": null, + "permission": [ + { + ..... + } + ] + }, { - .... + "id": "sql-lora", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + "parent": meta-llama/Llama-2-7b-hf, + "permission": [ + { + .... + } + ] } ] - } - ] -} -``` - -
+ } + ``` diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 469587f2f6f..d4465beb859 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -20,126 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: -
-Code - -```python -from vllm import LLM - -llm = LLM(model="llava-hf/llava-1.5-7b-hf") - -# Refer to the HuggingFace repo for the correct format to use -prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - -# Load the image using PIL.Image -image = PIL.Image.open(...) - -# Single prompt inference -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image}, -}) - -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -# Batch inference -image_1 = PIL.Image.open(...) -image_2 = PIL.Image.open(...) -outputs = llm.generate( - [ - { - "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_1}, - }, - { - "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_2}, - } - ] -) +??? Code -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + ```python + from vllm import LLM + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Load the image using PIL.Image + image = PIL.Image.open(...) + + # Single prompt inference + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, + }) -
+ for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Batch inference + image_1 = PIL.Image.open(...) + image_2 = PIL.Image.open(...) + outputs = llm.generate( + [ + { + "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: -
-Code - -```python -from vllm import LLM +??? Code -llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept -) + ```python + from vllm import LLM -# Refer to the HuggingFace repo for the correct format to use -prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept + ) -# Load the images using PIL.Image -image1 = PIL.Image.open(...) -image2 = PIL.Image.open(...) + # Refer to the HuggingFace repo for the correct format to use + prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, -}) + # Load the images using PIL.Image + image1 = PIL.Image.open(...) + image2 = PIL.Image.open(...) -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, + }) -
+ for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: -
-Code +??? Code -```python -from vllm import LLM + ```python + from vllm import LLM -# Specify the maximum number of frames per video to be 4. This can be changed. -llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + # Specify the maximum number of frames per video to be 4. This can be changed. + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) -# Create the request payload. -video_frames = ... # load your video making sure it only has the number of frames specified earlier. -message = { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, - ], -} -for i in range(len(video_frames)): - base64_image = encode_image(video_frames[i]) # base64 encoding. - new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} - message["content"].append(new_image) - -# Perform inference and log output. -outputs = llm.chat([message]) - -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + # Create the request payload. + video_frames = ... # load your video making sure it only has the number of frames specified earlier. + message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], + } + for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + + # Perform inference and log output. + outputs = llm.chat([message]) -
+ for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` ### Video Inputs @@ -159,78 +150,72 @@ Full example: To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. -
-Code - -```python -from vllm import LLM +??? Code -# Inference with image embeddings as input -llm = LLM(model="llava-hf/llava-1.5-7b-hf") + ```python + from vllm import LLM -# Refer to the HuggingFace repo for the correct format to use -prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + # Inference with image embeddings as input + llm = LLM(model="llava-hf/llava-1.5-7b-hf") -# Embeddings for single image -# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) -image_embeds = torch.load(...) + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, -}) + # Embeddings for single image + # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) -
+ for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: -
-Code +??? Code -```python -# Construct the prompt based on your model -prompt = ... + ```python + # Construct the prompt based on your model + prompt = ... -# Embeddings for multiple images -# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) -image_embeds = torch.load(...) + # Embeddings for multiple images + # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) -# Qwen2-VL -llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) -mm_data = { - "image": { - "image_embeds": image_embeds, - # image_grid_thw is needed to calculate positional encoding. - "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), - } -} - -# MiniCPM-V -llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) -mm_data = { - "image": { - "image_embeds": image_embeds, - # image_sizes is needed to calculate details of the sliced image. - "image_sizes": [image.size for image in images], # list of image sizes + # Qwen2-VL + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } } -} -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, -}) + # MiniCPM-V + llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_sizes is needed to calculate details of the sliced image. + "image_sizes": [image.size for image in images], # list of image sizes + } + } -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) -
+ for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` ## Online Serving @@ -260,56 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ Then, you can use the OpenAI client as follows: -
-Code - -```python -from openai import OpenAI - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -# Single-image input inference -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - -chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }], -) -print("Chat completion output:", chat_response.choices[0].message.content) - -# Multi-image input inference -image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" -image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" - -chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, - ], - }], -) -print("Chat completion output:", chat_response.choices[0].message.content) -``` - -
+??? Code + + ```python + from openai import OpenAI + + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Single-image input inference + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + + # Multi-image input inference + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + ``` Full example: @@ -341,49 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model Then, you can use the OpenAI client as follows: -
-Code +??? Code -```python -from openai import OpenAI + ```python + from openai import OpenAI -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" + video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" -## Use video url in the payload -chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" - }, - { - "type": "video_url", - "video_url": { - "url": video_url + ## Use video url in the payload + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this video?" }, - }, - ], - }], - model=model, - max_completion_tokens=64, -) - -result = chat_completion_from_url.choices[0].message.content -print("Chat completion output from image url:", result) -``` - -
+ { + "type": "video_url", + "video_url": { + "url": video_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from image url:", result) + ``` Full example: @@ -408,94 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b Then, you can use the OpenAI client as follows: -
-Code +??? Code -```python -import base64 -import requests -from openai import OpenAI -from vllm.assets.audio import AudioAsset + ```python + import base64 + import requests + from openai import OpenAI + from vllm.assets.audio import AudioAsset -def encode_base64_content_from_url(content_url: str) -> str: - """Encode a content retrieved from a remote url to base64 format.""" + def encode_base64_content_from_url(content_url: str) -> str: + """Encode a content retrieved from a remote url to base64 format.""" - with requests.get(content_url) as response: - response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') + with requests.get(content_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') - return result + return result -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Any format supported by librosa is supported -audio_url = AudioAsset("winning_call").url -audio_base64 = encode_base64_content_from_url(audio_url) + # Any format supported by librosa is supported + audio_url = AudioAsset("winning_call").url + audio_base64 = encode_base64_content_from_url(audio_url) -chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" }, - }, - ], - }], - model=model, - max_completion_tokens=64, -) - -result = chat_completion_from_base64.choices[0].message.content -print("Chat completion output from input audio:", result) -``` - -
+ { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from input audio:", result) + ``` Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: -
-Code +??? Code -```python -chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url + ```python + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" }, - }, - ], - }], - model=model, - max_completion_tokens=64, -) - -result = chat_completion_from_url.choices[0].message.content -print("Chat completion output from audio url:", result) -``` - -
+ { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from audio url:", result) + ``` Full example: @@ -515,66 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. The following example demonstrates how to pass image embeddings to the OpenAI server: -
-Code - -```python -image_embedding = torch.load(...) -grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct - -buffer = io.BytesIO() -torch.save(image_embedding, buffer) -buffer.seek(0) -binary_data = buffer.read() -base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') - -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -# Basic usage - this is equivalent to the LLaVA example for offline inference -model = "llava-hf/llava-1.5-7b-hf" -embeds = { - "type": "image_embeds", - "image_embeds": f"{base64_image_embedding}" -} - -# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) -model = "Qwen/Qwen2-VL-2B-Instruct" -embeds = { - "type": "image_embeds", - "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct - }, -} -model = "openbmb/MiniCPM-V-2_6" -embeds = { - "type": "image_embeds", - "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 - }, -} -chat_completion = client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ - { - "type": "text", - "text": "What's in this image?", - }, - embeds, - ], - }, -], - model=model, -) -``` +??? Code + + ```python + image_embedding = torch.load(...) + grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct + + buffer = io.BytesIO() + torch.save(image_embedding, buffer) + buffer.seek(0) + binary_data = buffer.read() + base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') + + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Basic usage - this is equivalent to the LLaVA example for offline inference + model = "llava-hf/llava-1.5-7b-hf" + embeds = { + "type": "image_embeds", + "image_embeds": f"{base64_image_embedding}" + } -
+ # Pass additional parameters (available to Qwen2-VL and MiniCPM-V) + model = "Qwen/Qwen2-VL-2B-Instruct" + embeds = { + "type": "image_embeds", + "image_embeds": { + "image_embeds": f"{base64_image_embedding}" , # Required + "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct + }, + } + model = "openbmb/MiniCPM-V-2_6" + embeds = { + "type": "image_embeds", + "image_embeds": { + "image_embeds": f"{base64_image_embedding}" , # Required + "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 + }, + } + chat_completion = client.chat.completions.create( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + { + "type": "text", + "text": "What's in this image?", + }, + embeds, + ], + }, + ], + model=model, + ) + ``` !!! note Only one message can contain `{"type": "image_embeds"}`. diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 92f02fb91ba..8362672f40b 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -15,34 +15,31 @@ pip install autoawq After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: -
-Code +??? Code -```python -from awq import AutoAWQForCausalLM -from transformers import AutoTokenizer + ```python + from awq import AutoAWQForCausalLM + from transformers import AutoTokenizer -model_path = 'mistralai/Mistral-7B-Instruct-v0.2' -quant_path = 'mistral-instruct-v0.2-awq' -quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model_path = 'mistralai/Mistral-7B-Instruct-v0.2' + quant_path = 'mistral-instruct-v0.2-awq' + quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } -# Load model -model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} -) -tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + # Load model + model = AutoAWQForCausalLM.from_pretrained( + model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + ) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) -# Quantize -model.quantize(tokenizer, quant_config=quant_config) + # Quantize + model.quantize(tokenizer, quant_config=quant_config) -# Save quantized model -model.save_quantized(quant_path) -tokenizer.save_pretrained(quant_path) + # Save quantized model + model.save_quantized(quant_path) + tokenizer.save_pretrained(quant_path) -print(f'Model is quantized and saved at "{quant_path}"') -``` - -
+ print(f'Model is quantized and saved at "{quant_path}"') + ``` To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: @@ -54,32 +51,29 @@ python examples/offline_inference/llm_engine_example.py \ AWQ models are also supported directly through the LLM entrypoint: -
-Code - -```python -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - -
+??? Code + + ```python + from vllm import LLM, SamplingParams + + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. + llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index ddda72a6917..3f8ae7a959c 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -27,9 +27,6 @@ Usually, these repositories have a `quantize_config.json` file that includes a ` ## Read bitblas format checkpoint -
-Code - ```python from vllm import LLM import torch @@ -44,26 +41,21 @@ llm = LLM( ) ``` -
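+The same pre-quantized checkpoint can also be served online. A minimal sketch, assuming your build
+of vLLM exposes `bitblas` as a `--quantization` choice:
+
+```bash
+vllm serve hxbgsyxh/llama-13b-4bit-g-1 \
+    --quantization bitblas \
+    --dtype float16 \
+    --trust-remote-code \
+    --max-model-len 1024
+```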
- ## Read gptq format checkpoint -
-Code - -```python -from vllm import LLM -import torch - -# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. -model_id = "hxbgsyxh/llama-13b-4bit-g-1" -llm = LLM( - model=model_id, - dtype=torch.float16, - trust_remote_code=True, - quantization="bitblas", - max_model_len=1024 -) -``` - -
+??? Code + + ```python + from vllm import LLM + import torch + + # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. + model_id = "hxbgsyxh/llama-13b-4bit-g-1" + llm = LLM( + model=model_id, + dtype=torch.float16, + trust_remote_code=True, + quantization="bitblas", + max_model_len=1024 + ) + ``` diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 3f405db0acc..ec7639af805 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -58,27 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. -
-Code +??? Code -```python -from llmcompressor.transformers import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -# Configure the simple PTQ quantization -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + ```python + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import QuantizationModifier -# Apply the quantization algorithm. -oneshot(model=model, recipe=recipe) + # Configure the simple PTQ quantization + recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) -# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) -``` + # Apply the quantization algorithm. + oneshot(model=model, recipe=recipe) -
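+Once saved, the checkpoint can be loaded back into vLLM for a quick smoke test. A minimal sketch,
+assuming the `SAVE_DIR` produced by the script in this section:
+
+```python
+from vllm import LLM
+
+# Directory written by model.save_pretrained(SAVE_DIR) in this section.
+llm = LLM(model="Meta-Llama-3-8B-Instruct-FP8-Dynamic")
+print(llm.generate("Hello, my name is")[0].outputs[0].text)
+```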
+ # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic + SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" + model.save_pretrained(SAVE_DIR) + tokenizer.save_pretrained(SAVE_DIR) + ``` ### 3. Evaluating Accuracy diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index fe4422eb841..014b513eeda 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -41,47 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ You can also use the GGUF model directly through the LLM entrypoint: -
-Code - -```python -from vllm import LLM, SamplingParams - -# In this script, we demonstrate how to pass input to the chat method: -conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, -] - -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.chat(conversation, sampling_params) - -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - -
+??? Code + + ```python + from vllm import LLM, SamplingParams + + # In this script, we demonstrate how to pass input to the chat method: + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. + llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.chat(conversation, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 4f558aefd3d..2f088f474f1 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -31,33 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: -
-Code +??? Code -```python -from datasets import load_dataset -from gptqmodel import GPTQModel, QuantizeConfig + ```python + from datasets import load_dataset + from gptqmodel import GPTQModel, QuantizeConfig -model_id = "meta-llama/Llama-3.2-1B-Instruct" -quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" + model_id = "meta-llama/Llama-3.2-1B-Instruct" + quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" -calibration_dataset = load_dataset( - "allenai/c4", - data_files="en/c4-train.00001-of-01024.json.gz", - split="train" - ).select(range(1024))["text"] + calibration_dataset = load_dataset( + "allenai/c4", + data_files="en/c4-train.00001-of-01024.json.gz", + split="train" + ).select(range(1024))["text"] -quant_config = QuantizeConfig(bits=4, group_size=128) + quant_config = QuantizeConfig(bits=4, group_size=128) -model = GPTQModel.load(model_id, quant_config) + model = GPTQModel.load(model_id, quant_config) -# increase `batch_size` to match gpu/vram specs to speed up quantization -model.quantize(calibration_dataset, batch_size=2) + # increase `batch_size` to match gpu/vram specs to speed up quantization + model.quantize(calibration_dataset, batch_size=2) -model.save(quant_path) -``` - -
+ model.save(quant_path) + ``` ## Running a quantized model with vLLM @@ -72,37 +69,34 @@ python examples/offline_inference/llm_engine_example.py \ GPTQModel quantized models are also supported directly through the LLM entrypoint: -
-Code +??? Code -```python -from vllm import LLM, SamplingParams + ```python + from vllm import LLM, SamplingParams -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.6, top_p=0.9) + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.6, top_p=0.9) -# Create an LLM. -llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") + # Create an LLM. + llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -print("-"*50) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + # Print the outputs. print("-"*50) -``` - -
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-"*50) + ``` diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index cf8ff3dd9f0..185e13649f4 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -53,61 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: -
-Code +??? Code -```python -from datasets import load_dataset - -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 + ```python + from datasets import load_dataset -# Load and preprocess the dataset -ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + NUM_CALIBRATION_SAMPLES = 512 + MAX_SEQUENCE_LENGTH = 2048 -def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} -ds = ds.map(preprocess) + # Load and preprocess the dataset + ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") + ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) -def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) -ds = ds.map(tokenize, remove_columns=ds.column_names) -``` + def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} + ds = ds.map(preprocess) -
+ def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) + ds = ds.map(tokenize, remove_columns=ds.column_names) + ``` ### 3. Applying Quantization Now, apply the quantization algorithms: -
-Code +??? Code -```python -from llmcompressor.transformers import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - -# Configure the quantization algorithms -recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) - -# Apply quantization -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) + ```python + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) -``` + # Configure the quantization algorithms + recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) -
+ # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 + SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + ``` This process creates a W4A16 model with weights quantized to 4-bit integers. @@ -147,39 +141,36 @@ $ lm_eval --model vllm \ The following is an example of an expanded quantization recipe you can tune to your own use case: -
-Code - -```python -from compressed_tensors.quantization import ( - QuantizationArgs, - QuantizationScheme, - QuantizationStrategy, - QuantizationType, -) -recipe = GPTQModifier( - targets="Linear", - config_groups={ - "config_group": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs( - num_bits=4, - type=QuantizationType.INT, - strategy=QuantizationStrategy.GROUP, - group_size=128, - symmetric=True, - dynamic=False, - actorder="weight", +??? Code + + ```python + from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, + QuantizationType, + ) + recipe = GPTQModifier( + targets="Linear", + config_groups={ + "config_group": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + type=QuantizationType.INT, + strategy=QuantizationStrategy.GROUP, + group_size=128, + symmetric=True, + dynamic=False, + actorder="weight", + ), ), - ), - }, - ignore=["lm_head"], - update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01 -) -``` - -
+ }, + ignore=["lm_head"], + update_size=NUM_CALIBRATION_SAMPLES, + dampening_frac=0.01 + ) + ``` ## Troubleshooting and Support diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index cdc06b1aed9..de5ae5c0440 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -54,27 +54,26 @@ When quantizing activations to INT8, you need sample data to estimate the activa It's best to use calibration data that closely matches your deployment data. For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: -
-Code +??? Code -```python -from datasets import load_dataset + ```python + from datasets import load_dataset -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 + NUM_CALIBRATION_SAMPLES = 512 + MAX_SEQUENCE_LENGTH = 2048 -# Load and preprocess the dataset -ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + # Load and preprocess the dataset + ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") + ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) -def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} -ds = ds.map(preprocess) + def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} + ds = ds.map(preprocess) -def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) -ds = ds.map(tokenize, remove_columns=ds.column_names) -``` + def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) + ds = ds.map(tokenize, remove_columns=ds.column_names) + ```
@@ -82,36 +81,33 @@ ds = ds.map(tokenize, remove_columns=ds.column_names) Now, apply the quantization algorithms: -
-Code - -```python -from llmcompressor.transformers import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - -# Configure the quantization algorithms -recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), -] - -# Apply quantization -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token -SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) -``` - -
+??? Code + + ```python + from llmcompressor.transformers import oneshot + from llmcompressor.modifiers.quantization import GPTQModifier + from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + + # Configure the quantization algorithms + recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), + ] + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + ) + + # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token + SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + ``` This process creates a W8A8 model with weights and activations quantized to 8-bit integers. diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 5282746d5e6..0bb6003832b 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -14,29 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te Below is an example showing how to quantize a model using modelopt's PTQ API: -
-Code +??? Code -```python -import modelopt.torch.quantization as mtq -from transformers import AutoModelForCausalLM - -# Load the model from HuggingFace -model = AutoModelForCausalLM.from_pretrained("") + ```python + import modelopt.torch.quantization as mtq + from transformers import AutoModelForCausalLM -# Select the quantization config, for example, FP8 -config = mtq.FP8_DEFAULT_CFG + # Load the model from HuggingFace + model = AutoModelForCausalLM.from_pretrained("") -# Define a forward loop function for calibration -def forward_loop(model): - for data in calib_set: - model(data) + # Select the quantization config, for example, FP8 + config = mtq.FP8_DEFAULT_CFG -# PTQ with in-place replacement of quantized modules -model = mtq.quantize(model, config, forward_loop) -``` + # Define a forward loop function for calibration + def forward_loop(model): + for data in calib_set: + model(data) -
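+The PTQ snippet in this section assumes a `calib_set` iterable that supplies calibration batches.
+A minimal sketch of one way to build it from a handful of prompts (the model ID and prompts here
+are illustrative only):
+
+```python
+from transformers import AutoTokenizer
+
+# Use the same model you load for quantization; this ID is only an example.
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+
+prompts = [
+    "Hello, my name is",
+    "The capital of France is",
+]
+# Each entry is an input_ids tensor; move it to the model's device before calling model(data).
+calib_set = [tokenizer(p, return_tensors="pt").input_ids for p in prompts]
+```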
+ # PTQ with in-place replacement of quantized modules + model = mtq.quantize(model, config, forward_loop) + ``` After the model is quantized, you can export it to a quantized checkpoint using the export API: @@ -53,36 +50,33 @@ with torch.inference_mode(): The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: -
-Code +??? Code -```python -from vllm import LLM, SamplingParams - -def main(): + ```python + from vllm import LLM, SamplingParams - model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" - # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint - llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) + def main(): - sampling_params = SamplingParams(temperature=0.8, top_p=0.9) + model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" + # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.9) - outputs = llm.generate(prompts, sampling_params) + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + outputs = llm.generate(prompts, sampling_params) -if __name__ == "__main__": - main() -``` + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -
+ if __name__ == "__main__": + main() + ``` diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index 5e104367c8c..52b8d38ace1 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -35,25 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades Here is an example of how to enable FP8 quantization: -
-Code +??? Code -```python -# To calculate kv cache scales on the fly enable the calculate_kv_scales -# parameter - -from vllm import LLM, SamplingParams + ```python + # To calculate kv cache scales on the fly enable the calculate_kv_scales + # parameter -sampling_params = SamplingParams(temperature=0.7, top_p=0.8) -llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - calculate_kv_scales=True) -prompt = "London is the capital of" -out = llm.generate(prompt, sampling_params)[0].outputs[0].text -print(out) -``` + from vllm import LLM, SamplingParams -
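+The same KV cache settings can be applied when serving online. A minimal sketch, assuming the
+matching CLI flags are available in your build:
+
+```bash
+vllm serve meta-llama/Llama-2-7b-chat-hf \
+    --kv-cache-dtype fp8 \
+    --calculate-kv-scales
+```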
+ sampling_params = SamplingParams(temperature=0.7, top_p=0.8) + llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True) + prompt = "London is the capital of" + out = llm.generate(prompt, sampling_params)[0].outputs[0].text + print(out) + ``` The `kv_cache_dtype` argument specifies the data type for KV cache storage: - `"auto"`: Uses the model's default "unquantized" data type @@ -76,72 +73,69 @@ pip install llmcompressor Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): -
-Code - -```python -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import oneshot - -# Select model and load it -MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - -# Configure calibration parameters -NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point -MAX_SEQUENCE_LENGTH = 2048 - -# Load and preprocess dataset -ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - -def process_and_tokenize(example): - text = tokenizer.apply_chat_template(example["messages"], tokenize=False) - return tokenizer( - text, - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, +??? Code + + ```python + from datasets import load_dataset + from transformers import AutoModelForCausalLM, AutoTokenizer + from llmcompressor.transformers import oneshot + + # Select model and load it + MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + + # Select calibration dataset + DATASET_ID = "HuggingFaceH4/ultrachat_200k" + DATASET_SPLIT = "train_sft" + + # Configure calibration parameters + NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point + MAX_SEQUENCE_LENGTH = 2048 + + # Load and preprocess dataset + ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) + ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + def process_and_tokenize(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + text, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) + + # Configure quantization settings + recipe = """ + quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true + """ + + # Apply quantization + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) - -# Configure quantization settings -recipe = """ -quant_stage: - quant_modifiers: - QuantizationModifier: - kv_cache_scheme: - num_bits: 8 - type: float - strategy: tensor - dynamic: false - symmetric: true -""" - -# Apply quantization -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) -``` - -
+ # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV + SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + ``` The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 1d24f07d594..6e77584da23 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -42,25 +42,22 @@ The Quark quantization process can be listed for 5 steps as below: Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) to fetch model and tokenizer. -
-Code +??? Code -```python -from transformers import AutoTokenizer, AutoModelForCausalLM + ```python + from transformers import AutoTokenizer, AutoModelForCausalLM -MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" -MAX_SEQ_LEN = 512 + MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" + MAX_SEQ_LEN = 512 -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", -) -model.eval() + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", + ) + model.eval() -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) -tokenizer.pad_token = tokenizer.eos_token -``` - -
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) + tokenizer.pad_token = tokenizer.eos_token + ``` ### 2. Prepare the Calibration Dataloader @@ -68,27 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic to load calibration data. For more details about how to use calibration datasets efficiently, please refer to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). -
-Code +??? Code -```python -from datasets import load_dataset -from torch.utils.data import DataLoader + ```python + from datasets import load_dataset + from torch.utils.data import DataLoader -BATCH_SIZE = 1 -NUM_CALIBRATION_DATA = 512 + BATCH_SIZE = 1 + NUM_CALIBRATION_DATA = 512 -# Load the dataset and get calibration data. -dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") -text_data = dataset["text"][:NUM_CALIBRATION_DATA] + # Load the dataset and get calibration data. + dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") + text_data = dataset["text"][:NUM_CALIBRATION_DATA] -tokenized_outputs = tokenizer(text_data, return_tensors="pt", - padding=True, truncation=True, max_length=MAX_SEQ_LEN) -calib_dataloader = DataLoader(tokenized_outputs['input_ids'], - batch_size=BATCH_SIZE, drop_last=True) -``` - -
+ tokenized_outputs = tokenizer(text_data, return_tensors="pt", + padding=True, truncation=True, max_length=MAX_SEQ_LEN) + calib_dataloader = DataLoader(tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, drop_last=True) + ``` ### 3. Set the Quantization Configuration @@ -104,47 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. AutoSmoothQuant config file for Llama is `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. -
-Code - -```python -from quark.torch.quantization import (Config, QuantizationConfig, - FP8E4M3PerTensorSpec, - load_quant_algo_config_from_file) - -# Define fp8/per-tensor/static spec. -FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", - is_dynamic=False).to_quantization_spec() - -# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. -global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, - weight=FP8_PER_TENSOR_SPEC) - -# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. -KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC -kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] -kv_cache_quant_config = {name : - QuantizationConfig(input_tensors=global_quant_config.input_tensors, - weight=global_quant_config.weight, - output_tensors=KV_CACHE_SPEC) - for name in kv_cache_layer_names_for_llama} -layer_quant_config = kv_cache_quant_config.copy() - -# Define algorithm config by config file. -LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = - 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' -algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) - -EXCLUDE_LAYERS = ["lm_head"] -quant_config = Config( - global_quant_config=global_quant_config, - layer_quant_config=layer_quant_config, - kv_cache_quant_config=kv_cache_quant_config, - exclude=EXCLUDE_LAYERS, - algo_config=algo_config) -``` - -
+??? Code + + ```python + from quark.torch.quantization import (Config, QuantizationConfig, + FP8E4M3PerTensorSpec, + load_quant_algo_config_from_file) + + # Define fp8/per-tensor/static spec. + FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", + is_dynamic=False).to_quantization_spec() + + # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. + global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC) + + # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. + KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC + kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] + kv_cache_quant_config = {name : + QuantizationConfig(input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC) + for name in kv_cache_layer_names_for_llama} + layer_quant_config = kv_cache_quant_config.copy() + + # Define algorithm config by config file. + LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = + 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' + algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) + + EXCLUDE_LAYERS = ["lm_head"] + quant_config = Config( + global_quant_config=global_quant_config, + layer_quant_config=layer_quant_config, + kv_cache_quant_config=kv_cache_quant_config, + exclude=EXCLUDE_LAYERS, + algo_config=algo_config) + ``` ### 4. Quantize the Model and Export @@ -154,73 +145,67 @@ HuggingFace `safetensors`, you can refer to [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) for more exporting format details. -
-Code - -```python -import torch -from quark.torch import ModelQuantizer, ModelExporter -from quark.torch.export import ExporterConfig, JsonExporterConfig +??? Code -# Apply quantization. -quantizer = ModelQuantizer(quant_config) -quant_model = quantizer.quantize_model(model, calib_dataloader) + ```python + import torch + from quark.torch import ModelQuantizer, ModelExporter + from quark.torch.export import ExporterConfig, JsonExporterConfig -# Freeze quantized model to export. -freezed_model = quantizer.freeze(model) + # Apply quantization. + quantizer = ModelQuantizer(quant_config) + quant_model = quantizer.quantize_model(model, calib_dataloader) -# Define export config. -LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] -export_config = ExporterConfig(json_export_config=JsonExporterConfig()) -export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP + # Freeze quantized model to export. + freezed_model = quantizer.freeze(model) -# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant -EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" -exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) -with torch.no_grad(): - exporter.export_safetensors_model(freezed_model, - quant_config=quant_config, tokenizer=tokenizer) -``` + # Define export config. + LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] + export_config = ExporterConfig(json_export_config=JsonExporterConfig()) + export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP -
+ # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant + EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" + exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) + with torch.no_grad(): + exporter.export_safetensors_model(freezed_model, + quant_config=quant_config, tokenizer=tokenizer) + ``` ### 5. Evaluation in vLLM Now, you can load and run the Quark quantized model directly through the LLM entrypoint: -
-Code - -```python -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", - kv_cache_dtype='fp8',quantization='quark') -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -print("\nGenerated Outputs:\n" + "-" * 60) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}") - print(f"Output: {generated_text!r}") - print("-" * 60) -``` - -
+??? Code + + ```python + from vllm import LLM, SamplingParams + + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. + llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype='fp8',quantization='quark') + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + ``` Or, you can use `lm_eval` to evaluate accuracy: diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index d8907f427c8..c45979a3611 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -15,31 +15,28 @@ pip install \ ## Quantizing HuggingFace Models You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: -
-Code - -```Python -import torch -from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer -from torchao.quantization import Int8WeightOnlyConfig - -model_name = "meta-llama/Meta-Llama-3-8B" -quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) -quantized_model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype="auto", - device_map="auto", - quantization_config=quantization_config -) -tokenizer = AutoTokenizer.from_pretrained(model_name) -input_text = "What are we having for dinner?" -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") - -hub_repo = # YOUR HUB REPO ID -tokenizer.push_to_hub(hub_repo) -quantized_model.push_to_hub(hub_repo, safe_serialization=False) -``` - -
+??? Code + + ```Python + import torch + from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer + from torchao.quantization import Int8WeightOnlyConfig + + model_name = "meta-llama/Meta-Llama-3-8B" + quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) + quantized_model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config + ) + tokenizer = AutoTokenizer.from_pretrained(model_name) + input_text = "What are we having for dinner?" + input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + + hub_repo = # YOUR HUB REPO ID + tokenizer.push_to_hub(hub_repo) + quantized_model.push_to_hub(hub_repo, safe_serialization=False) + ``` Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 22f3af7e864..2e6afe61663 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -33,39 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ Next, make a request to the model that should return the reasoning content in the response. -
-Code +??? Code -```python -from openai import OpenAI + ```python + from openai import OpenAI -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -models = client.models.list() -model = models.data[0].id + models = client.models.list() + model = models.data[0].id -# Round 1 -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` -# For Qwen3 series, if you want to disable thinking in reasoning mode, add: -# extra_body={"chat_template_kwargs": {"enable_thinking": False}} -response = client.chat.completions.create(model=model, messages=messages) + # Round 1 + messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + # For Qwen3 series, if you want to disable thinking in reasoning mode, add: + # extra_body={"chat_template_kwargs": {"enable_thinking": False}} + response = client.chat.completions.create(model=model, messages=messages) -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content -print("reasoning_content:", reasoning_content) -print("content:", content) -``` - -
+ print("reasoning_content:", reasoning_content) + print("content:", content) + ``` The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. @@ -73,87 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). -
-Output - -```json -{ - "id": "chatcmpl-123", - "object": "chat.completion.chunk", - "created": 1694268190, - "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "system_fingerprint": "fp_44709d6fcb", - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - "reasoning_content": "is", - }, - "logprobs": null, - "finish_reason": null - } - ] -} -``` - -
+??? Json + + ```json + { + "id": "chatcmpl-123", + "object": "chat.completion.chunk", + "created": 1694268190, + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": "is", + }, + "logprobs": null, + "finish_reason": null + } + ] + } + ``` OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example: -
-Code - -```python -from openai import OpenAI - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` -# For Qwen3 series, if you want to disable thinking in reasoning mode, add: -# extra_body={"chat_template_kwargs": {"enable_thinking": False}} -stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) - -print("client: Start streaming chat completions...") -printed_reasoning_content = False -printed_content = False - -for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content - - if reasoning_content is not None: - if not printed_reasoning_content: - printed_reasoning_content = True - print("reasoning_content:", end="", flush=True) - print(reasoning_content, end="", flush=True) - elif content is not None: - if not printed_content: - printed_content = True - print("\ncontent:", end="", flush=True) - # Extract and print the content - print(content, end="", flush=True) -``` - -
+??? Code + + ```python + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + # For Qwen3 series, if you want to disable thinking in reasoning mode, add: + # extra_body={"chat_template_kwargs": {"enable_thinking": False}} + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + + print("client: Start streaming chat completions...") + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True) + ``` Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). @@ -161,46 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. -
-Code - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") - -tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["location", "unit"] +??? Code + + ```python + from openai import OpenAI + + client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") + + tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"] + } } - } -}] - -response = client.chat.completions.create( - model=client.models.list().data[0].id, - messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], - tools=tools, - tool_choice="auto" -) + }] -print(response) -tool_call = response.choices[0].message.tool_calls[0].function + response = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], + tools=tools, + tool_choice="auto" + ) -print(f"reasoning_content: {response.choices[0].message.reasoning_content}") -print(f"Function called: {tool_call.name}") -print(f"Arguments: {tool_call.arguments}") -``` + print(response) + tool_call = response.choices[0].message.tool_calls[0].function -
+ print(f"reasoning_content: {response.choices[0].message.reasoning_content}") + print(f"Function called: {tool_call.name}") + print(f"Arguments: {tool_call.arguments}") + ``` For more examples, please refer to . @@ -212,95 +200,89 @@ For more examples, please refer to . -
-Code - -```python -# import the required packages - -from vllm.reasoning import ReasoningParser, ReasoningParserManager -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) - -# define a reasoning parser and register it to vllm -# the name list in register_module can be used -# in --reasoning-parser. -@ReasoningParserManager.register_module(["example"]) -class ExampleParser(ReasoningParser): - def __init__(self, tokenizer: AnyTokenizer): - super().__init__(tokenizer) - - def extract_reasoning_content_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - ) -> Union[DeltaMessage, None]: - """ - Instance method that should be implemented for extracting reasoning - from an incomplete response; for use when handling reasoning calls and - streaming. Has to be an instance method because it requires state - - the current tokens/diffs, but also the information about what has - previously been parsed and extracted (see constructor) - """ - - def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest - ) -> tuple[Optional[str], Optional[str]]: - """ - Extract reasoning content from a complete model-generated string. - - Used for non-streaming responses where we have the entire model response - available before sending to the client. - - Parameters: - model_output: str - The model-generated string to extract reasoning content from. - - request: ChatCompletionRequest - The request object that was used to generate the model_output. - - Returns: - tuple[Optional[str], Optional[str]] - A tuple containing the reasoning content and the content. - """ -``` - -
+??? Code + + ```python + # import the required packages + + from vllm.reasoning import ReasoningParser, ReasoningParserManager + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + + # define a reasoning parser and register it to vllm + # the name list in register_module can be used + # in --reasoning-parser. + @ReasoningParserManager.register_module(["example"]) + class ExampleParser(ReasoningParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. + """ + ``` Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in . -
-Code - -```python -@dataclass -class DeepSeekReasoner(Reasoner): - """ - Reasoner for DeepSeek R series models. - """ - start_token_id: int - end_token_id: int - - start_token: str = "" - end_token: str = "" - - @classmethod - def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - return cls(start_token_id=tokenizer.encode( - "", add_special_tokens=False)[0], - end_token_id=tokenizer.encode("", - add_special_tokens=False)[0]) - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.end_token_id in input_ids - ... -``` +??? Code -
+ ```python + @dataclass + class DeepSeekReasoner(Reasoner): + """ + Reasoner for DeepSeek R series models. + """ + start_token_id: int + end_token_id: int + + start_token: str = "" + end_token: str = "" + + @classmethod + def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: + return cls(start_token_id=tokenizer.encode( + "", add_special_tokens=False)[0], + end_token_id=tokenizer.encode("", + add_special_tokens=False)[0]) + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return self.end_token_id in input_ids + ... + ``` The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case. diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index e28a6036c90..7055cde1e99 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -18,40 +18,34 @@ Speculative decoding is a technique which improves inter-token latency in memory The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. -
-Code - -```python -from vllm import LLM, SamplingParams - -prompts = [ - "The future of AI is", -] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_config={ - "model": "facebook/opt-125m", - "num_speculative_tokens": 5, - }, -) -outputs = llm.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - -
+??? Code + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_config={ + "model": "facebook/opt-125m", + "num_speculative_tokens": 5, + }, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` To perform the same with an online mode launch the server: -
-<summary>Command</summary>
-
 ```bash
 python -m vllm.entrypoints.openai.api_server \
     --host 0.0.0.0 \
     --port 8000 \
     --model facebook/opt-6.7b \
     --seed 42 \
     -tp 1 \
     --gpu_memory_utilization 0.8 \
     --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
 ```

-</details>
- !!! warning Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. Then use a client: -
-<summary>Code</summary>
-
-```python
-from openai import OpenAI
-
-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-# Completion API
-stream = False
-completion = client.completions.create(
-    model=model,
-    prompt="The future of AI is",
-    echo=False,
-    n=1,
-    stream=stream,
-)
-
-print("Completion results:")
-if stream:
-    for c in completion:
-        print(c)
-else:
-    print(completion)
-```
-
-</details>
+??? Code + + ```python + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, + ) + + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + ``` ## Speculating by matching n-grams in the prompt The following code configures vLLM to use speculative decoding where proposals are generated by matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) -
-<summary>Code</summary>
-
-```python
-from vllm import LLM, SamplingParams
-
-prompts = [
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-llm = LLM(
-    model="facebook/opt-6.7b",
-    tensor_parallel_size=1,
-    speculative_config={
-        "method": "ngram",
-        "num_speculative_tokens": 5,
-        "prompt_lookup_max": 4,
-    },
-)
-outputs = llm.generate(prompts, sampling_params)
-
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
-
-</details>
+??? Code + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_config={ + "method": "ngram", + "num_speculative_tokens": 5, + "prompt_lookup_max": 4, + }, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` ## Speculating using MLP speculators @@ -151,34 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or [this technical report](https://arxiv.org/abs/2404.19124). -
-Code - -```python -from vllm import LLM, SamplingParams - -prompts = [ - "The future of AI is", -] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -llm = LLM( - model="meta-llama/Meta-Llama-3.1-70B-Instruct", - tensor_parallel_size=4, - speculative_config={ - "model": "ibm-ai-platform/llama3-70b-accelerator", - "draft_tensor_parallel_size": 1, - }, -) -outputs = llm.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` +??? Code + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -
+ llm = LLM( + model="meta-llama/Meta-Llama-3.1-70B-Instruct", + tensor_parallel_size=4, + speculative_config={ + "model": "ibm-ai-platform/llama3-70b-accelerator", + "draft_tensor_parallel_size": 1, + }, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` Note that these speculative models currently need to be run without tensor parallelism, although it is possible to run the main model using tensor parallelism (see example above). Since the @@ -202,36 +185,33 @@ A variety of speculative models of this type are available on HF hub: The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). -
-Code - -```python -from vllm import LLM, SamplingParams +??? Code -prompts = [ - "The future of AI is", -] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + ```python + from vllm import LLM, SamplingParams -llm = LLM( - model="meta-llama/Meta-Llama-3-8B-Instruct", - tensor_parallel_size=4, - speculative_config={ - "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", - "draft_tensor_parallel_size": 1, - }, -) + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -outputs = llm.generate(prompts, sampling_params) + llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + tensor_parallel_size=4, + speculative_config={ + "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", + "draft_tensor_parallel_size": 1, + }, + ) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + outputs = llm.generate(prompts, sampling_params) -``` + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -
+ ``` A few important things to consider when using the EAGLE based draft models: diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 22c2d9f6129..b63f344ebd5 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -33,49 +33,43 @@ text. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: -
-<summary>Code</summary>
-
-```python
-from openai import OpenAI
-client = OpenAI(
-    base_url="http://localhost:8000/v1",
-    api_key="-",
-)
-model = client.models.list().data[0].id
-
-completion = client.chat.completions.create(
-    model=model,
-    messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-    ],
-    extra_body={"guided_choice": ["positive", "negative"]},
-)
-print(completion.choices[0].message.content)
-```
-
-</details>
+??? Code + + ```python + from openai import OpenAI + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", + ) + model = client.models.list().data[0].id + + completion = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={"guided_choice": ["positive", "negative"]}, + ) + print(completion.choices[0].message.content) + ``` The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: -
-<summary>Code</summary>
-
-```python
-completion = client.chat.completions.create(
-    model=model,
-    messages=[
-        {
-            "role": "user",
-            "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
-        }
-    ],
-    extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
-)
-print(completion.choices[0].message.content)
-```
-
-</details>
+??? Code + + ```python + completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", + } + ], + extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, + ) + print(completion.choices[0].message.content) + ``` One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. For this we can use the `guided_json` parameter in two different ways: @@ -85,46 +79,43 @@ For this we can use the `guided_json` parameter in two different ways: The next example shows how to use the `guided_json` parameter with a Pydantic model: -
-<summary>Code</summary>
-
-```python
-from pydantic import BaseModel
-from enum import Enum
-
-class CarType(str, Enum):
-    sedan = "sedan"
-    suv = "SUV"
-    truck = "Truck"
-    coupe = "Coupe"
-
-class CarDescription(BaseModel):
-    brand: str
-    model: str
-    car_type: CarType
-
-json_schema = CarDescription.model_json_schema()
-
-completion = client.chat.completions.create(
-    model=model,
-    messages=[
-        {
-            "role": "user",
-            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
-        }
-    ],
-    "response_format": {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "car-description",
-            "schema": CarDescription.model_json_schema()
+??? Code
+
+    ```python
+    from pydantic import BaseModel
+    from enum import Enum
+
+    class CarType(str, Enum):
+        sedan = "sedan"
+        suv = "SUV"
+        truck = "Truck"
+        coupe = "Coupe"
+
+    class CarDescription(BaseModel):
+        brand: str
+        model: str
+        car_type: CarType
+
+    json_schema = CarDescription.model_json_schema()
+
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+            }
+        ],
+        response_format={
+            "type": "json_schema",
+            "json_schema": {
+                "name": "car-description",
+                "schema": CarDescription.model_json_schema()
+            },
             },
-    },
-)
-print(completion.choices[0].message.content)
-```
-
-</details>
+ ) + print(completion.choices[0].message.content) + ``` !!! tip While not strictly necessary, normally it´s better to indicate in the prompt the @@ -136,38 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar. As an example, we can use to define a specific format of simplified SQL queries: -
-Code +??? Code -```python -simplified_sql_grammar = """ - root ::= select_statement + ```python + simplified_sql_grammar = """ + root ::= select_statement - select_statement ::= "SELECT " column " from " table " where " condition + select_statement ::= "SELECT " column " from " table " where " condition - column ::= "col_1 " | "col_2 " + column ::= "col_1 " | "col_2 " - table ::= "table_1 " | "table_2 " + table ::= "table_1 " | "table_2 " - condition ::= column "= " number + condition ::= column "= " number - number ::= "1 " | "2 " -""" + number ::= "1 " | "2 " + """ -completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, -) -print(completion.choices[0].message.content) -``` - -
+ completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, + ) + print(completion.choices[0].message.content) + ``` See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) @@ -181,39 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: -
-Code - -```python -from pydantic import BaseModel - - -class People(BaseModel): - name: str - age: int - - -completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": "Generate a JSON with the name and age of one random person.", - } - ], - response_format={ - "type": "json_schema", - "json_schema": { - "name": "people", - "schema": People.model_json_schema() - } - }, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) -``` +??? Code + + ```python + from pydantic import BaseModel + + + class People(BaseModel): + name: str + age: int -
+ + completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": "Generate a JSON with the name and age of one random person.", + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "people", + "schema": People.model_json_schema() + } + }, + ) + print("reasoning_content: ", completion.choices[0].message.reasoning_content) + print("content: ", completion.choices[0].message.content) + ``` See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) @@ -227,36 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3. Here is a simple example demonstrating how to get structured output using Pydantic models: -
-Code - -```python -from pydantic import BaseModel -from openai import OpenAI - -class Info(BaseModel): - name: str - age: int - -client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") -model = client.models.list().data[0].id -completion = client.beta.chat.completions.parse( - model=model, - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, - ], - response_format=Info, -) - -message = completion.choices[0].message -print(message) -assert message.parsed -print("Name:", message.parsed.name) -print("Age:", message.parsed.age) -``` - -
+??? Code + + ```python + from pydantic import BaseModel + from openai import OpenAI + + class Info(BaseModel): + name: str + age: int + + client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") + model = client.models.list().data[0].id + completion = client.beta.chat.completions.parse( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, + ], + response_format=Info, + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + print("Name:", message.parsed.name) + print("Age:", message.parsed.age) + ``` ```console ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) @@ -266,43 +248,39 @@ Age: 28 Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: -
-Code - -```python -from typing import List -from pydantic import BaseModel -from openai import OpenAI - -class Step(BaseModel): - explanation: str - output: str - -class MathResponse(BaseModel): - steps: list[Step] - final_answer: str - -completion = client.beta.chat.completions.parse( - model=model, - messages=[ - {"role": "system", "content": "You are a helpful expert math tutor."}, - {"role": "user", "content": "Solve 8x + 31 = 2."}, - ], - response_format=MathResponse, -) - -message = completion.choices[0].message -print(message) -assert message.parsed -for i, step in enumerate(message.parsed.steps): - print(f"Step #{i}:", step) -print("Answer:", message.parsed.final_answer) -``` +??? Code + + ```python + from typing import List + from pydantic import BaseModel + from openai import OpenAI -

-<details>
-Output + class MathResponse(BaseModel): + steps: list[Step] + final_answer: str + + completion = client.beta.chat.completions.parse( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful expert math tutor."}, + {"role": "user", "content": "Solve 8x + 31 = 2."}, + ], + response_format=MathResponse, + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + for i, step in enumerate(message.parsed.steps): + print(f"Step #{i}:", step) + print("Answer:", message.parsed.final_answer) + ``` + +Output: ```console ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) @@ -312,8 +290,6 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` -
- An example of using `structural_tag` can be found here: ## Offline Inference @@ -332,24 +308,21 @@ These parameters can be used in the same way as the parameters from the Online Serving examples above. One example for the usage of the `choice` parameter is shown below: -
-Code - -```python -from vllm import LLM, SamplingParams -from vllm.sampling_params import GuidedDecodingParams +??? Code -llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + ```python + from vllm import LLM, SamplingParams + from vllm.sampling_params import GuidedDecodingParams -guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) -``` + llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") -
+ guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) + sampling_params = SamplingParams(guided_decoding=guided_decoding_params) + outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, + ) + print(outputs[0].outputs[0].text) + ``` See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 66db22847b4..9fb878777a4 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -15,49 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \ Next, make a request to the model that should result in it using the available tools: -
-Code - -```python -from openai import OpenAI -import json - -client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") - -def get_weather(location: str, unit: str): - return f"Getting the weather for {location} in {unit}..." -tool_functions = {"get_weather": get_weather} - -tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["location", "unit"] +??? Code + + ```python + from openai import OpenAI + import json + + client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") + + def get_weather(location: str, unit: str): + return f"Getting the weather for {location} in {unit}..." + tool_functions = {"get_weather": get_weather} + + tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"] + } } - } -}] - -response = client.chat.completions.create( - model=client.models.list().data[0].id, - messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], - tools=tools, - tool_choice="auto" -) - -tool_call = response.choices[0].message.tool_calls[0].function -print(f"Function called: {tool_call.name}") -print(f"Arguments: {tool_call.arguments}") -print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") -``` + }] -
+ response = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], + tools=tools, + tool_choice="auto" + ) + + tool_call = response.choices[0].message.tool_calls[0].function + print(f"Function called: {tool_call.name}") + print(f"Arguments: {tool_call.arguments}") + print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") + ``` Example output: @@ -306,54 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen Here is a summary of a plugin file: -
-Code - -```python - -# import the required packages - -# define a tool parser and register it to vllm -# the name list in register_module can be used -# in --tool-call-parser. you can define as many -# tool parsers as you want here. -@ToolParserManager.register_module(["example"]) -class ExampleToolParser(ToolParser): - def __init__(self, tokenizer: AnyTokenizer): - super().__init__(tokenizer) - - # adjust request. e.g.: set skip special tokens - # to False for tool call output. - def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: - return request - - # implement the tool call parse for stream call - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: - return delta - - # implement the tool parse for non-stream call - def extract_tool_calls( - self, - model_output: str, - request: ChatCompletionRequest, - ) -> ExtractedToolCallInformation: - return ExtractedToolCallInformation(tools_called=False, - tool_calls=[], - content=text) - -``` - -
+??? Code + + ```python + + # import the required packages + + # define a tool parser and register it to vllm + # the name list in register_module can be used + # in --tool-call-parser. you can define as many + # tool parsers as you want here. + @ToolParserManager.register_module(["example"]) + class ExampleToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + # adjust request. e.g.: set skip special tokens + # to False for tool call output. + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + return request + + # implement the tool call parse for stream call + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + return delta + + # implement the tool parse for non-stream call + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=text) + + ``` Then you can use this plugin in the command line like this. diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index aab3f891fa1..3f75d1aef30 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -76,26 +76,23 @@ Currently, there are no pre-built CPU wheels. ### Build image from source -
-Commands +??? Commands -```console -$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . - -# Launching OpenAI server -$ docker run --rm \ - --privileged=true \ - --shm-size=4g \ - -p 8000:8000 \ - -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ - vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ - --dtype=bfloat16 \ - other vLLM OpenAI server arguments -``` - -
+ ```console + $ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . + + # Launching OpenAI server + $ docker run --rm \ + --privileged=true \ + --shm-size=4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model=meta-llama/Llama-3.2-1B-Instruct \ + --dtype=bfloat16 \ + other vLLM OpenAI server arguments + ``` !!! tip For ARM or Apple silicon, use `docker/Dockerfile.arm` @@ -149,37 +146,34 @@ vllm serve facebook/opt-125m - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: -
-Commands +??? Commands -```console -$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores - -# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. -CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ -0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 -1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 -2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 -3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 -4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 -5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 -6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 -7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 -8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 -9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 -10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 -11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 -12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 -13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 -14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 -15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - -# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 -$ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference/basic/basic.py -``` - -
+ ```console + $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + + # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. + CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ + 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + + # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 + $ export VLLM_CPU_OMP_THREADS_BIND=0-7 + $ python examples/offline_inference/basic/basic.py + ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md index 9b5007bef14..0cb10b8de83 100644 --- a/docs/getting_started/installation/google_tpu.md +++ b/docs/getting_started/installation/google_tpu.md @@ -68,9 +68,6 @@ For more information about using TPUs with GKE, see: Create a TPU v5e with 4 TPU chips: -
-<summary>Commands</summary>
-
 ```console
 gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
   --node-id TPU_NAME \
   --project PROJECT_ID \
   --zone ZONE \
   --accelerator-type ACCELERATOR_TYPE \
   --runtime-version RUNTIME_VERSION \
   --service-account SERVICE_ACCOUNT
 ```

-</details>
- | Parameter name | Description | |--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. | diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index c0bfa6823a0..6bc714fe6e8 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -90,29 +90,26 @@ Currently, there are no pre-built ROCm wheels. 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: -
- Commands + ??? Commands - ```bash - pip install --upgrade pip + ```bash + pip install --upgrade pip - # Build & install AMD SMI - pip install /opt/rocm/share/amd_smi + # Build & install AMD SMI + pip install /opt/rocm/share/amd_smi - # Install dependencies - pip install --upgrade numba \ - scipy \ - huggingface-hub[cli,hf_transfer] \ - setuptools_scm - pip install "numpy<2" - pip install -r requirements/rocm.txt + # Install dependencies + pip install --upgrade numba \ + scipy \ + huggingface-hub[cli,hf_transfer] \ + setuptools_scm + pip install "numpy<2" + pip install -r requirements/rocm.txt - # Build vLLM for MI210/MI250/MI300. - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - python3 setup.py develop - ``` - -
+ # Build vLLM for MI210/MI250/MI300. + export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + python3 setup.py develop + ``` This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. @@ -206,24 +203,21 @@ DOCKER_BUILDKIT=1 docker build \ To run the above docker image `vllm-rocm`, use the below command: -
-Command - -```console -docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash -``` +??? Command -

 Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.

diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md
index c1987300f8d..056caa70814 100644
--- a/docs/getting_started/installation/intel_gaudi.md
+++ b/docs/getting_started/installation/intel_gaudi.md
@@ -232,24 +232,21 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come
 
 Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
 
-<details>
-Logs - -```text -INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB -INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB -INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB -... -INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB -INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB -INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB -... -INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -``` - -
+??? Logs + + ```text + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + ... + INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + ... + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + ``` This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. @@ -284,42 +281,39 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -
-Logs - -```text -INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache -INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 -INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB -... -INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) -INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -... -INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB -INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB -... 
-INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB -INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB -INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB -INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB -INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB -INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] -INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory -INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) -``` - -
+??? Logs + + ```text + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... 
+ INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + ``` ### Recommended vLLM Parameters diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index afc7aea46c6..d02cb18bcb9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -147,25 +147,22 @@ curl http://localhost:8000/v1/completions \ Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: -
-Code - -```python -from openai import OpenAI - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) -completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") -print("Completion result:", completion) -``` - -
+??? Code + + ```python + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a") + print("Completion result:", completion) + ``` A more detailed client example can be found here: @@ -189,31 +186,28 @@ curl http://localhost:8000/v1/chat/completions \ Alternatively, you can use the `openai` Python package: -
-Code +??? Code -```python -from openai import OpenAI -# Set OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -chat_response = client.chat.completions.create( - model="Qwen/Qwen2.5-1.5B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Tell me a joke."}, - ] -) -print("Chat response:", chat_response) -``` + ```python + from openai import OpenAI + # Set OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -
+ chat_response = client.chat.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a joke."}, + ] + ) + print("Chat response:", chat_response) + ``` ## On Attention Backends diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 5ffec85f653..355ed506e5d 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -85,40 +85,37 @@ and automatically applies the model's [chat template](https://huggingface.co/doc In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. -
-Code - -```python -from vllm import LLM - -llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") -conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, -] -outputs = llm.chat(conversation) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - -
+??? Code + + ```python + from vllm import LLM + + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` A code example can be found here: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 0acb4d55214..89a128915a7 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -190,6 +190,4 @@ Expected output: {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -
- A openai client example can be found here: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0efcabae7ca..fff6c729a58 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,32 +615,29 @@ Specified using `--task generate`. For the best results, we recommend using the following dependency versions (tested on A10 and L40): -
- Dependency versions - - ```text - # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) - torch==2.5.1 - torchvision==0.20.1 - transformers==4.48.1 - tokenizers==0.21.0 - tiktoken==0.7.0 - vllm==0.7.0 - - # Optional but recommended for improved performance and stability - triton==3.1.0 - xformers==0.0.28.post3 - uvloop==0.21.0 - protobuf==5.29.3 - openai==1.60.2 - opencv-python-headless==4.11.0.86 - pillow==10.4.0 - - # Installed FlashAttention (for float16 only) - flash-attn>=2.5.6 # Not used in float32, but should be documented - ``` - -
+ ??? Dependency versions + + ```text + # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) + torch==2.5.1 + torchvision==0.20.1 + transformers==4.48.1 + tokenizers==0.21.0 + tiktoken==0.7.0 + vllm==0.7.0 + + # Optional but recommended for improved performance and stability + triton==3.1.0 + xformers==0.0.28.post3 + uvloop==0.21.0 + protobuf==5.29.3 + openai==1.60.2 + opencv-python-headless==4.11.0.86 + pillow==10.4.0 + + # Installed FlashAttention (for float16 only) + flash-attn>=2.5.6 # Not used in float32, but should be documented + ``` **Note:** Make sure you understand the security implications of using outdated packages. diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 16a50ffea14..d7e2b41651c 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -13,24 +13,21 @@ pip install langchain langchain_community -q To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. -
-Code - -```python -from langchain_community.llms import VLLM - -llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference -) - -print(llm("What is the capital of France ?")) -``` - -
+??? Code + + ```python + from langchain_community.llms import VLLM + + llm = VLLM(model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # tensor_parallel_size=... # for distributed inference + ) + + print(llm("What is the capital of France ?")) + ``` Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index e95f84404f0..7862778464d 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -15,27 +15,24 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). -
-Code +??? Code -```python -from openai import OpenAI -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", -) - -completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Hello!"} - ] -) + ```python + from openai import OpenAI + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) -print(completion.choices[0].message) -``` + completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Hello!"} + ] + ) -
+ print(completion.choices[0].message) + ``` !!! tip vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. @@ -152,32 +149,29 @@ with `--enable-request-id-headers`. > rather than within the vLLM layer for this reason. > See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. -
-Code - -```python -completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_headers={ - "x-request-id": "sentiment-classification-00001", - } -) -print(completion._request_id) - -completion = client.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - prompt="A robot may not injure a human being", - extra_headers={ - "x-request-id": "completion-test", - } -) -print(completion._request_id) -``` +??? Code -
+ ```python + completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } + ) + print(completion._request_id) + + completion = client.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } + ) + print(completion._request_id) + ``` ## API Reference @@ -194,15 +188,19 @@ Code example: The following [sampling parameters][sampling-params] are supported. -```python ---8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" + ``` The following extra parameters are supported: -```python ---8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" + ``` [](){ #chat-api } @@ -222,15 +220,19 @@ Code example: The following [sampling parameters][sampling-params] are supported. -```python ---8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" + ``` The following extra parameters are supported: -```python ---8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" + ``` [](){ #embeddings-api } @@ -269,34 +271,31 @@ and passing a list of `messages` in the request. Refer to the examples below for Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: -
- Code - - ```python - import requests - - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, - ) - response.raise_for_status() - response_json = response.json() - print("Embedding output:", response_json["data"][0]["embedding"]) - ``` - -
+ ??? Code + + ```python + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + ``` === "DSE-Qwen2-MRL" @@ -331,15 +330,19 @@ The following [pooling parameters][pooling-params] are supported. The following extra parameters are supported by default: -```python ---8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" + ``` For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -```python ---8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" + ``` [](){ #transcriptions-api } @@ -358,15 +361,19 @@ Code example: The following [sampling parameters][sampling-params] are supported. -```python ---8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" + ``` The following extra parameters are supported: -```python ---8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" -``` +??? Code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" + ``` [](){ #tokenizer-api } @@ -414,45 +421,42 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -
-Response +??? Response -```bash -{ - "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2", - "object": "list", - "created": 1745383065, - "model": "jason9693/Qwen2.5-1.5B-apeach", - "data": [ - { - "index": 0, - "label": "Default", - "probs": [ - 0.565970778465271, - 0.4340292513370514 - ], - "num_classes": 2 - }, + ```bash { - "index": 1, - "label": "Spoiled", - "probs": [ - 0.26448777318000793, - 0.7355121970176697 + "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2", + "object": "list", + "created": 1745383065, + "model": "jason9693/Qwen2.5-1.5B-apeach", + "data": [ + { + "index": 0, + "label": "Default", + "probs": [ + 0.565970778465271, + 0.4340292513370514 + ], + "num_classes": 2 + }, + { + "index": 1, + "label": "Spoiled", + "probs": [ + 0.26448777318000793, + 0.7355121970176697 + ], + "num_classes": 2 + } ], - "num_classes": 2 + "usage": { + "prompt_tokens": 20, + "total_tokens": 20, + "completion_tokens": 0, + "prompt_tokens_details": null + } } - ], - "usage": { - "prompt_tokens": 20, - "total_tokens": 20, - "completion_tokens": 0, - "prompt_tokens_details": null - } -} -``` - -
+ ``` You can also pass a string directly to the `input` field: @@ -465,36 +469,33 @@ curl -v "http://127.0.0.1:8000/classify" \ }' ``` -
-Response +??? Response -```bash -{ - "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682", - "object": "list", - "created": 1745383213, - "model": "jason9693/Qwen2.5-1.5B-apeach", - "data": [ + ```bash { - "index": 0, - "label": "Default", - "probs": [ - 0.565970778465271, - 0.4340292513370514 + "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682", + "object": "list", + "created": 1745383213, + "model": "jason9693/Qwen2.5-1.5B-apeach", + "data": [ + { + "index": 0, + "label": "Default", + "probs": [ + 0.565970778465271, + 0.4340292513370514 + ], + "num_classes": 2 + } ], - "num_classes": 2 + "usage": { + "prompt_tokens": 10, + "total_tokens": 10, + "completion_tokens": 0, + "prompt_tokens_details": null + } } - ], - "usage": { - "prompt_tokens": 10, - "total_tokens": 10, - "completion_tokens": 0, - "prompt_tokens_details": null - } -} -``` - -
+ ``` #### Extra parameters @@ -538,27 +539,24 @@ curl -X 'POST' \ }' ``` -
-Response +??? Response -```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ + ```bash { - "index": 0, - "object": "score", - "score": 1 + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + } + ], + "usage": {} } - ], - "usage": {} -} -``` - -
+ ``` #### Batch inference @@ -566,107 +564,95 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente where each pair is built from `text_1` and a string in `text_2`. The total number of pairs is `len(text_2)`. -
-Request - -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "text_1": "What is the capital of France?", - "text_2": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." - ] -}' -``` +??? Request -
+ ```bash + curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "text_1": "What is the capital of France?", + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] + }' + ``` -
-Response +??? Response -```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693570, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": 0.001094818115234375 - }, + ```bash { - "index": 1, - "object": "score", - "score": 1 + "id": "score-request-id", + "object": "list", + "created": 693570, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 0.001094818115234375 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} } - ], - "usage": {} -} -``` - -
+ ``` You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). The total number of pairs is `len(text_2)`. -
-Request - -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "encoding_format": "float", - "text_1": [ - "What is the capital of Brazil?", - "What is the capital of France?" - ], - "text_2": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." - ] -}' -``` +??? Request -
+ ```bash + curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": [ + "What is the capital of Brazil?", + "What is the capital of France?" + ], + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] + }' + ``` -
-Response +??? Response -```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": 1 - }, + ```bash { - "index": 1, - "object": "score", - "score": 1 + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} } - ], - "usage": {} -} -``` - -
+ ``` #### Extra parameters @@ -705,57 +691,51 @@ Code example: Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. Result documents will be sorted by relevance, and the `index` property can be used to determine original order. -
-Request - -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/v1/rerank' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-base", - "query": "What is the capital of France?", - "documents": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - "Horses and cows are both animals" - ] -}' -``` +??? Request -
+ ```bash + curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/rerank' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Horses and cows are both animals" + ] + }' + ``` -
-Response +??? Response -```bash -{ - "id": "rerank-fae51b2b664d4ed38f5969b612edff77", - "model": "BAAI/bge-reranker-base", - "usage": { - "total_tokens": 56 - }, - "results": [ - { - "index": 1, - "document": { - "text": "The capital of France is Paris." - }, - "relevance_score": 0.99853515625 - }, + ```bash { - "index": 0, - "document": { - "text": "The capital of Brazil is Brasilia." + "id": "rerank-fae51b2b664d4ed38f5969b612edff77", + "model": "BAAI/bge-reranker-base", + "usage": { + "total_tokens": 56 }, - "relevance_score": 0.0005860328674316406 + "results": [ + { + "index": 1, + "document": { + "text": "The capital of France is Paris." + }, + "relevance_score": 0.99853515625 + }, + { + "index": 0, + "document": { + "text": "The capital of Brazil is Brasilia." + }, + "relevance_score": 0.0005860328674316406 + } + ] } - ] -} -``` - -
+ ``` #### Extra parameters diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index 11fec27d689..988b9a55172 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -12,38 +12,32 @@ vllm serve unsloth/Llama-3.2-1B-Instruct Then query the endpoint to get the latest metrics from the server: -
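If you prefer to query the endpoint from Python instead of `curl`, the following is a minimal sketch using `requests`, assuming the server started above is listening on `0.0.0.0:8000`. The folded output below shows the equivalent `curl` response.

```python
import requests

# Fetch the Prometheus-format metrics exposed by the running vLLM server.
response = requests.get("http://0.0.0.0:8000/metrics")
response.raise_for_status()

# Print only the vLLM-specific series, skipping the HELP/TYPE comment lines.
for line in response.text.splitlines():
    if line.startswith("vllm:"):
        print(line)
```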
-Output - -```console -$ curl http://0.0.0.0:8000/metrics - -# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. -# TYPE vllm:iteration_tokens_total histogram -vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 -vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 -... -``` - -
+??? Output + + ```console + $ curl http://0.0.0.0:8000/metrics + + # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. + # TYPE vllm:iteration_tokens_total histogram + vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 + vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + ... + ``` The following metrics are exposed: -
-Code - -```python ---8<-- "vllm/engine/metrics.py:metrics-definitions" -``` +??? Code -
+ ```python + --8<-- "vllm/engine/metrics.py:metrics-definitions" + ``` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index f2b3cfb6995..9403abfad85 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -60,73 +60,70 @@ To identify the particular CUDA operation that causes the error, you can add `-- If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. -
-Code +??? Code + + ```python + # Test PyTorch NCCL + import torch + import torch.distributed as dist + dist.init_process_group(backend="nccl") + local_rank = dist.get_rank() % torch.cuda.device_count() + torch.cuda.set_device(local_rank) + data = torch.FloatTensor([1,] * 128).to("cuda") + dist.all_reduce(data, op=dist.ReduceOp.SUM) + torch.cuda.synchronize() + value = data.mean().item() + world_size = dist.get_world_size() + assert value == world_size, f"Expected {world_size}, got {value}" -```python -# Test PyTorch NCCL -import torch -import torch.distributed as dist -dist.init_process_group(backend="nccl") -local_rank = dist.get_rank() % torch.cuda.device_count() -torch.cuda.set_device(local_rank) -data = torch.FloatTensor([1,] * 128).to("cuda") -dist.all_reduce(data, op=dist.ReduceOp.SUM) -torch.cuda.synchronize() -value = data.mean().item() -world_size = dist.get_world_size() -assert value == world_size, f"Expected {world_size}, got {value}" - -print("PyTorch NCCL is successful!") - -# Test PyTorch GLOO -gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") -cpu_data = torch.FloatTensor([1,] * 128) -dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) -value = cpu_data.mean().item() -assert value == world_size, f"Expected {world_size}, got {value}" - -print("PyTorch GLOO is successful!") - -if world_size <= 1: - exit() - -# Test vLLM NCCL, with cuda graph -from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - -pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) -# pynccl is enabled by default for 0.6.5+, -# but for 0.6.4 and below, we need to enable it manually. -# keep the code for backward compatibility when because people -# prefer to read the latest documentation. -pynccl.disabled = False - -s = torch.cuda.Stream() -with torch.cuda.stream(s): - data.fill_(1) - out = pynccl.all_reduce(data, stream=s) - value = out.mean().item() + print("PyTorch NCCL is successful!") + + # Test PyTorch GLOO + gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") + cpu_data = torch.FloatTensor([1,] * 128) + dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) + value = cpu_data.mean().item() assert value == world_size, f"Expected {world_size}, got {value}" -print("vLLM NCCL is successful!") + print("PyTorch GLOO is successful!") -g = torch.cuda.CUDAGraph() -with torch.cuda.graph(cuda_graph=g, stream=s): - out = pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + if world_size <= 1: + exit() -data.fill_(1) -g.replay() -torch.cuda.current_stream().synchronize() -value = out.mean().item() -assert value == world_size, f"Expected {world_size}, got {value}" + # Test vLLM NCCL, with cuda graph + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator -print("vLLM NCCL with cuda graph is successful!") + pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) + # pynccl is enabled by default for 0.6.5+, + # but for 0.6.4 and below, we need to enable it manually. + # keep the code for backward compatibility when because people + # prefer to read the latest documentation. + pynccl.disabled = False -dist.destroy_process_group(gloo_group) -dist.destroy_process_group() -``` + s = torch.cuda.Stream() + with torch.cuda.stream(s): + data.fill_(1) + out = pynccl.all_reduce(data, stream=s) + value = out.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL is successful!") -
+ g = torch.cuda.CUDAGraph() + with torch.cuda.graph(cuda_graph=g, stream=s): + out = pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + + data.fill_(1) + g.replay() + torch.cuda.current_stream().synchronize() + value = out.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + + print("vLLM NCCL with cuda graph is successful!") + + dist.destroy_process_group(gloo_group) + dist.destroy_process_group() + ``` If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: @@ -170,30 +167,27 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously or an error from Python that looks like this: -
-Logs +??? Logs -```console -RuntimeError: - An attempt has been made to start a new process before the - current process has finished its bootstrapping phase. + ```console + RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. - This probably means that you are not using fork to start your - child processes and you have forgotten to use the proper idiom - in the main module: + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: - if __name__ == '__main__': - freeze_support() - ... + if __name__ == '__main__': + freeze_support() + ... - The "freeze_support()" line can be omitted if the program - is not going to be frozen to produce an executable. + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. - To fix this issue, refer to the "Safe importing of main module" - section in https://docs.python.org/3/library/multiprocessing.html -``` - -
+ To fix this issue, refer to the "Safe importing of main module" + section in https://docs.python.org/3/library/multiprocessing.html + ``` then you must update your Python code to guard usage of `vllm` behind a `if __name__ == '__main__':` block. For example, instead of this: @@ -217,25 +211,22 @@ if __name__ == '__main__': vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: -
-Code +??? Code -```python -import torch - -@torch.compile -def f(x): - # a simple function to test torch.compile - x = x + 1 - x = x * 2 - x = x.sin() - return x - -x = torch.randn(4, 4).cuda() -print(f(x)) -``` + ```python + import torch + + @torch.compile + def f(x): + # a simple function to test torch.compile + x = x + 1 + x = x * 2 + x = x.sin() + return x -
+ x = torch.randn(4, 4).cuda() + print(f(x)) + ``` If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. diff --git a/docs/usage/usage_stats.md b/docs/usage/usage_stats.md index 4d7b70e6f42..78d2a6784bc 100644 --- a/docs/usage/usage_stats.md +++ b/docs/usage/usage_stats.md @@ -10,41 +10,38 @@ The list of data collected by the latest version of vLLM can be found here: -Output - -```json -{ - "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", - "provider": "GCP", - "num_cpu": 24, - "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", - "cpu_family_model_stepping": "6,85,7", - "total_memory": 101261135872, - "architecture": "x86_64", - "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", - "gpu_count": 2, - "gpu_type": "NVIDIA L4", - "gpu_memory_per_device": 23580639232, - "model_architecture": "OPTForCausalLM", - "vllm_version": "0.3.2+cu123", - "context": "LLM_CLASS", - "log_time": 1711663373492490000, - "source": "production", - "dtype": "torch.float16", - "tensor_parallel_size": 1, - "block_size": 16, - "gpu_memory_utilization": 0.9, - "quantization": null, - "kv_cache_dtype": "auto", - "enable_lora": false, - "enable_prefix_caching": false, - "enforce_eager": false, - "disable_custom_all_reduce": true -} -``` - -
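For a programmatic look at what has been recorded locally, a small sketch along these lines lists the reported fields. Both the file location (vLLM's default config directory is assumed here) and the file layout are assumptions; the shell command shown next gives the same preview directly.

```python
import json
from pathlib import Path

# Assumed default location of the recorded usage data; adjust the path
# if your vLLM config directory lives somewhere else.
stats_file = Path.home() / ".config" / "vllm" / "usage_stats.json"

raw = stats_file.read_text()
try:
    # The file may hold a single JSON object ...
    records = [json.loads(raw)]
except json.JSONDecodeError:
    # ... or one JSON object per line; handle both.
    records = [json.loads(line) for line in raw.splitlines() if line.strip()]

for record in records:
    # List the reported fields so you can audit what is being collected.
    print(sorted(record))
```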
+??? Output + + ```json + { + "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", + "provider": "GCP", + "num_cpu": 24, + "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", + "cpu_family_model_stepping": "6,85,7", + "total_memory": 101261135872, + "architecture": "x86_64", + "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", + "gpu_count": 2, + "gpu_type": "NVIDIA L4", + "gpu_memory_per_device": 23580639232, + "model_architecture": "OPTForCausalLM", + "vllm_version": "0.3.2+cu123", + "context": "LLM_CLASS", + "log_time": 1711663373492490000, + "source": "production", + "dtype": "torch.float16", + "tensor_parallel_size": 1, + "block_size": 16, + "gpu_memory_utilization": 0.9, + "quantization": null, + "kv_cache_dtype": "auto", + "enable_lora": false, + "enable_prefix_caching": false, + "enforce_eager": false, + "disable_custom_all_reduce": true + } + ``` You can preview the collected data by running the following command: