Commit 6de80f9

offload weights to cpu before fp8 online quant

Signed-off-by: yan <[email protected]>
1 parent f17aec0, commit 6de80f9

6 files changed: +56 -25 lines
docs/features/quantization/fp8.md (1 addition, 1 deletion)

````diff
@@ -135,4 +135,4 @@ print(result[0].outputs[0].text)
 ```
 
 !!! warning
-    Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
+    Currently, by default we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. To avoid this, set `VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1` to offload the weights to CPU before quantization; the quantized weights are then kept on the device.
````
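For reference, a minimal usage sketch of the documented flow (an illustration, not part of the commit; it assumes an FP8-capable GPU and borrows the model name from the new test below):

```python
import os

# Enable CPU offload of the unquantized weights before FP8 online quantization.
# Set this before constructing the engine so vLLM picks it up.
os.environ["VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT"] = "1"

from vllm import LLM, SamplingParams

# Online FP8 quantization of an unquantized checkpoint; with the flag set,
# only the quantized weights end up on the GPU.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", quantization="fp8")
print(llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16)))
```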

tests/quantization/test_cpu_offload.py (11 additions, 1 deletion)

```diff
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: Apache-2.0
+# SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Expanded quantized model tests for CPU offloading
@@ -11,6 +11,16 @@
 from ..utils import compare_two_settings
 
 
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+def test_offload_weights_before_quant_fp8():
+    # Test quantization of an unquantized checkpoint
+    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct",
+                         ["--quantization", "fp8"], ["--quantization", "fp8"],
+                         {"VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT": "1"},
+                         max_wait_seconds=480)
+
+
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
```
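In this test, `compare_two_settings` presumably launches one server per argument set (both using `--quantization fp8`), applies the extra environment dict only to the first, and checks that the outputs match, so the offload-before-quant path is validated against the default loading path.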

vllm/config.py (4 additions, 0 deletions)

```diff
@@ -1551,6 +1551,10 @@ def _verify_args(self) -> Self:
             raise ValueError("CPU offload space must be non-negative"
                              f", but got {self.cpu_offload_gb}")
 
+        if self.cpu_offload_gb > 0 and envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT:
+            raise ValueError("CPU offload can't work together with "
+                             "VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT")
+
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
```

vllm/envs.py (5 additions, 0 deletions)

```diff
@@ -133,6 +133,7 @@
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
+    VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT: bool = False
 
 
 def get_default_cache_root():
@@ -918,6 +919,10 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
+
+    # Offload model weights to cpu before online fp8 quantization
+    "VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT":
+    lambda: os.environ.get("VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT", "0") == "1",
 }
 
 # --8<-- [end:env-vars-definition]
```
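A small sketch of how the new flag is read (illustrative; note that, unlike the neighboring `bool(int(...))` flags, only the literal string `"1"` enables it):

```python
import os

os.environ["VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT"] = "1"

import vllm.envs as envs

# vllm.envs resolves variables lazily through the lambda above, so the value
# is taken from the environment at attribute-access time.
print(envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT)  # True

os.environ["VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT"] = "true"
print(envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT)  # False: only "1" enables it
```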

vllm/model_executor/layers/quantization/fp8.py (10 additions, 3 deletions)

```diff
@@ -246,10 +246,14 @@ def create_weights(
                         if self.quant_config.is_checkpoint_fp8_serialized else
                         params_dtype)
 
+        # Force offloading weights to cpu if VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT
+        # enabled, otherwise use original device config which can be gpu or cpu
+        # (may happen when cpu_offload_gb > 0)
         weight = ModelWeightParameter(data=torch.empty(
             output_size_per_partition,
             input_size_per_partition,
-            dtype=weight_dtype),
+            dtype=weight_dtype,
+            device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None),
             input_dim=1,
             output_dim=0,
             weight_loader=weight_loader)
@@ -513,16 +517,19 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
             num_experts,
             2 * intermediate_size_per_partition,
             hidden_size,
-            dtype=params_dtype),
+            dtype=params_dtype,
+            device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None),
             requires_grad=False)
+
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         w2_weight = torch.nn.Parameter(torch.empty(
             num_experts,
             hidden_size,
             intermediate_size_per_partition,
-            dtype=params_dtype),
+            dtype=params_dtype,
+            device="cpu" if envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT else None),
             requires_grad=False)
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
```
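The `device="cpu" if ... else None` pattern relies on `torch.empty` treating `device=None` as "use the ambient default device". A standalone illustration (not vLLM code; the flag is replaced by a plain bool):

```python
import torch

offload = True  # stands in for envs.VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT

# Under a default-device context, device=None inherits that device, while
# "cpu" forces host allocation so the full-precision weights never touch the
# GPU before quantization.
with torch.device("cuda" if torch.cuda.is_available() else "cpu"):
    w = torch.empty(4, 4, dtype=torch.bfloat16,
                    device="cpu" if offload else None)

print(w.device)  # always cpu when offload is True
```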

vllm/model_executor/model_loader/utils.py (25 additions, 20 deletions)

```diff
@@ -16,6 +16,7 @@
 from vllm.attention import Attention
 from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
                          set_current_vllm_config)
+from vllm.envs import VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
@@ -144,26 +145,30 @@ def device_loading_context(module: torch.nn.Module,
         yield module
 
     finally:
-        # Restore parameters to their original devices, ignoring new parameters
-        pin_memory = is_pin_memory_available()
-        for name, p in module.named_parameters():
-            if name in original_device_states:
-                original_device: torch.device = original_device_states[name]
-                if original_device.type == "cpu":
-                    # `torch.empty_like` does not support `pin_memory` argument
-                    cpu_data = torch.empty_strided(
-                        size=p.data.size(),
-                        stride=p.data.stride(),
-                        dtype=p.data.dtype,
-                        layout=p.data.layout,
-                        device="cpu",
-                        pin_memory=pin_memory,
-                    )
-                    cpu_data.copy_(p.data)
-                    p.data = cpu_data
-                else:
-                    p.data = p.data.to(original_device)
-            # New parameters or parameters already on target device are untouched
+        # If weights were loaded onto the CPU for FP8 online quantization, there
+        # is no need to move them back to the original device.
+        if not VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT:
+            # Restore parameters to their original devices, ignoring new parameters # noqa: E501
+            pin_memory = is_pin_memory_available()
+            for name, p in module.named_parameters():
+                if name in original_device_states:
+                    original_device: torch.device = original_device_states[
+                        name]
+                    if original_device.type == "cpu":
+                        # `torch.empty_like` does not support `pin_memory` argument # noqa: E501
+                        cpu_data = torch.empty_strided(
+                            size=p.data.size(),
+                            stride=p.data.stride(),
+                            dtype=p.data.dtype,
+                            layout=p.data.layout,
+                            device="cpu",
+                            pin_memory=pin_memory,
+                        )
+                        cpu_data.copy_(p.data)
+                        p.data = cpu_data
+                    else:
+                        p.data = p.data.to(original_device)
+                # New parameters or parameters already on target device are untouched # noqa: E501
 
 
 def resolve_transformers_arch(model_config: ModelConfig,
```
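For orientation, a simplified, hypothetical sketch of the pattern this `finally` block belongs to (`simple_device_loading_context` and `skip_restore` are illustrative names; the real `device_loading_context` additionally uses pinned CPU buffers and leaves newly created parameters untouched):

```python
from contextlib import contextmanager

import torch


@contextmanager
def simple_device_loading_context(module: torch.nn.Module,
                                  target_device: torch.device,
                                  skip_restore: bool):
    """Temporarily move a module's parameters to target_device.

    skip_restore mirrors VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT: when True, the
    (now quantized) parameters stay wherever quantization left them instead
    of being copied back to their original devices.
    """
    original = {name: p.device for name, p in module.named_parameters()}
    module.to(target_device)
    try:
        yield module
    finally:
        if not skip_restore:
            for name, p in module.named_parameters():
                if name in original and p.device != original[name]:
                    p.data = p.data.to(original[name])
```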
