[Misc][tool] Add an object growth analysis tool to troubleshoot OOM issues #20024

Open
wants to merge 11 commits into base: main
52 changes: 52 additions & 0 deletions examples/offline_inference/simple_growthing_obj_graph.py
@@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import time

from vllm import LLM, SamplingParams

# Enable object graph analysis by setting environment variable
os.environ["VLLM_OBJ_GRAPH_DIR"] = "./vllm_obj_graph"

# Sample prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of artificial intelligence is",
]
# Create sampling parameters object
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create LLM instance
    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

    # Start object graph analysis
    llm.start_object_graph()

    # Generate text from prompts. The output is a list of RequestOutput objects
    # containing the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Stop object graph analysis
    llm.stop_object_graph()

    # Print output results
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Add buffer time to wait for background processes (if multiprocessing is
    # enabled) to complete writing object graph analysis output.
    time.sleep(10)
    print(f"Completed! Results saved to: {os.environ['VLLM_OBJ_GRAPH_DIR']}")
    print("You can check the generated files to analyze memory growth")


if __name__ == "__main__":
    main()
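Each run writes its results under `VLLM_OBJ_GRAPH_DIR`: one `analysis_<timestamp>` subdirectory per `stop_object_graph()` call, containing a `growing_memory_stats.log` summary plus `*_backrefs.dot`, `*_refs.dot`, and `*_chain.dot` graphs for each growing type. Below is a minimal sketch for rendering those graphs to PNG, assuming the Graphviz `dot` CLI is installed; the helper is illustrative and not part of this PR.

```python
# Illustrative helper (not part of this PR): render the emitted .dot graphs
# to PNG with the Graphviz "dot" CLI, which must be installed separately.
import subprocess
from pathlib import Path

graph_dir = Path("./vllm_obj_graph")  # same directory as VLLM_OBJ_GRAPH_DIR above

for dot_file in graph_dir.rglob("*.dot"):
    png_file = dot_file.with_suffix(".png")
    subprocess.run(["dot", "-Tpng", str(dot_file), "-o", str(png_file)],
                   check=True)
    print(f"Rendered {dot_file} -> {png_file}")
```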
1 change: 1 addition & 0 deletions requirements/common.txt
@@ -45,3 +45,4 @@ python-json-logger # Used by logging as per examples/others/logging_configuratio
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
objgraph # Required for memory debugging and object graph analysis
6 changes: 6 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -1162,6 +1162,12 @@ async def start_profile(self) -> None:
    async def stop_profile(self) -> None:
        self.engine.stop_profile()

    async def start_object_graph(self) -> None:
        self.engine.start_object_graph()

    async def stop_object_graph(self) -> None:
        self.engine.stop_object_graph()

    async def reset_mm_cache(self) -> None:
        self.engine.reset_mm_cache()

6 changes: 6 additions & 0 deletions vllm/engine/llm_engine.py
@@ -1854,6 +1854,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.model_executor.stop_profile()

    def start_object_graph(self) -> None:
        self.model_executor.start_object_graph()

    def stop_object_graph(self) -> None:
        self.model_executor.stop_object_graph()

    def sleep(self, level: int = 1) -> None:
        assert self.vllm_config.model_config.enable_sleep_mode, (
            "Sleep mode is not enabled in the model config")
6 changes: 5 additions & 1 deletion vllm/engine/multiprocessing/__init__.py
@@ -82,6 +82,9 @@ class RPCUProfileRequest(Enum):
    START_PROFILE = 1
    STOP_PROFILE = 2

class RPCUObjectGraphRequest(Enum):
    START_OBJECT_GRAPH = 1
    STOP_OBJECT_GRAPH = 2

class RPCResetMultiModalCacheRequest(Enum):
    RESET = 1
@@ -130,7 +133,8 @@ class RPCAdapterLoadedResponse:
RPCUProfileRequest, RPCLoadAdapterRequest,
RPCResetMultiModalCacheRequest,
RPCResetPrefixCacheRequest, RPCSleepRequest,
RPCWakeUpRequest, RPCIsSleepingRequest]
RPCWakeUpRequest, RPCIsSleepingRequest,
RPCUObjectGraphRequest]

REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
RPCIsSleepingResponse, RPCError]
15 changes: 15 additions & 0 deletions vllm/engine/multiprocessing/client.py
@@ -35,6 +35,7 @@
RPCResetPrefixCacheRequest,
RPCSleepRequest, RPCStartupRequest,
RPCStartupResponse,
RPCUObjectGraphRequest,
RPCUProfileRequest, RPCWakeUpRequest)
from vllm.engine.protocol import EngineClient
# yapf: enable
@@ -615,6 +616,20 @@ async def stop_profile(self) -> None:

        await self._send_one_way_rpc_request(
            request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket)

    async def start_object_graph(self) -> None:
        """Start object graph analysis on the engine"""

        await self._send_one_way_rpc_request(
            request=RPCUObjectGraphRequest.START_OBJECT_GRAPH,
            socket=self.input_socket)

    async def stop_object_graph(self) -> None:
        """Stop object graph analysis on the engine"""

        await self._send_one_way_rpc_request(
            request=RPCUObjectGraphRequest.STOP_OBJECT_GRAPH,
            socket=self.input_socket)

    async def reset_mm_cache(self) -> None:
        """Reset the multi-modal cache"""
12 changes: 12 additions & 0 deletions vllm/engine/multiprocessing/engine.py
@@ -27,6 +27,7 @@
RPCResetPrefixCacheRequest,
RPCSleepRequest, RPCStartupRequest,
RPCStartupResponse,
RPCUObjectGraphRequest,
RPCUProfileRequest, RPCWakeUpRequest)
# yapf: enable
from vllm.logger import init_logger
@@ -284,6 +285,11 @@ def handle_new_input(self):
                    self.wake_up(request.tags)
                elif isinstance(request, RPCIsSleepingRequest):
                    self._handle_is_sleeping_request(request)
                elif isinstance(request, RPCUObjectGraphRequest):
                    if request == RPCUObjectGraphRequest.START_OBJECT_GRAPH:
                        self.start_object_graph()
                    else:
                        self.stop_object_graph()
                else:
                    raise ValueError("Unknown RPCRequest Type: "
                                     f"{type(request)}")
@@ -416,6 +422,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.engine.stop_profile()

    def start_object_graph(self) -> None:
        self.engine.start_object_graph()

    def stop_object_graph(self) -> None:
        self.engine.stop_object_graph()

    def reset_mm_cache(self) -> bool:
        return self.engine.reset_mm_cache()

10 changes: 10 additions & 0 deletions vllm/engine/protocol.py
@@ -294,6 +294,16 @@ async def stop_profile(self) -> None:
"""Start profiling the engine"""
...

@abstractmethod
async def start_object_graph(self) -> None:
"""Start object graph the engine"""
...

@abstractmethod
async def stop_object_graph(self) -> None:
"""Stop object graph the engine"""
...

@abstractmethod
async def reset_mm_cache(self) -> None:
"""Reset the multi-modal cache"""
6 changes: 6 additions & 0 deletions vllm/entrypoints/llm.py
@@ -1329,6 +1329,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.llm_engine.stop_profile()

    def start_object_graph(self) -> None:
        self.llm_engine.start_object_graph()

    def stop_object_graph(self) -> None:
        self.llm_engine.stop_object_graph()

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return self.llm_engine.reset_prefix_cache(device)

20 changes: 20 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -972,6 +972,26 @@ async def stop_profile(raw_request: Request):
        return Response(status_code=200)


if envs.VLLM_OBJ_GRAPH_DIR:
    logger.warning(
        "Object graph analysis is enabled in the API server. This should "
        "ONLY be used for local development!")

    @router.post("/start_object_graph")
    async def start_object_graph(raw_request: Request):
        logger.info("Starting object graph analysis...")
        await engine_client(raw_request).start_object_graph()
        logger.info("Object graph analysis started.")
        return Response(status_code=200)

    @router.post("/stop_object_graph")
    async def stop_object_graph(raw_request: Request):
        logger.info("Stopping object graph analysis...")
        await engine_client(raw_request).stop_object_graph()
        logger.info("Object graph analysis stopped.")
        return Response(status_code=200)


if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
    logger.warning(
        "LoRA dynamic loading & unloading is enabled in the API server. "
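With the server launched under the new environment variable (e.g. `VLLM_OBJ_GRAPH_DIR=/tmp/vllm_obj_graph vllm serve facebook/opt-125m`), the endpoints can be driven over HTTP. The sketch below uses only the standard library; the `localhost:8000` address is an assumption (vLLM's default serving port), not something this PR configures.

```python
# Illustrative client (not part of this PR): toggle the new endpoints.
# Assumes the OpenAI-compatible server is listening on localhost:8000.
import urllib.request

BASE = "http://localhost:8000"

def post(path: str) -> int:
    req = urllib.request.Request(BASE + path, data=b"", method="POST")
    with urllib.request.urlopen(req) as resp:
        return resp.status

print(post("/start_object_graph"))  # 200 once analysis has started
# ... send some /v1/completions traffic to reproduce the suspected leak ...
print(post("/stop_object_graph"))   # 200; graphs written to VLLM_OBJ_GRAPH_DIR
```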
7 changes: 7 additions & 0 deletions vllm/envs.py
@@ -77,6 +77,7 @@
    VLLM_PLUGINS: Optional[list[str]] = None
    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_OBJ_GRAPH_DIR: Optional[str] = None
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
@@ -599,6 +600,12 @@ def get_vllm_port() -> Optional[int]:
lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

# Enables memory object graph tracking if set. Path to the directory where
# object graph files are saved. Note that it must be an absolute path.
"VLLM_OBJ_GRAPH_DIR":
lambda: (None if os.getenv("VLLM_OBJ_GRAPH_DIR", None) is None else os.path
.expanduser(os.getenv("VLLM_OBJ_GRAPH_DIR", "."))),

# If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
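The new entry mirrors `VLLM_TORCH_PROFILER_DIR`: unset means disabled (`None`), otherwise the value goes through `os.path.expanduser`. A stand-alone sketch of that resolution rule; the helper name is made up for illustration.

```python
# Stand-alone sketch of the resolution rule added to vllm/envs.py.
import os
from typing import Optional

def resolve_obj_graph_dir() -> Optional[str]:
    raw = os.getenv("VLLM_OBJ_GRAPH_DIR", None)
    return None if raw is None else os.path.expanduser(raw)

os.environ["VLLM_OBJ_GRAPH_DIR"] = "~/vllm_obj_graph"
print(resolve_obj_graph_dir())  # e.g. /home/<user>/vllm_obj_graph
```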
6 changes: 6 additions & 0 deletions vllm/executor/executor_base.py
@@ -199,6 +199,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.collective_rpc("stop_profile")

    def start_object_graph(self) -> None:
        self.collective_rpc("start_object_graph")

    def stop_object_graph(self) -> None:
        self.collective_rpc("stop_object_graph")

    def sleep(self, level: int = 1):
        if self.is_sleeping:
            logger.warning("Executor is already sleeping.")
102 changes: 102 additions & 0 deletions vllm/utils.py
@@ -2929,3 +2929,105 @@ def is_torch_equal_or_newer(target: str) -> bool:
def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
    torch_version = version.parse(torch_version)
    return torch_version >= version.parse(target)

class GrowingMemoryObjGraph:
    """Track which object types grow between start() and stop() using
    objgraph, and dump reference graphs (.dot files) for the growing types."""

    def __init__(self):
        from vllm import envs
        if not envs.VLLM_OBJ_GRAPH_DIR:
            raise RuntimeError("VLLM_OBJ_GRAPH_DIR is not set.")
        self._obj_graph_dir = envs.VLLM_OBJ_GRAPH_DIR
        os.makedirs(self._obj_graph_dir, exist_ok=True)

        self._start_state = False

    def start(self) -> str:
        import objgraph

        gc.collect()
        # Record the baseline object counts; the next call to
        # objgraph.growth() reports the delta against this snapshot.
        objgraph.growth()
        self._start_state = True
        self.start_time = time.time()
        return "started object graph growth statistics"

    def stop(self) -> str:
        import objgraph

        if not self._start_state:
            msg = "object graph growth statistics have not been started"
            logger.warning(msg)
            return msg

        # Generate output filename with date
        current_date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # Create subdirectory for this analysis
        analysis_dir = os.path.join(self._obj_graph_dir,
                                    f"analysis_{current_date}")
        try:
            os.makedirs(analysis_dir, exist_ok=True)
        except OSError as e:
            logger.error("Failed to create directory %s: %s", analysis_dir, e)
            return f"Failed to create directory: {e}"

        output_lines = []
        start_time_formatted = datetime.datetime.fromtimestamp(
            self.start_time).strftime("%Y-%m-%d %H:%M:%S")
        current_time_formatted = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        output_lines.append(f"{'='*50}")
        output_lines.append(
            f"start {start_time_formatted}, current: {current_time_formatted}")
        output_lines.append(f"{'='*50}")

        gc.collect()
        growth_info = objgraph.growth()

        for gt in growth_info:
            output_lines.append(
                f"Growth type: {gt[0]}, Count: {gt[1]}, Growth amount: {gt[2]}"
            )

        for gt in growth_info:
            # Get the first object of this type
            try:
                obj = objgraph.by_type(gt[0])[0]
            except IndexError:
                logger.warning("Type %s has no available objects", gt[0])
                continue

            # Generate back reference graph
            objgraph.show_backrefs(
                obj,
                max_depth=10,
                too_many=5,
                filename=os.path.join(analysis_dir, f"{gt[0]}_backrefs.dot"),
            )

            # Generate reference graph
            objgraph.show_refs(
                obj,
                max_depth=10,
                too_many=5,
                filename=os.path.join(analysis_dir, f"{gt[0]}_refs.dot"),
            )

            # Generate reference chain to module
            objgraph.show_chain(
                objgraph.find_backref_chain(obj, objgraph.is_proper_module),
                filename=os.path.join(analysis_dir, f"{gt[0]}_chain.dot"),
            )

        output_file_path = os.path.join(analysis_dir,
                                        "growing_memory_stats.log")
        try:
            with open(output_file_path, 'w', encoding='utf-8') as f:
                for line in output_lines:
                    f.write(line + '\n')
        except OSError as e:
            logger.error("Failed to write to file %s: %s", output_file_path, e)
            return f"Failed to write to file: {e}"

        logger.info("object graph growth statistics completed, "
                    "output_lines: %s", output_lines)
        return "object graph growth statistics completed"