[Misc][tool] Add an object growth analysis tool to troubleshoot OOM issues #20024

Open
wants to merge 11 commits into base: main
52 changes: 52 additions & 0 deletions examples/offline_inference/simple_growthing_obj_graph.py
@@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import time

from vllm import LLM, SamplingParams

# Enable object graph analysis by setting environment variable
os.environ["VLLM_OBJ_GRAPH_DIR"] = "./vllm_obj_graph"

# Sample prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of artificial intelligence is",
]
# Create sampling parameters object
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create LLM instance
    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

    # Start object graph analysis
    llm.start_object_graph()

    # Generate text from prompts. The output is a list of RequestOutput objects
    # containing the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Stop object graph analysis
    llm.stop_object_graph()

    # Print output results
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Add buffer time to wait for background processes (if multiprocessing is
    # enabled) to complete writing object graph analysis output.
    time.sleep(10)
    print(f"Completed! Results saved to: {os.environ['VLLM_OBJ_GRAPH_DIR']}")
    print("You can check the generated files to analyze memory growth")


if __name__ == "__main__":
    main()
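Each run writes its results under `VLLM_OBJ_GRAPH_DIR`: one `analysis_<timestamp>` subdirectory per `stop_object_graph()` call, containing a `growing_memory_stats.log` summary plus `*_backrefs.dot`, `*_refs.dot`, and `*_chain.dot` graphs for each growing type. Below is a minimal sketch for rendering those graphs to PNG, assuming the Graphviz `dot` CLI is installed; the helper is illustrative and not part of this PR.

```python
# Illustrative helper (not part of this PR): render the emitted .dot graphs
# to PNG with the Graphviz "dot" CLI, which must be installed separately.
import subprocess
from pathlib import Path

graph_dir = Path("./vllm_obj_graph")  # same directory as VLLM_OBJ_GRAPH_DIR above

for dot_file in graph_dir.rglob("*.dot"):
    png_file = dot_file.with_suffix(".png")
    subprocess.run(["dot", "-Tpng", str(dot_file), "-o", str(png_file)],
                   check=True)
    print(f"Rendered {dot_file} -> {png_file}")
```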
1 change: 1 addition & 0 deletions requirements/common.txt
@@ -45,3 +45,4 @@ python-json-logger # Used by logging as per examples/others/logging_configuratio
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
objgraph # Required for memory debugging and object graph analysis
6 changes: 6 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -1162,6 +1162,12 @@ async def start_profile(self) -> None:
    async def stop_profile(self) -> None:
        self.engine.stop_profile()

    async def start_object_graph(self) -> None:
        self.engine.start_object_graph()

    async def stop_object_graph(self) -> None:
        self.engine.stop_object_graph()

    async def reset_mm_cache(self) -> None:
        self.engine.reset_mm_cache()

6 changes: 6 additions & 0 deletions vllm/engine/llm_engine.py
@@ -1854,6 +1854,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.model_executor.stop_profile()

    def start_object_graph(self) -> None:
        self.model_executor.start_object_graph()

    def stop_object_graph(self) -> None:
        self.model_executor.stop_object_graph()

    def sleep(self, level: int = 1) -> None:
        assert self.vllm_config.model_config.enable_sleep_mode, (
            "Sleep mode is not enabled in the model config")
6 changes: 5 additions & 1 deletion vllm/engine/multiprocessing/__init__.py
@@ -82,6 +82,9 @@ class RPCUProfileRequest(Enum):
    START_PROFILE = 1
    STOP_PROFILE = 2

class RPCUObjectGraphRequest(Enum):
    START_OBJECT_GRAPH = 1
    STOP_OBJECT_GRAPH = 2

class RPCResetMultiModalCacheRequest(Enum):
    RESET = 1
@@ -130,7 +133,8 @@ class RPCAdapterLoadedResponse:
RPCUProfileRequest, RPCLoadAdapterRequest,
RPCResetMultiModalCacheRequest,
RPCResetPrefixCacheRequest, RPCSleepRequest,
RPCWakeUpRequest, RPCIsSleepingRequest]
RPCWakeUpRequest, RPCIsSleepingRequest,
RPCUObjectGraphRequest]

REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
RPCIsSleepingResponse, RPCError]
15 changes: 15 additions & 0 deletions vllm/engine/multiprocessing/client.py
@@ -35,6 +35,7 @@
RPCResetPrefixCacheRequest,
RPCSleepRequest, RPCStartupRequest,
RPCStartupResponse,
RPCUObjectGraphRequest,
RPCUProfileRequest, RPCWakeUpRequest)
from vllm.engine.protocol import EngineClient
# yapf: enable
@@ -615,6 +616,20 @@ async def stop_profile(self) -> None:

        await self._send_one_way_rpc_request(
            request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket)

    async def start_object_graph(self) -> None:
        """Start object graph analysis on the engine"""

        await self._send_one_way_rpc_request(
            request=RPCUObjectGraphRequest.START_OBJECT_GRAPH,
            socket=self.input_socket)

    async def stop_object_graph(self) -> None:
        """Stop object graph analysis on the engine"""

        await self._send_one_way_rpc_request(
            request=RPCUObjectGraphRequest.STOP_OBJECT_GRAPH,
            socket=self.input_socket)

    async def reset_mm_cache(self) -> None:
        """Reset the multi-modal cache"""
12 changes: 12 additions & 0 deletions vllm/engine/multiprocessing/engine.py
@@ -27,6 +27,7 @@
RPCResetPrefixCacheRequest,
RPCSleepRequest, RPCStartupRequest,
RPCStartupResponse,
RPCUObjectGraphRequest,
RPCUProfileRequest, RPCWakeUpRequest)
# yapf: enable
from vllm.logger import init_logger
@@ -284,6 +285,11 @@ def handle_new_input(self):
                    self.wake_up(request.tags)
                elif isinstance(request, RPCIsSleepingRequest):
                    self._handle_is_sleeping_request(request)
                elif isinstance(request, RPCUObjectGraphRequest):
                    if request == RPCUObjectGraphRequest.START_OBJECT_GRAPH:
                        self.start_object_graph()
                    else:
                        self.stop_object_graph()
                else:
                    raise ValueError("Unknown RPCRequest Type: "
                                     f"{type(request)}")
@@ -416,6 +422,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.engine.stop_profile()

    def start_object_graph(self) -> None:
        self.engine.start_object_graph()

    def stop_object_graph(self) -> None:
        self.engine.stop_object_graph()

    def reset_mm_cache(self) -> bool:
        return self.engine.reset_mm_cache()

10 changes: 10 additions & 0 deletions vllm/engine/protocol.py
@@ -294,6 +294,16 @@ async def stop_profile(self) -> None:
"""Start profiling the engine"""
...

@abstractmethod
async def start_object_graph(self) -> None:
"""Start object graph the engine"""
...

@abstractmethod
async def stop_object_graph(self) -> None:
"""Stop object graph the engine"""
...

@abstractmethod
async def reset_mm_cache(self) -> None:
"""Reset the multi-modal cache"""
6 changes: 6 additions & 0 deletions vllm/entrypoints/llm.py
@@ -1329,6 +1329,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.llm_engine.stop_profile()

    def start_object_graph(self) -> None:
        self.llm_engine.start_object_graph()

    def stop_object_graph(self) -> None:
        self.llm_engine.stop_object_graph()

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return self.llm_engine.reset_prefix_cache(device)

20 changes: 20 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -972,6 +972,26 @@ async def stop_profile(raw_request: Request):
        return Response(status_code=200)


if envs.VLLM_OBJ_GRAPH_DIR:
    logger.warning(
        "Object graph analysis is enabled in the API server. This should "
        "ONLY be used for local development!")

    @router.post("/start_object_graph")
    async def start_object_graph(raw_request: Request):
        logger.info("Starting object graph analysis...")
        await engine_client(raw_request).start_object_graph()
        logger.info("Object graph analysis started.")
        return Response(status_code=200)

    @router.post("/stop_object_graph")
    async def stop_object_graph(raw_request: Request):
        logger.info("Stopping object graph analysis...")
        await engine_client(raw_request).stop_object_graph()
        logger.info("Object graph analysis stopped.")
        return Response(status_code=200)


if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
    logger.warning(
        "LoRA dynamic loading & unloading is enabled in the API server. "
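With the server launched under the new environment variable (e.g. `VLLM_OBJ_GRAPH_DIR=/tmp/vllm_obj_graph vllm serve facebook/opt-125m`), the endpoints can be driven over HTTP. The sketch below uses only the standard library; the `localhost:8000` address is an assumption (vLLM's default serving port), not something this PR configures.

```python
# Illustrative client (not part of this PR): toggle the new endpoints.
# Assumes the OpenAI-compatible server is listening on localhost:8000.
import urllib.request

BASE = "http://localhost:8000"

def post(path: str) -> int:
    req = urllib.request.Request(BASE + path, data=b"", method="POST")
    with urllib.request.urlopen(req) as resp:
        return resp.status

print(post("/start_object_graph"))  # 200 once analysis has started
# ... send some /v1/completions traffic to reproduce the suspected leak ...
print(post("/stop_object_graph"))   # 200; graphs written to VLLM_OBJ_GRAPH_DIR
```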
7 changes: 7 additions & 0 deletions vllm/envs.py
@@ -77,6 +77,7 @@
    VLLM_PLUGINS: Optional[list[str]] = None
    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_OBJ_GRAPH_DIR: Optional[str] = None
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
@@ -599,6 +600,12 @@ def get_vllm_port() -> Optional[int]:
lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

# Enables memory object graph tracking if set. Path to the directory where
# object graph files are saved. Note that it must be an absolute path.
"VLLM_OBJ_GRAPH_DIR":
lambda: (None if os.getenv("VLLM_OBJ_GRAPH_DIR", None) is None else os.path
.expanduser(os.getenv("VLLM_OBJ_GRAPH_DIR", "."))),

# If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
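The new entry mirrors `VLLM_TORCH_PROFILER_DIR`: unset means disabled (`None`), otherwise the value goes through `os.path.expanduser`. A stand-alone sketch of that resolution rule; the helper name is made up for illustration.

```python
# Stand-alone sketch of the resolution rule added to vllm/envs.py.
import os
from typing import Optional

def resolve_obj_graph_dir() -> Optional[str]:
    raw = os.getenv("VLLM_OBJ_GRAPH_DIR", None)
    return None if raw is None else os.path.expanduser(raw)

os.environ["VLLM_OBJ_GRAPH_DIR"] = "~/vllm_obj_graph"
print(resolve_obj_graph_dir())  # e.g. /home/<user>/vllm_obj_graph
```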
6 changes: 6 additions & 0 deletions vllm/executor/executor_base.py
@@ -199,6 +199,12 @@ def start_profile(self) -> None:
    def stop_profile(self) -> None:
        self.collective_rpc("stop_profile")

    def start_object_graph(self) -> None:
        self.collective_rpc("start_object_graph")

    def stop_object_graph(self) -> None:
        self.collective_rpc("stop_object_graph")

    def sleep(self, level: int = 1):
        if self.is_sleeping:
            logger.warning("Executor is already sleeping.")
102 changes: 102 additions & 0 deletions vllm/utils.py
@@ -2929,3 +2929,105 @@ def is_torch_equal_or_newer(target: str) -> bool:
def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
    torch_version = version.parse(torch_version)
    return torch_version >= version.parse(target)

class GrowingMemoryObjGraph:
    """Track which object types grow between start() and stop() using
    objgraph, and dump reference graphs (.dot files) for the growing types."""

    def __init__(self):
        from vllm import envs
        if not envs.VLLM_OBJ_GRAPH_DIR:
            raise RuntimeError("VLLM_OBJ_GRAPH_DIR is not set.")
        self._obj_graph_dir = envs.VLLM_OBJ_GRAPH_DIR
        os.makedirs(self._obj_graph_dir, exist_ok=True)

        self._start_state = False

    def start(self) -> str:
        import objgraph

        gc.collect()
        # Record the baseline object counts; the next call to
        # objgraph.growth() reports the delta against this snapshot.
        objgraph.growth()
        self._start_state = True
        self.start_time = time.time()
        return "started object graph growth statistics"

    def stop(self) -> str:
        import objgraph

        if not self._start_state:
            msg = "object graph growth statistics have not been started"
            logger.warning(msg)
            return msg

        # Generate output filename with date
        current_date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # Create subdirectory for this analysis
        analysis_dir = os.path.join(self._obj_graph_dir,
                                    f"analysis_{current_date}")
        try:
            os.makedirs(analysis_dir, exist_ok=True)
        except OSError as e:
            logger.error("Failed to create directory %s: %s", analysis_dir, e)
            return f"Failed to create directory: {e}"

        output_lines = []
        start_time_formatted = datetime.datetime.fromtimestamp(
            self.start_time).strftime("%Y-%m-%d %H:%M:%S")
        current_time_formatted = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        output_lines.append(f"{'='*50}")
        output_lines.append(
            f"start {start_time_formatted}, current: {current_time_formatted}")
        output_lines.append(f"{'='*50}")

        gc.collect()
        growth_info = objgraph.growth()

        for gt in growth_info:
            output_lines.append(
                f"Growth type: {gt[0]}, Count: {gt[1]}, Growth amount: {gt[2]}"
            )

        for gt in growth_info:
            # Get the first object of this type
            try:
                obj = objgraph.by_type(gt[0])[0]
            except IndexError:
                logger.warning("Type %s has no available objects", gt[0])
                continue

            # Generate back reference graph
            objgraph.show_backrefs(
                obj,
                max_depth=10,
                too_many=5,
                filename=os.path.join(analysis_dir, f"{gt[0]}_backrefs.dot"),
            )

            # Generate reference graph
            objgraph.show_refs(
                obj,
                max_depth=10,
                too_many=5,
                filename=os.path.join(analysis_dir, f"{gt[0]}_refs.dot"),
            )

            # Generate reference chain to module
            objgraph.show_chain(
                objgraph.find_backref_chain(obj, objgraph.is_proper_module),
                filename=os.path.join(analysis_dir, f"{gt[0]}_chain.dot"),
            )

        output_file_path = os.path.join(analysis_dir,
                                        "growing_memory_stats.log")
        try:
            with open(output_file_path, 'w', encoding='utf-8') as f:
                for line in output_lines:
                    f.write(line + '\n')
        except OSError as e:
            logger.error("Failed to write to file %s: %s", output_file_path, e)
            return f"Failed to write to file: {e}"

        logger.info("object graph growth statistics completed, "
                    "output_lines: %s", output_lines)
        return "object graph growth statistics completed"