Commit 80299e0

feat: support offline expert load distribution recording

Signed-off-by: Jade Zheng <[email protected]>
1 parent 3d330c4

File tree

7 files changed, +89 -1 lines changed

vllm/engine/protocol.py

Lines changed: 15 additions & 0 deletions

```diff
@@ -284,6 +284,21 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         """Stop profiling the engine"""
         ...
+
+    @abstractmethod
+    async def start_expert_distribution_record(self) -> None:
+        """Start recording expert distribution"""
+        ...
+
+    @abstractmethod
+    async def stop_expert_distribution_record(self) -> None:
+        """Stop recording expert distribution"""
+        ...
+
+    @abstractmethod
+    async def dump_expert_distribution_record(self) -> None:
+        """Dump expert distribution record"""
+        ...
 
     @abstractmethod
     async def reset_mm_cache(self) -> None:
```
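Any concrete engine client now has to provide these three coroutines. Below is a minimal sketch of what satisfying the new contract looks like, with a trimmed stand-in for the real `EngineClient` ABC and a hypothetical `NoopClient`; neither class is part of the commit.

```python
# Minimal sketch, not part of the commit: a trimmed stand-in for the
# EngineClient ABC plus a hypothetical stub satisfying the new contract.
import asyncio
from abc import ABC, abstractmethod


class EngineClientSketch(ABC):
    @abstractmethod
    async def start_expert_distribution_record(self) -> None: ...

    @abstractmethod
    async def stop_expert_distribution_record(self) -> None: ...

    @abstractmethod
    async def dump_expert_distribution_record(self) -> None: ...


class NoopClient(EngineClientSketch):
    """Hypothetical client that logs instead of recording."""

    async def start_expert_distribution_record(self) -> None:
        print("recording started")

    async def stop_expert_distribution_record(self) -> None:
        print("recording stopped")

    async def dump_expert_distribution_record(self) -> None:
        print("record dumped")


asyncio.run(NoopClient().start_expert_distribution_record())
```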

vllm/entrypoints/openai/api_server.py

Lines changed: 35 additions & 0 deletions

```diff
@@ -972,6 +972,41 @@ async def stop_profile(raw_request: Request):
     return Response(status_code=200)
 
 
+if envs.VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR:
+    @router.get("/start_expert_distribution_record")
+    async def start_expert_distribution_record(raw_request: Request):
+        """Start recording the expert distribution. Clear the previous record if any."""
+        logger.info("Starting expert distribution record...")
+        await engine_client(raw_request).start_expert_distribution_record()
+        logger.info("Expert distribution record started.")
+        return Response(
+            content="Start recording the expert distribution.\n",
+            status_code=200,
+        )
+
+    @router.get("/stop_expert_distribution_record")
+    async def stop_expert_distribution_record(raw_request: Request):
+        """Stop recording the expert distribution."""
+        logger.info("Stopping expert distribution record...")
+        await engine_client(raw_request).stop_expert_distribution_record()
+        logger.info("Expert distribution record stopped.")
+        return Response(
+            content="Stop recording the expert distribution.\n",
+            status_code=200,
+        )
+
+    @router.get("/dump_expert_distribution_record")
+    async def dump_expert_distribution_record(raw_request: Request):
+        """Dump expert distribution record."""
+        logger.info("Dumping expert distribution record...")
+        await engine_client(raw_request).dump_expert_distribution_record()
+        logger.info("Expert distribution record dumped.")
+        return Response(
+            content="Dump expert distribution record.\n",
+            status_code=200,
+        )
+
+
 if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     logger.warning(
         "LoRA dynamic loading & unloading is enabled in the API server. "
```

vllm/envs.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -128,7 +128,7 @@
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
-
+    VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR: Optional[str] = None
 
 def get_default_cache_root():
     return os.getenv(
@@ -879,6 +879,11 @@ def get_vllm_port() -> Optional[int]:
     # processes via zmq.
     "VLLM_MQ_MAX_CHUNK_BYTES_MB":
     lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
+
+    # Directory to store expert distribution recorder files.
+    "VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR":
+    lambda: os.path.expanduser(
+        os.getenv("VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR") or "") or None,
 }
 
 # --8<-- [end:env-vars-definition]
```
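The resolver is evaluated lazily, each time `envs.VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR` is read, and the `or` fallbacks keep it `None` when the variable is unset, since `os.path.expanduser(None)` would raise a `TypeError`. A standalone check of that behavior; the path value is illustrative and only the variable name comes from the commit:

```python
# Standalone check of the resolver's guard behavior; the path is
# illustrative, and only the variable name comes from the commit.
import os


def resolver():
    # Mirrors the lambda registered in envs.py.
    return os.path.expanduser(
        os.getenv("VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR") or "") or None


os.environ.pop("VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR", None)
print(resolver())  # None: the unset case stays None instead of raising

os.environ["VLLM_EXPERT_DISTRIBUTION_RECORDER_DIR"] = "~/eplb_records"
print(resolver())  # e.g. /home/user/eplb_records ("~" is expanded)
```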

vllm/v1/engine/async_llm.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -498,6 +498,15 @@ async def reset_mm_cache(self) -> None:
         self.processor.mm_input_cache_client.reset()
         await self.engine_core.reset_mm_cache_async()
 
+    async def start_expert_distribution_record(self):
+        await self.engine_core.expert_distribution_record_async(is_start=True)
+
+    async def stop_expert_distribution_record(self):
+        await self.engine_core.expert_distribution_record_async(is_start=False)
+
+    async def dump_expert_distribution_record(self):
+        await self.engine_core.dump_expert_distribution_record_async()
+
     async def reset_prefix_cache(self,
                                  device: Optional[Device] = None) -> None:
         if device == Device.CPU:
```
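`AsyncLLM` folds the start/stop pair into a single flagged call on the engine core, mirroring how `profile(is_start)` is already plumbed. A self-contained sketch of that folding pattern, with a hypothetical in-process stub in place of the real engine-core client:

```python
# Sketch of the start/stop folding pattern; CoreStub is a hypothetical
# stand-in for the real engine-core client.
import asyncio


class CoreStub:
    async def expert_distribution_record_async(self, is_start: bool) -> None:
        print("recording on" if is_start else "recording off")


class Frontend:
    def __init__(self) -> None:
        self.engine_core = CoreStub()

    async def start_expert_distribution_record(self) -> None:
        await self.engine_core.expert_distribution_record_async(is_start=True)

    async def stop_expert_distribution_record(self) -> None:
        await self.engine_core.expert_distribution_record_async(is_start=False)


async def main() -> None:
    frontend = Frontend()
    await frontend.start_expert_distribution_record()
    await frontend.stop_expert_distribution_record()


asyncio.run(main())
```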

vllm/v1/engine/core.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -302,6 +302,12 @@ def reset_mm_cache(self):
 
         self.mm_input_cache_server.reset()
 
+    def expert_distribution_record(self, is_start: bool) -> None:
+        self.model_executor.expert_distribution_record(is_start)
+
+    def dump_expert_distribution_record(self) -> None:
+        self.model_executor.dump_expert_distribution_record()
+
     def reset_prefix_cache(self):
         self.scheduler.reset_prefix_cache()
```
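`EngineCore` only forwards to the model executor; the worker-side recorder that ultimately counts expert activations is not part of this commit. A hedged sketch of a recorder with the same start/stop/dump shape (the class, counting logic, and JSON output are all assumptions, only the three entry points mirror the commit):

```python
# Hypothetical worker-side recorder matching the start/stop/dump shape;
# counting logic and output format are assumptions, not the commit's code.
import json
import time
from collections import Counter
from pathlib import Path


class ExpertDistributionRecorder:
    def __init__(self, output_dir: str) -> None:
        self._dir = Path(output_dir)
        self._counts = Counter()  # expert id -> activation count
        self._recording = False

    def record(self, is_start: bool) -> None:
        if is_start:
            self._counts.clear()  # starting clears the previous record
        self._recording = is_start

    def on_experts_selected(self, expert_ids: list) -> None:
        if self._recording:
            self._counts.update(expert_ids)

    def dump(self) -> None:
        self._dir.mkdir(parents=True, exist_ok=True)
        out = self._dir / f"expert_distribution_{int(time.time())}.json"
        out.write_text(json.dumps(dict(self._counts)))


rec = ExpertDistributionRecorder("/tmp/eplb_records")
rec.record(is_start=True)
rec.on_experts_selected([0, 3, 3, 7])  # e.g. per-token top-k expert ids
rec.record(is_start=False)
rec.dump()
```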

vllm/v1/engine/core_client.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -105,6 +105,12 @@ def add_request(self, request: EngineCoreRequest) -> None:
 
     def profile(self, is_start: bool = True) -> None:
         raise NotImplementedError
+
+    def expert_distribution_record(self, is_start: bool) -> None:
+        raise NotImplementedError
+
+    def dump_expert_distribution_record(self) -> None:
+        raise NotImplementedError
 
     def reset_mm_cache(self) -> None:
         raise NotImplementedError
@@ -857,6 +863,12 @@ async def profile_async(self, is_start: bool = True) -> None:
     async def reset_mm_cache_async(self) -> None:
         await self.call_utility_async("reset_mm_cache")
 
+    async def expert_distribution_record_async(self, is_start: bool) -> None:
+        await self.call_utility_async("expert_distribution_record", is_start)
+
+    async def dump_expert_distribution_record_async(self) -> None:
+        await self.call_utility_async("dump_expert_distribution_record")
+
     async def reset_prefix_cache_async(self) -> None:
         await self.call_utility_async("reset_prefix_cache")
```
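`call_utility_async` ships a method name plus arguments to the engine-core process, which resolves the method by name (the real transport is zmq). A simplified in-process sketch of that name-based dispatch; both stub classes are hypothetical:

```python
# Simplified in-process sketch of name-based utility dispatch; the real
# client serializes these calls over zmq to the EngineCore process.
import asyncio
from typing import Any


class FakeCore:
    def expert_distribution_record(self, is_start: bool) -> None:
        print(f"expert_distribution_record(is_start={is_start})")

    def dump_expert_distribution_record(self) -> None:
        print("dump_expert_distribution_record()")


class InProcClient:
    def __init__(self, core: FakeCore) -> None:
        self._core = core

    async def call_utility_async(self, method: str, *args: Any) -> Any:
        # Resolve the utility by name and invoke it, as the core side does.
        return getattr(self._core, method)(*args)


async def main() -> None:
    client = InProcClient(FakeCore())
    await client.call_utility_async("expert_distribution_record", True)
    await client.call_utility_async("dump_expert_distribution_record")


asyncio.run(main())
```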

vllm/v1/executor/abstract.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -95,6 +95,12 @@ def max_concurrent_batches(self) -> int:
     def profile(self, is_start: bool = True):
         self.collective_rpc("profile", args=(is_start, ))
 
+    def expert_distribution_record(self, is_start: bool):
+        self.collective_rpc("expert_distribution_record",
+                            args=(is_start, ))
+
+    def dump_expert_distribution_record(self):
+        self.collective_rpc("dump_expert_distribution_record")
 
 class UniProcExecutor(UniProcExecutorV0, Executor):
     pass
```
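`collective_rpc` fans a named method out to every worker, so each rank records (and later dumps) its own expert loads. A minimal sketch of the fan-out pattern with hypothetical stand-in workers:

```python
# Minimal sketch of the collective_rpc fan-out; Worker is a hypothetical
# stand-in for vLLM's per-GPU workers.


class Worker:
    def __init__(self, rank: int) -> None:
        self.rank = rank

    def expert_distribution_record(self, is_start: bool) -> None:
        print(f"rank {self.rank}: recording {'on' if is_start else 'off'}")


class MiniExecutor:
    def __init__(self, workers: list) -> None:
        self.workers = workers

    def collective_rpc(self, method: str, args: tuple = ()) -> list:
        # Invoke the named method on every worker and gather the results.
        return [getattr(worker, method)(*args) for worker in self.workers]


executor = MiniExecutor([Worker(0), Worker(1)])
executor.collective_rpc("expert_distribution_record", args=(True, ))
```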
