Skip to content

Commit bc7c9f5

Browse files
authored
Fixes distributed setup in benchmarking scripts (isaac-sim#2194)
# Description Previously, benchmark scripts were stopping the benchmark outside of the global rank check and this occasionally caused issues on processes with global ranks > 0. This change moves the call to be inside the if statement such that it is only called on the rank 0 process. ## Type of change <!-- As you go through the list, delete the ones that are not applicable. --> - Bug fix (non-breaking change which fixes an issue) ## Screenshots Please attach before and after screenshots of the change if applicable. <!-- Example: | Before | After | | ------ | ----- | | _gif/png before_ | _gif/png after_ | To upload images to a PR -- simply drag and drop an image while in edit mode and it should upload the image directly. You can then paste that source into the above before/after sections. --> ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format` - [x] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [ ] I have added my name to the `CONTRIBUTORS.md` or my name already exists there <!-- As you go through the checklist above, you can mark something as done by putting an x character in it For example, - [x] I have done this task - [ ] I have not done this task -->
1 parent d41c5a9 commit bc7c9f5

File tree

3 files changed

+32
-29
lines changed

3 files changed

+32
-29
lines changed

scripts/benchmarks/benchmark_non_rl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
193193
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
194194
log_runtime_step_times(benchmark, environment_step_times, compute_stats=True)
195195

196-
benchmark.stop()
196+
benchmark.stop()
197197

198198
# close the simulator
199199
env.close()

scripts/benchmarks/benchmark_rlgames.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
248248
log_rl_policy_rewards(benchmark, log_data["rewards/iter"])
249249
log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"])
250250

251-
benchmark.stop()
251+
benchmark.stop()
252252

253253
# close the simulator
254254
env.close()

scripts/benchmarks/benchmark_rsl_rl.py

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
142142
env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device
143143

144144
# multi-gpu training configuration
145+
world_rank = 0
145146
if args_cli.distributed:
146147
env_cfg.sim.device = f"cuda:{app_launcher.local_rank}"
147148
agent_cfg.device = f"cuda:{app_launcher.local_rank}"
@@ -150,6 +151,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
150151
seed = agent_cfg.seed + app_launcher.local_rank
151152
env_cfg.seed = seed
152153
agent_cfg.seed = seed
154+
world_rank = app_launcher.global_rank
153155

154156
# specify directory for logging experiments
155157
log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
@@ -211,34 +213,35 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
211213
# run training
212214
runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
213215

214-
benchmark.store_measurements()
215-
216-
# parse tensorboard file stats
217-
log_data = parse_tf_logs(log_dir)
216+
if world_rank == 0:
217+
benchmark.store_measurements()
218+
219+
# parse tensorboard file stats
220+
log_data = parse_tf_logs(log_dir)
221+
222+
# prepare RL timing dict
223+
collection_fps = (
224+
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
225+
)
226+
rl_training_times = {
227+
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
228+
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
229+
"Collection FPS": collection_fps.tolist(),
230+
"Total FPS": log_data["Perf/total_fps"],
231+
}
218232

219-
# prepare RL timing dict
220-
collection_fps = (
221-
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
222-
)
223-
rl_training_times = {
224-
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
225-
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
226-
"Collection FPS": collection_fps.tolist(),
227-
"Total FPS": log_data["Perf/total_fps"],
228-
}
229-
230-
# log additional metrics to benchmark services
231-
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
232-
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
233-
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
234-
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
235-
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
236-
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
237-
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
238-
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
239-
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
240-
241-
benchmark.stop()
233+
# log additional metrics to benchmark services
234+
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
235+
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
236+
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
237+
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
238+
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
239+
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
240+
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
241+
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
242+
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
243+
244+
benchmark.stop()
242245

243246
# close the simulator
244247
env.close()

0 commit comments

Comments
 (0)