Skip to content

Commit bc7c9f5

Browse files
authored
Fixes distributed setup in benchmarking scripts (isaac-sim#2194)
# Description Previously, benchmark scripts were stopping the benchmark outside of the global rank check and this occasionally caused issues on processes with global ranks > 0. This change moves the call to be inside the if statement such that it is only called on the rank 0 process. ## Type of change <!-- As you go through the list, delete the ones that are not applicable. --> - Bug fix (non-breaking change which fixes an issue) ## Screenshots Please attach before and after screenshots of the change if applicable. <!-- Example: | Before | After | | ------ | ----- | | _gif/png before_ | _gif/png after_ | To upload images to a PR -- simply drag and drop an image while in edit mode and it should upload the image directly. You can then paste that source into the above before/after sections. --> ## Checklist - [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format` - [x] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file - [ ] I have added my name to the `CONTRIBUTORS.md` or my name already exists there <!-- As you go through the checklist above, you can mark something as done by putting an x character in it For example, - [x] I have done this task - [ ] I have not done this task -->
1 parent d41c5a9 commit bc7c9f5

File tree

3 files changed

+32
-29
lines changed

3 files changed

+32
-29
lines changed

scripts/benchmarks/benchmark_non_rl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
193193
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
194194
log_runtime_step_times(benchmark, environment_step_times, compute_stats=True)
195195

196-
benchmark.stop()
196+
benchmark.stop()
197197

198198
# close the simulator
199199
env.close()

scripts/benchmarks/benchmark_rlgames.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
248248
log_rl_policy_rewards(benchmark, log_data["rewards/iter"])
249249
log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"])
250250

251-
benchmark.stop()
251+
benchmark.stop()
252252

253253
# close the simulator
254254
env.close()

scripts/benchmarks/benchmark_rsl_rl.py

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
142142
env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device
143143

144144
# multi-gpu training configuration
145+
world_rank = 0
145146
if args_cli.distributed:
146147
env_cfg.sim.device = f"cuda:{app_launcher.local_rank}"
147148
agent_cfg.device = f"cuda:{app_launcher.local_rank}"
@@ -150,6 +151,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
150151
seed = agent_cfg.seed + app_launcher.local_rank
151152
env_cfg.seed = seed
152153
agent_cfg.seed = seed
154+
world_rank = app_launcher.global_rank
153155

154156
# specify directory for logging experiments
155157
log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
@@ -211,34 +213,35 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
211213
# run training
212214
runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
213215

214-
benchmark.store_measurements()
215-
216-
# parse tensorboard file stats
217-
log_data = parse_tf_logs(log_dir)
216+
if world_rank == 0:
217+
benchmark.store_measurements()
218+
219+
# parse tensorboard file stats
220+
log_data = parse_tf_logs(log_dir)
221+
222+
# prepare RL timing dict
223+
collection_fps = (
224+
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
225+
)
226+
rl_training_times = {
227+
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
228+
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
229+
"Collection FPS": collection_fps.tolist(),
230+
"Total FPS": log_data["Perf/total_fps"],
231+
}
218232

219-
# prepare RL timing dict
220-
collection_fps = (
221-
1 / (np.array(log_data["Perf/collection time"])) * env.unwrapped.num_envs * agent_cfg.num_steps_per_env
222-
)
223-
rl_training_times = {
224-
"Collection Time": (np.array(log_data["Perf/collection time"]) / 1000).tolist(),
225-
"Learning Time": (np.array(log_data["Perf/learning_time"]) / 1000).tolist(),
226-
"Collection FPS": collection_fps.tolist(),
227-
"Total FPS": log_data["Perf/total_fps"],
228-
}
229-
230-
# log additional metrics to benchmark services
231-
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
232-
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
233-
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
234-
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
235-
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
236-
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
237-
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
238-
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
239-
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
240-
241-
benchmark.stop()
233+
# log additional metrics to benchmark services
234+
log_app_start_time(benchmark, (app_start_time_end - app_start_time_begin) / 1e6)
235+
log_python_imports_time(benchmark, (imports_time_end - imports_time_begin) / 1e6)
236+
log_task_start_time(benchmark, (task_startup_time_end - task_startup_time_begin) / 1e6)
237+
log_scene_creation_time(benchmark, Timer.get_timer_info("scene_creation") * 1000)
238+
log_simulation_start_time(benchmark, Timer.get_timer_info("simulation_start") * 1000)
239+
log_total_start_time(benchmark, (task_startup_time_end - app_start_time_begin) / 1e6)
240+
log_runtime_step_times(benchmark, rl_training_times, compute_stats=True)
241+
log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"])
242+
log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"])
243+
244+
benchmark.stop()
242245

243246
# close the simulator
244247
env.close()

0 commit comments

Comments
 (0)