From a331692e48bf6935473d650da4ae2b303e4392df Mon Sep 17 00:00:00 2001
From: Shikai Li
Date: Mon, 28 Apr 2025 07:48:20 -0700
Subject: [PATCH] Use cudaMemsetAsync to setup IndexShuffling kernel. (#4016)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/4016

X-link: https://github.com/facebookresearch/FBGEMM/pull/1104

It is too expensive to launch an ATen kernel to do setup. Use cudaMemsetAsync instead.

hipMemsetAsync is somehow more expensive than launching a kernel. Avoid doing so for now.

Reviewed By: Alkaid-Benetnash

Differential Revision: D73602755
---
 .../gen_ai/src/moe/index_shuffling.cu | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/experimental/gen_ai/src/moe/index_shuffling.cu b/fbgemm_gpu/experimental/gen_ai/src/moe/index_shuffling.cu
index c9ea073144..3e0d7abdb7 100644
--- a/fbgemm_gpu/experimental/gen_ai/src/moe/index_shuffling.cu
+++ b/fbgemm_gpu/experimental/gen_ai/src/moe/index_shuffling.cu
@@ -276,7 +276,23 @@ std::tuple index_shuffling_torch(
   at::Tensor shuffled_expert_indices = allocate_index_tensor(num_tokens);
   at::Tensor shuffled_token_indices = allocate_index_tensor(num_tokens);
 
+#ifdef USE_ROCM
   counts.zero_();
+  // TODO(shikaili): hipMemsetAsync is more expensive than ATen set zero.
+  /*
+  hipMemsetAsync(
+      counts.data_ptr(),
+      0,
+      counts.numel() * counts.dtype().itemsize(),
+      at::cuda::getCurrentCUDAStream());
+  */
+#else
+  cudaMemsetAsync(
+      counts.data_ptr(),
+      0,
+      counts.numel() * counts.dtype().itemsize(),
+      at::cuda::getCurrentCUDAStream());
+#endif
 
   // Avoid expensive `cudaGetDeviceProperties` call.
   if (num_sms < 0) {
@@ -298,7 +314,6 @@ std::tuple index_shuffling_torch(
       kernel = (void*)index_shuffling_kernel; \
       smem_size = sizeof(SharedStorage);
 
-  int num_tokens_per_tile;
   if (num_experts == 16) {
     DISPATCH(16, kNumTokensPerTile);
   } else {
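
Below is a minimal standalone sketch (not part of the patch) of the pattern the change uses: clear a small contiguous tensor with an async memset on the current stream on CUDA, and fall back to the ATen `zero_()` kernel on ROCm, where `hipMemsetAsync` was observed to be slower. The wrapper name `zero_counts` is hypothetical; the headers shown are assumptions about how the file pulls in ATen and the CUDA runtime.

```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>

// Hypothetical helper mirroring the patch: zero out `counts` before the
// IndexShuffling kernel runs, without paying for an ATen kernel launch on CUDA.
void zero_counts(at::Tensor& counts) {
#ifdef USE_ROCM
  // On ROCm, hipMemsetAsync was measured to be more expensive than the ATen
  // elementwise fill, so keep launching the zero-fill kernel for now.
  counts.zero_();
#else
  // On CUDA, enqueue an async memset on the current stream; this avoids a
  // full kernel launch just to clear a small buffer, and stays ordered with
  // the subsequent IndexShuffling kernel on the same stream.
  cudaMemsetAsync(
      counts.data_ptr(),
      0,
      counts.numel() * counts.dtype().itemsize(),
      at::cuda::getCurrentCUDAStream());
#endif
}
```

Because `cudaMemsetAsync` is issued on the same stream that later launches the shuffling kernel, no extra synchronization is needed; the memset simply replaces the kernel launch that `counts.zero_()` would otherwise perform.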