Fix shared memory check for HIP (#4044)

q10 · facebook-github-bot · commit f7ca2992cc8c · 2025-04-29T19:02:13.000-07:00
Summary: Pull Request resolved: #4044; X-link: facebookresearch/FBGEMM#1128. Update shared memory checks on HIP to use sharedMemPerBlock instead of sharedMemPerBlockOptin, since the latter is not supported on HIP. Reviewed By: sryap; Differential Revision: D73868502; fbshipit-source-id: 1f83323af696007cbc6a33ad3ed65ff8184d4156
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh
@@ -181,9 +181,20 @@ struct KernelLauncher {
       const cudaDeviceProp& properties,
       const size_t shared_mem_per_block) const {
     // NOTE: sharedMemPerBlockOptin is the maximum possible shared memory that
-    // can be used per block by explicit special opt-in, and is larger than
-    // sharedMemPerBlock.
+    // can be used per block by explicit special opt-in, and is generally larger
+    // than sharedMemPerBlock.
+    //
+    // However, this feature does not exist in HIP at the moment, and while more
+    // recent versions of ROCm (6.4+?) set the value of sharedMemPerBlockOptin
+    // to be sharedMemPerBlock, older versions of ROCm set the value to zero.
+    //
+    // See:
+    //  https://github.com/ROCm/HIP/issues/3516
+#ifdef __HIP_PLATFORM_AMD__
+    const auto smem_limits = properties.sharedMemPerBlock;
+#else
     const auto smem_limits = properties.sharedMemPerBlockOptin;
+#endif
 
     TORCH_CHECK(
         shared_mem_per_block <= smem_limits,
diff --git a/fbgemm_gpu/test/utils/kernel_launcher_test.cu b/fbgemm_gpu/test/utils/kernel_launcher_test.cu
@@ -270,7 +270,8 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
       },
       std::exception);
 
-#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))
+#if defined(__HIP_PLATFORM_AMD__) || \
+    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))
   // Test max thread count
   EXPECT_THROW(
       {
@@ -296,8 +297,12 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
             tensor_sum_kernel<float>,
             8,
             1024,
-            // Requested shared memory size is too large
+    // Requested shared memory size is too large
+#ifdef __HIP_PLATFORM_AMD__
+            properties.sharedMemPerBlock + 1,
+#else
             properties.sharedMemPerBlockOptin + 1,
+#endif
             at::cuda::getCurrentCUDAStream(),
             PTA_B(C, float, 1, 64),
             PTA_B(A, float, 1, 64),