Change int32_t to uint32_t (parameters) and auto (loop indices) in WeightRow

q10 · facebook-github-bot · commit 68301c84cea5 · 2025-04-25T14:26:46.000-07:00
Summary:
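- In `WeightRow::warp_copy_to_cache` and `WeightRow::warp_evict_cache`, change the `dim_length`, `num_lanes`, and `lane_id` parameters from `int32_t` to `uint32_t`, and declare the loop indices in `warp_evict_cache` with `auto` so they take the same type as `lane_id`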

- Change the type of `kINT8QparamsBytes` from `float` to `int32_t`, since it is a byte count (8 bytes of qparams appended to each row)

Differential Revision: D73690651
diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_meta_template.cpp
@@ -33,7 +33,7 @@
 using namespace fbgemm_gpu;
 using Tensor = at::Tensor;
 
-[[maybe_unused]] static constexpr float kINT8QparamsBytes = 8;
+[[maybe_unused]] static constexpr int32_t kINT8QparamsBytes = 8;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel Definitions
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
@@ -81,11 +81,10 @@ static constexpr uint32_t kFullWarpMask = 0xff'ff'ff'ff;
 
 static constexpr float kQParamEps = 1e-8f;
 
-/* For rowwise int8 quantization, two quantization parameters (qparams)
-will be stored at the end of each row in FP32 formats, appending a total of
-8 bytes to each row.
-*/
-static constexpr float kINT8QparamsBytes = 8;
+// For rowwise int8 quantization, two quantization parameters (qparams) will be
+// stored at the end of each row in FP32 formats, appending a total of 8 bytes
+// to each row.
+static constexpr int32_t kINT8QparamsBytes = 8;
 
 template <typename T>
 DEVICE_INLINE T shfl_xor(
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh
@@ -214,9 +214,9 @@ struct WeightRow {
 
   DEVICE_INLINE void warp_copy_to_cache(
       cache_t* dst_row,
-      const int32_t dim_length,
-      const int32_t num_lanes,
-      const int32_t lane_id) {
+      const uint32_t dim_length,
+      const uint32_t num_lanes,
+      const uint32_t lane_id) {
     if constexpr (std::is_same_v<emb_t, cache_t>) {
       // No conversion required when emb_t and cache_t are the same type
       for (int32_t d = lane_id * 4; d < dim_length; d += num_lanes * 4) {
@@ -237,9 +237,9 @@ struct WeightRow {
   }
 
   DEVICE_INLINE void warp_evict_cache(
-      const int32_t dim_length,
-      const int32_t num_lanes,
-      const int32_t lane_id) {
+      const uint32_t dim_length,
+      const uint32_t num_lanes,
+      const uint32_t lane_id) {
     float2 qparams;
 
     if constexpr (std::is_same_v<emb_t, uint8_t>) {
@@ -248,7 +248,7 @@ struct WeightRow {
           std::numeric_limits<at::acc_type<cache_t, true>>::lowest();
 
       // Compute the qparams from the cache row (not embedding row) weights
-      for (int32_t d = lane_id; d * 4 < dim_length; d += num_lanes) {
+      for (auto d = lane_id; d * 4 < dim_length; d += num_lanes) {
         const auto cache_slice = load(d * 4, qparams); // qparams not used
         local_max = max(local_max, cache_slice.vmax());
         local_min = min(local_min, cache_slice.vmin());
@@ -263,7 +263,7 @@ struct WeightRow {
       }
     }
 
-    for (int32_t d = lane_id * 4; d < dim_length; d += num_lanes * 4) {
+    for (auto d = lane_id * 4; d < dim_length; d += num_lanes * 4) {
       // Evict the slice into the embedding row
       evict_cache(d, qparams);
     }