
Commit eff548e

q10 authored and facebook-github-bot committed
Optimize if-statements with if-constexpr (#4022)
Summary:
X-link: facebookresearch/FBGEMM#1110
Pull Request resolved: #4022

- Replace if-statements with if-constexpr blocks to optimize out some code branches completely. This is to facilitate `WeightRow` class cleanups.
- Replace uses of WeightRow with WeightRowAccessor where cache loading and eviction are not used.

Reviewed By: sryap, spcyppt

Differential Revision: D73678501
1 parent eeee38e commit eff548e
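For context on the optimization: `if constexpr` discards the branch that does not apply at compile time, so code guarded by a type trait never has to compile for template instantiations where it cannot run. A minimal standalone sketch of that effect, using illustrative types only (the struct names below are invented for this example and are not FBGEMM classes, though `load_qparams` mirrors the method name seen in the diffs):

#include <cstdio>
#include <type_traits>

// Illustrative types only: one row type exposes load_qparams(), the other does not.
struct QuantizedRow {
  float load_qparams() const { return 0.0625f; }
};
struct FloatRow {};

template <typename Row>
float read_scale(const Row& row) {
  // A plain `if` would require both branches to compile for every Row, so this
  // function could not be instantiated for FloatRow. `if constexpr` discards
  // the inapplicable branch at compile time instead.
  if constexpr (std::is_same_v<Row, QuantizedRow>) {
    return row.load_qparams();
  } else {
    return 1.0f;
  }
}

int main() {
  std::printf("%f %f\n", read_scale(QuantizedRow{}), read_scale(FloatRow{}));
  return 0;
}

This is the mechanism the commit relies on: branches guarded by `std::is_same_v<emb_t, uint8_t>` vanish from non-int8 instantiations instead of merely being skipped at runtime.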

File tree: 3 files changed, +32 −39 lines changed

fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu

+6 −20
@@ -214,7 +214,10 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
       {%- if not dense %}
       const auto {{ locs_or_addrs_idx }}_j = shfl_sync({{ locs_or_addrs_idx }}, j);
       {%- endif %}
+
       at::acc_type<cache_t, true> grad_indice_weight = 0.0;
+      [[maybe_unused]] const auto weight_row =
+          WeightRowAccessor<emb_t, at::acc_type<cache_t, true>>(&weights[offset_idx_j], D);

       #pragma unroll kFixedMaxVecsPerThread
       for (int32_t vec = 0;
@@ -241,32 +244,15 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
              weight.acc.z * grad_out[vec].acc.z +
              weight.acc.w * grad_out[vec].acc.w;
        } else {
-         auto weight_row = WeightRow<emb_t, cache_t, at::acc_type<cache_t, true>>(
-             &weights[offset_idx_j],
-             nullptr,
-             D);
-         float2 qparams;
-         if (std::is_same<emb_t, uint8_t>::value) {
-           qparams = weight_row.load_qparams();
-         }
-         Vec4TAcc<cache_t> weight =
-             weight_row.load(d, qparams);
+         const auto weight = weight_row.load(d);
          grad_indice_weight += weight.acc.x * grad_out[vec].acc.x +
              weight.acc.y * grad_out[vec].acc.y +
              weight.acc.z * grad_out[vec].acc.z +
              weight.acc.w * grad_out[vec].acc.w;
        }
        {%- else %}
-       auto weight_row = WeightRow<emb_t, cache_t, at::acc_type<cache_t, true>>(
-           &weights[offset_idx_j],
-           nullptr,
-           D);
-       float2 qparams;
-       if (std::is_same<emb_t, uint8_t>::value) {
-         qparams = weight_row.load_qparams();
-       }
-       Vec4TAcc<cache_t> weight =
-           weight_row.load(d, qparams);
+       const auto weight = weight_row.load(d);
+
        grad_indice_weight += weight.acc.x * grad_out[vec].acc.x +
            weight.acc.y * grad_out[vec].acc.y +
            weight.acc.z * grad_out[vec].acc.z +
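In the hunk above, the hoisted `weight_row` accessor lets both the cached and uncached branches call `weight_row.load(d)` with no per-call qparams handling. As a rough mental model only (the member names and row layout below are assumptions for illustration, not the actual `WeightRowAccessor` implementation), such an accessor can capture the row pointer and, for int8 rows, the quantization parameters once at construction:

#include <cstdint>
#include <cstring>
#include <type_traits>

// Hypothetical sketch of a read-only row accessor; not FBGEMM's WeightRowAccessor.
template <typename emb_t, typename acc_t>
struct RowAccessorSketch {
  const emb_t* row;
  int dim;
  float scale = 1.0f;  // only meaningful when emb_t == uint8_t
  float shift = 0.0f;

  RowAccessorSketch(const emb_t* row_, int dim_) : row(row_), dim(dim_) {
    if constexpr (std::is_same_v<emb_t, uint8_t>) {
      // Assumed layout for this sketch: qparams stored after the dim quantized bytes.
      std::memcpy(&scale, row + dim, sizeof(float));
      std::memcpy(&shift, row + dim + sizeof(float), sizeof(float));
    }
  }

  // Call sites just do load(d); dequantization happens here only when needed.
  acc_t load(int d) const {
    if constexpr (std::is_same_v<emb_t, uint8_t>) {
      return static_cast<acc_t>(row[d]) * scale + shift;
    } else {
      return static_cast<acc_t>(row[d]);
    }
  }
};

int main() {
  const float fp32_row[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  RowAccessorSketch<float, float> acc(fp32_row, 4);
  return acc.load(2) == 3.0f ? 0 : 1;
}

The `[[maybe_unused]]` on the real accessor is presumably there to silence unused-variable warnings in generated configurations whose branches never read it.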

fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh

+23 −19
@@ -57,7 +57,7 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
     const int32_t max_vecs_per_thread,
     {{ args.split_ref_kernel_args | replace_pta_namespace() | join(",\n ") }}
 ) {
-  constexpr auto kIsInt8 = std::is_same<emb_t, uint8_t>::value;
+  constexpr auto kIsInt8 = std::is_same_v<emb_t, uint8_t>;
   // Copy value to max_vecs to make max_vecs_per_thread known at compile time
   // when kUseVecBlocking == false
   const int32_t max_vecs =
@@ -107,8 +107,10 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
       threadIdx.x + run_id * blockDim.x);

   float2 qparams_template;
-  if (kIsInt8 && !cache_weights) {
-    qparams_template = weight_row_template.load_qparams();
+  if constexpr (kIsInt8) {
+    if (!cache_weights) {
+      qparams_template = weight_row_template.load_qparams();
+    }
   }

   {{ split_precomputation }}
@@ -142,23 +144,25 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
     )
   }}

-  if (kIsInt8 && !cache_weights) {
-    // Calculate new qparams after row update
-    qparams_new = thrust_find_qparams<at::acc_type<cache_t, true>>(
-        shared_weight_update_row, D);
-    weight_row_template.store_qparams(qparams_new);
+  if constexpr (kIsInt8) {
+    if (!cache_weights) {
+      // Calculate new qparams after row update
+      qparams_new = thrust_find_qparams<at::acc_type<cache_t, true>>(
+          shared_weight_update_row, D);
+      weight_row_template.store_qparams(qparams_new);

-    // Fetch cached updated row from shared mem and quantize on-the-fly
-    // when saving to lowp embedding
-    for (int32_t vec = 0;
-         (vec * kThreadGroupSize + threadIdx.x) * VEC_WIDTH < D;
-         ++vec) {
-      const auto d_vec = vec * kThreadGroupSize + threadIdx.x;
-      const int32_t d = d_vec * VEC_WIDTH;
-      weight_row_template.store(
-          shared_weight_update_row[d_vec],
-          d,
-          qparams_new);
+      // Fetch cached updated row from shared mem and quantize on-the-fly
+      // when saving to lowp embedding
+      for (int32_t vec = 0;
+           (vec * kThreadGroupSize + threadIdx.x) * VEC_WIDTH < D;
+           ++vec) {
+        const auto d_vec = vec * kThreadGroupSize + threadIdx.x;
+        const int32_t d = d_vec * VEC_WIDTH;
+        weight_row_template.store(
+            shared_weight_update_row[d_vec],
+            d,
+            qparams_new);
+      }
     }
   }
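The reshaped conditions above follow one pattern: a mixed check such as `kIsInt8 && !cache_weights` is split so the compile-time part becomes `if constexpr` and only the runtime part stays a branch. A standalone sketch of the pattern (hypothetical function name, not the kernel itself):

#include <cstdint>
#include <type_traits>

// Compile-time check outside, runtime check inside.
template <typename emb_t>
void update_row(bool cache_weights) {
  constexpr bool kIsInt8 = std::is_same_v<emb_t, uint8_t>;

  // Before: if (kIsInt8 && !cache_weights) { ... }
  // That body had to compile for every emb_t even though it can only execute
  // for uint8_t. With the split below, non-int8 instantiations discard the
  // body entirely, so int8-only calls need not even be valid code for them.
  if constexpr (kIsInt8) {
    if (!cache_weights) {
      // int8-only work goes here (e.g. recompute and store row qparams).
    }
  }
}

int main() {
  update_row<float>(false);    // int8 body discarded at compile time
  update_row<uint8_t>(false);  // runtime flag decides
}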

fbgemm_gpu/include/fbgemm_gpu/utils/weight_row.cuh

+3
@@ -84,20 +84,23 @@ DEVICE_INLINE void store_qparams_to_row(uint8_t* ptr, float2 qparams) {
   auto ptr_as_uint = reinterpret_cast<uintptr_t>(ptr);
   if (ptr_as_uint % 8 == 0) {
     *reinterpret_cast<float2*>(ptr) = qparams;
+
   } else if (ptr_as_uint % 4 == 0) {
     auto* ptr_float = reinterpret_cast<float*>(ptr);
     auto* qparam_ptr = reinterpret_cast<const float*>(&qparams.x);
 #pragma unroll
     for (int i = 0; i < 2; ++i) {
       ptr_float[i] = qparam_ptr[i];
     }
+
   } else if (ptr_as_uint % 2 == 0) {
     auto* ptr_16bit = reinterpret_cast<uint16_t*>(ptr);
     auto* qparam_ptr = reinterpret_cast<const uint16_t*>(&qparams.x);
 #pragma unroll
     for (int i = 0; i < 4; ++i) {
       ptr_16bit[i] = qparam_ptr[i];
     }
+
   } else {
     auto* qparam_ptr = reinterpret_cast<const uint8_t*>(&qparams.x);
 #pragma unroll
