
Commit 0040646

Siqiao Chen authored and facebook-github-bot committed
fix the type hack in dramKV wrapper (#4012)
Summary:
Pull Request resolved: #4012

X-link: facebookresearch/FBGEMM#1099

In the previous diff, we hard-coded the DRAM wrapper to float. In this diff, we allow the call site to customize the type of the weights stored in DRAM; FP32 and FP16 are currently supported.

Reviewed By: emlin

Differential Revision: D73477947

fbshipit-source-id: 25dda60b61a2a257e3548a7710df75ef4d6b9f88
1 parent 566f289 commit 0040646
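The core of the change is the constructor: instead of always instantiating DramKVEmbeddingCache<float>, the wrapper now holds a std::variant over the FP32 and FP16 instantiations and selects one from row_storage_bitwidth at construction time. Below is a minimal standalone sketch of that dispatch pattern, not the real implementation: Cache is a hypothetical stand-in for kv_mem::DramKVEmbeddingCache, and short stands in for the 16-bit at::Half so the snippet compiles without ATen.

#include <cstdint>
#include <cstdio>
#include <memory>
#include <stdexcept>
#include <variant>

// Hypothetical stand-in for kv_mem::DramKVEmbeddingCache<weight_type>,
// which actually lives in dram_kv_embedding_cache.h.
template <typename weight_type>
struct Cache {
  explicit Cache(int64_t max_D) : max_D(max_D) {}
  int64_t max_D;
};

// One variant alternative per supported weight type; `short` stands in
// for at::Half here.
using CacheVariant = std::variant<
    std::shared_ptr<Cache<float>>,
    std::shared_ptr<Cache<short>>>;

// Runtime dispatch on the requested bitwidth, mirroring the wrapper's
// new constructor.
CacheVariant make_cache(int64_t row_storage_bitwidth, int64_t max_D) {
  if (row_storage_bitwidth == 16) {
    return std::make_shared<Cache<short>>(max_D);
  }
  if (row_storage_bitwidth == 32) {
    return std::make_shared<Cache<float>>(max_D);
  }
  throw std::runtime_error("unsupported row_storage_bitwidth");
}

int main() {
  CacheVariant cache = make_cache(16, 128);
  // Every member call goes through std::visit: the generic lambda is
  // instantiated once per alternative, so no common virtual base is needed.
  std::visit(
      [](auto& ptr) {
        if (ptr) {
          std::printf("max_D = %lld\n", static_cast<long long>(ptr->max_D));
        }
      },
      cache);
  return 0;
}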

File tree

1 file changed (+92, -18 lines)

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h

Lines changed: 92 additions & 18 deletions
@@ -8,8 +8,15 @@
 
 #pragma once
 
+#include "../ssd_split_embeddings_cache/kv_tensor_wrapper.h"
 #include "dram_kv_embedding_cache.h"
 
+namespace {
+using DramKVEmbeddingCacheVariant = std::variant<
+    std::shared_ptr<kv_mem::DramKVEmbeddingCache<float>>,
+    std::shared_ptr<kv_mem::DramKVEmbeddingCache<at::Half>>>;
+}
+
 namespace kv_mem {
 
 class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
@@ -21,61 +28,128 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
       int64_t num_shards = 8,
       int64_t num_threads = 32,
       int64_t row_storage_bitwidth = 32,
-      int64_t weight_ttl_in_hours = 2)
-      : impl_(std::make_shared<kv_mem::DramKVEmbeddingCache<float>>(
-            max_D,
-            uniform_init_lower,
-            uniform_init_upper,
-            num_shards,
-            num_threads,
-            row_storage_bitwidth,
-            weight_ttl_in_hours)) {}
+      int64_t weight_ttl_in_hours = 2) {
+    if (row_storage_bitwidth == 16) {
+      impl_ = std::make_shared<kv_mem::DramKVEmbeddingCache<at::Half>>(
+          max_D,
+          uniform_init_lower,
+          uniform_init_upper,
+          num_shards,
+          num_threads,
+          row_storage_bitwidth,
+          weight_ttl_in_hours);
+    } else if (row_storage_bitwidth == 32) {
+      impl_ = std::make_shared<kv_mem::DramKVEmbeddingCache<float>>(
+          max_D,
+          uniform_init_lower,
+          uniform_init_upper,
+          num_shards,
+          num_threads,
+          row_storage_bitwidth,
+          weight_ttl_in_hours);
+    } else {
+      throw std::runtime_error("Failed to create recording device");
+    }
+  }
 
   void set_cuda(
       at::Tensor indices,
       at::Tensor weights,
       at::Tensor count,
       int64_t timestep,
       bool is_bwd) {
-    return impl_->set_cuda(indices, weights, count, timestep, is_bwd);
+    return std::visit(
+        [&indices, &weights, &count, &timestep](auto& ptr) {
+          if (ptr) {
+            ptr->set_cuda(indices, weights, count, timestep);
+          }
+        },
+        impl_);
   }
 
   void get_cuda(at::Tensor indices, at::Tensor weights, at::Tensor count) {
-    return impl_->get_cuda(indices, weights, count);
+    return std::visit(
+        [&indices, &weights, &count](auto& ptr) {
+          if (ptr) {
+            ptr->get_cuda(indices, weights, count);
+          }
+        },
+        impl_);
   }
 
   void set(at::Tensor indices, at::Tensor weights, at::Tensor count) {
-    return impl_->set(indices, weights, count);
+    return std::visit(
+        [&indices, &weights, &count](auto& ptr) {
+          if (ptr) {
+            ptr->set(indices, weights, count);
+          }
+        },
+        impl_);
   }
 
   void flush() {
-    return impl_->flush();
+    return std::visit(
+        [](auto& ptr) {
+          if (ptr) {
+            ptr->flush();
+          }
+        },
+        impl_);
   }
 
   void set_range_to_storage(
       const at::Tensor& weights,
       const int64_t start,
       const int64_t length) {
-    return impl_->set_range_to_storage(weights, start, length);
+    return std::visit(
+        [&weights, &start, &length](auto& ptr) {
+          if (ptr) {
+            ptr->set_range_to_storage(weights, start, length);
+          }
+        },
+        impl_);
   }
 
   void get(
       at::Tensor indices,
       at::Tensor weights,
       at::Tensor count,
       int64_t sleep_ms) {
-    return impl_->get(indices, weights, count, sleep_ms);
+    return std::visit(
+        [&indices, &weights, &count, sleep_ms](auto& ptr) {
+          if (ptr) {
+            ptr->get(indices, weights, count, sleep_ms);
+          }
+        },
+        impl_);
   }
 
   void wait_util_filling_work_done() {
-    return impl_->wait_util_filling_work_done();
+    return std::visit(
+        [](auto& ptr) {
+          if (ptr) {
+            ptr->wait_util_filling_work_done();
+          }
+        },
+        impl_);
   }
 
   at::Tensor get_keys_in_range(int64_t start, int64_t end) {
-    return impl_->get_keys_in_range(start, end);
+    return std::visit(
+        [&start, &end](auto& ptr) {
+          if (ptr) {
+            return ptr->get_keys_in_range(start, end);
+          }
+          return at::empty({0});
+        },
+        impl_);
   }
 
-  std::shared_ptr<kv_mem::DramKVEmbeddingCache<float>> impl_;
+ private:
+  // friend class EmbeddingRocksDBWrapper;
+  friend class ssd::KVTensorWrapper;
+
+  DramKVEmbeddingCacheVariant impl_;
 };
 
 } // namespace kv_mem
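One detail worth noting in get_keys_in_range above: when std::visit is used for its return value, every return path of the visitor must yield the same type for every variant alternative, which is why the null-pointer branch returns at::empty({0}) rather than falling through. A minimal standalone sketch of that constraint, with hypothetical impl types and std::vector<int64_t> standing in for at::Tensor:

#include <cstdint>
#include <memory>
#include <variant>
#include <vector>

// Two unrelated impl types standing in for the FP32/FP16 caches.
struct ImplA { std::vector<int64_t> keys{1, 2, 3}; };
struct ImplB { std::vector<int64_t> keys{4, 5}; };

using Impl = std::variant<std::shared_ptr<ImplA>, std::shared_ptr<ImplB>>;

// The visitor must return the same type on every path and for every
// alternative, hence the explicit empty-result fallback when the pointer
// is null (mirroring `return at::empty({0})` in the wrapper).
std::vector<int64_t> keys_or_empty(const Impl& impl) {
  return std::visit(
      [](const auto& ptr) -> std::vector<int64_t> {
        if (ptr) {
          return ptr->keys;
        }
        return {};
      },
      impl);
}

int main() {
  Impl impl = std::make_shared<ImplB>();
  return static_cast<int>(keys_or_empty(impl).size()); // exits with 2
}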
