From 19043319d9e7fc66e963cfe3ad7ee74fc248f7b6 Mon Sep 17 00:00:00 2001 From: Drew Hubley Date: Sat, 4 Jan 2025 14:34:16 -0400 Subject: [PATCH] Added skeleton of batch based GPU assignment --- CMakeLists.txt | 1 + include/xtensor/xassign.hpp | 14 +++++- include/xtensor/xcontainer.hpp | 16 +++++++ include/xtensor/xdevice.hpp | 86 ++++++++++++++++++++++++++++++++++ include/xtensor/xfunction.hpp | 20 ++++++++ include/xtensor/xmath.hpp | 54 +++++++++++++++------ include/xtensor/xoperation.hpp | 5 ++ test/CMakeLists.txt | 1 + test/test_xdevice_assign.cpp | 39 +++++++++++++++ 9 files changed, 220 insertions(+), 16 deletions(-) create mode 100644 include/xtensor/xdevice.hpp create mode 100644 test/test_xdevice_assign.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67c58f083..0eb7c079b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ set(XTENSOR_HEADERS ${XTENSOR_INCLUDE_DIR}/xtensor/xcomplex.hpp ${XTENSOR_INCLUDE_DIR}/xtensor/xcontainer.hpp ${XTENSOR_INCLUDE_DIR}/xtensor/xcsv.hpp + ${XTENSOR_INCLUDE_DIR}/xtensor/xdevice.hpp ${XTENSOR_INCLUDE_DIR}/xtensor/xdynamic_view.hpp ${XTENSOR_INCLUDE_DIR}/xtensor/xeval.hpp ${XTENSOR_INCLUDE_DIR}/xtensor/xexception.hpp diff --git a/include/xtensor/xassign.hpp b/include/xtensor/xassign.hpp index 2ec698ae6..7c84c7a7c 100644 --- a/include/xtensor/xassign.hpp +++ b/include/xtensor/xassign.hpp @@ -168,6 +168,17 @@ namespace xt static void run_impl(E1& e1, const E2& e2, std::false_type); }; + class device_assigner + { + public: + + template + static void run(E1& e1, const E2& e2) + { + e1.store_device(e2.load_device()); + } + }; + /************************* * strided_loop_assigner * *************************/ @@ -463,7 +474,8 @@ namespace xt // in compilation error for expressions that do not provide a SIMD interface. // simd_assign is true if simd_linear_assign() or simd_linear_assign(de1, de2) // is true. 
- linear_assigner::run(de1, de2); + //linear_assigner::run(de1, de2); + device_assigner::run(de1, de2); } else { diff --git a/include/xtensor/xcontainer.hpp b/include/xtensor/xcontainer.hpp index 46dd0cf1b..921c4b446 100644 --- a/include/xtensor/xcontainer.hpp +++ b/include/xtensor/xcontainer.hpp @@ -27,6 +27,7 @@ #include "xstrides.hpp" #include "xtensor_config.hpp" #include "xtensor_forward.hpp" +#include "xdevice.hpp" namespace xt { @@ -112,6 +113,8 @@ namespace xt using reverse_linear_iterator = typename iterable_base::reverse_linear_iterator; using const_reverse_linear_iterator = typename iterable_base::const_reverse_linear_iterator; + using container_device_return_type_t = host_device_batch; + static_assert(static_layout != layout_type::any, "Container layout can never be layout_type::any!"); size_type size() const noexcept; @@ -187,6 +190,19 @@ namespace xt container_simd_return_type_t /*simd_return_type*/ load_simd(size_type i) const; + template + void store_device(device_batch&& e) + { + //check length matching + e.store_host(storage().data()); + } + + container_device_return_type_t load_device() const + { + auto ptr = data(); + return container_device_return_type_t(ptr, size()); + } + linear_iterator linear_begin() noexcept; linear_iterator linear_end() noexcept; diff --git a/include/xtensor/xdevice.hpp b/include/xtensor/xdevice.hpp new file mode 100644 index 000000000..470e9376c --- /dev/null +++ b/include/xtensor/xdevice.hpp @@ -0,0 +1,86 @@ +#ifndef XTENSOR_DEVICE_HPP +#define XTENSOR_DEVICE_HPP + +#include +#include +#include +#include + +namespace xt{ + namespace detail{ + + } + /** + * Device implementation for the various operations. All device specific code goes in here disabled via macro + * for invalid syntax which might be needed for Sycl or CUDA. 
+ */ +//#ifdef XTENSOR_DEVICE_ASSIGN + template + class host_device_batch + { + public: + host_device_batch(const T* ptr, std::size_t size) + { + //copy the data to the device + //CUDA Impl = Nearly identical + m_data.resize(size); + std::copy(ptr, ptr + size, std::begin(m_data)); + } + template + host_device_batch& operator+(const host_device_batch& rhs) + { + //CUDA impl = thrust::transform(m_data.begin(), m_data.end(), rhs.m_data().begin(), m_data.end(), thrust::plus{}); + std::transform(std::begin(m_data), std::end(m_data), std::begin(rhs.m_data), std::begin(m_data), std::plus{}); + return *this; + } + template + host_device_batch& operator-(const host_device_batch& rhs) + { + std::transform(std::begin(m_data), std::end(m_data), std::begin(rhs.m_data), std::begin(m_data), std::minus{}); + return *this; + } + template + host_device_batch& operator*(const host_device_batch& rhs) + { + std::transform(std::begin(m_data), std::end(m_data), std::begin(rhs.m_data), std::begin(m_data), std::multiplies{}); + return *this; + } + template + host_device_batch& operator/(const host_device_batch& rhs) + { + std::transform(std::begin(m_data), std::end(m_data), std::begin(rhs.m_data), std::begin(m_data), std::divides{}); + return *this; + } + void store_host(T* dst) + { + std::copy(std::begin(m_data), std::end(m_data), dst); + } + private: + //CUDA impl = thrust::device_vector m_data; + std::vector m_data; + }; +//#endif + + // template + // class cuda_device_batch : public batch> + // { + // public: + + // }; + + // template + // class intel_device_batch : public batch> + // { + // public: + + // }; + + // template + // class opencl_device_batch : public batch> + // { + // public: + + // }; +} + +#endif diff --git a/include/xtensor/xfunction.hpp b/include/xtensor/xfunction.hpp index f11362cdb..c9d5362c0 100644 --- a/include/xtensor/xfunction.hpp +++ b/include/xtensor/xfunction.hpp @@ -31,6 +31,7 @@ #include "xstrides.hpp" #include "xtensor_simd.hpp" #include "xutils.hpp" 
+#include "xdevice.hpp" namespace xt { @@ -283,6 +284,7 @@ namespace xt using const_iterator = typename iterable_base::const_iterator; using reverse_iterator = typename iterable_base::reverse_iterator; using const_reverse_iterator = typename iterable_base::const_reverse_iterator; + using device_return_type = host_device_batch; template , self_type>::value>> xfunction(Func&& f, CTA&&... e) noexcept; @@ -361,6 +363,8 @@ namespace xt template ::size> simd_return_type load_simd(size_type i) const; + device_return_type load_device() const; + const tuple_type& arguments() const noexcept; const functor_type& functor() const noexcept; @@ -385,6 +389,9 @@ namespace xt template auto load_simd_impl(std::index_sequence, size_type i) const; + template + inline auto load_device_impl(std::index_sequence) const; + template const_stepper build_stepper(Func&& f, std::index_sequence) const noexcept; @@ -844,6 +851,12 @@ namespace xt return operator()(); } + template + inline auto xfunction::load_device() const -> device_return_type + { + return load_device_impl(std::make_index_sequence()); + } + template template inline auto xfunction::load_simd(size_type i) const -> simd_return_type @@ -912,6 +925,13 @@ namespace xt return m_f.simd_apply((std::get(m_e).template load_simd(i))...); } + template + template + inline auto xfunction::load_device_impl(std::index_sequence) const + { + return m_f.device_apply((std::get(m_e).load_device())...); + } + template template inline auto xfunction::build_stepper(Func&& f, std::index_sequence) const noexcept diff --git a/include/xtensor/xmath.hpp b/include/xtensor/xmath.hpp index d7ab0820a..2deae2bbf 100644 --- a/include/xtensor/xmath.hpp +++ b/include/xtensor/xmath.hpp @@ -81,21 +81,27 @@ namespace xt XTENSOR_INT_SPECIALIZATION_IMPL(FUNC_NAME, RETURN_VAL, unsigned long long); -#define XTENSOR_UNARY_MATH_FUNCTOR(NAME) \ - struct NAME##_fun \ - { \ - template \ - constexpr auto operator()(const T& arg) const \ - { \ - using math::NAME; \ - return 
NAME(arg); \ } \ template \ constexpr auto simd_apply(const B& arg) const \ { \ using math::NAME; \ return NAME(arg); \ } \ +#define XTENSOR_UNARY_MATH_FUNCTOR(NAME) \ + struct NAME##_fun \ + { \ + template \ + constexpr auto operator()(const T& arg) const \ + { \ + using math::NAME; \ + return NAME(arg); \ + } \ + template \ + constexpr auto simd_apply(const B& arg) const \ + { \ + using math::NAME; \ + return NAME(arg); \ + } \ + template \ + constexpr auto device_apply(const B& arg) const \ + { \ + using math::NAME; \ + return NAME(arg); \ + } \ } #define XTENSOR_UNARY_MATH_FUNCTOR_COMPLEX_REDUCING(NAME) \ @@ -113,6 +119,12 @@ namespace xt using math::NAME; \ return NAME(arg); \ } \ + template \ + constexpr auto device_apply(const B& arg) const \ + { \ + using math::NAME; \ + return NAME(arg); \ + } \ } #define XTENSOR_BINARY_MATH_FUNCTOR(NAME) \ @@ -130,6 +142,12 @@ namespace xt using math::NAME; \ return NAME(arg1, arg2); \ } \ + template \ + constexpr auto device_apply(const B& arg1, const B& arg2) const \ + { \ + using math::NAME; \ + return NAME(arg1, arg2); \ + } \ } #define XTENSOR_TERNARY_MATH_FUNCTOR(NAME) \ @@ -147,6 +165,12 @@ namespace xt using math::NAME; \ return NAME(arg1, arg2, arg3); \ } \ + template \ + constexpr auto device_apply(const B& arg1, const B& arg2, const B& arg3) const \ + { \ + using math::NAME; \ + return NAME(arg1, arg2, arg3); \ + } \ } namespace math diff --git a/include/xtensor/xoperation.hpp b/include/xtensor/xoperation.hpp index 44d639130..4010a7e12 100644 --- a/include/xtensor/xoperation.hpp +++ b/include/xtensor/xoperation.hpp @@ -79,6 +79,11 @@ namespace xt { \ return (arg1 OP arg2); \ } \ + template \ + constexpr auto device_apply(B&& arg1, const B& arg2) const \ + { \ + return (arg1 OP arg2); \ + } \ } namespace detail diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 19a03dc4d..38659c3ad 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -183,6 +183,7 @@ set(XTENSOR_TESTS test_xcomplex.cpp 
test_xcsv.cpp test_xdatesupport.cpp + test_xdevice_assign.cpp test_xdynamic_view.cpp test_xfunctor_adaptor.cpp test_xfixed.cpp diff --git a/test/test_xdevice_assign.cpp b/test/test_xdevice_assign.cpp new file mode 100644 index 000000000..38468e29d --- /dev/null +++ b/test/test_xdevice_assign.cpp @@ -0,0 +1,39 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht * + * Copyright (c) QuantStack * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ +// Tests for the experimental device (batch) assignment path declared in xdevice.hpp. +// Hand-written test file; not generated from a *.cppy source. + + +#include + +#include "xtensor/xarray.hpp" +#include "xtensor/xfixed.hpp" +#include "xtensor/xnoalias.hpp" +#include "xtensor/xstrided_view.hpp" +#include "xtensor/xtensor.hpp" +#include "xtensor/xview.hpp" + +#include "test_common_macros.hpp" + +namespace xt +{ + TEST(test_xdevice, basic_xfunction) + { + std::vector expectation = {2,3,4,5,6}; + + xt::xarray a = {1., 2., 3., 4., 5.}; + xt::xarray b = xt::ones_like(a); + auto c = xt::xtensor::from_shape(a.shape()); + c = a + b; + for(size_t i = 0; i < expectation.size(); i++) + { + ASSERT_EQ(c(i), expectation.at(i)); + } + }
}