@@ -26,14 +26,14 @@ Function. We include Clad and we defined the function such as:
 
 
 ``` cuda
-#include <iostream>
 #include "clad/Differentiator/Differentiator.h"
 
+extern "C" int printf(const char*, ...);
 #define N 100
 
 __device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
   double t = 0;
-  for (int i = 0; i< dim; i++)
+  for (int i = 0; i < dim; i++)
     t += (x[i] - p[i]) * (x[i] - p[i]);
   t = -t / (2*sigma*sigma);
   return std::pow(2*M_PI, -dim/2.0) * std::pow(sigma, -0.5) * std::exp(t);
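The body above implements an isotropic dim-dimensional Gaussian, `pow(2*M_PI, -dim/2.0) * pow(sigma, -0.5) * exp(-|x-p|^2 / (2*sigma^2))`. Because it is marked `__device__ __host__`, it can be exercised on the CPU before any GPU plumbing exists. A minimal host-only sketch (the driver below is hypothetical, not part of this commit):

``` cuda
#include <cmath>
#include <cstdio>

// Copy of the tutorial's function; __device__ __host__ lets nvcc emit
// both a CPU and a GPU version of the same body.
__device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
  double t = 0;
  for (int i = 0; i < dim; i++)
    t += (x[i] - p[i]) * (x[i] - p[i]);
  t = -t / (2*sigma*sigma);
  return std::pow(2*M_PI, -dim/2.0) * std::pow(sigma, -0.5) * std::exp(t);
}

int main() {
  double x[] = {1., 2.}, p[] = {0., 0.};
  // Evaluated on the host: no kernel launch, no device memory involved.
  printf("gauss(x, p) = %g\n", gauss(x, p, /*sigma=*/50., /*dim=*/2));
}
```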
@@ -43,28 +43,19 @@ __device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
 
 ### Definition of Clad gradient
 
-Having our custom function declared, we now forward declare our AD function
-following Clad’s convention, call Clad gradient and set a device function
+Having our custom function declared, we call clad::gradient and set a device function
 pointer to be used for the GPU execution:
 
 
 ``` cuda
-typedef void(*func) (double* x, double* p, double sigma, int dim,
-                     clad::array_ref<double> _d_x, clad::array_ref<double> _d_p,
-                     clad::array_ref<double> _d_sigma,
-                     clad::array_ref<double> _d_dim);
-
-//Body to be generated by Clad
-__device__ __host__ void gauss_grad(double* x, double* p, double sigma, int dim,
-                                    clad::array_ref<double> _d_x,
-                                    clad::array_ref<double> _d_p,
-                                    clad::array_ref<double> _d_sigma,
-                                    clad::array_ref<double> _d_dim);
-
-auto gauss_g = clad::gradient(gauss);
+auto gauss_g = clad::gradient(gauss, "x,p");
 
 //Device function pointer
-__device__ func p_gauss = gauss_grad;
+auto p_gauss = gauss_g.getFunctionPtr();
+
+// using func = void(*)(double* x, double* p, double sigma, int dim,
+//                      clad::array_ref<double> _d_x, clad::array_ref<double> _d_p);
+using func = decltype(p_gauss);
 ```
 
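Since `gauss` also runs on the host, the generated gradient can be sanity-checked on the CPU before the kernel exists. The sketch below is hypothetical and assumes two pieces of the same Clad release: `CladFunction::execute`, which forwards to the generated code, and the `clad::array_ref<double>(ptr, size)` constructor. Analytically, d gauss/d x_i = -(x_i - p_i)/sigma^2 * gauss(x, p, sigma, dim), which gives an independent value to compare against:

``` cuda
#include "clad/Differentiator/Differentiator.h"
#include <cstdio>

// gauss() exactly as defined in the section above.

int main() {
  double x[] = {1., 2.}, p[] = {0., 0.}, sigma = 50.;
  double dx[2] = {}, dp[2] = {};

  auto gauss_g = clad::gradient(gauss, "x,p");  // same call as the tutorial
  gauss_g.execute(x, p, sigma, /*dim=*/2,
                  clad::array_ref<double>(dx, 2),
                  clad::array_ref<double>(dp, 2));

  // Closed form: d gauss / d x_i = -(x_i - p_i) / sigma^2 * gauss(...).
  double g = gauss(x, p, sigma, 2);
  printf("clad: %.12f  analytic: %.12f\n",
         dx[0], -(x[0] - p[0]) / (sigma*sigma) * g);
}
```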
@@ -77,15 +68,11 @@ be defined as:
 
 ``` cuda
 __global__ void compute(func op, double* d_x, double* d_y,
-                        double* d_sigma , int n, double* result_dx,
+                        double sigma, int n, double* result_dx,
                         double* result_dy) {
   int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (i < N) {
-    double result_dim[4] = {};
-    (*op)(&d_x[i],&d_y[i], &d_sigma, 1, &result_dim[0], &result_dim[1],
-          &result_dim[2], &result_dim[3]);
-    result_dx[i] = result_dim[0];
-    result_dy[i] = result_dim[1];
+  if (i < n) {
+    (*op)(&d_x[i], &d_y[i], sigma, /*dim*/1, &result_dx[i], &result_dy[i]);
   }
 }
 ```
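A word on the launch shape used later, `compute<<<N/256+1, 256>>>(...)`: the `if (i < n)` guard is what makes it safe, since the grid is rounded up to whole blocks and the surplus threads must do nothing. For N = 100 this is a single block of 256 threads, of which threads 100-255 fall through the guard. A standalone sketch (hypothetical helper, not from this commit) contrasting it with the usual ceiling-division idiom:

``` cuda
#include <cstdio>

// Ceiling division: the smallest block count with blocks * threads >= n.
int blocksFor(int n, int threadsPerBlock) {
  return (n + threadsPerBlock - 1) / threadsPerBlock;
}

int main() {
  // For n = 100 both give 1 block; n/256+1 launches one extra (empty)
  // block whenever n is an exact multiple of 256, which the guard absorbs.
  printf("n/256+1 = %d, ceil = %d\n", 100/256 + 1, blocksFor(100, 256));
  printf("n/256+1 = %d, ceil = %d\n", 512/256 + 1, blocksFor(512, 256));
}
```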
@@ -111,25 +98,23 @@ to the host.
 
 
 ``` cuda
-int main(void ) {
+int main() {
 
   // x and y point to the host arrays, allocated with malloc in the typical
   // fashion, and the d_x and d_y arrays point to device arrays allocated with
   // the cudaMalloc function from the CUDA runtime API
 
   double *x, *d_x;
   double *y, *d_y;
-  double sigma, *d_sigma ;
+  double sigma = 50.;
   x = (double*)malloc(N*sizeof(double));
   y = (double*)malloc(N*sizeof(double));
-  sigma = (double*)malloc(N*sizeof(double));
 
   // The host code will initialize the host arrays
 
   for (int i = 0; i < N; i++) {
     x[i] = rand()%100;
     y[i] = rand()%100;
-    sigma[i] = rand()%100;
   }
 
   func h_gauss;
@@ -141,8 +126,6 @@ int main(void) {
   cudaMemcpy(d_x, x, N*sizeof(double), cudaMemcpyHostToDevice);
   cudaMalloc(&d_y, N*sizeof(double));
   cudaMemcpy(d_y, y, N*sizeof(double), cudaMemcpyHostToDevice);
-  cudaMalloc(&d_sigma, sizeof(double));
-  cudaMemcpy(d_sigma, &sigma, sizeof(double), cudaMemcpyHostToDevice);
 
   // Similar to the x,y arrays, we employ host and device results arrays so
   // that we can copy the computed values from the device back to the host
@@ -153,16 +136,21 @@ int main(void) {
   cudaMalloc(&dx_result, N*sizeof(double));
   cudaMalloc(&dy_result, N*sizeof(double));
 
-
   cudaMemcpyFromSymbol(&h_gauss, p_gauss, sizeof(func));
 
   // The computation kernel is launched by the statement:
-  compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, d_sigma , N, dx_result, dy_result);
+  compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, sigma, N, dx_result, dy_result);
   cudaDeviceSynchronize();
 
   // After computation, the results hosted on the device should be copied to the host
   cudaMemcpy(result_x, dx_result, N*sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(result_y, dy_result, N*sizeof(double), cudaMemcpyDeviceToHost);
+
+  printf("sigma=%f\n", sigma);
+  for (int i = 0; i < N; i += 10000) {
+    printf("x[%d]='%f';y[%d]='%f'\n", i, x[i], i, y[i]);
+    printf("grad_x[%d]='%.10f';grad_y[%d]='%.10f'\n", i, result_x[i], i, result_y[i]);
+  }
 }
 ```
 
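The `cudaMemcpyFromSymbol(&h_gauss, p_gauss, sizeof(func))` step deserves a note: a kernel cannot call through a pointer taken on the host, so the address must originate on the device, typically by storing it in a `__device__` variable and copying that value back. A minimal, Clad-free illustration of the idiom (hypothetical names):

``` cuda
#include <cstdio>

using fn = double (*)(double);

__device__ double square(double v) { return v * v; }

// The address of square is taken on the device, where it is meaningful.
__device__ fn p_square = square;

__global__ void apply(fn op, double v, double* out) { *out = (*op)(v); }

int main() {
  fn h_square;
  // Copy the device-side pointer value to the host so it can be passed
  // back into a kernel as an ordinary argument.
  cudaMemcpyFromSymbol(&h_square, p_square, sizeof(fn));

  double *d_out, h_out;
  cudaMalloc(&d_out, sizeof(double));
  apply<<<1, 1>>>(h_square, 3.0, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(d_out);
  printf("square(3) = %f\n", h_out);  // 9.000000
}
```

Separately, the tutorial's `main()` leaves out `cudaFree`/`free` cleanup and CUDA error checks, presumably for brevity.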