Skip to content

Commit cd0f075

Browse files
ioanaif authored and vgvassilev committed
Updated Clad-Cuda tutorial
1 parent 3d01f0c commit cd0f075

File tree

1 file changed

+21
-33
lines changed

1 file changed

+21
-33
lines changed

Diff for: _pages/tutorials/2021-08-20-clad_cuda_simple_tutorial.md

+21-33
Original file line number | Diff line number | Diff line change
@@ -26,14 +26,14 @@ Function. We include Clad and we defined the function such as:
2626

2727

2828
```cuda
29-
#include <iostream>
3029
#include "clad/Differentiator/Differentiator.h"
3130
31+
extern "C" int printf(const char*,...);
3232
#define N 100
3333
3434
__device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
3535
double t = 0;
36-
for (int i = 0; i< dim; i++)
36+
for (int i = 0; i < dim; i++)
3737
t += (x[i] - p[i]) * (x[i] - p[i]);
3838
t = -t / (2*sigma*sigma);
3939
return std::pow(2*M_PI, -dim/2.0) * std::pow(sigma, -0.5) * std::exp(t);
@@ -43,28 +43,19 @@ __device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
4343

4444
### Definition of Clad gradient
4545

46-
Having our custom function declared, we now forward declare our AD function
47-
following Clad’s convention, call Clad gradient and set a device function
46+
Having our custom function declared, we call Clad gradient and set a device function
4847
pointer to be used for the GPU execution:
4948

5049

5150
```cuda
52-
typedef void(*func) (double* x, double* p, double sigma, int dim,
53-
clad::array_ref<double> _d_x, clad::array_ref<double> _d_p,
54-
clad::array_ref<double> _d_sigma,
55-
clad::array_ref<double> _d_dim);
56-
57-
//Body to be generated by Clad
58-
__device__ __host__ void gauss_grad(double* x, double* p, double sigma, int dim,
59-
clad::array_ref<double> _d_x,
60-
clad::array_ref<double> _d_p,
61-
clad::array_ref<double> _d_sigma,
62-
clad::array_ref<double> _d_dim);
63-
64-
auto gauss_g = clad::gradient(gauss);
51+
auto gauss_g = clad::gradient(gauss, "x,p");
6552
6653
//Device function pointer
67-
__device__ func p_gauss = gauss_grad;
54+
auto p_gauss = gauss_g.getFunctionPtr();
55+
56+
// using func = void(*)(double* x, double* p, double sigma, int dim,
57+
// clad::array_ref<double> _d_x,clad::array_ref<double> _d_p);
58+
using func = decltype(p_gauss);
6859
```
6960

7061

@@ -77,15 +68,11 @@ be defined as:
7768

7869
```cuda
7970
__global__ void compute(func op, double* d_x, double* d_y,
80-
double* d_sigma, int n, double* result_dx,
71+
double sigma, int n, double* result_dx,
8172
double* result_dy) {
8273
int i = blockIdx.x*blockDim.x + threadIdx.x;
83-
if (i < N) {
84-
double result_dim[4] = {};
85-
(*op)(&d_x[i],&d_y[i], &d_sigma, 1, &result_dim[0], &result_dim[1],
86-
&result_dim[2], &result_dim[3]);
87-
result_dx[i] = result_dim[0];
88-
result_dy[i] = result_dim[1];
74+
if (i < n) {
75+
(*op)(&d_x[i],&d_y[i], sigma, /*dim*/1, &result_dx[i], &result_dy[i], nullptr, nullptr);
8976
}
9077
}
9178
```
@@ -111,25 +98,23 @@ to the host.
11198

11299

113100
```cuda
114-
int main(void) {
101+
int main() {
115102
116103
// x and y point to the host arrays, allocated with malloc in the typical
117104
// fashion, and the d_x and d_y arrays point to device arrays allocated with
118105
// the cudaMalloc function from the CUDA runtime API
119106
120107
double *x, *d_x;
121108
double *y, *d_y;
122-
double sigma, *d_sigma;
109+
double sigma = 50.;
123110
x = (double*)malloc(N*sizeof(double));
124111
y = (double*)malloc(N*sizeof(double));
125-
sigma = (double*)malloc(N*sizeof(double));
126112
127113
// The host code will initialize the host arrays
128114
129115
for (int i = 0; i < N; i++) {
130116
x[i] = rand()%100;
131117
y[i] = rand()%100;
132-
sigma[i] = rand()%100;
133118
}
134119
135120
func h_gauss;
@@ -141,8 +126,6 @@ int main(void) {
141126
cudaMemcpy(d_x, x, N*sizeof(double), cudaMemcpyHostToDevice);
142127
cudaMalloc(&d_y, N*sizeof(double));
143128
cudaMemcpy(d_y, y, N*sizeof(double), cudaMemcpyHostToDevice);
144-
cudaMalloc(&d_sigma, sizeof(double));
145-
cudaMemcpy(d_sigma, &sigma, sizeof(double), cudaMemcpyHostToDevice);
146129
147130
// Similar to the x,y arrays, we employ host and device results array so
148131
// that we can copy the computed values from the device back to the host
@@ -153,16 +136,21 @@ int main(void) {
153136
cudaMalloc(&dx_result, N*sizeof(double));
154137
cudaMalloc(&dy_result, N*sizeof(double));
155138
156-
157139
cudaMemcpyFromSymbol(&h_gauss, p_gauss, sizeof(func));
158140
159141
// The computation kernel is launched by the statement:
160-
compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, d_sigma, N, dx_result, dy_result);
142+
compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, sigma, N, dx_result, dy_result);
161143
cudaDeviceSynchronize();
162144
163145
// After computation, the results hosted on the device should be copied to host
164146
cudaMemcpy(result_x, dx_result, N*sizeof(double), cudaMemcpyDeviceToHost);
165147
cudaMemcpy(result_y, dy_result, N*sizeof(double), cudaMemcpyDeviceToHost);
148+
149+
printf("sigma=%f\n", sigma);
150+
for (int i = 0; i < N; i+=10000) {
151+
printf("x[%d]='%f';y[%d]='%f'\n", i, x[i], i, y[i]);
152+
printf("grad_x[%d]='%.10f';grad_y[%d]='%.10f'\n", i, result_x[i], i, result_y[i]);
153+
}
166154
}
167155
```
168156

0 commit comments

Comments (0)