@@ -26,14 +26,14 @@ Function. We include Clad and we defined the function such as:
 
 
 ``` cuda
-#include <iostream>
 #include "clad/Differentiator/Differentiator.h"
 
+extern "C" int printf(const char*, ...);
 #define N 100
 
 __device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
   double t = 0;
-  for (int i = 0; i< dim; i++)
+  for (int i = 0; i < dim; i++)
     t += (x[i] - p[i]) * (x[i] - p[i]);
   t = -t / (2*sigma*sigma);
   return std::pow(2*M_PI, -dim/2.0) * std::pow(sigma, -0.5) * std::exp(t);
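The body above implements an isotropic dim-dimensional Gaussian, `pow(2*M_PI, -dim/2.0) * pow(sigma, -0.5) * exp(-|x-p|^2 / (2*sigma^2))`. Because it is marked `__device__ __host__`, it can be exercised on the CPU before any GPU plumbing exists. A minimal host-only sketch (the driver below is hypothetical, not part of this commit):

``` cuda
#include <cmath>
#include <cstdio>

// Copy of the tutorial's function; __device__ __host__ lets nvcc emit
// both a CPU and a GPU version of the same body.
__device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
  double t = 0;
  for (int i = 0; i < dim; i++)
    t += (x[i] - p[i]) * (x[i] - p[i]);
  t = -t / (2*sigma*sigma);
  return std::pow(2*M_PI, -dim/2.0) * std::pow(sigma, -0.5) * std::exp(t);
}

int main() {
  double x[] = {1., 2.}, p[] = {0., 0.};
  // Evaluated on the host: no kernel launch, no device memory involved.
  printf("gauss(x, p) = %g\n", gauss(x, p, /*sigma=*/50., /*dim=*/2));
}
```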
@@ -43,28 +43,19 @@ __device__ __host__ double gauss(double* x, double* p, double sigma, int dim) {
 
 ### Definition of Clad gradient
 
-Having our custom function declared, we now forward declare our AD function
-following Clad’s convention, call Clad gradient and set a device function
+Having our custom function declared, we call clad::gradient and set a device function
 pointer to be used for the GPU execution:
 
 
 ``` cuda
-typedef void(*func) (double* x, double* p, double sigma, int dim,
-                     clad::array_ref<double> _d_x, clad::array_ref<double> _d_p,
-                     clad::array_ref<double> _d_sigma,
-                     clad::array_ref<double> _d_dim);
-
-//Body to be generated by Clad
-__device__ __host__ void gauss_grad(double* x, double* p, double sigma, int dim,
-                                    clad::array_ref<double> _d_x,
-                                    clad::array_ref<double> _d_p,
-                                    clad::array_ref<double> _d_sigma,
-                                    clad::array_ref<double> _d_dim);
-
-auto gauss_g = clad::gradient(gauss);
+auto gauss_g = clad::gradient(gauss, "x,p");
 
 //Device function pointer
-__device__ func p_gauss = gauss_grad;
+auto p_gauss = gauss_g.getFunctionPtr();
+
+// using func = void(*)(double* x, double* p, double sigma, int dim,
+//                      clad::array_ref<double> _d_x, clad::array_ref<double> _d_p);
+using func = decltype(p_gauss);
 ```
 
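Since `gauss` also runs on the host, the generated gradient can be sanity-checked on the CPU before the kernel exists. The sketch below is hypothetical and assumes two pieces of the same Clad release: `CladFunction::execute`, which forwards to the generated code, and the `clad::array_ref<double>(ptr, size)` constructor. Analytically, d gauss/d x_i = -(x_i - p_i)/sigma^2 * gauss(x, p, sigma, dim), which gives an independent value to compare against:

``` cuda
#include "clad/Differentiator/Differentiator.h"
#include <cstdio>

// gauss() exactly as defined in the section above.

int main() {
  double x[] = {1., 2.}, p[] = {0., 0.}, sigma = 50.;
  double dx[2] = {}, dp[2] = {};

  auto gauss_g = clad::gradient(gauss, "x,p");  // same call as the tutorial
  gauss_g.execute(x, p, sigma, /*dim=*/2,
                  clad::array_ref<double>(dx, 2),
                  clad::array_ref<double>(dp, 2));

  // Closed form: d gauss / d x_i = -(x_i - p_i) / sigma^2 * gauss(...).
  double g = gauss(x, p, sigma, 2);
  printf("clad: %.12f  analytic: %.12f\n",
         dx[0], -(x[0] - p[0]) / (sigma*sigma) * g);
}
```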
@@ -77,15 +68,11 @@ be defined as:
 
 ``` cuda
 __global__ void compute(func op, double* d_x, double* d_y,
-                        double* d_sigma , int n, double* result_dx,
+                        double sigma, int n, double* result_dx,
                         double* result_dy) {
   int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (i < N) {
-    double result_dim[4] = {};
-    (*op)(&d_x[i],&d_y[i], &d_sigma, 1, &result_dim[0], &result_dim[1],
-          &result_dim[2], &result_dim[3]);
-    result_dx[i] = result_dim[0];
-    result_dy[i] = result_dim[1];
+  if (i < n) {
+    (*op)(&d_x[i], &d_y[i], sigma, /*dim*/1, &result_dx[i], &result_dy[i]);
   }
 }
 ```
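A word on the launch shape used later, `compute<<<N/256+1, 256>>>(...)`: the `if (i < n)` guard is what makes it safe, since the grid is rounded up to whole blocks and the surplus threads must do nothing. For N = 100 this is a single block of 256 threads, of which threads 100-255 fall through the guard. A standalone sketch (hypothetical helper, not from this commit) contrasting it with the usual ceiling-division idiom:

``` cuda
#include <cstdio>

// Ceiling division: the smallest block count with blocks * threads >= n.
int blocksFor(int n, int threadsPerBlock) {
  return (n + threadsPerBlock - 1) / threadsPerBlock;
}

int main() {
  // For n = 100 both give 1 block; n/256+1 launches one extra (empty)
  // block whenever n is an exact multiple of 256, which the guard absorbs.
  printf("n/256+1 = %d, ceil = %d\n", 100/256 + 1, blocksFor(100, 256));
  printf("n/256+1 = %d, ceil = %d\n", 512/256 + 1, blocksFor(512, 256));
}
```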
@@ -111,25 +98,23 @@ to the host.
 
 
 ``` cuda
-int main(void ) {
+int main() {
 
   // x and y point to the host arrays, allocated with malloc in the typical
   // fashion, and the d_x and d_y arrays point to device arrays allocated with
   // the cudaMalloc function from the CUDA runtime API
 
   double *x, *d_x;
   double *y, *d_y;
-  double sigma, *d_sigma ;
+  double sigma = 50.;
   x = (double*)malloc(N*sizeof(double));
   y = (double*)malloc(N*sizeof(double));
-  sigma = (double*)malloc(N*sizeof(double));
 
   // The host code will initialize the host arrays
 
   for (int i = 0; i < N; i++) {
     x[i] = rand()%100;
     y[i] = rand()%100;
-    sigma[i] = rand()%100;
   }
 
   func h_gauss;
@@ -141,8 +126,6 @@ int main(void) {
   cudaMemcpy(d_x, x, N*sizeof(double), cudaMemcpyHostToDevice);
   cudaMalloc(&d_y, N*sizeof(double));
   cudaMemcpy(d_y, y, N*sizeof(double), cudaMemcpyHostToDevice);
-  cudaMalloc(&d_sigma, sizeof(double));
-  cudaMemcpy(d_sigma, &sigma, sizeof(double), cudaMemcpyHostToDevice);
 
   // Similar to the x,y arrays, we employ host and device results arrays so
   // that we can copy the computed values from the device back to the host
@@ -153,16 +136,21 @@ int main(void) {
   cudaMalloc(&dx_result, N*sizeof(double));
   cudaMalloc(&dy_result, N*sizeof(double));
 
-
   cudaMemcpyFromSymbol(&h_gauss, p_gauss, sizeof(func));
 
   // The computation kernel is launched by the statement:
-  compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, d_sigma , N, dx_result, dy_result);
+  compute<<<N/256+1, 256>>>(h_gauss, d_x, d_y, sigma, N, dx_result, dy_result);
   cudaDeviceSynchronize();
 
   // After computation, the results hosted on the device should be copied to the host
   cudaMemcpy(result_x, dx_result, N*sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(result_y, dy_result, N*sizeof(double), cudaMemcpyDeviceToHost);
+
+  printf("sigma=%f\n", sigma);
+  for (int i = 0; i < N; i += 10000) {
+    printf("x[%d]='%f';y[%d]='%f'\n", i, x[i], i, y[i]);
+    printf("grad_x[%d]='%.10f';grad_y[%d]='%.10f'\n", i, result_x[i], i, result_y[i]);
+  }
 }
 ```
 
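The `cudaMemcpyFromSymbol(&h_gauss, p_gauss, sizeof(func))` step deserves a note: a kernel cannot call through a pointer taken on the host, so the address must originate on the device, typically by storing it in a `__device__` variable and copying that value back. A minimal, Clad-free illustration of the idiom (hypothetical names):

``` cuda
#include <cstdio>

using fn = double (*)(double);

__device__ double square(double v) { return v * v; }

// The address of square is taken on the device, where it is meaningful.
__device__ fn p_square = square;

__global__ void apply(fn op, double v, double* out) { *out = (*op)(v); }

int main() {
  fn h_square;
  // Copy the device-side pointer value to the host so it can be passed
  // back into a kernel as an ordinary argument.
  cudaMemcpyFromSymbol(&h_square, p_square, sizeof(fn));

  double *d_out, h_out;
  cudaMalloc(&d_out, sizeof(double));
  apply<<<1, 1>>>(h_square, 3.0, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(d_out);
  printf("square(3) = %f\n", h_out);  // 9.000000
}
```

Separately, the tutorial's `main()` leaves out `cudaFree`/`free` cleanup and CUDA error checks, presumably for brevity.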