From eebbcebb4d703f9ec7ba9d272aebd1d475b5bdec Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Wed, 11 Oct 2023 17:43:55 +0200 Subject: [PATCH 01/70] First commit --- src/alge/cs_gradient.cxx | 27 ++- src/alge/cs_gradient_cuda.cu | 327 +++++++++++++++++++++++++++++++++++ src/alge/cs_gradient_priv.h | 13 ++ src/base/cs_base_cuda.cu | 4 +- 4 files changed, 367 insertions(+), 4 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 4275a85135..82a6aad6cf 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6890,6 +6890,13 @@ _lsq_vector_gradient(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb_s = NULL; cs_cocg_6_t *restrict cocg = NULL; + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? true : false; +#else + bool accel = false; +#endif + _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); cs_real_33_t *rhs; @@ -6898,14 +6905,28 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute Right-Hand Side */ /*-------------------------*/ - -# pragma omp parallel for +#if defined(HAVE_CUDA) + cs_lsq_vector_gradient_cuda( + m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + gradv, + rhs); +#else + # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) for (cs_lnum_t j = 0; j < 3; j++) rhs[c_id][i][j] = 0.0; } + /* Contribution from interior faces */ for (int g_id = 0; g_id < n_i_groups; g_id++) { @@ -6959,6 +6980,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ } /* loop on thread groups */ +#endif /* Contribution from extended neighborhood */ @@ -7031,6 +7053,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute gradient */ /*------------------*/ + #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu 
index 0188482838..ddbf8973cd 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -430,6 +430,79 @@ _init_rhsv(cs_lnum_t size, } } + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs(cs_lnum_t size, + cs_real_33_t *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_id < size) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + rhs[c_id][i][j] = 0.0; + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face(cs_lnum_t size, + const cs_lnum_t *i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_lnum_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id >= size){ + return; + } + cs_lnum_t s_id = i_group_index[2*c_id]; + cs_lnum_t e_id = i_group_index[2*c_id + 1]; + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond; + cs_lnum_t c_id1, c_id2; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id1 = i_face_cells[index][0]; + c_id2 = i_face_cells[index][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1; + _weight2 = 1; + } + else{ + _pond = 1; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += _weight2 * fctb[j]; + rhs[c_id2][i][j] += _weight1 * fctb[j]; + } + } + } + + +} + /*---------------------------------------------------------------------------- * Synchronize of copy a cs_real_t type array from the host to a device. * @@ -475,6 +548,129 @@ _sync_or_copy_real_h2d(const cs_real_t *val_h, *buf_d = _buf_d; } + +/*---------------------------------------------------------------------------- + * Synchronize of copy a cs_real_3_t type array from the host to a device. + * + * parameters: + * val_h <-- pointer to host data + * n_vals <-- number of data values + * device_id <-- associated device id + * stream <-- associated stream (for async prefetch only) + * val_d --> matching pointer on device + * buf_d --> matching allocation pointer on device (should be freed + * after use if non-NULL) + *----------------------------------------------------------------------------*/ + +static void +_sync_or_copy_real_3_h2d(const cs_real_3_t *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const cs_real_3_t **val_d, + void **buf_d) +{ + const cs_real_3_t *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(cs_real_3_t); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const cs_real_3_t *)_buf_d; + } + else { + _val_d = (const cs_real_3_t *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + 
+/*---------------------------------------------------------------------------- + * Synchronize of copy a cs_real_33_t type array from the host to a device. + * + * parameters: + * val_h <-- pointer to host data + * n_vals <-- number of data values + * device_id <-- associated device id + * stream <-- associated stream (for async prefetch only) + * val_d --> matching pointer on device + * buf_d --> matching allocation pointer on device (should be freed + * after use if non-NULL) + *----------------------------------------------------------------------------*/ + +static void +_sync_or_copy_real_33_h2d(const cs_real_33_t *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const cs_real_33_t **val_d, + void **buf_d) +{ + const cs_real_33_t *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(cs_real_33_t); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const cs_real_33_t *)_buf_d; + } + else { + _val_d = (const cs_real_33_t *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + +static void +_sync_or_copy_lnum_h2d(const cs_lnum_t *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const cs_lnum_t **val_d, + void **buf_d) +{ + const cs_lnum_t *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(cs_lnum_t); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const cs_lnum_t *)_buf_d; + } + else { + _val_d = (const cs_lnum_t *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, 
device_id, stream); + else + cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*============================================================================= @@ -749,3 +945,134 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, } /*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). + * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream, stream1; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream); + + cs_real_33_t *rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; + const cs_real_33_t *coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + unsigned int blocksize = 256; + unsigned int gridsize_b + = (unsigned int)ceil((double)m->n_b_cells / blocksize); + unsigned int gridsize_bf + = (unsigned int)ceil((double)m->n_b_faces / blocksize); + unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); + unsigned int gridsize_ext + = (unsigned int)ceil((double)n_cells_ext / blocksize); + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const 
cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_lnum_3_t *restrict cell_f_cen + = (const cs_lnum_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + + + // _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, + // &pvar_d, &_pvar_d); + + _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream1, + &coefa_d, &_coefa_d); + _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, + &coefb_d, &_coefb_d); + + _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, + &coefb_d, &_coefb_d); + + // _sync_or_copy_lnum_h2d(cell_cells_idx, n_cells_ext, device_id, stream1, + // &cell_cells_idx_d, &_cell_cells_idx_d); + // CS_CUDA_CHECK(cudaMalloc((void**) &_cell_cells_idx_d, n_cells_ext*sizeof(int))); + // cs_cuda_copy_h2d(_cell_cells_idx_d, cell_cells_idx, n_cells_ext); + // CS_CUDA_CHECK(cudaMemcpy(_cell_cells_idx_d, cell_cells_idx, n_cells_ext*sizeof(int), cudaMemcpyHostToDevice)); + + cudaStreamDestroy(stream1); + cudaStreamSynchronize(0); + + _init_rhs<<>> + (n_cells_ext, rhs_d); + + // _compute_rhs_lsq_v_i_face<<>> + // (n_cells, cell_cells_idx_d, i_face_cells, cell_f_cen, rhs_d, pvar, weight, c_weight); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + /* Sync to host */ + if (rhs_d != NULL) { + size_t size = 
n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(rhs, rhs_d, size); + } + else + cs_sync_d2h(rhs); + +} \ No newline at end of file diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 261c3de9fd..d76445313c 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -109,6 +109,19 @@ cs_gradient_scalar_lsq_cuda(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb, cs_real_3_t *restrict grad); +void +cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs); + #endif /* defined(HAVE_CUDA) */ /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ diff --git a/src/base/cs_base_cuda.cu b/src/base/cs_base_cuda.cu index 266babc33b..8acf641015 100644 --- a/src/base/cs_base_cuda.cu +++ b/src/base/cs_base_cuda.cu @@ -224,7 +224,7 @@ cs_cuda_mem_free(void *p, CS_CUDA_CHECK_CALL(cudaFree(p), file_name, line_num); #if 0 - CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num); + CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num)); #endif } @@ -257,7 +257,7 @@ cs_cuda_mem_free_host(void *p, CS_CUDA_CHECK_CALL(cudaFreeHost(p), file_name, line_num); #if 0 - CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num); + CS_CUDA_CHECK_CALL((cudaDeviceSynchronize(), file_name, line_num)); #endif } From 7bdfe3c8a4979d2196421400e1e2340de7e16f32 Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Fri, 13 Oct 2023 12:32:15 +0200 Subject: [PATCH 02/70] More of kernels --- src/alge/cs_gradient.cxx | 4 +- src/alge/cs_gradient_cuda.cu | 468 ++++++++++++++++++++++++++++++++--- src/alge/cs_gradient_priv.h | 1 + 3 files changed, 435 insertions(+), 38 deletions(-) diff --git a/src/alge/cs_gradient.cxx 
b/src/alge/cs_gradient.cxx index 82a6aad6cf..9e949ac7f5 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6916,6 +6916,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, coefbv, pvar, c_weight, + cocg, gradv, rhs); #else @@ -6926,7 +6927,6 @@ _lsq_vector_gradient(const cs_mesh_t *m, rhs[c_id][i][j] = 0.0; } - /* Contribution from interior faces */ for (int g_id = 0; g_id < n_i_groups; g_id++) { @@ -6980,7 +6980,6 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ } /* loop on thread groups */ -#endif /* Contribution from extended neighborhood */ @@ -7069,6 +7068,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, + rhs[c_id][i][2] * cocg[c_id][2]; } } +#endif /* Compute gradient on boundary cells */ /*------------------------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index ddbf8973cd..094c34d146 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -94,6 +94,23 @@ * Private function definitions *============================================================================*/ +__global__ void isnan_cuda(cs_lnum_t size, + cs_real_33_t *restrict rhs, + bool status) +{ + cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id >= size) + return; + + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++){ + if(isnan(rhs[id][i][j])){ + status = true; + } + } +} + /*---------------------------------------------------------------------------- * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ @@ -440,7 +457,6 @@ _init_rhs(cs_lnum_t size, cs_real_33_t *restrict rhs) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) { for (cs_lnum_t i = 0; i < 3; i++) for (cs_lnum_t j = 0; j < 3; j++) @@ -450,22 +466,22 @@ _init_rhs(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_i_face(cs_lnum_t size, - const cs_lnum_t *i_group_index, - const cs_lnum_2_t 
*i_face_cells, - const cs_lnum_3_t *cell_f_cen, - cs_real_33_t *rhs, - const cs_real_3_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight) + int2 i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if(c_id >= size){ return; } - cs_lnum_t s_id = i_group_index[2*c_id]; - cs_lnum_t e_id = i_group_index[2*c_id + 1]; - cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond; + cs_lnum_t s_id = i_group_index.x; + cs_lnum_t e_id = i_group_index.y; + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; cs_lnum_t c_id1, c_id2; for(cs_lnum_t index = s_id; index < e_id; index++){ @@ -479,11 +495,11 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); if (c_weight == NULL){ - _weight1 = 1; - _weight2 = 1; + _weight1 = 1.; + _weight2 = 1.; } else{ - _pond = 1; + _pond = weight[index]; _denom = 1. / ( _pond *c_weight[c_id1] + (1. 
- _pond)*c_weight[c_id2]); _weight1 = c_weight[c_id1] * _denom; @@ -491,7 +507,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, } for(cs_lnum_t i = 0; i < 3; i++){ - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; rhs[c_id1][i][j] += _weight2 * fctb[j]; @@ -499,7 +515,196 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, } } } +} + +__global__ static void +_compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, + const cs_lnum_t *cell_cells_idx, + const cs_lnum_t *cell_cells_lst, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= size){ + return; + } + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + cs_real_t dc[3], ddc, pfac; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + cs_lnum_t c_id2 = cell_cells_idx[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0] * dc[0] + dc[1] * dc[1] + dc[2] * dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + +} + +__global__ static void +_compute_rhs_lsq_v_b_face(cs_lnum_t size, + int2 b_group_index, + const cs_lnum_t *b_face_cells, + const cs_real_3_t *cell_f_cen, + const cs_real_3_t *b_face_normal, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *b_dist, + const cs_real_33_t *coefbv, + const cs_real_3_t *coefav, + const int inc) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id >= size){ + return; + } + cs_lnum_t s_id = b_group_index.x; + cs_lnum_t e_id = b_group_index.y; + cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + 
for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id1 = b_face_cells[index]; + + /* Normal is vector 0 if the b_face_normal norm is too small */ + norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + + b_face_normal[index][1]*b_face_normal[index][1] + + b_face_normal[index][2]*b_face_normal[index][2]); + + inverse_norm = 1. / norm; + + n_d_dist[0] = inverse_norm * b_face_normal[index][0]; + n_d_dist[1] = inverse_norm * b_face_normal[index][1]; + n_d_dist[2] = inverse_norm * b_face_normal[index][2]; + + d_b_dist = 1. / b_dist[index]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[index][i]*inc + + ( coefbv[index][0][i] * pvar[c_id1][0] + + coefbv[index][1][i] * pvar[c_id1][1] + + coefbv[index][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + rhs[c_id1][i][0] += n_d_dist[0] * pfac; + rhs[c_id1][i][1] += n_d_dist[1] * pfac; + rhs[c_id1][i][2] += n_d_dist[2] * pfac; + } + } +} + +__global__ static void +_compute_gradient_lsq_v(cs_lnum_t size, + cs_real_33_t *gradv, + cs_real_33_t *rhs, + cs_cocg_6_t *cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; + + gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; + + gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t size, + cs_lnum_t n_b_cells, + cs_lnum_t *b_cells, + cs_real_33_t *gradv, + cs_real_33_t *rhs, + cs_cocg_6_t *cocg, + cs_real_3_t *b_face_normal, + cs_lnum_t *cell_b_faces, + cs_lnum_t *cell_b_faces_idx) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; 
+ + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + /* Loop on boundary cells */ + cs_lnum_t c_id1 = b_cells[c_id]; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + + cocgb[0][0] = cocg[c_id][0]; + cocgb[0][1] = cocg[c_id][3]; + cocgb[0][2] = cocg[c_id][5]; + cocgb[1][0] = cocg[c_id][3]; + cocgb[1][1] = cocg[c_id][1]; + cocgb[1][2] = cocg[c_id][4]; + cocgb[2][0] = cocg[c_id][5]; + cocgb[2][1] = cocg[c_id][4]; + cocgb[2][2] = cocg[c_id][2]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + cs_lnum_t f_id; + cs_real_3_t normal; + cs_real_t norm, inverse_norm; + + for (cs_lnum_t index = s_id; index < e_id; index++) { + + f_id = cell_b_faces[index]; + + /* Normal is vector 0 if the b_face_normal norm is too small */ + norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + + b_face_normal[index][1]*b_face_normal[index][1] + + b_face_normal[index][2]*b_face_normal[index][2]); + inverse_norm = 1. 
/ norm; + + normal[0] = inverse_norm * b_face_normal[index][0]; + normal[1] = inverse_norm * b_face_normal[index][1]; + normal[2] = inverse_norm * b_face_normal[index][2]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + + } } @@ -981,6 +1186,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs) { @@ -998,6 +1204,11 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *rhs_d; CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); + cs_cocg_6_t *cocg_d; + CS_CUDA_CHECK(cudaMalloc(&cocg_d, n_cells_ext * sizeof(cs_cocg_6_t))); + + cs_cuda_copy_h2d(cocg_d, cocg, n_cells_ext * sizeof(cs_cocg_6_t)); + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, *_cell_cells_idx_d = NULL; const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; @@ -1023,13 +1234,25 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index_raw = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index_raw = m->b_face_numbering->group_index; + // const cs_lnum_t *restrict i_group_index + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(i_group_index_raw); + // const cs_lnum_t *restrict b_group_index + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(b_group_index_raw); + int2 i_group_index, b_group_index; + i_group_index.x = i_group_index_raw[0]; + i_group_index.y = i_group_index_raw[1]; + + b_group_index.x = b_group_index_raw[0]; + b_group_index.y = b_group_index_raw[1]; const cs_lnum_t *restrict cell_cells = (const cs_lnum_t 
*restrict)cs_get_device_ptr_const_pf(madj->cell_cells); const cs_real_3_t *restrict cell_cen = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); - const cs_lnum_3_t *restrict cell_f_cen - = (const cs_lnum_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); const cs_real_t *restrict weight = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); const cs_real_t *restrict b_dist @@ -1038,41 +1261,214 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); - // _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, - // &pvar_d, &_pvar_d); + _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, + &pvar_d, &_pvar_d); _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream1, &coefa_d, &_coefa_d); _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, &coefb_d, &_coefb_d); - - _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, - &coefb_d, &_coefb_d); - // _sync_or_copy_lnum_h2d(cell_cells_idx, n_cells_ext, device_id, stream1, - // &cell_cells_idx_d, &_cell_cells_idx_d); - // CS_CUDA_CHECK(cudaMalloc((void**) &_cell_cells_idx_d, n_cells_ext*sizeof(int))); - // cs_cuda_copy_h2d(_cell_cells_idx_d, cell_cells_idx, n_cells_ext); - // CS_CUDA_CHECK(cudaMemcpy(_cell_cells_idx_d, cell_cells_idx, n_cells_ext*sizeof(int), cudaMemcpyHostToDevice)); - - cudaStreamDestroy(stream1); + // cudaStreamDestroy(stream1); cudaStreamSynchronize(0); - _init_rhs<<>> + cudaError_t error; + cudaEvent_t start,stop; + float msecTotal = 0.0f; + error = cudaEventCreate(&start); + error = cudaEventCreate(&stop); + + // Record the start event + error = cudaEventRecord(start, NULL); + error = cudaEventSynchronize(start); + + _init_rhs<<>> (n_cells_ext, rhs_d); - // _compute_rhs_lsq_v_i_face<<>> - // (n_cells, 
cell_cells_idx_d, i_face_cells, cell_f_cen, rhs_d, pvar, weight, c_weight); + error = cudaEventRecord(stop, NULL); + error = cudaEventSynchronize(stop); + error = cudaEventElapsedTime(&msecTotal, start, stop); + + bool status = false; + cs_lnum_t count_nan = 0, count_inf = 0; + // isnan_cuda<<>> + // (n_cells_ext, rhs_d, status); + + // if(status) + // printf("Nan found in rhs after init kernel"); + + // printf("Init execution time %f\n", msecTotal); + // printf("n_group: %d n_thread: %d n_group[0]: %d n_group[1]: %d\n", m->b_face_numbering->n_groups, m->b_face_numbering->n_threads, m->b_face_numbering->group_index[0], m->b_face_numbering->group_index[1]); + + // for(cs_lnum_t id = 0; id < n_cells_ext; id++){ + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // if(isnan(rhs[id][i][j])){ + // status = true; + // count_nan++; + // } + // if(isinf(rhs[id][i][j])){ + // status = true; + // count_inf++; + // } + // } + // } + // } + // if(status){ + // printf("%d Nans found in rhs before interieur face kernel\n", count_nan); + // printf("%d Infs found in rhs before interieur face kernel\n", count_inf); + // } + + error = cudaEventCreate(&start); + error = cudaEventCreate(&stop); + + // Record the start event + error = cudaEventRecord(start, NULL); + error = cudaEventSynchronize(start); + + _compute_rhs_lsq_v_i_face<<>> + (n_cells, i_group_index, i_face_cells, cell_f_cen, rhs_d, pvar_d, weight, c_weight); + + error = cudaEventRecord(stop, NULL); + error = cudaEventSynchronize(stop); + msecTotal = 0.0f; + error = cudaEventElapsedTime(&msecTotal, start, stop); + + // isnan_cuda<<>> + // (n_cells_ext, rhs_d, status); + + // count_nan = 0; count_inf = 0; + // for(cs_lnum_t id = 0; id < n_cells_ext; id++){ + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // if(isnan(rhs[id][i][j])){ + // status = true; + // count_nan++; + // } + // if(isinf(rhs[id][i][j])){ + // status = true; + // count_inf++; + // } + // } + // } 
+ // } + // if(status){ + // printf("%d Nans found in rhs after interieur face kernel\n", count_nan); + // printf("%d Infs found in rhs after interieur face kernel\n", count_inf); + // } + + if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ + error = cudaEventCreate(&start); + error = cudaEventCreate(&stop); + + // Record the start event + error = cudaEventRecord(start, NULL); + error = cudaEventSynchronize(start); + _compute_rhs_lsq_v_b_neighbor<<>> + (n_cells, cell_cells_idx, cell_cells_lst, cell_f_cen, rhs_d, pvar_d); + + error = cudaEventRecord(stop, NULL); + error = cudaEventSynchronize(stop); + msecTotal = 0.0f; + error = cudaEventElapsedTime(&msecTotal, start, stop); + } - cudaStreamSynchronize(stream); - cudaStreamDestroy(stream); + error = cudaEventCreate(&start); + error = cudaEventCreate(&stop); - /* Sync to host */ + // Record the start event + error = cudaEventRecord(start, NULL); + error = cudaEventSynchronize(start); + + _compute_rhs_lsq_v_b_face<<>> + (n_cells, b_group_index, b_face_cells, cell_f_cen, b_face_normal, rhs_d, pvar_d, b_dist, coefb_d, coefa_d, inc); + + error = cudaEventRecord(stop, NULL); + error = cudaEventSynchronize(stop); + msecTotal = 0.0f; + error = cudaEventElapsedTime(&msecTotal, start, stop); + + if (rhs_d != NULL) { size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; cs_cuda_copy_d2h(rhs, rhs_d, size); } else cs_sync_d2h(rhs); + + // /* Compute gradient */ + // /*------------------*/ + + // void *_grad_d = NULL; + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); + + // if (cs_check_device_ptr(gradv) == CS_ALLOC_HOST) { + // size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; + // CS_CUDA_CHECK(cudaMalloc(&_grad_d, size)); + // grad_d = (cs_real_33_t *)_grad_d; + // } + // else { + // grad_d = (cs_real_33_t *)cs_get_device_ptr((void *)gradv); + // } + + error = cudaEventCreate(&start); + error = cudaEventCreate(&stop); + + // Record the start event + error = 
cudaEventRecord(start, NULL); + error = cudaEventSynchronize(start); + + _compute_gradient_lsq_v<<>> + (n_cells, grad_d, rhs_d, cocg_d); + + error = cudaEventRecord(stop, NULL); + error = cudaEventSynchronize(stop); + msecTotal = 0.0f; + error = cudaEventElapsedTime(&msecTotal, start, stop); + + cudaStreamSynchronize(stream1); + cudaStreamDestroy(stream1); + + /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + // count_nan = 0; count_inf = 0; + // for(cs_lnum_t id = 0; id < n_cells; id++){ + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // if(isnan(gradv[id][i][j])){ + // status = true; + // count_nan++; + // } + // if(isinf(gradv[id][i][j])){ + // status = true; + // count_inf++; + // } + // } + // } + // } + // if(status){ + // printf("%d Nans found in gradv after boundary face kernel\n", count_nan); + // printf("%d Infs found in gradv after boundary face kernel\n", count_inf); + // } + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(cocg_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); -} \ No newline at end of file +} + +// cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, ) \ No newline at end of file diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index d76445313c..fe61f924ec 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -119,6 +119,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs); From 2de394ecda5a529b975c6fe0ad9ef3e2c9319233 Mon Sep 17 00:00:00 
2001 From: Daouda DIAKITE Date: Wed, 25 Oct 2023 15:11:24 +0200 Subject: [PATCH 03/70] Gradient of face loop --- src/alge/cs_gradient.cxx | 4 +- src/alge/cs_gradient_cuda.cu | 192 +++++++++++++---------------------- 2 files changed, 73 insertions(+), 123 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 9e949ac7f5..99759ffd13 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6980,7 +6980,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ } /* loop on thread groups */ - +#endif /* Contribution from extended neighborhood */ if (halo_type == CS_HALO_EXTENDED) { @@ -7068,7 +7068,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, + rhs[c_id][i][2] * cocg[c_id][2]; } } -#endif +// #endif /* Compute gradient on boundary cells */ /*------------------------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 094c34d146..ac1711f018 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -474,9 +474,9 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, const cs_real_t *weight, const cs_real_t *c_weight) { - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= size){ + if(f_id >= size){ return; } cs_lnum_t s_id = i_group_index.x; @@ -484,35 +484,37 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; cs_lnum_t c_id1, c_id2; - for(cs_lnum_t index = s_id; index < e_id; index++){ - c_id1 = i_face_cells[index][0]; - c_id2 = i_face_cells[index][1]; + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - 
cell_f_cen[c_id1][2]; - ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - - if (c_weight == NULL){ - _weight1 = 1.; - _weight2 = 1.; - } - else{ - _pond = weight[index]; - _denom = 1. / ( _pond *c_weight[c_id1] - + (1. - _pond)*c_weight[c_id2]); - _weight1 = c_weight[c_id1] * _denom; - _weight2 = c_weight[c_id2] * _denom; - } - - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - rhs[c_id1][i][j] += _weight2 * fctb[j]; - rhs[c_id2][i][j] += _weight1 * fctb[j]; - } + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + // rhs[c_id1][i][j] += _weight2 * fctb[j]; + // rhs[c_id2][i][j] += _weight1 * fctb[j]; + _weight2 += _weight2 * fctb[j]; + _weight1 += _weight1 * fctb[j]; + atomicAdd(&rhs[f_id][i][j], _weight2); + atomicAdd(&rhs[c_id2][i][j], _weight2); } } } @@ -571,9 +573,9 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, const cs_real_3_t *coefav, const int inc) { - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= size){ + if(f_id >= size){ return; } cs_lnum_t s_id = b_group_index.x; @@ -581,38 +583,44 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, cs_lnum_t c_id1; cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; - for(cs_lnum_t index = s_id; index < e_id; index++){ - c_id1 = b_face_cells[index]; + c_id1 = b_face_cells[f_id]; - /* Normal is vector 0 if the b_face_normal norm is too small */ - norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] - + 
b_face_normal[index][1]*b_face_normal[index][1] - + b_face_normal[index][2]*b_face_normal[index][2]); + /* Normal is vector 0 if the b_face_normal norm is too small */ + norm = sqrt(b_face_normal[f_id][0]*b_face_normal[f_id][0] + + b_face_normal[f_id][1]*b_face_normal[f_id][1] + + b_face_normal[f_id][2]*b_face_normal[f_id][2]); - inverse_norm = 1. / norm; + inverse_norm = 1. / norm; - n_d_dist[0] = inverse_norm * b_face_normal[index][0]; - n_d_dist[1] = inverse_norm * b_face_normal[index][1]; - n_d_dist[2] = inverse_norm * b_face_normal[index][2]; + n_d_dist[0] = inverse_norm * b_face_normal[f_id][0]; + n_d_dist[1] = inverse_norm * b_face_normal[f_id][1]; + n_d_dist[2] = inverse_norm * b_face_normal[f_id][2]; - d_b_dist = 1. / b_dist[index]; + d_b_dist = 1. / b_dist[f_id]; - /* Normal divided by b_dist */ - n_d_dist[0] *= d_b_dist; - n_d_dist[1] *= d_b_dist; - n_d_dist[2] *= d_b_dist; + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; - for (cs_lnum_t i = 0; i < 3; i++) { - pfac = coefav[index][i]*inc - + ( coefbv[index][0][i] * pvar[c_id1][0] - + coefbv[index][1][i] * pvar[c_id1][1] - + coefbv[index][2][i] * pvar[c_id1][2] - - pvar[c_id1][i]); - - rhs[c_id1][i][0] += n_d_dist[0] * pfac; - rhs[c_id1][i][1] += n_d_dist[1] * pfac; - rhs[c_id1][i][2] += n_d_dist[2] * pfac; - } + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + // rhs[c_id1][i][0] += n_d_dist[0] * pfac; + // rhs[c_id1][i][1] += n_d_dist[1] * pfac; + // rhs[c_id1][i][2] += n_d_dist[2] * pfac; + + n_d_dist[0] *= pfac; + n_d_dist[1] *= pfac; + n_d_dist[2] *= pfac; + + atomicAdd(&rhs[c_id1][i][0], n_d_dist[0]); + atomicAdd(&rhs[c_id1][i][1], n_d_dist[1]); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2]); } } @@ -1193,6 +1201,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const 
cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; int device_id; cudaGetDevice(&device_id); @@ -1291,33 +1300,6 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, bool status = false; cs_lnum_t count_nan = 0, count_inf = 0; - // isnan_cuda<<>> - // (n_cells_ext, rhs_d, status); - - // if(status) - // printf("Nan found in rhs after init kernel"); - - // printf("Init execution time %f\n", msecTotal); - // printf("n_group: %d n_thread: %d n_group[0]: %d n_group[1]: %d\n", m->b_face_numbering->n_groups, m->b_face_numbering->n_threads, m->b_face_numbering->group_index[0], m->b_face_numbering->group_index[1]); - - // for(cs_lnum_t id = 0; id < n_cells_ext; id++){ - // for(cs_lnum_t i = 0; i < 3; i++){ - // for(cs_lnum_t j = 0; j < 3; j++){ - // if(isnan(rhs[id][i][j])){ - // status = true; - // count_nan++; - // } - // if(isinf(rhs[id][i][j])){ - // status = true; - // count_inf++; - // } - // } - // } - // } - // if(status){ - // printf("%d Nans found in rhs before interieur face kernel\n", count_nan); - // printf("%d Infs found in rhs before interieur face kernel\n", count_inf); - // } error = cudaEventCreate(&start); error = cudaEventCreate(&stop); @@ -1327,36 +1309,13 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, error = cudaEventSynchronize(start); _compute_rhs_lsq_v_i_face<<>> - (n_cells, i_group_index, i_face_cells, cell_f_cen, rhs_d, pvar_d, weight, c_weight); + (n_i_faces, i_group_index, i_face_cells, cell_f_cen, rhs_d, pvar_d, weight, c_weight); error = cudaEventRecord(stop, NULL); error = cudaEventSynchronize(stop); msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); - // isnan_cuda<<>> - // (n_cells_ext, rhs_d, status); - - // count_nan = 0; count_inf = 0; - // for(cs_lnum_t id = 0; id < n_cells_ext; id++){ - // for(cs_lnum_t i = 0; i < 3; i++){ - // for(cs_lnum_t j = 0; j < 3; j++){ - // if(isnan(rhs[id][i][j])){ 
- // status = true; - // count_nan++; - // } - // if(isinf(rhs[id][i][j])){ - // status = true; - // count_inf++; - // } - // } - // } - // } - // if(status){ - // printf("%d Nans found in rhs after interieur face kernel\n", count_nan); - // printf("%d Infs found in rhs after interieur face kernel\n", count_inf); - // } - if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ error = cudaEventCreate(&start); error = cudaEventCreate(&stop); @@ -1381,7 +1340,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, error = cudaEventSynchronize(start); _compute_rhs_lsq_v_b_face<<>> - (n_cells, b_group_index, b_face_cells, cell_f_cen, b_face_normal, rhs_d, pvar_d, b_dist, coefb_d, coefa_d, inc); + (m->n_b_faces, b_group_index, b_face_cells, cell_f_cen, b_face_normal, rhs_d, pvar_d, b_dist, coefb_d, coefa_d, inc); error = cudaEventRecord(stop, NULL); error = cudaEventSynchronize(stop); @@ -1403,19 +1362,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *grad_d = NULL; CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); - // if (cs_check_device_ptr(gradv) == CS_ALLOC_HOST) { - // size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; - // CS_CUDA_CHECK(cudaMalloc(&_grad_d, size)); - // grad_d = (cs_real_33_t *)_grad_d; - // } - // else { - // grad_d = (cs_real_33_t *)cs_get_device_ptr((void *)gradv); - // } - error = cudaEventCreate(&start); error = cudaEventCreate(&stop); - // Record the start event + // // Record the start event error = cudaEventRecord(start, NULL); error = cudaEventSynchronize(start); @@ -1430,9 +1380,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream1); cudaStreamDestroy(stream1); - /* Sync to host */ + // /* Sync to host */ if (grad_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; cs_cuda_copy_d2h(gradv, grad_d, size); } else From 86dd6d581f3b351d52c59fae9eb1c2972fa42935 Mon Sep 17 00:00:00 2001 From: Florian 
Lemaitre Date: Wed, 25 Oct 2023 19:16:52 +0200 Subject: [PATCH 04/70] Fix gradient CUDA --- src/alge/cs_gradient.cxx | 23 +++++++++++++++++------ src/alge/cs_gradient_cuda.cu | 26 +++++++++++--------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 99759ffd13..bd61a27d4e 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6899,13 +6899,15 @@ _lsq_vector_gradient(const cs_mesh_t *m, _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); - cs_real_33_t *rhs; + cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); + BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); + BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ -#if defined(HAVE_CUDA) +//#if defined(HAVE_CUDA) cs_lsq_vector_gradient_cuda( m, madj, @@ -6917,9 +6919,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, pvar, c_weight, cocg, - gradv, - rhs); -#else + gradv_cuda, + rhs_cuda); +//#else # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) @@ -6980,7 +6982,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ } /* loop on thread groups */ -#endif +//#endif /* Contribution from extended neighborhood */ if (halo_type == CS_HALO_EXTENDED) { @@ -7066,6 +7068,15 @@ _lsq_vector_gradient(const cs_mesh_t *m, gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; + + for (int j =0; j < 3; ++j) { + auto cpu = gradv[c_id][i][j]; + auto cuda = gradv_cuda[c_id][i][j]; + + if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { + printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + } + } } } // #endif diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index ac1711f018..6b0b0d5ea7 100644 --- 
a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -511,10 +511,8 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, fctb[j] = dc[j] * pfac; // rhs[c_id1][i][j] += _weight2 * fctb[j]; // rhs[c_id2][i][j] += _weight1 * fctb[j]; - _weight2 += _weight2 * fctb[j]; - _weight1 += _weight1 * fctb[j]; - atomicAdd(&rhs[f_id][i][j], _weight2); - atomicAdd(&rhs[c_id2][i][j], _weight2); + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); } } } @@ -605,7 +603,7 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, for (cs_lnum_t i = 0; i < 3; i++) { pfac = coefav[f_id][i]*inc - + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + coefbv[f_id][1][i] * pvar[c_id1][1] + coefbv[f_id][2][i] * pvar[c_id1][2] - pvar[c_id1][i]); @@ -614,13 +612,9 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, // rhs[c_id1][i][1] += n_d_dist[1] * pfac; // rhs[c_id1][i][2] += n_d_dist[2] * pfac; - n_d_dist[0] *= pfac; - n_d_dist[1] *= pfac; - n_d_dist[2] *= pfac; - - atomicAdd(&rhs[c_id1][i][0], n_d_dist[0]); - atomicAdd(&rhs[c_id1][i][1], n_d_dist[1]); - atomicAdd(&rhs[c_id1][i][2], n_d_dist[2]); + atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); } } @@ -1227,6 +1221,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, unsigned int blocksize = 256; unsigned int gridsize_b = (unsigned int)ceil((double)m->n_b_cells / blocksize); + unsigned int gridsize_if + = (unsigned int)ceil((double)m->n_i_faces / blocksize); unsigned int gridsize_bf = (unsigned int)ceil((double)m->n_b_faces / blocksize); unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); @@ -1308,7 +1304,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, error = cudaEventRecord(start, NULL); error = cudaEventSynchronize(start); - _compute_rhs_lsq_v_i_face<<>> + _compute_rhs_lsq_v_i_face<<>> (n_i_faces, i_group_index, i_face_cells, 
cell_f_cen, rhs_d, pvar_d, weight, c_weight); error = cudaEventRecord(stop, NULL); @@ -1339,7 +1335,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, error = cudaEventRecord(start, NULL); error = cudaEventSynchronize(start); - _compute_rhs_lsq_v_b_face<<>> + _compute_rhs_lsq_v_b_face<<>> (m->n_b_faces, b_group_index, b_face_cells, cell_f_cen, b_face_normal, rhs_d, pvar_d, b_dist, coefb_d, coefa_d, inc); error = cudaEventRecord(stop, NULL); @@ -1421,4 +1417,4 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } -// cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, ) \ No newline at end of file +// cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, ) From 6e3524d383150ed31b144c87dc30e948e54b7880 Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Thu, 26 Oct 2023 14:56:58 +0200 Subject: [PATCH 05/70] Kernel and CPU timing --- src/alge/cs_gradient.cxx | 14 +++ src/alge/cs_gradient_cuda.cu | 211 ++++++++++++++++++++++------------- 2 files changed, 148 insertions(+), 77 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index bd61a27d4e..79009f90aa 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -70,6 +70,7 @@ #include "cs_prototypes.h" #include "cs_timer.h" #include "cs_timer_stats.h" +#include /*---------------------------------------------------------------------------- * Header for the current file @@ -6891,6 +6892,11 @@ _lsq_vector_gradient(const cs_mesh_t *m, cs_cocg_6_t *restrict cocgb_s = NULL; cs_cocg_6_t *restrict cocg = NULL; + /* Timing the computation */ + + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + #if defined(HAVE_CUDA) bool accel = (cs_get_device_id() > -1) ? 
true : false; #else @@ -6908,6 +6914,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute Right-Hand Side */ /*-------------------------*/ //#if defined(HAVE_CUDA) + start = std::chrono::high_resolution_clock::now(); cs_lsq_vector_gradient_cuda( m, madj, @@ -6921,7 +6928,11 @@ _lsq_vector_gradient(const cs_mesh_t *m, cocg, gradv_cuda, rhs_cuda); + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + //#else + start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) @@ -7080,6 +7091,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, } } // #endif +stop = std::chrono::high_resolution_clock::now(); +elapsed = std::chrono::duration_cast(stop - start); +printf("Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); /* Compute gradient on boundary cells */ /*------------------------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 6b0b0d5ea7..b39751c1f3 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -37,6 +37,7 @@ #include #include #include +#include #if defined(HAVE_MPI) #include @@ -94,21 +95,18 @@ * Private function definitions *============================================================================*/ -__global__ void isnan_cuda(cs_lnum_t size, - cs_real_33_t *restrict rhs, - bool status) +__device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], + cs_real_t out[3]) { - cs_lnum_t id = blockIdx.x * blockDim.x + threadIdx.x; + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); - if (id >= size) - return; + cs_real_t inverse_norm = 1. 
/ norm; - for (cs_lnum_t i = 0; i < 3; i++) - for (cs_lnum_t j = 0; j < 3; j++){ - if(isnan(rhs[id][i][j])){ - status = true; - } - } + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; } /*---------------------------------------------------------------------------- @@ -466,7 +464,6 @@ _init_rhs(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_i_face(cs_lnum_t size, - int2 i_group_index, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *cell_f_cen, cs_real_33_t *rhs, @@ -479,8 +476,6 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, if(f_id >= size){ return; } - cs_lnum_t s_id = i_group_index.x; - cs_lnum_t e_id = i_group_index.y; cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; cs_lnum_t c_id1, c_id2; @@ -509,8 +504,54 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; - // rhs[c_id1][i][j] += _weight2 * fctb[j]; - // rhs[c_id2][i][j] += _weight1 * fctb[j]; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom 
= 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); } @@ -560,7 +601,6 @@ _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_b_face(cs_lnum_t size, - int2 b_group_index, const cs_lnum_t *b_face_cells, const cs_real_3_t *cell_f_cen, const cs_real_3_t *b_face_normal, @@ -576,23 +616,13 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, if(f_id >= size){ return; } - cs_lnum_t s_id = b_group_index.x; - cs_lnum_t e_id = b_group_index.y; + cs_lnum_t c_id1; cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; c_id1 = b_face_cells[f_id]; - /* Normal is vector 0 if the b_face_normal norm is too small */ - norm = sqrt(b_face_normal[f_id][0]*b_face_normal[f_id][0] - + b_face_normal[f_id][1]*b_face_normal[f_id][1] - + b_face_normal[f_id][2]*b_face_normal[f_id][2]); - - inverse_norm = 1. / norm; - - n_d_dist[0] = inverse_norm * b_face_normal[f_id][0]; - n_d_dist[1] = inverse_norm * b_face_normal[f_id][1]; - n_d_dist[2] = inverse_norm * b_face_normal[f_id][2]; + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. 
/ b_dist[f_id]; @@ -608,10 +638,6 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, + coefbv[f_id][2][i] * pvar[c_id1][2] - pvar[c_id1][i]); - // rhs[c_id1][i][0] += n_d_dist[0] * pfac; - // rhs[c_id1][i][1] += n_d_dist[1] * pfac; - // rhs[c_id1][i][2] += n_d_dist[2] * pfac; - atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); @@ -1197,6 +1223,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_i_faces = m->n_i_faces; + std::chrono::high_resolution_clock::time_point begin, end; + std::chrono::microseconds elapsed; + int device_id; cudaGetDevice(&device_id); @@ -1229,6 +1258,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, unsigned int gridsize_ext = (unsigned int)ceil((double)n_cells_ext / blocksize); + begin = std::chrono::high_resolution_clock::now(); + const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); const cs_lnum_t *restrict b_face_cells @@ -1279,69 +1310,87 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cudaError_t error; cudaEvent_t start,stop; - float msecTotal = 0.0f; + float msec = 0.0f, msecTotal = 0.0f; error = cudaEventCreate(&start); error = cudaEventCreate(&stop); // Record the start event - error = cudaEventRecord(start, NULL); - error = cudaEventSynchronize(start); + error = cudaEventRecord(start, stream1); _init_rhs<<>> - (n_cells_ext, rhs_d); + (n_cells_ext, + rhs_d); - error = cudaEventRecord(stop, NULL); + error = cudaEventRecord(stop, stream1); error = cudaEventSynchronize(stop); - error = cudaEventElapsedTime(&msecTotal, start, stop); + error = cudaEventElapsedTime(&msec, start, stop); + msecTotal += msec; + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); bool status = false; cs_lnum_t count_nan = 0, count_inf = 0; - error = cudaEventCreate(&start); - error = cudaEventCreate(&stop); 
- // Record the start event - error = cudaEventRecord(start, NULL); - error = cudaEventSynchronize(start); + error = cudaEventRecord(start, stream1); _compute_rhs_lsq_v_i_face<<>> - (n_i_faces, i_group_index, i_face_cells, cell_f_cen, rhs_d, pvar_d, weight, c_weight); - - error = cudaEventRecord(stop, NULL); + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); + + error = cudaEventRecord(stop, stream1); error = cudaEventSynchronize(stop); - msecTotal = 0.0f; - error = cudaEventElapsedTime(&msecTotal, start, stop); + msec = 0.0f; + error = cudaEventElapsedTime(&msec, start, stop); + msecTotal += msec; + printf("I_faces = %f\t", msec*1000.f); if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ - error = cudaEventCreate(&start); - error = cudaEventCreate(&stop); // Record the start event - error = cudaEventRecord(start, NULL); - error = cudaEventSynchronize(start); + error = cudaEventRecord(start, stream1); _compute_rhs_lsq_v_b_neighbor<<>> - (n_cells, cell_cells_idx, cell_cells_lst, cell_f_cen, rhs_d, pvar_d); + (n_cells, + cell_cells_idx, + cell_cells_lst, + cell_f_cen, + rhs_d, + pvar_d); - error = cudaEventRecord(stop, NULL); + error = cudaEventRecord(stop, stream1); error = cudaEventSynchronize(stop); - msecTotal = 0.0f; - error = cudaEventElapsedTime(&msecTotal, start, stop); + msec = 0.0f; + error = cudaEventElapsedTime(&msec, start, stop); + msecTotal += msec; + printf("Ex_Neighbor = %f\t", msec*1000.f); } - error = cudaEventCreate(&start); - error = cudaEventCreate(&stop); - // Record the start event - error = cudaEventRecord(start, NULL); - error = cudaEventSynchronize(start); + error = cudaEventRecord(start, stream1); _compute_rhs_lsq_v_b_face<<>> - (m->n_b_faces, b_group_index, b_face_cells, cell_f_cen, b_face_normal, rhs_d, pvar_d, b_dist, coefb_d, coefa_d, inc); - - error = cudaEventRecord(stop, NULL); + (m->n_b_faces, + b_face_cells, + cell_f_cen, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + 
coefa_d, + inc); + + error = cudaEventRecord(stop, stream1); error = cudaEventSynchronize(stop); - msecTotal = 0.0f; - error = cudaEventElapsedTime(&msecTotal, start, stop); + msec = 0.0f; + error = cudaEventElapsedTime(&msec, start, stop); + msecTotal += msec; + printf("B_faces = %f\t", msec*1000.f); if (rhs_d != NULL) { @@ -1358,20 +1407,23 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *grad_d = NULL; CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); - error = cudaEventCreate(&start); - error = cudaEventCreate(&stop); - // // Record the start event - error = cudaEventRecord(start, NULL); - error = cudaEventSynchronize(start); + error = cudaEventRecord(start, stream1); _compute_gradient_lsq_v<<>> - (n_cells, grad_d, rhs_d, cocg_d); + (n_cells, + grad_d, + rhs_d, + cocg_d); - error = cudaEventRecord(stop, NULL); + error = cudaEventRecord(stop, stream1); error = cudaEventSynchronize(stop); - msecTotal = 0.0f; - error = cudaEventElapsedTime(&msecTotal, start, stop); + msec = 0.0f; + error = cudaEventElapsedTime(&msec, start, stop); + msecTotal += msec; + printf("Gradient = %f\t", msec*1000.f); + + printf("Total kernel = %f\t", msecTotal*1000.f); cudaStreamSynchronize(stream1); cudaStreamDestroy(stream1); @@ -1384,6 +1436,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, else cs_sync_d2h(gradv); + end = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(end - begin); + printf("CPU+GPU= %ld\t", elapsed.count()); + // count_nan = 0; count_inf = 0; // for(cs_lnum_t id = 0; id < n_cells; id++){ // for(cs_lnum_t i = 0; i < 3; i++){ @@ -1403,6 +1459,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // printf("%d Nans found in gradv after boundary face kernel\n", count_nan); // printf("%d Infs found in gradv after boundary face kernel\n", count_inf); // } + printf("\n"); if (_pvar_d != NULL) CS_CUDA_CHECK(cudaFree(_pvar_d)); From 5d63419b20adae85f1c83ee104d949009c4e884c Mon Sep 17 00:00:00 2001 From: 
Daouda DIAKITE Date: Fri, 27 Oct 2023 12:31:45 +0200 Subject: [PATCH 06/70] Clean timing --- src/alge/cs_gradient.cxx | 17 ++- src/alge/cs_gradient_cuda.cu | 195 ++++++++++++++--------------------- 2 files changed, 88 insertions(+), 124 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 79009f90aa..e57267ebad 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6913,7 +6913,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute Right-Hand Side */ /*-------------------------*/ -//#if defined(HAVE_CUDA) +#ifdef NDEBUG +#if defined(HAVE_CUDA) +#endif start = std::chrono::high_resolution_clock::now(); cs_lsq_vector_gradient_cuda( m, @@ -6930,8 +6932,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, rhs_cuda); stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); - -//#else +#ifdef NDEBUG +#else +#endif start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { @@ -6993,7 +6996,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ } /* loop on thread groups */ -//#endif + /* Contribution from extended neighborhood */ if (halo_type == CS_HALO_EXTENDED) { @@ -7090,7 +7093,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, } } } -// #endif +#ifdef NDEBUG +#endif +#endif stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); printf("Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); @@ -7163,6 +7168,8 @@ printf("Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_c } BFT_FREE(rhs); + BFT_FREE(rhs_cuda); + BFT_FREE(gradv_cuda); } /*---------------------------------------------------------------------------- diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index b39751c1f3..f14fc14127 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ 
-1223,22 +1223,35 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_i_faces = m->n_i_faces; - std::chrono::high_resolution_clock::time_point begin, end; - std::chrono::microseconds elapsed; - int device_id; cudaGetDevice(&device_id); - cudaStream_t stream, stream1; - cudaStreamCreate(&stream1); + cudaStream_t stream; cudaStreamCreate(&stream); + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + cs_real_33_t *rhs_d; CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); cs_cocg_6_t *cocg_d; CS_CUDA_CHECK(cudaMalloc(&cocg_d, n_cells_ext * sizeof(cs_cocg_6_t))); + cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); + cs_cuda_copy_h2d(cocg_d, cocg, n_cells_ext * sizeof(cs_cocg_6_t)); void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, @@ -1258,8 +1271,6 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, unsigned int gridsize_ext = (unsigned int)ceil((double)n_cells_ext / blocksize); - begin = std::chrono::high_resolution_clock::now(); - const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); const cs_lnum_t *restrict b_face_cells @@ -1270,18 +1281,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; - const cs_lnum_t 
*restrict i_group_index_raw = m->i_face_numbering->group_index; - const cs_lnum_t *restrict b_group_index_raw = m->b_face_numbering->group_index; - // const cs_lnum_t *restrict i_group_index - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(i_group_index_raw); - // const cs_lnum_t *restrict b_group_index - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(b_group_index_raw); - int2 i_group_index, b_group_index; - i_group_index.x = i_group_index_raw[0]; - i_group_index.y = i_group_index_raw[1]; - - b_group_index.x = b_group_index_raw[0]; - b_group_index.y = b_group_index_raw[1]; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; const cs_lnum_t *restrict cell_cells = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); @@ -1297,44 +1298,26 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); - _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, + _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream1, + _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, + _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); - // cudaStreamDestroy(stream1); - cudaStreamSynchronize(0); - - cudaError_t error; - cudaEvent_t start,stop; - float msec = 0.0f, msecTotal = 0.0f; - error = cudaEventCreate(&start); - error = cudaEventCreate(&stop); - - // Record the start event - error = cudaEventRecord(start, stream1); + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - _init_rhs<<>> + _init_rhs<<>> (n_cells_ext, rhs_d); - error = cudaEventRecord(stop, stream1); - error = cudaEventSynchronize(stop); - error = cudaEventElapsedTime(&msec, 
start, stop); - msecTotal += msec; - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); - + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + bool status = false; cs_lnum_t count_nan = 0, count_inf = 0; - - // Record the start event - error = cudaEventRecord(start, stream1); - _compute_rhs_lsq_v_i_face<<>> + _compute_rhs_lsq_v_i_face<<>> (n_i_faces, i_face_cells, cell_f_cen, @@ -1343,37 +1326,21 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, weight, c_weight); - error = cudaEventRecord(stop, stream1); - error = cudaEventSynchronize(stop); - msec = 0.0f; - error = cudaEventElapsedTime(&msec, start, stop); - msecTotal += msec; - printf("I_faces = %f\t", msec*1000.f); + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ - // Record the start event - error = cudaEventRecord(start, stream1); - _compute_rhs_lsq_v_b_neighbor<<>> + _compute_rhs_lsq_v_b_neighbor<<>> (n_cells, cell_cells_idx, cell_cells_lst, cell_f_cen, rhs_d, pvar_d); - - error = cudaEventRecord(stop, stream1); - error = cudaEventSynchronize(stop); - msec = 0.0f; - error = cudaEventElapsedTime(&msec, start, stop); - msecTotal += msec; - printf("Ex_Neighbor = %f\t", msec*1000.f); } + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - // Record the start event - error = cudaEventRecord(start, stream1); - - _compute_rhs_lsq_v_b_face<<>> + _compute_rhs_lsq_v_b_face<<>> (m->n_b_faces, b_face_cells, cell_f_cen, @@ -1385,48 +1352,26 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, coefa_d, inc); - error = cudaEventRecord(stop, stream1); - error = cudaEventSynchronize(stop); - msec = 0.0f; - error = cudaEventElapsedTime(&msec, start, stop); - msecTotal += msec; - printf("B_faces = %f\t", msec*1000.f); + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - if (rhs_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(rhs, rhs_d, size); - } - else - cs_sync_d2h(rhs); + // if (rhs_d != NULL) { + 
// size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(rhs, rhs_d, size); + // } + // else + // cs_sync_d2h(rhs); // /* Compute gradient */ // /*------------------*/ - // void *_grad_d = NULL; - cs_real_33_t *grad_d = NULL; - CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); - - // // Record the start event - error = cudaEventRecord(start, stream1); - - _compute_gradient_lsq_v<<>> + _compute_gradient_lsq_v<<>> (n_cells, grad_d, rhs_d, cocg_d); - error = cudaEventRecord(stop, stream1); - error = cudaEventSynchronize(stop); - msec = 0.0f; - error = cudaEventElapsedTime(&msec, start, stop); - msecTotal += msec; - printf("Gradient = %f\t", msec*1000.f); - - printf("Total kernel = %f\t", msecTotal*1000.f); - - cudaStreamSynchronize(stream1); - cudaStreamDestroy(stream1); + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); // /* Sync to host */ if (grad_d != NULL) { @@ -1436,29 +1381,41 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, else cs_sync_d2h(gradv); - end = std::chrono::high_resolution_clock::now(); - elapsed = std::chrono::duration_cast(end - begin); - printf("CPU+GPU= %ld\t", elapsed.count()); - - // count_nan = 0; count_inf = 0; - // for(cs_lnum_t id = 0; id < n_cells; id++){ - // for(cs_lnum_t i = 0; i < 3; i++){ - // for(cs_lnum_t j = 0; j < 3; j++){ - // if(isnan(gradv[id][i][j])){ - // status = true; - // count_nan++; - // } - // if(isinf(gradv[id][i][j])){ - // status = true; - // count_inf++; - // } - // } - // } - // } - // if(status){ - // printf("%d Nans found in gradv after boundary face kernel\n", count_nan); - // printf("%d Infs found in gradv after boundary face kernel\n", count_inf); - // } + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", 
msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + printf("\n"); if (_pvar_d != NULL) From ba743476cd43ea82ffc35cedcce834368d3b5d26 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Mon, 30 Oct 2023 11:17:44 +0100 Subject: [PATCH 07/70] New kernels versions --- src/alge/cs_gradient_cuda.cu | 418 +++++++++++++++++++++-------------- 1 file changed, 252 insertions(+), 166 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index f14fc14127..f353cc8912 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -462,6 +462,80 @@ _init_rhs(cs_lnum_t size, } } +__global__ static void +_init_rhs_v2(cs_lnum_t size, + cs_real_t *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < size) + return; + + rhs[c_id] = 0.0; +} + +__global__ static void +_init_rhs_v3(cs_lnum_t size, + double3 *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < size) + return; + + rhs[c_id] = make_double3(0.0, 0.0, 0.0); +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t f_id = blockIdx.x * 
blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], c_weight[c_id2] * _denom * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], c_weight[c_id1] * _denom * fctb[j]); + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], fctb[j]); + atomicAdd(&rhs[c_id2][i][j], fctb[j]); + } + } + } +} + __global__ static void _compute_rhs_lsq_v_i_face(cs_lnum_t size, const cs_lnum_2_t *i_face_cells, @@ -512,10 +586,10 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, - const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *cell_f_cen, - cs_real_33_t *rhs, - const cs_real_3_t *pvar, + const cs_lnum_t *i_face_cells, + const cs_real_t *cell_f_cen, + cs_real_t *rhs, + const cs_real_t *pvar, const cs_real_t *weight, const cs_real_t *c_weight) { @@ -527,12 +601,12 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; cs_lnum_t c_id1, c_id2; - c_id1 = i_face_cells[f_id][0]; - c_id2 = i_face_cells[f_id][1]; + c_id1 = i_face_cells[f_id]; + c_id2 = i_face_cells[f_id + 1]; - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = 
cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); @@ -549,15 +623,16 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, } for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + pfac = (pvar[c_id2 + i] - pvar[c_id1 + i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; - atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); - atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); } } } + __global__ static void _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, const cs_lnum_t *cell_cells_idx, @@ -640,7 +715,52 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); - atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, + const cs_lnum_t *b_face_cells, + const cs_real_3_t *cell_f_cen, + const cs_real_3_t *b_face_normal, + cs_real_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *b_dist, + const cs_real_33_t *coefbv, + const cs_real_3_t *coefav, + const int inc) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + + cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. 
/ b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1*3*3 + i*3], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1*3*3 + i*3 + 2], n_d_dist[2] * pfac); } } @@ -669,6 +789,31 @@ _compute_gradient_lsq_v(cs_lnum_t size, } } +__global__ static void +_compute_gradient_lsq_v_v2(cs_lnum_t size, + cs_real_t *gradv, + cs_real_t *rhs, + cs_cocg_6_t *cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id + i*3] * cocg[c_id][0] + + rhs[c_id + i*3 + 1] * cocg[c_id][3] + + rhs[c_id + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id + i*3] * cocg[c_id][3] + + rhs[c_id + i*3 + 1] * cocg[c_id][1] + + rhs[c_id + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id + i*3] * cocg[c_id][5] + + rhs[c_id + i*3 + 1] * cocg[c_id][4] + + rhs[c_id + i*3 + 2] * cocg[c_id][2]; + } +} + __global__ static void _compute_gradient_lsq_b_v(cs_lnum_t size, cs_lnum_t n_b_cells, @@ -737,7 +882,7 @@ _compute_gradient_lsq_b_v(cs_lnum_t size, } /*---------------------------------------------------------------------------- - * Synchronize of copy a cs_real_t type array from the host to a device. + * Synchronize of copy a T type array from the host to a device. 
* * parameters: * val_h <-- pointer to host data @@ -749,150 +894,28 @@ _compute_gradient_lsq_b_v(cs_lnum_t size, * after use if non-NULL) *----------------------------------------------------------------------------*/ +template static void -_sync_or_copy_real_h2d(const cs_real_t *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const cs_real_t **val_d, - void **buf_d) -{ - const cs_real_t *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_real_t); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_real_t *)_buf_d; - } - else { - _val_d = (const cs_real_t *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - - -/*---------------------------------------------------------------------------- - * Synchronize of copy a cs_real_3_t type array from the host to a device. 
- * - * parameters: - * val_h <-- pointer to host data - * n_vals <-- number of data values - * device_id <-- associated device id - * stream <-- associated stream (for async prefetch only) - * val_d --> matching pointer on device - * buf_d --> matching allocation pointer on device (should be freed - * after use if non-NULL) - *----------------------------------------------------------------------------*/ - -static void -_sync_or_copy_real_3_h2d(const cs_real_3_t *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const cs_real_3_t **val_d, - void **buf_d) -{ - const cs_real_3_t *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_real_3_t); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_real_3_t *)_buf_d; - } - else { - _val_d = (const cs_real_3_t *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - -/*---------------------------------------------------------------------------- - * Synchronize of copy a cs_real_33_t type array from the host to a device. 
- * - * parameters: - * val_h <-- pointer to host data - * n_vals <-- number of data values - * device_id <-- associated device id - * stream <-- associated stream (for async prefetch only) - * val_d --> matching pointer on device - * buf_d --> matching allocation pointer on device (should be freed - * after use if non-NULL) - *----------------------------------------------------------------------------*/ - -static void -_sync_or_copy_real_33_h2d(const cs_real_33_t *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const cs_real_33_t **val_d, - void **buf_d) -{ - const cs_real_33_t *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_real_33_t); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_real_33_t *)_buf_d; - } - else { - _val_d = (const cs_real_33_t *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - -static void -_sync_or_copy_lnum_h2d(const cs_lnum_t *val_h, +_sync_or_copy_real_h2d(const T *val_h, cs_lnum_t n_vals, int device_id, cudaStream_t stream, - const cs_lnum_t **val_d, + const T **val_d, void **buf_d) { - const cs_lnum_t *_val_d = NULL; + const T *_val_d = NULL; void *_buf_d = NULL; cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(cs_lnum_t); + size_t size = n_vals * sizeof(T); if (alloc_mode == CS_ALLOC_HOST) { CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const cs_lnum_t *)_buf_d; + _val_d = (const T *)_buf_d; } else { - _val_d = (const cs_lnum_t *)cs_get_device_ptr((void *)val_h); + _val_d = (const T *)cs_get_device_ptr((void *)val_h); if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) 
cudaMemPrefetchAsync(val_h, size, device_id, stream); @@ -1245,6 +1268,13 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *rhs_d; CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); + cs_real_33_t *rhs_d_v0; + CS_CUDA_CHECK(cudaMalloc(&rhs_d_v0, n_cells_ext * sizeof(cs_real_33_t))); + cs_real_t *rhs_test_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_test_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_real_t *gradv_test_d; + CS_CUDA_CHECK(cudaMalloc(&gradv_test_d, n_cells_ext * sizeof(cs_real_33_t))); cs_cocg_6_t *cocg_d; CS_CUDA_CHECK(cudaMalloc(&cocg_d, n_cells_ext * sizeof(cs_cocg_6_t))); @@ -1260,6 +1290,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *coefb_d = NULL; const cs_lnum_t *cell_cells_idx_d = NULL; + const cs_real_t *pvar_d_1d = NULL; + CS_CUDA_CHECK(cudaMalloc(&pvar_d_1d, n_cells * sizeof(cs_real_33_t))); + unsigned int blocksize = 256; unsigned int gridsize_b = (unsigned int)ceil((double)m->n_b_cells / blocksize); @@ -1270,6 +1303,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); unsigned int gridsize_ext = (unsigned int)ceil((double)n_cells_ext / blocksize); + unsigned int gridsize_ext_1d + = (unsigned int)ceil((double)(n_cells_ext*3*3) / blocksize); const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); @@ -1297,32 +1332,63 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict b_face_normal = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_t *restrict cell_f_cen_1d + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_lnum_t *restrict i_face_cells_1d + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); - _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream, + + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, 
&pvar_d, &_pvar_d); - _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - _init_rhs<<>> - (n_cells_ext, - rhs_d); + // _init_rhs<<>> + // (n_cells_ext, + // rhs_d); + + _init_rhs_v2<<>> + (n_cells_ext*3*3, + rhs_test_d); + + // _init_rhs_v3<<>> + // (n_cells_ext*3, + // rhs_test_d); CS_CUDA_CHECK(cudaEventRecord(init, stream)); bool status = false; cs_lnum_t count_nan = 0, count_inf = 0; - _compute_rhs_lsq_v_i_face<<>> + // _compute_rhs_lsq_v_i_face_v0<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d_v0, + // pvar_d, + // weight, + // c_weight); + + // _compute_rhs_lsq_v_i_face<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_v2<<>> (n_i_faces, - i_face_cells, - cell_f_cen, - rhs_d, - pvar_d, + i_face_cells_1d, + cell_f_cen_1d, + rhs_test_d, + pvar_d_1d, weight, c_weight); @@ -1340,12 +1406,24 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - _compute_rhs_lsq_v_b_face<<>> + // _compute_rhs_lsq_v_b_face<<>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + _compute_rhs_lsq_v_b_face_v2<<>> (m->n_b_faces, b_face_cells, cell_f_cen, b_face_normal, - rhs_d, + rhs_test_d, pvar_d, b_dist, coefb_d, @@ -1355,20 +1433,26 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - // if (rhs_d != NULL) { - // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - // cs_cuda_copy_d2h(rhs, rhs_d, size); - // } - // else - // cs_sync_d2h(rhs); + if (rhs_d != NULL) { + size_t size 
= n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(rhs, rhs_test_d, size); + } + else + cs_sync_d2h(rhs); // /* Compute gradient */ // /*------------------*/ - _compute_gradient_lsq_v<<>> + // _compute_gradient_lsq_v<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg_d); + + _compute_gradient_lsq_v_v2<<>> (n_cells, - grad_d, - rhs_d, + gradv_test_d, + rhs_test_d, cocg_d); CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); @@ -1376,7 +1460,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // /* Sync to host */ if (grad_d != NULL) { size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(gradv, grad_d, size); + cs_cuda_copy_d2h(gradv, gradv_test_d, size); } else cs_sync_d2h(gradv); @@ -1426,6 +1510,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaFree(_coefb_d)); CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(rhs_d_v0)); + CS_CUDA_CHECK(cudaFree(rhs_test_d)); CS_CUDA_CHECK(cudaFree(cocg_d)); CS_CUDA_CHECK(cudaFree(grad_d)); From 184b85732a492f846ac2ff5b15da26bd877e9323 Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Mon, 30 Oct 2023 13:32:25 +0100 Subject: [PATCH 08/70] Fix issues --- src/alge/cs_gradient_cuda.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index f353cc8912..0594cff16f 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -467,7 +467,7 @@ _init_rhs_v2(cs_lnum_t size, cs_real_t *restrict rhs) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) + if (c_id >= size) return; rhs[c_id] = 0.0; @@ -478,7 +478,7 @@ _init_rhs_v3(cs_lnum_t size, double3 *restrict rhs) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) + if (c_id >= size) return; rhs[c_id] = make_double3(0.0, 0.0, 0.0); @@ -601,8 +601,8 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; 
cs_lnum_t c_id1, c_id2; - c_id1 = i_face_cells[f_id]; - c_id2 = i_face_cells[f_id + 1]; + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; @@ -623,7 +623,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, } for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2 + i] - pvar[c_id1 + i]) * ddc; + pfac = (pvar[c_id2*3 + i] - pvar[c_id1*3 + i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); @@ -1290,8 +1290,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *coefb_d = NULL; const cs_lnum_t *cell_cells_idx_d = NULL; - const cs_real_t *pvar_d_1d = NULL; - CS_CUDA_CHECK(cudaMalloc(&pvar_d_1d, n_cells * sizeof(cs_real_33_t))); + cs_real_t *pvar_d_1d; + CS_CUDA_CHECK(cudaMalloc(&pvar_d_1d, n_cells * sizeof(cs_real_3_t))); + cs_cuda_copy_h2d(pvar_d_1d, pvar, n_cells * sizeof(cs_real_3_t)); unsigned int blocksize = 256; unsigned int gridsize_b @@ -1433,7 +1434,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - if (rhs_d != NULL) { + if (rhs_test_d != NULL) { size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; cs_cuda_copy_d2h(rhs, rhs_test_d, size); } @@ -1458,7 +1459,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); // /* Sync to host */ - if (grad_d != NULL) { + if (gradv_test_d != NULL) { size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; cs_cuda_copy_d2h(gradv, gradv_test_d, size); } From bd6b8054201eb11b87127142510703b68b251382 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 31 Oct 2023 16:15:54 +0100 Subject: [PATCH 09/70] Opimization in progress --- src/alge/cs_gradient_cuda.cu | 352 ++++++++++++++++++++++++++--------- 1 file changed, 260 insertions(+), 92 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu 
b/src/alge/cs_gradient_cuda.cu index 0594cff16f..c949dbcb37 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -462,6 +462,21 @@ _init_rhs(cs_lnum_t size, } } +__global__ static void +_test(cs_lnum_t size, + const cs_real_3_t *restrict pvar, + cs_real_t *restrict pvar_1d) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < size) { + for (cs_lnum_t i = 0; i < 3; i++){ + if(pvar[c_id][i] != pvar_1d[c_id*3 + i]){ + printf("\tNot equal"); + } + } + } +} + __global__ static void _init_rhs_v2(cs_lnum_t size, cs_real_t *restrict rhs) @@ -538,12 +553,12 @@ _compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_i_face(cs_lnum_t size, - const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *cell_f_cen, - cs_real_33_t *rhs, - const cs_real_3_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight) + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -586,12 +601,12 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, - const cs_lnum_t *i_face_cells, - const cs_real_t *cell_f_cen, - cs_real_t *rhs, - const cs_real_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight) + const cs_lnum_t *restrict i_face_cells, + const cs_real_t *restrict cell_f_cen, + cs_real_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -623,7 +638,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, } for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2*3 + i] - pvar[c_id1*3 + i]) * ddc; + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = 
dc[j] * pfac; atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); @@ -635,11 +650,11 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, - const cs_lnum_t *cell_cells_idx, - const cs_lnum_t *cell_cells_lst, - const cs_real_3_t *cell_f_cen, - cs_real_33_t *rhs, - const cs_real_3_t *pvar) + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells_lst, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar) { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; @@ -676,15 +691,15 @@ _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_b_face(cs_lnum_t size, - const cs_lnum_t *b_face_cells, - const cs_real_3_t *cell_f_cen, - const cs_real_3_t *b_face_normal, - cs_real_33_t *rhs, - const cs_real_3_t *pvar, - const cs_real_t *b_dist, - const cs_real_33_t *coefbv, - const cs_real_3_t *coefav, - const int inc) + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -721,15 +736,15 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, - const cs_lnum_t *b_face_cells, - const cs_real_3_t *cell_f_cen, - const cs_real_3_t *b_face_normal, - cs_real_t *rhs, - const cs_real_3_t *pvar, - const cs_real_t *b_dist, - const cs_real_33_t *coefbv, - const cs_real_3_t *coefav, - const int inc) + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + 
const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -766,9 +781,9 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, __global__ static void _compute_gradient_lsq_v(cs_lnum_t size, - cs_real_33_t *gradv, - cs_real_33_t *rhs, - cs_cocg_6_t *cocg) + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if (c_id >= size) @@ -791,39 +806,174 @@ _compute_gradient_lsq_v(cs_lnum_t size, __global__ static void _compute_gradient_lsq_v_v2(cs_lnum_t size, - cs_real_t *gradv, - cs_real_t *rhs, - cs_cocg_6_t *cocg) + cs_real_t *restrict gradv, + cs_real_t *restrict rhs, + cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if (c_id >= size) return; for(cs_lnum_t i = 0; i < 3; i++){ - gradv[c_id*3*3 + i*3] = rhs[c_id + i*3] * cocg[c_id][0] - + rhs[c_id + i*3 + 1] * cocg[c_id][3] - + rhs[c_id + i*3 + 2] * cocg[c_id][5]; + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; - gradv[c_id*3*3 + i*3 + 1] = rhs[c_id + i*3] * cocg[c_id][3] - + rhs[c_id + i*3 + 1] * cocg[c_id][1] - + rhs[c_id + i*3 + 2] * cocg[c_id][4]; + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; - gradv[c_id*3*3 + i*3 + 2] = rhs[c_id + i*3] * cocg[c_id][5] - + rhs[c_id + i*3 + 1] * cocg[c_id][4] - + rhs[c_id + i*3 + 2] * cocg[c_id][2]; + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_v_v3(cs_lnum_t size, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + 
threadIdx.x; + if (c_id >= size) + return; + auto& gradc = gradv[c_id]; + auto& rhsc = rhs[c_id]; + auto cocgc = cocg[c_id]; + for(cs_lnum_t i = 0; i < 3; i++){ + auto& gradci = gradc[i]; + auto rhsci = rhsc[i]; + gradci[0] = rhsci[0] * cocgc[0] + + rhsci[1] * cocgc[3] + + rhsci[2] * cocgc[5]; + + gradci[1] = rhsci[0] * cocgc[3] + + rhsci[1] * cocgc[1] + + rhsci[2] * cocgc[4]; + + gradci[2] = rhsci[0] * cocgc[5] + + rhsci[1] * cocgc[4] + + rhsci[2] * cocgc[2]; } } +__global__ static void +_compute_gradient_lsq_v_v4(cs_lnum_t size, + cs_real_33_t *restrict gradv_m, + cs_real_33_t *restrict rhs_m, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + cs_real_t *rhs = (cs_real_t *) rhs_m; + cs_real_t *gradv = (cs_real_t *) gradv_m; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; + + gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; + + gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] + + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] + + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; + } +} + +__global__ static void +_compute_gradient_lsq_v_v5(cs_lnum_t size, + cs_real_t *restrict gradv, + cs_real_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + 
gradv[c_id] = rhs[c_id1*3*3 + i*3] * _cocg[0] + + rhs[c_id1*3*3 + i*3 + 1] * _cocg[1] + + rhs[c_id1*3*3 + i*3 + 2] * _cocg[2]; + +} + +__global__ static void +_compute_gradient_lsq_v_v6(cs_lnum_t size, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + size_t c_id1 = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + auto cocg_temp = cocg[c_id1]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id1][i][j] = rhs[c_id1][i][0] * _cocg[0] + + rhs[c_id1][i][1] * _cocg[1] + + rhs[c_id1][i][2] * _cocg[2]; + +} + __global__ static void _compute_gradient_lsq_b_v(cs_lnum_t size, cs_lnum_t n_b_cells, - cs_lnum_t *b_cells, - cs_real_33_t *gradv, - cs_real_33_t *rhs, - cs_cocg_6_t *cocg, - cs_real_3_t *b_face_normal, - cs_lnum_t *cell_b_faces, - cs_lnum_t *cell_b_faces_idx) + cs_lnum_t *restrict b_cells, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg, + cs_real_3_t *restrict b_face_normal, + cs_lnum_t *restrict cell_b_faces, + cs_lnum_t *restrict cell_b_faces_idx) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -1246,6 +1396,11 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_i_faces = m->n_i_faces; + cs_real_t *pvar_copy; + pvar_copy = (cs_real_t *) malloc(n_cells * sizeof(cs_real_3_t)); + + memcpy(pvar_copy, pvar, n_cells*sizeof(cs_real_3_t)); + int device_id; cudaGetDevice(&device_id); @@ -1292,7 +1447,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_t *pvar_d_1d; CS_CUDA_CHECK(cudaMalloc(&pvar_d_1d, n_cells * sizeof(cs_real_3_t))); - 
cs_cuda_copy_h2d(pvar_d_1d, pvar, n_cells * sizeof(cs_real_3_t)); + cs_cuda_copy_h2d(pvar_d_1d, pvar_copy, n_cells * sizeof(cs_real_3_t)); unsigned int blocksize = 256; unsigned int gridsize_b @@ -1349,9 +1504,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - // _init_rhs<<>> - // (n_cells_ext, - // rhs_d); + _init_rhs<<>> + (n_cells_ext, + rhs_d); _init_rhs_v2<<>> (n_cells_ext*3*3, @@ -1375,21 +1530,21 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - // _compute_rhs_lsq_v_i_face<<>> - // (n_i_faces, - // i_face_cells, - // cell_f_cen, - // rhs_d, - // pvar_d, - // weight, - // c_weight); + _compute_rhs_lsq_v_i_face<<>> + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); _compute_rhs_lsq_v_i_face_v2<<>> (n_i_faces, i_face_cells_1d, cell_f_cen_1d, rhs_test_d, - pvar_d_1d, + pvar_d, weight, c_weight); @@ -1407,17 +1562,17 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - // _compute_rhs_lsq_v_b_face<<>> - // (m->n_b_faces, - // b_face_cells, - // cell_f_cen, - // b_face_normal, - // rhs_d, - // pvar_d, - // b_dist, - // coefb_d, - // coefa_d, - // inc); + _compute_rhs_lsq_v_b_face<<>> + (m->n_b_faces, + b_face_cells, + cell_f_cen, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); _compute_rhs_lsq_v_b_face_v2<<>> (m->n_b_faces, @@ -1434,12 +1589,12 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - if (rhs_test_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(rhs, rhs_test_d, size); - } - else - cs_sync_d2h(rhs); + // if (rhs_test_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(rhs, rhs_test_d, size); + // } + // else + // cs_sync_d2h(rhs); // /* Compute gradient */ // /*------------------*/ @@ -1450,12 +1605,25 @@ 
cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // rhs_d, // cocg_d); - _compute_gradient_lsq_v_v2<<>> - (n_cells, + // _compute_gradient_lsq_v_v4<<>> + // (n_cells, + // grad_d, + // rhs_d, + // cocg_d); + + + _compute_gradient_lsq_v_v5<<>> + (n_cells*3*3, gradv_test_d, rhs_test_d, cocg_d); + _compute_gradient_lsq_v_v6<<>> + (n_cells*3*3, + grad_d, + rhs_d, + cocg_d); + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); // /* Sync to host */ @@ -1515,7 +1683,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaFree(rhs_test_d)); CS_CUDA_CHECK(cudaFree(cocg_d)); CS_CUDA_CHECK(cudaFree(grad_d)); + CS_CUDA_CHECK(cudaFree(gradv_test_d)); + CS_CUDA_CHECK(cudaFree(pvar_d_1d)); } - -// cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, ) From 8c9aab69e15d870750dc914525c47b6b2a10edd9 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 2 Nov 2023 15:36:14 +0100 Subject: [PATCH 10/70] Refactor with multiple files --- src/alge/cs_gradient_cuda.cu | 600 ++----------------------- src/alge/cs_gradient_cuda.cuh | 96 ++++ src/alge/cs_gradient_lsq_vector.cuh | 252 +++++++++++ src/alge/cs_gradient_lsq_vector_v2.cuh | 186 ++++++++ src/alge/cs_gradient_lsq_vector_v3.cuh | 156 +++++++ 5 files changed, 730 insertions(+), 560 deletions(-) create mode 100644 src/alge/cs_gradient_cuda.cuh create mode 100644 src/alge/cs_gradient_lsq_vector.cuh create mode 100644 src/alge/cs_gradient_lsq_vector_v2.cuh create mode 100644 src/alge/cs_gradient_lsq_vector_v3.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index c949dbcb37..00a1bf17f9 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -23,61 +23,12 @@ */ /*----------------------------------------------------------------------------*/ +#include "cs_gradient_cuda.cuh" -#include "cs_defs.h" - -/*---------------------------------------------------------------------------- - * Standard C library headers - 
*----------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(HAVE_MPI) -#include -#endif - -#include - -/*---------------------------------------------------------------------------- - * Local headers - *----------------------------------------------------------------------------*/ - -#include "bft_error.h" -#include "bft_mem.h" - -#include "cs_base_accel.h" -#include "cs_base_cuda.h" -#include "cs_blas.h" -#include "cs_cell_to_vertex.h" -#include "cs_ext_neighborhood.h" -#include "cs_field.h" -#include "cs_field_pointer.h" -#include "cs_halo.h" -#include "cs_halo_perio.h" -#include "cs_log.h" -#include "cs_math.h" -#include "cs_mesh.h" -#include "cs_mesh_adjacencies.h" -#include "cs_mesh_quantities.h" -#include "cs_parall.h" -#include "cs_porous_model.h" -#include "cs_prototypes.h" -#include "cs_timer.h" -#include "cs_timer_stats.h" - -/*---------------------------------------------------------------------------- - * Header for the current file - *----------------------------------------------------------------------------*/ - -#include "cs_gradient.h" -#include "cs_gradient_priv.h" +#include "cs_gradient_lsq_vector.cuh" +#include "cs_gradient_lsq_vector_v2.cuh" +#include "cs_gradient_lsq_vector_v3.cuh" +#include "cs_gradient_lsq_vector_gather.cuh" /*! \cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -95,20 +46,6 @@ * Private function definitions *============================================================================*/ -__device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], - cs_real_t out[3]) -{ - cs_real_t norm = sqrt(in[0]*in[0] - + in[1]*in[1] - + in[2]*in[2]); - - cs_real_t inverse_norm = 1. 
/ norm; - - out[0] = inverse_norm * in[0]; - out[1] = inverse_norm * in[1]; - out[2] = inverse_norm * in[2]; -} - /*---------------------------------------------------------------------------- * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ @@ -445,49 +382,6 @@ _init_rhsv(cs_lnum_t size, } } - -/*---------------------------------------------------------------------------- - * Initialize RHS with null values - *----------------------------------------------------------------------------*/ - -__global__ static void -_init_rhs(cs_lnum_t size, - cs_real_33_t *restrict rhs) -{ - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) { - for (cs_lnum_t i = 0; i < 3; i++) - for (cs_lnum_t j = 0; j < 3; j++) - rhs[c_id][i][j] = 0.0; - } -} - -__global__ static void -_test(cs_lnum_t size, - const cs_real_3_t *restrict pvar, - cs_real_t *restrict pvar_1d) -{ - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) { - for (cs_lnum_t i = 0; i < 3; i++){ - if(pvar[c_id][i] != pvar_1d[c_id*3 + i]){ - printf("\tNot equal"); - } - } - } -} - -__global__ static void -_init_rhs_v2(cs_lnum_t size, - cs_real_t *restrict rhs) -{ - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - rhs[c_id] = 0.0; -} - __global__ static void _init_rhs_v3(cs_lnum_t size, double3 *restrict rhs) @@ -499,336 +393,6 @@ _init_rhs_v3(cs_lnum_t size, rhs[c_id] = make_double3(0.0, 0.0, 0.0); } -__global__ static void -_compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, - const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *cell_f_cen, - cs_real_33_t *rhs, - const cs_real_3_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; - cs_lnum_t c_id1, c_id2; - - c_id1 = 
i_face_cells[f_id][0]; - c_id2 = i_face_cells[f_id][1]; - - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; - - ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - - if (c_weight != NULL){ - _pond = weight[f_id]; - _denom = 1. / ( _pond *c_weight[c_id1] - + (1. - _pond)*c_weight[c_id2]); - - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - atomicAdd(&rhs[c_id1][i][j], c_weight[c_id2] * _denom * fctb[j]); - atomicAdd(&rhs[c_id2][i][j], c_weight[c_id1] * _denom * fctb[j]); - } - } - } - else{ - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - atomicAdd(&rhs[c_id1][i][j], fctb[j]); - atomicAdd(&rhs[c_id2][i][j], fctb[j]); - } - } - } -} - -__global__ static void -_compute_rhs_lsq_v_i_face(cs_lnum_t size, - const cs_lnum_2_t *restrict i_face_cells, - const cs_real_3_t *restrict cell_f_cen, - cs_real_33_t *restrict rhs, - const cs_real_3_t *restrict pvar, - const cs_real_t *restrict weight, - const cs_real_t *restrict c_weight) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; - cs_lnum_t c_id1, c_id2; - - c_id1 = i_face_cells[f_id][0]; - c_id2 = i_face_cells[f_id][1]; - - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; - - ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - - if (c_weight == NULL){ - _weight1 = 1.; - _weight2 = 1.; - } - else{ - _pond = weight[f_id]; - _denom = 1. / ( _pond *c_weight[c_id1] - + (1. 
- _pond)*c_weight[c_id2]); - _weight1 = c_weight[c_id1] * _denom; - _weight2 = c_weight[c_id2] * _denom; - } - - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); - atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); - } - } -} - -__global__ static void -_compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, - const cs_lnum_t *restrict i_face_cells, - const cs_real_t *restrict cell_f_cen, - cs_real_t *restrict rhs, - const cs_real_3_t *restrict pvar, - const cs_real_t *restrict weight, - const cs_real_t *restrict c_weight) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; - cs_lnum_t c_id1, c_id2; - - c_id1 = i_face_cells[f_id*2]; - c_id2 = i_face_cells[f_id*2 + 1]; - - dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; - dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; - dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; - - ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - - if (c_weight == NULL){ - _weight1 = 1.; - _weight2 = 1.; - } - else{ - _pond = weight[f_id]; - _denom = 1. / ( _pond *c_weight[c_id1] - + (1. 
- _pond)*c_weight[c_id2]); - _weight1 = c_weight[c_id1] * _denom; - _weight2 = c_weight[c_id2] * _denom; - } - - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); - atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); - } - } -} - - -__global__ static void -_compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, - const cs_lnum_t *restrict cell_cells_idx, - const cs_lnum_t *restrict cell_cells_lst, - const cs_real_3_t *restrict cell_f_cen, - cs_real_33_t *restrict rhs, - const cs_real_3_t *restrict pvar) -{ - cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; - - if(c_id1 >= size){ - return; - } - - cs_lnum_t s_id = cell_cells_idx[c_id1]; - cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; - - cs_real_t dc[3], ddc, pfac; - - for(cs_lnum_t index = s_id; index < e_id; index++){ - - cs_lnum_t c_id2 = cell_cells_idx[index]; - - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; - - ddc = 1./(dc[0] * dc[0] + dc[1] * dc[1] + dc[2] * dc[2]); - - for (cs_lnum_t i = 0; i < 3; i++) { - - pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - - for (cs_lnum_t j = 0; j < 3; j++) { - rhs[c_id1][i][j] += dc[j] * pfac; - } - } - } - -} - -__global__ static void -_compute_rhs_lsq_v_b_face(cs_lnum_t size, - const cs_lnum_t *restrict b_face_cells, - const cs_real_3_t *restrict cell_f_cen, - const cs_real_3_t *restrict b_face_normal, - cs_real_33_t *restrict rhs, - const cs_real_3_t *restrict pvar, - const cs_real_t *restrict b_dist, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const int inc) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - - cs_lnum_t c_id1; - cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; - - c_id1 = 
b_face_cells[f_id]; - - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); - - d_b_dist = 1. / b_dist[f_id]; - - /* Normal divided by b_dist */ - n_d_dist[0] *= d_b_dist; - n_d_dist[1] *= d_b_dist; - n_d_dist[2] *= d_b_dist; - - for (cs_lnum_t i = 0; i < 3; i++) { - pfac = coefav[f_id][i]*inc - + ( coefbv[f_id][0][i] * pvar[c_id1][0] - + coefbv[f_id][1][i] * pvar[c_id1][1] - + coefbv[f_id][2][i] * pvar[c_id1][2] - - pvar[c_id1][i]); - - atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); - atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); - atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); - } -} - -__global__ static void -_compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, - const cs_lnum_t *restrict b_face_cells, - const cs_real_3_t *restrict cell_f_cen, - const cs_real_3_t *restrict b_face_normal, - cs_real_t *restrict rhs, - const cs_real_3_t *restrict pvar, - const cs_real_t *restrict b_dist, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const int inc) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - - cs_lnum_t c_id1; - cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; - - c_id1 = b_face_cells[f_id]; - - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); - - d_b_dist = 1. 
/ b_dist[f_id]; - - /* Normal divided by b_dist */ - n_d_dist[0] *= d_b_dist; - n_d_dist[1] *= d_b_dist; - n_d_dist[2] *= d_b_dist; - - for (cs_lnum_t i = 0; i < 3; i++) { - pfac = coefav[f_id][i]*inc - + ( coefbv[f_id][0][i] * pvar[c_id1][0] - + coefbv[f_id][1][i] * pvar[c_id1][1] - + coefbv[f_id][2][i] * pvar[c_id1][2] - - pvar[c_id1][i]); - - atomicAdd(&rhs[c_id1*3*3 + i*3], n_d_dist[0] * pfac); - atomicAdd(&rhs[c_id1*3*3 + i*3 + 1], n_d_dist[1] * pfac); - atomicAdd(&rhs[c_id1*3*3 + i*3 + 2], n_d_dist[2] * pfac); - } -} - -__global__ static void -_compute_gradient_lsq_v(cs_lnum_t size, - cs_real_33_t *restrict gradv, - cs_real_33_t *restrict rhs, - cs_cocg_6_t *restrict cocg) -{ - size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - for(cs_lnum_t i = 0; i < 3; i++){ - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] - + rhs[c_id][i][1] * cocg[c_id][3] - + rhs[c_id][i][2] * cocg[c_id][5]; - - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] - + rhs[c_id][i][1] * cocg[c_id][1] - + rhs[c_id][i][2] * cocg[c_id][4]; - - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] - + rhs[c_id][i][1] * cocg[c_id][4] - + rhs[c_id][i][2] * cocg[c_id][2]; - } -} - -__global__ static void -_compute_gradient_lsq_v_v2(cs_lnum_t size, - cs_real_t *restrict gradv, - cs_real_t *restrict rhs, - cs_cocg_6_t *restrict cocg) -{ - size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - for(cs_lnum_t i = 0; i < 3; i++){ - gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; - - gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; - - gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; - } -} - __global__ static void 
_compute_gradient_lsq_v_v3(cs_lnum_t size, cs_real_33_t *restrict gradv, @@ -858,112 +422,6 @@ _compute_gradient_lsq_v_v3(cs_lnum_t size, } } -__global__ static void -_compute_gradient_lsq_v_v4(cs_lnum_t size, - cs_real_33_t *restrict gradv_m, - cs_real_33_t *restrict rhs_m, - cs_cocg_6_t *restrict cocg) -{ - size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - cs_real_t *rhs = (cs_real_t *) rhs_m; - cs_real_t *gradv = (cs_real_t *) gradv_m; - - for(cs_lnum_t i = 0; i < 3; i++){ - gradv[c_id*3*3 + i*3] = rhs[c_id*3*3 + i*3] * cocg[c_id][0] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][3] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][5]; - - gradv[c_id*3*3 + i*3 + 1] = rhs[c_id*3*3 + i*3] * cocg[c_id][3] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][1] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][4]; - - gradv[c_id*3*3 + i*3 + 2] = rhs[c_id*3*3 + i*3] * cocg[c_id][5] - + rhs[c_id*3*3 + i*3 + 1] * cocg[c_id][4] - + rhs[c_id*3*3 + i*3 + 2] * cocg[c_id][2]; - } -} - -__global__ static void -_compute_gradient_lsq_v_v5(cs_lnum_t size, - cs_real_t *restrict gradv, - cs_real_t *restrict rhs, - cs_cocg_6_t *restrict cocg) -{ - size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - size_t c_id1 = c_id / (3*3); - size_t i = (c_id / 3) % 3; - size_t j = c_id % 3; - - auto cocg_temp = cocg[c_id1]; - cs_real_t _cocg[3]; - - _cocg[0] = cocg_temp[5]; - _cocg[1] = cocg_temp[4]; - _cocg[2] = cocg_temp[2]; - - if(j == 0){ - _cocg[0] = cocg_temp[0]; - _cocg[1] = cocg_temp[3]; - _cocg[2] = cocg_temp[5]; - } - - if(j == 1){ - _cocg[0] = cocg_temp[3]; - _cocg[1] = cocg_temp[1]; - _cocg[2] = cocg_temp[4]; - } - - gradv[c_id] = rhs[c_id1*3*3 + i*3] * _cocg[0] - + rhs[c_id1*3*3 + i*3 + 1] * _cocg[1] - + rhs[c_id1*3*3 + i*3 + 2] * _cocg[2]; - -} - -__global__ static void -_compute_gradient_lsq_v_v6(cs_lnum_t size, - cs_real_33_t *restrict gradv, - cs_real_33_t *restrict rhs, - cs_cocg_6_t *restrict cocg) -{ - size_t c_id = blockIdx.x * 
blockDim.x + threadIdx.x; - if (c_id >= size) - return; - - size_t c_id1 = c_id / (3*3); - size_t i = (c_id / 3) % 3; - size_t j = c_id % 3; - - auto cocg_temp = cocg[c_id1]; - cs_real_t _cocg[3]; - - _cocg[0] = cocg_temp[5]; - _cocg[1] = cocg_temp[4]; - _cocg[2] = cocg_temp[2]; - - if(j == 0){ - _cocg[0] = cocg_temp[0]; - _cocg[1] = cocg_temp[3]; - _cocg[2] = cocg_temp[5]; - } - - if(j == 1){ - _cocg[0] = cocg_temp[3]; - _cocg[1] = cocg_temp[1]; - _cocg[2] = cocg_temp[4]; - } - - gradv[c_id1][i][j] = rhs[c_id1][i][0] * _cocg[0] - + rhs[c_id1][i][1] * _cocg[1] - + rhs[c_id1][i][2] * _cocg[2]; - -} - __global__ static void _compute_gradient_lsq_b_v(cs_lnum_t size, cs_lnum_t n_b_cells, @@ -1454,6 +912,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (unsigned int)ceil((double)m->n_b_cells / blocksize); unsigned int gridsize_if = (unsigned int)ceil((double)m->n_i_faces / blocksize); + unsigned int gridsize_if_bis + = (unsigned int)ceil((double)(m->n_i_faces*3*3) / blocksize); unsigned int gridsize_bf = (unsigned int)ceil((double)m->n_b_faces / blocksize); unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); @@ -1493,6 +953,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t *restrict i_face_cells_1d = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + // printf("n_i_thread:%d\tn_i_groups:%d\tn_cells%d\n", n_i_threads, n_i_groups, n_cells); _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); @@ -1508,9 +969,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, (n_cells_ext, rhs_d); - _init_rhs_v2<<>> - (n_cells_ext*3*3, - rhs_test_d); + // _init_rhs_v2<<>> + // (n_cells_ext*3*3, + // rhs_test_d); // _init_rhs_v3<<>> // (n_cells_ext*3, @@ -1530,14 +991,14 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face<<>> - (n_i_faces, - i_face_cells, - cell_f_cen, - rhs_d, - pvar_d, - weight, - c_weight); + // 
_compute_rhs_lsq_v_i_face<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); _compute_rhs_lsq_v_i_face_v2<<>> (n_i_faces, @@ -1548,6 +1009,25 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, weight, c_weight); + // _compute_rhs_lsq_v_i_face_v3<<>> + // (n_i_faces*3*3, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_gather<<>> + (n_cells, + cell_cells_idx, + cell_cells_lst, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ @@ -1627,9 +1107,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); // /* Sync to host */ - if (gradv_test_d != NULL) { + if (grad_d != NULL) { size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(gradv, gradv_test_d, size); + cs_cuda_copy_d2h(gradv, grad_d, size); } else cs_sync_d2h(gradv); diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh new file mode 100644 index 0000000000..15a25ed799 --- /dev/null +++ b/src/alge/cs_gradient_cuda.cuh @@ -0,0 +1,96 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +#include "cs_defs.h" + +/*---------------------------------------------------------------------------- + * Standard C library headers + *----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_MPI) +#include +#endif + +#include + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "bft_error.h" +#include "bft_mem.h" + +#include "cs_base_accel.h" +#include "cs_base_cuda.h" +#include "cs_blas.h" +#include "cs_cell_to_vertex.h" +#include "cs_ext_neighborhood.h" +#include "cs_field.h" +#include "cs_field_pointer.h" +#include "cs_halo.h" +#include "cs_halo_perio.h" +#include "cs_log.h" +#include "cs_math.h" +#include "cs_mesh.h" +#include "cs_mesh_adjacencies.h" +#include "cs_mesh_quantities.h" +#include "cs_parall.h" +#include "cs_porous_model.h" +#include "cs_prototypes.h" +#include "cs_timer.h" +#include "cs_timer_stats.h" + +/*---------------------------------------------------------------------------- + * Header for the current file + *----------------------------------------------------------------------------*/ + +#include "cs_gradient.h" +#include "cs_gradient_priv.h" + + + +__device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. 
/ norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} \ No newline at end of file diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh new file mode 100644 index 0000000000..31ee4e95eb --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -0,0 +1,252 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Initialize RHS with null values + *----------------------------------------------------------------------------*/ + +__global__ static void +_init_rhs(cs_lnum_t size, + cs_real_33_t *restrict rhs) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id < size) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + rhs[c_id][i][j] = 0.0; + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *cell_f_cen, + cs_real_33_t *rhs, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL){ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], c_weight[c_id2] * _denom * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], c_weight[c_id1] * _denom * fctb[j]); + } + } + } + else{ + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], fctb[j]); + atomicAdd(&rhs[c_id2][i][j], fctb[j]); + } + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } +} + +__global__ static void +_compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells_lst, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= size){ + return; + } + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + cs_real_t dc[3], ddc, pfac; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + cs_lnum_t c_id2 = cell_cells_idx[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0] * dc[0] + dc[1] * dc[1] + dc[2] * dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + +} + +__global__ static void +_compute_rhs_lsq_v_b_face(cs_lnum_t size, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + + cs_lnum_t c_id1; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + c_id1 = b_face_cells[f_id]; + + 
cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + atomicAdd(&rhs[c_id1][i][0], n_d_dist[0] * pfac); + atomicAdd(&rhs[c_id1][i][1], n_d_dist[1] * pfac); + atomicAdd(&rhs[c_id1][i][2], n_d_dist[2] * pfac); + } +} + +__global__ static void +_compute_gradient_lsq_v(cs_lnum_t size, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocg) +{ + size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + if (c_id >= size) + return; + + for(cs_lnum_t i = 0; i < 3; i++){ + gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; + + gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; + + gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; + } +} \ No newline at end of file diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh new file mode 100644 index 0000000000..7ca3800542 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -0,0 +1,186 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
+
+  This program is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free Software
+  Foundation; either version 2 of the License, or (at your option) any later
+  version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+  details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+  Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+/*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------
+ * Initialize RHS with null values
+ *
+ * One thread per scalar entry: entries [0, size) of the flat rhs array are
+ * set to zero; threads whose index is not below size do nothing.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_init_rhs_v2(cs_lnum_t         size,
+             cs_real_t        *restrict rhs)
+{
+  const cs_lnum_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < size)
+    rhs[idx] = 0.0;
+}
+
+/*----------------------------------------------------------------------------
+ * Accumulate the interior-face part of the least-squares RHS (flat arrays).
+ *
+ * Each thread processes one face index below size: it reads the two adjacent
+ * cell ids from i_face_cells (2 entries per face), builds the vector between
+ * the two cell centers from cell_f_cen (3 entries per cell) and adds the
+ * weighted contribution to both cells of rhs (3x3 entries per cell) using
+ * atomicAdd.
+ *
+ * When c_weight is NULL both cell weights are 1; otherwise they are derived
+ * from weight[face] and the c_weight values of the two cells.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_rhs_lsq_v_i_face_v2(cs_lnum_t           size,
+                             const cs_lnum_t      *restrict i_face_cells,
+                             const cs_real_t      *restrict cell_f_cen,
+                             cs_real_t            *restrict rhs,
+                             const cs_real_3_t    *restrict pvar,
+                             const cs_real_t      *restrict weight,
+                             const cs_real_t      *restrict c_weight)
+{
+  const cs_lnum_t face = blockIdx.x * blockDim.x + threadIdx.x;
+  if (face >= size)
+    return;
+
+  const cs_lnum_t ii = i_face_cells[face*2];
+  const cs_lnum_t jj = i_face_cells[face*2 + 1];
+
+  /* Vector from the center of cell ii to the center of cell jj */
+  cs_real_t delta[3];
+  for (cs_lnum_t k = 0; k < 3; k++)
+    delta[k] = cell_f_cen[jj*3 + k] - cell_f_cen[ii*3 + k];
+
+  const cs_real_t inv_d2
+    = 1./(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+
+  /* Cell weights: 1 unless a weighting field is given */
+  cs_real_t w_ii = 1., w_jj = 1.;
+  if (c_weight != NULL) {
+    const cs_real_t alpha = weight[face];
+    const cs_real_t inv_den = 1. / (       alpha *c_weight[ii]
+                                    + (1. - alpha)*c_weight[jj]);
+    w_ii = c_weight[ii] * inv_den;
+    w_jj = c_weight[jj] * inv_den;
+  }
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+    const cs_real_t jump = (pvar[jj][i] - pvar[ii][i]) * inv_d2;
+    for (cs_lnum_t j = 0; j < 3; j++) {
+      const cs_real_t contrib = delta[j] * jump;
+      atomicAdd(&rhs[ii*3*3 + i*3 + j], w_jj * contrib);
+      atomicAdd(&rhs[jj*3*3 + i*3 + j], w_ii * contrib);
+    }
+  }
+}
+
+/*----------------------------------------------------------------------------
+ * Accumulate the boundary-face part of the least-squares RHS (flat rhs).
+ *
+ * Each thread processes one face index below size. The face normal is passed
+ * through cs_math_3_normalise_cuda and divided by b_dist[face]; the boundary
+ * value jump built from coefav, coefbv, inc and pvar of the adjacent cell is
+ * then added, component by component, to that cell's 3x3 rhs entries with
+ * atomicAdd. cell_f_cen is not read by this kernel.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_rhs_lsq_v_b_face_v2(cs_lnum_t           size,
+                             const cs_lnum_t      *restrict b_face_cells,
+                             const cs_real_3_t    *restrict cell_f_cen,
+                             const cs_real_3_t    *restrict b_face_normal,
+                             cs_real_t            *restrict rhs,
+                             const cs_real_3_t    *restrict pvar,
+                             const cs_real_t      *restrict b_dist,
+                             const cs_real_33_t   *restrict coefbv,
+                             const cs_real_3_t    *restrict coefav,
+                             const int             inc)
+{
+  const cs_lnum_t face = blockIdx.x * blockDim.x + threadIdx.x;
+  if (face >= size)
+    return;
+
+  const cs_lnum_t cell = b_face_cells[face];
+
+  /* Normalised face normal divided by b_dist */
+  cs_real_t nrm[3];
+  cs_math_3_normalise_cuda(b_face_normal[face], nrm);
+
+  const cs_real_t inv_dist = 1. / b_dist[face];
+  for (cs_lnum_t k = 0; k < 3; k++)
+    nrm[k] *= inv_dist;
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+    const cs_real_t jump = coefav[face][i]*inc
+                         + (  coefbv[face][0][i] * pvar[cell][0]
+                            + coefbv[face][1][i] * pvar[cell][1]
+                            + coefbv[face][2][i] * pvar[cell][2]
+                            - pvar[cell][i]);
+
+    for (cs_lnum_t k = 0; k < 3; k++)
+      atomicAdd(&rhs[cell*3*3 + i*3 + k], nrm[k] * jump);
+  }
+}
+
+/*----------------------------------------------------------------------------
+ * Compute the gradient from the RHS and cocg (flat gradv / rhs arrays).
+ *
+ * One thread per cell index below size. Each of the 3 rows of the cell's
+ * rhs block is multiplied by the symmetric 3x3 matrix whose entries are read
+ * from cocg[cell] with the index layout
+ *   | 0 3 5 |
+ *   | 3 1 4 |
+ *   | 5 4 2 |
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_gradient_lsq_v_v2(cs_lnum_t           size,
+                           cs_real_t          *restrict gradv,
+                           cs_real_t          *restrict rhs,
+                           cs_cocg_6_t        *restrict cocg)
+{
+  const size_t cell = blockIdx.x * blockDim.x + threadIdx.x;
+  if (cell >= size)
+    return;
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+    const size_t row = cell*3*3 + i*3;
+    const cs_real_t r0 = rhs[row];
+    const cs_real_t r1 = rhs[row + 1];
+    const cs_real_t r2 = rhs[row + 2];
+
+    gradv[row]     =   r0 * cocg[cell][0]
+                     + r1 * cocg[cell][3]
+                     + r2 * cocg[cell][5];
+
+    gradv[row + 1] =   r0 * cocg[cell][3]
+                     + r1 * cocg[cell][1]
+                     + r2 * cocg[cell][4];
+
+    gradv[row + 2] =   r0 * cocg[cell][5]
+                     + r1 * cocg[cell][4]
+                     + r2 * cocg[cell][2];
+  }
+}
+
+/*----------------------------------------------------------------------------
+ * Same computation as _compute_gradient_lsq_v_v2, but taking the gradient
+ * and RHS as cs_real_33_t arrays that are accessed through flat cs_real_t
+ * pointers (9 consecutive values per cell).
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_gradient_lsq_v_v4(cs_lnum_t           size,
+                           cs_real_33_t       *restrict gradv_m,
+                           cs_real_33_t       *restrict rhs_m,
+                           cs_cocg_6_t        *restrict cocg)
+{
+  const size_t cell = blockIdx.x * blockDim.x + threadIdx.x;
+  if (cell >= size)
+    return;
+
+  /* Flat views of the 3x3 blocks */
+  cs_real_t *flat_rhs = (cs_real_t *) rhs_m;
+  cs_real_t *flat_grad = (cs_real_t *) gradv_m;
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+    const size_t row = cell*3*3 + i*3;
+    const cs_real_t r0 = flat_rhs[row];
+    const cs_real_t r1 = flat_rhs[row + 1];
+    const cs_real_t r2 = flat_rhs[row + 2];
+
+    flat_grad[row]     =   r0 * cocg[cell][0]
+                         + r1 * cocg[cell][3]
+                         + r2 * cocg[cell][5];
+
+    flat_grad[row + 1] =   r0 * cocg[cell][3]
+                         + r1 * cocg[cell][1]
+                         + r2 * cocg[cell][4];
+
+    flat_grad[row + 2] =   r0 * cocg[cell][5]
+                         + r1 * cocg[cell][4]
+                         + r2 * cocg[cell][2];
+  }
+}
diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh
new file mode 100644
index 0000000000..85dfe345c9
--- /dev/null
+++ b/src/alge/cs_gradient_lsq_vector_v3.cuh
@@ -0,0 +1,156 @@
+/*============================================================================
+ * Gradient reconstruction, CUDA implementations.
+ *============================================================================*/
+
+/*
+  This file is part of code_saturne, a general-purpose CFD tool.
+
+  Copyright (C) 1998-2023 EDF S.A.
+
+  This program is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free Software
+  Foundation; either version 2 of the License, or (at your option) any later
+  version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+  details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+  Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+/*----------------------------------------------------------------------------*/
+
+
+/*----------------------------------------------------------------------------
+ * Accumulate the interior-face part of the least-squares RHS, one thread
+ * per (face, i, j) triplet.
+ *
+ * The flat thread index t (t < size) is decomposed as face = t / 9,
+ * i = (t / 3) % 3, j = t % 3; the thread adds the single (i, j) entry of the
+ * face contribution to rhs of both adjacent cells with atomicAdd.
+ *
+ * When c_weight is NULL both cell weights are 1; otherwise they are derived
+ * from weight[face] and the c_weight values of the two cells.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_rhs_lsq_v_i_face_v3(cs_lnum_t           size,
+                             const cs_lnum_2_t    *restrict i_face_cells,
+                             const cs_real_3_t    *restrict cell_f_cen,
+                             cs_real_33_t         *restrict rhs,
+                             const cs_real_3_t    *restrict pvar,
+                             const cs_real_t      *restrict weight,
+                             const cs_real_t      *restrict c_weight)
+{
+  const cs_lnum_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= size)
+    return;
+
+  /* Decompose the flat thread index into (face, i, j) */
+  const size_t face = tid / (3*3);
+  const size_t i = (tid / 3) % 3;
+  const size_t j = tid % 3;
+
+  const cs_lnum_t ii = i_face_cells[face][0];
+  const cs_lnum_t jj = i_face_cells[face][1];
+
+  /* Vector from the center of cell ii to the center of cell jj */
+  cs_real_t delta[3];
+  for (cs_lnum_t k = 0; k < 3; k++)
+    delta[k] = cell_f_cen[jj][k] - cell_f_cen[ii][k];
+
+  const cs_real_t inv_d2
+    = 1./(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+
+  /* Cell weights: 1 unless a weighting field is given */
+  cs_real_t w_ii = 1., w_jj = 1.;
+  if (c_weight != NULL) {
+    const cs_real_t alpha = weight[face];
+    const cs_real_t inv_den = 1. / (       alpha *c_weight[ii]
+                                    + (1. - alpha)*c_weight[jj]);
+    w_ii = c_weight[ii] * inv_den;
+    w_jj = c_weight[jj] * inv_den;
+  }
+
+  const cs_real_t jump = (pvar[jj][i] - pvar[ii][i]) * inv_d2;
+  const cs_real_t contrib = delta[j] * jump;
+
+  atomicAdd(&rhs[ii][i][j], w_jj * contrib);
+  atomicAdd(&rhs[jj][i][j], w_ii * contrib);
+}
+
+/*----------------------------------------------------------------------------
+ * Compute the gradient from the RHS and cocg, one thread per scalar entry
+ * of the flat gradv array.
+ *
+ * The flat thread index t (t < size) is decomposed as cell = t / 9,
+ * i = (t / 3) % 3, j = t % 3. The thread stores in gradv[t] the product of
+ * row i of the cell's rhs block with column j of the symmetric matrix read
+ * from cocg[cell] with the index layout
+ *   | 0 3 5 |
+ *   | 3 1 4 |
+ *   | 5 4 2 |
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_gradient_lsq_v_v5(cs_lnum_t           size,
+                           cs_real_t          *restrict gradv,
+                           cs_real_t          *restrict rhs,
+                           cs_cocg_6_t        *restrict cocg)
+{
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= size)
+    return;
+
+  const size_t cell = tid / (3*3);
+  const size_t i = (tid / 3) % 3;
+  const size_t j = tid % 3;
+
+  const auto sym = cocg[cell];
+
+  /* Column j of the symmetric matrix */
+  cs_real_t col[3];
+  switch (j) {
+  case 0:
+    col[0] = sym[0];  col[1] = sym[3];  col[2] = sym[5];
+    break;
+  case 1:
+    col[0] = sym[3];  col[1] = sym[1];  col[2] = sym[4];
+    break;
+  default:
+    col[0] = sym[5];  col[1] = sym[4];  col[2] = sym[2];
+    break;
+  }
+
+  const size_t row = cell*3*3 + i*3;
+
+  gradv[tid] =   rhs[row]     * col[0]
+               + rhs[row + 1] * col[1]
+               + rhs[row + 2] * col[2];
+}
+
+/*----------------------------------------------------------------------------
+ * Same computation as _compute_gradient_lsq_v_v5, with the gradient and the
+ * RHS addressed as cs_real_33_t arrays ([cell][i][j]) instead of flat ones.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_gradient_lsq_v_v6(cs_lnum_t           size,
+                           cs_real_33_t       *restrict gradv,
+                           cs_real_33_t       *restrict rhs,
+                           cs_cocg_6_t        *restrict cocg)
+{
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= size)
+    return;
+
+  const size_t cell = tid / (3*3);
+  const size_t i = (tid / 3) % 3;
+  const size_t j = tid % 3;
+
+  const auto sym = cocg[cell];
+
+  /* Column j of the symmetric matrix */
+  cs_real_t col[3];
+  switch (j) {
+  case 0:
+    col[0] = sym[0];  col[1] = sym[3];  col[2] = sym[5];
+    break;
+  case 1:
+    col[0] = sym[3];  col[1] = sym[1];  col[2] = sym[4];
+    break;
+  default:
+    col[0] = sym[5];  col[1] = sym[4];  col[2] = sym[2];
+    break;
+  }
+
+  gradv[cell][i][j] =   rhs[cell][i][0] * col[0]
+                      + rhs[cell][i][1] * col[1]
+                      + rhs[cell][i][2] * col[2];
+}
From
7494fc25a6d7d1c9cdef511dd4ce815ec68934db Mon Sep 17 00:00:00 2001
From: ddiakiteaneo
Date: Thu, 2 Nov 2023 15:36:50 +0100
Subject: [PATCH 11/70] Gather version in progress
---
 src/alge/cs_gradient_lsq_vector_gather.cuh | 78 ++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 src/alge/cs_gradient_lsq_vector_gather.cuh
diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh
new file mode 100644
index 0000000000..90caa30d55
--- /dev/null
+++ b/src/alge/cs_gradient_lsq_vector_gather.cuh
@@ -0,0 +1,78 @@
+/*============================================================================
+ * Gradient reconstruction, CUDA implementations.
+ *============================================================================*/
+
+/*
+  This file is part of code_saturne, a general-purpose CFD tool.
+
+  Copyright (C) 1998-2023 EDF S.A.
+
+  This program is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free Software
+  Foundation; either version 2 of the License, or (at your option) any later
+  version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+  details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+  Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+/*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------
+ * Accumulate least-squares RHS contributions, one thread per cell.
+ *
+ * Thread c_id (c_id < size) walks the entries
+ * cell_cells_lst[cell_cells_idx[c_id] .. cell_cells_idx[c_id + 1] - 1] and,
+ * for each neighbor c_id2, adds a contribution to rhs of both c_id and c_id2.
+ *
+ * Since rhs[c_id2] belongs to the cell handled by another thread, every
+ * update goes through atomicAdd (like the face-based kernels of this patch);
+ * a plain "+=" would be a data race.
+ *
+ * parameters:
+ *   size            <-- number of cells to process
+ *   cell_cells_idx  <-- cell -> neighbor list index (size + 1 entries read)
+ *   cell_cells_lst  <-- cell -> neighbor list
+ *   cell_f_cen      <-- cell centers
+ *   rhs             <-> right-hand side, accumulated in place
+ *   pvar            <-- variable
+ *   weight          <-- weighting read at the adjacency position
+ *   c_weight        <-- cell weighting, or NULL (then both weights are 1)
+ *
+ * NOTE(review): weight is indexed with the adjacency position "index", not
+ * with a face id as in the face-based kernels - confirm that the array
+ * passed here is laid out accordingly.
+ * NOTE(review): if the neighbor lists are symmetric (c_id listed for c_id2
+ * and vice versa) each pair is counted from both sides - confirm which
+ * adjacency is intended for this kernel.
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_rhs_lsq_v_i_face_gather(cs_lnum_t           size,
+                                 const cs_lnum_t      *restrict cell_cells_idx,
+                                 const cs_lnum_t      *restrict cell_cells_lst,
+                                 const cs_real_3_t    *restrict cell_f_cen,
+                                 cs_real_33_t         *restrict rhs,
+                                 const cs_real_3_t    *restrict pvar,
+                                 const cs_real_t      *restrict weight,
+                                 const cs_real_t      *restrict c_weight)
+{
+  cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_id >= size)
+    return;
+
+  cs_real_t dc[3], fctb, ddc, _weight1, _weight2, _denom, _pond, pfac;
+  cs_lnum_t c_id2;
+
+  cs_lnum_t s_id = cell_cells_idx[c_id];
+  cs_lnum_t e_id = cell_cells_idx[c_id + 1];
+
+  for (cs_lnum_t index = s_id; index < e_id; index++) {
+    c_id2 = cell_cells_lst[index];
+
+    dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id][0];
+    dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id][1];
+    dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id][2];
+
+    ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]);
+
+    if (c_weight == NULL) {
+      _weight1 = 1.;
+      _weight2 = 1.;
+    }
+    else {
+      _pond = weight[index];
+      _denom = 1. / (       _pond *c_weight[c_id]
+                     + (1. - _pond)*c_weight[c_id2]);
+      _weight1 = c_weight[c_id] * _denom;
+      _weight2 = c_weight[c_id2] * _denom;
+    }
+
+    for (cs_lnum_t i = 0; i < 3; i++) {
+      pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc;
+      for (cs_lnum_t j = 0; j < 3; j++) {
+        fctb = dc[j] * pfac;
+        /* Both targets may be updated concurrently by other threads */
+        atomicAdd(&rhs[c_id][i][j], _weight2 * fctb);
+        atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb);
+      }
+    }
+  }
+}
\ No newline at end of file
From b2472e1e51129f16b60aae0090c307ab28e33062 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Thu, 2 Nov 2023 17:39:36 +0100 Subject: [PATCH 12/70] reconstruct_vector_gradient --- src/alge/cs_gradient.cxx | 235 +++++++++-------- src/alge/cs_gradient_cuda.cu | 492 +++++++++++++++++++++++++++++++---- src/alge/cs_gradient_priv.h | 15 ++ 3 files changed, 587 insertions(+), 155 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e57267ebad..e2988b74ff 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5686,156 +5686,175 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, coupled_faces = (const bool *)cpl->coupled_faces; } - /* Initialize gradient */ - /*---------------------*/ - /* Initialization */ -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] = 0.0; + +#if defined(HAVE_CUDA) + cs_reconstruct_vector_gradient_cuda(m, + fvq, + cpl, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + r_grad, + grad, + coupled_faces, + cpl_stride); +#else + /* Initialize gradient */ + /*---------------------*/ + + /* Initialization */ + + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] = 0.0; + } } - } - /* Interior faces contribution */ + /* Interior faces contribution */ - for (int g_id = 0; g_id < n_i_groups; g_id++) { + for (int g_id = 0; g_id < n_i_groups; g_id++) { -# pragma omp parallel
for - for (int t_id = 0; t_id < n_i_threads; t_id++) { + # pragma omp parallel for + for (int t_id = 0; t_id < n_i_threads; t_id++) { - for (cs_lnum_t f_id = i_group_index[(t_id*n_i_groups + g_id)*2]; - f_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; - f_id++) { + for (cs_lnum_t f_id = i_group_index[(t_id*n_i_groups + g_id)*2]; + f_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; + f_id++) { - cs_lnum_t c_id1 = i_face_cells[f_id][0]; - cs_lnum_t c_id2 = i_face_cells[f_id][1]; + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; - cs_real_t pond = weight[f_id]; + cs_real_t pond = weight[f_id]; - cs_real_t ktpond = (c_weight == NULL) ? - pond : // no cell weighting - pond * c_weight[c_id1] // cell weighting active - / ( pond * c_weight[c_id1] - + (1.0-pond)* c_weight[c_id2]); + cs_real_t ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); - /* - Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli - + (1-\alpha_\ij) \varia_\cellj\f$ - but for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - and for the cell \f$ \cellj \f$ we remove - \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + /* + Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + + (1-\alpha_\ij) \varia_\cellj\f$ + but for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + and for the cell \f$ \cellj \f$ we remove + \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ + */ - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - cs_real_t pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + cs_real_t pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + cs_real_t pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - /* Reconstruction 
part */ - cs_real_t rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); + /* Reconstruction part */ + cs_real_t rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + } - for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; } - } + } /* End of loop on faces */ - } /* End of loop on faces */ + } /* End of loop on threads */ - } /* End of loop on threads */ + } /* End of loop on thread groups */ - } /* End of loop on thread groups */ + /* Contribution from coupled faces */ + if (cpl != NULL) { + cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + } - /* Contribution from coupled faces */ - if (cpl != NULL) { - cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); - cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); - } + /* Boundary face treatment */ - /* Boundary face treatment */ + # pragma omp parallel for + for (int t_id = 0; t_id < n_b_threads; t_id++) { -# pragma omp parallel for - for (int t_id = 0; t_id < n_b_threads; t_id++) { + for (cs_lnum_t f_id = b_group_index[t_id*2]; + f_id < b_group_index[t_id*2 + 1]; + f_id++) { - for (cs_lnum_t f_id = b_group_index[t_id*2]; - f_id < b_group_index[t_id*2 + 1]; - f_id++) { + if (coupled_faces[f_id * cpl_stride]) + continue; - if (coupled_faces[f_id * cpl_stride]) - continue; + cs_lnum_t c_id 
= b_face_cells[f_id]; - cs_lnum_t c_id = b_face_cells[f_id]; + /* + Remark: for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + */ - /* - Remark: for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = inc*coefav[f_id][i]; - cs_real_t pfac = inc*coefav[f_id][i]; + for (cs_lnum_t k = 0; k < 3; k++) + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; - for (cs_lnum_t k = 0; k < 3; k++) - pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + pfac -= pvar[c_id][i]; - pfac -= pvar[c_id][i]; + /* Reconstruction part */ + cs_real_t rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } - /* Reconstruction part */ - cs_real_t rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { - cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; - rfac += coefbv[f_id][i][k] * vecfac; - } + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + } - } + } /* loop on faces */ - } /* loop on faces */ + } /* loop on threads */ - } /* loop on threads */ + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; -# pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - cs_real_t dvol; - /* Is the cell disabled (for solid or porous)? 
Not the case if coupled */ - if (has_dc * c_disable_flag[has_dc * c_id] == 0) - dvol = 1. / cell_f_vol[c_id]; - else - dvol = 0.; + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] *= dvol; + } - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] *= dvol; - } + if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { + cs_real_t gradpa[3]; + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_id][i][j]; + grad[c_id][i][j] = 0.; + } - if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { - cs_real_t gradpa[3]; - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - gradpa[j] = grad[c_id][i][j]; - grad[c_id][i][j] = 0.; + for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t k = 0; k < 3; k++) + grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; } - - for (cs_lnum_t j = 0; j < 3; j++) - for (cs_lnum_t k = 0; k < 3; k++) - grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; } } - } +#endif /* Periodicity and parallelism treatment */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index f14fc14127..22e882f442 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -72,6 +72,8 @@ #include "cs_timer.h" #include "cs_timer_stats.h" +#include "cs_internal_coupling.h" + /*---------------------------------------------------------------------------- * Header for the current file *----------------------------------------------------------------------------*/ @@ -98,7 +100,7 @@ __device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], cs_real_t out[3]) { - cs_real_t norm = sqrt(in[0]*in[0] + cs_real_t norm = sqrt(in[0]*in[0] + in[1]*in[1] + in[2]*in[2]); @@ -446,21 +448,24 @@ _init_rhsv(cs_lnum_t size, } + /*---------------------------------------------------------------------------- - * Initialize RHS with null values + * Initialize a 
sizex3x3 array with null values *----------------------------------------------------------------------------*/ -__global__ static void -_init_rhs(cs_lnum_t size, - cs_real_33_t *restrict rhs) -{ - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) { - for (cs_lnum_t i = 0; i < 3; i++) - for (cs_lnum_t j = 0; j < 3; j++) - rhs[c_id][i][j] = 0.0; - } -} + __global__ static void + _init_real_33_array(cs_lnum_t size, + cs_real_33_t *restrict array) + { + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_id < size) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + array[c_id][i][j] = 0.0; + } + } + __global__ static void _compute_rhs_lsq_v_i_face(cs_lnum_t size, @@ -487,7 +492,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - + if (c_weight == NULL){ _weight1 = 1.; _weight2 = 1.; @@ -499,7 +504,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, _weight1 = c_weight[c_id1] * _denom; _weight2 = c_weight[c_id2] * _denom; } - + for(cs_lnum_t i = 0; i < 3; i++){ pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ @@ -535,7 +540,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - + if (c_weight == NULL){ _weight1 = 1.; _weight2 = 1.; @@ -547,7 +552,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, _weight1 = c_weight[c_id1] * _denom; _weight2 = c_weight[c_id2] * _denom; } - + for(cs_lnum_t i = 0; i < 3; i++){ pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ @@ -651,7 +656,7 @@ _compute_gradient_lsq_v(cs_lnum_t size, cs_cocg_6_t *cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= size) return; for(cs_lnum_t i = 0; i < 3; i++){ @@ -717,7 +722,7 @@ _compute_gradient_lsq_b_v(cs_lnum_t size, f_id = 
cell_b_faces[index]; /* Normal is vector 0 if the b_face_normal norm is too small */ - norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + norm = sqrt(b_face_normal[index][0]*b_face_normal[index][0] + b_face_normal[index][1]*b_face_normal[index][1] + b_face_normal[index][2]*b_face_normal[index][2]); @@ -1254,9 +1259,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_cuda_copy_h2d(cocg_d, cocg, n_cells_ext * sizeof(cs_cocg_6_t)); - void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, *_cell_cells_idx_d = NULL; - const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; + const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; const cs_real_33_t *coefb_d = NULL; const cs_lnum_t *cell_cells_idx_d = NULL; @@ -1308,22 +1313,22 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - _init_rhs<<>> + _init_real_33_array<<>> (n_cells_ext, rhs_d); CS_CUDA_CHECK(cudaEventRecord(init, stream)); - + bool status = false; cs_lnum_t count_nan = 0, count_inf = 0; - + _compute_rhs_lsq_v_i_face<<>> (n_i_faces, - i_face_cells, - cell_f_cen, - rhs_d, - pvar_d, - weight, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, c_weight); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1331,30 +1336,30 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ _compute_rhs_lsq_v_b_neighbor<<>> - (n_cells, - cell_cells_idx, - cell_cells_lst, - cell_f_cen, - rhs_d, + (n_cells, + cell_cells_idx, + cell_cells_lst, + cell_f_cen, + rhs_d, pvar_d); } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); _compute_rhs_lsq_v_b_face<<>> (m->n_b_faces, - b_face_cells, - cell_f_cen, - b_face_normal, - rhs_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, + b_face_cells, + cell_f_cen, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, inc); CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - + // if (rhs_d != NULL) { // size_t 
size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; // cs_cuda_copy_d2h(rhs, rhs_d, size); @@ -1367,8 +1372,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, _compute_gradient_lsq_v<<>> (n_cells, - grad_d, - rhs_d, + grad_d, + rhs_d, cocg_d); CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); @@ -1411,7 +1416,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); printf("Total kernel = %f\t", msec*1000.f); - + msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); printf("Total = %f\t", msec*1000.f); @@ -1428,7 +1433,400 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaFree(rhs_d)); CS_CUDA_CHECK(cudaFree(cocg_d)); CS_CUDA_CHECK(cudaFree(grad_d)); - + } // cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, )
+
+
+
+
+/*----------------------------------------------------------------------------
+ * Interior face contribution to the reconstructed vector gradient.
+ *
+ * One thread handles one face f_id < size and adds its contribution to the
+ * gradient of the two adjacent cells. Several faces reference the same cell,
+ * so the updates are done with atomicAdd; plain "+=" / "-=" from concurrent
+ * threads would be a data race (the host loop avoids it through its face
+ * groups, which this kernel does not use).
+ *
+ * parameters:
+ *   size            <-- number of interior faces to process
+ *   i_group_index   <-- not read by this kernel
+ *   i_face_cells    <-- interior face -> cells connectivity
+ *   pvar            <-- variable
+ *   weight          <-- interior face weighting
+ *   c_weight        <-- cell weighting, or NULL
+ *   r_grad          <-- gradient used for reconstruction
+ *   grad            <-> gradient, accumulated in place
+ *   dofij           <-- per-face vector used in the reconstruction term
+ *   i_f_face_normal <-- interior face normals
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_reconstruct_v_i_face(cs_lnum_t            size,
+                              const cs_lnum_t      *i_group_index,
+                              const cs_lnum_2_t    *i_face_cells,
+                              const cs_real_3_t    *pvar,
+                              const cs_real_t      *weight,
+                              const cs_real_t      *c_weight,
+                              const cs_real_33_t   *restrict r_grad,
+                              cs_real_33_t         *restrict grad,
+                              cs_real_3_t          *restrict dofij,
+                              cs_real_3_t          *restrict i_f_face_normal)
+{
+  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (f_id >= size)
+    return;
+
+  const cs_lnum_t c_id1 = i_face_cells[f_id][0];
+  const cs_lnum_t c_id2 = i_face_cells[f_id][1];
+
+  const cs_real_t pond = weight[f_id];
+  const cs_real_t ktpond = (c_weight == NULL) ?
+        pond :                     // no cell weighting
+        pond * c_weight[c_id1]     // cell weighting active
+          / (      pond * c_weight[c_id1]
+            + (1.0-pond)* c_weight[c_id2]);
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+    const cs_real_t pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]);
+    const cs_real_t pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]);
+
+    /* Reconstruction part */
+    const cs_real_t rfac = 0.5 * (  dofij[f_id][0]*(  r_grad[c_id1][i][0]
+                                                    + r_grad[c_id2][i][0])
+                                  + dofij[f_id][1]*(  r_grad[c_id1][i][1]
+                                                    + r_grad[c_id2][i][1])
+                                  + dofij[f_id][2]*(  r_grad[c_id1][i][2]
+                                                    + r_grad[c_id2][i][2]));
+
+    for (cs_lnum_t j = 0; j < 3; j++) {
+      atomicAdd(&grad[c_id1][i][j],
+                (pfaci + rfac) * i_f_face_normal[f_id][j]);
+      atomicAdd(&grad[c_id2][i][j],
+                -(pfacj + rfac) * i_f_face_normal[f_id][j]);
+    }
+  }
+}
+
+
+/*----------------------------------------------------------------------------
+ * Boundary face contribution to the reconstructed vector gradient.
+ *
+ * One thread handles one boundary face f_id < size and adds its contribution
+ * to the gradient of the adjacent cell. A cell can own several boundary
+ * faces, so the update uses atomicAdd instead of a plain "+=".
+ *
+ * NOTE(review): the host version skips faces for which
+ * coupled_faces[f_id * cpl_stride] is set. That test is left disabled here:
+ * the launch in this patch passes the host coupled_faces pointer, which must
+ * not be dereferenced on the device. Re-enable it once a device copy of the
+ * full array is passed.
+ *
+ * parameters:
+ *   size            <-- number of boundary faces to process
+ *   b_group_index   <-- not read by this kernel
+ *   coupled_faces   <-- not read by this kernel (see note above)
+ *   cpl_stride      <-- not read by this kernel (see note above)
+ *   coefbv          <-- B.C. coefficients
+ *   coefav          <-- B.C. coefficients
+ *   pvar            <-- variable
+ *   inc             <-- if 0, solve on increment; 1 otherwise
+ *   diipb           <-- per-face vector used in the reconstruction term
+ *   r_grad          <-- gradient used for reconstruction
+ *   grad            <-> gradient, accumulated in place
+ *   b_f_face_normal <-- boundary face normals
+ *   b_face_cells    <-- boundary face -> cell connectivity
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_reconstruct_v_b_face1(cs_lnum_t            size,
+                               const cs_lnum_t      *restrict b_group_index,
+                               const bool           *coupled_faces,
+                               cs_lnum_t             cpl_stride,
+                               const cs_real_33_t   *restrict coefbv,
+                               const cs_real_3_t    *restrict coefav,
+                               const cs_real_3_t    *restrict pvar,
+                               int                   inc,
+                               const cs_real_3_t    *restrict diipb,
+                               const cs_real_33_t   *restrict r_grad,
+                               cs_real_33_t         *restrict grad,
+                               const cs_real_3_t    *restrict b_f_face_normal,
+                               const cs_lnum_t      *restrict b_face_cells)
+{
+  cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (f_id >= size)
+    return;
+
+  // if (coupled_faces[f_id * cpl_stride])
+  //   return;
+
+  const cs_lnum_t c_id = b_face_cells[f_id];
+
+  for (cs_lnum_t i = 0; i < 3; i++) {
+
+    cs_real_t pfac = inc*coefav[f_id][i];
+
+    for (cs_lnum_t k = 0; k < 3; k++)
+      pfac += coefbv[f_id][i][k] * pvar[c_id][k];
+
+    pfac -= pvar[c_id][i];
+
+    /* Reconstruction part */
+    cs_real_t rfac = 0.;
+    for (cs_lnum_t k = 0; k < 3; k++) {
+      const cs_real_t vecfac =   r_grad[c_id][k][0] * diipb[f_id][0]
+                               + r_grad[c_id][k][1] * diipb[f_id][1]
+                               + r_grad[c_id][k][2] * diipb[f_id][2];
+      rfac += coefbv[f_id][i][k] * vecfac;
+    }
+
+    for (cs_lnum_t j = 0; j < 3; j++)
+      atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]);
+
+  }
+}
+
+
+
+/*----------------------------------------------------------------------------
+ * Per-cell finalization step of the reconstructed vector gradient.
+ *
+ * NOTE(review): as committed, this kernel is a placeholder. The intended
+ * per-cell processing (division by the cell volume with the disable flag,
+ * then the optional corr_grad_lin correction) is commented out below, and the
+ * only active statement adds 1 to grad[c_id][0][0] for every c_id < size.
+ * That does not match the host implementation and looks like a debugging
+ * leftover - confirm and replace before relying on this path. The
+ * correction additionally depends on cs_glob_mesh_quantities_flag, which is
+ * not passed to the kernel.
+ *
+ * parameters:
+ *   size            <-- number of cells to process
+ *   has_dc          <-- currently unused
+ *   c_disable_flag  <-- currently unused
+ *   cell_f_vol      <-- currently unused
+ *   grad            <-> gradient
+ *   corr_grad_lin   <-- currently unused
+ *----------------------------------------------------------------------------*/
+
+__global__ static void
+_compute_reconstruct_v_b_face2(cs_lnum_t            size,
+                               cs_lnum_t             has_dc,
+                               const int            *restrict c_disable_flag,
+                               const cs_real_t      *restrict cell_f_vol,
+                               cs_real_33_t         *restrict grad,
+                               const cs_real_33_t   *restrict corr_grad_lin
+                               )
+{
+  cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_id >= size)
+    return;
+
+  /* Is the cell disabled (for solid or porous)? Not the case if coupled */
+  // cs_real_t dvol;
+  // if (has_dc * c_disable_flag[has_dc * c_id] == 0)
+  //   dvol = 1. / cell_f_vol[c_id];
+  // else
+  //   dvol = 0.;
+
+  // for (cs_lnum_t i = 0; i < 3; i++) {
+  //   for (cs_lnum_t j = 0; j < 3; j++)
+  //     grad[c_id][i][j] *= dvol;
+  // }
+
+  // // if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) {
+  //   cs_real_t gradpa[3];
+  //   for (cs_lnum_t i = 0; i < 3; i++) {
+  //     for (cs_lnum_t j = 0; j < 3; j++) {
+  //       gradpa[j] = grad[c_id][i][j];
+  //       grad[c_id][i][j] = 0.;
+  //     }
+
+  //     for (cs_lnum_t j = 0; j < 3; j++)
+  //       for (cs_lnum_t k = 0; k < 3; k++)
+  //         grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k];
+  //   }
+  // }
+
+  /* Placeholder kept from the original commit (see note above) */
+  grad[c_id][0][0] += 1.;
+}
+
+/*----------------------------------------------------------------------------
+ * Reconstruct the gradient of a vector using a given gradient of
+ * this vector (typically lsq).
+ *
+ * parameters:
+ *   m              <-- pointer to associated mesh structure
+ *   fvq            <-- pointer to associated finite volume quantities
+ *   cpl            <-- structure associated with internal coupling, or NULL
+ *   inc            <-- if 0, solve on increment; 1 otherwise
+ *   coefav         <-- B.C. coefficients for boundary face normals
+ *   coefbv         <-- B.C.
coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_quantities_t *fvq, + const cs_internal_coupling_t *cpl, + cs_halo_type_t halo_type, + int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const bool *coupled_faces, + cs_lnum_t cpl_stride + ) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + int device_id; + cudaGetDevice(&device_id); + + + cudaStream_t stream, stream1; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream); + + cs_real_33_t *grad_d; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); + + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL, *_r_grad_d = NULL; + const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; + const cs_real_33_t *coefb_d = NULL, *r_grad_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + bool *coupled_faces_d; + CS_CUDA_CHECK(cudaMalloc(&coupled_faces_d, sizeof(bool))); + cs_cuda_copy_h2d(coupled_faces_d, coupled_faces, sizeof(bool)); + + + unsigned int blocksize = 256; + unsigned int gridsize_b + = (unsigned int)ceil((double)m->n_b_cells / blocksize); + unsigned int gridsize_if + = (unsigned int)ceil((double)m->n_i_faces / blocksize); + unsigned int gridsize_bf + = (unsigned int)ceil((double)m->n_b_faces / blocksize); + unsigned int gridsize + = (unsigned int)ceil((double)m->n_cells / blocksize); + unsigned int 
gridsize_ext + = (unsigned int)ceil((double)n_cells_ext / blocksize); + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict cell_cells_idx; + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst; + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const int n_i_groups + = m->i_face_numbering->n_groups; + const int n_i_threads + = m->i_face_numbering->n_threads; + cs_lnum_t *restrict i_group_index; + // printf("m->i_face_numbering->group_index = ", m->i_face_numbering->group_index); + CS_CUDA_CHECK(cudaMalloc(&i_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(i_group_index, (void *)m->i_face_numbering->group_index, sizeof(int)*n_i_groups * n_i_threads * 2); + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_numbering->group_index); + + const int n_b_groups + = m->b_face_numbering->n_groups; + const int n_b_threads + = m->b_face_numbering->n_threads; + + cs_lnum_t *restrict b_group_index; + CS_CUDA_CHECK(cudaMalloc(&b_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); + cs_cuda_copy_h2d(b_group_index, (void *)m->b_face_numbering->group_index, sizeof(int)*n_b_groups * n_b_threads * 2); + + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_numbering->group_index); + + const cs_real_3_t *restrict cell_cen; + // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_lnum_t *restrict cell_vol; + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_vol); + cs_real_t *restrict cell_f_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells_ext * sizeof(cs_real_t))); + cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells_ext); 
+ // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_vol); + if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) + cell_f_vol = fvq->cell_vol; + const cs_lnum_3_t *restrict cell_f_cen; + // = (const cs_lnum_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist; + // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal; + // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + cs_real_3_t *restrict i_f_face_normal; + // printf("fvq->i_f_face_normal = ", fvq->i_f_face_normal); + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->i_f_face_normal); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + cs_real_3_t *restrict dofij; + // printf("fvq->dofij = ", fvq->dofij); + CS_CUDA_CHECK(cudaMalloc(&dofij, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(dofij, (void *)fvq->dofij, sizeof(cs_real_3_t)*n_i_faces); + // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->dofij); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + cs_real_33_t *restrict corr_grad_lin; + CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells_ext * sizeof(cs_real_33_t))); + cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells_ext); + // = (const cs_real_33_t *restrict)cs_get_device_ptr_const_pf(fvq->corr_grad_lin); + const cs_lnum_t has_dc + = fvq->has_disable_flag; + int *restrict c_disable_flag; + CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells_ext * 
sizeof(int))); + cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells_ext); + // = (const int *restrict)cs_get_device_ptr_const_pf(fvq->c_disable_flag); + + + _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream1, + &r_grad_d, &_r_grad_d); + + _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream1, + &coefa_d, &_coefa_d); + _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, + &coefb_d, &_coefb_d); + + // ----------------------------Begin of Kernels part 1------------------------------------------- + + /* Initialization */ + _init_real_33_array<<>> + (n_cells_ext, grad_d); + + /* Interior faces contribution */ + _compute_reconstruct_v_i_face<<>> + (n_i_faces, + i_group_index, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); + // cudaDeviceSynchronize(); + // ----------------------------End of Kernels part 1------------------------------------------- + + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grad, grad_d, size); + } + else + cs_sync_d2h(grad); + + /* Contribution from coupled faces */ + if (cpl != NULL) { + cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + } + + cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); + + // ----------------------------Begin of Kernels part 2------------------------------------------- + _compute_reconstruct_v_b_face1<<>> + ( n_b_faces, + b_group_index, + coupled_faces, + cpl_stride, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells); + + _compute_reconstruct_v_b_face2<<>> + ( n_cells, + has_dc, + c_disable_flag, + cell_f_vol, + grad_d, + corr_grad_lin + ); + + // ----------------------------End of Kernels part 
2------------------------------------------- + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grad, grad_d, size); + } + else + cs_sync_d2h(grad); + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(grad_d)); +} \ No newline at end of file diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index fe61f924ec..9777d83546 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -123,6 +123,21 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs); +void +cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_quantities_t *fvq, + const cs_internal_coupling_t *cpl, + cs_halo_type_t halo_type, + int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const bool *coupled_faces, + cs_lnum_t cpl_stride + ); #endif /* defined(HAVE_CUDA) */ /*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ From bc6b30e142ee4d2d967551ae4d45c7e884492800 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 2 Nov 2023 17:48:06 +0100 Subject: [PATCH 13/70] Interior and boundary faces gather --- src/alge/cs_gradient.cxx | 2 +- src/alge/cs_gradient_cuda.cu | 60 ++++++++++++---------- src/alge/cs_gradient_lsq_vector_gather.cuh | 56 +++++++++++++++++++- 3 files changed, 88 insertions(+), 30 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e57267ebad..0fc560d900 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -7088,7 +7088,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, auto cuda = gradv_cuda[c_id][i][j]; if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + // printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); } } } diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 00a1bf17f9..f5df012773 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -927,7 +927,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t *restrict b_face_cells = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); const cs_lnum_t *restrict cell_cells_idx - = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_idx); + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); const cs_lnum_t *restrict cell_cells_lst = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); const int n_i_groups = m->i_face_numbering->n_groups; @@ -1000,14 +1000,14 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_v2<<>> - (n_i_faces, - i_face_cells_1d, - cell_f_cen_1d, - rhs_test_d, - pvar_d, - weight, - c_weight); + // _compute_rhs_lsq_v_i_face_v2<<>> + // 
(n_i_faces, + // i_face_cells_1d, + // cell_f_cen_1d, + // rhs_test_d, + // pvar_d, + // weight, + // c_weight); // _compute_rhs_lsq_v_i_face_v3<<>> // (n_i_faces*3*3, @@ -1018,10 +1018,16 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); + assert(cell_cells_idx); + assert(cell_cells); + assert(cell_f_cen); + assert(rhs_d); + assert(pvar_d); + assert(weight); _compute_rhs_lsq_v_i_face_gather<<>> (n_cells, cell_cells_idx, - cell_cells_lst, + cell_cells, cell_f_cen, rhs_d, pvar_d, @@ -1035,7 +1041,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, _compute_rhs_lsq_v_b_neighbor<<>> (n_cells, cell_cells_idx, - cell_cells_lst, + cell_cells, cell_f_cen, rhs_d, pvar_d); @@ -1054,17 +1060,17 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, coefa_d, inc); - _compute_rhs_lsq_v_b_face_v2<<>> - (m->n_b_faces, - b_face_cells, - cell_f_cen, - b_face_normal, - rhs_test_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, - inc); + // _compute_rhs_lsq_v_b_face_v2<<>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_test_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); @@ -1092,11 +1098,11 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // cocg_d); - _compute_gradient_lsq_v_v5<<>> - (n_cells*3*3, - gradv_test_d, - rhs_test_d, - cocg_d); + // _compute_gradient_lsq_v_v5<<>> + // (n_cells*3*3, + // gradv_test_d, + // rhs_test_d, + // cocg_d); _compute_gradient_lsq_v_v6<<>> (n_cells*3*3, diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index 90caa30d55..fd511fdfd4 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -27,7 +27,7 @@ __global__ static void _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, const cs_lnum_t *restrict cell_cells_idx, - const cs_lnum_t *restrict cell_cells_lst, + const cs_lnum_t *restrict cell_cells, const cs_real_3_t *restrict 
cell_f_cen, cs_real_33_t *restrict rhs, const cs_real_3_t *restrict pvar, @@ -46,7 +46,7 @@ _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, cs_lnum_t e_id = cell_cells_idx[c_id + 1]; for(cs_lnum_t index = s_id; index < e_id; index++){ - c_id2 = cell_cells_lst[index]; + c_id2 = cell_cells[index]; dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id][0]; dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id][1]; @@ -75,4 +75,56 @@ _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, } } } +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict cell_f_cen, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id >= size){ + return; + } + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalise_cuda(b_face_normal[index], n_d_dist); + + d_b_dist = 1. 
/ b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + rhs[c_id][i][0] += n_d_dist[0] * pfac; + rhs[c_id][i][1] += n_d_dist[1] * pfac; + rhs[c_id][i][2] += n_d_dist[2] * pfac; + } + } } \ No newline at end of file From b4924a3a63d45a41bec05bf66627c02dece10200 Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Fri, 3 Nov 2023 10:50:05 +0100 Subject: [PATCH 14/70] Test performance --- src/alge/cs_gradient.cxx | 2 +- src/alge/cs_gradient_cuda.cu | 27 ++++++++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 0fc560d900..e57267ebad 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -7088,7 +7088,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, auto cuda = gradv_cuda[c_id][i][j]; if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { - // printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); } } } diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index f5df012773..87f43c6a45 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -930,6 +930,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); const cs_lnum_t *restrict cell_cells_lst = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = 
(const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; @@ -979,8 +983,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(init, stream)); - bool status = false; - cs_lnum_t count_nan = 0, count_inf = 0; + // bool status = false; + // cs_lnum_t count_nan = 0, count_inf = 0; // _compute_rhs_lsq_v_i_face_v0<<>> // (n_i_faces, @@ -1048,9 +1052,22 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - _compute_rhs_lsq_v_b_face<<>> - (m->n_b_faces, - b_face_cells, + // _compute_rhs_lsq_v_b_face<<>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + _compute_rhs_lsq_v_b_face_gather<<>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, cell_f_cen, b_face_normal, rhs_d, From 8e6dee335989a86ab3c74c862d2c49a3b4557ffb Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Fri, 3 Nov 2023 11:45:13 +0100 Subject: [PATCH 15/70] ADD chronometrie and accuracy --- src/alge/cs_gradient.cxx | 48 +++++-- src/alge/cs_gradient_cuda.cu | 248 ++++++++++++++++++++++++----------- src/alge/cs_gradient_priv.h | 3 +- 3 files changed, 213 insertions(+), 86 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e2988b74ff..74c84d1554 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5687,9 +5687,18 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } + /* Timing the computation */ + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; -#if defined(HAVE_CUDA) + +cs_real_33_t *grad_cuda; + +BFT_MALLOC(grad_cuda, n_cells_ext, cs_real_33_t); + +// #if defined(HAVE_CUDA) + start = 
std::chrono::high_resolution_clock::now(); cs_reconstruct_vector_gradient_cuda(m, fvq, cpl, @@ -5700,15 +5709,19 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, pvar, c_weight, r_grad, - grad, + grad_cuda, coupled_faces, - cpl_stride); -#else + cpl_stride, + cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION); + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + +// #else /* Initialize gradient */ /*---------------------*/ /* Initialization */ - + start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { @@ -5854,7 +5867,24 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } } } -#endif + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + +// #endif + printf("rec Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = grad[c_id][i][j]; + auto cuda = grad_cuda[c_id][i][j]; + + if ((fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6)) > 1e-8) { + printf("rec DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + } + } + } + } /* Periodicity and parallelism treatment */ @@ -5863,6 +5893,8 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, if (cs_glob_mesh->have_rotation_perio) cs_halo_perio_sync_var_tens(m->halo, halo_type, (cs_real_t *)grad); } + + BFT_FREE(grad_cuda); } /*---------------------------------------------------------------------------- @@ -7106,8 +7138,8 @@ _lsq_vector_gradient(const cs_mesh_t *m, auto cpu = gradv[c_id][i][j]; auto cuda = gradv_cuda[c_id][i][j]; - if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, 
j, cpu, cuda, cpu - cuda); + if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-8) { + printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); } } } diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 22e882f442..119a4690cb 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1392,36 +1392,37 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); + // printf("lsq Kernels :"); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + // printf("Kernels execution time in us: \t"); + // printf("Init = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); - printf("I_faces = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + // printf("I_faces = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); - printf("Halo = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + // printf("Halo = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); - printf("B_faces = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + // printf("B_faces = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); - printf("Gradient = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + // printf("Gradient = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); - printf("Total kernel = %f\t", msec*1000.f); + // msec = 
0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + // printf("Total kernel = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); - printf("Total = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + // printf("Total = %f\t", msec*1000.f); - printf("\n"); + // printf("\n"); if (_pvar_d != NULL) CS_CUDA_CHECK(cudaFree(_pvar_d)); @@ -1484,8 +1485,8 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, + r_grad[c_id2][i][2])); for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); } } @@ -1515,8 +1516,8 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, cs_lnum_t c_id; cs_real_t pond, ktpond, pfac, rfac, vecfac; - // if (coupled_faces[f_id * cpl_stride]) - // return; + if (coupled_faces[f_id * cpl_stride]) + return; c_id = b_face_cells[f_id]; @@ -1539,7 +1540,7 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, } for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); } } @@ -1552,7 +1553,8 @@ _compute_reconstruct_v_b_face2(cs_lnum_t size, const int *restrict c_disable_flag, const cs_real_t *restrict cell_f_vol, cs_real_33_t *restrict grad, - const cs_real_33_t *restrict corr_grad_lin + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool ) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -1562,33 +1564,32 @@ _compute_reconstruct_v_b_face2(cs_lnum_t size, } cs_real_t dvol; /* Is the cell disabled (for solid or porous)? Not the case if coupled */ - // if (has_dc * c_disable_flag[has_dc * c_id] == 0) - // dvol = 1. 
/ cell_f_vol[c_id]; - // else - // dvol = 0.; + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. / cell_f_vol[c_id]; + else + dvol = 0.; - // for (cs_lnum_t i = 0; i < 3; i++) { - // for (cs_lnum_t j = 0; j < 3; j++) - // grad[c_id][i][j] *= dvol; - // } + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] *= dvol; + } - // // if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { - // cs_real_t gradpa[3]; - // for (cs_lnum_t i = 0; i < 3; i++) { - // for (cs_lnum_t j = 0; j < 3; j++) { - // gradpa[j] = grad[c_id][i][j]; - // grad[c_id][i][j] = 0.; - // } + if (test_bool) { + cs_real_t gradpa[3]; + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_id][i][j]; + grad[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t k = 0; k < 3; k++) + grad[c_id][i][j]+= corr_grad_lin[c_id][j][k] * gradpa[k]; + } + } - // for (cs_lnum_t j = 0; j < 3; j++) - // for (cs_lnum_t k = 0; k < 3; k++) - // grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; - // } - // } - - grad[c_id][0][0] += 1.; } /*---------------------------------------------------------------------------- @@ -1620,7 +1621,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, const bool *coupled_faces, - cs_lnum_t cpl_stride + cs_lnum_t cpl_stride, + bool test_bool ) { const cs_lnum_t n_cells = m->n_cells; @@ -1632,9 +1634,23 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaGetDevice(&device_id); - cudaStream_t stream, stream1; - cudaStreamCreate(&stream1); + cudaStream_t stream; cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, b_faces_1, b_faces_2, b_faces_3, stop; + float msec = 0.0f, msec_tot; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + 
CS_CUDA_CHECK(cudaEventCreate(&b_faces_1)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_2)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces_3)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); cs_real_33_t *grad_d; CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); @@ -1736,23 +1752,27 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // = (const int *restrict)cs_get_device_ptr_const_pf(fvq->c_disable_flag); - _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream1, + _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream1, + _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, &r_grad_d, &_r_grad_d); - _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream1, + _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream1, + _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); // ----------------------------Begin of Kernels part 1------------------------------------------- + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + /* Initialization */ _init_real_33_array<<>> (n_cells_ext, grad_d); + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + /* Interior faces contribution */ _compute_reconstruct_v_i_face<<>> (n_i_faces, @@ -1765,7 +1785,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, grad_d, dofij, i_f_face_normal); - // cudaDeviceSynchronize(); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + // ----------------------------End of Kernels part 1------------------------------------------- if (grad_d != NULL) { @@ -1774,6 +1796,10 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, } else cs_sync_d2h(grad); + + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(r_grad, r_grad_d, size); + 
/* Contribution from coupled faces */ if (cpl != NULL) { @@ -1782,12 +1808,17 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, } cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); + + _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, + &r_grad_d, &_r_grad_d); + + CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- _compute_reconstruct_v_b_face1<<>> ( n_b_faces, b_group_index, - coupled_faces, + coupled_faces_d, cpl_stride, coefb_d, coefa_d, @@ -1799,34 +1830,97 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, b_f_face_normal, b_face_cells); + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); + _compute_reconstruct_v_b_face2<<>> ( n_cells, has_dc, c_disable_flag, cell_f_vol, grad_d, - corr_grad_lin + corr_grad_lin, + test_bool ); + CS_CUDA_CHECK(cudaEventRecord(b_faces_3, stream)); // ----------------------------End of Kernels part 2------------------------------------------- - cudaStreamSynchronize(stream); - cudaStreamDestroy(stream); + /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grad, grad_d, size); + } + else + cs_sync_d2h(grad); + - /* Sync to host */ - if (grad_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(grad, grad_d, size); - } - else - cs_sync_d2h(grad); + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + // printf("rec Kernels :"); - if (_pvar_d != NULL) - CS_CUDA_CHECK(cudaFree(_pvar_d)); - if (_coefa_d != NULL) - CS_CUDA_CHECK(cudaFree(_coefa_d)); - if (_coefb_d != NULL) - CS_CUDA_CHECK(cudaFree(_coefb_d)); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + // printf("Kernels execution time in us: \t"); + // printf("Init = %f\t", msec*1000.f); - 
CS_CUDA_CHECK(cudaFree(grad_d)); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + // printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); + // printf("CPU part = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); + // printf("B_faces_1 = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); + // printf("B_faces_2 = %f\t", msec*1000.f); + + // printf("\n"); + + // msec_tot = 0.0f; + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); + // printf("Total kernel part 1= %f\t", msec*1000.f); + // msec_tot = msec; + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); + // printf("Total kernel part 2= %f\t", msec*1000.f); + // msec_tot += msec; + + // printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + // printf("Total = %f\t", msec*1000.f); + + // printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + if (_r_grad_d != NULL) + CS_CUDA_CHECK(cudaFree(_r_grad_d)); + + CS_CUDA_CHECK(cudaFree(coupled_faces_d)); + CS_CUDA_CHECK(cudaFree(i_group_index)); + CS_CUDA_CHECK(cudaFree(b_group_index)); + CS_CUDA_CHECK(cudaFree(cell_f_vol)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(dofij)); + CS_CUDA_CHECK(cudaFree(corr_grad_lin)); + CS_CUDA_CHECK(cudaFree(c_disable_flag)); + CS_CUDA_CHECK(cudaFree(grad_d)); } \ No newline at end of file diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 9777d83546..a02635642a 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -136,7 +136,8 @@ 
cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, const bool *coupled_faces, - cs_lnum_t cpl_stride + cs_lnum_t cpl_stride, + bool test_bool ); #endif /* defined(HAVE_CUDA) */ From f2a542c06d1e51222f0566fb741c66ffced8b0bc Mon Sep 17 00:00:00 2001 From: Daouda DIAKITE Date: Fri, 3 Nov 2023 13:23:13 +0100 Subject: [PATCH 16/70] Gather versions in progress --- src/alge/cs_gradient.cxx | 4 +- src/alge/cs_gradient_cuda.cu | 49 +++++++++++--------- src/alge/cs_gradient_lsq_vector_gather.cuh | 54 ++++++++++++---------- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e57267ebad..145b4574a2 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -7084,8 +7084,8 @@ _lsq_vector_gradient(const cs_mesh_t *m, + rhs[c_id][i][2] * cocg[c_id][2]; for (int j =0; j < 3; ++j) { - auto cpu = gradv[c_id][i][j]; - auto cuda = gradv_cuda[c_id][i][j]; + auto cpu = rhs[c_id][i][j]; + auto cuda = rhs_cuda[c_id][i][j]; if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 87f43c6a45..13e667f58b 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -934,6 +934,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); const cs_lnum_t *restrict cell_b_faces = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); const int n_i_groups = m->i_face_numbering->n_groups; const int 
n_i_threads = m->i_face_numbering->n_threads; const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; @@ -1032,6 +1036,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, (n_cells, cell_cells_idx, cell_cells, + cell_i_faces, + cell_i_faces_sgn, cell_f_cen, rhs_d, pvar_d, @@ -1052,22 +1058,9 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - // _compute_rhs_lsq_v_b_face<<>> - // (m->n_b_faces, - // b_face_cells, - // cell_f_cen, - // b_face_normal, - // rhs_d, - // pvar_d, - // b_dist, - // coefb_d, - // coefa_d, - // inc); - - _compute_rhs_lsq_v_b_face_gather<<>> - (m->n_b_cells, - cell_b_faces_idx, - cell_b_faces, + _compute_rhs_lsq_v_b_face<<>> + (m->n_b_faces, + b_face_cells, cell_f_cen, b_face_normal, rhs_d, @@ -1077,6 +1070,18 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, coefa_d, inc); + // _compute_rhs_lsq_v_b_face_gather<<>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + // _compute_rhs_lsq_v_b_face_v2<<>> // (m->n_b_faces, // b_face_cells, @@ -1092,12 +1097,12 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - // if (rhs_test_d != NULL) { - // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - // cs_cuda_copy_d2h(rhs, rhs_test_d, size); - // } - // else - // cs_sync_d2h(rhs); + if (rhs_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(rhs, rhs_d, size); + } + else + cs_sync_d2h(rhs); // /* Compute gradient */ // /*------------------*/ diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index fd511fdfd4..80258d4fb8 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -28,60 +28,64 @@ __global__ static void _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, const cs_lnum_t 
*restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, const cs_real_3_t *restrict cell_f_cen, cs_real_33_t *restrict rhs, const cs_real_3_t *restrict pvar, const cs_real_t *restrict weight, const cs_real_t *restrict c_weight) { - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= size){ + if(c_id1 >= size){ return; } - cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; - cs_lnum_t c_id1, c_id2; + cs_real_t dc[3], fctb[3], ddc, _denom, _pond, pfac; + cs_lnum_t c_id2, f_id; - cs_lnum_t s_id = cell_cells_idx[c_id]; - cs_lnum_t e_id = cell_cells_idx[c_id + 1]; + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; for(cs_lnum_t index = s_id; index < e_id; index++){ c_id2 = cell_cells[index]; - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id][2]; + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - if (c_weight == NULL){ - _weight1 = 1.; - _weight2 = 1.; - } - else{ - _pond = weight[index]; - _denom = 1. / ( _pond *c_weight[c_id] - + (1. - _pond)*c_weight[c_id2]); - _weight1 = c_weight[c_id] * _denom; - _weight2 = c_weight[c_id2] * _denom; + if (c_weight != NULL){ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + rhs[c_id1][i][j] += c_weight[c_id2] * _denom * fctb[j]; + } } - + } + else{ for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; - rhs[c_id][i][j] += _weight2 * fctb[j]; - rhs[c_id2][i][j] += _weight1 * fctb[j]; + rhs[c_id1][i][j] += fctb[j]; } } } } +} __global__ static void _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, - const cs_real_3_t *restrict cell_f_cen, const cs_real_3_t *restrict b_face_normal, cs_real_33_t *restrict rhs, const cs_real_3_t *restrict pvar, @@ -106,7 +110,7 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, f_id = cell_b_faces[index]; - cs_math_3_normalise_cuda(b_face_normal[index], n_d_dist); + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. 
/ b_dist[f_id]; From 843d0231cbded13ae1e889bc2d334d58de275525 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 6 Nov 2023 17:35:02 +0100 Subject: [PATCH 17/70] Work on gestion of cuda compiler --- src/alge/cs_gradient.cxx | 377 +++++++++++++++++++++-------------- src/alge/cs_gradient_cuda.cu | 104 ++++++---- src/alge/cs_gradient_priv.h | 2 +- 3 files changed, 292 insertions(+), 191 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 74c84d1554..458f18bde0 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5692,201 +5692,273 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, std::chrono::high_resolution_clock::time_point start, stop; std::chrono::microseconds elapsed, elapsed_cuda; - -cs_real_33_t *grad_cuda; - -BFT_MALLOC(grad_cuda, n_cells_ext, cs_real_33_t); - -// #if defined(HAVE_CUDA) - start = std::chrono::high_resolution_clock::now(); - cs_reconstruct_vector_gradient_cuda(m, - fvq, - cpl, - halo_type, - inc, - coefav, - coefbv, - pvar, - c_weight, - r_grad, - grad_cuda, - coupled_faces, - cpl_stride, - cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION); - stop = std::chrono::high_resolution_clock::now(); - elapsed_cuda = std::chrono::duration_cast(stop - start); - -// #else - /* Initialize gradient */ - /*---------------------*/ - - /* Initialization */ - start = std::chrono::high_resolution_clock::now(); - # pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] = 0.0; + bool COMPUTE_CPU = true; + bool COMPUTE_CUDA = true; + bool RES_CPU = false; + + cs_real_33_t *grad_cpu; + + if(COMPUTE_CUDA){ + printf("Compute with CUDA\n"); + start = std::chrono::high_resolution_clock::now(); + cs_reconstruct_vector_gradient_cuda(m, + fvq, + cpl, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + r_grad, + grad, + coupled_faces, + cpl_stride, + 
cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION); + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + + if(COMPUTE_CPU){ + printf("Compute with CPU\n"); + BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + + /* Initialization */ + start = std::chrono::high_resolution_clock::now(); + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] = 0.0; + } } - } - /* Interior faces contribution */ + /* Interior faces contribution */ - for (int g_id = 0; g_id < n_i_groups; g_id++) { + for (int g_id = 0; g_id < n_i_groups; g_id++) { - # pragma omp parallel for - for (int t_id = 0; t_id < n_i_threads; t_id++) { + # pragma omp parallel for + for (int t_id = 0; t_id < n_i_threads; t_id++) { - for (cs_lnum_t f_id = i_group_index[(t_id*n_i_groups + g_id)*2]; - f_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; - f_id++) { + for (cs_lnum_t f_id = i_group_index[(t_id*n_i_groups + g_id)*2]; + f_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; + f_id++) { - cs_lnum_t c_id1 = i_face_cells[f_id][0]; - cs_lnum_t c_id2 = i_face_cells[f_id][1]; + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; - cs_real_t pond = weight[f_id]; + cs_real_t pond = weight[f_id]; - cs_real_t ktpond = (c_weight == NULL) ? - pond : // no cell weighting - pond * c_weight[c_id1] // cell weighting active - / ( pond * c_weight[c_id1] - + (1.0-pond)* c_weight[c_id2]); + cs_real_t ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + // if(c_id1 == 604 && c_id2 == 605){ + // printf("CPU :\n"); + // printf("f_id = %d :\n", f_id); + // printf("c_d1 = %d - c_id2 = %d :\n", c_id1, c_id2); + // printf("weight[f_id] = %.17lg:\n", weight[f_id]); + // printf("ktpond = %.17lg:\n", ktpond); + // } - /* - Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli - + (1-\alpha_\ij) \varia_\cellj\f$ - but for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - and for the cell \f$ \cellj \f$ we remove - \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + /* + Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + + (1-\alpha_\ij) \varia_\cellj\f$ + but for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + and for the cell \f$ \cellj \f$ we remove + \f$ \varia_\cellj \sum_\face \vect{S}_\face = \vect{0} \f$ + */ - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - cs_real_t pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + cs_real_t pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + cs_real_t pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - /* Reconstruction part */ - cs_real_t rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); + /* Reconstruction part */ + cs_real_t rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); - for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - grad[c_id2][i][j] 
-= (pfacj + rfac) * i_f_face_normal[f_id][j]; + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cpu[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cpu[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + + // if(c_id1 == 604 && c_id2 == 605){ + // printf("Variables loop :\n"); + // printf("pfaci = %.17lg: :\n", pfaci); + // printf("rfac = %.17lg:\n", rfac); + // printf("j = %d - i_f_face_normal[f_id][j] = %.17lg:\n", j, i_f_face_normal[f_id][j]); + // printf("i = %d - j = %d - grad_cpu[c_id1][i][j] = %.17lg:\n",i, j, grad_cpu[c_id1][i][j]); + // if(i == 2 && j == 2){ + // printf("\n"); + // printf("\n"); + // } + // } + } } - } + } /* End of loop on faces */ - } /* End of loop on faces */ + } /* End of loop on threads */ - } /* End of loop on threads */ + } /* End of loop on thread groups */ - } /* End of loop on thread groups */ + /* Test grad */ + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < 3; i++) { + // for (int j =0; j < 3; ++j) { + // auto cpu = grad_cpu[c_id][i][j]; + // auto cuda = grad[c_id][i][j]; + // double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-30)); + // if (err > 1e-12) { + // printf("rec inte DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); + // } + // } + // } + // } + + // stop = std::chrono::high_resolution_clock::now(); + // elapsed = std::chrono::duration_cast(stop - start); + // printf("rec Compute after i_face time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); - /* Contribution from coupled faces */ - if (cpl != NULL) { - cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); - cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); - } + /* Contribution from coupled faces */ + // if (cpl != NULL) { + // cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + // 
cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + // } - /* Boundary face treatment */ + /* Boundary face treatment */ - # pragma omp parallel for - for (int t_id = 0; t_id < n_b_threads; t_id++) { + # pragma omp parallel for + for (int t_id = 0; t_id < n_b_threads; t_id++) { - for (cs_lnum_t f_id = b_group_index[t_id*2]; - f_id < b_group_index[t_id*2 + 1]; - f_id++) { + for (cs_lnum_t f_id = b_group_index[t_id*2]; + f_id < b_group_index[t_id*2 + 1]; + f_id++) { - if (coupled_faces[f_id * cpl_stride]) - continue; + if (coupled_faces[f_id * cpl_stride]) + continue; - cs_lnum_t c_id = b_face_cells[f_id]; + cs_lnum_t c_id = b_face_cells[f_id]; - /* - Remark: for the cell \f$ \celli \f$ we remove - \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ - */ + /* + Remark: for the cell \f$ \celli \f$ we remove + \f$ \varia_\celli \sum_\face \vect{S}_\face = \vect{0} \f$ + */ - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = inc*coefav[f_id][i]; + cs_real_t pfac = inc*coefav[f_id][i]; - for (cs_lnum_t k = 0; k < 3; k++) - pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + for (cs_lnum_t k = 0; k < 3; k++) + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; - pfac -= pvar[c_id][i]; + pfac -= pvar[c_id][i]; - /* Reconstruction part */ - cs_real_t rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { - cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; - rfac += coefbv[f_id][i][k] * vecfac; - } + /* Reconstruction part */ + cs_real_t rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + cs_real_t vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + for (cs_lnum_t j = 0; j < 3; j++) + grad_cpu[c_id][i][j] += (pfac + rfac) * 
b_f_face_normal[f_id][j]; - } + } - } /* loop on faces */ + } /* loop on faces */ - } /* loop on threads */ + } /* loop on threads */ - # pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - cs_real_t dvol; - /* Is the cell disabled (for solid or porous)? Not the case if coupled */ - if (has_dc * c_disable_flag[has_dc * c_id] == 0) - dvol = 1. / cell_f_vol[c_id]; - else - dvol = 0.; - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] *= dvol; - } + /* Test grad */ + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < 3; i++) { + // for (int j =0; j < 3; ++j) { + // auto cpu = grad_cpu[c_id][i][j]; + // auto cuda = grad[c_id][i][j]; + // double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-30)); + // if (err> 1e-14) { + // printf("rec bf1 DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); + // } + // } + // } + // } + + # pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. 
/ cell_f_vol[c_id]; + else + dvol = 0.; - if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { - cs_real_t gradpa[3]; for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - gradpa[j] = grad[c_id][i][j]; - grad[c_id][i][j] = 0.; - } - for (cs_lnum_t j = 0; j < 3; j++) - for (cs_lnum_t k = 0; k < 3; k++) - grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + grad_cpu[c_id][i][j] *= dvol; } - } - } - stop = std::chrono::high_resolution_clock::now(); - elapsed = std::chrono::duration_cast(stop - start); -// #endif - printf("rec Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); - - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (int j =0; j < 3; ++j) { - auto cpu = grad[c_id][i][j]; - auto cuda = grad_cuda[c_id][i][j]; + if (cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION) { + cs_real_t gradpa[3]; + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad_cpu[c_id][i][j]; + grad_cpu[c_id][i][j] = 0.; + } - if ((fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6)) > 1e-8) { - printf("rec DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t k = 0; k < 3; k++) + grad_cpu[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } } } - } + + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + + + /* Performances */ + // printf("rec Compute after b_face2 time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + + /* Test grad */ + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < 3; i++) { + // for (int j =0; j < 3; ++j) { + // auto cpu = grad_cpu[c_id][i][j]; + // auto cuda = grad[c_id][i][j]; + // double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-30)); + // if (err> 1e-12) { + // 
printf("rec bf2 DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); + // } + // } + // } + // } + + } + + if(RES_CPU){ + printf("RESULTS CPU\n"); + grad = grad_cpu; + }else{ + printf("RESULTS CUDA\n"); } - /* Periodicity and parallelism treatment */ + // Free memory + if(COMPUTE_CPU && !RES_CPU){ + BFT_FREE(grad_cpu); + } + + /* Periodicity and parallelism treatment */ if (m->halo != NULL) { cs_halo_sync_var_strided(m->halo, halo_type, (cs_real_t *)grad, 9); @@ -5894,7 +5966,6 @@ BFT_MALLOC(grad_cuda, n_cells_ext, cs_real_33_t); cs_halo_perio_sync_var_tens(m->halo, halo_type, (cs_real_t *)grad); } - BFT_FREE(grad_cuda); } /*---------------------------------------------------------------------------- @@ -7138,8 +7209,8 @@ _lsq_vector_gradient(const cs_mesh_t *m, auto cpu = gradv[c_id][i][j]; auto cuda = gradv_cuda[c_id][i][j]; - if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-8) { - printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { + // printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); } } } @@ -7149,7 +7220,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, #endif stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); -printf("Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); +// printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); /* Compute gradient on boundary cells */ /*------------------------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 119a4690cb..37a6c6f2aa 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1451,11 +1451,15 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, const 
cs_real_t *c_weight, const cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, - cs_real_3_t *restrict dofij, - cs_real_3_t *restrict i_f_face_normal) + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + // for(int f_id = blockIdx.x * blockDim.x + threadIdx.x; f_id < size; f_id += blockDim.x * gridDim.x){ + + + if(f_id >= size){ return; } @@ -1472,6 +1476,7 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); + for (cs_lnum_t i = 0; i < 3; i++) { pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); @@ -1485,11 +1490,31 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, + r_grad[c_id2][i][2])); for (cs_lnum_t j = 0; j < 3; j++) { + // grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + // grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); - } + // if(c_id1 == 604 && c_id2 == 605){ + // printf("Variables GPU loop :\n"); + // printf("pfaci GPU = %.17lg: :\n", pfaci); + // printf("rfac GPU = %.17lg:\n", rfac); + // printf("j = %d - i_f_face_normal[f_id][j] GPU = %.17lg:\n", j, i_f_face_normal[f_id][j]); + // printf("i = %d - j = %d - grad[c_id1][i][j] GPU = %.17lg:\n",i, j, grad[c_id1][i][j]); + // } + } } + // if(c_id1 == 604 && c_id2 == 605){ + // printf("GPU :\n"); + // printf("f_id GPU = %d :\n", f_id); + // printf("c_d1 GPU = %d - c_id2 GPU = %d :\n", c_id1, c_id2); + // printf("weight[f_id] GPU = %.17lg:\n", weight[f_id]); + // printf("ktpond GPU = %.17lg:\n", ktpond); + // printf("\n"); + // printf("\n"); + + // } } @@ -1516,8 +1541,8 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, cs_lnum_t c_id; cs_real_t pond, ktpond, pfac, rfac, vecfac; - if (coupled_faces[f_id * 
cpl_stride]) - return; + // if (coupled_faces[f_id * cpl_stride]) + // return; c_id = b_face_cells[f_id]; @@ -1578,15 +1603,18 @@ _compute_reconstruct_v_b_face2(cs_lnum_t size, if (test_bool) { cs_real_t gradpa[3]; + // printf("dvol = %.17lg\n", dvol); for (cs_lnum_t i = 0; i < 3; i++) { for (cs_lnum_t j = 0; j < 3; j++) { gradpa[j] = grad[c_id][i][j]; grad[c_id][i][j] = 0.; } - for (cs_lnum_t j = 0; j < 3; j++) - for (cs_lnum_t k = 0; k < 3; k++) - grad[c_id][i][j]+= corr_grad_lin[c_id][j][k] * gradpa[k]; + for (cs_lnum_t j = 0; j < 3; j++){ + for (cs_lnum_t k = 0; k < 3; k++){ + atomicAdd(&grad[c_id][i][j], corr_grad_lin[c_id][j][k] * gradpa[k]); + } + } } } @@ -1618,7 +1646,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, - cs_real_33_t *restrict r_grad, + const cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, const bool *coupled_faces, cs_lnum_t cpl_stride, @@ -1655,15 +1683,14 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *grad_d; CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); - void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, *_cell_cells_idx_d = NULL, *_r_grad_d = NULL; const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; const cs_real_33_t *coefb_d = NULL, *r_grad_d = NULL; const cs_lnum_t *cell_cells_idx_d = NULL; bool *coupled_faces_d; - CS_CUDA_CHECK(cudaMalloc(&coupled_faces_d, sizeof(bool))); - cs_cuda_copy_h2d(coupled_faces_d, coupled_faces, sizeof(bool)); + CS_CUDA_CHECK(cudaMalloc(&coupled_faces_d, sizeof(bool) * 2)); + cs_cuda_copy_h2d(coupled_faces_d, coupled_faces, sizeof(bool) * 2); unsigned int blocksize = 256; @@ -1712,8 +1739,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t *restrict cell_vol; // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_vol); cs_real_t *restrict cell_f_vol; - 
CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells_ext * sizeof(cs_real_t))); - cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells_ext); + CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells * sizeof(cs_real_t))); + cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells); // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_vol); if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) cell_f_vol = fvq->cell_vol; @@ -1741,14 +1768,14 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict diipb = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); cs_real_33_t *restrict corr_grad_lin; - CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells_ext * sizeof(cs_real_33_t))); - cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells_ext); + CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells * sizeof(cs_real_33_t))); + cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells); // = (const cs_real_33_t *restrict)cs_get_device_ptr_const_pf(fvq->corr_grad_lin); const cs_lnum_t has_dc = fvq->has_disable_flag; int *restrict c_disable_flag; - CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells_ext * sizeof(int))); - cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells_ext); + CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells * sizeof(int))); + cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells); // = (const int *restrict)cs_get_device_ptr_const_pf(fvq->c_disable_flag); @@ -1762,17 +1789,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, &coefa_d, &_coefa_d); _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); + // ----------------------------Begin of Kernels part 1------------------------------------------- CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); /* Initialization */ - _init_real_33_array<<>> - 
(n_cells_ext, grad_d); + _init_real_33_array<<>> + (n_cells, grad_d); CS_CUDA_CHECK(cudaEventRecord(init, stream)); - + + /* Interior faces contribution */ _compute_reconstruct_v_i_face<<>> (n_i_faces, @@ -1785,32 +1814,33 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, grad_d, dofij, i_f_face_normal); + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); // ----------------------------End of Kernels part 1------------------------------------------- - if (grad_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(grad, grad_d, size); - } - else - cs_sync_d2h(grad); + // if (grad_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(grad, grad_d, size); + // } + // else + // cs_sync_d2h(grad); - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(r_grad, r_grad_d, size); + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(r_grad, r_grad_d, size); /* Contribution from coupled faces */ - if (cpl != NULL) { - cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); - cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); - } + // if (cpl != NULL) { + // cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); + // cs_internal_coupling_reconstruct_vector_gradient(cpl, r_grad, grad); + // } - cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); + // cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); - _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, - &r_grad_d, &_r_grad_d); + // _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, + // &r_grad_d, &_r_grad_d); CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); @@ -1860,7 +1890,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - // printf("rec Kernels :"); + // printf("rec Kernels times:\n"); // msec = 0.0f; 
// CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index a02635642a..85107fab11 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -133,7 +133,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, - cs_real_33_t *restrict r_grad, + const cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, const bool *coupled_faces, cs_lnum_t cpl_stride, From 36becbb7724b8bd0083ae9fa487f735ba38b916d Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 7 Nov 2023 14:07:02 +0100 Subject: [PATCH 18/70] Gather kernels valid --- src/alge/cs_gradient.cxx | 33 ++++++------- src/alge/cs_gradient_cuda.cu | 54 ++++++++++++---------- src/alge/cs_gradient_lsq_vector_gather.cuh | 9 ++-- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 145b4574a2..bc2a388eb0 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6905,17 +6905,18 @@ _lsq_vector_gradient(const cs_mesh_t *m, _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); - cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda; + cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda, *gradv_cpu; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); + BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ -#ifdef NDEBUG -#if defined(HAVE_CUDA) -#endif +// #ifdef NDEBUG +// #if defined(HAVE_CUDA) +// #endif start = std::chrono::high_resolution_clock::now(); cs_lsq_vector_gradient_cuda( m, @@ -6928,13 +6929,13 @@ _lsq_vector_gradient(const cs_mesh_t *m, pvar, c_weight, cocg, - gradv_cuda, + gradv, rhs_cuda); stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = 
std::chrono::duration_cast(stop - start); -#ifdef NDEBUG -#else -#endif +// #ifdef NDEBUG +// #else +// #endif start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { @@ -7071,21 +7072,21 @@ _lsq_vector_gradient(const cs_mesh_t *m, #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + rhs[c_id][i][1] * cocg[c_id][3] + rhs[c_id][i][2] * cocg[c_id][5]; - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + rhs[c_id][i][1] * cocg[c_id][1] + rhs[c_id][i][2] * cocg[c_id][4]; - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; for (int j =0; j < 3; ++j) { - auto cpu = rhs[c_id][i][j]; - auto cuda = rhs_cuda[c_id][i][j]; + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = gradv[c_id][i][j]; if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); @@ -7093,9 +7094,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, } } } -#ifdef NDEBUG -#endif -#endif +// #ifdef NDEBUG +// #endif +// #endif stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); printf("Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 13e667f58b..b3bb82a84b 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -926,6 +926,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); const cs_lnum_t *restrict b_face_cells = 
(const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); const cs_lnum_t *restrict cell_cells_idx = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); const cs_lnum_t *restrict cell_cells_lst @@ -973,9 +975,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - _init_rhs<<>> - (n_cells_ext, - rhs_d); + // _init_rhs<<>> + // (n_cells_ext, + // rhs_d); + cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); // _init_rhs_v2<<>> // (n_cells_ext*3*3, @@ -1058,22 +1061,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - _compute_rhs_lsq_v_b_face<<>> - (m->n_b_faces, - b_face_cells, - cell_f_cen, - b_face_normal, - rhs_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, - inc); - - // _compute_rhs_lsq_v_b_face_gather<<>> - // (m->n_b_cells, - // cell_b_faces_idx, - // cell_b_faces, + // _compute_rhs_lsq_v_b_face<<>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, // b_face_normal, // rhs_d, // pvar_d, @@ -1082,6 +1073,19 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); + _compute_rhs_lsq_v_b_face_gather<<>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); + // _compute_rhs_lsq_v_b_face_v2<<>> // (m->n_b_faces, // b_face_cells, @@ -1097,12 +1101,12 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); - if (rhs_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; - cs_cuda_copy_d2h(rhs, rhs_d, size); - } - else - cs_sync_d2h(rhs); + // if (rhs_d != NULL) { + // size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + // cs_cuda_copy_d2h(rhs, rhs_d, size); + // } + // else + // cs_sync_d2h(rhs); // /* Compute gradient */ // /*------------------*/ diff --git 
a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index 80258d4fb8..2665c14186 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -86,6 +86,7 @@ __global__ static void _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, const cs_real_3_t *restrict b_face_normal, cs_real_33_t *restrict rhs, const cs_real_3_t *restrict pvar, @@ -94,12 +95,14 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, const cs_real_3_t *restrict coefav, const int inc) { - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= size){ + if(c_idx >= size){ return; } + cs_lnum_t c_id = b_cells[c_idx]; + cs_lnum_t f_id; cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; @@ -131,4 +134,4 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, rhs[c_id][i][2] += n_d_dist[2] * pfac; } } -} \ No newline at end of file +} From 9a415267b4157ba086f5fab97f5bd8f099f4eb57 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Tue, 7 Nov 2023 16:33:50 +0100 Subject: [PATCH 19/70] ADD Kernels V2 optim --- src/alge/cs_gradient.cxx | 156 +++++++------ src/alge/cs_gradient_cuda.cu | 208 ++++++++++-------- .../cs_reconstruct_vector_gradient_v2.cuh | 186 ++++++++++++++++ 3 files changed, 376 insertions(+), 174 deletions(-) create mode 100644 src/alge/cs_reconstruct_vector_gradient_v2.cuh diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 458f18bde0..d367a3f453 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5692,15 +5692,47 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, std::chrono::high_resolution_clock::time_point start, stop; std::chrono::microseconds elapsed, elapsed_cuda; - bool COMPUTE_CPU = true; - bool COMPUTE_CUDA = true; - bool RES_CPU = false; cs_real_33_t *grad_cpu; + + 
+ bool COMPUTE_CUDA; + bool COMPUTE_CPU; + bool RES_CPU; + +#if defined(HAVE_CUDA) + COMPUTE_CUDA = (cs_get_device_id() > -1) ? true : false; + RES_CPU = !COMPUTE_CUDA; +#else + COMPUTE_CUDA = false; +#endif + +#if defined(NDEBUG) && !defined(COMPUTE_CUDA) + COMPUTE_CPU = true; + RES_CPU = true; +#elif defined(DEBUG) + COMPUTE_CPU = true; +#else + COMPUTE_CPU = true; +#endif + + + // For now these lines are for my own use + // They will have to be removed + COMPUTE_CUDA = true; + COMPUTE_CPU = true; + RES_CPU = false; + + // Not to be kept in the final version + bool PERF = true; + bool ACCURACY = true; + if(COMPUTE_CUDA){ printf("Compute with CUDA\n"); - start = std::chrono::high_resolution_clock::now(); + if(PERF){ + start = std::chrono::high_resolution_clock::now(); + } cs_reconstruct_vector_gradient_cuda(m, fvq, cpl, @@ -5715,14 +5747,19 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, coupled_faces, cpl_stride, cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION); - stop = std::chrono::high_resolution_clock::now(); - elapsed_cuda = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + if(PERF){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + } } if(COMPUTE_CPU){ printf("Compute with CPU\n"); BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + if(PERF){ + start = std::chrono::high_resolution_clock::now(); + } /* Initialization */ start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for @@ -5755,14 +5792,6 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); - // if(c_id1 == 604 && c_id2 == 605){ - // printf("CPU :\n"); - // printf("f_id = %d :\n", f_id); - // printf("c_d1 = %d - c_id2 = %d :\n", c_id1, c_id2); - // printf("weight[f_id] = %.17lg:\n", weight[f_id]); - // printf("ktpond = %.17lg:\n", ktpond); - // } - /* Remark: \f$ \varia_\face = \alpha_\ij \varia_\celli + (1-\alpha_\ij) \varia_\cellj\f$ @@ -5788,18
+5817,6 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, for (cs_lnum_t j = 0; j < 3; j++) { grad_cpu[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; grad_cpu[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; - - // if(c_id1 == 604 && c_id2 == 605){ - // printf("Variables loop :\n"); - // printf("pfaci = %.17lg: :\n", pfaci); - // printf("rfac = %.17lg:\n", rfac); - // printf("j = %d - i_f_face_normal[f_id][j] = %.17lg:\n", j, i_f_face_normal[f_id][j]); - // printf("i = %d - j = %d - grad_cpu[c_id1][i][j] = %.17lg:\n",i, j, grad_cpu[c_id1][i][j]); - // if(i == 2 && j == 2){ - // printf("\n"); - // printf("\n"); - // } - // } } } @@ -5809,24 +5826,6 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } /* End of loop on thread groups */ - /* Test grad */ - // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - // for (cs_lnum_t i = 0; i < 3; i++) { - // for (int j =0; j < 3; ++j) { - // auto cpu = grad_cpu[c_id][i][j]; - // auto cuda = grad[c_id][i][j]; - // double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-30)); - // if (err > 1e-12) { - // printf("rec inte DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); - // } - // } - // } - // } - - // stop = std::chrono::high_resolution_clock::now(); - // elapsed = std::chrono::duration_cast(stop - start); - // printf("rec Compute after i_face time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); - /* Contribution from coupled faces */ // if (cpl != NULL) { // cs_internal_coupling_initialize_vector_gradient(cpl, c_weight, pvar, grad); @@ -5879,21 +5878,6 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } /* loop on threads */ - - /* Test grad */ - // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - // for (cs_lnum_t i = 0; i < 3; i++) { - // for (int j =0; j < 3; ++j) { - // auto cpu = grad_cpu[c_id][i][j]; - // auto cuda = grad[c_id][i][j]; - // double err = (fabsl(cpu - cuda) / 
fmaxl(fabsl(cpu), 1e-30)); - // if (err> 1e-14) { - // printf("rec bf1 DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); - // } - // } - // } - // } - # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_real_t dvol; @@ -5922,28 +5906,36 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } } } - - stop = std::chrono::high_resolution_clock::now(); - elapsed = std::chrono::duration_cast(stop - start); + + if(PERF){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } - /* Performances */ - // printf("rec Compute after b_face2 time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); - - /* Test grad */ - // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - // for (cs_lnum_t i = 0; i < 3; i++) { - // for (int j =0; j < 3; ++j) { - // auto cpu = grad_cpu[c_id][i][j]; - // auto cuda = grad[c_id][i][j]; - // double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-30)); - // if (err> 1e-12) { - // printf("rec bf2 DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); - // } - // } - // } - // } + + } + + /* Performances */ + if(PERF){ + printf("rec Compute after b_face2 time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + } + + /* Test grad */ + if(ACCURACY){ + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = grad_cpu[c_id][i][j]; + auto cuda = grad[c_id][i][j]; + double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6)); + if (err> 1e-6) { + printf("rec DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); + } + } + } + } } if(RES_CPU){ @@ -5951,12 +5943,12 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, grad = 
grad_cpu; }else{ printf("RESULTS CUDA\n"); + // Free memory + if(COMPUTE_CPU){ + BFT_FREE(grad_cpu); + } } - // Free memory - if(COMPUTE_CPU && !RES_CPU){ - BFT_FREE(grad_cpu); - } /* Periodicity and parallelism treatment */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 37a6c6f2aa..695f21ace8 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -80,6 +80,8 @@ #include "cs_gradient.h" #include "cs_gradient_priv.h" +#include "cs_reconstruct_vector_gradient_v2.cuh" + /*! \cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -458,11 +460,12 @@ _init_rhsv(cs_lnum_t size, cs_real_33_t *restrict array) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_real_t *array1d = (cs_real_t *) array; if (c_id < size) { - for (cs_lnum_t i = 0; i < 3; i++) - for (cs_lnum_t j = 0; j < 3; j++) - array[c_id][i][j] = 0.0; + // for (cs_lnum_t i = 0; i < 3; i++) + // for (cs_lnum_t j = 0; j < 3; j++) + array1d[c_id] = 0.0; } } @@ -1273,6 +1276,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, unsigned int gridsize_bf = (unsigned int)ceil((double)m->n_b_faces / blocksize); unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); + unsigned int gridsize_init + = (unsigned int)ceil((double)m->n_cells*3*3 / blocksize); unsigned int gridsize_ext = (unsigned int)ceil((double)n_cells_ext / blocksize); @@ -1313,8 +1318,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - _init_real_33_array<<>> - (n_cells_ext, + _init_real_33_array<<>> + (n_cells_ext*3*3, rhs_d); CS_CUDA_CHECK(cudaEventRecord(init, stream)); @@ -1456,65 +1461,42 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - // for(int f_id = blockIdx.x * blockDim.x + threadIdx.x; f_id < size; f_id += blockDim.x * gridDim.x){ - - - if(f_id >= size){ return; } cs_lnum_t c_id1, c_id2; cs_real_t pond, ktpond, pfaci, pfacj, rfac; - c_id1 = i_face_cells[f_id][0]; - 
c_id2 = i_face_cells[f_id][1]; + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; - pond = weight[f_id]; - ktpond = (c_weight == NULL) ? - pond : // no cell weighting - pond * c_weight[c_id1] // cell weighting active - / ( pond * c_weight[c_id1] - + (1.0-pond)* c_weight[c_id2]); + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); - for (cs_lnum_t i = 0; i < 3; i++) { - pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - /* Reconstruction part */ - rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); - for (cs_lnum_t j = 0; j < 3; j++) { - // grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - // grad[c_id2][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; - - atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); - atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); - - // if(c_id1 == 604 && c_id2 == 605){ - // printf("Variables GPU loop :\n"); - // printf("pfaci GPU = %.17lg: :\n", pfaci); - // printf("rfac GPU = %.17lg:\n", rfac); - // printf("j = 
%d - i_f_face_normal[f_id][j] GPU = %.17lg:\n", j, i_f_face_normal[f_id][j]); - // printf("i = %d - j = %d - grad[c_id1][i][j] GPU = %.17lg:\n",i, j, grad[c_id1][i][j]); - // } - } } - // if(c_id1 == 604 && c_id2 == 605){ - // printf("GPU :\n"); - // printf("f_id GPU = %d :\n", f_id); - // printf("c_d1 GPU = %d - c_id2 GPU = %d :\n", c_id1, c_id2); - // printf("weight[f_id] GPU = %.17lg:\n", weight[f_id]); - // printf("ktpond GPU = %.17lg:\n", ktpond); - // printf("\n"); - // printf("\n"); - - // } + } + } @@ -1702,6 +1684,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, = (unsigned int)ceil((double)m->n_b_faces / blocksize); unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); + unsigned int gridsize_init + = (unsigned int)ceil((double)m->n_cells*3*3 / blocksize); unsigned int gridsize_ext = (unsigned int)ceil((double)n_cells_ext / blocksize); @@ -1796,15 +1780,29 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); /* Initialization */ - _init_real_33_array<<>> - (n_cells, grad_d); + _init_real_33_array<<>> + (n_cells*3*3, grad_d); + + // cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_33_t)); CS_CUDA_CHECK(cudaEventRecord(init, stream)); /* Interior faces contribution */ - _compute_reconstruct_v_i_face<<>> - (n_i_faces, + // _compute_reconstruct_v_i_face<<>> + // (n_i_faces, + // i_group_index, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + _compute_reconstruct_v_i_face_v2<<>> + (n_i_faces * 3, i_group_index, i_face_cells, pvar_d, @@ -1845,8 +1843,24 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- - _compute_reconstruct_v_b_face1<<>> - ( n_b_faces, + // _compute_reconstruct_v_b_face1<<>> + // ( n_b_faces, + // b_group_index, + // 
coupled_faces_d, + // cpl_stride, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + + _compute_reconstruct_v_b_face1_v2<<>> + ( n_b_faces * 3, b_group_index, coupled_faces_d, cpl_stride, @@ -1862,8 +1876,18 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); - _compute_reconstruct_v_b_face2<<>> - ( n_cells, + // _compute_reconstruct_v_b_face2<<>> + // ( n_cells, + // has_dc, + // c_disable_flag, + // cell_f_vol, + // grad_d, + // corr_grad_lin, + // test_bool + // ); + + _compute_reconstruct_v_b_face2_v2<<>> + ( n_cells * 3, has_dc, c_disable_flag, cell_f_vol, @@ -1890,49 +1914,49 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - // printf("rec Kernels times:\n"); + printf("rec Kernels times:\n"); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); - // printf("Kernels execution time in us: \t"); - // printf("Init = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); - // printf("I_faces = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); - // printf("CPU part = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); + printf("CPU part = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); - // printf("B_faces_1 = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); + printf("B_faces_1 = %f\t", 
msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); - // printf("B_faces_2 = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); + printf("B_faces_2 = %f\t", msec*1000.f); - // printf("\n"); + printf("\n"); - // msec_tot = 0.0f; - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); - // printf("Total kernel part 1= %f\t", msec*1000.f); - // msec_tot = msec; + msec_tot = 0.0f; + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); + printf("Total kernel part 1= %f\t", msec*1000.f); + msec_tot = msec; - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); - // printf("Total kernel part 2= %f\t", msec*1000.f); - // msec_tot += msec; + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); + printf("Total kernel part 2= %f\t", msec*1000.f); + msec_tot += msec; - // printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); + printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); - // printf("Total = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); - // printf("\n"); + printf("\n"); if (_pvar_d != NULL) diff --git a/src/alge/cs_reconstruct_vector_gradient_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_v2.cuh new file mode 100644 index 0000000000..50f30ac034 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_v2.cuh @@ -0,0 +1,186 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + + + +__global__ static void +_compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, + const cs_lnum_t *i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_idt][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_idt][j])); + + } + +} + + +__global__ static void +_compute_reconstruct_v_b_face1_v2(cs_lnum_t n_b_faces, + const cs_lnum_t *restrict b_group_index, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + // if (coupled_faces[f_idt * cpl_stride]) + // return; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++) + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++) + atomicAdd(&grad[c_id][i][j], 
(pfac + rfac) * b_f_face_normal[f_idt][j]); + +} + + + +__global__ static void +_compute_reconstruct_v_b_face2_v2( cs_lnum_t n_cells, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_33_t *restrict grad, + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id >= n_cells){ + return; + } + + size_t c_idt = c_id / 3; + size_t i = c_id % 3; + + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_idt] == 0) + dvol = 1. / cell_f_vol[c_idt]; + else + dvol = 0.; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_idt][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_idt][i][j]; + grad[c_idt][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][0] * gradpa[0]); + atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][1] * gradpa[1]); + atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][2] * gradpa[2]); + } + } + +} \ No newline at end of file From 1938409b904e089e144f588d56a0919201e0cc0b Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 8 Nov 2023 13:22:30 +0100 Subject: [PATCH 20/70] Gather v2 --- src/alge/cs_gradient_cuda.cu | 38 +++- src/alge/cs_gradient_lsq_vector_gather_v2.cuh | 165 ++++++++++++++++++ 2 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 src/alge/cs_gradient_lsq_vector_gather_v2.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index b3bb82a84b..939e703f81 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -29,6 +29,7 @@ #include "cs_gradient_lsq_vector_v2.cuh" #include "cs_gradient_lsq_vector_v3.cuh" #include "cs_gradient_lsq_vector_gather.cuh" +#include "cs_gradient_lsq_vector_gather_v2.cuh" /*! 
\cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -535,6 +536,14 @@ _sync_or_copy_real_h2d(const T *val_h, *buf_d = _buf_d; } +/* Compute gridsize*/ + +unsigned int get_gridsize(unsigned int size, unsigned int blocksize){ + unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); + + return gridsize; +} + /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*============================================================================= @@ -1035,7 +1044,19 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, assert(rhs_d); assert(pvar_d); assert(weight); - _compute_rhs_lsq_v_i_face_gather<<>> + // _compute_rhs_lsq_v_i_face_gather<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_gather_v2<<>> (n_cells, cell_cells_idx, cell_cells, @@ -1073,7 +1094,20 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - _compute_rhs_lsq_v_b_face_gather<<>> + // _compute_rhs_lsq_v_b_face_gather<<>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); + + _compute_rhs_lsq_v_b_face_gather_v2<<>> (m->n_b_cells, cell_b_faces_idx, cell_b_faces, diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh new file mode 100644 index 0000000000..32c5b920c5 --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -0,0 +1,165 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[3][3]; + __shared__ cs_real_t _pvar1[3]; + __shared__ cs_real_t _pvar2[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = 
cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto temp_rhs = rhs[c_id1]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto temp_pvar1 = pvar[c_id1]; + _pvar1[0]= temp_pvar1[0]; _pvar1[1]= temp_pvar1[1]; _pvar1[2]= temp_pvar1[2]; + auto temp_pvar2 = pvar[c_id2]; + _pvar2[0]= temp_pvar2[0]; _pvar2[1]= temp_pvar2[1]; _pvar2[2]= temp_pvar2[2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[i][j] += _weight * fctb[j]; + } + } + + rhs[c_id1][0][0] = _rhs[0][0]; rhs[c_id1][0][1] = _rhs[0][1]; rhs[c_id1][0][2] = _rhs[0][2]; + rhs[c_id1][1][0] = _rhs[1][0]; rhs[c_id1][1][1] = _rhs[1][1]; rhs[c_id1][1][2] = _rhs[1][2]; + rhs[c_id1][2][0] = _rhs[2][0]; rhs[c_id1][2][1] = _rhs[2][1]; rhs[c_id1][2][2] = _rhs[2][2]; +} +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= size){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _rhs[3][3]; 
+ __shared__ cs_real_t _pvar1[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + auto temp_rhs = rhs[c_id]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto temp_pvar = pvar[c_id]; + _pvar1[0]= temp_pvar[0]; _pvar1[1]= temp_pvar[1]; _pvar1[2]= temp_pvar[2]; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * _pvar1[0] + + coefbv[f_id][1][i] * _pvar1[1] + + coefbv[f_id][2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[i][0] += n_d_dist[0] * pfac; + _rhs[i][1] += n_d_dist[1] * pfac; + _rhs[i][2] += n_d_dist[2] * pfac; + } + + rhs[c_id][0][0] = _rhs[0][0]; rhs[c_id][0][1] = _rhs[0][1]; rhs[c_id][0][2] = _rhs[0][2]; + rhs[c_id][1][0] = _rhs[1][0]; rhs[c_id][1][1] = _rhs[1][1]; rhs[c_id][1][2] = _rhs[1][2]; + rhs[c_id][2][0] = _rhs[2][0]; rhs[c_id][2][1] = _rhs[2][1]; rhs[c_id][2][2] = _rhs[2][2]; + } +} From 2a339f3a9d45af0514a07ecec2a5739e9a857786 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 9 Nov 2023 11:20:07 +0100 Subject: [PATCH 21/70] Gather second version completed --- src/alge/cs_gradient_cuda.cuh | 2 +- src/alge/cs_gradient_lsq_vector.cuh | 2 +- src/alge/cs_gradient_lsq_vector_gather_v2.cuh | 77 ++++++++++--------- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index 15a25ed799..e5bc216202 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -93,4 +93,4 @@ __device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], out[0] = inverse_norm * 
in[0]; out[1] = inverse_norm * in[1]; out[2] = inverse_norm * in[2]; -} \ No newline at end of file +} diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index 31ee4e95eb..e79678b4ee 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -249,4 +249,4 @@ _compute_gradient_lsq_v(cs_lnum_t size, + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; } -} \ No newline at end of file +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh index 32c5b920c5..a871ae5b6e 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -48,15 +48,24 @@ _compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; - __shared__ cs_real_t _rhs[3][3]; + cs_real_t _rhs[3][3]; /* per-thread accumulator: each thread owns a different cell, a __shared__ array would be raced on by the whole block */ - __shared__ cs_real_t _pvar1[3]; - __shared__ cs_real_t _pvar2[3]; + + auto temp_rhs = rhs[c_id1]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; for(cs_lnum_t index = s_id; index < e_id; index++){ c_id2 = cell_cells[index]; - dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; - dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; - dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); @@ -71,29 +80,22 @@ _compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, _weight = c_weight[c_id2] * _denom; } - auto temp_rhs = rhs[c_id1]; - _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]=
temp_rhs[0][2]; - _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; - _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + auto _pvar2 = pvar[c_id2]; + // _pvar2[0]= temp_pvar2[0]; _pvar2[1]= temp_pvar2[1]; _pvar2[2]= temp_pvar2[2]; - auto temp_pvar1 = pvar[c_id1]; - _pvar1[0]= temp_pvar1[0]; _pvar1[1]= temp_pvar1[1]; _pvar1[2]= temp_pvar1[2]; - auto temp_pvar2 = pvar[c_id2]; - _pvar2[0]= temp_pvar2[0]; _pvar2[1]= temp_pvar2[1]; _pvar2[2]= temp_pvar2[2]; - - for(cs_lnum_t i = 0; i < 3; i++){ - pfac = (_pvar2[i] - _pvar1[i]) * ddc; - for(cs_lnum_t j = 0; j < 3; j++){ - fctb[j] = dc[j] * pfac; - _rhs[i][j] += _weight * fctb[j]; + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[i][j] += _weight * fctb[j]; + } } + } - rhs[c_id1][0][0] = _rhs[0][0]; rhs[c_id1][0][1] = _rhs[0][1]; rhs[c_id1][0][2] = _rhs[0][2]; rhs[c_id1][1][0] = _rhs[1][0]; rhs[c_id1][1][1] = _rhs[1][1]; rhs[c_id1][1][2] = _rhs[1][2]; rhs[c_id1][2][0] = _rhs[2][0]; rhs[c_id1][2][1] = _rhs[2][1]; rhs[c_id1][2][2] = _rhs[2][2]; } -} __global__ static void _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, @@ -123,12 +125,21 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; - __shared__ cs_real_t _rhs[3][3]; + cs_real_t _rhs[3][3]; /* per-thread accumulator: each thread owns a different cell, a __shared__ array would be raced on by the whole block */ - __shared__ cs_real_t _pvar1[3]; + + auto temp_rhs = rhs[c_id]; + _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; + _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; + _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; + + auto _pvar1 = pvar[c_id]; for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = cell_b_faces[index]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1.
/ b_dist[f_id]; @@ -138,19 +149,11 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, n_d_dist[1] *= d_b_dist; n_d_dist[2] *= d_b_dist; - auto temp_rhs = rhs[c_id]; - _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; - _rhs[1][0]= temp_rhs[1][0]; _rhs[1][1]= temp_rhs[1][1]; _rhs[1][2]= temp_rhs[1][2]; - _rhs[2][0]= temp_rhs[2][0]; _rhs[2][1]= temp_rhs[2][1]; _rhs[2][2]= temp_rhs[2][2]; - - auto temp_pvar = pvar[c_id]; - _pvar1[0]= temp_pvar[0]; _pvar1[1]= temp_pvar[1]; _pvar1[2]= temp_pvar[2]; - for (cs_lnum_t i = 0; i < 3; i++) { - pfac = coefav[f_id][i]*inc - + ( coefbv[f_id][0][i] * _pvar1[0] - + coefbv[f_id][1][i] * _pvar1[1] - + coefbv[f_id][2][i] * _pvar1[2] + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] - _pvar1[i]); _rhs[i][0] += n_d_dist[0] * pfac; @@ -158,8 +161,8 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, _rhs[i][2] += n_d_dist[2] * pfac; } - rhs[c_id][0][0] = _rhs[0][0]; rhs[c_id][0][1] = _rhs[0][1]; rhs[c_id][0][2] = _rhs[0][2]; - rhs[c_id][1][0] = _rhs[1][0]; rhs[c_id][1][1] = _rhs[1][1]; rhs[c_id][1][2] = _rhs[1][2]; - rhs[c_id][2][0] = _rhs[2][0]; rhs[c_id][2][1] = _rhs[2][1]; rhs[c_id][2][2] = _rhs[2][2]; } + rhs[c_id][0][0] = _rhs[0][0]; rhs[c_id][0][1] = _rhs[0][1]; rhs[c_id][0][2] = _rhs[0][2]; + rhs[c_id][1][0] = _rhs[1][0]; rhs[c_id][1][1] = _rhs[1][1]; rhs[c_id][1][2] = _rhs[1][2]; + rhs[c_id][2][0] = _rhs[2][0]; rhs[c_id][2][1] = _rhs[2][1]; rhs[c_id][2][2] = _rhs[2][2]; } From c293c4697c34fa8b52700a84e0433f790231b667 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 9 Nov 2023 15:31:19 +0100 Subject: [PATCH 22/70] ULP computation --- src/alge/cs_gradient.cxx | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index bc2a388eb0..9b528c9bf4 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -37,6 
+37,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #if defined(HAVE_MPI) #include @@ -677,6 +684,31 @@ _sync_scalar_gradient_halo(const cs_mesh_t *m, } } +/* Compute the unit in the last place (ULP) */ +template +typename std::enable_if::is_integer, T>::type +cs_diff_ulp(T x, T y) +{ + // Since `epsilon()` is the gap size (ULP, unit in the last place) + // of floating-point numbers in interval [1, 2), we can scale it to + // the gap size in interval [2^e, 2^{e+1}), where `e` is the exponent + // of `x` and `y`. + + // If `x` and `y` have different gap sizes (which means they have + // different exponents), we take the smaller one. Taking the bigger + // one is also reasonable, I guess. + const T m = std::min(std::fabs(x), std::fabs(y)); + + // Subnormal numbers have fixed exponent, which is `min_exponent - 1`. + const int exp = m < std::numeric_limits::min() + ? std::numeric_limits::min_exponent - 1 + : std::ilogb(m); + + // We divide the absolute difference by the epsilon times the exponent (1 ulp) + return std::fabs(x - y) / std::ldexp(std::numeric_limits::epsilon(), exp); +} + + /*---------------------------------------------------------------------------- * Clip the gradient of a scalar if necessary. This function deals with * the standard or extended neighborhood. 
@@ -7088,8 +7120,8 @@ _lsq_vector_gradient(const cs_mesh_t *m, auto cpu = gradv_cpu[c_id][i][j]; auto cuda = gradv[c_id][i][j]; - if (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda); + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); } } } From e6185200b1360a7f023f744470d526f1e45ffbe0 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Sun, 12 Nov 2023 13:34:40 +0100 Subject: [PATCH 23/70] Gather V3 --- src/alge/cs_gradient_cuda.cu | 5 +- src/alge/cs_gradient_lsq_vector.cuh | 4 +- src/alge/cs_gradient_lsq_vector_gather_v3.cuh | 183 ++++++++++++++++++ 3 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 src/alge/cs_gradient_lsq_vector_gather_v3.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 939e703f81..848746f47d 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -30,6 +30,7 @@ #include "cs_gradient_lsq_vector_v3.cuh" #include "cs_gradient_lsq_vector_gather.cuh" #include "cs_gradient_lsq_vector_gather_v2.cuh" +#include "cs_gradient_lsq_vector_gather_v3.cuh" /*! 
\cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -1056,7 +1057,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_gather_v2<<>> + _compute_rhs_lsq_v_i_face_gather_v3<<>> (n_cells, cell_cells_idx, cell_cells, @@ -1107,7 +1108,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - _compute_rhs_lsq_v_b_face_gather_v2<<>> + _compute_rhs_lsq_v_b_face_gather_v3<<>> (m->n_b_cells, cell_b_faces_idx, cell_b_faces, diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index e79678b4ee..ff0177680f 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -143,7 +143,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, __global__ static void _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, const cs_lnum_t *restrict cell_cells_idx, - const cs_lnum_t *restrict cell_cells_lst, + const cs_lnum_t *restrict cell_cells, const cs_real_3_t *restrict cell_f_cen, cs_real_33_t *restrict rhs, const cs_real_3_t *restrict pvar) @@ -161,7 +161,7 @@ _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, for(cs_lnum_t index = s_id; index < e_id; index++){ - cs_lnum_t c_id2 = cell_cells_idx[index]; + cs_lnum_t c_id2 = cell_cells[index]; dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh new file mode 100644 index 0000000000..e354a9eb3f --- /dev/null +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -0,0 +1,183 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t size, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256*3*3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex + (i*3+j)*256] = rhs[c_id1][i][j]; + } + } + __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto 
_cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex + (i*3+j)*256] += _weight * fctb[j]; + } + } + + } + __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex + (i*3+j)*256]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t size, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_normal, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= size){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t n_d_dist[3], d_b_dist, pfac, norm, inverse_norm; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _rhs[256*3*3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex + (i*3+j)*256] = rhs[c_id][i][j]; + } + } + + __syncthreads(); + + auto _pvar1 = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + auto 
_coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + + cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + + d_b_dist = 1. / b_dist[f_id]; + + /* Normal divided by b_dist */ + n_d_dist[0] *= d_b_dist; + n_d_dist[1] *= d_b_dist; + n_d_dist[2] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = _coefav[i]*inc + + ( _coefbv[0][i] * _pvar1[0] + + _coefbv[1][i] * _pvar1[1] + + _coefbv[2][i] * _pvar1[2] + - _pvar1[i]); + + _rhs[lindex + (i*3)*256] += n_d_dist[0] * pfac; + _rhs[lindex + (i*3+1)*256] += n_d_dist[1] * pfac; + _rhs[lindex + (i*3+2)*256] += n_d_dist[2] * pfac; + } + + } + __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex + (i*3+j)*256]; + } + } +} From 592b7278ec23a0a586ef25eaa596ab02de9969d6 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Sun, 12 Nov 2023 13:46:06 +0100 Subject: [PATCH 24/70] Avoid using generic size in kernels --- src/alge/cs_gradient_lsq_vector.cuh | 24 +++++++++---------- src/alge/cs_gradient_lsq_vector_gather.cuh | 8 +++---- src/alge/cs_gradient_lsq_vector_gather_v2.cuh | 8 +++---- src/alge/cs_gradient_lsq_vector_gather_v3.cuh | 8 +++---- src/alge/cs_gradient_lsq_vector_v2.cuh | 20 ++++++++-------- src/alge/cs_gradient_lsq_vector_v3.cuh | 12 +++++----- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index ff0177680f..15d5644e89 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -29,11 +29,11 @@ *----------------------------------------------------------------------------*/ __global__ static void -_init_rhs(cs_lnum_t size, +_init_rhs(cs_lnum_t n_cells_ext, cs_real_33_t *restrict rhs) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id < size) { + if (c_id < n_cells_ext) { for (cs_lnum_t i = 0; i < 3; i++) for (cs_lnum_t j = 0; j < 3; j++) rhs[c_id][i][j] = 0.0; @@ -41,7 +41,7 @@ _init_rhs(cs_lnum_t 
size, } __global__ static void -_compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_v0(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *cell_f_cen, cs_real_33_t *rhs, @@ -51,7 +51,7 @@ _compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; @@ -93,7 +93,7 @@ _compute_rhs_lsq_v_i_face_v0(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_i_face(cs_lnum_t size, +_compute_rhs_lsq_v_i_face(cs_lnum_t n_i_faces, const cs_lnum_2_t *restrict i_face_cells, const cs_real_3_t *restrict cell_f_cen, cs_real_33_t *restrict rhs, @@ -103,7 +103,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; @@ -141,7 +141,7 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, +_compute_rhs_lsq_v_b_neighbor(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_real_3_t *restrict cell_f_cen, @@ -150,7 +150,7 @@ _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id1 >= size){ + if(c_id1 >= n_cells){ return; } @@ -182,7 +182,7 @@ _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_face(cs_lnum_t size, +_compute_rhs_lsq_v_b_face(cs_lnum_t n_b_faces, const cs_lnum_t *restrict b_face_cells, const cs_real_3_t *restrict cell_f_cen, const cs_real_3_t *restrict b_face_normal, @@ -195,7 +195,7 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_b_faces){ return; } @@ -227,13 +227,13 @@ 
_compute_rhs_lsq_v_b_face(cs_lnum_t size, } __global__ static void -_compute_gradient_lsq_v(cs_lnum_t size, +_compute_gradient_lsq_v(cs_lnum_t n_cells, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs, cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells) return; for(cs_lnum_t i = 0; i < 3; i++){ diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index 2665c14186..b057e6d3a0 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -25,7 +25,7 @@ /*----------------------------------------------------------------------------*/ __global__ static void -_compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_gather(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_lnum_t *restrict cell_i_faces, @@ -38,7 +38,7 @@ _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id1 >= size){ + if(c_id1 >= n_cells){ return; } cs_real_t dc[3], fctb[3], ddc, _denom, _pond, pfac; @@ -83,7 +83,7 @@ _compute_rhs_lsq_v_i_face_gather(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, +_compute_rhs_lsq_v_b_face_gather(cs_lnum_t n_b_cells, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, const cs_lnum_t *restrict b_cells, @@ -97,7 +97,7 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t size, { cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(c_idx >= size){ + if(c_idx >= n_b_cells){ return; } diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh index a871ae5b6e..ef05c66028 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -25,7 +25,7 @@ 
/*----------------------------------------------------------------------------*/ __global__ static void -_compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_lnum_t *restrict cell_i_faces, @@ -38,7 +38,7 @@ _compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id1 >= size){ + if(c_id1 >= n_cells){ return; } cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; @@ -98,7 +98,7 @@ _compute_rhs_lsq_v_i_face_gather_v2(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, +_compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t n_b_cells, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, const cs_lnum_t *restrict b_cells, @@ -112,7 +112,7 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t size, { cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(c_idx >= size){ + if(c_idx >= n_b_cells){ return; } diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh index e354a9eb3f..db210dc394 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -25,7 +25,7 @@ /*----------------------------------------------------------------------------*/ __global__ static void -_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_lnum_t *restrict cell_i_faces, @@ -39,7 +39,7 @@ _compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t size, cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; cs_lnum_t lindex = threadIdx.x; - if(c_id1 >= size){ + if(c_id1 >= n_cells){ return; } cs_real_t dc[3], fctb[3], ddc, _denom, _weight, _pond, pfac; @@ -106,7 +106,7 @@ 
_compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t size, +_compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, const cs_lnum_t *restrict b_cells, @@ -121,7 +121,7 @@ _compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t size, cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; cs_lnum_t lindex = threadIdx.x; - if(c_idx >= size){ + if(c_idx >= n_b_cells){ return; } diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh index 7ca3800542..1e83725139 100644 --- a/src/alge/cs_gradient_lsq_vector_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -29,18 +29,18 @@ *----------------------------------------------------------------------------*/ __global__ static void -_init_rhs_v2(cs_lnum_t size, +_init_rhs_v2(cs_lnum_t n_cells_g, cs_real_t *restrict rhs) { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells_g) return; rhs[c_id] = 0.0; } __global__ static void -_compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_v2(cs_lnum_t n_i_faces, const cs_lnum_t *restrict i_face_cells, const cs_real_t *restrict cell_f_cen, cs_real_t *restrict rhs, @@ -50,7 +50,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; @@ -88,7 +88,7 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, } __global__ static void -_compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, +_compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, const cs_lnum_t *restrict b_face_cells, const cs_real_3_t *restrict cell_f_cen, const cs_real_3_t *restrict b_face_normal, @@ -101,7 +101,7 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id 
>= size){ + if(f_id >= n_b_faces){ return; } @@ -133,13 +133,13 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, } __global__ static void -_compute_gradient_lsq_v_v2(cs_lnum_t size, +_compute_gradient_lsq_v_v2(cs_lnum_t n_cells_g, cs_real_t *restrict gradv, cs_real_t *restrict rhs, cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells_g) return; for(cs_lnum_t i = 0; i < 3; i++){ @@ -158,13 +158,13 @@ _compute_gradient_lsq_v_v2(cs_lnum_t size, } __global__ static void -_compute_gradient_lsq_v_v4(cs_lnum_t size, +_compute_gradient_lsq_v_v4(cs_lnum_t n_cells, cs_real_33_t *restrict gradv_m, cs_real_33_t *restrict rhs_m, cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells) return; cs_real_t *rhs = (cs_real_t *) rhs_m; diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh index 85dfe345c9..3251650c26 100644 --- a/src/alge/cs_gradient_lsq_vector_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_v3.cuh @@ -26,7 +26,7 @@ __global__ static void -_compute_rhs_lsq_v_i_face_v3(cs_lnum_t size, +_compute_rhs_lsq_v_i_face_v3(cs_lnum_t n_i_faces, const cs_lnum_2_t *restrict i_face_cells, const cs_real_3_t *restrict cell_f_cen, cs_real_33_t *restrict rhs, @@ -36,7 +36,7 @@ _compute_rhs_lsq_v_i_face_v3(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; @@ -78,13 +78,13 @@ _compute_rhs_lsq_v_i_face_v3(cs_lnum_t size, } __global__ static void -_compute_gradient_lsq_v_v5(cs_lnum_t size, +_compute_gradient_lsq_v_v5(cs_lnum_t n_cells, cs_real_t *restrict gradv, cs_real_t *restrict rhs, cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells) return; size_t c_id1 = c_id / (3*3); @@ -117,13 +117,13 @@ 
_compute_gradient_lsq_v_v5(cs_lnum_t size, } __global__ static void -_compute_gradient_lsq_v_v6(cs_lnum_t size, +_compute_gradient_lsq_v_v6(cs_lnum_t n_cells, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs, cs_cocg_6_t *restrict cocg) { size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if (c_id >= size) + if (c_id >= n_cells) return; size_t c_id1 = c_id / (3*3); From 12addf2796a036aabfcea53d4d888515780e1bc0 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Tue, 14 Nov 2023 11:36:26 +0100 Subject: [PATCH 25/70] on work --- src/alge/cs_gradient.cxx | 50 +++-- src/alge/cs_gradient_cuda.cu | 175 ++++++++++++------ src/alge/cs_gradient_priv.h | 4 +- .../cs_reconstruct_vector_gradient_gather.cuh | 97 ++++++++++ ...econstruct_vector_gradient_scatter_v2.cuh} | 12 +- src/base/cs_base_cuda.cu | 8 +- src/base/cs_base_cuda.h | 8 +- 7 files changed, 267 insertions(+), 87 deletions(-) create mode 100644 src/alge/cs_reconstruct_vector_gradient_gather.cuh rename src/alge/{cs_reconstruct_vector_gradient_v2.cuh => cs_reconstruct_vector_gradient_scatter_v2.cuh} (96%) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index d367a3f453..633bff7d4c 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5635,6 +5635,7 @@ _initialize_vector_gradient(const cs_mesh_t *m, static void _reconstruct_vector_gradient(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, const cs_internal_coupling_t *cpl, cs_halo_type_t halo_type, @@ -5700,6 +5701,8 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, bool COMPUTE_CUDA; bool COMPUTE_CPU; bool RES_CPU; + bool PERF; + bool ACCURACY; #if defined(HAVE_CUDA) COMPUTE_CUDA = (cs_get_device_id() > -1) ? 
true : false; @@ -5708,13 +5711,19 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, COMPUTE_CUDA = false; #endif -#if defined(NDEBUG) && !defined(COMPUTE_CUDA) +#if defined(DEBUG) COMPUTE_CPU = true; - RES_CPU = true; -#elif defined(DEBUG) + PERF = true; + ACCURACY = true; +#elif defined(NDEBUG) && !COMPUTE_CUDA COMPUTE_CPU = true; + RES_CPU = true; + PERF = false; + ACCURACY = false; #else - COMPUTE_CPU = true; + COMPUTE_CPU = false; + PERF = false; + ACCURACY = false; #endif @@ -5725,15 +5734,18 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, RES_CPU = false; // A ne pas garder dans la version finale - bool PERF = true; - bool ACCURACY = true; + PERF = true; + ACCURACY = true; + if(COMPUTE_CUDA){ printf("Compute with CUDA\n"); if(PERF){ start = std::chrono::high_resolution_clock::now(); } + cs_reconstruct_vector_gradient_cuda(m, + madj, fvq, cpl, halo_type, @@ -5746,7 +5758,8 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, grad, coupled_faces, cpl_stride, - cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION); + cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, + PERF); if(PERF){ stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); @@ -5761,7 +5774,6 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, start = std::chrono::high_resolution_clock::now(); } /* Initialization */ - start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { @@ -5912,17 +5924,14 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, elapsed = std::chrono::duration_cast(stop - start); } - - - } /* Performances */ if(PERF){ - printf("rec Compute after b_face2 time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + printf("reconstruct Compute and tranferts time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); } - /* Test grad */ + /* Accuracy grad_cpu 
and grad_gpu */ if(ACCURACY){ for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { @@ -5938,15 +5947,17 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } } + //Copy grad if(RES_CPU){ printf("RESULTS CPU\n"); - grad = grad_cpu; + memcpy(grad, grad_cpu, sizeof(cs_real_33_t) * n_cells_ext); }else{ - printf("RESULTS CUDA\n"); - // Free memory - if(COMPUTE_CPU){ - BFT_FREE(grad_cpu); - } + printf("RESULTS GPU\n"); + } + + // Free memory + if(COMPUTE_CPU){ + BFT_FREE(grad_cpu); } @@ -8664,6 +8675,7 @@ _gradient_vector(const char *var_name, r_gradv); _reconstruct_vector_gradient(mesh, + cs_glob_mesh_adjacencies, fvq, cpl, halo_type, diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 695f21ace8..97734906dd 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -80,7 +80,8 @@ #include "cs_gradient.h" #include "cs_gradient_priv.h" -#include "cs_reconstruct_vector_gradient_v2.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2.cuh" +#include "cs_reconstruct_vector_gradient_gather.cuh" /*! 
\cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -1555,7 +1556,7 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, __global__ static void -_compute_reconstruct_v_b_face2(cs_lnum_t size, +_compute_reconstruct_correction(cs_lnum_t size, cs_lnum_t has_dc, const int *restrict c_disable_flag, const cs_real_t *restrict cell_f_vol, @@ -1620,6 +1621,7 @@ _compute_reconstruct_v_b_face2(cs_lnum_t size, *----------------------------------------------------------------------------*/ extern "C" void cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, const cs_internal_coupling_t *cpl, cs_halo_type_t halo_type, @@ -1628,11 +1630,12 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, - const cs_real_33_t *restrict r_grad, + const cs_real_33_t *restrict r_grad, cs_real_33_t *restrict grad, const bool *coupled_faces, cs_lnum_t cpl_stride, - bool test_bool + bool test_bool, + bool PERF ) { const cs_lnum_t n_cells = m->n_cells; @@ -1693,8 +1696,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); const cs_lnum_t *restrict b_face_cells = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); - const cs_lnum_t *restrict cell_cells_idx; - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_idx); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); const cs_lnum_t *restrict cell_cells_lst; // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); const int n_i_groups @@ -1715,8 +1718,32 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_lnum_t *restrict b_group_index; CS_CUDA_CHECK(cudaMalloc(&b_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); cs_cuda_copy_h2d(b_group_index, (void 
*)m->b_face_numbering->group_index, sizeof(int)*n_b_groups * n_b_threads * 2); + // printf("Avant allocation\n"); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_numbering->group_index); + + + // if (madj->cell_i_faces == NULL) { + cs_mesh_adjacencies_update_cell_i_faces(); + // } + assert(madj->cell_i_faces); + const cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + cs_lnum_t *restrict cell_i_faces; + // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + assert(cell_i_faces); + + + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + // = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + // printf("Après allocation\n"); const cs_real_3_t *restrict cell_cen; // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); @@ -1728,8 +1755,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_vol); if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) cell_f_vol = fvq->cell_vol; - const cs_lnum_3_t *restrict cell_f_cen; - // = (const cs_lnum_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); const cs_real_t *restrict weight = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); 
const cs_real_t *restrict b_dist; @@ -1781,7 +1808,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Initialization */ _init_real_33_array<<>> - (n_cells*3*3, grad_d); + (n_cells_ext*3*3, grad_d); // cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_33_t)); @@ -1801,18 +1828,48 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - _compute_reconstruct_v_i_face_v2<<>> - (n_i_faces * 3, - i_group_index, - i_face_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal); - + // _compute_reconstruct_v_i_face_v2<<>> + // (n_i_faces * 3, + // i_group_index, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); + + // printf("Avant les assert dans gradient_cuda.cu\n"); + assert(cell_cells_idx); + assert(cell_cells); + assert(weight); + // assert(cell_i_faces); + // assert(cell_i_faces_sgn); + // printf("n_i_faces = %d\n", n_i_faces); + // printf("n_cells = %d\n", n_cells); + for(int i = 0; i< n_i_faces; i++){ + // printf("i = %d && weight = %f \n", i, fvq->weight[i]); + // printf("i = %d && c_id2 = %d \n", i, madj->cell_cells[i]); + // printf("i = %d && s_id = %d \n", i, madj->cell_cells_idx[i]); + // printf("i = %d && f_id = %d \n", i, madj->cell_i_faces_sgn[i]); + } + // printf("Après les assert dans gradient_cuda.cu\n"); + _compute_reconstruct_v_i_face_gather<<>> + ( n_cells, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal, + cell_cells_idx, + cell_cells, + cell_i_faces, + cell_i_faces_sgn, + n_i_faces); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1876,7 +1933,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); - // _compute_reconstruct_v_b_face2<<>> + // _compute_reconstruct_correction<<>> // ( n_cells, // has_dc, // c_disable_flag, @@ -1886,7 +1943,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t 
*m, // test_bool // ); - _compute_reconstruct_v_b_face2_v2<<>> + _compute_reconstruct_correction_v2<<>> ( n_cells * 3, has_dc, c_disable_flag, @@ -1914,50 +1971,51 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - printf("rec Kernels times:\n"); + if(PERF){ + printf("rec Kernels times:\n"); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); - printf("I_faces = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); - printf("CPU part = %f\t", msec*1000.f); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, b_faces_1)); + // printf("CPU part = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); - printf("B_faces_1 = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); + printf("B_faces_1 = %f\t", msec*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); - printf("B_faces_2 = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); + printf("Correction = %f\t", msec*1000.f); - printf("\n"); + printf("\n"); - msec_tot = 0.0f; - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); - printf("Total kernel part 1= %f\t", msec*1000.f); - msec_tot = msec; + msec_tot = 0.0f; + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); + printf("Total kernel 
part 1= %f\t", msec*1000.f); + msec_tot = msec; - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); - printf("Total kernel part 2= %f\t", msec*1000.f); - msec_tot += msec; + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_3)); + printf("Total kernel part 2= %f\t", msec*1000.f); + msec_tot += msec; - printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); + printf("Total kernel 1 and 2= %f\t", msec_tot*1000.f); - msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); - printf("Total = %f\t", msec*1000.f); - - printf("\n"); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + printf("\n"); + } if (_pvar_d != NULL) CS_CUDA_CHECK(cudaFree(_pvar_d)); @@ -1968,6 +2026,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, if (_r_grad_d != NULL) CS_CUDA_CHECK(cudaFree(_r_grad_d)); + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + CS_CUDA_CHECK(cudaFree(coupled_faces_d)); CS_CUDA_CHECK(cudaFree(i_group_index)); CS_CUDA_CHECK(cudaFree(b_group_index)); diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 85107fab11..77ef08399e 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -125,6 +125,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, void cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, const cs_internal_coupling_t *cpl, cs_halo_type_t halo_type, @@ -137,7 +138,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *restrict grad, const bool *coupled_faces, cs_lnum_t cpl_stride, - bool test_bool + bool test_bool, + bool PERF ); #endif /* defined(HAVE_CUDA) */ diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh new file mode 100644 index 0000000000..b8d9f923c8 --- /dev/null +++ 
b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -0,0 +1,97 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + + +__global__ static void +_compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_lnum_t n_i_faces) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + // if(cell_cells_idx) printf("erreur dans le kernel"); + // if(cell_cells) printf("erreur dans le kernel"); + // if(cell_i_faces) printf("erreur dans le kernel"); + // if(cell_i_faces_sgn) printf("erreur dans le kernel"); + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + // printf("s_id = %d\t",s_id); + // printf("e_id = %d\t",e_id); + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + // pond = weight[f_id]; + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + // pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; + // grad[c_id1][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + } + } + // grad[0][0][0] = 1. + f_id; + // grad[c_id2][0][0] = 1. + pond; + } +} diff --git a/src/alge/cs_reconstruct_vector_gradient_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh similarity index 96% rename from src/alge/cs_reconstruct_vector_gradient_v2.cuh rename to src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 50f30ac034..cba97ac660 100644 --- a/src/alge/cs_reconstruct_vector_gradient_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -77,7 +77,7 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_idt][j]); atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_idt][j])); - } + } } @@ -116,8 +116,9 @@ _compute_reconstruct_v_b_face1_v2(cs_lnum_t n_b_faces, pfac = inc*coefav[f_idt][i]; - for (cs_lnum_t k = 0; k < 3; k++) + for (cs_lnum_t k = 0; k < 3; k++){ pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } pfac -= pvar[c_id][i]; @@ -130,15 +131,16 @@ _compute_reconstruct_v_b_face1_v2(cs_lnum_t n_b_faces, rfac += coefbv[f_idt][i][k] * vecfac; } - for (cs_lnum_t j = 0; j < 3; j++) - atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_idt][j]); + for (cs_lnum_t j = 0; j < 3; j++){ + atomicAdd(&grad[c_id][i][j], 
(pfac + rfac) * b_f_face_normal[f_idt][j]); + } } __global__ static void -_compute_reconstruct_v_b_face2_v2( cs_lnum_t n_cells, +_compute_reconstruct_correction_v2( cs_lnum_t n_cells, cs_lnum_t has_dc, const int *restrict c_disable_flag, const cs_real_t *restrict cell_f_vol, diff --git a/src/base/cs_base_cuda.cu b/src/base/cs_base_cuda.cu index 8acf641015..7bcbb7f460 100644 --- a/src/base/cs_base_cuda.cu +++ b/src/base/cs_base_cuda.cu @@ -325,11 +325,13 @@ cs_cuda_copy_h2d_async(void *dst, /*----------------------------------------------------------------------------*/ void -cs_cuda_copy_d2h(void *dst, +_cs_cuda_copy_d2h(void *dst, const void *src, - size_t size) + size_t size, + const char* filename, + long line) { - CS_CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); + CS_CUDA_CHECK_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost), filename, line); } /*----------------------------------------------------------------------------*/ diff --git a/src/base/cs_base_cuda.h b/src/base/cs_base_cuda.h index 7b352cf47a..91dfdcd28c 100644 --- a/src/base/cs_base_cuda.h +++ b/src/base/cs_base_cuda.h @@ -286,9 +286,13 @@ cs_cuda_copy_h2d_async(void *dst, /*----------------------------------------------------------------------------*/ void -cs_cuda_copy_d2h(void *dst, +_cs_cuda_copy_d2h(void *dst, const void *src, - size_t size); + size_t size, + const char* filename, + long line); + +#define cs_cuda_copy_d2h(dst, src, size) _cs_cuda_copy_d2h(dst, src, size, __FILE__, __LINE__) /*----------------------------------------------------------------------------*/ /*! 
From de93fd17eeb35aa4fad82c66ce7c6789edca64f6 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 14 Nov 2023 12:37:52 +0100 Subject: [PATCH 26/70] Gather i_face v4 --- src/alge/cs_gradient_cuda.cu | 9 +- src/alge/cs_gradient_lsq_vector_gather_v2.cuh | 2 +- src/alge/cs_gradient_lsq_vector_gather_v3.cuh | 93 +++++++++++++++++-- 3 files changed, 93 insertions(+), 11 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 848746f47d..4a9659fe1e 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -988,7 +988,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // _init_rhs<<>> // (n_cells_ext, // rhs_d); - cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); + // cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); // _init_rhs_v2<<>> // (n_cells_ext*3*3, @@ -1038,7 +1038,6 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // pvar_d, // weight, // c_weight); - assert(cell_cells_idx); assert(cell_cells); assert(cell_f_cen); @@ -1057,7 +1056,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_gather_v3<<>> + _compute_rhs_lsq_v_i_face_gather_v4<<>> (n_cells, cell_cells_idx, cell_cells, @@ -1108,7 +1107,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - _compute_rhs_lsq_v_b_face_gather_v3<<>> + _compute_rhs_lsq_v_b_face_gather_v2<<>> (m->n_b_cells, cell_b_faces_idx, cell_b_faces, @@ -1218,6 +1217,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, printf("\n"); + free(pvar_copy); + if (_pvar_d != NULL) CS_CUDA_CHECK(cudaFree(_pvar_d)); if (_coefa_d != NULL) diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh index ef05c66028..0bfeca2461 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -124,7 +124,7 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t n_b_cells, cs_lnum_t s_id = cell_b_faces_idx[c_id]; 
cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; - __shared__ cs_real_t _rhs[3][3]; + cs_real_t _rhs[3][3]; auto temp_rhs = rhs[c_id]; _rhs[0][0]= temp_rhs[0][0]; _rhs[0][1]= temp_rhs[0][1]; _rhs[0][2]= temp_rhs[0][2]; diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh index db210dc394..1d96209de8 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -52,14 +52,14 @@ _compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t n_cells, cs_lnum_t s_id = cell_cells_idx[c_id1]; cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; - __shared__ cs_real_t _rhs[256*3*3]; + __shared__ cs_real_t _rhs[256][3][3]; for(cs_lnum_t i = 0; i < 3; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - _rhs[lindex + (i*3+j)*256] = rhs[c_id1][i][j]; + _rhs[lindex][i][j] = rhs[c_id1][i][j]; } } - __syncthreads(); + // __syncthreads(); auto _pvar1 = pvar[c_id1]; auto _cell_f_cen1 = cell_f_cen[c_id1]; @@ -92,15 +92,96 @@ _compute_rhs_lsq_v_i_face_gather_v3(cs_lnum_t n_cells, pfac = (_pvar2[i] - _pvar1[i]) * ddc; for(cs_lnum_t j = 0; j < 3; j++){ fctb[j] = dc[j] * pfac; - _rhs[lindex + (i*3+j)*256] += _weight * fctb[j]; + _rhs[lindex][i][j] += _weight * fctb[j]; } } } - __syncthreads(); + // __syncthreads(); + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id1][i][j] = _rhs[lindex][i][j]; + } + } +} + +__global__ static void +_compute_rhs_lsq_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + cs_real_t dc[3], 
fctb[3], ddc, _denom, _weight, _pond, pfac; + cs_lnum_t c_id2, f_id; + + // size_t c_id1 = c_id / (3*3); + // size_t i = (c_id / 3) % 3; + // size_t j = c_id % 3; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = 0.0; + } + } + // __syncthreads(); + auto _pvar1 = pvar[c_id1]; + + auto _cell_f_cen1 = cell_f_cen[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + + auto _cell_f_cen2 = cell_f_cen[c_id2]; + + dc[0] = _cell_f_cen2[0] - _cell_f_cen1[0]; + dc[1] = _cell_f_cen2[1] - _cell_f_cen1[1]; + dc[2] = _cell_f_cen2[2] - _cell_f_cen1[2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. 
- _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } + + auto _pvar2 = pvar[c_id2]; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (_pvar2[i] - _pvar1[i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs[lindex][i][j] += _weight * fctb[j]; + } + } + + } + // __syncthreads(); for(cs_lnum_t i = 0; i < 3; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - rhs[c_id1][i][j] = _rhs[lindex + (i*3+j)*256]; + rhs[c_id1][i][j] = _rhs[lindex][i][j]; } } } From 012722a9ad558330501d16df95262f3afe563843 Mon Sep 17 00:00:00 2001 From: Florian Lemaitre Date: Wed, 15 Nov 2023 11:21:40 +0100 Subject: [PATCH 27/70] conflict avoiding add --- src/alge/cs_gradient_cuda.cu | 118 ++++++++-------- src/alge/cs_gradient_cuda.cuh | 185 +++++++++++++++++++++++++ src/alge/cs_gradient_lsq_vector.cuh | 61 ++++++++ src/alge/cs_gradient_lsq_vector_v2.cuh | 55 ++++++++ src/alge/cs_gradient_lsq_vector_v3.cuh | 56 ++++++++ 5 files changed, 416 insertions(+), 59 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 939e703f81..ef43405bf4 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1011,32 +1011,32 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - // _compute_rhs_lsq_v_i_face<<>> - // (n_i_faces, - // i_face_cells, - // cell_f_cen, - // rhs_d, - // pvar_d, - // weight, - // c_weight); - - // _compute_rhs_lsq_v_i_face_v2<<>> - // (n_i_faces, - // i_face_cells_1d, - // cell_f_cen_1d, - // rhs_test_d, - // pvar_d, - // weight, - // c_weight); + _compute_rhs_lsq_v_i_face_cf<<>> + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); - // _compute_rhs_lsq_v_i_face_v3<<>> - // (n_i_faces*3*3, - // i_face_cells, - // cell_f_cen, - // rhs_d, - // pvar_d, - // weight, - // c_weight); + //_compute_rhs_lsq_v_i_face_v2<<>> + // (n_i_faces, + // i_face_cells_1d, + // cell_f_cen_1d, + // rhs_test_d, + // pvar_d, + // weight, + // c_weight); + + // 
_compute_rhs_lsq_v_i_face_v3cf<<>> + // (n_i_faces*3*3, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); assert(cell_cells_idx); assert(cell_cells); @@ -1056,17 +1056,17 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_gather_v2<<>> - (n_cells, - cell_cells_idx, - cell_cells, - cell_i_faces, - cell_i_faces_sgn, - cell_f_cen, - rhs_d, - pvar_d, - weight, - c_weight); + // _compute_rhs_lsq_v_i_face_gather_v2<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1082,17 +1082,17 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - // _compute_rhs_lsq_v_b_face<<>> - // (m->n_b_faces, - // b_face_cells, - // cell_f_cen, - // b_face_normal, - // rhs_d, - // pvar_d, - // b_dist, - // coefb_d, - // coefa_d, - // inc); + _compute_rhs_lsq_v_b_face<<>> + (m->n_b_faces, + b_face_cells, + cell_f_cen, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); // _compute_rhs_lsq_v_b_face_gather<<>> // (m->n_b_cells, @@ -1107,18 +1107,18 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - _compute_rhs_lsq_v_b_face_gather_v2<<>> - (m->n_b_cells, - cell_b_faces_idx, - cell_b_faces, - b_cells, - b_face_normal, - rhs_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, - inc); + //_compute_rhs_lsq_v_b_face_gather_v2<<>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); // _compute_rhs_lsq_v_b_face_v2<<>> // (m->n_b_faces, diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index e5bc216202..7066f59291 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -94,3 +94,188 @@ 
__device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], out[1] = inverse_norm * in[1]; out[2] = inverse_norm * in[2]; } + + +template +__device__ uint32_t _conflict_mask(uint32_t mask, V v) noexcept { +#if __CUDA_ARCH__ >= 700 + return __match_any_sync(mask, v); +#else + uint32_t lanemask_eq = 1u << (threadIdx.x % 32); + if (!(mask & lanemask_eq)) + return 0; + uint32_t ref, ballot; + int leader; + goto entry; +loop: + mask &= ~ballot; +entry: + leader = __ffs(mask) - 1; + ref = __shfl_sync(mask, v, leader); + ballot = __ballot_sync(mask, v == ref); + if (!(ballot & lanemask_eq)) + goto loop; + return ballot; +#endif +} + +template +__device__ bool _reduce_add(uint32_t mask, uint32_t peers, T& v) noexcept { + int laneid = threadIdx.x % 32; + uint32_t lanemask_lt = (1u << laneid) - 1; + uint32_t lanemask_gt = -2u << laneid; + int rank = __popc(peers & lanemask_lt); + bool is_leader = rank == 0; + + peers &= lanemask_gt; + while (__any_sync(mask, peers)) { + int next = __ffs(peers); + + auto tmp = v.shuffle(mask, next - 1); + if (next) { + v.add(tmp); + } + + peers &= __ballot_sync(mask, !(rank & 1)); + + rank >>= 1; + } + + return is_leader; +} + + +template +class AtomicCell { + private: + T value = {}; + public: + using inner_type = T; + public: + __device__ AtomicCell() noexcept = default; + __device__ AtomicCell(T value) noexcept : value(value) {} + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + value += other.value; + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + atomicAdd(&value, other.value); + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous = *this; + *this = other; + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + return AtomicCell(atomicExch(&value, other.value)); + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept { + 
return AtomicCell(__shfl_sync(mask, value, laneid)); + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ inner_type& operator*() noexcept { + return value; + } + __device__ inner_type const& operator*() const noexcept { + return value; + } + __device__ inner_type* operator->() noexcept { + return &value; + } + __device__ inner_type const* operator->() const noexcept { + return &value; + } + __device__ inner_type& get() noexcept { + return value; + } + __device__ inner_type const& get() const noexcept { + return value; + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; + +template +class AtomicCell { + private: + AtomicCell data[Head]; + public: + using inner_type = typename AtomicCell::inner_type[Head]; + public: + __device__ AtomicCell() noexcept = default; + __device__ void add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].add(other.data[i]); + } + } + __device__ void atomic_add(const AtomicCell&restrict other) restrict noexcept { + for (size_t i = 0; i < Head; ++i) { + data[i].atomic_add(other.data[i]); + } + } + __device__ AtomicCell exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 0; i < Head; ++i) { + previous.data[i] = data[i].exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell atomic_exchange(const AtomicCell&restrict other) restrict noexcept { + AtomicCell previous; + for (size_t i = 
0; i < Head; ++i) { + previous.data[i] = data[i].atomic_exchange(other.data[i]); + } + return previous; + } + __device__ AtomicCell shuffle(uint32_t mask, unsigned laneid) const noexcept { + AtomicCell shuffled; + for (size_t i = 0; i < Head; ++i) { + shuffled.data[i] = data[i].shuffle(mask, laneid); + } + return shuffled; + } + __device__ uint32_t conflict_mask(uint32_t mask) const noexcept { + return _conflict_mask(mask, (uintptr_t)this); + } + __device__ bool reduce_add(uint32_t mask, uint32_t peers) noexcept { + return _reduce_add(mask, peers, *this); + } + __device__ void conflict_free_add(uint32_t mask, AtomicCell other) noexcept { + uint32_t peers = conflict_mask(mask); + if (other.reduce_add(mask, peers)) { + atomic_add(other); + } + } + __device__ AtomicCell& operator[](size_t i) noexcept { + return data[i]; + } + __device__ AtomicCell const& operator[](size_t i) const noexcept { + return data[i]; + } + __device__ inner_type& get() noexcept { + return reinterpret_cast(*this); + } + __device__ inner_type const& get() const noexcept { + return reinterpret_cast(*this); + } + static __device__ AtomicCell& ref(inner_type& r) noexcept { + return reinterpret_cast(r); + } + static __device__ AtomicCell const& ref(inner_type const& r) noexcept { + return reinterpret_cast(r); + } +}; diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index e79678b4ee..5fc663fa1b 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -140,6 +140,67 @@ _compute_rhs_lsq_v_i_face(cs_lnum_t size, } } +__global__ static void +_compute_rhs_lsq_v_i_face_cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], 
ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + _rhs1[i][j].get() = _weight2 * fctb[j]; + _rhs2[i][j].get() = _weight1 * fctb[j]; + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + } + } + +#if 1 + Cell::ref(rhs[c_id1]).conflict_free_add(-1u, _rhs1); + Cell::ref(rhs[c_id2]).conflict_free_add(-1u, _rhs2); +#else + Cell::ref(rhs[c_id1]).atomic_add(_rhs1); + Cell::ref(rhs[c_id2]).atomic_add(_rhs2); +#endif +} + __global__ static void _compute_rhs_lsq_v_b_neighbor(cs_lnum_t size, const cs_lnum_t *restrict cell_cells_idx, diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh index 7ca3800542..ad634fba90 100644 --- a/src/alge/cs_gradient_lsq_vector_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -87,6 +87,61 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t size, } } +__global__ static void +_compute_rhs_lsq_v_i_face_v2cf(cs_lnum_t size, + const cs_lnum_t *restrict i_face_cells, + const cs_real_t *restrict cell_f_cen, + cs_real_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + 
if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id*2]; + c_id2 = i_face_cells[f_id*2 + 1]; + + dc[0] = cell_f_cen[c_id2*3] - cell_f_cen[c_id1*3]; + dc[1] = cell_f_cen[c_id2*3 + 1] - cell_f_cen[c_id1*3 + 1]; + dc[2] = cell_f_cen[c_id2*3 + 2] - cell_f_cen[c_id1*3 + 2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + Cell _rhs1, _rhs2; + + for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + //_rhs1[i][j].get() += _weight2 * fctb[j]; + //_rhs2[i][j].get() += _weight1 * fctb[j]; + atomicAdd(&rhs[c_id1*3*3 + i*3 + j], _weight2 * fctb[j]); + atomicAdd(&rhs[c_id2*3*3 + i*3 + j], _weight1 * fctb[j]); + } + } + //reinterpret_cast(&rhs[c_id1*3*3][0][0])->atomic_add(_rhs1); + //reinterpret_cast(&rhs[c_id2*3*3][0][0])->atomic_add(_rhs2); +} + __global__ static void _compute_rhs_lsq_v_b_face_v2(cs_lnum_t size, const cs_lnum_t *restrict b_face_cells, diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh index 85dfe345c9..7906663036 100644 --- a/src/alge/cs_gradient_lsq_vector_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_v3.cuh @@ -77,6 +77,62 @@ _compute_rhs_lsq_v_i_face_v3(cs_lnum_t size, //} } +__global__ static void +_compute_rhs_lsq_v_i_face_v3cf(cs_lnum_t size, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict cell_f_cen, + cs_real_33_t *restrict rhs, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict weight, + const cs_real_t *restrict c_weight) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + 
+ if(f_id >= size){ + return; + } + cs_real_t dc[3], fctb[3], ddc, _weight1, _weight2, _denom, _pond, pfac; + cs_lnum_t c_id1, c_id2; + + size_t f_id1 = f_id / (3*3); + size_t i = (f_id / 3) % 3; + size_t j = f_id % 3; + + c_id1 = i_face_cells[f_id1][0]; + c_id2 = i_face_cells[f_id1][1]; + + dc[0] = cell_f_cen[c_id2][0] - cell_f_cen[c_id1][0]; + dc[1] = cell_f_cen[c_id2][1] - cell_f_cen[c_id1][1]; + dc[2] = cell_f_cen[c_id2][2] - cell_f_cen[c_id1][2]; + + ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id1]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; + } + + using Cell = AtomicCell; + + //for(cs_lnum_t i = 0; i < 3; i++){ + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + //for(cs_lnum_t j = 0; j < 3; j++){ + fctb[j] = dc[j] * pfac; + Cell::ref(rhs[c_id1][i][j]).conflict_free_add(-1u, Cell::ref(_weight2 * fctb[j])); + Cell::ref(rhs[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(_weight1 * fctb[j])); + //atomicAdd(&rhs[c_id1][i][j], _weight2 * fctb[j]); + //atomicAdd(&rhs[c_id2][i][j], _weight1 * fctb[j]); + //} + //} +} + __global__ static void _compute_gradient_lsq_v_v5(cs_lnum_t size, cs_real_t *restrict gradv, From 49d88967697bcc1916c94751bf11f812f6119094 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Wed, 15 Nov 2023 14:57:09 +0100 Subject: [PATCH 28/70] ready to merge with gather version --- src/alge/cs_gradient_cuda.cu | 108 ++++++++++--- .../cs_reconstruct_vector_gradient_gather.cuh | 104 +++++++++--- ..._reconstruct_vector_gradient_gather_v2.cuh | 152 ++++++++++++++++++ ...reconstruct_vector_gradient_scatter_v2.cuh | 3 +- 4 files changed, 315 insertions(+), 52 deletions(-) create mode 100644 src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 97734906dd..22ae43b477 
100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -82,6 +82,7 @@ #include "cs_gradient_priv.h" #include "cs_reconstruct_vector_gradient_scatter_v2.cuh" #include "cs_reconstruct_vector_gradient_gather.cuh" +#include "cs_reconstruct_vector_gradient_gather_v2.cuh" /*! \cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -1502,8 +1503,7 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, __global__ static void -_compute_reconstruct_v_b_face1(cs_lnum_t size, - const cs_lnum_t *restrict b_group_index, +_compute_reconstruct_v_b_face(cs_lnum_t size, const bool *coupled_faces, cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, @@ -1522,7 +1522,7 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, return; } cs_lnum_t c_id; - cs_real_t pond, ktpond, pfac, rfac, vecfac; + cs_real_t pfac, rfac, vecfac; // if (coupled_faces[f_id * cpl_stride]) // return; @@ -1533,8 +1533,9 @@ _compute_reconstruct_v_b_face1(cs_lnum_t size, pfac = inc*coefav[f_id][i]; - for (cs_lnum_t k = 0; k < 3; k++) + for (cs_lnum_t k = 0; k < 3; k++){ pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } pfac -= pvar[c_id][i]; @@ -1595,7 +1596,7 @@ _compute_reconstruct_correction(cs_lnum_t size, for (cs_lnum_t j = 0; j < 3; j++){ for (cs_lnum_t k = 0; k < 3; k++){ - atomicAdd(&grad[c_id][i][j], corr_grad_lin[c_id][j][k] * gradpa[k]); + grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; } } } @@ -1737,13 +1738,21 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); assert(cell_i_faces); - - short int *restrict cell_i_faces_sgn; CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); - // = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); - // printf("Après allocation\n"); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t 
*restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + assert(m->b_cells); + assert(madj->cell_b_faces); + assert(madj->cell_b_faces_idx); + assert(b_cells); + assert(cell_b_faces); + assert(cell_b_faces_idx); const cs_real_3_t *restrict cell_cen; // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); @@ -1841,22 +1850,39 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // i_f_face_normal); // printf("Avant les assert dans gradient_cuda.cu\n"); - assert(cell_cells_idx); - assert(cell_cells); - assert(weight); + // assert(cell_cells_idx); + // assert(cell_cells); + // assert(weight); // assert(cell_i_faces); // assert(cell_i_faces_sgn); // printf("n_i_faces = %d\n", n_i_faces); // printf("n_cells = %d\n", n_cells); - for(int i = 0; i< n_i_faces; i++){ + // for(int i = 0; i< n_i_faces; i++){ // printf("i = %d && weight = %f \n", i, fvq->weight[i]); // printf("i = %d && c_id2 = %d \n", i, madj->cell_cells[i]); // printf("i = %d && s_id = %d \n", i, madj->cell_cells_idx[i]); - // printf("i = %d && f_id = %d \n", i, madj->cell_i_faces_sgn[i]); - } + // printf("i = %d && f_id = %d \n", i, madj->cell_i_faces_sgn[i]); + // } // printf("Après les assert dans gradient_cuda.cu\n"); - _compute_reconstruct_v_i_face_gather<<>> - ( n_cells, + // _compute_reconstruct_v_i_face_gather<<>> + // ( n_cells, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // n_i_faces); + + + _compute_reconstruct_v_i_face_gather_v2<<>> + ( n_cells * 3 * 3, i_face_cells, pvar_d, weight, @@ -1900,9 +1926,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 
2------------------------------------------- - // _compute_reconstruct_v_b_face1<<>> + // _compute_reconstruct_v_b_face<<>> // ( n_b_faces, - // b_group_index, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1916,9 +1941,23 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); - _compute_reconstruct_v_b_face1_v2<<>> - ( n_b_faces * 3, - b_group_index, + // _compute_reconstruct_v_b_face_v2<<>> + // ( n_b_faces * 3, + // coupled_faces_d, + // cpl_stride, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + + + _compute_reconstruct_v_b_face_gather<<>> + ( m->n_b_cells * 3, coupled_faces_d, cpl_stride, coefb_d, @@ -1929,8 +1968,27 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, r_grad_d, grad_d, b_f_face_normal, - b_face_cells); - + b_cells, + cell_b_faces, + cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v2<<>> + // ( m->n_b_cells, + // coupled_faces_d, + // cpl_stride, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); // _compute_reconstruct_correction<<>> @@ -1989,7 +2047,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_1, b_faces_2)); - printf("B_faces_1 = %f\t", msec*1000.f); + printf("B_faces = %f\t", msec*1000.f); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces_2, b_faces_3)); diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh index b8d9f923c8..8ea22126c7 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -50,22 +50,14 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, cs_lnum_t c_id2, f_id; cs_real_t pond, ktpond, pfaci, 
pfacj, rfac; - // if(cell_cells_idx) printf("erreur dans le kernel"); - // if(cell_cells) printf("erreur dans le kernel"); - // if(cell_i_faces) printf("erreur dans le kernel"); - // if(cell_i_faces_sgn) printf("erreur dans le kernel"); - cs_lnum_t s_id = cell_cells_idx[c_id1]; cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; - // printf("s_id = %d\t",s_id); - // printf("e_id = %d\t",e_id); for(cs_lnum_t index = s_id; index < e_id; index++){ c_id2 = cell_cells[index]; f_id = cell_i_faces[index]; - // pond = weight[f_id]; pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; ktpond = (c_weight == NULL) ? pond : // no cell weighting @@ -73,25 +65,87 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + } + } +} + + + + +__global__ static void +_compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; 
+ cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + + // if (coupled_faces[f_id * cpl_stride]) + // return; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; for (cs_lnum_t i = 0; i < 3; i++) { - pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - // pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - - /* Reconstruction part */ - rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); - - for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += (pfaci + rfac) * i_f_face_normal[f_id][j]; - // grad[c_id1][i][j] -= (pfacj + rfac) * i_f_face_normal[f_id][j]; + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + } + } - // grad[0][0][0] = 1. + f_id; - // grad[c_id2][0][0] = 1. + pond; } -} +} \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh new file mode 100644 index 0000000000..a8f474c311 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -0,0 +1,152 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. 
+ + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +__global__ static void +_compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + const cs_lnum_t n_i_faces) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (3*3); + size_t i = (c_id1 / 3) % 3; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + grad[c_idx][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } +} + + + + +__global__ static void +_compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + + // if (coupled_faces[f_id * cpl_stride]) + // return; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; 
j < 3; j++){ + grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; + } + + } + } +} \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index cba97ac660..67f1717561 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -83,8 +83,7 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, __global__ static void -_compute_reconstruct_v_b_face1_v2(cs_lnum_t n_b_faces, - const cs_lnum_t *restrict b_group_index, +_compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, const bool *coupled_faces, cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, From b099b2bba3912484437a837bb6837c24b928cf4f Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Wed, 15 Nov 2023 15:10:12 +0100 Subject: [PATCH 29/70] fix bug --- src/alge/cs_gradient_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 22ae43b477..db13be63d5 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1956,8 +1956,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); - _compute_reconstruct_v_b_face_gather<<>> - ( m->n_b_cells * 3, + _compute_reconstruct_v_b_face_gather<<>> + ( m->n_b_cells, coupled_faces_d, cpl_stride, coefb_d, From 8b1ceaf9b2e187f37cf7880a02bbb47380793387 Mon Sep 17 00:00:00 2001 From: Florian Lemaitre Date: Wed, 15 Nov 2023 15:48:57 +0100 Subject: [PATCH 30/70] fix merges --- src/alge/cs_gradient.cxx | 26 ++++++++------ src/alge/cs_gradient_cuda.cu | 66 +++++++++++++++++------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 73dd991ff3..b09052a155 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -198,7 +198,7 @@ const cs_e2n_sum_t _e2n_sum_type = 
CS_E2N_SUM_STORE_THEN_GATHER; /* Strided LSQ gradient variant */ -static int _use_legacy_strided_lsq_gradient = false; +static int _use_legacy_strided_lsq_gradient = true; /*============================================================================ * Private function definitions @@ -6937,7 +6937,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, const cs_real_t *restrict c_weight, cs_real_33_t *restrict gradv) { - const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells = m->n_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; @@ -7155,9 +7155,18 @@ _lsq_vector_gradient(const cs_mesh_t *m, gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; - - for (int j =0; j < 3; ++j) { - auto cpu = gradv_cpu[c_id][i][j]; + } + } + // #ifdef NDEBUG + // #endif + // #endif + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; auto cuda = gradv[c_id][i][j]; if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { @@ -7166,12 +7175,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, } } } -// #ifdef NDEBUG -// #endif -// #endif -stop = std::chrono::high_resolution_clock::now(); -elapsed = std::chrono::duration_cast(stop - start); -// printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); /* Compute gradient on boundary cells */ /*------------------------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index ef326c41f5..d02ff326b2 100644 --- a/src/alge/cs_gradient_cuda.cu +++ 
b/src/alge/cs_gradient_cuda.cu @@ -1205,37 +1205,37 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - // printf("lsq Kernels :"); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); - // printf("Kernels execution time in us: \t"); - // printf("Init = %f\t", msec*1000.f); + printf("lsq Kernels :"); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); - // printf("I_faces = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + printf("I_faces = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); - // printf("Halo = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + printf("Halo = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); - // printf("B_faces = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); - // printf("Gradient = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); - // printf("Total kernel = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + printf("Total kernel = %f\t", msec*1000.f); - // msec = 0.0f; - // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); - // printf("Total = %f\t", msec*1000.f); + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total 
= %f\t", msec*1000.f); - // printf("\n"); + printf("\n"); free(pvar_copy); @@ -1611,15 +1611,15 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // = (const int *restrict)cs_get_device_ptr_const_pf(fvq->c_disable_flag); - _sync_or_copy_real_3_h2d(pvar, n_cells_ext, device_id, stream, + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, + _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, &r_grad_d, &_r_grad_d); - _sync_or_copy_real_3_h2d(coefav, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_33_h2d(coefbv, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); @@ -1628,10 +1628,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); /* Initialization */ - _init_real_33_array<<>> - (n_cells_ext*3*3, grad_d); - // cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_33_t)); + cudaMemset(grad_d, 0, n_cells * sizeof(cs_real_33_t)); CS_CUDA_CHECK(cudaEventRecord(init, stream)); @@ -1678,7 +1676,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // printf("Après les assert dans gradient_cuda.cu\n"); // _compute_reconstruct_v_i_face_gather<<>> // ( n_cells, - // i_face_cells, // pvar_d, // weight, // c_weight, @@ -1689,8 +1686,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_cells_idx, // cell_cells, // cell_i_faces, - // cell_i_faces_sgn, - // n_i_faces); + // cell_i_faces_sgn); _compute_reconstruct_v_i_face_gather_v2<<>> @@ -1732,7 +1728,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); - // _sync_or_copy_real_33_h2d(r_grad, n_cells_ext, device_id, stream, + // _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, // &r_grad_d, &_r_grad_d); 
CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); From 53832aa965ccec1c441d644778ca9a5d9e26377e Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 21 Nov 2023 16:06:38 +0100 Subject: [PATCH 31/70] Clean code --- src/alge/cs_gradient.cxx | 56 ++++- src/alge/cs_gradient_cuda.cu | 236 ++++++++---------- src/alge/cs_gradient_cuda.cuh | 23 ++ src/alge/cs_gradient_lsq_vector_gather.cuh | 64 +++++ src/alge/cs_gradient_lsq_vector_gather_v3.cuh | 16 +- src/alge/cs_gradient_lsq_vector_v2.cuh | 24 +- src/alge/cs_gradient_lsq_vector_v3.cuh | 6 +- src/alge/cs_gradient_priv.h | 2 +- 8 files changed, 263 insertions(+), 164 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 0a9eef0d70..54739a4f57 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -46,6 +46,7 @@ #include #include #include +#include #if defined(HAVE_MPI) #include @@ -82,7 +83,6 @@ #include "cs_prototypes.h" #include "cs_timer.h" #include "cs_timer_stats.h" -#include /*---------------------------------------------------------------------------- * Header for the current file @@ -6892,9 +6892,28 @@ _lsq_vector_gradient(const cs_mesh_t *m, bool accel = false; #endif - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb_s); + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda, *gradv_cpu; + bool COMPUTE_CUDA, COMPUTE_CPU, RES_CPU, PERF, ACCURACY; + + COMPUTE_CUDA = accel; + RES_CPU = !accel; + +#if defined(DEBUG) + COMPUTE_CPU = true; + PERF = true; + ACCURACY = true; +#elif defined(NDEBUG) + COMPUTE_CPU = true; + RES_CPU = true; + PERF = false; + ACCURACY = false; +#else + COMPUTE_CPU = false; + PERF = false; + ACCURACY = false; +#endif BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); @@ -6903,9 +6922,8 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute Right-Hand Side */ /*-------------------------*/ -// #ifdef NDEBUG -// #if defined(HAVE_CUDA) -// 
#endif +#if defined(HAVE_CUDA) +if(COMPUTE_CUDA){ start = std::chrono::high_resolution_clock::now(); cs_lsq_vector_gradient_cuda( m, @@ -6918,13 +6936,14 @@ _lsq_vector_gradient(const cs_mesh_t *m, pvar, c_weight, cocg, + cocgb_s, gradv, rhs_cuda); stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); -// #ifdef NDEBUG -// #else -// #endif +} // end if COMPUTE_CUDA +#endif +if(COMPUTE_CPU){ start = std::chrono::high_resolution_clock::now(); # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { @@ -7074,9 +7093,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, + rhs[c_id][i][2] * cocg[c_id][2]; } } - // #ifdef NDEBUG - // #endif - // #endif +} // end if COMPUTE_CPU + +if(ACCURACY){ stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); #pragma omp parallel for @@ -7092,8 +7111,14 @@ _lsq_vector_gradient(const cs_mesh_t *m, } } } +} + +if(PERF) printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); +if(RES_CPU){ + memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); +} /* Compute gradient on boundary cells */ /*------------------------------------*/ @@ -7273,10 +7298,16 @@ _lsq_strided_gradient(const cs_mesh_t *m, BFT_MALLOC(rhs, n_cells_ext, grad_t); cs_array_real_fill_zero(n_cells_ext*stride*3, (cs_real_t *)rhs); +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? 
true : false; +#else + bool accel = false; +#endif + cs_cocg_6_t *restrict cocgb = NULL; cs_cocg_6_t *restrict cocg = NULL; - _get_cell_cocg_lsq(m, halo_type, false, fvq, &cocg, &cocgb); + _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb); /* Contribution from interior faces -------------------------------- */ @@ -7539,7 +7570,6 @@ _lsq_strided_gradient(const cs_mesh_t *m, _math_6_inv_cramer_sym_in_place(cocg[c_id]); } /* loop on boundary cells */ - /* Compute gradient */ /*------------------*/ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index d02ff326b2..8b573de26d 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -544,7 +544,8 @@ _sync_or_copy_real_h2d(const T *val_h, /* Compute gridsize*/ -unsigned int get_gridsize(unsigned int size, unsigned int blocksize){ +unsigned int +get_gridsize(unsigned int size, unsigned int blocksize){ unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); return gridsize; @@ -861,6 +862,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs) { @@ -869,10 +871,6 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_i_faces = m->n_i_faces; - cs_real_t *pvar_copy; - pvar_copy = (cs_real_t *) malloc(n_cells * sizeof(cs_real_3_t)); - - memcpy(pvar_copy, pvar, n_cells*sizeof(cs_real_3_t)); int device_id; cudaGetDevice(&device_id); @@ -896,48 +894,20 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cs_real_33_t *rhs_d; CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_33_t))); - cs_real_33_t *rhs_d_v0; - CS_CUDA_CHECK(cudaMalloc(&rhs_d_v0, n_cells_ext * sizeof(cs_real_33_t))); - cs_real_t *rhs_test_d; - CS_CUDA_CHECK(cudaMalloc(&rhs_test_d, n_cells_ext * sizeof(cs_real_33_t))); - - cs_real_t *gradv_test_d; - 
CS_CUDA_CHECK(cudaMalloc(&gradv_test_d, n_cells_ext * sizeof(cs_real_33_t))); - cs_cocg_6_t *cocg_d; - CS_CUDA_CHECK(cudaMalloc(&cocg_d, n_cells_ext * sizeof(cs_cocg_6_t))); cs_real_33_t *grad_d = NULL; CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_33_t))); - cs_cuda_copy_h2d(cocg_d, cocg, n_cells_ext * sizeof(cs_cocg_6_t)); - void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, *_cell_cells_idx_d = NULL; const cs_real_3_t *pvar_d = NULL, *coefa_d = NULL; const cs_real_33_t *coefb_d = NULL; const cs_lnum_t *cell_cells_idx_d = NULL; - cs_real_t *pvar_d_1d; - CS_CUDA_CHECK(cudaMalloc(&pvar_d_1d, n_cells * sizeof(cs_real_3_t))); - cs_cuda_copy_h2d(pvar_d_1d, pvar_copy, n_cells * sizeof(cs_real_3_t)); + // cs_cuda_copy_h2d(rhs_d, rhs, n_cells * sizeof(cs_real_33_t)); unsigned int blocksize = 256; - unsigned int gridsize_b - = (unsigned int)ceil((double)m->n_b_cells / blocksize); - unsigned int gridsize_if - = (unsigned int)ceil((double)m->n_i_faces / blocksize); - unsigned int gridsize_if_bis - = (unsigned int)ceil((double)(m->n_i_faces*3*3) / blocksize); - unsigned int gridsize_bf - = (unsigned int)ceil((double)m->n_b_faces / blocksize); - unsigned int gridsize = (unsigned int)ceil((double)m->n_cells / blocksize); - unsigned int gridsize_init - = (unsigned int)ceil((double)m->n_cells*3*3 / blocksize); - unsigned int gridsize_ext - = (unsigned int)ceil((double)n_cells_ext / blocksize); - unsigned int gridsize_ext_1d - = (unsigned int)ceil((double)(n_cells_ext*3*3) / blocksize); const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); @@ -974,12 +944,16 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); const cs_real_3_t *restrict b_face_normal = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t 
*restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); const cs_real_t *restrict cell_f_cen_1d = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); const cs_lnum_t *restrict i_face_cells_1d = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + cs_lnum_t stride = 3; + // printf("n_i_thread:%d\tn_i_groups:%d\tn_cells%d\n", n_i_threads, n_i_groups, n_cells); _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, @@ -992,52 +966,58 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - // _init_rhs<<>> + // _init_rhs<<>> // (n_cells_ext, // rhs_d); - // cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); + cudaMemset(rhs_d, 0, n_cells_ext*sizeof(cs_real_33_t)); - // _init_rhs_v2<<>> + // _init_rhs_v2<<>> // (n_cells_ext*3*3, - // rhs_test_d); + // rhs_d); - // _init_rhs_v3<<>> + // _init_rhs_v3<<>> // (n_cells_ext*3, - // rhs_test_d); + // rhs_d); CS_CUDA_CHECK(cudaEventRecord(init, stream)); - // bool status = false; - // cs_lnum_t count_nan = 0, count_inf = 0; - // _compute_rhs_lsq_v_i_face_v0<<>> + // _compute_rhs_lsq_v_i_face_v0<<>> // (n_i_faces, // i_face_cells, // cell_f_cen, - // rhs_d_v0, + // rhs_d, // pvar_d, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_cf<<>> - (n_i_faces, - i_face_cells, - cell_f_cen, - rhs_d, - pvar_d, - weight, - c_weight); - - //_compute_rhs_lsq_v_i_face_v2<<>> - // (n_i_faces, - // i_face_cells_1d, - // cell_f_cen_1d, - // rhs_test_d, - // pvar_d, - // weight, - // c_weight); + // _compute_rhs_lsq_v_i_face_cf<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + // _compute_rhs_lsq_v_i_face<<>> + // (n_i_faces, + // i_face_cells, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); + + _compute_rhs_lsq_v_i_face_v2cf<<>> + (n_i_faces, + i_face_cells, + cell_f_cen, + rhs_d, + pvar_d, + weight, + c_weight); - // _compute_rhs_lsq_v_i_face_v3<<>> + // 
_compute_rhs_lsq_v_i_face_v3<<>> // (n_i_faces*3*3, // i_face_cells, // cell_f_cen, @@ -1045,13 +1025,13 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // pvar_d, // weight, // c_weight); - assert(cell_cells_idx); - assert(cell_cells); - assert(cell_f_cen); - assert(rhs_d); - assert(pvar_d); - assert(weight); - // _compute_rhs_lsq_v_i_face_gather<<>> + // assert(cell_cells_idx); + // assert(cell_cells); + // assert(cell_f_cen); + // assert(rhs_d); + // assert(pvar_d); + // assert(weight); + // _compute_rhs_lsq_v_i_face_gather<<>> // (n_cells, // cell_cells_idx, // cell_cells, @@ -1063,7 +1043,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - // _compute_rhs_lsq_v_i_face_gather_v2<<>> + // _compute_rhs_lsq_v_i_face_gather_v2<<>> // (n_cells, // cell_cells_idx, // cell_cells, @@ -1075,23 +1055,23 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // weight, // c_weight); - _compute_rhs_lsq_v_i_face_gather_v4<<>> - (n_cells, - cell_cells_idx, - cell_cells, - cell_i_faces, - cell_i_faces_sgn, - cell_f_cen, - rhs_d, - pvar_d, - weight, - c_weight); + // _compute_rhs_lsq_v_i_face_gather_v4<<>> + // (n_cells, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // cell_f_cen, + // rhs_d, + // pvar_d, + // weight, + // c_weight); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); if(halo_type == CS_HALO_EXTENDED && cell_cells_idx != NULL){ - _compute_rhs_lsq_v_b_neighbor<<>> + _compute_rhs_lsq_v_b_neighbor<<>> (n_cells, cell_cells_idx, cell_cells, @@ -1101,50 +1081,52 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - _compute_rhs_lsq_v_b_face<<>> - (m->n_b_faces, - b_face_cells, - cell_f_cen, - b_face_normal, - rhs_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, - inc); + // _compute_rhs_lsq_v_b_face<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_faces, + // b_face_cells, + // cell_f_cen, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // 
coefb_d, + // coefa_d, + // inc); - // _compute_rhs_lsq_v_b_face_gather<<>> + // _compute_rhs_lsq_v_b_face_gather_stride<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> // (m->n_b_cells, // cell_b_faces_idx, // cell_b_faces, // b_cells, - // b_face_normal, + // b_face_cog, + // cell_cen, // rhs_d, // pvar_d, - // b_dist, // coefb_d, - // coefa_d, + // coefa_d, + // cocg, + // cocgb, // inc); - //_compute_rhs_lsq_v_b_face_gather_v2<<>> - // (m->n_b_cells, - // cell_b_faces_idx, - // cell_b_faces, - // b_cells, - // b_face_normal, - // rhs_d, - // pvar_d, - // b_dist, - // coefb_d, - // coefa_d, - // inc); - - // _compute_rhs_lsq_v_b_face_v2<<>> + _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); + + // _compute_rhs_lsq_v_b_face_v2<<n_b_cells, blocksize), blocksize, 0, stream>>> // (m->n_b_faces, // b_face_cells, // cell_f_cen, // b_face_normal, - // rhs_test_d, + // rhs_d, // pvar_d, // b_dist, // coefb_d, @@ -1164,30 +1146,30 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // /* Compute gradient */ // /*------------------*/ - // _compute_gradient_lsq_v<<>> + // _compute_gradient_lsq_v<<>> // (n_cells, // grad_d, // rhs_d, - // cocg_d); + // cocg); - // _compute_gradient_lsq_v_v4<<>> + // _compute_gradient_lsq_v_v4<<>> // (n_cells, // grad_d, // rhs_d, - // cocg_d); + // cocg); - // _compute_gradient_lsq_v_v5<<>> + // _compute_gradient_lsq_v_v5<<>> // (n_cells*3*3, - // gradv_test_d, - // rhs_test_d, - // cocg_d); + // gradv_d, + // rhs_d, + // cocg); - _compute_gradient_lsq_v_v6<<>> + _compute_gradient_lsq_v_v6<<>> (n_cells*3*3, grad_d, rhs_d, - cocg_d); + cocg); CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); @@ -1237,7 +1219,6 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, printf("\n"); - free(pvar_copy); if (_pvar_d != NULL) 
CS_CUDA_CHECK(cudaFree(_pvar_d)); @@ -1247,19 +1228,10 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaFree(_coefb_d)); CS_CUDA_CHECK(cudaFree(rhs_d)); - CS_CUDA_CHECK(cudaFree(rhs_d_v0)); - CS_CUDA_CHECK(cudaFree(rhs_test_d)); - CS_CUDA_CHECK(cudaFree(cocg_d)); CS_CUDA_CHECK(cudaFree(grad_d)); - CS_CUDA_CHECK(cudaFree(gradv_test_d)); - CS_CUDA_CHECK(cudaFree(pvar_d_1d)); } -// cs_real_t results_precision(cs_real_t *cpu_result, cs_real_t *gpu_result, ) - - - __global__ static void _compute_reconstruct_v_i_face(cs_lnum_t size, diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index 7066f59291..5b1401bfe6 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -95,6 +95,29 @@ __device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], out[2] = inverse_norm * in[2]; } +__device__ cs_real_t cs_math_3_square_norm_cuda(const cs_real_t in[3]){ + cs_real_t norm = in[0]*in[0] + in[1]*in[1] + in[2]*in[2]; + return norm; +} + +__device__ void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ + cs_real_t in00 = in[1]*in[2] - in[4]*in[4]; + cs_real_t in01 = in[4]*in[5] - in[3]*in[2]; + cs_real_t in02 = in[3]*in[4] - in[1]*in[5]; + cs_real_t in11 = in[0]*in[2] - in[5]*in[5]; + cs_real_t in12 = in[3]*in[5] - in[0]*in[4]; + cs_real_t in22 = in[0]*in[1] - in[3]*in[3]; + + cs_real_t det_inv = 1. 
/ (in[0]*in00 + in[3]*in01 + in[5]*in02); + + in[0] = in00 * det_inv; + in[1] = in11 * det_inv; + in[2] = in22 * det_inv; + in[3] = in01 * det_inv; + in[4] = in12 * det_inv; + in[5] = in02 * det_inv; +} + template __device__ uint32_t _conflict_mask(uint32_t mask, V v) noexcept { diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index b057e6d3a0..2172a78691 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -135,3 +135,67 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t n_b_cells, } } } + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. 
/ cs_math_3_square_norm_cuda(dif); + + cocg[c_id][0] += dif[0]*dif[0]*ddif; + cocg[c_id][1] += dif[1]*dif[1]*ddif; + cocg[c_id][2] += dif[2]*dif[2]*ddif; + cocg[c_id][3] += dif[0]*dif[1]*ddif; + cocg[c_id][4] += dif[1]*dif[2]*ddif; + cocg[c_id][5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = coefav[f_id][kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += coefbv[f_id][ll][kk] * pvar[c_id][ll]; + } + + pfac = (var_f[kk] - pvar[c_id][kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs[c_id][kk][ll] += dif[ll] * pfac; + } + } + _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh index 1d96209de8..8c6a6efe17 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -214,15 +214,15 @@ _compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; - __shared__ cs_real_t _rhs[256*3*3]; + __shared__ cs_real_t _rhs[256][3][3]; for(cs_lnum_t i = 0; i < 3; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - _rhs[lindex + (i*3+j)*256] = rhs[c_id][i][j]; + _rhs[lindex][i][j] = rhs[c_id][i][j]; } } - __syncthreads(); + // __syncthreads(); auto _pvar1 = pvar[c_id]; @@ -249,16 +249,16 @@ _compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, + _coefbv[2][i] * _pvar1[2] - _pvar1[i]); - _rhs[lindex + (i*3)*256] += n_d_dist[0] * pfac; - _rhs[lindex + (i*3+1)*256] += n_d_dist[1] * pfac; - _rhs[lindex + (i*3+2)*256] += n_d_dist[2] * pfac; + _rhs[lindex][i][0] += n_d_dist[0] * pfac; + _rhs[lindex][i][1]+= n_d_dist[1] * pfac; + _rhs[lindex][i][2] += n_d_dist[2] * pfac; } } - __syncthreads(); + // __syncthreads(); for(cs_lnum_t i = 0; i < 3; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - rhs[c_id][i][j] = _rhs[lindex + (i*3+j)*256]; + rhs[c_id][i][j] = _rhs[lindex][i][j]; } } } diff --git 
a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh index b129d40315..1c9d222a41 100644 --- a/src/alge/cs_gradient_lsq_vector_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -30,8 +30,9 @@ __global__ static void _init_rhs_v2(cs_lnum_t n_cells_g, - cs_real_t *restrict rhs) + cs_real_33_t *restrict _rhs) { + cs_real_t *rhs = (cs_real_t *) _rhs; cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if (c_id >= n_cells_g) return; @@ -43,11 +44,12 @@ __global__ static void _compute_rhs_lsq_v_i_face_v2(cs_lnum_t n_i_faces, const cs_lnum_t *restrict i_face_cells, const cs_real_t *restrict cell_f_cen, - cs_real_t *restrict rhs, + cs_real_33_t *restrict _rhs, const cs_real_3_t *restrict pvar, const cs_real_t *restrict weight, const cs_real_t *restrict c_weight) { + cs_real_t *rhs = (cs_real_t *) _rhs; cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; if(f_id >= n_i_faces){ @@ -89,13 +91,16 @@ _compute_rhs_lsq_v_i_face_v2(cs_lnum_t n_i_faces, __global__ static void _compute_rhs_lsq_v_i_face_v2cf(cs_lnum_t size, - const cs_lnum_t *restrict i_face_cells, - const cs_real_t *restrict cell_f_cen, - cs_real_t *restrict rhs, + const cs_lnum_2_t *restrict _i_face_cells, + const cs_real_3_t *restrict _cell_f_cen, + cs_real_33_t *restrict _rhs, const cs_real_3_t *restrict pvar, const cs_real_t *restrict weight, const cs_real_t *restrict c_weight) { + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_lnum_t *i_face_cells = (cs_lnum_t *) _i_face_cells; + cs_real_t *cell_f_cen = (cs_real_t *) _cell_f_cen; cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; if(f_id >= size){ @@ -147,13 +152,14 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, const cs_lnum_t *restrict b_face_cells, const cs_real_3_t *restrict cell_f_cen, const cs_real_3_t *restrict b_face_normal, - cs_real_t *restrict rhs, + cs_real_33_t *restrict _rhs, const cs_real_3_t *restrict pvar, const cs_real_t *restrict b_dist, const cs_real_33_t *restrict coefbv, const cs_real_3_t 
*restrict coefav, const int inc) { + cs_real_t *rhs = (cs_real_t *) _rhs; cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; if(f_id >= n_b_faces){ @@ -189,10 +195,12 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, __global__ static void _compute_gradient_lsq_v_v2(cs_lnum_t n_cells_g, - cs_real_t *restrict gradv, - cs_real_t *restrict rhs, + cs_real_33_t *restrict _gradv, + cs_real_33_t *restrict _rhs, cs_cocg_6_t *restrict cocg) { + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if (c_id >= n_cells_g) return; diff --git a/src/alge/cs_gradient_lsq_vector_v3.cuh b/src/alge/cs_gradient_lsq_vector_v3.cuh index 9483f19975..135d0a2520 100644 --- a/src/alge/cs_gradient_lsq_vector_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_v3.cuh @@ -135,10 +135,12 @@ _compute_rhs_lsq_v_i_face_v3cf(cs_lnum_t size, __global__ static void _compute_gradient_lsq_v_v5(cs_lnum_t n_cells, - cs_real_t *restrict gradv, - cs_real_t *restrict rhs, + cs_real_t *restrict _gradv, + cs_real_t *restrict _rhs, cs_cocg_6_t *restrict cocg) { + cs_real_t *rhs = (cs_real_t *) _rhs; + cs_real_t *gradv = (cs_real_t *) _gradv; size_t c_id = blockIdx.x * blockDim.x + threadIdx.x; if (c_id >= n_cells) return; diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 77ef08399e..8ba24bbcff 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -120,6 +120,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict pvar, const cs_real_t *restrict c_weight, cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, cs_real_33_t *restrict gradv, cs_real_33_t *restrict rhs); @@ -148,5 +149,4 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*----------------------------------------------------------------------------*/ END_C_DECLS - #endif /* __CS_GRADIENT_CUDA_H__ */ From 30b68812d3c1aaadc16d9715be3a0f0704f5dd3c Mon Sep 17 00:00:00 2001 From: 
aneo-mderbane Date: Thu, 16 Nov 2023 09:44:03 +0100 Subject: [PATCH 32/70] ADD gather v2 --- src/alge/cs_gradient.cxx | 12 +++--- src/alge/cs_gradient_cuda.cu | 40 +++++++++--------- ..._reconstruct_vector_gradient_gather_v2.cuh | 42 +++++++++---------- ...reconstruct_vector_gradient_scatter_v2.cuh | 7 ++-- 4 files changed, 50 insertions(+), 51 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 54739a4f57..4a6586be87 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5603,7 +5603,7 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, if(COMPUTE_CUDA){ - printf("Compute with CUDA\n"); + printf("reconstruct Compute with CUDA\n"); if(PERF){ start = std::chrono::high_resolution_clock::now(); } @@ -5631,7 +5631,7 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } if(COMPUTE_CPU){ - printf("Compute with CPU\n"); + printf("reconstruct Compute with CPU\n"); BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); if(PERF){ @@ -5802,9 +5802,9 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, for (int j =0; j < 3; ++j) { auto cpu = grad_cpu[c_id][i][j]; auto cuda = grad[c_id][i][j]; - double err = (fabsl(cpu - cuda) / fmaxl(fabsl(cpu), 1e-6)); + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-6) { - printf("rec DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\n", c_id, i, j, cpu, cuda, cpu - cuda, err); + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); } } } @@ -5813,10 +5813,10 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, //Copy grad if(RES_CPU){ - printf("RESULTS CPU\n"); + printf("reconstruct RESULTS CPU\n"); memcpy(grad, grad_cpu, sizeof(cs_real_33_t) * n_cells_ext); }else{ - printf("RESULTS GPU\n"); + printf("reconstruct RESULTS GPU\n"); } // Free memory diff --git a/src/alge/cs_gradient_cuda.cu 
b/src/alge/cs_gradient_cuda.cu index 8b573de26d..e0f385825d 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1736,24 +1736,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); - _compute_reconstruct_v_b_face_gather<<>> - ( m->n_b_cells, - coupled_faces_d, - cpl_stride, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_cells, - cell_b_faces, - cell_b_faces_idx); - - - // _compute_reconstruct_v_b_face_gather_v2<<>> + // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coupled_faces_d, // cpl_stride, @@ -1769,6 +1752,23 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces, // cell_b_faces_idx); + + _compute_reconstruct_v_b_face_gather_v2<<>> + ( m->n_b_cells * 3, + coupled_faces_d, + cpl_stride, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_cells, + cell_b_faces, + cell_b_faces_idx); + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); // _compute_reconstruct_correction<<>> @@ -1810,7 +1810,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamDestroy(stream); if(PERF){ - printf("rec Kernels times:\n"); + printf("recconstruct Kernels times:\t"); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); @@ -1838,7 +1838,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, msec_tot = 0.0f; msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, i_faces)); - printf("Total kernel part 1= %f\t", msec*1000.f); + printf("reconstruct Total kernel part 1= %f\t", msec*1000.f); msec_tot = msec; msec = 0.0f; diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh index a8f474c311..6a2d49457c 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -103,13 +103,16 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t 
n_b_cells, const cs_lnum_t *restrict cell_b_faces, const cs_lnum_t *restrict cell_b_faces_idx) { - cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id1 >= n_b_cells){ + if(c_idx >= n_b_cells){ return; } + size_t c_id1 = c_idx / 3; + size_t i = c_idx % 3; + cs_lnum_t c_id = b_cells[c_id1]; cs_real_t pfac, rfac, vecfac; @@ -124,29 +127,26 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = cell_b_faces[index]; - for (cs_lnum_t i = 0; i < 3; i++) { - - pfac = inc*coefav[f_id][i]; + pfac = inc*coefav[f_id][i]; - for (cs_lnum_t k = 0; k < 3; k++){ - pfac += coefbv[f_id][i][k] * pvar[c_id][k]; - } + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; - pfac -= pvar[c_id][i]; + pfac -= pvar[c_id][i]; - // /* Reconstruction part */ - rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { - vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; - rfac += coefbv[f_id][i][k] * vecfac; - } + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } - for (cs_lnum_t j = 0; j < 3; j++){ - grad[c_id][i][j] += (pfac + rfac) * b_f_face_normal[f_id][j]; - } + grad[c_id][i][0] += (pfac + rfac) * b_f_face_normal[f_id][0]; + grad[c_id][i][1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + grad[c_id][i][2] += (pfac + rfac) * b_f_face_normal[f_id][2]; - } } } \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 67f1717561..6cd15380a2 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ 
b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -174,13 +174,12 @@ _compute_reconstruct_correction_v2( cs_lnum_t n_cells, cs_real_t gradpa[3]; for (cs_lnum_t j = 0; j < 3; j++) { gradpa[j] = grad[c_idt][i][j]; - grad[c_idt][i][j] = 0.; } for (cs_lnum_t j = 0; j < 3; j++) { - atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][0] * gradpa[0]); - atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][1] * gradpa[1]); - atomicAdd(&grad[c_idt][i][j], corr_grad_lin[c_idt][j][2] * gradpa[2]); + grad[c_idt][i][j] = corr_grad_lin[c_idt][j][0] * gradpa[0] + + corr_grad_lin[c_idt][j][1] * gradpa[1] + + corr_grad_lin[c_idt][j][2] * gradpa[2]; } } From c05e0735bf9ad94fe1670b4e7e46880a75e95c19 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Thu, 16 Nov 2023 10:55:57 +0100 Subject: [PATCH 33/70] ADD scattercf versions --- src/alge/cs_gradient_cuda.cu | 86 ++++++++----- ...reconstruct_vector_gradient_scatter_v2.cuh | 114 ++++++++++++++++++ 2 files changed, 170 insertions(+), 30 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index e0f385825d..383642bf48 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1630,6 +1630,18 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // dofij, // i_f_face_normal); + + _compute_reconstruct_v_i_face_v2cf<<>> + (n_i_faces * 3, + i_group_index, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); // printf("Avant les assert dans gradient_cuda.cu\n"); // assert(cell_cells_idx); @@ -1661,21 +1673,21 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces_sgn); - _compute_reconstruct_v_i_face_gather_v2<<>> - ( n_cells * 3 * 3, - i_face_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal, - cell_cells_idx, - cell_cells, - cell_i_faces, - cell_i_faces_sgn, - n_i_faces); + // _compute_reconstruct_v_i_face_gather_v2<<>> + // ( n_cells * 3 * 3, + // 
i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // n_i_faces); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1736,6 +1748,20 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); + _compute_reconstruct_v_b_face_v2cf<<>> + ( n_b_faces * 3, + coupled_faces_d, + cpl_stride, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells); + // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coupled_faces_d, @@ -1753,21 +1779,21 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); - _compute_reconstruct_v_b_face_gather_v2<<>> - ( m->n_b_cells * 3, - coupled_faces_d, - cpl_stride, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_cells, - cell_b_faces, - cell_b_faces_idx); + // _compute_reconstruct_v_b_face_gather_v2<<>> + // ( m->n_b_cells * 3, + // coupled_faces_d, + // cpl_stride, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 6cd15380a2..9aca0b2f3a 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -76,7 +76,64 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, for (cs_lnum_t j = 0; j < 3; j++) { atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_idt][j]); atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_idt][j])); + } + +} + + + + +__global__ static void +_compute_reconstruct_v_i_face_v2cf(cs_lnum_t n_i_faces, + const cs_lnum_t *i_group_index, + 
const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + using Cell = AtomicCell; + + for (cs_lnum_t j = 0; j < 3; j++) { + Cell::ref(grad[c_id1][i][j]).conflict_free_add(-1u, Cell::ref((pfaci + rfac) * i_f_face_normal[f_idt][j])); + Cell::ref(grad[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(- ((pfacj + rfac) * i_f_face_normal[f_idt][j]))); } } @@ -138,6 +195,63 @@ _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, +__global__ static void +_compute_reconstruct_v_b_face_v2cf(cs_lnum_t n_b_faces, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * 
blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + // if (coupled_faces[f_idt * cpl_stride]) + // return; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + using Cell = AtomicCell; + + for (cs_lnum_t j = 0; j < 3; j++){ + Cell::ref(grad[c_id][i][j]).conflict_free_add(-1u, Cell::ref((pfac + rfac) * b_f_face_normal[f_idt][j])); + } + +} + + __global__ static void _compute_reconstruct_correction_v2( cs_lnum_t n_cells, cs_lnum_t has_dc, From 4c861fb08e9c51616f8e2d02426e36a00181a879 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Fri, 17 Nov 2023 10:39:11 +0100 Subject: [PATCH 34/70] fix cf version --- ...reconstruct_vector_gradient_scatter_v2.cuh | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 9aca0b2f3a..826cd327ed 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -129,12 +129,17 @@ _compute_reconstruct_v_i_face_v2cf(cs_lnum_t n_i_faces, + dofij[f_idt][2]*( r_grad[c_id1][i][2] + r_grad[c_id2][i][2])); - using Cell = AtomicCell; + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; for (cs_lnum_t j = 0; j < 3; j++) { - Cell::ref(grad[c_id1][i][j]).conflict_free_add(-1u, Cell::ref((pfaci + rfac) * i_f_face_normal[f_idt][j])); - Cell::ref(grad[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(- ((pfacj + rfac) * 
i_f_face_normal[f_idt][j]))); + grad_cf1[j].get() = (pfaci + rfac) * i_f_face_normal[f_idt][j]; + grad_cf2[j].get() = - ((pfacj + rfac) * i_f_face_normal[f_idt][j]); + // Cell::ref(grad_cf1[c_id1][i][j]).conflict_free_add(-1u, Cell::ref((pfaci + rfac) * i_f_face_normal[f_idt][j])); + // Cell::ref(grad_cf2[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(- ((pfacj + rfac) * i_f_face_normal[f_idt][j]))); } + Cell::ref(grad[c_id1][i]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2][i]).conflict_free_add(-1u, grad_cf2); } @@ -243,11 +248,14 @@ _compute_reconstruct_v_b_face_v2cf(cs_lnum_t n_b_faces, rfac += coefbv[f_idt][i][k] * vecfac; } - using Cell = AtomicCell; - + using Cell = AtomicCell; + Cell grad_cf; + for (cs_lnum_t j = 0; j < 3; j++){ - Cell::ref(grad[c_id][i][j]).conflict_free_add(-1u, Cell::ref((pfac + rfac) * b_f_face_normal[f_idt][j])); + grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; + // grad[c_id][i][j].get() += (pfac + rfac) * b_f_face_normal[f_idt][j]; } + Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); } From 293a087d34b776b18c3393401b26ec2a9790a57a Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Fri, 17 Nov 2023 15:22:52 +0100 Subject: [PATCH 35/70] refacto and add scatter_v1_cf --- src/alge/cs_gradient_cuda.cu | 190 +++--------------- ...cs_reconstruct_vector_gradient_scatter.cuh | 178 ++++++++++++++++ ...reconstruct_vector_gradient_scatter_cf.cuh | 143 +++++++++++++ ...reconstruct_vector_gradient_scatter_v2.cuh | 121 ----------- ...onstruct_vector_gradient_scatter_v2_cf.cuh | 148 ++++++++++++++ 5 files changed, 502 insertions(+), 278 deletions(-) create mode 100644 src/alge/cs_reconstruct_vector_gradient_scatter.cuh create mode 100644 src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh create mode 100644 src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 383642bf48..a2d3a18592 100644 --- a/src/alge/cs_gradient_cuda.cu +++ 
b/src/alge/cs_gradient_cuda.cu @@ -35,7 +35,10 @@ #include "cs_gradient_priv.h" #include "cs_reconstruct_vector_gradient_gather.cuh" #include "cs_reconstruct_vector_gradient_gather_v2.cuh" +#include "cs_reconstruct_vector_gradient_scatter.cuh" +#include "cs_reconstruct_vector_gradient_scatter_cf.cuh" #include "cs_reconstruct_vector_gradient_scatter_v2.cuh" +#include "cs_reconstruct_vector_gradient_scatter_v2_cf.cuh" /*! \cond DOXYGEN_SHOULD_SKIP_THIS */ @@ -1233,160 +1236,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } -__global__ static void -_compute_reconstruct_v_i_face(cs_lnum_t size, - const cs_lnum_t *i_group_index, - const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, - const cs_real_3_t *restrict dofij, - const cs_real_3_t *restrict i_f_face_normal) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - cs_lnum_t c_id1, c_id2; - cs_real_t pond, ktpond, pfaci, pfacj, rfac; - - c_id1 = i_face_cells[f_id][0]; - c_id2 = i_face_cells[f_id][1]; - - pond = weight[f_id]; - ktpond = (c_weight == NULL) ? 
- pond : // no cell weighting - pond * c_weight[c_id1] // cell weighting active - / ( pond * c_weight[c_id1] - + (1.0-pond)* c_weight[c_id2]); - - - for (cs_lnum_t i = 0; i < 3; i++) { - pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - - /* Reconstruction part */ - rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); - - for (cs_lnum_t j = 0; j < 3; j++) { - atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); - atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); - - } - } - -} - - -__global__ static void -_compute_reconstruct_v_b_face(cs_lnum_t size, - const bool *coupled_faces, - cs_lnum_t cpl_stride, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, - int inc, - const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, - const cs_real_3_t *restrict b_f_face_normal, - const cs_lnum_t *restrict b_face_cells) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= size){ - return; - } - cs_lnum_t c_id; - cs_real_t pfac, rfac, vecfac; - - // if (coupled_faces[f_id * cpl_stride]) - // return; - - c_id = b_face_cells[f_id]; - - for (cs_lnum_t i = 0; i < 3; i++) { - - pfac = inc*coefav[f_id][i]; - - for (cs_lnum_t k = 0; k < 3; k++){ - pfac += coefbv[f_id][i][k] * pvar[c_id][k]; - } - - pfac -= pvar[c_id][i]; - - // /* Reconstruction part */ - rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { - vecfac = r_grad[c_id][k][0] * diipb[f_id][0] - + r_grad[c_id][k][1] * diipb[f_id][1] - + r_grad[c_id][k][2] * diipb[f_id][2]; - rfac += coefbv[f_id][i][k] * vecfac; - } - - for (cs_lnum_t j = 0; j < 3; j++) - atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); - - } 
-} - - - -__global__ static void -_compute_reconstruct_correction(cs_lnum_t size, - cs_lnum_t has_dc, - const int *restrict c_disable_flag, - const cs_real_t *restrict cell_f_vol, - cs_real_33_t *restrict grad, - const cs_real_33_t *restrict corr_grad_lin, - bool test_bool - ) -{ - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(c_id >= size){ - return; - } - cs_real_t dvol; - /* Is the cell disabled (for solid or porous)? Not the case if coupled */ - if (has_dc * c_disable_flag[has_dc * c_id] == 0) - dvol = 1. / cell_f_vol[c_id]; - else - dvol = 0.; - - - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - grad[c_id][i][j] *= dvol; - } - if (test_bool) { - cs_real_t gradpa[3]; - // printf("dvol = %.17lg\n", dvol); - for (cs_lnum_t i = 0; i < 3; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - gradpa[j] = grad[c_id][i][j]; - grad[c_id][i][j] = 0.; - } - - for (cs_lnum_t j = 0; j < 3; j++){ - for (cs_lnum_t k = 0; k < 3; k++){ - grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; - } - } - } - } - -} /*---------------------------------------------------------------------------- * Reconstruct the gradient of a vector using a given gradient of @@ -1631,8 +1482,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - _compute_reconstruct_v_i_face_v2cf<<>> - (n_i_faces * 3, + _compute_reconstruct_v_i_face_cf<<>> + (n_i_faces, i_group_index, i_face_cells, pvar_d, @@ -1642,6 +1493,18 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, grad_d, dofij, i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2_cf<<>> + // (n_i_faces * 3, + // i_group_index, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); // printf("Avant les assert dans gradient_cuda.cu\n"); // assert(cell_cells_idx); @@ -1747,9 +1610,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); - - 
_compute_reconstruct_v_b_face_v2cf<<>> - ( n_b_faces * 3, + _compute_reconstruct_v_b_face_cf<<>> + ( n_b_faces, coupled_faces_d, cpl_stride, coefb_d, @@ -1762,6 +1624,20 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, b_f_face_normal, b_face_cells); + // _compute_reconstruct_v_b_face_v2_cf<<>> + // ( n_b_faces * 3, + // coupled_faces_d, + // cpl_stride, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coupled_faces_d, diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh new file mode 100644 index 0000000000..e2910de06a --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -0,0 +1,178 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_reconstruct_v_i_face(cs_lnum_t size, + const cs_lnum_t *i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); + + } + } + +} + + +__global__ static void +_compute_reconstruct_v_b_face(cs_lnum_t size, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * 
blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + // if (coupled_faces[f_id * cpl_stride]) + // return; + + c_id = b_face_cells[f_id]; + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++) + atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); + + } +} + +__global__ static void +_compute_reconstruct_correction(cs_lnum_t size, + cs_lnum_t has_dc, + const int *restrict c_disable_flag, + const cs_real_t *restrict cell_f_vol, + cs_real_33_t *restrict grad, + const cs_real_33_t *restrict corr_grad_lin, + bool test_bool + ) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id >= size){ + return; + } + cs_real_t dvol; + /* Is the cell disabled (for solid or porous)? Not the case if coupled */ + if (has_dc * c_disable_flag[has_dc * c_id] == 0) + dvol = 1. 
/ cell_f_vol[c_id]; + else + dvol = 0.; + + + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + grad[c_id][i][j] *= dvol; + } + + + if (test_bool) { + cs_real_t gradpa[3]; + // printf("dvol = %.17lg\n", dvol); + for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + gradpa[j] = grad[c_id][i][j]; + grad[c_id][i][j] = 0.; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + for (cs_lnum_t k = 0; k < 3; k++){ + grad[c_id][i][j] += corr_grad_lin[c_id][j][k] * gradpa[k]; + } + } + } + } + +} \ No newline at end of file diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh new file mode 100644 index 0000000000..87b056144b --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -0,0 +1,143 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + +__global__ static void +_compute_reconstruct_v_i_face_cf(cs_lnum_t size, + const cs_lnum_t *i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + pond = weight[f_id]; + ktpond = (c_weight == NULL) ? + pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[i][j].get() = (pfaci + rfac) * i_f_face_normal[f_id][j]; + grad_cf2[i][j].get() = - ((pfacj + rfac) * i_f_face_normal[f_id][j]); + // atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); + // atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); + + } + } + Cell::ref(grad[c_id1]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2]).conflict_free_add(-1u, grad_cf2); + +} + + +__global__ static void +_compute_reconstruct_v_b_face_cf(cs_lnum_t size, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t 
*restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= size){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac; + + // if (coupled_faces[f_id * cpl_stride]) + // return; + + c_id = b_face_cells[f_id]; + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[i][j].get() = (pfac + rfac) * b_f_face_normal[f_id][j]; + // atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); + } + } + Cell::ref(grad[c_id]).conflict_free_add(-1u, grad_cf); +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 826cd327ed..70c11a053c 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -82,68 +82,6 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, - -__global__ static void -_compute_reconstruct_v_i_face_v2cf(cs_lnum_t n_i_faces, - const cs_lnum_t *i_group_index, - const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *pvar, - const cs_real_t *weight, - const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, - const cs_real_3_t *restrict dofij, - const cs_real_3_t 
*restrict i_f_face_normal) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= n_i_faces){ - return; - } - - size_t f_idt = f_id / 3; - size_t i = f_id % 3; - - cs_lnum_t c_id1, c_id2; - cs_real_t pond, ktpond, pfaci, pfacj, rfac; - - c_id1 = i_face_cells[f_idt][0]; - c_id2 = i_face_cells[f_idt][1]; - - pond = weight[f_idt]; - ktpond = (c_weight == NULL) ? - pond : // no cell weighting - pond * c_weight[c_id1] // cell weighting active - / ( pond * c_weight[c_id1] - + (1.0-pond)* c_weight[c_id2]); - - - pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); - - /* Reconstruction part */ - rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_idt][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_idt][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); - - using Cell = AtomicCell; - Cell grad_cf1, grad_cf2; - - for (cs_lnum_t j = 0; j < 3; j++) { - grad_cf1[j].get() = (pfaci + rfac) * i_f_face_normal[f_idt][j]; - grad_cf2[j].get() = - ((pfacj + rfac) * i_f_face_normal[f_idt][j]); - // Cell::ref(grad_cf1[c_id1][i][j]).conflict_free_add(-1u, Cell::ref((pfaci + rfac) * i_f_face_normal[f_idt][j])); - // Cell::ref(grad_cf2[c_id2][i][j]).conflict_free_add(-1u, Cell::ref(- ((pfacj + rfac) * i_f_face_normal[f_idt][j]))); - } - Cell::ref(grad[c_id1][i]).conflict_free_add(-1u, grad_cf1); - Cell::ref(grad[c_id2][i]).conflict_free_add(-1u, grad_cf2); - -} - - __global__ static void _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, const bool *coupled_faces, @@ -200,65 +138,6 @@ _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, -__global__ static void -_compute_reconstruct_v_b_face_v2cf(cs_lnum_t n_b_faces, - const bool *coupled_faces, - cs_lnum_t cpl_stride, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, - int inc, - const cs_real_3_t *restrict diipb, - const cs_real_33_t 
*restrict r_grad, - cs_real_33_t *restrict grad, - const cs_real_3_t *restrict b_f_face_normal, - const cs_lnum_t *restrict b_face_cells) -{ - cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - - if(f_id >= n_b_faces){ - return; - } - - size_t f_idt = f_id / 3; - size_t i = f_id % 3; - - cs_lnum_t c_id; - cs_real_t pond, ktpond, pfac, rfac, vecfac; - - // if (coupled_faces[f_idt * cpl_stride]) - // return; - - c_id = b_face_cells[f_idt]; - - pfac = inc*coefav[f_idt][i]; - - for (cs_lnum_t k = 0; k < 3; k++){ - pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; - } - - pfac -= pvar[c_id][i]; - -// /* Reconstruction part */ - rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { - vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] - + r_grad[c_id][k][1] * diipb[f_idt][1] - + r_grad[c_id][k][2] * diipb[f_idt][2]; - rfac += coefbv[f_idt][i][k] * vecfac; - } - - using Cell = AtomicCell; - Cell grad_cf; - - for (cs_lnum_t j = 0; j < 3; j++){ - grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; - // grad[c_id][i][j].get() += (pfac + rfac) * b_f_face_normal[f_idt][j]; - } - Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); - -} - __global__ static void _compute_reconstruct_correction_v2( cs_lnum_t n_cells, diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh new file mode 100644 index 0000000000..474172cbe8 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -0,0 +1,148 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. 
+ + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + + +__global__ static void +_compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, + const cs_lnum_t *i_group_index, + const cs_lnum_2_t *i_face_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id1, c_id2; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + c_id1 = i_face_cells[f_idt][0]; + c_id2 = i_face_cells[f_idt][1]; + + pond = weight[f_idt]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_idt][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_idt][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_idt][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + using Cell = AtomicCell; + Cell grad_cf1, grad_cf2; + + for (cs_lnum_t j = 0; j < 3; j++) { + grad_cf1[j].get() = (pfaci + rfac) * i_f_face_normal[f_idt][j]; + grad_cf2[j].get() = - ((pfacj + rfac) * i_f_face_normal[f_idt][j]); + } + Cell::ref(grad[c_id1][i]).conflict_free_add(-1u, grad_cf1); + Cell::ref(grad[c_id2][i]).conflict_free_add(-1u, grad_cf2); + +} + + + + + +__global__ static void +_compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, + const bool *coupled_faces, + cs_lnum_t cpl_stride, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + size_t f_idt = f_id / 3; + size_t i = f_id % 3; + + cs_lnum_t c_id; + cs_real_t pond, ktpond, pfac, rfac, vecfac; + + // if (coupled_faces[f_idt * cpl_stride]) + // return; + + c_id = b_face_cells[f_idt]; + + pfac = inc*coefav[f_idt][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_idt][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + +// /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + + r_grad[c_id][k][1] * diipb[f_idt][1] + + r_grad[c_id][k][2] * 
diipb[f_idt][2]; + rfac += coefbv[f_idt][i][k] * vecfac; + } + + using Cell = AtomicCell; + Cell grad_cf; + + for (cs_lnum_t j = 0; j < 3; j++){ + grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; + // grad[c_id][i][j].get() += (pfac + rfac) * b_f_face_normal[f_idt][j]; + } + Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); + +} \ No newline at end of file From 6e1ba06011d1ac2d0625d6f8f8a25999675e01ab Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 20 Nov 2023 11:31:38 +0100 Subject: [PATCH 36/70] fix --- src/alge/cs_gradient_cuda.cu | 72 +++++++++---------- .../cs_reconstruct_vector_gradient_gather.cuh | 4 +- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index a2d3a18592..e8326b1e8b 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1458,8 +1458,20 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Interior faces contribution */ - // _compute_reconstruct_v_i_face<<>> - // (n_i_faces, + _compute_reconstruct_v_i_face<<>> + (n_i_faces, + i_group_index, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); + + // _compute_reconstruct_v_i_face_v2<<>> + // (n_i_faces * 3, // i_group_index, // i_face_cells, // pvar_d, @@ -1470,8 +1482,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // _compute_reconstruct_v_i_face_v2<<>> - // (n_i_faces * 3, + // _compute_reconstruct_v_i_face_cf<<>> + // (n_i_faces, // i_group_index, // i_face_cells, // pvar_d, @@ -1482,18 +1494,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - _compute_reconstruct_v_i_face_cf<<>> - (n_i_faces, - i_group_index, - i_face_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal); - // _compute_reconstruct_v_i_face_v2_cf<<>> // (n_i_faces * 3, // i_group_index, @@ -1581,8 +1581,23 @@ 
cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- - // _compute_reconstruct_v_b_face<<>> - // ( n_b_faces, + _compute_reconstruct_v_b_face<<>> + ( n_b_faces, + coupled_faces_d, + cpl_stride, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells); + + + // _compute_reconstruct_v_b_face_v2<<>> + // ( n_b_faces * 3, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1594,10 +1609,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // b_f_face_normal, // b_face_cells); - - - // _compute_reconstruct_v_b_face_v2<<>> - // ( n_b_faces * 3, + + // _compute_reconstruct_v_b_face_cf<<>> + // ( n_b_faces, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1609,20 +1623,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // b_f_face_normal, // b_face_cells); - - _compute_reconstruct_v_b_face_cf<<>> - ( n_b_faces, - coupled_faces_d, - cpl_stride, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_face_cells); // _compute_reconstruct_v_b_face_v2_cf<<>> // ( n_b_faces * 3, diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh index 8ea22126c7..b5a6244e88 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -27,7 +27,6 @@ __global__ static void _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, - const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, const cs_real_t *c_weight, @@ -38,8 +37,7 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_lnum_t *restrict cell_i_faces, - const short int *restrict cell_i_faces_sgn, - const cs_lnum_t 
n_i_faces) + const short int *restrict cell_i_faces_sgn) { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; From 1e56d6275a1363b904e5322fbfc7c5751d64ecfe Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 20 Nov 2023 11:40:18 +0100 Subject: [PATCH 37/70] cleaning code --- src/alge/cs_gradient_cuda.cu | 51 ++----------------- ...reconstruct_vector_gradient_scatter_cf.cuh | 4 -- ...onstruct_vector_gradient_scatter_v2_cf.cuh | 1 - 3 files changed, 4 insertions(+), 52 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index e8326b1e8b..d3036c278f 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1334,41 +1334,31 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); const cs_lnum_t *restrict cell_b_faces_idx = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); - const cs_lnum_t *restrict cell_cells_lst; - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; cs_lnum_t *restrict i_group_index; - // printf("m->i_face_numbering->group_index = ", m->i_face_numbering->group_index); CS_CUDA_CHECK(cudaMalloc(&i_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); cs_cuda_copy_h2d(i_group_index, (void *)m->i_face_numbering->group_index, sizeof(int)*n_i_groups * n_i_threads * 2); - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_numbering->group_index); const int n_b_groups = m->b_face_numbering->n_groups; const int n_b_threads = m->b_face_numbering->n_threads; - cs_lnum_t *restrict b_group_index; CS_CUDA_CHECK(cudaMalloc(&b_group_index, sizeof(int)*n_i_groups * n_i_threads * 2)); cs_cuda_copy_h2d(b_group_index, (void *)m->b_face_numbering->group_index, sizeof(int)*n_b_groups * n_b_threads * 2); - // printf("Avant allocation\n"); const 
cs_lnum_t *restrict cell_cells_idx = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); const cs_lnum_t *restrict cell_cells = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); - - - // if (madj->cell_i_faces == NULL) { cs_mesh_adjacencies_update_cell_i_faces(); // } assert(madj->cell_i_faces); const cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); cs_lnum_t *restrict cell_i_faces; - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); assert(cell_i_faces); @@ -1382,56 +1372,34 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_lnum_t *restrict cell_b_faces = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); - assert(m->b_cells); - assert(madj->cell_b_faces); - assert(madj->cell_b_faces_idx); - assert(b_cells); - assert(cell_b_faces); - assert(cell_b_faces_idx); - - const cs_real_3_t *restrict cell_cen; - // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); - const cs_lnum_t *restrict cell_vol; - // = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_vol); cs_real_t *restrict cell_f_vol; CS_CUDA_CHECK(cudaMalloc(&cell_f_vol, n_cells * sizeof(cs_real_t))); cs_cuda_copy_h2d(cell_f_vol, (void *)fvq->cell_f_vol, sizeof(cs_real_t)*n_cells); - // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_vol); if (cs_glob_porous_model == 1 || cs_glob_porous_model == 2) cell_f_vol = fvq->cell_vol; const cs_real_3_t *restrict cell_f_cen = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); const cs_real_t *restrict weight = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); - const cs_real_t *restrict b_dist; - // = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); - const cs_real_3_t 
*restrict b_face_normal; - // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); cs_real_3_t *restrict i_f_face_normal; - // printf("fvq->i_f_face_normal = ", fvq->i_f_face_normal); CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); - // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->i_f_face_normal); const cs_real_3_t *restrict b_f_face_normal = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); cs_real_3_t *restrict dofij; - // printf("fvq->dofij = ", fvq->dofij); CS_CUDA_CHECK(cudaMalloc(&dofij, sizeof(cs_real_3_t)*n_i_faces)); cs_cuda_copy_h2d(dofij, (void *)fvq->dofij, sizeof(cs_real_3_t)*n_i_faces); - // = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->dofij); const cs_real_3_t *restrict diipb = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); cs_real_33_t *restrict corr_grad_lin; CS_CUDA_CHECK(cudaMalloc(&corr_grad_lin, n_cells * sizeof(cs_real_33_t))); cs_cuda_copy_h2d(corr_grad_lin, (void *)fvq->corr_grad_lin, sizeof(cs_real_33_t)*n_cells); - // = (const cs_real_33_t *restrict)cs_get_device_ptr_const_pf(fvq->corr_grad_lin); const cs_lnum_t has_dc = fvq->has_disable_flag; int *restrict c_disable_flag; CS_CUDA_CHECK(cudaMalloc(&c_disable_flag, n_cells * sizeof(int))); cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells); - // = (const int *restrict)cs_get_device_ptr_const_pf(fvq->c_disable_flag); _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, @@ -1458,6 +1426,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Interior faces contribution */ + + //Kernels Scatter _compute_reconstruct_v_i_face<<>> (n_i_faces, i_group_index, @@ -1482,6 +1452,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); + //Kernels Scatter conflict free // 
_compute_reconstruct_v_i_face_cf<<>> // (n_i_faces, // i_group_index, @@ -1506,21 +1477,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // printf("Avant les assert dans gradient_cuda.cu\n"); - // assert(cell_cells_idx); - // assert(cell_cells); - // assert(weight); - // assert(cell_i_faces); - // assert(cell_i_faces_sgn); - // printf("n_i_faces = %d\n", n_i_faces); - // printf("n_cells = %d\n", n_cells); - // for(int i = 0; i< n_i_faces; i++){ - // printf("i = %d && weight = %f \n", i, fvq->weight[i]); - // printf("i = %d && c_id2 = %d \n", i, madj->cell_cells[i]); - // printf("i = %d && s_id = %d \n", i, madj->cell_cells_idx[i]); - // printf("i = %d && f_id = %d \n", i, madj->cell_i_faces_sgn[i]); - // } - // printf("Après les assert dans gradient_cuda.cu\n"); + //Kernels Gather // _compute_reconstruct_v_i_face_gather<<>> // ( n_cells, // pvar_d, diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh index 87b056144b..db6d3003d2 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -74,9 +74,6 @@ _compute_reconstruct_v_i_face_cf(cs_lnum_t size, for (cs_lnum_t j = 0; j < 3; j++) { grad_cf1[i][j].get() = (pfaci + rfac) * i_f_face_normal[f_id][j]; grad_cf2[i][j].get() = - ((pfacj + rfac) * i_f_face_normal[f_id][j]); - // atomicAdd(&grad[c_id1][i][j],(pfaci + rfac) * i_f_face_normal[f_id][j]); - // atomicAdd(&grad[c_id2][i][j], - ((pfacj + rfac) * i_f_face_normal[f_id][j])); - } } Cell::ref(grad[c_id1]).conflict_free_add(-1u, grad_cf1); @@ -136,7 +133,6 @@ _compute_reconstruct_v_b_face_cf(cs_lnum_t size, for (cs_lnum_t j = 0; j < 3; j++){ grad_cf[i][j].get() = (pfac + rfac) * b_f_face_normal[f_id][j]; - // atomicAdd(&grad[c_id][i][j], (pfac + rfac) * b_f_face_normal[f_id][j]); } } Cell::ref(grad[c_id]).conflict_free_add(-1u, grad_cf); diff --git 
a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh index 474172cbe8..cbec2ff579 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -141,7 +141,6 @@ _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, for (cs_lnum_t j = 0; j < 3; j++){ grad_cf[j].get() = (pfac + rfac) * b_f_face_normal[f_idt][j]; - // grad[c_id][i][j].get() += (pfac + rfac) * b_f_face_normal[f_idt][j]; } Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); From f4e312496d39d0ab8753bd7a051c9602c99b8910 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 20 Nov 2023 12:03:48 +0100 Subject: [PATCH 38/70] setup to bests kernels --- src/alge/cs_gradient_cuda.cu | 84 ++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index d3036c278f..0670562606 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1428,20 +1428,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Interior faces contribution */ //Kernels Scatter - _compute_reconstruct_v_i_face<<>> - (n_i_faces, - i_group_index, - i_face_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal); - - // _compute_reconstruct_v_i_face_v2<<>> - // (n_i_faces * 3, + // _compute_reconstruct_v_i_face<<>> + // (n_i_faces, // i_group_index, // i_face_cells, // pvar_d, @@ -1452,9 +1440,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - //Kernels Scatter conflict free - // _compute_reconstruct_v_i_face_cf<<>> - // (n_i_faces, + // _compute_reconstruct_v_i_face_v2<<>> + // (n_i_faces * 3, // i_group_index, // i_face_cells, // pvar_d, @@ -1465,8 +1452,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // _compute_reconstruct_v_i_face_v2_cf<<>> 
- // (n_i_faces * 3, + //Kernels Scatter conflict free + // _compute_reconstruct_v_i_face_cf<<>> + // (n_i_faces, // i_group_index, // i_face_cells, // pvar_d, @@ -1476,6 +1464,18 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // dofij, // i_f_face_normal); + + _compute_reconstruct_v_i_face_v2_cf<<>> + (n_i_faces * 3, + i_group_index, + i_face_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal); //Kernels Gather // _compute_reconstruct_v_i_face_gather<<>> @@ -1538,23 +1538,8 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- - _compute_reconstruct_v_b_face<<>> - ( n_b_faces, - coupled_faces_d, - cpl_stride, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_face_cells); - - - // _compute_reconstruct_v_b_face_v2<<>> - // ( n_b_faces * 3, + // _compute_reconstruct_v_b_face<<>> + // ( n_b_faces, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1566,9 +1551,10 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // b_f_face_normal, // b_face_cells); - - // _compute_reconstruct_v_b_face_cf<<>> - // ( n_b_faces, + + + // _compute_reconstruct_v_b_face_v2<<>> + // ( n_b_faces * 3, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1580,9 +1566,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // b_f_face_normal, // b_face_cells); - - // _compute_reconstruct_v_b_face_v2_cf<<>> - // ( n_b_faces * 3, + + // _compute_reconstruct_v_b_face_cf<<>> + // ( n_b_faces, // coupled_faces_d, // cpl_stride, // coefb_d, @@ -1595,6 +1581,20 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); + _compute_reconstruct_v_b_face_v2_cf<<>> + ( n_b_faces * 3, + coupled_faces_d, + cpl_stride, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + 
grad_d, + b_f_face_normal, + b_face_cells); + // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coupled_faces_d, From 78067f15a43f75b87b5607b02f634c3915fbc92b Mon Sep 17 00:00:00 2001 From: mohammed derbane <115648603+aneo-mderbane@users.noreply.github.com> Date: Tue, 21 Nov 2023 09:35:21 +0100 Subject: [PATCH 39/70] Apply suggestions from code review Add comment review by Daouda Co-authored-by: ddiakiteaneo <127390724+ddiakiteaneo@users.noreply.github.com> --- src/alge/cs_gradient.cxx | 2 +- ..._reconstruct_vector_gradient_gather_v2.cuh | 2 +- ...cs_reconstruct_vector_gradient_scatter.cuh | 20 +++++++------------ ...reconstruct_vector_gradient_scatter_cf.cuh | 14 ++++--------- ...reconstruct_vector_gradient_scatter_v2.cuh | 2 -- ...onstruct_vector_gradient_scatter_v2_cf.cuh | 8 +------- 6 files changed, 14 insertions(+), 34 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 4a6586be87..7e4081a070 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5804,7 +5804,7 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, auto cuda = grad[c_id][i][j]; double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-6) { - printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %.17lg\tCUDA = %.17lg\tdiff = %.17lg\tdiff relative = %.17lg\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); } } } diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh index 6a2d49457c..16c2b567bd 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -149,4 +149,4 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, grad[c_id][i][2] += (pfac + rfac) * 
b_f_face_normal[f_id][2]; } -} \ No newline at end of file +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh index e2910de06a..025d370cb5 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -25,8 +25,7 @@ /*----------------------------------------------------------------------------*/ __global__ static void -_compute_reconstruct_v_i_face(cs_lnum_t size, - const cs_lnum_t *i_group_index, +_compute_reconstruct_v_i_face(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, @@ -38,7 +37,7 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_lnum_t c_id1, c_id2; @@ -78,9 +77,7 @@ _compute_reconstruct_v_i_face(cs_lnum_t size, __global__ static void -_compute_reconstruct_v_b_face(cs_lnum_t size, - const bool *coupled_faces, - cs_lnum_t cpl_stride, +_compute_reconstruct_v_b_face(cs_lnum_t n_b_faces, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, @@ -93,15 +90,12 @@ _compute_reconstruct_v_b_face(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_b_faces){ return; } cs_lnum_t c_id; cs_real_t pfac, rfac, vecfac; - // if (coupled_faces[f_id * cpl_stride]) - // return; - c_id = b_face_cells[f_id]; for (cs_lnum_t i = 0; i < 3; i++) { @@ -130,7 +124,7 @@ _compute_reconstruct_v_b_face(cs_lnum_t size, } __global__ static void -_compute_reconstruct_correction(cs_lnum_t size, +_compute_reconstruct_correction(cs_lnum_t n_cells, cs_lnum_t has_dc, const int *restrict c_disable_flag, const cs_real_t *restrict cell_f_vol, @@ -141,7 +135,7 @@ _compute_reconstruct_correction(cs_lnum_t size, { cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= size){ + 
if(c_id >= n_cells){ return; } cs_real_t dvol; @@ -175,4 +169,4 @@ _compute_reconstruct_correction(cs_lnum_t size, } } -} \ No newline at end of file +} diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh index db6d3003d2..f684409ffd 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -25,8 +25,7 @@ /*----------------------------------------------------------------------------*/ __global__ static void -_compute_reconstruct_v_i_face_cf(cs_lnum_t size, - const cs_lnum_t *i_group_index, +_compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, @@ -38,7 +37,7 @@ _compute_reconstruct_v_i_face_cf(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_i_faces){ return; } cs_lnum_t c_id1, c_id2; @@ -83,9 +82,7 @@ _compute_reconstruct_v_i_face_cf(cs_lnum_t size, __global__ static void -_compute_reconstruct_v_b_face_cf(cs_lnum_t size, - const bool *coupled_faces, - cs_lnum_t cpl_stride, +_compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, @@ -98,15 +95,12 @@ _compute_reconstruct_v_b_face_cf(cs_lnum_t size, { cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; - if(f_id >= size){ + if(f_id >= n_b_faces){ return; } cs_lnum_t c_id; cs_real_t pfac, rfac, vecfac; - // if (coupled_faces[f_id * cpl_stride]) - // return; - c_id = b_face_cells[f_id]; using Cell = AtomicCell; diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 70c11a053c..96ccb8057c 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -81,7 +81,6 @@ 
_compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, } - __global__ static void _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, const bool *coupled_faces, @@ -138,7 +137,6 @@ _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, - __global__ static void _compute_reconstruct_correction_v2( cs_lnum_t n_cells, cs_lnum_t has_dc, diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh index cbec2ff579..c437fde30e 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -28,7 +28,6 @@ __global__ static void _compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, - const cs_lnum_t *i_group_index, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, @@ -90,8 +89,6 @@ _compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, __global__ static void _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, - const bool *coupled_faces, - cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, @@ -114,9 +111,6 @@ _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, cs_lnum_t c_id; cs_real_t pond, ktpond, pfac, rfac, vecfac; - // if (coupled_faces[f_idt * cpl_stride]) - // return; - c_id = b_face_cells[f_idt]; pfac = inc*coefav[f_idt][i]; @@ -144,4 +138,4 @@ _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, } Cell::ref(grad[c_id][i]).conflict_free_add(-1u, grad_cf); -} \ No newline at end of file +} From 00cc38a271736eed092c113e76719c77dd603195 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Tue, 21 Nov 2023 09:54:39 +0100 Subject: [PATCH 40/70] fix bugs after the review code --- src/alge/cs_gradient_cuda.cu | 22 ++----------------- .../cs_reconstruct_vector_gradient_gather.cuh | 6 ----- ..._reconstruct_vector_gradient_gather_v2.cuh | 10 +-------- ...reconstruct_vector_gradient_scatter_v2.cuh | 1 - 4 
files changed, 3 insertions(+), 36 deletions(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 0670562606..86a6ddc628 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1430,7 +1430,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, //Kernels Scatter // _compute_reconstruct_v_i_face<<>> // (n_i_faces, - // i_group_index, // i_face_cells, // pvar_d, // weight, @@ -1442,7 +1441,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_i_face_v2<<>> // (n_i_faces * 3, - // i_group_index, // i_face_cells, // pvar_d, // weight, @@ -1455,7 +1453,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, //Kernels Scatter conflict free // _compute_reconstruct_v_i_face_cf<<>> // (n_i_faces, - // i_group_index, // i_face_cells, // pvar_d, // weight, @@ -1467,7 +1464,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, _compute_reconstruct_v_i_face_v2_cf<<>> (n_i_faces * 3, - i_group_index, i_face_cells, pvar_d, weight, @@ -1495,7 +1491,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_i_face_gather_v2<<>> // ( n_cells * 3 * 3, - // i_face_cells, // pvar_d, // weight, // c_weight, @@ -1506,8 +1501,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_cells_idx, // cell_cells, // cell_i_faces, - // cell_i_faces_sgn, - // n_i_faces); + // cell_i_faces_sgn); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1540,8 +1534,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // ----------------------------Begin of Kernels part 2------------------------------------------- // _compute_reconstruct_v_b_face<<>> // ( n_b_faces, - // coupled_faces_d, - // cpl_stride, // coefb_d, // coefa_d, // pvar_d, @@ -1555,8 +1547,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_v2<<>> // ( n_b_faces * 3, - // coupled_faces_d, - // cpl_stride, // coefb_d, // coefa_d, // pvar_d, @@ 
-1569,8 +1559,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_cf<<>> // ( n_b_faces, - // coupled_faces_d, - // cpl_stride, // coefb_d, // coefa_d, // pvar_d, @@ -1583,8 +1571,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, _compute_reconstruct_v_b_face_v2_cf<<>> ( n_b_faces * 3, - coupled_faces_d, - cpl_stride, coefb_d, coefa_d, pvar_d, @@ -1597,8 +1583,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, - // coupled_faces_d, - // cpl_stride, // coefb_d, // coefa_d, // pvar_d, @@ -1614,8 +1598,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_gather_v2<<>> // ( m->n_b_cells * 3, - // coupled_faces_d, - // cpl_stride, // coefb_d, // coefa_d, // pvar_d, @@ -1669,7 +1651,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamDestroy(stream); if(PERF){ - printf("recconstruct Kernels times:\t"); + printf("reconstruct Kernels times:\t"); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh index b5a6244e88..6432ef3ce9 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -86,8 +86,6 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, __global__ static void _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, - const bool *coupled_faces, - cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, @@ -114,10 +112,6 @@ _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; - - // if (coupled_faces[f_id * cpl_stride]) - // return; - for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = cell_b_faces[index]; diff 
--git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh index 16c2b567bd..404421b4c9 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -27,7 +27,6 @@ __global__ static void _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, - const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, const cs_real_t *c_weight, @@ -38,8 +37,7 @@ _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, const cs_lnum_t *restrict cell_cells_idx, const cs_lnum_t *restrict cell_cells, const cs_lnum_t *restrict cell_i_faces, - const short int *restrict cell_i_faces_sgn, - const cs_lnum_t n_i_faces) + const short int *restrict cell_i_faces_sgn) { cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; @@ -89,8 +87,6 @@ _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, __global__ static void _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, - const bool *coupled_faces, - cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, @@ -120,10 +116,6 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; - - // if (coupled_faces[f_id * cpl_stride]) - // return; - for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = cell_b_faces[index]; diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 96ccb8057c..78013e518e 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -29,7 +29,6 @@ __global__ static void _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, - const cs_lnum_t *i_group_index, const cs_lnum_2_t *i_face_cells, const cs_real_3_t *pvar, const cs_real_t *weight, From 
9878b0b4f44f1d4552f6d6191cc450b0c14b4935 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Wed, 22 Nov 2023 11:02:47 +0100 Subject: [PATCH 41/70] ADD gather versions with registers memory and shared memory --- src/alge/cs_gradient_cuda.cu | 154 ++++++++++++--- .../cs_reconstruct_vector_gradient_gather.cuh | 31 ++- ..._reconstruct_vector_gradient_gather_v3.cuh | 168 ++++++++++++++++ ..._reconstruct_vector_gradient_gather_v4.cuh | 150 ++++++++++++++ ..._reconstruct_vector_gradient_gather_v5.cuh | 184 ++++++++++++++++++ 5 files changed, 647 insertions(+), 40 deletions(-) create mode 100644 src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh create mode 100644 src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh create mode 100644 src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 86a6ddc628..d4e3544dea 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -35,6 +35,9 @@ #include "cs_gradient_priv.h" #include "cs_reconstruct_vector_gradient_gather.cuh" #include "cs_reconstruct_vector_gradient_gather_v2.cuh" +#include "cs_reconstruct_vector_gradient_gather_v3.cuh" +#include "cs_reconstruct_vector_gradient_gather_v4.cuh" +#include "cs_reconstruct_vector_gradient_gather_v5.cuh" #include "cs_reconstruct_vector_gradient_scatter.cuh" #include "cs_reconstruct_vector_gradient_scatter_cf.cuh" #include "cs_reconstruct_vector_gradient_scatter_v2.cuh" @@ -1427,7 +1430,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Interior faces contribution */ - //Kernels Scatter + /*************************************Kernels Scatter**************************************************/ // _compute_reconstruct_v_i_face<<>> // (n_i_faces, // i_face_cells, @@ -1450,7 +1453,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - //Kernels Scatter conflict free + /*************************************Kernels Scatter conflict 
free**************************************/ // _compute_reconstruct_v_i_face_cf<<>> // (n_i_faces, // i_face_cells, @@ -1462,18 +1465,18 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - _compute_reconstruct_v_i_face_v2_cf<<>> - (n_i_faces * 3, - i_face_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal); + // _compute_reconstruct_v_i_face_v2_cf<<>> + // (n_i_faces * 3, + // i_face_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal); - //Kernels Gather + /*************************************Kernels Gather**************************************************/ // _compute_reconstruct_v_i_face_gather<<>> // ( n_cells, // pvar_d, @@ -1503,6 +1506,56 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces, // cell_i_faces_sgn); + + + /*************************************Kernels Gather registers memory************************************/ + // _compute_reconstruct_v_i_face_gather_v3<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + // _compute_reconstruct_v_i_face_gather_v4<<>> + // ( n_cells * 3 * 3, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); + + + + + /*************************************Kernels Gather shared memory***************************************/ + _compute_reconstruct_v_i_face_gather_v5<<>> + ( n_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal, + cell_cells_idx, + cell_cells, + cell_i_faces, + cell_i_faces_sgn); + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); // ----------------------------End of Kernels part 1------------------------------------------- @@ -1532,6 
+1585,9 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- + + + /*************************************Kernels Scatter**************************************************/ // _compute_reconstruct_v_b_face<<>> // ( n_b_faces, // coefb_d, @@ -1557,6 +1613,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); + /*************************************Kernels Scatter conflict free************************************/ // _compute_reconstruct_v_b_face_cf<<>> // ( n_b_faces, // coefb_d, @@ -1569,18 +1626,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); - _compute_reconstruct_v_b_face_v2_cf<<>> - ( n_b_faces * 3, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_face_cells); + // _compute_reconstruct_v_b_face_v2_cf<<>> + // ( n_b_faces * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells); + /*************************************Kernels Gather**************************************************/ // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coefb_d, @@ -1610,6 +1668,54 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces, // cell_b_faces_idx); + /*************************************Kernels Gather registers memory***************************************/ + // _compute_reconstruct_v_b_face_gather_v3<<>> + // ( m->n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + // _compute_reconstruct_v_b_face_gather_v4<<>> + // ( m->n_b_cells * 3, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // 
b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); + + + + /*************************************Kernels Gather shared memory***************************************/ + _compute_reconstruct_v_b_face_gather_v5<<>> + ( m->n_b_cells, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_cells, + cell_b_faces, + cell_b_faces_idx); + + CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); // _compute_reconstruct_correction<<>> diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh index 6432ef3ce9..b7c375e9b9 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -63,22 +63,22 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); - for (cs_lnum_t i = 0; i < 3; i++) { - pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); - - /* Reconstruction part */ - rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] - + r_grad[c_id2][i][0]) - + dofij[f_id][1]*( r_grad[c_id1][i][1] - + r_grad[c_id2][i][1]) - + dofij[f_id][2]*( r_grad[c_id1][i][2] - + r_grad[c_id2][i][2])); - - for (cs_lnum_t j = 0; j < 3; j++) { - grad[c_id1][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; - } + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_id1][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_id1][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_id1][i][2] + + r_grad[c_id2][i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + grad[c_id1][i][j] += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; } } + } } @@ -137,7 +137,6 @@ _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, for (cs_lnum_t j = 0; j < 3; j++){ grad[c_id][i][j] += (pfac + rfac) * 
b_f_face_normal[f_id][j]; } - } } -} \ No newline at end of file +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh new file mode 100644 index 0000000000..ab443b6f51 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh @@ -0,0 +1,168 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + + + +__global__ static void +_compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + auto _grad = grad[c_id1]; + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + grad[c_id1][0][0] = _grad[0][0]; grad[c_id1][0][1] = _grad[0][1]; grad[c_id1][0][2] = _grad[0][2]; + grad[c_id1][1][0] = _grad[1][0]; grad[c_id1][1][1] = _grad[1][1]; grad[c_id1][1][2] = _grad[1][2]; + grad[c_id1][2][0] = _grad[2][0]; grad[c_id1][2][1] = _grad[2][1]; grad[c_id1][2][2] = _grad[2][2]; +} + + + + +__global__ static void +_compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id]; + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < 3; i++) 
{ + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + + } + } + grad[c_id][0][0] = _grad[0][0]; grad[c_id][0][1] = _grad[0][1]; grad[c_id][0][2] = _grad[0][2]; + grad[c_id][1][0] = _grad[1][0]; grad[c_id][1][1] = _grad[1][1]; grad[c_id][1][2] = _grad[1][2]; + grad[c_id][2][0] = _grad[2][0]; grad[c_id][2][1] = _grad[2][1]; grad[c_id][2][2] = _grad[2][2]; +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh new file mode 100644 index 0000000000..906374456d --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh @@ -0,0 +1,150 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. 
+ + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + + +__global__ static void +_compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + size_t c_idx = c_id1 / (3*3); + size_t i = (c_id1 / 3) % 3; + size_t j = c_id1 % 3; + + cs_lnum_t s_id = cell_cells_idx[c_idx]; + cs_lnum_t e_id = cell_cells_idx[c_idx + 1]; + + auto _grad = grad[c_idx][i][j]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_idx] // cell weighting active + / ( pond * c_weight[c_idx] + + (1.0-pond)* c_weight[c_id2]); + + pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_idx][i]); + + /* Reconstruction part */ + rfac = 0.5 * ( dofij[f_id][0]*( r_grad[c_idx][i][0] + + r_grad[c_id2][i][0]) + + dofij[f_id][1]*( r_grad[c_idx][i][1] + + r_grad[c_id2][i][1]) + + dofij[f_id][2]*( r_grad[c_idx][i][2] + + r_grad[c_id2][i][2])); + + _grad += cell_i_faces_sgn[index] * (pfaci + rfac) * i_f_face_normal[f_id][j]; + } + grad[c_idx][i][j] = _grad; +} + + + + +__global__ static void +_compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_idx >= n_b_cells){ + return; + } + + size_t c_id1 = c_idx / 3; + size_t i = c_idx % 3; + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + auto _grad = grad[c_id][i]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + pfac = inc*coefav[f_id][i]; + + pfac += coefbv[f_id][i][0] * pvar[c_id][0] + + coefbv[f_id][i][1] * pvar[c_id][1] + + coefbv[f_id][i][2] * pvar[c_id][2]; + + pfac -= pvar[c_id][i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + + r_grad[c_id][k][1] * diipb[f_id][1] + + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + _grad[0] += (pfac + rfac) * 
b_f_face_normal[f_id][0]; + _grad[1] += (pfac + rfac) * b_f_face_normal[f_id][1]; + _grad[2] += (pfac + rfac) * b_f_face_normal[f_id][2]; + } + grad[c_id][i][0] = _grad[0]; + grad[c_id][i][1] = _grad[1]; + grad[c_id][i][2] = _grad[2]; +} diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh new file mode 100644 index 0000000000..dcb4957899 --- /dev/null +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh @@ -0,0 +1,184 @@ +/*============================================================================ + * Gradient reconstruction, CUDA implementations. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ + +/*----------------------------------------------------------------------------*/ + + + + +__global__ static void +_compute_reconstruct_v_i_face_gather_v5(cs_lnum_t n_cells, + const cs_real_3_t *pvar, + const cs_real_t *weight, + const cs_real_t *c_weight, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict dofij, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_lnum_t c_id2, f_id; + cs_real_t pond, ktpond, pfaci, pfacj, rfac; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + __shared__ cs_real_t _grad[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id1][i][j]; + } + } + + + auto _pvar1 = pvar[c_id1]; + auto _r_grad1 = r_grad[c_id1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + auto _pvar2 = pvar[c_id2]; + auto _r_grad2 = r_grad[c_id2]; + auto _dofij = dofij[f_id]; + auto _i_f_face_normal = i_f_face_normal[f_id]; + auto _cell_i_faces_sgn = cell_i_faces_sgn[index]; + + pond = (_cell_i_faces_sgn > 0) ? weight[f_id] : 1. - weight[f_id]; + ktpond = (c_weight == NULL) ? 
+ pond : // no cell weighting + pond * c_weight[c_id1] // cell weighting active + / ( pond * c_weight[c_id1] + + (1.0-pond)* c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); + + /* Reconstruction part */ + rfac = 0.5 * ( _dofij[0]*( _r_grad1[i][0] + + _r_grad2[i][0]) + + _dofij[1]*( _r_grad1[i][1] + + _r_grad2[i][1]) + + _dofij[2]*( _r_grad1[i][2] + + _r_grad2[i][2])); + + for (cs_lnum_t j = 0; j < 3; j++) { + _grad[lindex][i][j] += _cell_i_faces_sgn * (pfaci + rfac) * _i_f_face_normal[j]; + } + } + } + grad[c_id1][0][0] = _grad[lindex][0][0]; grad[c_id1][0][1] = _grad[lindex][0][1]; grad[c_id1][0][2] = _grad[lindex][0][2]; + grad[c_id1][1][0] = _grad[lindex][1][0]; grad[c_id1][1][1] = _grad[lindex][1][1]; grad[c_id1][1][2] = _grad[lindex][1][2]; + grad[c_id1][2][0] = _grad[lindex][2][0]; grad[c_id1][2][1] = _grad[lindex][2][1]; grad[c_id1][2][2] = _grad[lindex][2][2]; +} + + + + +__global__ static void +_compute_reconstruct_v_b_face_gather_v5(cs_lnum_t n_b_cells, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + __shared__ cs_real_t _grad[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] = grad[c_id][i][j]; + } + } + + auto _r_grad = r_grad[c_id]; + auto _pvar = pvar[c_id]; + + 
for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + auto _diipb = diipb[f_id]; + auto _coefav = coefav[f_id]; + auto _coefbv = coefbv[f_id]; + auto _b_f_face_normal = b_f_face_normal[f_id]; + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = inc*_coefav[i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += _coefbv[i][k] * _pvar[k]; + } + + pfac -= _pvar[i]; + + // /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = _r_grad[k][0] * _diipb[0] + + _r_grad[k][1] * _diipb[1] + + _r_grad[k][2] * _diipb[2]; + rfac += _coefbv[i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + _grad[lindex][i][j] += (pfac + rfac) * _b_f_face_normal[j]; + } + } + } + grad[c_id][0][0] = _grad[lindex][0][0]; grad[c_id][0][1] = _grad[lindex][0][1]; grad[c_id][0][2] = _grad[lindex][0][2]; + grad[c_id][1][0] = _grad[lindex][1][0]; grad[c_id][1][1] = _grad[lindex][1][1]; grad[c_id][1][2] = _grad[lindex][1][2]; + grad[c_id][2][0] = _grad[lindex][2][0]; grad[c_id][2][1] = _grad[lindex][2][1]; grad[c_id][2][2] = _grad[lindex][2][2]; +} From 0a95d5a16385728f4618fb3943e267ca6b99ba97 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 27 Nov 2023 17:25:28 +0100 Subject: [PATCH 42/70] ADD _gradient_vector kernel and refacto --- src/alge/cs_gradient.cxx | 425 +++++++++++++----- src/alge/cs_gradient_cuda.cu | 203 +++++++-- src/alge/cs_gradient_priv.h | 13 +- ...reconstruct_vector_gradient_scatter_v2.cuh | 2 - 4 files changed, 486 insertions(+), 157 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 7e4081a070..347751257c 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5557,54 +5557,56 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, std::chrono::high_resolution_clock::time_point start, stop; std::chrono::microseconds elapsed, elapsed_cuda; - - cs_real_33_t *grad_cpu; - + cs_real_33_t *grad_cpu, *grad_gpu; - - bool COMPUTE_CUDA; - bool COMPUTE_CPU; - bool RES_CPU; - 
bool PERF; - bool ACCURACY; + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; #if defined(HAVE_CUDA) - COMPUTE_CUDA = (cs_get_device_id() > -1) ? true : false; - RES_CPU = !COMPUTE_CUDA; + compute_cuda = (cs_get_device_id() > -1) ? true : false; #else - COMPUTE_CUDA = false; + compute_cuda = false; #endif +res_cpu = !compute_cuda; + #if defined(DEBUG) - COMPUTE_CPU = true; - PERF = true; - ACCURACY = true; -#elif defined(NDEBUG) && !COMPUTE_CUDA - COMPUTE_CPU = true; - RES_CPU = true; - PERF = false; - ACCURACY = false; + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; #else - COMPUTE_CPU = false; - PERF = false; - ACCURACY = false; + compute_cpu = false; + perf = false; + accuracy = false; #endif // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - COMPUTE_CUDA = true; - COMPUTE_CPU = true; - RES_CPU = false; + compute_cuda = true; + compute_cpu = true; + res_cpu = false; // A ne pas garder dans la version finale - PERF = true; - ACCURACY = true; + perf = false; + accuracy = false; - if(COMPUTE_CUDA){ - printf("reconstruct Compute with CUDA\n"); - if(PERF){ +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, cs_real_33_t); + } + if(perf){ start = std::chrono::high_resolution_clock::now(); } @@ -5619,22 +5621,26 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, pvar, c_weight, r_grad, - grad, + grad_gpu, coupled_faces, cpl_stride, cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, - PERF); - if(PERF){ + perf); + if(perf){ stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); } } +#endif - if(COMPUTE_CPU){ - printf("reconstruct Compute with CPU\n"); - BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + } else { + BFT_MALLOC(grad_cpu, 
n_cells_ext, cs_real_33_t); + } - if(PERF){ + if(perf){ start = std::chrono::high_resolution_clock::now(); } /* Initialization */ @@ -5783,47 +5789,62 @@ _reconstruct_vector_gradient(const cs_mesh_t *m, } } - if(PERF){ + if(perf){ stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); } - } /* Performances */ - if(PERF){ - printf("reconstruct Compute and tranferts time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("reconstruct Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); + } + #endif + + if(compute_cpu){ + printf("reconstruct Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); + } } /* Accuracy grad_cpu and grad_gpu */ - if(ACCURACY){ - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (int j =0; j < 3; ++j) { - auto cpu = grad_cpu[c_id][i][j]; - auto cuda = grad[c_id][i][j]; - double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); - if (err> 1e-6) { - printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j =0; j < 3; ++j) { + auto cpu = grad_cpu[c_id][i][j]; + auto cuda = grad_gpu[c_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-6) { + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } } } } - } - } - - //Copy grad - if(RES_CPU){ - printf("reconstruct RESULTS CPU\n"); - memcpy(grad, grad_cpu, sizeof(cs_real_33_t) * n_cells_ext); - }else{ - printf("reconstruct 
RESULTS GPU\n"); + #endif } - // Free memory - if(COMPUTE_CPU){ - BFT_FREE(grad_cpu); +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + } } +#endif +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + } + } /* Periodicity and parallelism treatment */ @@ -6895,26 +6916,37 @@ _lsq_vector_gradient(const cs_mesh_t *m, _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda, *gradv_cpu; - bool COMPUTE_CUDA, COMPUTE_CPU, RES_CPU, PERF, ACCURACY; + bool compute_cuda, compute_cpu, res_cpu, perf, accuracy; - COMPUTE_CUDA = accel; - RES_CPU = !accel; + compute_cuda = accel; + res_cpu = !accel; #if defined(DEBUG) - COMPUTE_CPU = true; - PERF = true; - ACCURACY = true; + compute_cpu = true; + perf = true; + accuracy = true; #elif defined(NDEBUG) - COMPUTE_CPU = true; - RES_CPU = true; - PERF = false; - ACCURACY = false; + compute_cpu = true; + res_cpu = true; + perf = false; + accuracy = false; #else - COMPUTE_CPU = false; - PERF = false; - ACCURACY = false; + compute_cpu = false; + perf = false; + accuracy = false; #endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + compute_cuda = false; + compute_cpu = true; + res_cpu = true; + + // A ne pas garder dans la version finale + perf = true; + accuracy = false; + BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); @@ -6923,28 +6955,36 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Compute Right-Hand Side */ /*-------------------------*/ #if defined(HAVE_CUDA) -if(COMPUTE_CUDA){ - start = std::chrono::high_resolution_clock::now(); - cs_lsq_vector_gradient_cuda( - m, - madj, - fvq, - halo_type, - inc, - coefav, - coefbv, - pvar, - c_weight, - cocg, - cocgb_s, - gradv, - rhs_cuda); - stop = std::chrono::high_resolution_clock::now(); - elapsed_cuda = std::chrono::duration_cast(stop - start); -} 
// end if COMPUTE_CUDA + if(compute_cuda){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + cs_lsq_vector_gradient_cuda( + m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + cocg, + cocgb_s, + gradv, + rhs_cuda); + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } // end if compute_cuda #endif -if(COMPUTE_CPU){ - start = std::chrono::high_resolution_clock::now(); + +if(compute_cpu){ + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } # pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) @@ -7093,11 +7133,14 @@ if(COMPUTE_CPU){ + rhs[c_id][i][2] * cocg[c_id][2]; } } -} // end if COMPUTE_CPU -if(ACCURACY){ - stop = std::chrono::high_resolution_clock::now(); - elapsed = std::chrono::duration_cast(stop - start); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } +} // end if compute_cpu + +if(accuracy){ #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { @@ -7113,10 +7156,10 @@ if(ACCURACY){ } } -if(PERF) +if(perf) printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); -if(RES_CPU){ +if(res_cpu){ memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); } /* Compute gradient on boundary cells */ @@ -9026,27 +9069,179 @@ _gradient_vector(const char *var_name, /* Use Neumann BC's as default if not provided */ + cs_real_3_t *_bc_coeff_a = NULL; cs_real_33_t *_bc_coeff_b = NULL; + /* Timing the computation */ + + std::chrono::high_resolution_clock::time_point start, stop; + std::chrono::microseconds elapsed, elapsed_cuda; + + cs_real_3_t *_bc_coeff_a_gpu = NULL; + cs_real_3_t *_bc_coeff_a_cpu = NULL; + cs_real_33_t *_bc_coeff_b_gpu = NULL; + cs_real_33_t *_bc_coeff_b_cpu = NULL; + + bool 
compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? true : false; +#else + compute_cuda = false; +#endif + + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + compute_cuda = true; + compute_cpu = true; + res_cpu = false; + + // A ne pas garder dans la version finale + perf = false; + accuracy = false; + +// Compute on GPU +#if defined(HAVE_CUDA) + if(compute_cuda){ + BFT_MALLOC(_bc_coeff_a_gpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_gpu, n_b_faces, cs_real_33_t); + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + _gradient_vector_cuda(mesh, _bc_coeff_a_gpu, _bc_coeff_b_gpu, perf); + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_cuda = std::chrono::duration_cast(stop - start); + } + } +#endif + +// Compute on CPU + if(compute_cpu){ + BFT_MALLOC(_bc_coeff_a_cpu, n_b_faces, cs_real_3_t); + BFT_MALLOC(_bc_coeff_b_cpu, n_b_faces, cs_real_33_t); + + if(perf){ + start = std::chrono::high_resolution_clock::now(); + } + + if (bc_coeff_a == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) + _bc_coeff_a_cpu[i][j] = 0; + } + } + if (bc_coeff_b == NULL) { + for (cs_lnum_t i = 0; i < n_b_faces; i++) { + for (cs_lnum_t j = 0; j < 3; j++) { + for (cs_lnum_t k = 0; k < 3; k++) + _bc_coeff_b_cpu[i][j][k] = 0; + _bc_coeff_b_cpu[i][j][j] = 1; + } + } + } + + if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); + } + } + +// selected the result of the computation on CPU or GPU if (bc_coeff_a == NULL) { - BFT_MALLOC(_bc_coeff_a, n_b_faces, 
cs_real_3_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) - _bc_coeff_a[i][j] = 0; + if(res_cpu){ + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_cpu; + } else { + bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a_gpu; } - bc_coeff_a = (const cs_real_3_t *)_bc_coeff_a; } if (bc_coeff_b == NULL) { - BFT_MALLOC(_bc_coeff_b, n_b_faces, cs_real_33_t); - for (cs_lnum_t i = 0; i < n_b_faces; i++) { - for (cs_lnum_t j = 0; j < 3; j++) { - for (cs_lnum_t k = 0; k < 3; k++) - _bc_coeff_b[i][j][k] = 0; - _bc_coeff_b[i][j][j] = 1; + if(res_cpu){ + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_cpu; + } else { + bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b_gpu; + } + } + + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("_gradient_vector Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda.count()); } + #endif + + if(compute_cpu){ + printf("_gradient_vector Compute and tranferts time in us: CPU = %ld\n", elapsed.count()); + } + } + + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + auto cpu = _bc_coeff_a_cpu[f_id][i]; + auto cuda = _bc_coeff_a_gpu[f_id][i]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_a DIFFERENCE @%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", f_id, i, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + for (int j =0; j < 3; ++j) { + auto cpu = _bc_coeff_b_cpu[f_id][i][j]; + auto cuda = _bc_coeff_b_gpu[f_id][i][j]; + double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("_gradient_vector_b DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", f_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + } + } + } + } + } + } + #endif + } + +// 
Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(_bc_coeff_a_gpu); + BFT_FREE(_bc_coeff_b_gpu); + } + } +#endif + +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(_bc_coeff_a_cpu); + BFT_FREE(_bc_coeff_b_cpu); } - bc_coeff_b = (const cs_real_33_t *)_bc_coeff_b; } /* Update of local BC. coefficients for internal coupling */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index d4e3544dea..2409c29eaa 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1274,7 +1274,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const bool *coupled_faces, cs_lnum_t cpl_stride, bool test_bool, - bool PERF + bool perf ) { const cs_lnum_t n_cells = m->n_cells; @@ -1542,19 +1542,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - _compute_reconstruct_v_i_face_gather_v5<<>> - ( n_cells, - pvar_d, - weight, - c_weight, - r_grad_d, - grad_d, - dofij, - i_f_face_normal, - cell_cells_idx, - cell_cells, - cell_i_faces, - cell_i_faces_sgn); + // _compute_reconstruct_v_i_face_gather_v5<<>> + // ( n_cells, + // pvar_d, + // weight, + // c_weight, + // r_grad_d, + // grad_d, + // dofij, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1701,19 +1701,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - _compute_reconstruct_v_b_face_gather_v5<<>> - ( m->n_b_cells, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_cells, - cell_b_faces, - cell_b_faces_idx); + // _compute_reconstruct_v_b_face_gather_v5<<>> + // ( m->n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // 
grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx); CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); @@ -1728,15 +1728,15 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // test_bool // ); - _compute_reconstruct_correction_v2<<>> - ( n_cells * 3, - has_dc, - c_disable_flag, - cell_f_vol, - grad_d, - corr_grad_lin, - test_bool - ); + // _compute_reconstruct_correction_v2<<>> + // ( n_cells * 3, + // has_dc, + // c_disable_flag, + // cell_f_vol, + // grad_d, + // corr_grad_lin, + // test_bool + // ); CS_CUDA_CHECK(cudaEventRecord(b_faces_3, stream)); // ----------------------------End of Kernels part 2------------------------------------------- @@ -1756,7 +1756,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cudaStreamSynchronize(stream); cudaStreamDestroy(stream); - if(PERF){ + if(perf){ printf("reconstruct Kernels times:\t"); msec = 0.0f; @@ -1824,3 +1824,132 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaFree(c_disable_flag)); CS_CUDA_CHECK(cudaFree(grad_d)); } + +/*---------------------------------------------------------------------------- + * Set the diagonal terms of each boundary face 3x3 coefficient block to 1. + * One thread per boundary face (1D launch); other terms are left untouched. + *----------------------------------------------------------------------------*/ +__global__ static void +_set_one_to_coeff_b(const cs_lnum_t n_b_faces, + cs_real_33_t *_bc_coeff_b) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + /* Index by face (not face/3), so that every face gets its unit diagonal. */ + _bc_coeff_b[f_id][0][0] = 1; + _bc_coeff_b[f_id][1][1] = 1; + _bc_coeff_b[f_id][2][2] = 1; +} + +/*---------------------------------------------------------------------------- + * Build default boundary condition coefficients on the device and copy + * them to the given arrays with cs_cuda_copy_d2h: _bc_coeff_a is + * zero-filled, each 3x3 block of _bc_coeff_b is set to identity. + * + * parameters: + * mesh <-- pointer to associated mesh structure (n_b_faces is used) + * _bc_coeff_a --> array of n_b_faces cs_real_3_t, filled with 0 
+ * _bc_coeff_b --> array of n_b_faces cs_real_33_t, set to identity blocks + * perf <-- if true, print kernel timings + * + * notes: + * Device buffers are allocated and freed at each call. + * TODO: the timing events created here are never destroyed, and the + * stream and memset return codes are not checked. + *----------------------------------------------------------------------------*/ +extern "C" void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool perf) +{ + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init1, init2, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init1)); + CS_CUDA_CHECK(cudaEventCreate(&init2)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + unsigned int gridsize_f + = (unsigned int)ceil((double)n_b_faces / blocksize); + + + cs_real_3_t *_bc_coeff_a_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_a_d, n_b_faces * sizeof(cs_real_3_t))); + cs_real_33_t *_bc_coeff_b_d; + CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_b_d, n_b_faces * sizeof(cs_real_33_t))); + + + /* Initialization */ + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); + CS_CUDA_CHECK(cudaEventRecord(init1, stream)); + cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); + + _set_one_to_coeff_b<<<gridsize_f, blocksize, 0, stream>>> + (n_b_faces, _bc_coeff_b_d); + CS_CUDA_CHECK(cudaEventRecord(init2, stream)); + + + /* Sync to host */ + if (_bc_coeff_a_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3; + cs_cuda_copy_d2h(_bc_coeff_a, _bc_coeff_a_d, size); + } + else + cs_sync_d2h(_bc_coeff_a); + /* Sync to host */ + if (_bc_coeff_b_d != NULL) { + size_t size = n_b_faces * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(_bc_coeff_b, 
_bc_coeff_b_d, size); + } + else + cs_sync_d2h(_bc_coeff_b); + + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("reconstruct Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init1)); + printf("Kernels execution time in us: \t"); + printf("Init1 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init1, init2)); + printf("Init2 = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + CS_CUDA_CHECK(cudaFree(_bc_coeff_a_d)); + CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); +} \ No newline at end of file diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 8ba24bbcff..516c3e6080 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -140,9 +140,16 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const bool *coupled_faces, cs_lnum_t cpl_stride, bool test_bool, - bool PERF - ); -#endif /* defined(HAVE_CUDA) */ + bool perf); +#endif + +void +_gradient_vector_cuda(const cs_mesh_t *mesh, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool perf); + +/* defined(HAVE_CUDA) */ /*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index 78013e518e..27f2878a3e 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -82,8 +82,6 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, __global__ static void _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, - const bool *coupled_faces, - cs_lnum_t cpl_stride, const cs_real_33_t *restrict coefbv, const cs_real_3_t *restrict coefav, const cs_real_3_t *restrict pvar, From e2af7c874ba54664b922967213081cdee9c01277 Mon Sep 17 00:00:00 2001 From: mohammed derbane <115648603+aneo-mderbane@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:45:03 +0100 Subject: [PATCH 43/70] Update src/alge/cs_gradient_cuda.cu Co-authored-by: ddiakiteaneo <127390724+ddiakiteaneo@users.noreply.github.com> --- src/alge/cs_gradient_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 2409c29eaa..1ab017e24f 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1952,4 +1952,4 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, } CS_CUDA_CHECK(cudaFree(_bc_coeff_a_d)); CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); -} \ No newline at end of file +} From 55de6003550181b23e1d2be68432b5d5a7b4d240 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 23 Nov 2023 17:38:21 +0100 Subject: [PATCH 44/70] Gradient on boundary cells --- src/alge/cs_gradient.cxx | 68 ++++- src/alge/cs_gradient_cuda.cu | 296 +++++++++++++++++++-- src/alge/cs_gradient_cuda.cuh | 16 ++ src/alge/cs_gradient_lsq_vector.cuh | 122 +++++++++ src/alge/cs_gradient_lsq_vector_gather.cuh | 93 +++++++ src/alge/cs_gradient_priv.h | 22 ++ 6 files changed, 583 insertions(+), 34 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 
347751257c..8084d0e803 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -198,7 +198,7 @@ const cs_e2n_sum_t _e2n_sum_type = CS_E2N_SUM_SCATTER; /* Strided LSQ gradient variant */ -static int _use_legacy_strided_lsq_gradient = true; +static int _use_legacy_strided_lsq_gradient = false; /*============================================================================ * Private function definitions @@ -6993,7 +6993,17 @@ if(compute_cpu){ } /* Contribution from interior faces */ - + // int num_device = omp_get_num_devices(); + // printf("OMP supported devices %d\n", num_device); + // #pragma omp target + // { + // #pragma omp teams distribute parallel for + // for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + // for (cs_lnum_t i = 0; i < 3; i++) + // for (cs_lnum_t j = 0; j < 3; j++) + // rhs[c_id][i][j] = 0.0; + // } + // } for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for @@ -7341,6 +7351,10 @@ _lsq_strided_gradient(const cs_mesh_t *m, BFT_MALLOC(rhs, n_cells_ext, grad_t); cs_array_real_fill_zero(n_cells_ext*stride*3, (cs_real_t *)rhs); + grad_t *gradv_cpu; + BFT_MALLOC(gradv_cpu, n_cells_ext*stride*3, grad_t); + + #if defined(HAVE_CUDA) bool accel = (cs_get_device_id() > -1) ? 
true : false; #else @@ -7613,30 +7627,52 @@ _lsq_strided_gradient(const cs_mesh_t *m, _math_6_inv_cramer_sym_in_place(cocg[c_id]); } /* loop on boundary cells */ + /* Compute gradient */ /*------------------*/ #pragma omp parallel for if(n_cells >= CS_THR_MIN) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < stride; i++) { - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + gradv_cpu[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + rhs[c_id][i][1] * cocg[c_id][3] + rhs[c_id][i][2] * cocg[c_id][5]; - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + gradv_cpu[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + rhs[c_id][i][1] * cocg[c_id][1] + rhs[c_id][i][2] * cocg[c_id][4]; - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + gradv_cpu[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + rhs[c_id][i][1] * cocg[c_id][4] + rhs[c_id][i][2] * cocg[c_id][2]; } } + memcpy(gradv, gradv_cpu, sizeof(cs_real_t) * n_cells_ext * stride * 3); /* Correct gradient on boundary cells */ /*------------------------------------*/ +cs_real_t c_norm, ref_norm; +// #if defined(HAVE_CUDA) + cs_lsq_vector_gradient_strided_cuda + ( + m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + cocg, + cocgb, + gradv, + rhs, + n_c_iter_max, + c_eps); +// #else #pragma omp parallel for schedule(dynamic, CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { @@ -7645,7 +7681,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; - cs_real_3_t *c_grad = gradv[c_id]; + cs_real_3_t *c_grad = gradv_cpu[c_id]; cs_real_t grad_0[stride][3], grad_i[stride][3]; @@ -7654,7 +7690,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Compute norm for convergence testing. 
*/ - cs_real_t ref_norm = 0; + ref_norm = 0; for (cs_lnum_t kk = 0; kk < stride; kk++) { for (cs_lnum_t ll = 0; ll < 3; ll++) ref_norm += cs_math_fabs(c_grad[kk][ll]); @@ -7662,7 +7698,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, /* Iterate over boundary condition contributions. */ - cs_real_t c_norm = 0; + c_norm = 0; int n_c_it; for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { @@ -7777,6 +7813,21 @@ _lsq_strided_gradient(const cs_mesh_t *m, #endif n_c_it *= -1; } +// #endif + +#pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = gradv[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } /* Optional postprocessing */ @@ -7804,6 +7855,7 @@ _lsq_strided_gradient(const cs_mesh_t *m, } BFT_FREE(rhs); + BFT_FREE(gradv_cpu); } diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 1ab017e24f..3e71d3c317 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -63,6 +63,22 @@ * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ +#define INSTANTIATE(name, stride) template void name (const cs_mesh_t *m,\ + const cs_mesh_adjacencies_t *madj,\ + const cs_mesh_quantities_t *fvq,\ + const cs_halo_type_t halo_type,\ + const int inc,\ + const cs_real_t (*restrict coefav)[stride],\ + const cs_real_t (*restrict coefbv)[stride][stride],\ + const cs_real_t (*restrict pvar)[stride],\ + const cs_real_t *restrict c_weight,\ + cs_cocg_6_t *restrict cocg,\ + cs_cocg_6_t *restrict cocgb,\ + cs_real_t (*restrict gradv)[stride][3],\ + cs_real_t (*restrict rhs)[stride][3],\ + cs_lnum_t n_c_iter_max,\ + cs_real_t c_eps) + 
template __global__ static void _compute_cocg_from_cocgb(cs_lnum_t n_b_cells, @@ -1099,33 +1115,33 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - // _compute_rhs_lsq_v_b_face_gather_stride<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> - // (m->n_b_cells, - // cell_b_faces_idx, - // cell_b_faces, - // b_cells, - // b_face_cog, - // cell_cen, - // rhs_d, - // pvar_d, - // coefb_d, - // coefa_d, - // cocg, - // cocgb, - // inc); + _compute_rhs_lsq_v_b_face_gather_stride_v2<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_cog, + cell_cen, + rhs_d, + pvar_d, + coefb_d, + coefa_d, + cocg, + cocgb, + inc); - _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> - (m->n_b_cells, - cell_b_faces_idx, - cell_b_faces, - b_cells, - b_face_normal, - rhs_d, - pvar_d, - b_dist, - coefb_d, - coefa_d, - inc); + // _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_normal, + // rhs_d, + // pvar_d, + // b_dist, + // coefb_d, + // coefa_d, + // inc); // _compute_rhs_lsq_v_b_face_v2<<n_b_cells, blocksize), blocksize, 0, stream>>> // (m->n_b_faces, @@ -1238,6 +1254,234 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, } +/*----------------------------------------------------------------------------*/ +/*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*============================================================================= + * Semi-private function definitions + *============================================================================*/ + +/*---------------------------------------------------------------------------- + * Compute cell gradient using least-squares reconstruction for non-orthogonal + * meshes (nswrgp > 1). 
+ * + * Optionally, a volume force generating a hydrostatic pressure component + * may be accounted for. + * + * cocg is computed to account for variable B.C.'s (flux). + * + * parameters: + * m <-- pointer to associated mesh structure + * madj <-- pointer to mesh adjacencies structure + * fvq <-- pointer to associated finite volume quantities + * halo_type <-- halo type (extended or not) + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. coefficients for boundary face normals + * pvar <-- variable + * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) + *----------------------------------------------------------------------------*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_i_faces = m->n_i_faces; + + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, stop; + float msec = 0.0f, msecTotal = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&i_faces)); + CS_CUDA_CHECK(cudaEventCreate(&halo)); + CS_CUDA_CHECK(cudaEventCreate(&b_faces)); + CS_CUDA_CHECK(cudaEventCreate(&gradient)); + 
CS_CUDA_CHECK(cudaEventCreate(&stop)); + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + decltype(rhs) rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_t)*stride*3)); + + + decltype(gradv) grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells * sizeof(cs_real_t)*stride*3)); + + void *_pvar_d = NULL, *_coefa_d = NULL, *_coefb_d = NULL, + *_cell_cells_idx_d = NULL; + decltype(pvar) pvar_d = NULL, coefa_d = NULL; + decltype(coefbv) coefb_d = NULL; + const cs_lnum_t *cell_cells_idx_d = NULL; + + unsigned int blocksize = 256; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_face_cells); + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->b_cells); + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->cell_cells_lst); + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces); + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)cs_get_device_ptr_const_pf(madj->cell_i_faces_sgn); + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_t *restrict cell_cells + = (const 
cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); + const cs_real_t *restrict weight + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->weight); + const cs_real_t *restrict b_dist + = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->b_dist); + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_face_normal); + const cs_real_3_t *restrict b_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_cog); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + const cs_lnum_t *restrict i_face_cells_1d + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + + _sync_or_copy_real_h2d(pvar, n_cells_ext*stride, device_id, stream, + &pvar_d, &_pvar_d); + + _sync_or_copy_real_h2d(coefav, n_b_faces*stride, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d(coefbv, n_b_faces*stride*stride, device_id, stream, + &coefb_d, &_coefb_d); + + cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * stride * 3); + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(halo, stream)); + + _compute_gradient_b_face_lsq_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_cog, + cell_cen, + diipb, + grad_d, + coefb_d, + cocg, + n_c_iter_max, + c_eps); + + CS_CUDA_CHECK(cudaEventRecord(b_faces, stream)); + + CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + + // /* Sync to host */ + if (grad_d != NULL) { + size_t size = n_cells * sizeof(cs_real_t) * 
stride * 3; + cs_cuda_copy_d2h(gradv, grad_d, size); + } + else + cs_sync_d2h(gradv); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + printf("lsq Kernels :"); + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + // printf("Kernels execution time in us: \t"); + // printf("Init = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, i_faces)); + // printf("I_faces = %f\t", msec*1000.f); + + // msec = 0.0f; + // CS_CUDA_CHECK(cudaEventElapsedTime(&msec, i_faces, halo)); + // printf("Halo = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, halo, b_faces)); + printf("B_faces = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, b_faces, gradient)); + printf("Gradient = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + printf("Total kernel = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + + + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); + +} + +INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 1); +INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 3); +INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 6); +INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 9); diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index 5b1401bfe6..ee19988414 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -79,6 +79,22 @@ #include "cs_gradient.h" #include "cs_gradient_priv.h" +__device__ cs_real_t +cs_math_fabs_cuda(cs_real_t x) +{ + 
cs_real_t ret = (x < 0) ? -x : x; + + return ret; +} + +__device__ cs_real_t +cs_math_3_dot_product_cuda(const cs_real_t u[3], + const cs_real_t v[3]) +{ + cs_real_t prod = u[0]*v[0] + u[1]*v[1] + u[2]*v[2]; + + return prod; +} __device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index 34d9d69cb1..61e8051517 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -311,3 +311,125 @@ _compute_gradient_lsq_v(cs_lnum_t n_cells, + rhs[c_id][i][2] * cocg[c_id][2]; } } + +template +__global__ static void +_compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *restrict diipb, + cs_real_t (*restrict gradv)[stride][3], + const cs_real_t (*restrict coefbv)[stride][stride], + cs_cocg_6_t *restrict cocg, + cs_lnum_t n_c_iter_max, + cs_real_t c_eps) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + auto c_grad = gradv[c_id]; + cs_real_t grad_0[3][3], grad_i[3][3], rhs_c[3][3], dif[3], grad_c[3][3], + var_ip_f[3]; + + cs_real_t ref_norm = 0.0, ddif, c_norm = 0; + cs_lnum_t n_c_it, f_id; + cs_real_t eps_dvg = 1e-2; + cs_real_t cs_math_epzero = 1e-12; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad_0[i][j] = c_grad[i][j]; + grad_i[i][j] = c_grad[i][j]; + } + } + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + for (cs_lnum_t ll = 0; ll < 3; ll++) + ref_norm += cs_math_fabs_cuda(c_grad[kk][ll]); + } + + for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { + for (cs_lnum_t ll = 0; ll < stride; ll++) { + rhs_c[ll][0] 
= 0; + rhs_c[ll][1] = 0; + rhs_c[ll][2] = 0; + } + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + for (cs_lnum_t ii = 0; ii < 3; ii++) + dif[ii] = b_face_cog[f_id][ii] - cell_cen[c_id][ii]; + + ddif = 1. / cs_math_3_square_norm_cuda(dif); + + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_ip_f[ll] = cs_math_3_dot_product_cuda(c_grad[ll], diipb[f_id]); + } + + const cs_real_t *b = ((const cs_real_t *)coefbv) + + (f_id*stride*stride); + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + cs_real_t pfac = 0; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + pfac += b[kk*3 + ll] * var_ip_f[ll] * ddif; + } + + for (cs_lnum_t ll = 0; ll < 3; ll++) + rhs_c[kk][ll] += dif[ll] * pfac; + } + + } + + for(cs_lnum_t i = 0; i < stride; i++){ + grad_c[i][0] = rhs_c[i][0] * cocg[c_id][0] + + rhs_c[i][1] * cocg[c_id][3] + + rhs_c[i][2] * cocg[c_id][5]; + + grad_c[i][1] = rhs_c[i][0] * cocg[c_id][3] + + rhs_c[i][1] * cocg[c_id][1] + + rhs_c[i][2] * cocg[c_id][4]; + + grad_c[i][2] = rhs_c[i][0] * cocg[c_id][5] + + rhs_c[i][1] * cocg[c_id][4] + + rhs_c[i][2] * cocg[c_id][2]; + } + + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + c_grad[ii][jj] = grad_0[ii][jj] + grad_c[ii][jj]; + c_norm += cs_math_fabs_cuda(c_grad[ii][jj] - grad_i[ii][jj]); + grad_i[ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm < ref_norm * c_eps || c_norm < cs_math_epzero) + break; + } + + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + gradv[c_id][ii][jj] = c_grad[ii][jj]; + } + } + + if (c_norm > eps_dvg * ref_norm) { + for (cs_lnum_t ii = 0; ii < stride; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) { + c_grad[ii][jj] = grad_0[ii][jj]; + } + } + + n_c_it *= -1; + } +} \ No newline at end of file diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index 2172a78691..a42b66dbe4 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ 
b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -199,3 +199,96 @@ _compute_rhs_lsq_v_b_face_gather_stride(cs_lnum_t n_b_cells, } _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); } + + +template +__global__ static void +_compute_rhs_lsq_v_b_face_gather_stride_v2(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict b_cells, + const cs_real_3_t *restrict b_face_cog, + const cs_real_3_t *restrict cell_cen, + cs_real_33_t *restrict rhs, + const val_t *restrict pvar, + const coefb_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_cocg_6_t *restrict cocg, + const cs_cocg_6_t *restrict cocgb, + const int inc) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t lindex = threadIdx.x; + + if(c_idx >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t f_id; + cs_real_t dif[stride], ddif, pfac, norm, var_f[stride]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t ll = 0; ll < 6; ll++) + cocg[c_id][ll] = cocgb[c_idx][ll]; + + __shared__ cs_real_t _rhs[256][3][3]; + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[lindex][i][j] = rhs[c_id][i][j]; + } + } + + auto _pvar = pvar[c_id]; + auto _cocg = cocg[c_id]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + auto _coefbv = coefbv[f_id]; + auto _coefav = coefav[f_id]; + + + for (cs_lnum_t ll = 0; ll < 3; ll++) + dif[ll] = b_face_cog[f_id][ll] - cell_cen[c_id][ll]; + + ddif = 1. 
/ cs_math_3_square_norm_cuda(dif); + + _cocg[0] += dif[0]*dif[0]*ddif; + _cocg[1] += dif[1]*dif[1]*ddif; + _cocg[2] += dif[2]*dif[2]*ddif; + _cocg[3] += dif[0]*dif[1]*ddif; + _cocg[4] += dif[1]*dif[2]*ddif; + _cocg[5] += dif[0]*dif[2]*ddif; + + for (cs_lnum_t kk = 0; kk < stride; kk++) { + var_f[kk] = _coefav[kk]*inc; + for (cs_lnum_t ll = 0; ll < stride; ll++) { + var_f[kk] += _coefbv[ll][kk] * _pvar[ll]; + } + + pfac = (var_f[kk] - _pvar[kk]) * ddif; + + for (cs_lnum_t ll = 0; ll < 3; ll++) + _rhs[lindex][kk][ll] += dif[ll] * pfac; + } + } + + cocg[c_id][0] += _cocg[0]; + cocg[c_id][1] += _cocg[1]; + cocg[c_id][2] += _cocg[2]; + cocg[c_id][3] += _cocg[3]; + cocg[c_id][4] += _cocg[4]; + cocg[c_id][5] += _cocg[5]; + + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[lindex][i][j]; + } + } + // _math_6_inv_cramer_sym_in_place_cuda(cocg[c_id]); +} diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 516c3e6080..17dc7c091f 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -156,4 +156,26 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, /*----------------------------------------------------------------------------*/ END_C_DECLS +#ifdef __cplusplus +/** + * This template will be instantited with stride = 1, 3, 6, 9 +*/ +template +void +cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict pvar)[stride], + const cs_real_t *restrict c_weight, + cs_cocg_6_t *restrict cocg, + cs_cocg_6_t *restrict cocgb, + cs_real_t (*restrict gradv)[stride][3], + cs_real_t (*restrict rhs)[stride][3], + cs_lnum_t n_c_iter_max, + cs_real_t c_eps); +#endif #endif /* __CS_GRADIENT_CUDA_H__ */ From e1f0a8559217586f92b0df716a9909466cb61041 Mon Sep 
17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 29 Nov 2023 13:51:40 +0100 Subject: [PATCH 45/70] Gradient on boundary cells comparison --- src/alge/cs_gradient.cxx | 108 +++++----- src/alge/cs_gradient_cuda.cu | 109 ++++++---- src/alge/cs_gradient_cuda.cuh | 50 ++++- src/alge/cs_gradient_lsq_vector.cuh | 195 ++++++++++++++++-- src/alge/cs_gradient_lsq_vector_gather.cuh | 2 +- src/alge/cs_gradient_lsq_vector_gather_v2.cuh | 2 +- src/alge/cs_gradient_lsq_vector_gather_v3.cuh | 2 +- src/alge/cs_gradient_lsq_vector_v2.cuh | 2 +- 8 files changed, 354 insertions(+), 116 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 8084d0e803..f47259c911 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6936,21 +6936,10 @@ _lsq_vector_gradient(const cs_mesh_t *m, accuracy = false; #endif - - // Pour l'instant ces lignes sont pour moi - // Elles seront à enlever - compute_cuda = false; - compute_cpu = true; - res_cpu = true; - - // A ne pas garder dans la version finale - perf = true; - accuracy = false; - - BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); - BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); - BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); - BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ @@ -7143,35 +7132,6 @@ if(compute_cpu){ + rhs[c_id][i][2] * cocg[c_id][2]; } } - - if(perf){ - stop = std::chrono::high_resolution_clock::now(); - elapsed = std::chrono::duration_cast(stop - start); - } -} // end if compute_cpu - -if(accuracy){ - #pragma omp parallel for - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (int j = 0; j < 3; ++j) { - auto cpu = gradv_cpu[c_id][i][j]; - auto cuda = gradv[c_id][i][j]; - - if (fabs(cpu 
- cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); - } - } - } - } -} - -if(perf) - printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); - -if(res_cpu){ - memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); -} /* Compute gradient on boundary cells */ /*------------------------------------*/ @@ -7224,12 +7184,38 @@ if(res_cpu){ for (int kk = 0; kk < 9; kk++) { int ii = _33_9_idx[kk][0]; int jj = _33_9_idx[kk][1]; - gradv[c_id][ii][jj] = x[kk]; + gradv_cpu[c_id][ii][jj] = x[kk]; } } } + stop = std::chrono::high_resolution_clock::now(); + elapsed = std::chrono::duration_cast(stop - start); +} // end if COMPUTE_CPU + +if(accuracy){ + #pragma omp parallel for + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto cuda = gradv[c_id][i][j]; + + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + } + } + } + } +} + +if(perf) + printf("lsq Compute time in us: CPU = %ld\tCUDA = %ld\n", elapsed.count(), elapsed_cuda.count()); + +if(res_cpu){ + memcpy(gradv, gradv_cpu, sizeof(cs_real_33_t) * n_cells_ext); +} /* Periodicity and parallelism treatment */ @@ -7649,6 +7635,14 @@ _lsq_strided_gradient(const cs_mesh_t *m, } } memcpy(gradv, gradv_cpu, sizeof(cs_real_t) * n_cells_ext * stride * 3); + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < stride; i++) { + for (int j = 0; j < 3; ++j) { + if(fabs(gradv[c_id][i][j]-gradv_cpu[c_id][i][j]) != 0.0) + printf("grad = %f\t", gradv[c_id][i][j]); + } + } + } /* Correct gradient on boundary cells */ 
/*------------------------------------*/ @@ -7815,7 +7809,17 @@ cs_real_t c_norm, ref_norm; } // #endif -#pragma omp parallel for + /* Optional postprocessing */ + + if (b_iter_count != NULL) { + for (cs_lnum_t i = s_id; i < e_id; i++) { + cs_lnum_t f_id = cell_b_faces[i]; + b_iter_count[f_id] = n_c_it; + } + } + + } /* End of correction for BC coeffs */ + #pragma omp parallel for for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { for (int j = 0; j < 3; ++j) { @@ -7823,22 +7827,12 @@ cs_real_t c_norm, ref_norm; auto cuda = gradv[c_id][i][j]; if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + // printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); } } } } - /* Optional postprocessing */ - - if (b_iter_count != NULL) { - for (cs_lnum_t i = s_id; i < e_id; i++) { - cs_lnum_t f_id = cell_b_faces[i]; - b_iter_count[f_id] = n_c_it; - } - } - - } /* End of correction for BC coeffs */ /* Periodicity and parallelism treatment */ diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 3e71d3c317..67af5cf6b0 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -900,7 +900,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, cudaStream_t stream; cudaStreamCreate(&stream); - cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, stop; + cudaEvent_t start, mem_h2d, init, i_faces, halo, b_faces, gradient, gradient_b, stop; float msec = 0.0f, msecTotal = 0.0f; CS_CUDA_CHECK(cudaEventCreate(&start)); CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); @@ -909,6 +909,7 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventCreate(&halo)); CS_CUDA_CHECK(cudaEventCreate(&b_faces)); 
CS_CUDA_CHECK(cudaEventCreate(&gradient)); + CS_CUDA_CHECK(cudaEventCreate(&gradient_b)); CS_CUDA_CHECK(cudaEventCreate(&stop)); // Record the start event @@ -971,8 +972,8 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_t *restrict cell_f_cen_1d = (const cs_real_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_f_cen); - const cs_lnum_t *restrict i_face_cells_1d - = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); cs_lnum_t stride = 3; @@ -1115,33 +1116,33 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, // coefa_d, // inc); - _compute_rhs_lsq_v_b_face_gather_stride_v2<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> - (m->n_b_cells, - cell_b_faces_idx, - cell_b_faces, - b_cells, - b_face_cog, - cell_cen, - rhs_d, - pvar_d, - coefb_d, - coefa_d, - cocg, - cocgb, - inc); + // _compute_rhs_lsq_v_b_face_gather_stride_v2<3, cs_real_3_t, cs_real_33_t><<n_b_cells, blocksize), blocksize, 0, stream>>> + // (m->n_b_cells, + // cell_b_faces_idx, + // cell_b_faces, + // b_cells, + // b_face_cog, + // cell_cen, + // rhs_d, + // pvar_d, + // coefb_d, + // coefa_d, + // cocg, + // cocgb, + // inc); - // _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> - // (m->n_b_cells, - // cell_b_faces_idx, - // cell_b_faces, - // b_cells, - // b_face_normal, - // rhs_d, - // pvar_d, - // b_dist, - // coefb_d, - // coefa_d, - // inc); + _compute_rhs_lsq_v_b_face_gather_v3<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + cell_b_faces_idx, + cell_b_faces, + b_cells, + b_face_normal, + rhs_d, + pvar_d, + b_dist, + coefb_d, + coefa_d, + inc); // _compute_rhs_lsq_v_b_face_v2<<n_b_cells, blocksize), blocksize, 0, stream>>> // (m->n_b_faces, @@ -1195,6 +1196,24 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(gradient, stream)); + 
_compute_gradient_lsq_b_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + (m->n_b_cells, + b_cells, + cell_b_faces_idx, + cell_b_faces, + b_face_normal, + diipb, + pvar_d, + b_dist, + coefb_d, + coefa_d, + grad_d, + rhs_d, + cocgb, + inc); + + CS_CUDA_CHECK(cudaEventRecord(gradient_b, stream)); + // /* Sync to host */ if (grad_d != NULL) { size_t size = n_cells * sizeof(cs_real_t) * 3 * 3; @@ -1232,7 +1251,11 @@ cs_lsq_vector_gradient_cuda(const cs_mesh_t *m, printf("Gradient = %f\t", msec*1000.f); msec = 0.0f; - CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient)); + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, gradient, gradient_b)); + printf("Gradient_b = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, gradient_b)); printf("Total kernel = %f\t", msec*1000.f); msec = 0.0f; @@ -1380,9 +1403,6 @@ cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict diipb = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); - const cs_lnum_t *restrict i_face_cells_1d - = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); - _sync_or_copy_real_h2d(pvar, n_cells_ext*stride, device_id, stream, &pvar_d, &_pvar_d); @@ -1391,7 +1411,7 @@ cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, _sync_or_copy_real_h2d(coefbv, n_b_faces*stride*stride, device_id, stream, &coefb_d, &_coefb_d); - cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * stride * 3); + cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * n_cells * stride * 3); CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); @@ -1401,7 +1421,26 @@ cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(halo, stream)); - _compute_gradient_b_face_lsq_v<<n_b_cells, blocksize), blocksize, 0, stream>>> + // assert(b_cells); + // assert(cell_b_faces_idx); + // assert(cell_b_faces); + // assert(b_face_cog); + // assert(cell_cen); + // assert(diipb); + // assert(grad_d); + // assert(coefb_d); + // 
assert(cocg); + + // for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + // for (cs_lnum_t i = 0; i < stride; i++) { + // for (int j = 0; j < 3; ++j) { + // // if(fabs(gradv[c_id][i][j]) != 0.0) + // // printf("grad = %f\t", gradv[c_id][i][j]); + // } + // } + // } + + _compute_gradient_lsq_b_strided_v<<n_b_cells, blocksize), blocksize, 0, stream>>> (m->n_b_cells, b_cells, cell_b_faces_idx, diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index ee19988414..0ecc6f9bd8 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -97,7 +97,7 @@ cs_math_3_dot_product_cuda(const cs_real_t u[3], } -__device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], +__device__ void cs_math_3_normalize_cuda(const cs_real_t in[3], cs_real_t out[3]) { cs_real_t norm = sqrt(in[0]*in[0] @@ -134,6 +134,54 @@ __device__ void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ in[5] = in02 * det_inv; } +template +__device__ void +_fact_crout_pp_cuda(cs_real_t *ad) +{ + cs_real_t aux[d_size]; + for (int kk = 0; kk < d_size - 1; kk++) { + int kk_d_size = kk*(kk + 1)/2; + for (int ii = kk + 1; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = ad[ii_d_size + kk]; + ad[ii_d_size + kk] = ad[ii_d_size + kk] + / ad[kk_d_size + kk]; + for (int jj = kk + 1; jj < ii + 1; jj++) { + ad[ii_d_size + jj] = ad[ii_d_size + jj] - ad[ii_d_size + kk]*aux[jj]; + } + } + } +} + +template +__device__ void +_fw_and_bw_ldtl_pp_cuda(const cs_real_t mat[], + cs_real_t x[], + const cs_real_t b[]) +{ + cs_real_t aux[d_size]; + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] = b[ii]; + for (int jj = 0; jj < ii; jj++) { + aux[ii] -= aux[jj]*mat[ii_d_size + jj]; + } + } + + for (int ii = 0; ii < d_size; ii++) { + int ii_d_size = ii*(ii + 1)/2; + aux[ii] /= mat[ii_d_size + ii]; + } + + for (int ii = d_size - 1; ii >= 0; ii--) { + x[ii] = aux[ii]; + for (int jj = d_size - 1; jj > ii; jj--) { + int jj_d_size = jj*(jj 
+ 1)/2; + x[ii] -= x[jj]*mat[jj_d_size + ii]; + } + } +} template __device__ uint32_t _conflict_mask(uint32_t mask, V v) noexcept { diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index 61e8051517..633e14e31a 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -265,7 +265,7 @@ _compute_rhs_lsq_v_b_face(cs_lnum_t n_b_faces, c_id1 = b_face_cells[f_id]; - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. / b_dist[f_id]; @@ -312,9 +312,162 @@ _compute_gradient_lsq_v(cs_lnum_t n_cells, } } +__global__ static void +_compute_gradient_lsq_b_v(cs_lnum_t n_b_cells, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_lnum_t *restrict cell_b_faces, + const cs_real_3_t *restrict b_face_normal, + const cs_real_3_t *restrict diipb, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict b_dist, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + cs_real_33_t *restrict gradv, + cs_real_33_t *restrict rhs, + cs_cocg_6_t *restrict cocgb_s, + const int inc) +{ + size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (c_idx >= n_b_cells) + return; + + cs_lnum_t c_id = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + cs_real_3_t normal; + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + auto _cocg = cocgb_s[c_idx]; + auto _rhs = rhs[c_id]; + + cocgb[0][0] = _cocg[0]; + cocgb[0][1] = _cocg[3]; + cocgb[0][2] = _cocg[5]; + cocgb[1][0] = _cocg[3]; + cocgb[1][1] = _cocg[1]; + cocgb[1][2] = _cocg[4]; + cocgb[2][0] = _cocg[5]; + cocgb[2][1] = _cocg[4]; + cocgb[2][2] = _cocg[2]; + + 
for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], normal); + for (cs_lnum_t ii = 0; ii < 3; ii++) { + for (cs_lnum_t jj = 0; jj < 3; jj++) + cocgb[ii][jj] += normal[ii] * normal[jj]; + } + } + + for (int ll = 0; ll < 9; ll++) { + + int ll_9 = ll*(ll+1)/2; + + for (int mm = 0; mm <= ll; mm++) { + cocgb_v[ll_9+mm] = 0.; + + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int rr = _33_9_idx[mm][0]; + int ss = _33_9_idx[mm][1]; + + if (pp == rr) + cocgb_v[ll_9+mm] = cocgb[qq][ss]; + + rhsb_v[ll] = _rhs[pp][qq]; + } + } + + cs_real_3_t nb; + cs_real_t a[3], bt[3][3], db, db2; + for (cs_lnum_t i = s_id; i < e_id; i++) { + + f_id = cell_b_faces[i]; + + auto iipbf = diipb[f_id]; + + cs_math_3_normalize_cuda(b_face_normal[f_id], nb); + + db = 1./b_dist[f_id]; + db2 = db*db; + + for (int ll = 0; ll < 3; ll++) { + for (int pp = 0; pp < 3; pp++) + bt[ll][pp] = coefbv[f_id][ll][pp]; + } + for (int ll = 0; ll < 3; ll++) { + a[ll] = inc*coefav[f_id][ll]; + bt[ll][ll] -= 1; + } + + for (int ll = 0; ll < 9; ll++) { + + int kk = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + int ll_9 = ll*(ll+1)/2; + for (int pp = 0; pp <= ll; pp++) { + + int rr = _33_9_idx[pp][0]; + int ss = _33_9_idx[pp][1]; + + cs_real_t cocgv = 0.; + for (int mm = 0; mm < 3; mm++) + cocgv += bt[mm][kk]*bt[mm][rr]; + cocgb_v[ll_9+pp] += cocgv*(iipbf[qq]*iipbf[ss])*db2; + + cocgb_v[ll_9+pp] -= ( nb[ss]*bt[rr][kk]*iipbf[qq] + + nb[qq]*bt[kk][rr]*iipbf[ss]) + *db; + } + } + + for (int ll = 0; ll < 9; ll++) { + int pp = _33_9_idx[ll][0]; + int qq = _33_9_idx[ll][1]; + + cs_real_t rhsv = 0.; + for (int rr = 0; rr < 3; rr++) { + rhsv += bt[rr][pp]*diipb[f_id][qq] + *(a[rr]+ bt[rr][0]*pvar[c_id][0] + + bt[rr][1]*pvar[c_id][1] + + bt[rr][2]*pvar[c_id][2]); + } + + rhsb_v[ll] -= rhsv*db2; + } + + } + _fact_crout_pp_cuda<9>(cocgb_v); + + _fw_and_bw_ldtl_pp_cuda<9>(cocgb_v, x, rhsb_v); + + for (int kk = 0; kk < 9; kk++) { + int 
ii = _33_9_idx[kk][0]; + int jj = _33_9_idx[kk][1]; + gradv[c_id][ii][jj] = x[kk]; + } +} + template __global__ static void -_compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, +_compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, const cs_lnum_t *restrict b_cells, const cs_lnum_t *restrict cell_b_faces_idx, const cs_lnum_t *restrict cell_b_faces, @@ -330,15 +483,15 @@ _compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, size_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; if (c_idx >= n_b_cells) return; - + cs_lnum_t c_id = b_cells[c_idx]; cs_lnum_t s_id = cell_b_faces_idx[c_id]; cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; auto c_grad = gradv[c_id]; - cs_real_t grad_0[3][3], grad_i[3][3], rhs_c[3][3], dif[3], grad_c[3][3], - var_ip_f[3]; + cs_real_t grad_0[stride][3], grad_i[stride][3], rhs_c[stride][3], dif[3], grad_c[stride][3], + var_ip_f[stride]; cs_real_t ref_norm = 0.0, ddif, c_norm = 0; cs_lnum_t n_c_it, f_id; @@ -352,12 +505,16 @@ _compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, } } + ref_norm = 0; for (cs_lnum_t kk = 0; kk < stride; kk++) { for (cs_lnum_t ll = 0; ll < 3; ll++) ref_norm += cs_math_fabs_cuda(c_grad[kk][ll]); } + c_norm = 0; + for (n_c_it = 0; n_c_it < n_c_iter_max; n_c_it++) { + for (cs_lnum_t ll = 0; ll < stride; ll++) { rhs_c[ll][0] = 0; rhs_c[ll][1] = 0; @@ -376,13 +533,12 @@ _compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, var_ip_f[ll] = cs_math_3_dot_product_cuda(c_grad[ll], diipb[f_id]); } - const cs_real_t *b = ((const cs_real_t *)coefbv) - + (f_id*stride*stride); + auto b = coefbv[f_id]; for (cs_lnum_t kk = 0; kk < stride; kk++) { cs_real_t pfac = 0; for (cs_lnum_t ll = 0; ll < stride; ll++) { - pfac += b[kk*3 + ll] * var_ip_f[ll] * ddif; + pfac += b[kk][ll] * var_ip_f[ll] * ddif; } for (cs_lnum_t ll = 0; ll < 3; ll++) @@ -392,19 +548,20 @@ _compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, } for(cs_lnum_t i = 0; i < stride; i++){ - grad_c[i][0] = rhs_c[i][0] * cocg[c_id][0] - + 
rhs_c[i][1] * cocg[c_id][3] - + rhs_c[i][2] * cocg[c_id][5]; + grad_c[i][0] = rhs_c[i][0] * cocg[c_id][0] + + rhs_c[i][1] * cocg[c_id][3] + + rhs_c[i][2] * cocg[c_id][5]; - grad_c[i][1] = rhs_c[i][0] * cocg[c_id][3] - + rhs_c[i][1] * cocg[c_id][1] - + rhs_c[i][2] * cocg[c_id][4]; + grad_c[i][1] = rhs_c[i][0] * cocg[c_id][3] + + rhs_c[i][1] * cocg[c_id][1] + + rhs_c[i][2] * cocg[c_id][4]; - grad_c[i][2] = rhs_c[i][0] * cocg[c_id][5] - + rhs_c[i][1] * cocg[c_id][4] - + rhs_c[i][2] * cocg[c_id][2]; + grad_c[i][2] = rhs_c[i][0] * cocg[c_id][5] + + rhs_c[i][1] * cocg[c_id][4] + + rhs_c[i][2] * cocg[c_id][2]; } + c_norm = 0.0; for (cs_lnum_t ii = 0; ii < stride; ii++) { for (cs_lnum_t jj = 0; jj < 3; jj++) { c_grad[ii][jj] = grad_0[ii][jj] + grad_c[ii][jj]; @@ -426,10 +583,10 @@ _compute_gradient_b_face_lsq_v(const cs_lnum_t n_b_cells, if (c_norm > eps_dvg * ref_norm) { for (cs_lnum_t ii = 0; ii < stride; ii++) { for (cs_lnum_t jj = 0; jj < 3; jj++) { - c_grad[ii][jj] = grad_0[ii][jj]; + gradv[c_id][ii][jj] = grad_0[ii][jj]; } } n_c_it *= -1; } -} \ No newline at end of file +} diff --git a/src/alge/cs_gradient_lsq_vector_gather.cuh b/src/alge/cs_gradient_lsq_vector_gather.cuh index a42b66dbe4..586764e259 100644 --- a/src/alge/cs_gradient_lsq_vector_gather.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather.cuh @@ -113,7 +113,7 @@ _compute_rhs_lsq_v_b_face_gather(cs_lnum_t n_b_cells, f_id = cell_b_faces[index]; - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. 
/ b_dist[f_id]; diff --git a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh index 0bfeca2461..cdb831140c 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v2.cuh @@ -140,7 +140,7 @@ _compute_rhs_lsq_v_b_face_gather_v2(cs_lnum_t n_b_cells, auto _coefav = coefav[f_id]; auto _coefbv = coefbv[f_id]; - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. / b_dist[f_id]; diff --git a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh index 8c6a6efe17..37ce174c8d 100644 --- a/src/alge/cs_gradient_lsq_vector_gather_v3.cuh +++ b/src/alge/cs_gradient_lsq_vector_gather_v3.cuh @@ -233,7 +233,7 @@ _compute_rhs_lsq_v_b_face_gather_v3(cs_lnum_t n_b_cells, auto _coefav = coefav[f_id]; auto _coefbv = coefbv[f_id]; - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. / b_dist[f_id]; diff --git a/src/alge/cs_gradient_lsq_vector_v2.cuh b/src/alge/cs_gradient_lsq_vector_v2.cuh index 1c9d222a41..a342fd67f7 100644 --- a/src/alge/cs_gradient_lsq_vector_v2.cuh +++ b/src/alge/cs_gradient_lsq_vector_v2.cuh @@ -171,7 +171,7 @@ _compute_rhs_lsq_v_b_face_v2(cs_lnum_t n_b_faces, c_id1 = b_face_cells[f_id]; - cs_math_3_normalise_cuda(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize_cuda(b_face_normal[f_id], n_d_dist); d_b_dist = 1. 
/ b_dist[f_id]; From b1ce00d97a0f2648016d26e8f7381ea3f4da839c Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 30 Nov 2023 15:37:41 +0100 Subject: [PATCH 46/70] Small change in kernel --- src/alge/cs_gradient.cxx | 12 ++---------- src/alge/cs_gradient_lsq_vector.cuh | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index f47259c911..5385685d94 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -7635,14 +7635,6 @@ _lsq_strided_gradient(const cs_mesh_t *m, } } memcpy(gradv, gradv_cpu, sizeof(cs_real_t) * n_cells_ext * stride * 3); - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < stride; i++) { - for (int j = 0; j < 3; ++j) { - if(fabs(gradv[c_id][i][j]-gradv_cpu[c_id][i][j]) != 0.0) - printf("grad = %f\t", gradv[c_id][i][j]); - } - } - } /* Correct gradient on boundary cells */ /*------------------------------------*/ @@ -7826,8 +7818,8 @@ cs_real_t c_norm, ref_norm; auto cpu = gradv_cpu[c_id][i][j]; auto cuda = gradv[c_id][i][j]; - if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { - // printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-6) { + printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); } } } diff --git a/src/alge/cs_gradient_lsq_vector.cuh b/src/alge/cs_gradient_lsq_vector.cuh index 633e14e31a..0ecacd1d3d 100644 --- a/src/alge/cs_gradient_lsq_vector.cuh +++ b/src/alge/cs_gradient_lsq_vector.cuh @@ -490,6 +490,9 @@ _compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; auto c_grad = gradv[c_id]; + auto _cocg = cocg[c_id]; + auto _cell_cen = cell_cen[c_id]; + cs_real_t grad_0[stride][3], 
grad_i[stride][3], rhs_c[stride][3], dif[3], grad_c[stride][3], var_ip_f[stride]; @@ -525,7 +528,7 @@ _compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, f_id = cell_b_faces[index]; for (cs_lnum_t ii = 0; ii < 3; ii++) - dif[ii] = b_face_cog[f_id][ii] - cell_cen[c_id][ii]; + dif[ii] = b_face_cog[f_id][ii] - _cell_cen[ii]; ddif = 1. / cs_math_3_square_norm_cuda(dif); @@ -548,17 +551,17 @@ _compute_gradient_lsq_b_strided_v(const cs_lnum_t n_b_cells, } for(cs_lnum_t i = 0; i < stride; i++){ - grad_c[i][0] = rhs_c[i][0] * cocg[c_id][0] - + rhs_c[i][1] * cocg[c_id][3] - + rhs_c[i][2] * cocg[c_id][5]; + grad_c[i][0] = rhs_c[i][0] * _cocg[0] + + rhs_c[i][1] * _cocg[3] + + rhs_c[i][2] * _cocg[5]; - grad_c[i][1] = rhs_c[i][0] * cocg[c_id][3] - + rhs_c[i][1] * cocg[c_id][1] - + rhs_c[i][2] * cocg[c_id][4]; + grad_c[i][1] = rhs_c[i][0] * _cocg[3] + + rhs_c[i][1] * _cocg[1] + + rhs_c[i][2] * _cocg[4]; - grad_c[i][2] = rhs_c[i][0] * cocg[c_id][5] - + rhs_c[i][1] * cocg[c_id][4] - + rhs_c[i][2] * cocg[c_id][2]; + grad_c[i][2] = rhs_c[i][0] * _cocg[5] + + rhs_c[i][1] * _cocg[4] + + rhs_c[i][2] * _cocg[2]; } c_norm = 0.0; From 9429240461d4db3da24e594c2dfdd8be4cad8df2 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Thu, 7 Dec 2023 15:49:06 +0100 Subject: [PATCH 47/70] ADD Kernels convection_diffusion_vector AND Refacto _gradient_vector --- src/alge/Makefile.am | 1 + src/alge/cs_convection_diffusion.c | 321 ++++++++++++++++++---- src/alge/cs_convection_diffusion_cuda.cu | 289 +++++++++++++++++++ src/alge/cs_convection_diffusion_cuda.cuh | 113 ++++++++ src/alge/cs_convection_diffusion_priv.h | 93 +++++++ src/alge/cs_gradient.cxx | 6 +- src/alge/cs_gradient_cuda.cu | 78 ++---- src/alge/cs_gradient_cuda.cuh | 15 + src/alge/cs_gradient_priv.h | 3 +- 9 files changed, 798 insertions(+), 121 deletions(-) create mode 100644 src/alge/cs_convection_diffusion_cuda.cu create mode 100644 src/alge/cs_convection_diffusion_cuda.cuh create mode 100644 
src/alge/cs_convection_diffusion_priv.h diff --git a/src/alge/Makefile.am b/src/alge/Makefile.am index f31278797e..25526ee402 100644 --- a/src/alge/Makefile.am +++ b/src/alge/Makefile.am @@ -193,6 +193,7 @@ libcsalge_a_SOURCES += \ cs_benchmark_cuda.cu \ cs_blas_cuda.cu \ cs_gradient_cuda.cu \ +cs_convection_diffusion_cuda.cu \ cs_matrix_spmv_cuda.cu \ cs_sles_it_cuda.cu \ cs_sles_pc_cuda.cu diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 39cb4b56ff..6a3534396e 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -74,8 +74,9 @@ /*---------------------------------------------------------------------------- * Header for the current file *----------------------------------------------------------------------------*/ - +#include "time.h" #include "cs_convection_diffusion.h" +#include "cs_convection_diffusion_priv.h" /*----------------------------------------------------------------------------*/ @@ -4369,88 +4370,286 @@ cs_convection_diffusion_vector(int idtvar, - when we have convection, we are not in pure upwind and we have not shunted the slope test. 
*/ - if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 - || ( iconvp != 0 && iupwin == 0 - && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { - if (f_id != -1) { - /* Get the calculation option from the field */ - if (f->type & CS_FIELD_VARIABLE && var_cal_opt.iwgrec == 1) { - if (var_cal_opt.idiff > 0) { - int key_id = cs_field_key_id("gradient_weighting_id"); - int diff_id = cs_field_get_key_int(f, key_id); - if (diff_id > -1) { - cs_field_t *weight_f = cs_field_by_id(diff_id); - gweight = weight_f->val; - cs_field_synchronize(weight_f, halo_type); + + /* Timing the computation */ + + clock_t start, stop; + double elapsed, elapsed_cuda; + + cs_real_3_t *rhs_cpu, *rhs_gpu; + cs_real_33_t *grad_cpu, *grad_gpu; + cs_real_33_t *grdpa_cpu, *grdpa_gpu; + + bool compute_cuda; + bool compute_cpu; + bool res_cpu; + bool perf; + bool accuracy; + +#if defined(HAVE_CUDA) + compute_cuda = (cs_get_device_id() > -1) ? true : false; +#else + compute_cuda = false; +#endif + +res_cpu = !compute_cuda; + +#if defined(DEBUG) + compute_cpu = true; + perf = true; + accuracy = true; +#elif defined(NDEBUG) + compute_cpu = true; + perf = false; + accuracy = false; +#else + compute_cpu = false; + perf = false; + accuracy = false; +#endif + + + // Pour l'instant ces lignes sont pour moi + // Elles seront à enlever + compute_cuda = false; + compute_cpu = true; + res_cpu = true; + + // A ne pas garder dans la version finale + perf = true; + accuracy = true; + +printf("je passe dans convection_diffusion\n"); +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(!res_cpu){ + grad_gpu = grad; + grdpa_gpu = grdpa; + rhs_gpu = rhs; + } else { + BFT_MALLOC(grad_gpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_gpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(rhs_gpu, n_cells_ext, cs_real_3_t); + } + if(perf){ + start = clock(); + } + + bool flag1 = ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))); + + if (flag1) { + + if 
(f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && var_cal_opt.iwgrec == 1) { + if (var_cal_opt.idiff > 0) { + int key_id = cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } } } } - } - cs_gradient_vector_synced_input(var_name, - gradient_type, - halo_type, - inc, - nswrgp, - iwarnp, - imligp, - epsrgp, - climgp, - coefav, - coefbv, - _pvar, - gweight, /* weighted gradient */ - cpl, - grad); + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_cpu); + } + + bool flag2 = (iconvp > 0 && iupwin == 0 && isstpp == 0); + cs_convection_diffusion_vector_cuda(m, + fvq, + _pvar, + i_massflux, + grad_gpu, + grdpa_gpu, + rhs_gpu, + coefav, + coefbv, + inc, + flag1, + flag2, + perf); + + if(perf){ + stop = clock(); + elapsed_cuda = (double) (stop - start); + } } - else { -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) { - for (int jsou = 0; jsou < 3; jsou++) - grad[cell_id][isou][jsou] = 0.; +#endif + + + if(compute_cpu){ + if(res_cpu){ + grad_cpu = grad; + grdpa_cpu = grdpa; + rhs_cpu = rhs; + } else { + BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_cpu, n_cells_ext, cs_real_33_t); + BFT_MALLOC(rhs_cpu, n_cells_ext, cs_real_3_t); + } + + if(perf){ + start = clock(); + } + + if ( (idiffp != 0 && ircflp == 1) || ivisep == 1 + || ( iconvp != 0 && iupwin == 0 + && (ischcp == 0 || ircflp == 1 || isstpp == 0))) { + + if (f_id != -1) { + /* Get the calculation option from the field */ + if (f->type & CS_FIELD_VARIABLE && var_cal_opt.iwgrec == 1) { + if (var_cal_opt.idiff > 0) { + int key_id 
= cs_field_key_id("gradient_weighting_id"); + int diff_id = cs_field_get_key_int(f, key_id); + if (diff_id > -1) { + cs_field_t *weight_f = cs_field_by_id(diff_id); + gweight = weight_f->val; + cs_field_synchronize(weight_f, halo_type); + } + } + } + } + + cs_gradient_vector_synced_input(var_name, + gradient_type, + halo_type, + inc, + nswrgp, + iwarnp, + imligp, + epsrgp, + climgp, + coefav, + coefbv, + _pvar, + gweight, /* weighted gradient */ + cpl, + grad_cpu); + } + else { + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++) + grad_cpu[cell_id][isou][jsou] = 0.; + } } } - } /* ====================================================================== ---> Compute uncentered gradient grdpa for the slope test ======================================================================*/ -# pragma omp parallel for - for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { - for (int jsou = 0; jsou < 3; jsou++) { - for (int isou = 0; isou < 3; isou++) - grdpa[cell_id][isou][jsou] = 0.; + # pragma omp parallel for + for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { + for (int jsou = 0; jsou < 3; jsou++) { + for (int isou = 0; isou < 3; isou++) + grdpa_cpu[cell_id][isou][jsou] = 0.; + } } - } - if (iconvp > 0 && iupwin == 0 && isstpp == 0) { + if (iconvp > 0 && iupwin == 0 && isstpp == 0) { - cs_slope_test_gradient_vector(inc, - halo_type, - (const cs_real_33_t *)grad, - grdpa, - _pvar, - coefav, - coefbv, - i_massflux); + cs_slope_test_gradient_vector(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + + } + + /* ====================================================================== + ---> Contribution from interior faces + ======================================================================*/ + + n_upwind = 0; + + if (n_cells_ext > n_cells) { + # pragma omp parallel for 
if(n_cells_ext -n_cells > CS_THR_MIN) + for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) + rhs_cpu[cell_id][isou] = 0.; + } + } + if(perf){ + stop = clock(); + elapsed = (double) (stop - start); + } } - /* ====================================================================== - ---> Contribution from interior faces - ======================================================================*/ + /* Accuracy grad_cpu and grad_gpu */ + if(accuracy){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + if(compute_cpu){ + cs_real_t cpu, cuda; + double err; + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + cpu = rhs_cpu[c_id][i]; + cuda = rhs_gpu[c_id][i]; + err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("convection_diffusion_a DIFFERENCE @%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\n", c_id, i, cpu, cuda, fabs(cpu - cuda), err); + } + for (int j =0; j < 3; ++j) { + cpu = grdpa_cpu[c_id][i][j]; + cuda = grdpa_gpu[c_id][i][j]; + err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); + if (err> 1e-12) { + printf("convection_diffusion_b DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); + } + } + } + } + } + } + #endif + } - n_upwind = 0; +// Free memory +#if defined(HAVE_CUDA) + if(compute_cuda){ + if(res_cpu){ + BFT_FREE(grad_gpu); + BFT_FREE(grdpa_gpu); + BFT_FREE(rhs_gpu); + } + } +#endif - if (n_cells_ext > n_cells) { -# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) - for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) - rhs[cell_id][isou] = 0.; +// Free memory + if(compute_cpu){ + if(!res_cpu){ + BFT_FREE(grad_cpu); + BFT_FREE(grdpa_cpu); + BFT_FREE(rhs_cpu); } } diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu new file mode 100644 index 
0000000000..4e14de751d --- /dev/null +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -0,0 +1,289 @@ +#include "cs_defs.h" + +/*---------------------------------------------------------------------------- + * Standard C library headers + *----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_MPI) +#include +#endif + +#include + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "bft_error.h" +#include "bft_mem.h" + +#include "cs_base_accel.h" +#include "cs_base_cuda.h" +#include "cs_blas.h" +#include "cs_cell_to_vertex.h" +#include "cs_ext_neighborhood.h" +#include "cs_field.h" +#include "cs_field_pointer.h" +#include "cs_halo.h" +#include "cs_halo_perio.h" +#include "cs_log.h" +#include "cs_math.h" +#include "cs_mesh.h" +#include "cs_mesh_adjacencies.h" +#include "cs_mesh_quantities.h" +#include "cs_parall.h" +#include "cs_porous_model.h" +#include "cs_prototypes.h" +#include "cs_timer.h" +#include "cs_timer_stats.h" + +#include "cs_convection_diffusion.h" +#include "cs_convection_diffusion_priv.h" + +#include "cs_convection_diffusion_cuda.cuh" + + +/*---------------------------------------------------------------------------- + * _gradient_vector the gradient of a vector using a given gradient of + * this vector (typically lsq). + * + * parameters: + * m <-- pointer to associated mesh structure + * fvq <-- pointer to associated finite volume quantities + * cpl <-- structure associated with internal coupling, or NULL + * inc <-- if 0, solve on increment; 1 otherwise + * coefav <-- B.C. coefficients for boundary face normals + * coefbv <-- B.C. 
coefficients for boundary face normals + * pvar <-- variable + * c_weight <-- weighted gradient coefficient variable + * r_grad --> gradient used for reconstruction + * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) + *----------------------------------------------------------------------------*/ +extern "C" void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + cs_real_3_t *restrict rhs, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf) +{ + const cs_lnum_t n_cells = mesh->n_cells; + const cs_lnum_t n_cells_ext = mesh->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = mesh->n_i_faces; + const cs_lnum_t n_b_faces = mesh->n_b_faces; + + int device_id; + cudaGetDevice(&device_id); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t start, mem_h2d, init, f_i, f_b, f_f, stop; + float msec = 0.0f; + CS_CUDA_CHECK(cudaEventCreate(&start)); + CS_CUDA_CHECK(cudaEventCreate(&mem_h2d)); + CS_CUDA_CHECK(cudaEventCreate(&init)); + CS_CUDA_CHECK(cudaEventCreate(&f_i)); + CS_CUDA_CHECK(cudaEventCreate(&f_b)); + CS_CUDA_CHECK(cudaEventCreate(&f_f)); + CS_CUDA_CHECK(cudaEventCreate(&stop)); + + + // Record the start event + CS_CUDA_CHECK(cudaEventRecord(start, stream)); + + unsigned int blocksize = 256; + + cs_real_33_t *grad_d = NULL; + + cs_real_33_t *grdpa_d; + CS_CUDA_CHECK(cudaMalloc(&grdpa_d, n_cells_ext * sizeof(cs_real_33_t))); + + cs_real_3_t *rhs_d; + CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_3_t))); + cs_cuda_copy_h2d(rhs_d, rhs, sizeof(cs_real_3_t)*n_cells_ext); + + cs_gnum_t n_upwind; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(mesh->i_face_cells); + + const cs_lnum_t *restrict b_face_cells + = 
(const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_face_cells); + + const cs_real_3_t *restrict i_face_cog + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->i_face_cog); + + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); + + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); + + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + + cs_real_3_t *restrict i_f_face_normal; + CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); + + cs_real_3_t *restrict coefa_d; + CS_CUDA_CHECK(cudaMalloc(&coefa_d, sizeof(cs_real_3_t)*n_b_faces)); + cs_cuda_copy_h2d(coefa_d, (void *)coefav, sizeof(cs_real_3_t)*n_b_faces); + + cs_real_33_t *restrict coefb_d; + CS_CUDA_CHECK(cudaMalloc(&coefb_d, sizeof(cs_real_33_t)*n_b_faces)); + cs_cuda_copy_h2d(coefb_d, (void *)coefbv, sizeof(cs_real_33_t)*n_b_faces); + + cs_real_t *restrict cell_vol; + CS_CUDA_CHECK(cudaMalloc(&cell_vol, n_cells * sizeof(cs_real_t))); + cs_cuda_copy_h2d(cell_vol, (void *)fvq->cell_vol, sizeof(cs_real_t)*n_cells); + + cs_real_3_t *restrict pvar_d; + CS_CUDA_CHECK(cudaMalloc(&pvar_d, sizeof(cs_real_3_t)*n_cells)); + cs_cuda_copy_h2d(pvar_d, (void *)pvar, sizeof(cs_real_3_t)*n_cells); + + /* Initialization */ + + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + + if(flag1){ + cs_cuda_copy_h2d(grad_d, grad, sizeof(cs_real_33_t)*n_cells_ext); + }else{ + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); + cudaMemset(grad_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + } + + cudaMemset(grdpa_d, 0, n_cells_ext * sizeof(cs_real_33_t)); + + CS_CUDA_CHECK(cudaEventRecord(init, stream)); + + if (flag2) { + printf(" On passe dans cs_slope_test"); + 
cs_slope_test_gradient_vector_cuda_i<<>> + (n_i_faces, + i_face_cells, + i_face_cog, + cell_cen, + pvar_d, + i_massflux, + i_f_face_normal, + grad_d, + grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); + + cs_slope_test_gradient_vector_cuda_b<<>> + (n_b_faces, + pvar_d, + b_face_cells, + diipb, + inc, + coefa_d, + coefb_d, + b_f_face_normal, + grad_d, + grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); + + // cs_slope_test_gradient_vector_cuda_f<<<(n_cells / blocksize) * 3 * 3, blocksize, 0, stream>>> + // (n_cells, + // cell_vol, + // grdpa_d); + + CS_CUDA_CHECK(cudaEventRecord(f_f, stream)); + + } + + n_upwind = 0; + + if(n_cells_ext > n_cells){ + cudaMemset(rhs_d[n_cells], 0, (n_cells_ext-n_cells) * sizeof(cs_real_3_t)); + } + + + /* Sync to host */ + if (grdpa_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + cs_cuda_copy_d2h(grdpa, grdpa_d, size); + } + else + cs_sync_d2h(grdpa_d); + + + /* Sync to host */ + if (rhs_d != NULL) { + size_t size = n_cells_ext * sizeof(cs_real_t) * 3; + cs_cuda_copy_d2h(rhs, rhs_d, size); + } + else + cs_sync_d2h(rhs_d); + + CS_CUDA_CHECK(cudaEventRecord(stop, stream)); + CS_CUDA_CHECK(cudaEventSynchronize(stop)); + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + + if(perf){ + printf("convection_diffusion Kernels times:\t"); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + if (flag2) { + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, f_i)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_i, f_b)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + msec = 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_b, f_f)); + printf("Kernels execution time in us: \t"); + printf("Init = %f\t", msec*1000.f); + + } + + msec 
= 0.0f; + CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); + printf("Total = %f\t", msec*1000.f); + + printf("\n"); + } + + if (!flag1){ + CS_CUDA_CHECK(cudaFree(grad_d)); + } + + CS_CUDA_CHECK(cudaFree(grdpa_d)); + CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(i_f_face_normal)); + CS_CUDA_CHECK(cudaFree(coefa_d)); + CS_CUDA_CHECK(cudaFree(coefb_d)); + CS_CUDA_CHECK(cudaFree(cell_vol)); + CS_CUDA_CHECK(cudaFree(pvar_d)); +} diff --git a/src/alge/cs_convection_diffusion_cuda.cuh b/src/alge/cs_convection_diffusion_cuda.cuh new file mode 100644 index 0000000000..07141849ad --- /dev/null +++ b/src/alge/cs_convection_diffusion_cuda.cuh @@ -0,0 +1,113 @@ +__global__ static void +cs_slope_test_gradient_vector_cuda_i( const cs_lnum_t n_i_faces, + const cs_lnum_2_t *restrict i_face_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_i_faces){ + return; + } + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac; + cs_lnum_t c_id1, c_id2; + + c_id1 = i_face_cells[f_id][0]; + c_id2 = i_face_cells[f_id][1]; + + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou]*difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou]*djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] > 0.) 
pfac = pif; + + /* U gradient */ + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; + atomicAdd(&grdpa[c_id1][isou][jsou], vfac[jsou]); + atomicAdd(&grdpa[c_id2][isou][jsou],- vfac[jsou]); + } + } +} + + +__global__ static void +cs_slope_test_gradient_vector_cuda_b(const cs_lnum_t n_b_faces, + const cs_real_3_t *pvar, + const cs_lnum_t *restrict b_face_cells, + const cs_real_3_t *restrict diipb, + const int inc, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_3_t *restrict b_f_face_normal, + const cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + + cs_real_t diipbv[3]; + cs_lnum_t ii = b_face_cells[f_id]; + + for (int jsou = 0; jsou < 3; jsou++) + diipbv[jsou] = diipb[f_id][jsou]; + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[f_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[f_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++) + atomicAdd(&grdpa[ii][isou][jsou], pfac*b_f_face_normal[f_id][jsou]); + } + +} + + + +__global__ static void +cs_slope_test_gradient_vector_cuda_f(const cs_lnum_t n_cells, + cs_real_t *cell_vol, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id >= n_cells){ + return; + } + size_t c_idx = c_id / (3*3); + size_t i = (c_id / 3) % 3; + size_t j = c_id % 3; + + cs_real_t unsvol = 1./cell_vol[c_idx]; + grdpa[c_idx][i][j] *= unsvol; +} \ No newline at end of file diff --git a/src/alge/cs_convection_diffusion_priv.h b/src/alge/cs_convection_diffusion_priv.h new file mode 100644 index 0000000000..24b5ff9b25 --- /dev/null +++ b/src/alge/cs_convection_diffusion_priv.h @@ -0,0 +1,93 @@ +#ifndef 
__CS_CONVECTION_DIFFUSION_CUDA_H__ +#define __CS_CONVECTION_DIFFUSION_CUDA_H__ + +/*============================================================================ + * Private functions for gradient reconstruction. + *============================================================================*/ + +/* + This file is part of code_saturne, a general-purpose CFD tool. + + Copyright (C) 1998-2023 EDF S.A. + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ + +#include "cs_base.h" +#include "cs_base_accel.h" +#include "cs_halo.h" +#include "cs_internal_coupling.h" +#include "cs_mesh.h" +#include "cs_mesh_quantities.h" + +/*----------------------------------------------------------------------------*/ + +BEGIN_C_DECLS + +/*! 
\cond DOXYGEN_SHOULD_SKIP_THIS */ + +/*============================================================================ + * Macro definitions + *============================================================================*/ + +/*============================================================================= + * Local type definitions + *============================================================================*/ + +/* Type for symmetric least-squares covariance matrices + as they are adimensional, single-precision should be usable here */ + + +/*============================================================================ + * Global variables + *============================================================================*/ + +/*============================================================================= + * Semi-private function prototypes + *============================================================================*/ + +#if defined(HAVE_CUDA) + +void +cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_quantities_t *fvq, + const cs_real_3_t *restrict pvar, + const cs_real_t i_massflux[], + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + cs_real_3_t *restrict rhs, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const int inc, + const bool flag1, + const bool flag2, + const bool perf); + +#endif + +/* defined(HAVE_CUDA) */ + +/*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ + +/*----------------------------------------------------------------------------*/ + +END_C_DECLS +#endif /* __CS_GRADIENT_CUDA_H__ */ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 347751257c..6963c87493 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5590,9 +5590,9 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = true; + compute_cuda = false; compute_cpu = true; - res_cpu = false; + res_cpu = true; // A ne pas garder dans la version finale perf = false; @@ -6944,7 +6944,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, res_cpu = true; // A ne pas garder dans la version finale - perf = true; + perf = false; accuracy = false; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 2409c29eaa..cb6d14837b 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1318,18 +1318,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, unsigned int blocksize = 256; - unsigned int gridsize_b - = (unsigned int)ceil((double)m->n_b_cells / blocksize); - unsigned int gridsize_if - = (unsigned int)ceil((double)m->n_i_faces / blocksize); - unsigned int gridsize_bf - = (unsigned int)ceil((double)m->n_b_faces / blocksize); - unsigned int gridsize - = (unsigned int)ceil((double)m->n_cells / blocksize); - unsigned int gridsize_init - = (unsigned int)ceil((double)m->n_cells*3*3 / blocksize); - unsigned int gridsize_ext - = (unsigned int)ceil((double)n_cells_ext / blocksize); const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(m->i_face_cells); @@ -1431,7 +1419,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Interior faces contribution */ /*************************************Kernels Scatter**************************************************/ - // _compute_reconstruct_v_i_face<<>> + 
// _compute_reconstruct_v_i_face<<>> // (n_i_faces, // i_face_cells, // pvar_d, @@ -1442,7 +1430,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // _compute_reconstruct_v_i_face_v2<<>> + // _compute_reconstruct_v_i_face_v2<<>> // (n_i_faces * 3, // i_face_cells, // pvar_d, @@ -1454,7 +1442,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // i_f_face_normal); /*************************************Kernels Scatter conflict free**************************************/ - // _compute_reconstruct_v_i_face_cf<<>> + // _compute_reconstruct_v_i_face_cf<<>> // (n_i_faces, // i_face_cells, // pvar_d, @@ -1465,7 +1453,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // _compute_reconstruct_v_i_face_v2_cf<<>> + // _compute_reconstruct_v_i_face_v2_cf<<>> // (n_i_faces * 3, // i_face_cells, // pvar_d, @@ -1477,7 +1465,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // i_f_face_normal); /*************************************Kernels Gather**************************************************/ - // _compute_reconstruct_v_i_face_gather<<>> + // _compute_reconstruct_v_i_face_gather<<>> // ( n_cells, // pvar_d, // weight, @@ -1492,7 +1480,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces_sgn); - // _compute_reconstruct_v_i_face_gather_v2<<>> + // _compute_reconstruct_v_i_face_gather_v2<<>> // ( n_cells * 3 * 3, // pvar_d, // weight, @@ -1509,7 +1497,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather registers memory************************************/ - // _compute_reconstruct_v_i_face_gather_v3<<>> + // _compute_reconstruct_v_i_face_gather_v3<<>> // ( n_cells, // pvar_d, // weight, @@ -1524,7 +1512,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces_sgn); - // _compute_reconstruct_v_i_face_gather_v4<<>> + // _compute_reconstruct_v_i_face_gather_v4<<>> // ( 
n_cells * 3 * 3, // pvar_d, // weight, @@ -1542,7 +1530,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - // _compute_reconstruct_v_i_face_gather_v5<<>> + // _compute_reconstruct_v_i_face_gather_v5<<>> // ( n_cells, // pvar_d, // weight, @@ -1588,7 +1576,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Scatter**************************************************/ - // _compute_reconstruct_v_b_face<<>> + // _compute_reconstruct_v_b_face<<>> // ( n_b_faces, // coefb_d, // coefa_d, @@ -1601,7 +1589,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); - // _compute_reconstruct_v_b_face_v2<<>> + // _compute_reconstruct_v_b_face_v2<<>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1614,7 +1602,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); /*************************************Kernels Scatter conflict free************************************/ - // _compute_reconstruct_v_b_face_cf<<>> + // _compute_reconstruct_v_b_face_cf<<>> // ( n_b_faces, // coefb_d, // coefa_d, @@ -1626,7 +1614,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); - // _compute_reconstruct_v_b_face_v2_cf<<>> + // _compute_reconstruct_v_b_face_v2_cf<<>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1639,7 +1627,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); /*************************************Kernels Gather**************************************************/ - // _compute_reconstruct_v_b_face_gather<<>> + // _compute_reconstruct_v_b_face_gather<<>> // ( m->n_b_cells, // coefb_d, // coefa_d, @@ -1654,7 +1642,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); - // _compute_reconstruct_v_b_face_gather_v2<<>> + // _compute_reconstruct_v_b_face_gather_v2<<>> // ( 
m->n_b_cells * 3, // coefb_d, // coefa_d, @@ -1669,7 +1657,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); /*************************************Kernels Gather registers memory***************************************/ - // _compute_reconstruct_v_b_face_gather_v3<<>> + // _compute_reconstruct_v_b_face_gather_v3<<>> // ( m->n_b_cells, // coefb_d, // coefa_d, @@ -1684,7 +1672,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); - // _compute_reconstruct_v_b_face_gather_v4<<>> + // _compute_reconstruct_v_b_face_gather_v4<<>> // ( m->n_b_cells * 3, // coefb_d, // coefa_d, @@ -1701,7 +1689,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - // _compute_reconstruct_v_b_face_gather_v5<<>> + // _compute_reconstruct_v_b_face_gather_v5<<>> // ( m->n_b_cells, // coefb_d, // coefa_d, @@ -1718,7 +1706,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); - // _compute_reconstruct_correction<<>> + // _compute_reconstruct_correction<<>> // ( n_cells, // has_dc, // c_disable_flag, @@ -1728,7 +1716,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // test_bool // ); - // _compute_reconstruct_correction_v2<<>> + // _compute_reconstruct_correction_v2<<>> // ( n_cells * 3, // has_dc, // c_disable_flag, @@ -1826,25 +1814,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, } - - - -__global__ static void -_set_one_to_coeff_b(const cs_lnum_t n_b_faces, - cs_real_33_t *_bc_coeff_b) -{ - cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if(c_idx >= n_b_faces){ - return; - } - - cs_lnum_t f_id = c_idx / 3; - size_t i = c_idx % 3; - - _bc_coeff_b[f_id][i][i] = 1; -} - /*---------------------------------------------------------------------------- * _gradient_vector the gradient of a vector using a given gradient of * this 
vector (typically lsq). @@ -1888,9 +1857,6 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(start, stream)); unsigned int blocksize = 256; - unsigned int gridsize_f - = (unsigned int)ceil((double)n_b_faces / blocksize); - cs_real_3_t *_bc_coeff_a_d; CS_CUDA_CHECK(cudaMalloc(&_bc_coeff_a_d, n_b_faces * sizeof(cs_real_3_t))); @@ -1905,7 +1871,7 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(init1, stream)); cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); - _set_one_to_coeff_b<<>> + _set_one_to_coeff_b<<< n_b_faces/blocksize * 3, blocksize, 0, stream>>> (n_b_faces, _bc_coeff_b_d); CS_CUDA_CHECK(cudaEventRecord(init2, stream)); @@ -1952,4 +1918,4 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, } CS_CUDA_CHECK(cudaFree(_bc_coeff_a_d)); CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); -} \ No newline at end of file +} diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_gradient_cuda.cuh index 5b1401bfe6..2a41e4a26d 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_gradient_cuda.cuh @@ -80,6 +80,21 @@ #include "cs_gradient_priv.h" +__global__ static void +_set_one_to_coeff_b(const cs_lnum_t n_b_faces, + cs_real_33_t *_bc_coeff_b) +{ + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_idx >= n_b_faces){ + return; + } + + cs_lnum_t f_id = c_idx / 3; + size_t i = c_idx % 3; + + _bc_coeff_b[f_id][i][i] = 1; +} __device__ void cs_math_3_normalise_cuda(const cs_real_t in[3], cs_real_t out[3]) diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 516c3e6080..0e760f6f22 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -141,13 +141,14 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_lnum_t cpl_stride, bool test_bool, bool perf); -#endif void _gradient_vector_cuda(const cs_mesh_t *mesh, cs_real_3_t *_bc_coeff_a, cs_real_33_t *_bc_coeff_b, bool perf); + +#endif /* defined(HAVE_CUDA) */ From 
03a34068c08866e1def91c5a65555d6ae45f3061 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Thu, 14 Dec 2023 17:13:22 +0100 Subject: [PATCH 48/70] ADD slope_test scatter and gather but gather doesn't work --- src/alge/cs_convection_diffusion.c | 87 ++++--- src/alge/cs_convection_diffusion_cuda.cu | 243 ++++++++++++------ .../cs_convection_diffusion_cuda_gather.cuh | 113 ++++++++ ... cs_convection_diffusion_cuda_scatter.cuh} | 32 +-- src/alge/cs_convection_diffusion_priv.h | 2 +- src/alge/cs_gradient.cxx | 4 +- src/alge/cs_gradient_cuda.cu | 79 +++--- 7 files changed, 381 insertions(+), 179 deletions(-) create mode 100644 src/alge/cs_convection_diffusion_cuda_gather.cuh rename src/alge/{cs_convection_diffusion_cuda.cuh => cs_convection_diffusion_cuda_scatter.cuh} (81%) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 6a3534396e..5c32fd8532 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4375,9 +4375,8 @@ cs_convection_diffusion_vector(int idtvar, /* Timing the computation */ clock_t start, stop; - double elapsed, elapsed_cuda; + unsigned long elapsed, elapsed_cuda; - cs_real_3_t *rhs_cpu, *rhs_gpu; cs_real_33_t *grad_cpu, *grad_gpu; cs_real_33_t *grdpa_cpu, *grdpa_gpu; @@ -4412,25 +4411,22 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = false; + compute_cuda = true; compute_cpu = true; - res_cpu = true; + res_cpu = false; // A ne pas garder dans la version finale perf = true; accuracy = true; -printf("je passe dans convection_diffusion\n"); #if defined(HAVE_CUDA) if(compute_cuda){ if(!res_cpu){ grad_gpu = grad; grdpa_gpu = grdpa; - rhs_gpu = rhs; } else { BFT_MALLOC(grad_gpu, n_cells_ext, cs_real_33_t); BFT_MALLOC(grdpa_gpu, n_cells_ext, cs_real_33_t); - BFT_MALLOC(rhs_gpu, n_cells_ext, cs_real_3_t); } if(perf){ start = clock(); @@ -4471,17 +4467,18 @@ printf("je passe dans convection_diffusion\n"); _pvar, gweight, 
/* weighted gradient */ cpl, - grad_cpu); + grad_gpu); } bool flag2 = (iconvp > 0 && iupwin == 0 && isstpp == 0); + cs_convection_diffusion_vector_cuda(m, + cs_glob_mesh_adjacencies, fvq, _pvar, i_massflux, grad_gpu, grdpa_gpu, - rhs_gpu, coefav, coefbv, inc, @@ -4489,24 +4486,29 @@ printf("je passe dans convection_diffusion\n"); flag2, perf); + /* Handle parallelism and periodicity */ + if (flag2){ + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa_gpu, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa_gpu); + } + } if(perf){ stop = clock(); - elapsed_cuda = (double) (stop - start); + elapsed_cuda = (stop - start) * 1e6 / CLOCKS_PER_SEC; } } #endif - if(compute_cpu){ if(res_cpu){ grad_cpu = grad; grdpa_cpu = grdpa; - rhs_cpu = rhs; } else { BFT_MALLOC(grad_cpu, n_cells_ext, cs_real_33_t); BFT_MALLOC(grdpa_cpu, n_cells_ext, cs_real_33_t); - BFT_MALLOC(rhs_cpu, n_cells_ext, cs_real_3_t); } if(perf){ @@ -4558,9 +4560,9 @@ printf("je passe dans convection_diffusion\n"); } } - /* ====================================================================== - ---> Compute uncentered gradient grdpa for the slope test - ======================================================================*/ +/* ====================================================================== + ---> Compute uncentered gradient grdpa for the slope test + ======================================================================*/ # pragma omp parallel for for (cs_lnum_t cell_id = 0; cell_id < n_cells_ext; cell_id++) { @@ -4583,23 +4585,22 @@ printf("je passe dans convection_diffusion\n"); } - /* ====================================================================== - ---> Contribution from interior faces - ======================================================================*/ - - n_upwind = 0; + if(perf){ + stop = clock(); + elapsed = (stop - start) * 1e6 / CLOCKS_PER_SEC; + } + } - if (n_cells_ext > n_cells) { - # pragma omp 
parallel for if(n_cells_ext -n_cells > CS_THR_MIN) - for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { - for (int isou = 0; isou < 3; isou++) - rhs_cpu[cell_id][isou] = 0.; + /* Performances */ + if(perf){ + #if defined(HAVE_CUDA) + if(compute_cuda){ + printf("convection Compute and tranferts time in us: CUDA = %ld\n", elapsed_cuda); } - } + #endif - if(perf){ - stop = clock(); - elapsed = (double) (stop - start); + if(compute_cpu){ + printf("convection Compute and tranferts time in us: CPU = %ld\n", elapsed); } } @@ -4612,18 +4613,12 @@ printf("je passe dans convection_diffusion\n"); double err; for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++) { - cpu = rhs_cpu[c_id][i]; - cuda = rhs_gpu[c_id][i]; - err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); - if (err> 1e-12) { - printf("convection_diffusion_a DIFFERENCE @%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\n", c_id, i, cpu, cuda, fabs(cpu - cuda), err); - } for (int j =0; j < 3; ++j) { cpu = grdpa_cpu[c_id][i][j]; cuda = grdpa_gpu[c_id][i][j]; err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); - if (err> 1e-12) { - printf("convection_diffusion_b DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); + if (err> 1e-6) { + printf("convection_diffusion_b DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); } } } @@ -4639,7 +4634,6 @@ printf("je passe dans convection_diffusion\n"); if(res_cpu){ BFT_FREE(grad_gpu); BFT_FREE(grdpa_gpu); - BFT_FREE(rhs_gpu); } } #endif @@ -4649,10 +4643,23 @@ printf("je passe dans convection_diffusion\n"); if(!res_cpu){ BFT_FREE(grad_cpu); BFT_FREE(grdpa_cpu); - BFT_FREE(rhs_cpu); } } + + /* ====================================================================== + ---> Contribution from interior faces + 
======================================================================*/ + + n_upwind = 0; + + if (n_cells_ext > n_cells) { +# pragma omp parallel for if(n_cells_ext -n_cells > CS_THR_MIN) + for (cs_lnum_t cell_id = n_cells; cell_id < n_cells_ext; cell_id++) { + for (int isou = 0; isou < 3; isou++) + rhs[cell_id][isou] = 0.; + } + } /* --> Pure upwind flux =====================*/ diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu index 4e14de751d..39a1bbf1ee 100644 --- a/src/alge/cs_convection_diffusion_cuda.cu +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -49,9 +49,44 @@ #include "cs_convection_diffusion.h" #include "cs_convection_diffusion_priv.h" -#include "cs_convection_diffusion_cuda.cuh" +#include "cs_convection_diffusion_cuda_scatter.cuh" +#include "cs_convection_diffusion_cuda_gather.cuh" + +template +static void +_sync_or_copy_real_h2d_c(const T *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const T **val_d, + void **buf_d) +{ + const T *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(T); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const T *)_buf_d; + } + else { + _val_d = (const T *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + /*---------------------------------------------------------------------------- * _gradient_vector the gradient of a vector using a given gradient of * this vector (typically lsq). 
@@ -70,12 +105,12 @@ *----------------------------------------------------------------------------*/ extern "C" void cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, const cs_real_3_t *restrict pvar, const cs_real_t i_massflux[], const cs_real_33_t *grad, cs_real_33_t *grdpa, - cs_real_3_t *restrict rhs, const cs_real_3_t *restrict coefav, const cs_real_33_t *restrict coefbv, const int inc, @@ -84,6 +119,7 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, const bool perf) { const cs_lnum_t n_cells = mesh->n_cells; + const cs_lnum_t n_b_cells = mesh->n_b_cells; const cs_lnum_t n_cells_ext = mesh->n_cells_with_ghosts; const cs_lnum_t n_i_faces = mesh->n_i_faces; const cs_lnum_t n_b_faces = mesh->n_b_faces; @@ -111,25 +147,18 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, unsigned int blocksize = 256; cs_real_33_t *grad_d = NULL; + CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); cs_real_33_t *grdpa_d; CS_CUDA_CHECK(cudaMalloc(&grdpa_d, n_cells_ext * sizeof(cs_real_33_t))); - cs_real_3_t *rhs_d; - CS_CUDA_CHECK(cudaMalloc(&rhs_d, n_cells_ext * sizeof(cs_real_3_t))); - cs_cuda_copy_h2d(rhs_d, rhs, sizeof(cs_real_3_t)*n_cells_ext); - cs_gnum_t n_upwind; - const cs_lnum_2_t *restrict i_face_cells = (const cs_lnum_2_t *restrict)cs_get_device_ptr_const_pf(mesh->i_face_cells); const cs_lnum_t *restrict b_face_cells = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_face_cells); - const cs_real_3_t *restrict i_face_cog - = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->i_face_cog); - const cs_real_3_t *restrict cell_cen = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->cell_cen); @@ -139,85 +168,145 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, const cs_real_3_t *restrict b_f_face_normal = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->b_f_face_normal); + const cs_lnum_t 
*restrict cell_cells_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells_idx); + + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_cells); + + const cs_lnum_t *restrict b_cells + = (cs_lnum_t *restrict)cs_get_device_ptr_const_pf(mesh->b_cells); + + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces); + + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)cs_get_device_ptr_const_pf(madj->cell_b_faces_idx); + + cs_real_t *restrict i_massflux_d; + CS_CUDA_CHECK(cudaMalloc(&i_massflux_d, sizeof(cs_real_t)*n_i_faces)); + cs_cuda_copy_h2d(i_massflux_d, (void *)i_massflux, sizeof(cs_real_t)*n_i_faces); + + cs_real_3_t *restrict i_face_cog; + CS_CUDA_CHECK(cudaMalloc(&i_face_cog, sizeof(cs_real_3_t)*n_i_faces)); + cs_cuda_copy_h2d(i_face_cog, (void *)fvq->i_face_cog, sizeof(cs_real_3_t)*n_i_faces); + cs_real_3_t *restrict i_f_face_normal; CS_CUDA_CHECK(cudaMalloc(&i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces)); cs_cuda_copy_h2d(i_f_face_normal, (void *)fvq->i_f_face_normal, sizeof(cs_real_3_t)*n_i_faces); - cs_real_3_t *restrict coefa_d; - CS_CUDA_CHECK(cudaMalloc(&coefa_d, sizeof(cs_real_3_t)*n_b_faces)); - cs_cuda_copy_h2d(coefa_d, (void *)coefav, sizeof(cs_real_3_t)*n_b_faces); - - cs_real_33_t *restrict coefb_d; - CS_CUDA_CHECK(cudaMalloc(&coefb_d, sizeof(cs_real_33_t)*n_b_faces)); - cs_cuda_copy_h2d(coefb_d, (void *)coefbv, sizeof(cs_real_33_t)*n_b_faces); - cs_real_t *restrict cell_vol; - CS_CUDA_CHECK(cudaMalloc(&cell_vol, n_cells * sizeof(cs_real_t))); + CS_CUDA_CHECK(cudaMalloc(&cell_vol, sizeof(cs_real_t)*n_cells)); cs_cuda_copy_h2d(cell_vol, (void *)fvq->cell_vol, sizeof(cs_real_t)*n_cells); - cs_real_3_t *restrict pvar_d; - CS_CUDA_CHECK(cudaMalloc(&pvar_d, sizeof(cs_real_3_t)*n_cells)); - cs_cuda_copy_h2d(pvar_d, (void *)pvar, sizeof(cs_real_3_t)*n_cells); + 
cs_mesh_adjacencies_update_cell_i_faces(); + const cs_lnum_t n_cells_i_face = (madj->cell_cells_idx[n_cells]); + + cs_lnum_t *restrict cell_i_faces; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces, madj->cell_i_faces, sizeof(cs_lnum_t)*n_cells_i_face); + + short int *restrict cell_i_faces_sgn; + CS_CUDA_CHECK(cudaMalloc(&cell_i_faces_sgn, sizeof(short int)*n_cells_i_face)); + cs_cuda_copy_h2d(cell_i_faces_sgn, madj->cell_i_faces_sgn, sizeof(short int)*n_cells_i_face); + + + void *_coefb_d, *_coefa_d, *_pvar_d; + + const cs_real_3_t * coefa_d = NULL; + const cs_real_3_t * pvar_d = NULL; + const cs_real_33_t * coefb_d = NULL; /* Initialization */ - CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + _sync_or_copy_real_h2d_c(pvar, n_cells_ext, device_id, stream, + &pvar_d, &_pvar_d); + _sync_or_copy_real_h2d_c(coefav, n_b_faces, device_id, stream, + &coefa_d, &_coefa_d); + _sync_or_copy_real_h2d_c(coefbv, n_b_faces, device_id, stream, + &coefb_d, &_coefb_d); if(flag1){ cs_cuda_copy_h2d(grad_d, grad, sizeof(cs_real_33_t)*n_cells_ext); - }else{ - CS_CUDA_CHECK(cudaMalloc(&grad_d, n_cells_ext * sizeof(cs_real_33_t))); + } + else{ cudaMemset(grad_d, 0, n_cells_ext * sizeof(cs_real_33_t)); } + CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); + cudaMemset(grdpa_d, 0, n_cells_ext * sizeof(cs_real_33_t)); CS_CUDA_CHECK(cudaEventRecord(init, stream)); if (flag2) { - printf(" On passe dans cs_slope_test"); - cs_slope_test_gradient_vector_cuda_i<<>> - (n_i_faces, - i_face_cells, - i_face_cog, - cell_cen, - pvar_d, - i_massflux, - i_f_face_normal, - grad_d, - grdpa_d); + // cs_slope_test_gradient_vector_cuda_i<<<(unsigned int)ceil((double)n_i_faces / blocksize), blocksize, 0, stream>>> + // (n_i_faces, + // i_face_cells, + // i_face_cog, + // cell_cen, + // pvar_d, + // i_massflux_d, + // i_f_face_normal, + // grad_d, + // grdpa_d); + + + cs_slope_test_gradient_vector_cuda_i_gather<<<(unsigned int)ceil((double)n_cells / 
blocksize), blocksize, 0, stream>>> + (n_cells, + i_face_cog, + cell_cen, + pvar_d, + i_massflux_d, + i_f_face_normal, + cell_cells_idx, + cell_cells, + cell_i_faces, + cell_i_faces_sgn, + grad_d, + grdpa_d); CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); - cs_slope_test_gradient_vector_cuda_b<<>> - (n_b_faces, - pvar_d, - b_face_cells, - diipb, - inc, - coefa_d, - coefb_d, - b_f_face_normal, - grad_d, - grdpa_d); + // cs_slope_test_gradient_vector_cuda_b<<<(unsigned int)ceil((double)n_b_faces / blocksize), blocksize, 0, stream>>> + // (n_b_faces, + // pvar_d, + // b_face_cells, + // diipb, + // inc, + // coefa_d, + // coefb_d, + // b_f_face_normal, + // grad_d, + // grdpa_d); + + + cs_slope_test_gradient_vector_cuda_b_gather<<<(unsigned int)ceil((double)n_b_cells / blocksize), blocksize, 0, stream>>> + (n_b_cells, + pvar_d, + diipb, + inc, + coefa_d, + coefb_d, + b_f_face_normal, + b_cells, + cell_b_faces, + cell_b_faces_idx, + grad_d, + grdpa_d); CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); - // cs_slope_test_gradient_vector_cuda_f<<<(n_cells / blocksize) * 3 * 3, blocksize, 0, stream>>> - // (n_cells, - // cell_vol, - // grdpa_d); + cs_slope_test_gradient_vector_cuda_f<<<(unsigned int)ceil((double)n_cells * 3 * 3 / blocksize), blocksize, 0, stream>>> + (n_cells * 3 * 3, + cell_vol, + grdpa_d); CS_CUDA_CHECK(cudaEventRecord(f_f, stream)); } n_upwind = 0; - - if(n_cells_ext > n_cells){ - cudaMemset(rhs_d[n_cells], 0, (n_cells_ext-n_cells) * sizeof(cs_real_3_t)); - } - /* Sync to host */ if (grdpa_d != NULL) { @@ -225,16 +314,7 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, cs_cuda_copy_d2h(grdpa, grdpa_d, size); } else - cs_sync_d2h(grdpa_d); - - - /* Sync to host */ - if (rhs_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3; - cs_cuda_copy_d2h(rhs, rhs_d, size); - } - else - cs_sync_d2h(rhs_d); + cs_sync_d2h(grdpa); CS_CUDA_CHECK(cudaEventRecord(stop, stream)); CS_CUDA_CHECK(cudaEventSynchronize(stop)); @@ -243,47 +323,48 @@ 
cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, cudaStreamDestroy(stream); if(perf){ - printf("convection_diffusion Kernels times:\t"); + printf("convection_diffusion Kernels:\n"); + printf("Execution time in us: \t"); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, mem_h2d, init)); - printf("Kernels execution time in us: \t"); printf("Init = %f\t", msec*1000.f); if (flag2) { msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, init, f_i)); - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); + printf("f_i = %f\t", msec*1000.f); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_i, f_b)); - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); + printf("f_b = %f\t", msec*1000.f); msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, f_b, f_f)); - printf("Kernels execution time in us: \t"); - printf("Init = %f\t", msec*1000.f); + printf("f_f = %f\t", msec*1000.f); } msec = 0.0f; CS_CUDA_CHECK(cudaEventElapsedTime(&msec, start, stop)); - printf("Total = %f\t", msec*1000.f); - - printf("\n"); + printf("Total = %f\n", msec*1000.f); } if (!flag1){ CS_CUDA_CHECK(cudaFree(grad_d)); } + if (_pvar_d != NULL) + CS_CUDA_CHECK(cudaFree(_pvar_d)); + if (_coefa_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefa_d)); + if (_coefb_d != NULL) + CS_CUDA_CHECK(cudaFree(_coefb_d)); + CS_CUDA_CHECK(cudaFree(grdpa_d)); - CS_CUDA_CHECK(cudaFree(rhs_d)); + CS_CUDA_CHECK(cudaFree(i_massflux_d)); CS_CUDA_CHECK(cudaFree(i_f_face_normal)); - CS_CUDA_CHECK(cudaFree(coefa_d)); - CS_CUDA_CHECK(cudaFree(coefb_d)); CS_CUDA_CHECK(cudaFree(cell_vol)); - CS_CUDA_CHECK(cudaFree(pvar_d)); + CS_CUDA_CHECK(cudaFree(cell_i_faces)); + CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); } diff --git a/src/alge/cs_convection_diffusion_cuda_gather.cuh b/src/alge/cs_convection_diffusion_cuda_gather.cuh new file mode 100644 index 0000000000..8b13f1dcf0 --- /dev/null +++ b/src/alge/cs_convection_diffusion_cuda_gather.cuh @@ -0,0 +1,113 @@ 
+__global__ static void +cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, + const cs_real_3_t *restrict i_face_cog, + const cs_real_3_t *restrict cell_cen, + const cs_real_3_t *pvar, + const cs_real_t *restrict i_massflux, + const cs_real_3_t *restrict i_f_face_normal, + const cs_lnum_t *restrict cell_cells_idx, + const cs_lnum_t *restrict cell_cells, + const cs_lnum_t *restrict cell_i_faces, + const short int *restrict cell_i_faces_sgn, + cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + if(c_id1 >= n_cells){ + return; + } + + cs_real_t difv[3], djfv[3], vfac[3]; + cs_real_t pif, pjf, pfac; + cs_lnum_t c_id2, f_id; + + cs_lnum_t s_id = cell_cells_idx[c_id1]; + cs_lnum_t e_id = cell_cells_idx[c_id1 + 1]; + + + for(cs_lnum_t index = s_id; index < e_id; index++){ + c_id2 = cell_cells[index]; + f_id = cell_i_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; + djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + pif = pvar[c_id1][isou]; + pjf = pvar[c_id2][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[c_id1][isou][jsou]*difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou]*djfv[jsou]; + } + + pfac = pjf; + if (i_massflux[f_id] > 0.) 
+ pfac = pif; + + /* U gradient */ + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; + grdpa[c_id1][isou][jsou] += cell_i_faces_sgn[index] * vfac[jsou]; + } + } + } +} + + +__global__ static void +cs_slope_test_gradient_vector_cuda_b_gather(const cs_lnum_t n_b_cells, + const cs_real_3_t *pvar, + const cs_real_3_t *restrict diipb, + const int inc, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_cells, + const cs_lnum_t *restrict cell_b_faces, + const cs_lnum_t *restrict cell_b_faces_idx, + const cs_real_33_t *grad, + cs_real_33_t *grdpa) +{ + cs_lnum_t c_id1 = blockIdx.x * blockDim.x + threadIdx.x; + + + if(c_id1 >= n_b_cells){ + return; + } + + cs_lnum_t c_id = b_cells[c_id1]; + + cs_real_t pfac, rfac, vecfac; + cs_real_t diipbv[3]; + cs_lnum_t f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id + 1]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + f_id = cell_b_faces[index]; + + /* x-y-z components, p = u, v, w */ + + for (int jsou = 0; jsou < 3; jsou++) + diipbv[jsou] = diipb[f_id][jsou]; + + for (int isou = 0; isou < 3; isou++) { + pfac = inc*coefa[f_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[f_id][jsou][isou]*( pvar[c_id][jsou] + + grad[c_id][jsou][0]*diipbv[0] + + grad[c_id][jsou][1]*diipbv[1] + + grad[c_id][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++) + grdpa[c_id][isou][jsou] += pfac*b_f_face_normal[f_id][jsou]; + } + } +} diff --git a/src/alge/cs_convection_diffusion_cuda.cuh b/src/alge/cs_convection_diffusion_cuda_scatter.cuh similarity index 81% rename from src/alge/cs_convection_diffusion_cuda.cuh rename to src/alge/cs_convection_diffusion_cuda_scatter.cuh index 07141849ad..80daba1938 100644 --- a/src/alge/cs_convection_diffusion_cuda.cuh +++ b/src/alge/cs_convection_diffusion_cuda_scatter.cuh @@ -21,7 +21,6 @@ 
cs_slope_test_gradient_vector_cuda_i( const cs_lnum_t n_i_faces, c_id1 = i_face_cells[f_id][0]; c_id2 = i_face_cells[f_id][1]; - for (int jsou = 0; jsou < 3; jsou++) { difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; @@ -70,7 +69,8 @@ cs_slope_test_gradient_vector_cuda_b(const cs_lnum_t n_b_faces, } cs_real_t diipbv[3]; - cs_lnum_t ii = b_face_cells[f_id]; + cs_lnum_t c_id1 = b_face_cells[f_id]; + cs_real_t pfac; for (int jsou = 0; jsou < 3; jsou++) diipbv[jsou] = diipb[f_id][jsou]; @@ -78,16 +78,16 @@ cs_slope_test_gradient_vector_cuda_b(const cs_lnum_t n_b_faces, /* x-y-z components, p = u, v, w */ for (int isou = 0; isou < 3; isou++) { - cs_real_t pfac = inc*coefa[f_id][isou]; + pfac = inc*coefa[f_id][isou]; /*coefu is a matrix */ for (int jsou = 0; jsou < 3; jsou++) - pfac += coefb[f_id][jsou][isou]*( pvar[ii][jsou] - + grad[ii][jsou][0]*diipbv[0] - + grad[ii][jsou][1]*diipbv[1] - + grad[ii][jsou][2]*diipbv[2]); + pfac += coefb[f_id][jsou][isou]*( pvar[c_id1][jsou] + + grad[c_id1][jsou][0]*diipbv[0] + + grad[c_id1][jsou][1]*diipbv[1] + + grad[c_id1][jsou][2]*diipbv[2]); for (int jsou = 0; jsou < 3; jsou++) - atomicAdd(&grdpa[ii][isou][jsou], pfac*b_f_face_normal[f_id][jsou]); + atomicAdd(&grdpa[c_id1][isou][jsou], pfac*b_f_face_normal[f_id][jsou]); } } @@ -99,15 +99,15 @@ cs_slope_test_gradient_vector_cuda_f(const cs_lnum_t n_cells, cs_real_t *cell_vol, cs_real_33_t *grdpa) { - cs_lnum_t c_id = blockIdx.x * blockDim.x + threadIdx.x; + cs_lnum_t c_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(c_id >= n_cells){ + if(c_idx >= n_cells){ return; } - size_t c_idx = c_id / (3*3); - size_t i = (c_id / 3) % 3; - size_t j = c_id % 3; + size_t c_id = c_idx / (3*3); + size_t i = (c_idx / 3) % 3; + size_t j = c_idx % 3; - cs_real_t unsvol = 1./cell_vol[c_idx]; - grdpa[c_idx][i][j] *= unsvol; -} \ No newline at end of file + cs_real_t unsvol = 1./cell_vol[c_id]; + grdpa[c_id][i][j] *= unsvol; +} 
diff --git a/src/alge/cs_convection_diffusion_priv.h b/src/alge/cs_convection_diffusion_priv.h index 24b5ff9b25..0ac68df0c9 100644 --- a/src/alge/cs_convection_diffusion_priv.h +++ b/src/alge/cs_convection_diffusion_priv.h @@ -68,12 +68,12 @@ BEGIN_C_DECLS void cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, + const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, const cs_real_3_t *restrict pvar, const cs_real_t i_massflux[], const cs_real_33_t *grad, cs_real_33_t *grdpa, - cs_real_3_t *restrict rhs, const cs_real_3_t *restrict coefav, const cs_real_33_t *restrict coefbv, const int inc, diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 6963c87493..2536624296 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5590,9 +5590,9 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = false; + compute_cuda = true; compute_cpu = true; - res_cpu = true; + res_cpu = false; // A ne pas garder dans la version finale perf = false; diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index cb6d14837b..c651ef5b1e 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1278,6 +1278,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, ) { const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_i_faces = m->n_i_faces; @@ -1530,19 +1531,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - // _compute_reconstruct_v_i_face_gather_v5<<>> - // ( n_cells, - // pvar_d, - // weight, - // c_weight, - // r_grad_d, - // grad_d, - // dofij, - // i_f_face_normal, - // cell_cells_idx, - // cell_cells, - // cell_i_faces, - // cell_i_faces_sgn); + 
_compute_reconstruct_v_i_face_gather_v5<<>> + ( n_cells, + pvar_d, + weight, + c_weight, + r_grad_d, + grad_d, + dofij, + i_f_face_normal, + cell_cells_idx, + cell_cells, + cell_i_faces, + cell_i_faces_sgn); CS_CUDA_CHECK(cudaEventRecord(i_faces, stream)); @@ -1628,7 +1629,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather**************************************************/ // _compute_reconstruct_v_b_face_gather<<>> - // ( m->n_b_cells, + // ( n_b_cells, // coefb_d, // coefa_d, // pvar_d, @@ -1643,7 +1644,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_gather_v2<<>> - // ( m->n_b_cells * 3, + // ( n_b_cells * 3, // coefb_d, // coefa_d, // pvar_d, @@ -1658,7 +1659,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather registers memory***************************************/ // _compute_reconstruct_v_b_face_gather_v3<<>> - // ( m->n_b_cells, + // ( n_b_cells, // coefb_d, // coefa_d, // pvar_d, @@ -1673,7 +1674,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // _compute_reconstruct_v_b_face_gather_v4<<>> - // ( m->n_b_cells * 3, + // ( n_b_cells * 3, // coefb_d, // coefa_d, // pvar_d, @@ -1689,19 +1690,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - // _compute_reconstruct_v_b_face_gather_v5<<>> - // ( m->n_b_cells, - // coefb_d, - // coefa_d, - // pvar_d, - // inc, - // diipb, - // r_grad_d, - // grad_d, - // b_f_face_normal, - // b_cells, - // cell_b_faces, - // cell_b_faces_idx); + _compute_reconstruct_v_b_face_gather_v5<<>> + ( n_b_cells, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_cells, + cell_b_faces, + cell_b_faces_idx); CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); @@ -1716,15 +1717,15 @@ 
cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // test_bool // ); - // _compute_reconstruct_correction_v2<<>> - // ( n_cells * 3, - // has_dc, - // c_disable_flag, - // cell_f_vol, - // grad_d, - // corr_grad_lin, - // test_bool - // ); + _compute_reconstruct_correction_v2<<>> + ( n_cells * 3, + has_dc, + c_disable_flag, + cell_f_vol, + grad_d, + corr_grad_lin, + test_bool + ); CS_CUDA_CHECK(cudaEventRecord(b_faces_3, stream)); // ----------------------------End of Kernels part 2------------------------------------------- From face75420292d1c64c169775bbeeecb721732605 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Mon, 18 Dec 2023 16:43:49 +0100 Subject: [PATCH 49/70] Cleaning version of slope_test. Gather version doesn't work --- src/alge/cs_convection_diffusion.c | 4 +- src/alge/cs_convection_diffusion_cuda.cu | 102 +++++++++--------- ...lope_test_gradient_vector_cuda_gather.cuh} | 12 ++- ...ope_test_gradient_vector_cuda_scatter.cuh} | 0 4 files changed, 60 insertions(+), 58 deletions(-) rename src/alge/{cs_convection_diffusion_cuda_gather.cuh => cs_slope_test_gradient_vector_cuda_gather.cuh} (92%) rename src/alge/{cs_convection_diffusion_cuda_scatter.cuh => cs_slope_test_gradient_vector_cuda_scatter.cuh} (100%) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 5c32fd8532..fe4d3ad82d 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4416,8 +4416,8 @@ res_cpu = !compute_cuda; res_cpu = false; // A ne pas garder dans la version finale - perf = true; - accuracy = true; + perf = false; + accuracy = false; #if defined(HAVE_CUDA) if(compute_cuda){ diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu index 39a1bbf1ee..287a6af130 100644 --- a/src/alge/cs_convection_diffusion_cuda.cu +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -49,8 +49,8 @@ #include "cs_convection_diffusion.h" #include "cs_convection_diffusion_priv.h" -#include 
"cs_convection_diffusion_cuda_scatter.cuh" -#include "cs_convection_diffusion_cuda_gather.cuh" +#include "cs_slope_test_gradient_vector_cuda_scatter.cuh" +#include "cs_slope_test_gradient_vector_cuda_gather.cuh" @@ -240,60 +240,60 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(init, stream)); if (flag2) { - // cs_slope_test_gradient_vector_cuda_i<<<(unsigned int)ceil((double)n_i_faces / blocksize), blocksize, 0, stream>>> - // (n_i_faces, - // i_face_cells, - // i_face_cog, - // cell_cen, - // pvar_d, - // i_massflux_d, - // i_f_face_normal, - // grad_d, - // grdpa_d); + cs_slope_test_gradient_vector_cuda_i<<<(unsigned int)ceil((double)n_i_faces / blocksize), blocksize, 0, stream>>> + (n_i_faces, + i_face_cells, + i_face_cog, + cell_cen, + pvar_d, + i_massflux_d, + i_f_face_normal, + grad_d, + grdpa_d); - cs_slope_test_gradient_vector_cuda_i_gather<<<(unsigned int)ceil((double)n_cells / blocksize), blocksize, 0, stream>>> - (n_cells, - i_face_cog, - cell_cen, - pvar_d, - i_massflux_d, - i_f_face_normal, - cell_cells_idx, - cell_cells, - cell_i_faces, - cell_i_faces_sgn, - grad_d, - grdpa_d); + // cs_slope_test_gradient_vector_cuda_i_gather<<<(unsigned int)ceil((double)n_cells / blocksize), blocksize, 0, stream>>> + // (n_cells, + // i_face_cog, + // cell_cen, + // pvar_d, + // i_massflux_d, + // i_f_face_normal, + // cell_cells_idx, + // cell_cells, + // cell_i_faces, + // cell_i_faces_sgn, + // grad_d, + // grdpa_d); CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); - // cs_slope_test_gradient_vector_cuda_b<<<(unsigned int)ceil((double)n_b_faces / blocksize), blocksize, 0, stream>>> - // (n_b_faces, - // pvar_d, - // b_face_cells, - // diipb, - // inc, - // coefa_d, - // coefb_d, - // b_f_face_normal, - // grad_d, - // grdpa_d); - - - cs_slope_test_gradient_vector_cuda_b_gather<<<(unsigned int)ceil((double)n_b_cells / blocksize), blocksize, 0, stream>>> - (n_b_cells, - pvar_d, - diipb, - inc, - coefa_d, - coefb_d, - 
b_f_face_normal, - b_cells, - cell_b_faces, - cell_b_faces_idx, - grad_d, - grdpa_d); + cs_slope_test_gradient_vector_cuda_b<<<(unsigned int)ceil((double)n_b_faces / blocksize), blocksize, 0, stream>>> + (n_b_faces, + pvar_d, + b_face_cells, + diipb, + inc, + coefa_d, + coefb_d, + b_f_face_normal, + grad_d, + grdpa_d); + + + // cs_slope_test_gradient_vector_cuda_b_gather<<<(unsigned int)ceil((double)n_b_cells / blocksize), blocksize, 0, stream>>> + // (n_b_cells, + // pvar_d, + // diipb, + // inc, + // coefa_d, + // coefb_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx, + // grad_d, + // grdpa_d); CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); diff --git a/src/alge/cs_convection_diffusion_cuda_gather.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh similarity index 92% rename from src/alge/cs_convection_diffusion_cuda_gather.cuh rename to src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh index 8b13f1dcf0..30a832082b 100644 --- a/src/alge/cs_convection_diffusion_cuda_gather.cuh +++ b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh @@ -19,7 +19,7 @@ cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, } cs_real_t difv[3], djfv[3], vfac[3]; - cs_real_t pif, pjf, pfac; + cs_real_t pif, pjf, pfac, face_sgn; cs_lnum_t c_id2, f_id; cs_lnum_t s_id = cell_cells_idx[c_id1]; @@ -29,6 +29,7 @@ cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, for(cs_lnum_t index = s_id; index < e_id; index++){ c_id2 = cell_cells[index]; f_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; for (int jsou = 0; jsou < 3; jsou++) { difv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id1][jsou]; @@ -41,19 +42,20 @@ cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, pif = pvar[c_id1][isou]; pjf = pvar[c_id2][isou]; for (int jsou = 0; jsou < 3; jsou++) { - pif = pif + grad[c_id1][isou][jsou]*difv[jsou]; - pjf = pjf + grad[c_id2][isou][jsou]*djfv[jsou]; + pif = pif + 
grad[c_id1][isou][jsou] * difv[jsou]; + pjf = pjf + grad[c_id2][isou][jsou] * djfv[jsou]; } pfac = pjf; - if (i_massflux[f_id] > 0.) + if (i_massflux[f_id] * face_sgn > 0.) pfac = pif; + pfac *= face_sgn; /* U gradient */ for (int jsou = 0; jsou < 3; jsou++) { vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; - grdpa[c_id1][isou][jsou] += cell_i_faces_sgn[index] * vfac[jsou]; + grdpa[c_id1][isou][jsou] += vfac[jsou]; } } } diff --git a/src/alge/cs_convection_diffusion_cuda_scatter.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh similarity index 100% rename from src/alge/cs_convection_diffusion_cuda_scatter.cuh rename to src/alge/cs_slope_test_gradient_vector_cuda_scatter.cuh From a7011e9651ab99e79f16b97e31d255fd5abbbeb4 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 19 Dec 2023 12:12:37 +0100 Subject: [PATCH 50/70] Avoid code duplication between gradient and convection --- ...{cs_gradient_cuda.cuh => cs_alge_cuda.cuh} | 69 +++++++++++--- src/alge/cs_convection_diffusion_cuda.cu | 95 ++----------------- src/alge/cs_gradient_cuda.cu | 51 ++-------- 3 files changed, 72 insertions(+), 143 deletions(-) rename src/alge/{cs_gradient_cuda.cuh => cs_alge_cuda.cuh} (87%) diff --git a/src/alge/cs_gradient_cuda.cuh b/src/alge/cs_alge_cuda.cuh similarity index 87% rename from src/alge/cs_gradient_cuda.cuh rename to src/alge/cs_alge_cuda.cuh index 8db8366e54..11c5123553 100644 --- a/src/alge/cs_gradient_cuda.cuh +++ b/src/alge/cs_alge_cuda.cuh @@ -23,6 +23,7 @@ */ /*----------------------------------------------------------------------------*/ +#pragma once #include "cs_defs.h" @@ -72,14 +73,58 @@ #include "cs_timer.h" #include "cs_timer_stats.h" -/*---------------------------------------------------------------------------- - * Header for the current file - *----------------------------------------------------------------------------*/ +BEGIN_C_DECLS + + typedef cs_real_t cs_cocg_t; + typedef cs_real_t cs_cocg_6_t[6]; + typedef cs_real_t cs_cocg_33_t[3][3]; + 
+END_C_DECLS + +template +static void +_sync_or_copy_real_h2d(const T *val_h, + cs_lnum_t n_vals, + int device_id, + cudaStream_t stream, + const T **val_d, + void **buf_d) +{ + const T *_val_d = NULL; + void *_buf_d = NULL; + + cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); + size_t size = n_vals * sizeof(T); + + if (alloc_mode == CS_ALLOC_HOST) { + CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); + cs_cuda_copy_h2d(_buf_d, val_h, size); + _val_d = (const T *)_buf_d; + } + else { + _val_d = (const T *)cs_get_device_ptr((void *)val_h); + + if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) + cudaMemPrefetchAsync(val_h, size, device_id, stream); + else + cs_sync_h2d(val_h); + } + + *val_d = _val_d; + *buf_d = _buf_d; +} + +/* Compute gridsize*/ + +static unsigned int +get_gridsize(unsigned int size, unsigned int blocksize){ + unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); + + return gridsize; +} -#include "cs_gradient.h" -#include "cs_gradient_priv.h" -__device__ cs_real_t +__device__ static cs_real_t cs_math_fabs_cuda(cs_real_t x) { cs_real_t ret = (x < 0) ? 
-x : x; @@ -87,7 +132,7 @@ cs_math_fabs_cuda(cs_real_t x) return ret; } -__device__ cs_real_t +__device__ static cs_real_t cs_math_3_dot_product_cuda(const cs_real_t u[3], const cs_real_t v[3]) { @@ -112,7 +157,7 @@ _set_one_to_coeff_b(const cs_lnum_t n_b_faces, _bc_coeff_b[f_id][i][i] = 1; } -__device__ void cs_math_3_normalize_cuda(const cs_real_t in[3], +__device__ static void cs_math_3_normalize_cuda(const cs_real_t in[3], cs_real_t out[3]) { cs_real_t norm = sqrt(in[0]*in[0] @@ -126,12 +171,12 @@ __device__ void cs_math_3_normalize_cuda(const cs_real_t in[3], out[2] = inverse_norm * in[2]; } -__device__ cs_real_t cs_math_3_square_norm_cuda(const cs_real_t in[3]){ +__device__ static cs_real_t cs_math_3_square_norm_cuda(const cs_real_t in[3]){ cs_real_t norm = in[0]*in[0] + in[1]*in[1] + in[2]*in[2]; return norm; } -__device__ void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ +__device__ static void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ cs_real_t in00 = in[1]*in[2] - in[4]*in[4]; cs_real_t in01 = in[4]*in[5] - in[3]*in[2]; cs_real_t in02 = in[3]*in[4] - in[1]*in[5]; @@ -150,7 +195,7 @@ __device__ void _math_6_inv_cramer_sym_in_place_cuda(cs_cocg_t in[6]){ } template -__device__ void +__device__ static void _fact_crout_pp_cuda(cs_real_t *ad) { cs_real_t aux[d_size]; @@ -169,7 +214,7 @@ _fact_crout_pp_cuda(cs_real_t *ad) } template -__device__ void +__device__ static void _fw_and_bw_ldtl_pp_cuda(const cs_real_t mat[], cs_real_t x[], const cs_real_t b[]) diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu index 287a6af130..ff26fe7294 100644 --- a/src/alge/cs_convection_diffusion_cuda.cu +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -1,50 +1,4 @@ -#include "cs_defs.h" - -/*---------------------------------------------------------------------------- - * Standard C library headers - *----------------------------------------------------------------------------*/ - -#include -#include -#include 
-#include -#include -#include -#include -#include - -#if defined(HAVE_MPI) -#include -#endif - -#include - -/*---------------------------------------------------------------------------- - * Local headers - *----------------------------------------------------------------------------*/ - -#include "bft_error.h" -#include "bft_mem.h" - -#include "cs_base_accel.h" -#include "cs_base_cuda.h" -#include "cs_blas.h" -#include "cs_cell_to_vertex.h" -#include "cs_ext_neighborhood.h" -#include "cs_field.h" -#include "cs_field_pointer.h" -#include "cs_halo.h" -#include "cs_halo_perio.h" -#include "cs_log.h" -#include "cs_math.h" -#include "cs_mesh.h" -#include "cs_mesh_adjacencies.h" -#include "cs_mesh_quantities.h" -#include "cs_parall.h" -#include "cs_porous_model.h" -#include "cs_prototypes.h" -#include "cs_timer.h" -#include "cs_timer_stats.h" +#include "cs_alge_cuda.cuh" #include "cs_convection_diffusion.h" #include "cs_convection_diffusion_priv.h" @@ -52,41 +6,6 @@ #include "cs_slope_test_gradient_vector_cuda_scatter.cuh" #include "cs_slope_test_gradient_vector_cuda_gather.cuh" - - -template -static void -_sync_or_copy_real_h2d_c(const T *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const T **val_d, - void **buf_d) -{ - const T *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(T); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const T *)_buf_d; - } - else { - _val_d = (const T *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - /*---------------------------------------------------------------------------- * _gradient_vector the gradient of a vector using a given gradient of * this vector (typically lsq). 
@@ -219,11 +138,11 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, /* Initialization */ - _sync_or_copy_real_h2d_c(pvar, n_cells_ext, device_id, stream, + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - _sync_or_copy_real_h2d_c(coefav, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_h2d_c(coefbv, n_b_faces, device_id, stream, + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); if(flag1){ @@ -268,7 +187,7 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(f_i, stream)); - cs_slope_test_gradient_vector_cuda_b<<<(unsigned int)ceil((double)n_b_faces / blocksize), blocksize, 0, stream>>> + cs_slope_test_gradient_vector_cuda_b<<>> (n_b_faces, pvar_d, b_face_cells, @@ -281,7 +200,7 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, grdpa_d); - // cs_slope_test_gradient_vector_cuda_b_gather<<<(unsigned int)ceil((double)n_b_cells / blocksize), blocksize, 0, stream>>> + // cs_slope_test_gradient_vector_cuda_b_gather<<>> // (n_b_cells, // pvar_d, // diipb, @@ -297,7 +216,7 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(f_b, stream)); - cs_slope_test_gradient_vector_cuda_f<<<(unsigned int)ceil((double)n_cells * 3 * 3 / blocksize), blocksize, 0, stream>>> + cs_slope_test_gradient_vector_cuda_f<<>> (n_cells * 3 * 3, cell_vol, grdpa_d); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index d0ca70d3d0..011a6ed4bf 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -23,7 +23,7 @@ */ /*----------------------------------------------------------------------------*/ -#include "cs_gradient_cuda.cuh" +#include "cs_alge_cuda.cuh" #include "cs_gradient.h" #include "cs_gradient_lsq_vector.cuh" @@ -43,6 +43,13 @@ #include "cs_reconstruct_vector_gradient_scatter_v2.cuh" #include 
"cs_reconstruct_vector_gradient_scatter_v2_cf.cuh" +/*---------------------------------------------------------------------------- + * Header for the current file + *----------------------------------------------------------------------------*/ + +#include "cs_gradient.h" +#include "cs_gradient_priv.h" + /*! \cond DOXYGEN_SHOULD_SKIP_THIS */ /*----------------------------------------------------------------------------*/ @@ -531,48 +538,6 @@ _compute_gradient_lsq_b_v(cs_lnum_t size, * after use if non-NULL) *----------------------------------------------------------------------------*/ -template -static void -_sync_or_copy_real_h2d(const T *val_h, - cs_lnum_t n_vals, - int device_id, - cudaStream_t stream, - const T **val_d, - void **buf_d) -{ - const T *_val_d = NULL; - void *_buf_d = NULL; - - cs_alloc_mode_t alloc_mode = cs_check_device_ptr(val_h); - size_t size = n_vals * sizeof(T); - - if (alloc_mode == CS_ALLOC_HOST) { - CS_CUDA_CHECK(cudaMalloc(&_buf_d, size)); - cs_cuda_copy_h2d(_buf_d, val_h, size); - _val_d = (const T *)_buf_d; - } - else { - _val_d = (const T *)cs_get_device_ptr((void *)val_h); - - if (alloc_mode == CS_ALLOC_HOST_DEVICE_SHARED) - cudaMemPrefetchAsync(val_h, size, device_id, stream); - else - cs_sync_h2d(val_h); - } - - *val_d = _val_d; - *buf_d = _buf_d; -} - -/* Compute gridsize*/ - -unsigned int -get_gridsize(unsigned int size, unsigned int blocksize){ - unsigned int gridsize = (unsigned int)ceil((double)size / blocksize); - - return gridsize; -} - /*! 
(DOXYGEN_SHOULD_SKIP_THIS) \endcond */ /*============================================================================= From cf267146262ccb6fbe8629988010eac77dc532a4 Mon Sep 17 00:00:00 2001 From: mohammed derbane <115648603+aneo-mderbane@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:04:58 +0100 Subject: [PATCH 51/70] Apply suggestions from code review Co-authored-by: ddiakiteaneo <127390724+ddiakiteaneo@users.noreply.github.com> --- src/alge/cs_convection_diffusion.c | 2 +- src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index fe4d3ad82d..5ac6e85ffe 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4600,7 +4600,7 @@ res_cpu = !compute_cuda; #endif if(compute_cpu){ - printf("convection Compute and tranferts time in us: CPU = %ld\n", elapsed); + printf("convection compute time in us: CPU = %ld\n", elapsed); } } diff --git a/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh index 30a832082b..868a179d12 100644 --- a/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh +++ b/src/alge/cs_slope_test_gradient_vector_cuda_gather.cuh @@ -36,8 +36,6 @@ cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, djfv[jsou] = i_face_cog[f_id][jsou] - cell_cen[c_id2][jsou]; } - /* x-y-z component, p = u, v, w */ - for (int isou = 0; isou < 3; isou++) { pif = pvar[c_id1][isou]; pjf = pvar[c_id2][isou]; @@ -51,8 +49,6 @@ cs_slope_test_gradient_vector_cuda_i_gather( const cs_lnum_t n_cells, pfac = pif; pfac *= face_sgn; - /* U gradient */ - for (int jsou = 0; jsou < 3; jsou++) { vfac[jsou] = pfac*i_f_face_normal[f_id][jsou]; grdpa[c_id1][isou][jsou] += vfac[jsou]; @@ -94,8 +90,6 @@ cs_slope_test_gradient_vector_cuda_b_gather(const cs_lnum_t n_b_cells, for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = 
cell_b_faces[index]; - /* x-y-z components, p = u, v, w */ - for (int jsou = 0; jsou < 3; jsou++) diipbv[jsou] = diipb[f_id][jsou]; From 4d4dc1d3f7293947daec44eebbd241e01e17bde9 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Wed, 20 Dec 2023 10:10:03 +0100 Subject: [PATCH 52/70] review of Daouda --- src/alge/cs_convection_diffusion.c | 10 +++++----- src/alge/cs_gradient.cxx | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 5ac6e85ffe..bc0c152939 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4411,13 +4411,13 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = true; - compute_cpu = true; - res_cpu = false; + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; // A ne pas garder dans la version finale - perf = false; - accuracy = false; + // perf = false; + // accuracy = false; #if defined(HAVE_CUDA) if(compute_cuda){ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 57b1516a38..e37456bce8 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5590,13 +5590,13 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = true; - compute_cpu = true; - res_cpu = false; + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; // A ne pas garder dans la version finale - perf = false; - accuracy = false; + // perf = false; + // accuracy = false; #if defined(HAVE_CUDA) @@ -6938,13 +6938,13 @@ _lsq_vector_gradient(const cs_mesh_t *m, // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = false; - compute_cpu = true; - res_cpu = true; + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; // A ne pas garder dans la version finale - perf = false; - accuracy = false; + 
// perf = false; + // accuracy = false; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); @@ -9161,13 +9161,13 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - compute_cuda = true; - compute_cpu = true; - res_cpu = false; + // compute_cuda = true; + // compute_cpu = true; + // res_cpu = false; // A ne pas garder dans la version finale - perf = false; - accuracy = false; + // perf = false; + // accuracy = false; // Compute on GPU #if defined(HAVE_CUDA) From 83c7e8e2cfd3617389f57743dd2d9cda7921f23b Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 8 Dec 2023 09:11:27 +0100 Subject: [PATCH 53/70] OpenMP LSQ vector --- src/alge/cs_gradient.cxx | 477 +++++++++++++++++++++++++++++++++------ src/user/cs_user_mesh.c | 45 +++- 2 files changed, 446 insertions(+), 76 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e37456bce8..f114b87d2e 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -198,7 +198,7 @@ const cs_e2n_sum_t _e2n_sum_type = CS_E2N_SUM_SCATTER; /* Strided LSQ gradient variant */ -static int _use_legacy_strided_lsq_gradient = false; +static int _use_legacy_strided_lsq_gradient = true; /*============================================================================ * Private function definitions @@ -6863,6 +6863,333 @@ _find_bc_coeffs(const char *var_name, * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) *----------------------------------------------------------------------------*/ +void cs_math_3_normalize_target(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. 
/ norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + +static void +_lsq_vector_gradient_target(const cs_mesh_t *m, + const cs_mesh_adjacencies_t *madj, + const cs_mesh_quantities_t *fvq, + const cs_halo_type_t halo_type, + const int inc, + const cs_real_3_t *restrict coefav, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict pvar, + const cs_real_t *restrict c_weight, + cs_real_33_t *restrict gradv, + cs_cocg_6_t *restrict cocg, + cs_real_33_t *restrict rhs) +{ + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)m->cell_cells_idx; + const cs_lnum_t *restrict cell_cells_lst + = (const cs_lnum_t *restrict)m->cell_cells_lst; + + const cs_real_3_t *restrict cell_f_cen + = (const cs_real_3_t *restrict)fvq->cell_f_cen; + const cs_real_t *restrict weight = fvq->weight; + const cs_real_t *restrict b_dist = fvq->b_dist; + const cs_real_3_t *restrict b_face_normal + = (const cs_real_3_t *restrict)fvq->b_face_normal; + + // cs_real_33_t *rhs; + // BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); + + /* Timing the computation */ + +#if defined(HAVE_CUDA) + bool accel = (cs_get_device_id() > -1) ? 
true : false; +#else + bool accel = false; +#endif + + // _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); + + double t_kernel = 0.0; + double t_begin, t_end; + + /* Contribution from interior faces */ + int num_device = omp_get_num_devices(); + printf("OMP supported devices %d\n", num_device); + t_begin = omp_get_wtime(); +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(from: gradv[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], coefbv[0:n_b_faces], \ + cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ + cell_cells_idx[0:n_cells_ext], \ + cell_cells_lst[0:n_cells_ext], \ + cocg[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for collapse(3) map(tofrom: rhs[0:n_cells_ext]) + for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t j = 0; j < 3; j++) + rhs[c_id][i][j] = 0.0; + } + + // for (int g_id = 0; g_id < n_i_groups; g_id++) { + + // for (int t_id = 0; t_id < n_i_threads; t_id++) { + #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext]) + for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { + + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; + + cs_real_t dc[3], fctb[3]; + + for (cs_lnum_t i = 0; i < 3; i++) + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL) { + cs_real_t pond = weight[f_id]; + cs_real_t denom = 1. / ( pond *c_weight[c_id1] + + (1. 
- pond)*c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; + } + } + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += fctb[j]; + } + } + } + + } /* loop on faces */ + + // } /* loop on threads */ + + // } /* loop on thread groups */ + + /* Contribution from extended neighborhood */ + + if (halo_type == CS_HALO_EXTENDED) { + + #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ + map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ + cell_cells_idx[0:n_cells_ext], \ + cell_cells_lst[0:n_cells_ext]) + for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { + for (cs_lnum_t cidx = cell_cells_idx[c_id1]; + cidx < cell_cells_idx[c_id1+1]; + cidx++) { + + cs_lnum_t c_id2 = cell_cells_lst[cidx]; + + cs_real_t dc[3]; + + for (cs_lnum_t i = 0; i < 3; i++) + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + rhs[c_id1][i][j] += dc[j] * pfac; + } + } + } + } + + } /* End for extended neighborhood */ + + /* Contribution from boundary faces */ + +// for (int t_id = 0; t_id < n_b_threads; t_id++) { + + #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], coefbv[0:n_b_faces], \ + pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) + for (cs_lnum_t f_id = 
0; f_id < n_b_faces; f_id++) { + + cs_lnum_t c_id1 = b_face_cells[f_id]; + + cs_real_t n_d_dist[3]; + // /* Normal is vector 0 if the b_face_normal norm is too small */ + cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. / b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++) + n_d_dist[i] *= d_b_dist; + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); + + for (cs_lnum_t j = 0; j < 3; j++) + #pragma omp atomic + rhs[c_id1][i][j] += n_d_dist[j] * pfac; + } + + } /* loop on faces */ +// +// } /* loop on threads */ +// +// /* Compute gradient */ +// /*------------------*/ + + #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ + map(from: gradv[0:n_cells_ext]) \ + map(to: pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] + + rhs[c_id][i][1] * cocg[c_id][3] + + rhs[c_id][i][2] * cocg[c_id][5]; + + gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] + + rhs[c_id][i][1] * cocg[c_id][1] + + rhs[c_id][i][2] * cocg[c_id][4]; + + gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] + + rhs[c_id][i][1] * cocg[c_id][4] + + rhs[c_id][i][2] * cocg[c_id][2]; + } + } + +} // end omp target + +t_end = omp_get_wtime(); + +t_kernel = t_end - t_begin; +printf("Time of kernel: %lf\n", t_kernel); + +// printf("Check grad target"); +// for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { +// for (cs_lnum_t i = 0; i < 3; i++) { +// for (int j = 0; j < 3; ++j) { +// if (gradv[c_id][i][j] != 0.) 
{ +// printf("DIFFERENCE @%d-%d-%d: %f\n", c_id, i, j, gradv[c_id][i][j]); +// } +// } +// } +// } + + /* Compute gradient on boundary cells */ + /*------------------------------------*/ + +// cs_lnum_t t_s_id, t_e_id; +// cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); + +// /* Build indices bijection between [1-9] and [1-3]*[1-3] */ + +// cs_lnum_t _33_9_idx[9][2]; +// int nn = 0; +// for (int ll = 0; ll < 3; ll++) { +// for (int mm = 0; mm < 3; mm++) { +// _33_9_idx[nn][0] = ll; +// _33_9_idx[nn][1] = mm; +// nn++; +// } +// } + +// /* Loop on boundary cells */ + +// for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { + +// cs_lnum_t c_id = m->b_cells[b_c_id]; + +// cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + +// _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); + +// _compute_cocgb_rhsb_lsq_v +// (c_id, +// inc, +// madj, +// fvq, +// _33_9_idx, +// (const cs_real_3_t *)pvar, +// (const cs_real_3_t *)coefav, +// (const cs_real_33_t *)coefbv, +// (const cs_real_3_t *)cocgb, +// (const cs_real_3_t *)rhs[c_id], +// cocgb_v, +// rhsb_v); + +// _fw_and_bw_ldtl_pp(cocgb_v, +// 9, +// x, +// rhsb_v); + +// for (int kk = 0; kk < 9; kk++) { +// int ii = _33_9_idx[kk][0]; +// int jj = _33_9_idx[kk][1]; +// gradv[c_id][ii][jj] = x[kk]; +// } + +// } + + +// /* Periodicity and parallelism treatment */ + +// if (m->halo != NULL) { +// cs_halo_sync_var_strided(m->halo, halo_type, (cs_real_t *)gradv, 9); +// if (cs_glob_mesh->have_rotation_perio) +// cs_halo_perio_sync_var_tens(m->halo, halo_type, (cs_real_t *)gradv); +// } + + BFT_FREE(rhs); +} + static void _lsq_vector_gradient(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, @@ -6905,7 +7232,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, /* Timing the computation */ std::chrono::high_resolution_clock::time_point start, stop; - std::chrono::microseconds elapsed, elapsed_cuda; + std::chrono::microseconds elapsed, elapsed_cuda, elapsed_target; #if 
defined(HAVE_CUDA) bool accel = (cs_get_device_id() > -1) ? true : false; @@ -6915,7 +7242,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); - cs_real_33_t *rhs, *rhs_cuda, *gradv_cuda, *gradv_cpu; + cs_real_33_t *rhs, *rhs_cuda, *rhs_target, *gradv_cuda, *gradv_cpu, *gradv_target; bool compute_cuda, compute_cpu, res_cpu, perf, accuracy; compute_cuda = accel; @@ -6935,21 +7262,23 @@ _lsq_vector_gradient(const cs_mesh_t *m, perf = false; accuracy = false; #endif +accuracy = false; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever // compute_cuda = true; // compute_cpu = true; // res_cpu = false; - - // A ne pas garder dans la version finale // perf = false; // accuracy = false; + +BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(rhs_target, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); +BFT_MALLOC(gradv_target, n_cells_ext, cs_real_33_t); - BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); - BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); - BFT_MALLOC(gradv_cuda, n_cells_ext, cs_real_33_t); - BFT_MALLOC(gradv_cpu, n_cells_ext, cs_real_33_t); /* Compute Right-Hand Side */ /*-------------------------*/ #if defined(HAVE_CUDA) @@ -6979,6 +7308,23 @@ _lsq_vector_gradient(const cs_mesh_t *m, } // end if compute_cuda #endif +start = std::chrono::high_resolution_clock::now(); +_lsq_vector_gradient_target(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + gradv_target, + cocg, + rhs_target); +stop = std::chrono::high_resolution_clock::now(); +elapsed_target = std::chrono::duration_cast(stop - start); +printf("OMP target lsq %ld\n", elapsed_target.count()); + if(compute_cpu){ if(perf){ start = std::chrono::high_resolution_clock::now(); @@ -6991,17 +7337,6 @@ if(compute_cpu){ } /* Contribution from interior faces */ - // int num_device = 
omp_get_num_devices(); - // printf("OMP supported devices %d\n", num_device); - // #pragma omp target - // { - // #pragma omp teams distribute parallel for - // for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - // for (cs_lnum_t i = 0; i < 3; i++) - // for (cs_lnum_t j = 0; j < 3; j++) - // rhs[c_id][i][j] = 0.0; - // } - // } for (int g_id = 0; g_id < n_i_groups; g_id++) { # pragma omp parallel for @@ -7141,64 +7476,78 @@ if(compute_cpu){ + rhs[c_id][i][2] * cocg[c_id][2]; } } - /* Compute gradient on boundary cells */ - /*------------------------------------*/ - - #pragma omp parallel - { - cs_lnum_t t_s_id, t_e_id; - cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); - /* Build indices bijection between [1-9] and [1-3]*[1-3] */ +for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { + for (int j = 0; j < 3; ++j) { + auto cpu = gradv_cpu[c_id][i][j]; + auto omp = gradv_target[c_id][i][j]; - cs_lnum_t _33_9_idx[9][2]; - int nn = 0; - for (int ll = 0; ll < 3; ll++) { - for (int mm = 0; mm < 3; mm++) { - _33_9_idx[nn][0] = ll; - _33_9_idx[nn][1] = mm; - nn++; + if (fabs(cpu - omp) / fmax(fabs(cpu), 1e-6) > 1e-12) { + printf("DIFFERENCE @%d-%d-%d: CPU = %a\tOMP = %a\n|CPU - OMP| = %a\t|CPU - OMP|ulp = %a\n", c_id, i, j, cpu, omp, fabs(cpu - omp), cs_diff_ulp(cpu, omp)); } } + } +} - /* Loop on boundary cells */ + /* Compute gradient on boundary cells */ + /*------------------------------------*/ - for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { + // #pragma omp parallel + // { + // cs_lnum_t t_s_id, t_e_id; + // cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); + + // /* Build indices bijection between [1-9] and [1-3]*[1-3] */ + + // cs_lnum_t _33_9_idx[9][2]; + // int nn = 0; + // for (int ll = 0; ll < 3; ll++) { + // for (int mm = 0; mm < 3; mm++) { + // _33_9_idx[nn][0] = ll; + // _33_9_idx[nn][1] = mm; + // nn++; + // } + // } - cs_lnum_t c_id = 
m->b_cells[b_c_id]; + // /* Loop on boundary cells */ - cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + // for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { - _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); + // cs_lnum_t c_id = m->b_cells[b_c_id]; - _compute_cocgb_rhsb_lsq_v - (c_id, - inc, - madj, - fvq, - _33_9_idx, - (const cs_real_3_t *)pvar, - (const cs_real_3_t *)coefav, - (const cs_real_33_t *)coefbv, - (const cs_real_3_t *)cocgb, - (const cs_real_3_t *)rhs[c_id], - cocgb_v, - rhsb_v); + // cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; - _fw_and_bw_ldtl_pp(cocgb_v, - 9, - x, - rhsb_v); + // _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); - for (int kk = 0; kk < 9; kk++) { - int ii = _33_9_idx[kk][0]; - int jj = _33_9_idx[kk][1]; - gradv_cpu[c_id][ii][jj] = x[kk]; - } + // _compute_cocgb_rhsb_lsq_v + // (c_id, + // inc, + // madj, + // fvq, + // _33_9_idx, + // (const cs_real_3_t *)pvar, + // (const cs_real_3_t *)coefav, + // (const cs_real_33_t *)coefbv, + // (const cs_real_3_t *)cocgb, + // (const cs_real_3_t *)rhs[c_id], + // cocgb_v, + // rhsb_v); - } + // _fw_and_bw_ldtl_pp(cocgb_v, + // 9, + // x, + // rhsb_v); - } + // for (int kk = 0; kk < 9; kk++) { + // int ii = _33_9_idx[kk][0]; + // int jj = _33_9_idx[kk][1]; + // gradv_cpu[c_id][ii][jj] = x[kk]; + // } + + // } + + // } stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); } // end if COMPUTE_CPU diff --git a/src/user/cs_user_mesh.c b/src/user/cs_user_mesh.c index a0795a0e51..1208efc081 100644 --- a/src/user/cs_user_mesh.c +++ b/src/user/cs_user_mesh.c @@ -57,15 +57,7 @@ BEGIN_C_DECLS -/*----------------------------------------------------------------------------*/ -/*! - * \file cs_user_mesh.c - * - * \brief Definition and modification of the calculation mesh. - * - * See \ref cs_user_mesh for examples. 
- */ -/*----------------------------------------------------------------------------*/ +static int _n_tubes = 1; /*============================================================================ * User function definitions @@ -88,6 +80,7 @@ BEGIN_C_DECLS void cs_user_mesh_restart_mode(void) { + const char *path = "mesh_input.csm"; } @@ -110,11 +103,24 @@ cs_user_mesh_input(void) */ /*----------------------------------------------------------------------------*/ -#pragma weak cs_user_join + void cs_user_join(void) { + int join_num; + + /* Add a joining operation */ + /* ----------------------- */ + + int verbosity = 1; /* per-task dump if > 1, debug level if >= 3 */ + int visualization = 1; /* debug level if >= 3 */ + float fraction = 0.10, plane = 25.; + join_num = cs_join_add("join", + fraction, + plane, + verbosity, + visualization); } /*----------------------------------------------------------------------------*/ @@ -123,11 +129,26 @@ cs_user_join(void) */ /*----------------------------------------------------------------------------*/ -#pragma weak cs_user_periodicity + void cs_user_periodicity(void) { - + int join_num; + + int verbosity = 1; /* per-task dump if > 1, debug level if >= 3 */ + int visualization = 1; /* debug level if >= 3 */ + float fraction = 0.10, plane = 25.; + + const double translation[3] = {0.0, + 0.0450*_n_tubes, + 0.0}; /* Translation vector */ + + join_num = cs_join_perio_add_translation("south or north", + fraction, + plane, + verbosity, + visualization, + translation); } /*----------------------------------------------------------------------------*/ From 020ddd3a293e7f3d5bcf8f1e2b77f584b0060aa2 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 8 Dec 2023 10:44:24 +0100 Subject: [PATCH 54/70] Clean up code --- src/alge/cs_gradient.cxx | 221 +++++++++++++-------------------------- 1 file changed, 70 insertions(+), 151 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index f114b87d2e..199cc7fb78 100644 --- 
a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6946,71 +6946,68 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_cells_lst[0:n_cells_ext], \ cocg[0:n_cells_ext]) { - #pragma omp target teams distribute parallel for collapse(3) map(tofrom: rhs[0:n_cells_ext]) + #pragma omp target teams distribute parallel for collapse(3) \ + map(tofrom: rhs[0:n_cells_ext]) for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) - for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t i = 0; i < 3; i++){ + for (cs_lnum_t j = 0; j < 3; j++){ rhs[c_id][i][j] = 0.0; + } + } } - // for (int g_id = 0; g_id < n_i_groups; g_id++) { + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + cell_f_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext]) + for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { - // for (int t_id = 0; t_id < n_i_threads; t_id++) { - #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ - map(to: i_face_cells[0:n_i_faces], \ - cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext]) - for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; - cs_lnum_t c_id1 = i_face_cells[f_id][0]; - cs_lnum_t c_id2 = i_face_cells[f_id][1]; + cs_real_t dc[3], fctb[3]; - cs_real_t dc[3], fctb[3]; + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } - for (cs_lnum_t i = 0; i < 3; i++) - dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - - if (c_weight != NULL) { - cs_real_t pond = weight[f_id]; - cs_real_t denom = 1. / ( pond *c_weight[c_id1] - + (1. 
- pond)*c_weight[c_id2]); - - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; - } - } - } - else { - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += fctb[j]; - } - } - } + if (c_weight != NULL) { + cs_real_t pond = weight[f_id]; + cs_real_t denom = 1. / ( pond *c_weight[c_id1] + + (1. - pond)*c_weight[c_id2]); - } /* loop on faces */ + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - // } /* loop on threads */ + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; + } + } + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - // } /* loop on thread groups */ + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += fctb[j]; + } + } + } - /* Contribution from extended neighborhood */ + } - if (halo_type == CS_HALO_EXTENDED) { + if (halo_type == CS_HALO_EXTENDED) { #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ @@ -7025,8 +7022,9 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_real_t dc[3]; - for (cs_lnum_t i = 0; i < 3; i++) - dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = 
cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); @@ -7041,17 +7039,15 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } } - } /* End for extended neighborhood */ - - /* Contribution from boundary faces */ + } -// for (int t_id = 0; t_id < n_b_threads; t_id++) { - - #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ - map(to: b_face_normal[0:n_b_faces], \ - coefav[0:n_b_faces], coefbv[0:n_b_faces], \ - pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { cs_lnum_t c_id1 = b_face_cells[f_id]; @@ -7063,8 +7059,9 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_real_t d_b_dist = 1. / b_dist[f_id]; // /* Normal divided by b_dist */ - for (cs_lnum_t i = 0; i < 3; i++) + for (cs_lnum_t i = 0; i < 3; i++){ n_d_dist[i] *= d_b_dist; + } for (cs_lnum_t i = 0; i < 3; i++) { cs_real_t pfac = coefav[f_id][i]*inc @@ -7073,22 +7070,19 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, + coefbv[f_id][2][i] * pvar[c_id1][2] - pvar[c_id1][i]); - for (cs_lnum_t j = 0; j < 3; j++) + for (cs_lnum_t j = 0; j < 3; j++){ #pragma omp atomic rhs[c_id1][i][j] += n_d_dist[j] * pfac; + } } - } /* loop on faces */ -// -// } /* loop on threads */ -// -// /* Compute gradient */ -// /*------------------*/ + } - #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ - map(from: gradv[0:n_cells_ext]) \ - map(to: pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(from: gradv[0:n_cells_ext]) \ + map(to: pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { for (cs_lnum_t i = 0; i 
< 3; i++) { gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] @@ -7105,88 +7099,13 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } } -} // end omp target +} // end omp data t_end = omp_get_wtime(); t_kernel = t_end - t_begin; printf("Time of kernel: %lf\n", t_kernel); -// printf("Check grad target"); -// for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { -// for (cs_lnum_t i = 0; i < 3; i++) { -// for (int j = 0; j < 3; ++j) { -// if (gradv[c_id][i][j] != 0.) { -// printf("DIFFERENCE @%d-%d-%d: %f\n", c_id, i, j, gradv[c_id][i][j]); -// } -// } -// } -// } - - /* Compute gradient on boundary cells */ - /*------------------------------------*/ - -// cs_lnum_t t_s_id, t_e_id; -// cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); - -// /* Build indices bijection between [1-9] and [1-3]*[1-3] */ - -// cs_lnum_t _33_9_idx[9][2]; -// int nn = 0; -// for (int ll = 0; ll < 3; ll++) { -// for (int mm = 0; mm < 3; mm++) { -// _33_9_idx[nn][0] = ll; -// _33_9_idx[nn][1] = mm; -// nn++; -// } -// } - -// /* Loop on boundary cells */ - -// for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { - -// cs_lnum_t c_id = m->b_cells[b_c_id]; - -// cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; - -// _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); - -// _compute_cocgb_rhsb_lsq_v -// (c_id, -// inc, -// madj, -// fvq, -// _33_9_idx, -// (const cs_real_3_t *)pvar, -// (const cs_real_3_t *)coefav, -// (const cs_real_33_t *)coefbv, -// (const cs_real_3_t *)cocgb, -// (const cs_real_3_t *)rhs[c_id], -// cocgb_v, -// rhsb_v); - -// _fw_and_bw_ldtl_pp(cocgb_v, -// 9, -// x, -// rhsb_v); - -// for (int kk = 0; kk < 9; kk++) { -// int ii = _33_9_idx[kk][0]; -// int jj = _33_9_idx[kk][1]; -// gradv[c_id][ii][jj] = x[kk]; -// } - -// } - - -// /* Periodicity and parallelism treatment */ - -// if (m->halo != NULL) { -// cs_halo_sync_var_strided(m->halo, halo_type, (cs_real_t *)gradv, 9); -// if (cs_glob_mesh->have_rotation_perio) -// 
cs_halo_perio_sync_var_tens(m->halo, halo_type, (cs_real_t *)gradv); -// } - BFT_FREE(rhs); } From 021505ecc188d4d3503f1d2e30411b51bed1891d Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 8 Dec 2023 14:50:10 +0100 Subject: [PATCH 55/70] Interior and boundary faces contributions gather omp --- src/alge/cs_gradient.cxx | 250 ++++++++++++++++++++++++++++----------- 1 file changed, 184 insertions(+), 66 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 199cc7fb78..e294b90ff6 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6892,6 +6892,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_real_33_t *restrict rhs) { const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; const cs_lnum_t n_i_faces = m->n_i_faces; const cs_lnum_t n_b_faces = m->n_b_faces; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; @@ -6910,6 +6911,15 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_lnum_t *restrict cell_cells_lst = (const cs_lnum_t *restrict)m->cell_cells_lst; + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)madj->cell_cells; + const cs_lnum_t *restrict cell_i_faces_sgn + = (const cs_lnum_t *restrict)madj->cell_i_faces_sgn; + const short int *restrict cell_i_faces + = (const short int *restrict)madj->cell_i_faces; + const short int *restrict cell_b_faces + = (const short int *restrict)madj->cell_b_faces; + const cs_real_3_t *restrict cell_f_cen = (const cs_real_3_t *restrict)fvq->cell_f_cen; const cs_real_t *restrict weight = fvq->weight; @@ -6933,6 +6943,8 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, double t_kernel = 0.0; double t_begin, t_end; + bool scatter = true; + /* Contribution from interior faces */ int num_device = omp_get_num_devices(); printf("OMP supported devices %d\n", num_device); @@ -6955,62 +6967,119 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } } } + if(scatter){ + #pragma omp target teams distribute parallel for \ + 
map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + cell_f_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext]) + for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { - #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: i_face_cells[0:n_i_faces], \ - cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) - for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { + cs_lnum_t c_id1 = i_face_cells[f_id][0]; + cs_lnum_t c_id2 = i_face_cells[f_id][1]; - cs_lnum_t c_id1 = i_face_cells[f_id][0]; - cs_lnum_t c_id2 = i_face_cells[f_id][1]; + cs_real_t dc[3], fctb[3]; - cs_real_t dc[3], fctb[3]; + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; + } - for (cs_lnum_t i = 0; i < 3; i++){ - dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; - } + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + if (c_weight != NULL) { + cs_real_t pond = weight[f_id]; + cs_real_t denom = 1. / ( pond *c_weight[c_id1] + + (1. - pond)*c_weight[c_id2]); - if (c_weight != NULL) { - cs_real_t pond = weight[f_id]; - cs_real_t denom = 1. / ( pond *c_weight[c_id1] - + (1. 
- pond)*c_weight[c_id2]); + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; + } + } + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += fctb[j]; + } } } + } - else { - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + } + else{ + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + cell_cells_idx[0:n_cells_ext], \ + cell_cells[0:n_cells_ext], \ + cell_f_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext]) + for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += fctb[j]; + cs_lnum_t s_id = cell_cells_idx[c_id]; + cs_lnum_t e_id = cell_cells_idx[c_id+1]; + + cs_lnum_t c_id2, f_id; + + cs_real_t dc[3], fctb[3]; + for(cs_lnum_t index = s_id; index < e_id; index++){ + + c_id2 = cell_cells[index]; + + for (cs_lnum_t i = 0; i < 3; i++){ + dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id][i]; + } + + cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); + + if (c_weight != NULL) { + f_id = cell_i_faces[index]; + 
cs_real_t pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + cs_real_t denom = 1. / ( pond *c_weight[c_id] + + (1. - pond)*c_weight[c_id2]); + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + rhs[c_id][i][j] += c_weight[c_id2] * denom * fctb[j]; + } + } + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + rhs[c_id][i][j] += fctb[j]; + } + } } } - } - } + } + } if (halo_type == CS_HALO_EXTENDED) { - #pragma omp target teams distribute parallel for map(tofrom: rhs[0:n_cells_ext]) \ - map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ cell_cells_idx[0:n_cells_ext], \ cell_cells_lst[0:n_cells_ext]) for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { @@ -7041,42 +7110,91 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } - #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: b_face_normal[0:n_b_faces], \ - coefav[0:n_b_faces], \ - coefbv[0:n_b_faces], \ - pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) - for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + if(scatter){ + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_normal[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { - cs_lnum_t c_id1 = b_face_cells[f_id]; + cs_lnum_t c_id1 = b_face_cells[f_id]; - cs_real_t n_d_dist[3]; - // /* Normal is vector 0 if the b_face_normal norm is too small */ - cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); + cs_real_t n_d_dist[3]; + // /* 
Normal is vector 0 if the b_face_normal norm is too small */ + cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); - cs_real_t d_b_dist = 1. / b_dist[f_id]; + cs_real_t d_b_dist = 1. / b_dist[f_id]; - // /* Normal divided by b_dist */ - for (cs_lnum_t i = 0; i < 3; i++){ - n_d_dist[i] *= d_b_dist; - } + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = coefav[f_id][i]*inc - + ( coefbv[f_id][0][i] * pvar[c_id1][0] - + coefbv[f_id][1][i] * pvar[c_id1][1] - + coefbv[f_id][2][i] * pvar[c_id1][2] - - pvar[c_id1][i]); + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id1][0] + + coefbv[f_id][1][i] * pvar[c_id1][1] + + coefbv[f_id][2][i] * pvar[c_id1][2] + - pvar[c_id1][i]); - for (cs_lnum_t j = 0; j < 3; j++){ - #pragma omp atomic - rhs[c_id1][i][j] += n_d_dist[j] * pfac; + for (cs_lnum_t j = 0; j < 3; j++){ + #pragma omp atomic + rhs[c_id1][i][j] += n_d_dist[j] * pfac; + } } - } - } + } + } + else{ + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_normal[0:n_b_faces], \ + cell_b_faces[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + pvar[0:n_cells_ext],\ + cocg[0:n_cells_ext]) + for (cs_lnum_t c_id = 0; c_id < n_b_cells; c_id++) { + + cs_lnum_t s_id = cell_cells_idx[c_id]; + cs_lnum_t e_id = cell_cells_idx[c_id+1]; + + cs_lnum_t c_id2, f_id; + + cs_real_t n_d_dist[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + f_id = cell_b_faces[index]; + + cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); + + cs_real_t d_b_dist = 1. 
/ b_dist[f_id]; + + // /* Normal divided by b_dist */ + for (cs_lnum_t i = 0; i < 3; i++){ + n_d_dist[i] *= d_b_dist; + } + + for (cs_lnum_t i = 0; i < 3; i++) { + cs_real_t pfac = coefav[f_id][i]*inc + + ( coefbv[f_id][0][i] * pvar[c_id][0] + + coefbv[f_id][1][i] * pvar[c_id][1] + + coefbv[f_id][2][i] * pvar[c_id][2] + - pvar[c_id][i]); + + for (cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] += n_d_dist[j] * pfac; + } + } + } + + } + } + #pragma omp target teams distribute parallel for \ map(tofrom: rhs[0:n_cells_ext]) \ From a9541ea83d51a352e80df81b13c77b8e3e20d591 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 13 Dec 2023 15:39:13 +0100 Subject: [PATCH 56/70] Fix gather kernels --- src/alge/cs_gradient.cxx | 140 ++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 60 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index e294b90ff6..9a0666368a 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6877,6 +6877,13 @@ void cs_math_3_normalize_target(const cs_real_t in[3], out[2] = inverse_norm * in[2]; } +unsigned int +num_block(unsigned int size, unsigned int num_threads){ + unsigned int num = (unsigned int)ceil((double)size / num_threads); + + return num; +} + static void _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, @@ -6907,18 +6914,22 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_lnum_t *restrict b_face_cells = (const cs_lnum_t *restrict)m->b_face_cells; const cs_lnum_t *restrict cell_cells_idx - = (const cs_lnum_t *restrict)m->cell_cells_idx; + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; const cs_lnum_t *restrict cell_cells_lst = (const cs_lnum_t *restrict)m->cell_cells_lst; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; const cs_lnum_t *restrict cell_cells = (const cs_lnum_t 
*restrict)madj->cell_cells; - const cs_lnum_t *restrict cell_i_faces_sgn - = (const cs_lnum_t *restrict)madj->cell_i_faces_sgn; - const short int *restrict cell_i_faces - = (const short int *restrict)madj->cell_i_faces; - const short int *restrict cell_b_faces - = (const short int *restrict)madj->cell_b_faces; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; const cs_real_3_t *restrict cell_f_cen = (const cs_real_3_t *restrict)fvq->cell_f_cen; @@ -6943,7 +6954,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, double t_kernel = 0.0; double t_begin, t_end; - bool scatter = true; + bool scatter = false; /* Contribution from interior faces */ int num_device = omp_get_num_devices(); @@ -6956,6 +6967,8 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ cell_cells_idx[0:n_cells_ext], \ cell_cells_lst[0:n_cells_ext], \ + cell_b_faces_idx[0:n_cells+1], \ + b_cells[0:n_b_cells], \ cocg[0:n_cells_ext]) { #pragma omp target teams distribute parallel for collapse(3) \ @@ -6978,7 +6991,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_lnum_t c_id1 = i_face_cells[f_id][0]; cs_lnum_t c_id2 = i_face_cells[f_id][1]; - cs_real_t dc[3], fctb[3]; + cs_real_t dc[3], fctb[3],_weight1, _weight2, _denom, _pond, pfac; for (cs_lnum_t i = 0; i < 3; i++){ dc[i] = cell_f_cen[c_id2][i] - cell_f_cen[c_id1][i]; @@ -6986,34 +6999,27 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - if (c_weight != NULL) { - cs_real_t pond = weight[f_id]; - cs_real_t denom = 1. / ( pond *c_weight[c_id1] - + (1. 
- pond)*c_weight[c_id2]); - - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += c_weight[c_id2] * denom * fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += c_weight[c_id1] * denom * fctb[j]; - } - } + if (c_weight == NULL){ + _weight1 = 1.; + _weight2 = 1.; + } + else{ + _pond = weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id1] + + (1. - _pond)*c_weight[c_id2]); + _weight1 = c_weight[c_id1] * _denom; + _weight2 = c_weight[c_id2] * _denom; } - else { - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - #pragma omp atomic - rhs[c_id1][i][j] += fctb[j]; - #pragma omp atomic - rhs[c_id2][i][j] += fctb[j]; - } + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id1][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + #pragma omp atomic + rhs[c_id1][i][j] += _weight2 * fctb[j]; + #pragma omp atomic + rhs[c_id2][i][j] += _weight1 * fctb[j]; } } @@ -7026,7 +7032,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_cells_idx[0:n_cells_ext], \ cell_cells[0:n_cells_ext], \ cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) + pvar[0:n_cells_ext]) num_teams(num_block(n_cells, 256)) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_lnum_t s_id = cell_cells_idx[c_id]; @@ -7034,7 +7040,16 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_lnum_t c_id2, f_id; - cs_real_t dc[3], fctb[3]; + cs_real_t _rhs[256][3][3]; + cs_lnum_t tid = omp_get_thread_num(); + + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + _rhs[tid][i][j] = 0.0; + } + } + + cs_real_t dc[3], fctb[3], _weight, _denom, _pond, pfac; for(cs_lnum_t index = s_id; index < e_id; index++){ c_id2 = cell_cells[index]; @@ -7045,30 +7060,31 @@ _lsq_vector_gradient_target(const 
cs_mesh_t *m, cs_real_t ddc = 1./(dc[0]*dc[0] + dc[1]*dc[1] + dc[2]*dc[2]); - if (c_weight != NULL) { - f_id = cell_i_faces[index]; - cs_real_t pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; - cs_real_t denom = 1. / ( pond *c_weight[c_id] - + (1. - pond)*c_weight[c_id2]); + if (c_weight == NULL){ + _weight = 1.; + } + else{ + f_id = cell_i_faces[index]; + _pond = (cell_i_faces_sgn[index] > 0) ? weight[f_id] : 1. - weight[f_id]; + _denom = 1. / ( _pond *c_weight[c_id] + + (1. - _pond)*c_weight[c_id2]); + _weight = c_weight[c_id2] * _denom; + } - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - rhs[c_id][i][j] += c_weight[c_id2] * denom * fctb[j]; - } + for (cs_lnum_t i = 0; i < 3; i++) { + pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + + for (cs_lnum_t j = 0; j < 3; j++) { + fctb[j] = dc[j] * pfac; + _rhs[tid][i][j] += _weight * fctb[j]; } } - else { - for (cs_lnum_t i = 0; i < 3; i++) { - cs_real_t pfac = (pvar[c_id2][i] - pvar[c_id][i]) * ddc; + } - for (cs_lnum_t j = 0; j < 3; j++) { - fctb[j] = dc[j] * pfac; - rhs[c_id][i][j] += fctb[j]; - } - } + for(cs_lnum_t i = 0; i < 3; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + rhs[c_id][i][j] = _rhs[tid][i][j]; } } @@ -7155,14 +7171,18 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_b_faces[0:n_b_faces], \ coefav[0:n_b_faces], \ coefbv[0:n_b_faces], \ + b_cells[0:n_cells], \ + cell_b_faces_idx[0:n_cells+1], \ pvar[0:n_cells_ext],\ cocg[0:n_cells_ext]) - for (cs_lnum_t c_id = 0; c_id < n_b_cells; c_id++) { + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { - cs_lnum_t s_id = cell_cells_idx[c_id]; - cs_lnum_t e_id = cell_cells_idx[c_id+1]; + cs_lnum_t c_id = b_cells[c_idx]; - cs_lnum_t c_id2, f_id; + cs_lnum_t s_id = cell_b_faces_idx[c_id]; + cs_lnum_t e_id = cell_b_faces_idx[c_id+1]; + + cs_lnum_t f_id; cs_real_t n_d_dist[3]; From 
6928be24b9434b851274d3bc03f6c8379942748c Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 15 Dec 2023 09:40:31 +0100 Subject: [PATCH 57/70] OMP convection diffussion scatter and gather --- src/alge/cs_convection_diffusion.c | 316 ++++++++++++++++++++++++++++- src/alge/cs_gradient.cxx | 76 ++++--- 2 files changed, 354 insertions(+), 38 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index bc0c152939..b8872c3cb1 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -1288,6 +1288,293 @@ cs_slope_test_gradient_vector(const int inc, } } +void +cs_slope_test_gradient_vector_target(const int inc, + const cs_halo_type_t halo_type, + const cs_real_33_t *grad, + cs_real_33_t *grdpa, + const cs_real_3_t *pvar, + const cs_real_3_t *coefa, + const cs_real_33_t *coefb, + const cs_real_t *i_massflux) +{ + const cs_mesh_t *m = cs_glob_mesh; + const cs_mesh_adjacencies_t *madj = cs_glob_mesh_adjacencies; + const cs_halo_t *halo = m->halo; + cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; + + const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; + const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; + + const cs_lnum_2_t *restrict i_face_cells + = (const cs_lnum_2_t *restrict)m->i_face_cells; + const cs_lnum_t *restrict b_face_cells + = (const cs_lnum_t *restrict)m->b_face_cells; + const cs_lnum_t *restrict b_cells + = (const cs_lnum_t *restrict)m->b_cells; + const cs_real_t *restrict cell_vol = fvq->cell_vol; + const cs_real_3_t *restrict cell_cen + = (const cs_real_3_t *restrict)fvq->cell_cen; + const cs_real_3_t *restrict i_f_face_normal + = (const cs_real_3_t *restrict)fvq->i_f_face_normal; + const cs_real_3_t *restrict b_f_face_normal + = (const cs_real_3_t *restrict)fvq->b_f_face_normal; + const cs_real_3_t *restrict i_face_cog + = (const cs_real_3_t 
*restrict)fvq->i_face_cog; + const cs_real_3_t *restrict diipb + = (const cs_real_3_t *restrict)fvq->diipb; + const cs_lnum_t *restrict cell_cells_idx + = (const cs_lnum_t *restrict)madj->cell_cells_idx; + const cs_lnum_t *restrict cell_b_faces_idx + = (const cs_lnum_t *restrict)madj->cell_b_faces_idx; + const cs_lnum_t *restrict cell_cells + = (const cs_lnum_t *restrict)madj->cell_cells; + const short int *restrict cell_i_faces_sgn + = (const short int *restrict)madj->cell_i_faces_sgn; + const cs_lnum_t *restrict cell_i_faces + = (const cs_lnum_t *restrict)madj->cell_i_faces; + const cs_lnum_t *restrict cell_b_faces + = (const cs_lnum_t *restrict)madj->cell_b_faces; + + const int n_i_groups = m->i_face_numbering->n_groups; + const int n_i_threads = m->i_face_numbering->n_threads; + const int n_b_threads = m->b_face_numbering->n_threads; + const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; + const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; + + bool scatter = true; + +#pragma omp target data map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: grad[0:n_cells_ext], \ + i_face_cog[0:n_i_faces], \ + cell_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext], \ + i_massflux[0:n_i_faces], \ + i_f_face_normal[0:n_i_faces], \ + b_face_cells[0:n_b_faces], \ + coefb[0:n_b_faces], \ + coefa[0:n_b_faces], \ + cell_cells_idx[0:n_cells_ext], \ + cell_cells[0:n_cells_ext], \ + b_cells[0:n_cells], \ + cell_b_faces_idx[0:n_cells+1], \ + cell_vol[0:n_cells_ext], \ + i_face_cells[0:n_i_faces]) +{ + if(scatter){ + #pragma omp target teams distribute parallel for \ + map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: grad[0:n_cells_ext], \ + i_face_cog[0:n_i_faces], \ + cell_cen[0:n_cells_ext], \ + pvar[0:n_cells_ext], \ + i_massflux[0:n_i_faces], \ + i_f_face_normal[0:n_i_faces], \ + i_face_cells[0:n_i_faces]) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++){ + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t ii = i_face_cells[face_id][0]; + 
cs_lnum_t jj = i_face_cells[face_id][1]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id] > 0.) pfac = pif; + + /* U gradient */ + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + #pragma omp atomic + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + #pragma omp atomic + grdpa[jj][isou][jsou] = grdpa[jj][isou][jsou] - vfac[jsou]; + } + } + + } + + #pragma omp target teams distribute parallel for \ + map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + coefb[0:n_b_faces], \ + coefa[0:n_b_faces], \ + grad[0:n_cells_ext]) \ + if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { + + cs_real_t diipbv[3]; + cs_lnum_t ii = b_face_cells[face_id]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + #pragma omp atomic + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + + } + + } + else{ + #pragma omp target teams distribute parallel for \ + map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: grad[0:n_cells_ext], \ + i_face_cog[0:n_i_faces], \ + 
cell_cen[0:n_cells_ext], \ + cell_cells_idx[0:n_cells_ext], \ + cell_cells[0:n_cells_ext], \ + pvar[0:n_cells_ext], \ + i_massflux[0:n_i_faces], \ + i_f_face_normal[0:n_i_faces]) + for (cs_lnum_t ii = 0; ii < n_cells; ii++){ + + cs_lnum_t s_id = cell_cells_idx[ii]; + cs_lnum_t e_id = cell_cells_idx[ii+1]; + + cs_real_t difv[3], djfv[3]; + + cs_lnum_t jj, face_id; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + jj = cell_cells[index]; + face_id = cell_i_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++) { + difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; + djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; + } + + /* x-y-z component, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pif = pvar[ii][isou]; + cs_real_t pjf = pvar[jj][isou]; + for (int jsou = 0; jsou < 3; jsou++) { + pif = pif + grad[ii][isou][jsou]*difv[jsou]; + pjf = pjf + grad[jj][isou][jsou]*djfv[jsou]; + } + + cs_real_t pfac = pjf; + if (i_massflux[face_id] > 0.) 
pfac = pif; + + /* U gradient */ + + cs_real_t vfac[3]; + + for (int jsou = 0; jsou < 3; jsou++) { + vfac[jsou] = pfac*i_f_face_normal[face_id][jsou]; + grdpa[ii][isou][jsou] = grdpa[ii][isou][jsou] + vfac[jsou]; + } + } + } + + } + + #pragma omp target teams distribute parallel for \ + map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + coefb[0:n_b_faces], \ + coefa[0:n_b_faces], \ + b_cells[0:n_cells], \ + cell_b_faces_idx[0:n_cells+1], \ + grad[0:n_cells_ext]) \ + if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { + + cs_lnum_t ii = b_cells[c_idx]; + + cs_lnum_t s_id = cell_b_faces_idx[ii]; + cs_lnum_t e_id = cell_b_faces_idx[ii+1]; + + cs_lnum_t face_id; + + cs_real_t diipbv[3]; + + for(cs_lnum_t index = s_id; index < e_id; index++){ + + face_id = cell_b_faces[index]; + + for (int jsou = 0; jsou < 3; jsou++){ + diipbv[jsou] = diipb[face_id][jsou]; + } + + /* x-y-z components, p = u, v, w */ + + for (int isou = 0; isou < 3; isou++) { + cs_real_t pfac = inc*coefa[face_id][isou]; + /*coefu is a matrix */ + for (int jsou = 0; jsou < 3; jsou++) + pfac += coefb[face_id][jsou][isou]*( pvar[ii][jsou] + + grad[ii][jsou][0]*diipbv[0] + + grad[ii][jsou][1]*diipbv[1] + + grad[ii][jsou][2]*diipbv[2]); + + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[ii][isou][jsou] += pfac*b_f_face_normal[face_id][jsou]; + } + } + } + + } + } + + #pragma omp target teams distribute parallel for \ + map(tofrom: grdpa[0:n_cells_ext]) \ + map(to: cell_vol[0:n_cells_ext]) + for (cs_lnum_t cell_id = 0; cell_id < n_cells; cell_id++) { + cs_real_t unsvol = 1./cell_vol[cell_id]; + for (int isou = 0; isou < 3; isou++) { + for (int jsou = 0; jsou < 3; jsou++){ + grdpa[cell_id][isou][jsou] = grdpa[cell_id][isou][jsou]*unsvol; + } + } + } +} + /* Handle parallelism and periodicity */ + + if (halo != NULL) { + cs_halo_sync_var_strided(halo, halo_type, (cs_real_t *)grdpa, 9); + if (m->n_init_perio > 0) + cs_halo_perio_sync_var_sym_tens(halo, 
halo_type, (cs_real_t *)grdpa); + } +} + /*----------------------------------------------------------------------------*/ /*! * \brief Compute the upwind gradient used in the slope tests. @@ -4276,6 +4563,7 @@ cs_convection_diffusion_vector(int idtvar, int iupwin = 0; cs_real_33_t *grad, *grdpa; + cs_real_33_t *grad_target, *grdpa_target; cs_real_t *bndcel; const cs_real_3_t *coface = NULL; @@ -4302,6 +4590,8 @@ cs_convection_diffusion_vector(int idtvar, BFT_MALLOC(grad, n_cells_ext, cs_real_33_t); BFT_MALLOC(grdpa, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grad_target, n_cells_ext, cs_real_33_t); + BFT_MALLOC(grdpa_target, n_cells_ext, cs_real_33_t); /* Choose gradient type */ @@ -4574,14 +4864,24 @@ res_cpu = !compute_cuda; if (iconvp > 0 && iupwin == 0 && isstpp == 0) { - cs_slope_test_gradient_vector(inc, - halo_type, - (const cs_real_33_t *)grad_cpu, - grdpa_cpu, - _pvar, - coefav, - coefbv, - i_massflux); + // cs_slope_test_gradient_vector(inc, + // halo_type, + // (const cs_real_33_t *)grad_cpu, + // grdpa_cpu, + // _pvar, + // coefav, + // coefbv, + // i_massflux); + + + cs_slope_test_gradient_vector_target(inc, + halo_type, + (const cs_real_33_t *)grad, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); } diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 9a0666368a..5170c426af 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6884,7 +6884,8 @@ num_block(unsigned int size, unsigned int num_threads){ return num; } -static void +BEGIN_C_DECLS +void _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, const cs_mesh_quantities_t *fvq, @@ -6971,7 +6972,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, b_cells[0:n_b_cells], \ cocg[0:n_cells_ext]) { - #pragma omp target teams distribute parallel for collapse(3) \ + #pragma omp target teams distribute parallel for \ map(tofrom: rhs[0:n_cells_ext]) for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; 
i++){ @@ -7032,7 +7033,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_cells_idx[0:n_cells_ext], \ cell_cells[0:n_cells_ext], \ cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) num_teams(num_block(n_cells, 256)) + pvar[0:n_cells_ext]) //num_teams(num_block(n_cells, 256)) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_lnum_t s_id = cell_cells_idx[c_id]; @@ -7040,14 +7041,14 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_lnum_t c_id2, f_id; - cs_real_t _rhs[256][3][3]; - cs_lnum_t tid = omp_get_thread_num(); + // cs_real_t _rhs[256][3][3]; + // cs_lnum_t tid = omp_get_thread_num(); - for(cs_lnum_t i = 0; i < 3; i++){ - for(cs_lnum_t j = 0; j < 3; j++){ - _rhs[tid][i][j] = 0.0; - } - } + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // _rhs[tid][i][j] = 0.0; + // } + // } cs_real_t dc[3], fctb[3], _weight, _denom, _pond, pfac; for(cs_lnum_t index = s_id; index < e_id; index++){ @@ -7077,16 +7078,16 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, for (cs_lnum_t j = 0; j < 3; j++) { fctb[j] = dc[j] * pfac; - _rhs[tid][i][j] += _weight * fctb[j]; + rhs[c_id][i][j] += _weight * fctb[j]; } } } - for(cs_lnum_t i = 0; i < 3; i++){ - for(cs_lnum_t j = 0; j < 3; j++){ - rhs[c_id][i][j] = _rhs[tid][i][j]; - } - } + // for(cs_lnum_t i = 0; i < 3; i++){ + // for(cs_lnum_t j = 0; j < 3; j++){ + // rhs[c_id][i][j] = _rhs[tid][i][j]; + // } + // } } } @@ -7221,20 +7222,34 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, map(from: gradv[0:n_cells_ext]) \ map(to: pvar[0:n_cells_ext],\ cocg[0:n_cells_ext]) - for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - gradv[c_id][i][0] = rhs[c_id][i][0] * cocg[c_id][0] - + rhs[c_id][i][1] * cocg[c_id][3] - + rhs[c_id][i][2] * cocg[c_id][5]; - - gradv[c_id][i][1] = rhs[c_id][i][0] * cocg[c_id][3] - + rhs[c_id][i][1] * cocg[c_id][1] - + rhs[c_id][i][2] * cocg[c_id][4]; - - gradv[c_id][i][2] = rhs[c_id][i][0] * cocg[c_id][5] - + rhs[c_id][i][1] * 
cocg[c_id][4] - + rhs[c_id][i][2] * cocg[c_id][2]; - } + for (cs_lnum_t c_idx = 0; c_idx < n_cells*3*3; c_idx++) { + + size_t c_id = c_idx / (3*3); + size_t i = (c_idx / 3) % 3; + size_t j = c_idx % 3; + + auto cocg_temp = cocg[c_id]; + cs_real_t _cocg[3]; + + _cocg[0] = cocg_temp[5]; + _cocg[1] = cocg_temp[4]; + _cocg[2] = cocg_temp[2]; + + if(j == 0){ + _cocg[0] = cocg_temp[0]; + _cocg[1] = cocg_temp[3]; + _cocg[2] = cocg_temp[5]; + } + + if(j == 1){ + _cocg[0] = cocg_temp[3]; + _cocg[1] = cocg_temp[1]; + _cocg[2] = cocg_temp[4]; + } + + gradv[c_id][i][j] = rhs[c_id][i][0] * _cocg[0] + + rhs[c_id][i][1] * _cocg[1] + + rhs[c_id][i][2] * _cocg[2]; } } // end omp data @@ -7246,6 +7261,7 @@ printf("Time of kernel: %lf\n", t_kernel); BFT_FREE(rhs); } +END_C_DECLS static void _lsq_vector_gradient(const cs_mesh_t *m, From e3aa0d18d2872cb7f78938e909800d76ebcdbc0c Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 19 Dec 2023 09:50:49 +0100 Subject: [PATCH 58/70] Various small changes --- src/alge/cs_convection_diffusion.c | 23 +++++++++++++---------- src/alge/cs_gradient.cxx | 18 +++++++++--------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index b8872c3cb1..0fe8faa76d 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -1345,11 +1345,13 @@ cs_slope_test_gradient_vector_target(const int inc, const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; - bool scatter = true; + bool scatter = false; #pragma omp target data map(tofrom: grdpa[0:n_cells_ext]) \ map(to: grad[0:n_cells_ext], \ i_face_cog[0:n_i_faces], \ + cell_i_faces_sgn[0:n_i_faces], \ + cell_i_faces[0:n_i_faces], \ cell_cen[0:n_cells_ext], \ pvar[0:n_cells_ext], \ i_massflux[0:n_i_faces], \ @@ -1373,7 +1375,7 @@ cs_slope_test_gradient_vector_target(const int inc, pvar[0:n_cells_ext], 
\ i_massflux[0:n_i_faces], \ i_f_face_normal[0:n_i_faces], \ - i_face_cells[0:n_i_faces]) + i_face_cells[0:n_i_faces]) schedule(static,1) for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++){ cs_real_t difv[3], djfv[3]; @@ -1419,7 +1421,7 @@ cs_slope_test_gradient_vector_target(const int inc, map(to: b_face_cells[0:n_b_faces], \ coefb[0:n_b_faces], \ coefa[0:n_b_faces], \ - grad[0:n_cells_ext]) \ + grad[0:n_cells_ext]) schedule(static,1) \ if(m->n_b_faces > CS_THR_MIN) for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { @@ -1455,12 +1457,14 @@ cs_slope_test_gradient_vector_target(const int inc, map(tofrom: grdpa[0:n_cells_ext]) \ map(to: grad[0:n_cells_ext], \ i_face_cog[0:n_i_faces], \ + cell_i_faces_sgn[0:n_i_faces], \ + cell_i_faces[0:n_i_faces], \ cell_cen[0:n_cells_ext], \ cell_cells_idx[0:n_cells_ext], \ cell_cells[0:n_cells_ext], \ pvar[0:n_cells_ext], \ i_massflux[0:n_i_faces], \ - i_f_face_normal[0:n_i_faces]) + i_f_face_normal[0:n_i_faces]) schedule(static,1) for (cs_lnum_t ii = 0; ii < n_cells; ii++){ cs_lnum_t s_id = cell_cells_idx[ii]; @@ -1468,20 +1472,19 @@ cs_slope_test_gradient_vector_target(const int inc, cs_real_t difv[3], djfv[3]; - cs_lnum_t jj, face_id; + cs_lnum_t jj, face_id, face_sgn; for(cs_lnum_t index = s_id; index < e_id; index++){ jj = cell_cells[index]; face_id = cell_i_faces[index]; + face_sgn = cell_i_faces_sgn[index]; for (int jsou = 0; jsou < 3; jsou++) { difv[jsou] = i_face_cog[face_id][jsou] - cell_cen[ii][jsou]; djfv[jsou] = i_face_cog[face_id][jsou] - cell_cen[jj][jsou]; } - /* x-y-z component, p = u, v, w */ - for (int isou = 0; isou < 3; isou++) { cs_real_t pif = pvar[ii][isou]; cs_real_t pjf = pvar[jj][isou]; @@ -1491,9 +1494,9 @@ cs_slope_test_gradient_vector_target(const int inc, } cs_real_t pfac = pjf; - if (i_massflux[face_id] > 0.) pfac = pif; + if (i_massflux[face_id]*face_sgn > 0.) 
pfac = pif; - /* U gradient */ + pfac *= face_sgn; cs_real_t vfac[3]; @@ -1513,7 +1516,7 @@ cs_slope_test_gradient_vector_target(const int inc, coefa[0:n_b_faces], \ b_cells[0:n_cells], \ cell_b_faces_idx[0:n_cells+1], \ - grad[0:n_cells_ext]) \ + grad[0:n_cells_ext]) schedule(static,1)\ if(m->n_b_faces > CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 5170c426af..263fef603b 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6955,7 +6955,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, double t_kernel = 0.0; double t_begin, t_end; - bool scatter = false; + bool scatter = true; /* Contribution from interior faces */ int num_device = omp_get_num_devices(); @@ -6973,7 +6973,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cocg[0:n_cells_ext]) { #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) + map(tofrom: rhs[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; i++){ for (cs_lnum_t j = 0; j < 3; j++){ @@ -6986,7 +6986,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, map(tofrom: rhs[0:n_cells_ext]) \ map(to: i_face_cells[0:n_i_faces], \ cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) + pvar[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { cs_lnum_t c_id1 = i_face_cells[f_id][0]; @@ -7033,7 +7033,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_cells_idx[0:n_cells_ext], \ cell_cells[0:n_cells_ext], \ cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) //num_teams(num_block(n_cells, 256)) + pvar[0:n_cells_ext]) schedule(static,1) //num_teams(num_block(n_cells, 64)) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_lnum_t s_id = cell_cells_idx[c_id]; @@ -7041,7 +7041,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cs_lnum_t c_id2, f_id; - // cs_real_t _rhs[256][3][3]; + // cs_real_t 
_rhs[64][3][3]; // cs_lnum_t tid = omp_get_thread_num(); // for(cs_lnum_t i = 0; i < 3; i++){ @@ -7098,7 +7098,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, map(tofrom: rhs[0:n_cells_ext]) \ map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ cell_cells_idx[0:n_cells_ext], \ - cell_cells_lst[0:n_cells_ext]) + cell_cells_lst[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { for (cs_lnum_t cidx = cell_cells_idx[c_id1]; cidx < cell_cells_idx[c_id1+1]; @@ -7134,7 +7134,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, coefav[0:n_b_faces], \ coefbv[0:n_b_faces], \ pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) + cocg[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { cs_lnum_t c_id1 = b_face_cells[f_id]; @@ -7175,7 +7175,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, b_cells[0:n_cells], \ cell_b_faces_idx[0:n_cells+1], \ pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) + cocg[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { cs_lnum_t c_id = b_cells[c_idx]; @@ -7221,7 +7221,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, map(tofrom: rhs[0:n_cells_ext]) \ map(from: gradv[0:n_cells_ext]) \ map(to: pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) + cocg[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_cells*3*3; c_idx++) { size_t c_id = c_idx / (3*3); From c66128a4207d81bc13a35ba9a5dc5ec05545336a Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 20 Dec 2023 17:56:08 +0100 Subject: [PATCH 59/70] Chore with OMP --- src/alge/cs_convection_diffusion.c | 27 +++++++++---------- src/alge/cs_gradient.cxx | 42 ++++++++++++------------------ 2 files changed, 29 insertions(+), 40 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 0fe8faa76d..3a3e2726ee 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -1288,6 +1288,7 @@ 
cs_slope_test_gradient_vector(const int inc, } } +#if defined(HAVE_OPENMP_TARGET) void cs_slope_test_gradient_vector_target(const int inc, const cs_halo_type_t halo_type, @@ -1345,7 +1346,7 @@ cs_slope_test_gradient_vector_target(const int inc, const cs_lnum_t *restrict i_group_index = m->i_face_numbering->group_index; const cs_lnum_t *restrict b_group_index = m->b_face_numbering->group_index; - bool scatter = false; + bool scatter = true; #pragma omp target data map(tofrom: grdpa[0:n_cells_ext]) \ map(to: grad[0:n_cells_ext], \ @@ -1577,6 +1578,7 @@ cs_slope_test_gradient_vector_target(const int inc, cs_halo_perio_sync_var_sym_tens(halo, halo_type, (cs_real_t *)grdpa); } } +#endif /*----------------------------------------------------------------------------*/ /*! @@ -4566,7 +4568,6 @@ cs_convection_diffusion_vector(int idtvar, int iupwin = 0; cs_real_33_t *grad, *grdpa; - cs_real_33_t *grad_target, *grdpa_target; cs_real_t *bndcel; const cs_real_3_t *coface = NULL; @@ -4593,8 +4594,6 @@ cs_convection_diffusion_vector(int idtvar, BFT_MALLOC(grad, n_cells_ext, cs_real_33_t); BFT_MALLOC(grdpa, n_cells_ext, cs_real_33_t); - BFT_MALLOC(grad_target, n_cells_ext, cs_real_33_t); - BFT_MALLOC(grdpa_target, n_cells_ext, cs_real_33_t); /* Choose gradient type */ @@ -4876,16 +4875,16 @@ res_cpu = !compute_cuda; // coefbv, // i_massflux); - - cs_slope_test_gradient_vector_target(inc, - halo_type, - (const cs_real_33_t *)grad, - grdpa_cpu, - _pvar, - coefav, - coefbv, - i_massflux); - + #if defined(HAVE_OPENMP_TARGET) + cs_slope_test_gradient_vector_target(inc, + halo_type, + (const cs_real_33_t *)grad, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + #endif } if(perf){ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 263fef603b..ff6a94002b 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6877,14 +6877,9 @@ void cs_math_3_normalize_target(const cs_real_t in[3], out[2] = inverse_norm * in[2]; } -unsigned int 
-num_block(unsigned int size, unsigned int num_threads){ - unsigned int num = (unsigned int)ceil((double)size / num_threads); - - return num; -} - BEGIN_C_DECLS +#if defined(HAVE_OPENMP_TARGET) + void _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_mesh_adjacencies_t *madj, @@ -6964,11 +6959,12 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ map(from: gradv[0:n_cells_ext]) \ map(to: i_face_cells[0:n_i_faces], b_face_normal[0:n_b_faces], \ - coefav[0:n_b_faces], coefbv[0:n_b_faces], \ + coefav[0:n_b_faces], coefbv[0:n_b_faces], b_dist[0:n_b_faces],\ cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ cell_cells_idx[0:n_cells_ext], \ cell_cells_lst[0:n_cells_ext], \ cell_b_faces_idx[0:n_cells+1], \ + b_face_cells[0:n_b_faces], \ b_cells[0:n_b_cells], \ cocg[0:n_cells_ext]) { @@ -7033,7 +7029,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cell_cells_idx[0:n_cells_ext], \ cell_cells[0:n_cells_ext], \ cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) schedule(static,1) //num_teams(num_block(n_cells, 64)) + pvar[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_lnum_t s_id = cell_cells_idx[c_id]; @@ -7133,6 +7129,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, map(to: b_face_normal[0:n_b_faces], \ coefav[0:n_b_faces], \ coefbv[0:n_b_faces], \ + b_face_cells[0:n_b_faces], \ pvar[0:n_cells_ext],\ cocg[0:n_cells_ext]) schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { @@ -7261,6 +7258,8 @@ printf("Time of kernel: %lf\n", t_kernel); BFT_FREE(rhs); } + +#endif END_C_DECLS static void @@ -7335,7 +7334,6 @@ _lsq_vector_gradient(const cs_mesh_t *m, perf = false; accuracy = false; #endif -accuracy = false; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever @@ -7382,6 +7380,7 @@ BFT_MALLOC(gradv_target, n_cells_ext, cs_real_33_t); #endif start = std::chrono::high_resolution_clock::now(); +#if defined(HAVE_OPENMP_TARGET) 
_lsq_vector_gradient_target(m, madj, fvq, @@ -7394,6 +7393,7 @@ _lsq_vector_gradient_target(m, gradv_target, cocg, rhs_target); +#endif stop = std::chrono::high_resolution_clock::now(); elapsed_target = std::chrono::duration_cast(stop - start); printf("OMP target lsq %ld\n", elapsed_target.count()); @@ -7550,19 +7550,6 @@ if(compute_cpu){ } } -for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { - for (int j = 0; j < 3; ++j) { - auto cpu = gradv_cpu[c_id][i][j]; - auto omp = gradv_target[c_id][i][j]; - - if (fabs(cpu - omp) / fmax(fabs(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %a\tOMP = %a\n|CPU - OMP| = %a\t|CPU - OMP|ulp = %a\n", c_id, i, j, cpu, omp, fabs(cpu - omp), cs_diff_ulp(cpu, omp)); - } - } - } -} - /* Compute gradient on boundary cells */ /*------------------------------------*/ @@ -7658,7 +7645,10 @@ if(res_cpu){ BFT_FREE(rhs); BFT_FREE(rhs_cuda); + BFT_FREE(rhs_target); BFT_FREE(gradv_cuda); + BFT_FREE(gradv_cpu); + BFT_FREE(gradv_target); } /*----------------------------------------------------------------------------*/ @@ -9583,9 +9573,9 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever - // compute_cuda = true; - // compute_cpu = true; - // res_cpu = false; + compute_cuda = false; + compute_cpu = true; + res_cpu = true; // A ne pas garder dans la version finale // perf = false; From d068786103f9dbcc5288dd12767b67dd4bf1b2a3 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Thu, 21 Dec 2023 11:15:37 +0100 Subject: [PATCH 60/70] Remove dgx workaround for mesh --- src/user/cs_user_mesh.c | 45 +++++++++++------------------------------ 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/src/user/cs_user_mesh.c b/src/user/cs_user_mesh.c index 1208efc081..a0795a0e51 100644 --- a/src/user/cs_user_mesh.c +++ b/src/user/cs_user_mesh.c @@ -57,7 +57,15 @@ BEGIN_C_DECLS -static int _n_tubes = 1; 
+/*----------------------------------------------------------------------------*/ +/*! + * \file cs_user_mesh.c + * + * \brief Definition and modification of the calculation mesh. + * + * See \ref cs_user_mesh for examples. + */ +/*----------------------------------------------------------------------------*/ /*============================================================================ * User function definitions @@ -80,7 +88,6 @@ static int _n_tubes = 1; void cs_user_mesh_restart_mode(void) { - const char *path = "mesh_input.csm"; } @@ -103,24 +110,11 @@ cs_user_mesh_input(void) */ /*----------------------------------------------------------------------------*/ - +#pragma weak cs_user_join void cs_user_join(void) { - int join_num; - - /* Add a joining operation */ - /* ----------------------- */ - - int verbosity = 1; /* per-task dump if > 1, debug level if >= 3 */ - int visualization = 1; /* debug level if >= 3 */ - float fraction = 0.10, plane = 25.; - join_num = cs_join_add("join", - fraction, - plane, - verbosity, - visualization); } /*----------------------------------------------------------------------------*/ @@ -129,26 +123,11 @@ cs_user_join(void) */ /*----------------------------------------------------------------------------*/ - +#pragma weak cs_user_periodicity void cs_user_periodicity(void) { - int join_num; - - int verbosity = 1; /* per-task dump if > 1, debug level if >= 3 */ - int visualization = 1; /* debug level if >= 3 */ - float fraction = 0.10, plane = 25.; - - const double translation[3] = {0.0, - 0.0450*_n_tubes, - 0.0}; /* Translation vector */ - - join_num = cs_join_perio_add_translation("south or north", - fraction, - plane, - verbosity, - visualization, - translation); + } /*----------------------------------------------------------------------------*/ From ba534a7b0c06030e6450da3a6aea69fa0d0e6517 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Sat, 23 Dec 2023 00:10:04 +0100 Subject: [PATCH 61/70] Fix memory issue --- 
src/alge/cs_gradient.cxx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index ff6a94002b..08b36815aa 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6934,19 +6934,8 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, const cs_real_3_t *restrict b_face_normal = (const cs_real_3_t *restrict)fvq->b_face_normal; - // cs_real_33_t *rhs; - // BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); - /* Timing the computation */ -#if defined(HAVE_CUDA) - bool accel = (cs_get_device_id() > -1) ? true : false; -#else - bool accel = false; -#endif - - // _get_cell_cocg_lsq(m, halo_type, accel, fvq, &cocg, &cocgb_s); - double t_kernel = 0.0; double t_begin, t_end; @@ -7256,7 +7245,6 @@ t_end = omp_get_wtime(); t_kernel = t_end - t_begin; printf("Time of kernel: %lf\n", t_kernel); - BFT_FREE(rhs); } #endif From 41b261d771f99d2bb1352177bbc66ac121a1700d Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Wed, 27 Dec 2023 09:30:30 +0100 Subject: [PATCH 62/70] correction call of kernel set_one_to_coeff_b and fix display accuracy --- src/alge/cs_convection_diffusion.c | 40 +++++++++++++++--------------- src/alge/cs_gradient.cxx | 17 ++++++++----- src/alge/cs_gradient_cuda.cu | 25 +++++++++++++------ src/alge/cs_gradient_priv.h | 10 +++++--- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 3a3e2726ee..d0325e2fa8 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4866,25 +4866,25 @@ res_cpu = !compute_cuda; if (iconvp > 0 && iupwin == 0 && isstpp == 0) { - // cs_slope_test_gradient_vector(inc, - // halo_type, - // (const cs_real_33_t *)grad_cpu, - // grdpa_cpu, - // _pvar, - // coefav, - // coefbv, - // i_massflux); - - #if defined(HAVE_OPENMP_TARGET) - cs_slope_test_gradient_vector_target(inc, - halo_type, - (const cs_real_33_t *)grad, - grdpa_cpu, - _pvar, - coefav, - 
coefbv, - i_massflux); - #endif + cs_slope_test_gradient_vector(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + + // #if defined(HAVE_OPENMP_TARGET) + // cs_slope_test_gradient_vector_target(inc, + // halo_type, + // (const cs_real_33_t *)grad_cpu, + // grdpa_cpu, + // _pvar, + // coefav, + // coefbv, + // i_massflux); + // #endif } if(perf){ @@ -4920,7 +4920,7 @@ res_cpu = !compute_cuda; cuda = grdpa_gpu[c_id][i][j]; err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-6) { - printf("convection_diffusion_b DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); + printf("slop_test DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err); } } } diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 08b36815aa..c609c5f6b3 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5820,7 +5820,7 @@ res_cpu = !compute_cuda; auto cuda = grad_gpu[c_id][i][j]; double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-6) { - printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + printf("reconstruct DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); } } } @@ -7609,7 +7609,7 @@ if(accuracy){ auto cuda = gradv[c_id][i][j]; if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-12) { - printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + printf("lsq DIFFERENCE @%d-%d-%d: CPU = %.17f\tCUDA = %.17f\t|CPU - CUDA| = %.17f\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, 
fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); } } } @@ -8228,7 +8228,7 @@ cs_real_t c_norm, ref_norm; auto cuda = gradv[c_id][i][j]; if (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) > 1e-6) { - printf("DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\n|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); + printf("lsq_strided DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\t|CPU - CUDA| = %a\t|CPU - CUDA|ulp = %a\n", c_id, i, j, cpu, cuda, fabs(cpu - cuda), cs_diff_ulp(cpu, cuda)); } } } @@ -9577,7 +9577,7 @@ res_cpu = !compute_cuda; if(perf){ start = std::chrono::high_resolution_clock::now(); } - _gradient_vector_cuda(mesh, _bc_coeff_a_gpu, _bc_coeff_b_gpu, perf); + _gradient_vector_cuda(mesh, bc_coeff_a, bc_coeff_b, _bc_coeff_a_gpu, _bc_coeff_b_gpu, perf); if(perf){ stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); @@ -9656,14 +9656,19 @@ res_cpu = !compute_cuda; auto cuda = _bc_coeff_a_gpu[f_id][i]; double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-12) { - printf("_gradient_vector_a DIFFERENCE @%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", f_id, i, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + printf("_gradient_vector_a DIFFERENCE @%d-%d: CPU = %.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); } + } + } + + for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { + for (cs_lnum_t i = 0; i < 3; i++) { for (int j =0; j < 3; ++j) { auto cpu = _bc_coeff_b_cpu[f_id][i][j]; auto cuda = _bc_coeff_b_gpu[f_id][i][j]; double err = (fabs(cpu - cuda) / fmax(fabs(cpu), 1e-6) ); if (err> 1e-12) { - printf("_gradient_vector_b DIFFERENCE @%d-%d-%d: CPU = %a\tCUDA = %a\tdiff = %a\tdiff relative = %a\tulp = %a\n", f_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); + printf("_gradient_vector_b DIFFERENCE @%d-%d-%d: CPU = 
%.17f\tCUDA = %.17f\tdiff = %.17f\tdiff relative = %.17f\tulp = %a\n", f_id, i, j, cpu, cuda, fabs(cpu - cuda), err, cs_diff_ulp(cpu, cuda)); } } } diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 011a6ed4bf..88675ce9e5 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -2080,10 +2080,12 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, * grad --> gradient of pvar (du_i/dx_j : grad[][i][j]) *----------------------------------------------------------------------------*/ extern "C" void -_gradient_vector_cuda(const cs_mesh_t *mesh, - cs_real_3_t *_bc_coeff_a, - cs_real_33_t *_bc_coeff_b, - bool perf) +_gradient_vector_cuda(const cs_mesh_t *mesh, + const cs_real_3_t *bc_coeff_a_cpu, + const cs_real_33_t *bc_coeff_b_cpu, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool perf) { const cs_lnum_t n_b_faces = mesh->n_b_faces; @@ -2116,12 +2118,19 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, /* Initialization */ CS_CUDA_CHECK(cudaEventRecord(mem_h2d, stream)); - cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); + + if(bc_coeff_a_cpu == NULL){ + cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); + } + CS_CUDA_CHECK(cudaEventRecord(init1, stream)); - cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); - _set_one_to_coeff_b<<< n_b_faces/blocksize * 3, blocksize, 0, stream>>> - (n_b_faces, _bc_coeff_b_d); + if(bc_coeff_b_cpu == NULL){ + cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); + _set_one_to_coeff_b<<< get_gridsize(n_b_faces, blocksize) * 3, blocksize, 0, stream>>> + (n_b_faces * 3, _bc_coeff_b_d); + } + CS_CUDA_CHECK(cudaEventRecord(init2, stream)); diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index 97109cc8b0..d07c392e34 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -143,10 +143,12 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, bool perf); void 
-_gradient_vector_cuda(const cs_mesh_t *mesh, - cs_real_3_t *_bc_coeff_a, - cs_real_33_t *_bc_coeff_b, - bool perf); +_gradient_vector_cuda(const cs_mesh_t *mesh, + const cs_real_3_t *bc_coeff_a_cpu, + const cs_real_33_t *bc_coeff_b_cpu, + cs_real_3_t *_bc_coeff_a, + cs_real_33_t *_bc_coeff_b, + bool perf); #endif From 05d6e1117aaf8beb24a7ff62d9fddb205f605d90 Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Thu, 4 Jan 2024 10:40:11 +0100 Subject: [PATCH 63/70] review replace pointer by boolean --- src/alge/cs_gradient.cxx | 2 +- src/alge/cs_gradient_cuda.cu | 10 +++++----- src/alge/cs_gradient_priv.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index c609c5f6b3..03d3d4b26a 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -9577,7 +9577,7 @@ res_cpu = !compute_cuda; if(perf){ start = std::chrono::high_resolution_clock::now(); } - _gradient_vector_cuda(mesh, bc_coeff_a, bc_coeff_b, _bc_coeff_a_gpu, _bc_coeff_b_gpu, perf); + _gradient_vector_cuda(mesh, _bc_coeff_a_gpu, _bc_coeff_b_gpu, (bc_coeff_a == NULL), (bc_coeff_b == NULL), perf); if(perf){ stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 88675ce9e5..e6d1ad21bc 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -2081,10 +2081,10 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, *----------------------------------------------------------------------------*/ extern "C" void _gradient_vector_cuda(const cs_mesh_t *mesh, - const cs_real_3_t *bc_coeff_a_cpu, - const cs_real_33_t *bc_coeff_b_cpu, cs_real_3_t *_bc_coeff_a, cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, bool perf) { const cs_lnum_t n_b_faces = mesh->n_b_faces; @@ -2119,15 +2119,15 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaEventRecord(mem_h2d, 
stream)); - if(bc_coeff_a_cpu == NULL){ + if(a_null){ cudaMemset(_bc_coeff_a_d, 0, n_b_faces * sizeof(cs_real_3_t)); } CS_CUDA_CHECK(cudaEventRecord(init1, stream)); - if(bc_coeff_b_cpu == NULL){ + if(b_null){ cudaMemset(_bc_coeff_b_d, 0, n_b_faces * sizeof(cs_real_33_t)); - _set_one_to_coeff_b<<< get_gridsize(n_b_faces, blocksize) * 3, blocksize, 0, stream>>> + _set_one_to_coeff_b<<< get_gridsize(n_b_faces * 3, blocksize), blocksize, 0, stream>>> (n_b_faces * 3, _bc_coeff_b_d); } diff --git a/src/alge/cs_gradient_priv.h b/src/alge/cs_gradient_priv.h index d07c392e34..a4cfc45542 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -144,10 +144,10 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, void _gradient_vector_cuda(const cs_mesh_t *mesh, - const cs_real_3_t *bc_coeff_a_cpu, - const cs_real_33_t *bc_coeff_b_cpu, cs_real_3_t *_bc_coeff_a, cs_real_33_t *_bc_coeff_b, + bool a_null, + bool b_null, bool perf); #endif From 5163d160278e81a6653f41020c46e9ddf72c2411 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 9 Jan 2024 14:55:22 +0100 Subject: [PATCH 64/70] Remove function duplication for device --- src/alge/cs_gradient.cxx | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 03d3d4b26a..6c435c4621 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6863,20 +6863,6 @@ _find_bc_coeffs(const char *var_name, * gradv --> gradient of pvar (du_i/dx_j : gradv[][i][j]) *----------------------------------------------------------------------------*/ -void cs_math_3_normalize_target(const cs_real_t in[3], - cs_real_t out[3]) -{ - cs_real_t norm = sqrt(in[0]*in[0] - + in[1]*in[1] - + in[2]*in[2]); - - cs_real_t inverse_norm = 1. 
/ norm; - - out[0] = inverse_norm * in[0]; - out[1] = inverse_norm * in[1]; - out[2] = inverse_norm * in[2]; -} - BEGIN_C_DECLS #if defined(HAVE_OPENMP_TARGET) @@ -7120,14 +7106,14 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, coefbv[0:n_b_faces], \ b_face_cells[0:n_b_faces], \ pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) schedule(static,1) + cocg[0:n_cells_ext]) firstprivate(cs_math_zero_threshold) schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { cs_lnum_t c_id1 = b_face_cells[f_id]; cs_real_t n_d_dist[3]; // /* Normal is vector 0 if the b_face_normal norm is too small */ - cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); cs_real_t d_b_dist = 1. / b_dist[f_id]; @@ -7161,7 +7147,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, b_cells[0:n_cells], \ cell_b_faces_idx[0:n_cells+1], \ pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) schedule(static,1) + cocg[0:n_cells_ext]) firstprivate(cs_math_zero_threshold) schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { cs_lnum_t c_id = b_cells[c_idx]; @@ -7177,7 +7163,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, f_id = cell_b_faces[index]; - cs_math_3_normalize_target(b_face_normal[f_id], n_d_dist); + cs_math_3_normalize(b_face_normal[f_id], n_d_dist); cs_real_t d_b_dist = 1. 
/ b_dist[f_id]; From af13249c7b122715cae7380930f89ad35304c707 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 3 Jan 2024 16:52:50 +0100 Subject: [PATCH 65/70] More of convection diffusion OpenMP --- src/alge/cs_convection_diffusion.c | 660 ++++++++++++++++++++++++----- src/alge/cs_gradient.cxx | 112 ++--- 2 files changed, 599 insertions(+), 173 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index d0325e2fa8..f30c2bfebd 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -1289,6 +1289,10 @@ cs_slope_test_gradient_vector(const int inc, } #if defined(HAVE_OPENMP_TARGET) +// #pragma omp declare target +// const cs_real_t cs_math_zero_threshold = FLT_MIN; +// #pragma omp end declare target + void cs_slope_test_gradient_vector_target(const int inc, const cs_halo_type_t halo_type, @@ -4387,6 +4391,20 @@ cs_face_convection_scalar(int idtvar, BFT_FREE(courant); } +void cs_math_3_normalize_target_cd(const cs_real_t in[3], + cs_real_t out[3]) +{ + cs_real_t norm = sqrt(in[0]*in[0] + + in[1]*in[1] + + in[2]*in[2]); + + cs_real_t inverse_norm = 1. / norm; + + out[0] = inverse_norm * in[0]; + out[1] = inverse_norm * in[1]; + out[2] = inverse_norm * in[2]; +} + /*----------------------------------------------------------------------------*/ /*! 
* \brief Add the explicit part of the convection/diffusion terms of a transport @@ -4498,7 +4516,10 @@ cs_convection_diffusion_vector(int idtvar, cs_mesh_quantities_t *fvq = cs_glob_mesh_quantities; const cs_lnum_t n_cells = m->n_cells; + const cs_lnum_t n_b_cells = m->n_b_cells; const cs_lnum_t n_cells_ext = m->n_cells_with_ghosts; + const cs_lnum_t n_i_faces = m->n_i_faces; + const cs_lnum_t n_b_faces = m->n_b_faces; const int n_i_groups = m->i_face_numbering->n_groups; const int n_i_threads = m->i_face_numbering->n_threads; const int n_b_threads = m->b_face_numbering->n_threads; @@ -5537,131 +5558,294 @@ res_cpu = !compute_cuda; /* Unsteady */ } else { + // ---------------OMP and CUDA here --------------------- + +// for (int g_id = 0; g_id < n_i_groups; g_id++) { +// # pragma omp parallel for reduction(+:n_upwind) +// for (int t_id = 0; t_id < n_i_threads; t_id++) { +// for (cs_lnum_t face_id = i_group_index[(t_id*n_i_groups + g_id)*2]; +// face_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; +// face_id++) { + +// cs_lnum_t ii = i_face_cells[face_id][0]; +// cs_lnum_t jj = i_face_cells[face_id][1]; + +// cs_real_t fluxi[3], fluxj[3] ; +// for (int isou = 0; isou < 3; isou++) { +// fluxi[isou] = 0; +// fluxj[isou] = 0; +// } +// cs_real_3_t pip, pjp; +// cs_real_3_t pif, pjf; +// bool upwind_switch = false; +// cs_real_3_t _pi, _pj; + +// for (int i = 0; i < 3; i++) { +// _pi[i] = _pvar[ii][i]; +// _pj[i] = _pvar[jj][i]; +// } + +// /* Scaling due to mass balance in porous modelling */ +// if (i_f_face_factor != NULL) { +// cs_real_3_t n; +// cs_math_3_normalize(i_face_normal[face_id], n); + +// cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); +// cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); +// } + +// cs_real_t bldfrp = (cs_real_t) ircflp; +// /* Local limitation of the reconstruction */ +// if (df_limiter != NULL && ircflp > 0) +// bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), +// 0.); + +// 
cs_i_cd_unsteady_slope_test_vector(&upwind_switch, +// iconvp, +// bldfrp, +// ischcp, +// blencp, +// blend_st, +// weight[face_id], +// i_dist[face_id], +// i_face_surf[face_id], +// cell_cen[ii], +// cell_cen[jj], +// i_face_normal[face_id], +// i_face_cog[face_id], +// diipf[face_id], +// djjpf[face_id], +// i_massflux[face_id], +// (const cs_real_3_t *)grad[ii], +// (const cs_real_3_t *)grad[jj], +// (const cs_real_3_t *)grdpa[ii], +// (const cs_real_3_t *)grdpa[jj], +// _pi, +// _pj, +// pif, +// pjf, +// pip, +// pjp); + +// cs_i_conv_flux_vector(iconvp, +// thetap, +// imasac, +// _pvar[ii], +// _pvar[jj], +// pif, +// pif, /* no relaxation */ +// pjf, +// pjf, /* no relaxation */ +// i_massflux[face_id], +// fluxi, +// fluxj); + + +// cs_i_diff_flux_vector(idiffp, +// thetap, +// pip, +// pjp, +// pip, /* no relaxation */ +// pjp, /* no relaxation */ +// i_visc[face_id], +// fluxi, +// fluxj); + +// if (upwind_switch) { + +// /* in parallel, face will be counted by one and only one rank */ +// if (ii < n_cells) +// n_upwind++; + +// if (v_slope_test != NULL) { +// v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; +// v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; +// } +// } +// /* Saving velocity at internal faces, if needed */ +// if (i_pvar != NULL) { +// if (i_massflux[face_id] >= 0.) 
{ +// for (cs_lnum_t i = 0; i < 3; i++) +// i_pvar[face_id][i] += thetap * pif[i]; +// } +// else { +// for (cs_lnum_t i = 0; i < 3; i++) +// i_pvar[face_id][i] += thetap * pjf[i]; +// } +// } + +// for (int isou = 0; isou < 3; isou++) { + +// rhs[ii][isou] -= fluxi[isou]; +// rhs[jj][isou] += fluxj[isou]; + +// } /* isou */ + +// } +// } +// } + #if defined(HAVE_OPENMP_TARGET) + #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + i_massflux[0:n_i_faces], \ + i_f_face_factor[0:n_i_faces], \ + i_face_normal[0:n_i_faces], \ + i_visc[0:n_i_faces], \ + i_face_cog[0:n_i_faces], \ + i_face_surf[0:n_i_faces], \ + i_dist[0:n_i_faces], \ + weight[0:n_i_faces], \ + diipf[0:n_i_faces], \ + djjpf[0:n_i_faces], \ + i_pvar[0:n_i_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + cell_cen[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) + { + #pragma omp target teams distribute parallel for reduction(+:n_upwind) \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: i_face_cells[0:n_i_faces], \ + i_massflux[0:n_i_faces], \ + i_f_face_factor[0:n_i_faces], \ + i_face_normal[0:n_i_faces], \ + i_visc[0:n_i_faces], \ + i_face_cog[0:n_i_faces], \ + i_face_surf[0:n_i_faces], \ + i_dist[0:n_i_faces], \ + weight[0:n_i_faces], \ + diipf[0:n_i_faces], \ + djjpf[0:n_i_faces], \ + i_pvar[0:n_i_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + cell_cen[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) \ + firstprivate(cs_math_zero_threshold, \ + iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp) \ + schedule(static,1) + for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++) { - for (int g_id = 0; g_id < n_i_groups; g_id++) { -# pragma omp parallel for reduction(+:n_upwind) - for (int t_id = 0; t_id < n_i_threads; t_id++) { - for (cs_lnum_t face_id = i_group_index[(t_id*n_i_groups + g_id)*2]; - face_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; - face_id++) { - - cs_lnum_t ii = i_face_cells[face_id][0]; - cs_lnum_t 
jj = i_face_cells[face_id][1]; - - cs_real_t fluxi[3], fluxj[3] ; - for (int isou = 0; isou < 3; isou++) { - fluxi[isou] = 0; - fluxj[isou] = 0; - } - cs_real_3_t pip, pjp; - cs_real_3_t pif, pjf; - bool upwind_switch = false; - cs_real_3_t _pi, _pj; + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; - for (int i = 0; i < 3; i++) { - _pi[i] = _pvar[ii][i]; - _pj[i] = _pvar[jj][i]; - } + cs_real_t fluxi[3], fluxj[3] ; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + fluxj[isou] = 0; + } + cs_real_3_t pip, pjp; + cs_real_3_t pif, pjf; + bool upwind_switch = false; + cs_real_3_t _pi, _pj; - /* Scaling due to mass balance in porous modelling */ - if (i_f_face_factor != NULL) { - cs_real_3_t n; - cs_math_3_normalize(i_face_normal[face_id], n); + for (int i = 0; i < 3; i++) { + _pi[i] = _pvar[ii][i]; + _pj[i] = _pvar[jj][i]; + } - cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); - cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); - } + /* Scaling due to mass balance in porous modelling */ + if (i_f_face_factor != NULL) { + cs_real_3_t n; + cs_math_3_normalize_target_cd(i_face_normal[face_id], n); - cs_real_t bldfrp = (cs_real_t) ircflp; - /* Local limitation of the reconstruction */ - if (df_limiter != NULL && ircflp > 0) - bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), - 0.); + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); + } - cs_i_cd_unsteady_slope_test_vector(&upwind_switch, - iconvp, - bldfrp, - ischcp, - blencp, - blend_st, - weight[face_id], - i_dist[face_id], - i_face_surf[face_id], - cell_cen[ii], - cell_cen[jj], - i_face_normal[face_id], - i_face_cog[face_id], - diipf[face_id], - djjpf[face_id], - i_massflux[face_id], - (const cs_real_3_t *)grad[ii], - (const cs_real_3_t *)grad[jj], - (const cs_real_3_t *)grdpa[ii], - (const cs_real_3_t *)grdpa[jj], - _pi, - _pj, - pif, - pjf, - pip, - 
pjp); + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), + 0.); - cs_i_conv_flux_vector(iconvp, - thetap, - imasac, - _pvar[ii], - _pvar[jj], - pif, - pif, /* no relaxation */ - pjf, - pjf, /* no relaxation */ - i_massflux[face_id], - fluxi, - fluxj); + cs_i_cd_unsteady_slope_test_vector(&upwind_switch, + iconvp, + bldfrp, + ischcp, + blencp, + blend_st, + weight[face_id], + i_dist[face_id], + i_face_surf[face_id], + cell_cen[ii], + cell_cen[jj], + i_face_normal[face_id], + i_face_cog[face_id], + diipf[face_id], + djjpf[face_id], + i_massflux[face_id], + grad[ii], + grad[jj], + grdpa[ii], + grdpa[jj], + _pi, + _pj, + pif, + pjf, + pip, + pjp); + + cs_i_conv_flux_vector(iconvp, + thetap, + imasac, + _pvar[ii], + _pvar[jj], + pif, + pif, /* no relaxation */ + pjf, + pjf, /* no relaxation */ + i_massflux[face_id], + fluxi, + fluxj); - cs_i_diff_flux_vector(idiffp, - thetap, - pip, - pjp, - pip, /* no relaxation */ - pjp, /* no relaxation */ - i_visc[face_id], - fluxi, - fluxj); + cs_i_diff_flux_vector(idiffp, + thetap, + pip, + pjp, + pip, /* no relaxation */ + pjp, /* no relaxation */ + i_visc[face_id], + fluxi, + fluxj); - if (upwind_switch) { + if (upwind_switch) { - /* in parallel, face will be counted by one and only one rank */ - if (ii < n_cells) - n_upwind++; + /* in parallel, face will be counted by one and only one rank */ + if (ii < n_cells) + n_upwind++; - if (v_slope_test != NULL) { - v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; - v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; - } + if (v_slope_test != NULL) { + v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; + v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; } - /* Saving velocity at internal faces, if needed */ - if (i_pvar != NULL) { - if (i_massflux[face_id] >= 0.) 
{ - for (cs_lnum_t i = 0; i < 3; i++) - i_pvar[face_id][i] += thetap * pif[i]; - } - else { - for (cs_lnum_t i = 0; i < 3; i++) - i_pvar[face_id][i] += thetap * pjf[i]; - } + } + /* Saving velocity at internal faces, if needed */ + if (i_pvar != NULL) { + if (i_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pif[i]; } + else { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pjf[i]; + } + } - for (int isou = 0; isou < 3; isou++) { - - rhs[ii][isou] -= fluxi[isou]; - rhs[jj][isou] += fluxj[isou]; + for (int isou = 0; isou < 3; isou++) { + + #pragma omp atomic + rhs[ii][isou] -= fluxi[isou]; + #pragma omp atomic + rhs[jj][isou] += fluxj[isou]; - } /* isou */ + } /* isou */ - } } - } + } // target data + #endif } /* idtvar */ @@ -5888,12 +6072,251 @@ res_cpu = !compute_cuda; /* Unsteady */ } else { + // ---------------OMP and CUDA here --------------------- +// # pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) +// for (int t_id = 0; t_id < n_b_threads; t_id++) { +// for (cs_lnum_t face_id = b_group_index[t_id*2]; +// face_id < b_group_index[t_id*2 + 1]; +// face_id++) { + +// cs_lnum_t ii = b_face_cells[face_id]; + +// cs_real_t fluxi[3]; +// for (int isou = 0; isou < 3; isou++) { +// fluxi[isou] = 0; +// } +// cs_real_3_t pip; +// cs_real_3_t _pi; +// cs_real_t pfac[3]; + +// for (int i = 0; i < 3; i++) { +// _pi[i] = _pvar[ii][i]; +// } + +// /* Scaling due to mass balance in porous modelling */ +// if (b_f_face_factor != NULL) { +// cs_real_3_t n; +// cs_math_3_normalize(b_face_normal[face_id], n); + +// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); +// } + +// cs_real_t bldfrp = (cs_real_t) ircflp; +// /* Local limitation of the reconstruction */ +// if (df_limiter != NULL && ircflp > 0) +// bldfrp = cs_math_fmax(df_limiter[ii], 0.); + +// cs_b_cd_unsteady_vector(bldfrp, +// diipb[face_id], +// (const cs_real_3_t *)grad[ii], +// _pi, +// pip); +// 
cs_b_upwind_flux_vector(iconvp, +// thetap, +// imasac, +// inc, +// bc_type[face_id], +// _pi, +// _pi, /* no relaxation */ +// pip, +// coefav[face_id], +// coefbv[face_id], +// b_massflux[face_id], +// pfac, +// fluxi); + +// /* Saving velocity on boundary faces */ +// if (b_pvar != NULL) { +// if (b_massflux[face_id] >= 0.) { +// for (cs_lnum_t i = 0; i < 3; i++) +// b_pvar[face_id][i] += thetap * _pi[i]; +// } +// else { +// for (cs_lnum_t i = 0; i < 3; i++) { +// b_pvar[face_id][i] += thetap * pfac[i]; +// } +// } +// } + +// cs_b_diff_flux_vector(idiffp, +// thetap, +// inc, +// pip, +// cofafv[face_id], +// cofbfv[face_id], +// b_visc[face_id], +// fluxi); + +// for(int isou = 0; isou < 3; isou++) { +// rhs[ii][isou] -= fluxi[isou]; +// } + +// } +// } + +// /* The variable is internally coupled and an implicit contribution +// * is required */ +// if (icoupl > 0) { +// /* Prepare data for sending */ +// BFT_MALLOC(pvar_distant, n_distant, cs_real_3_t); + +// for (cs_lnum_t ii = 0; ii < n_distant; ii++) { +// cs_lnum_t face_id = faces_distant[ii]; +// cs_lnum_t jj = b_face_cells[face_id]; + +// cs_real_3_t pip; +// cs_real_3_t _pj; + +// for (int i = 0; i < 3; i++) { +// _pj[i] = _pvar[jj][i]; +// } + +// cs_real_t bldfrp = (cs_real_t) ircflp; +// /* Local limitation of the reconstruction */ +// /* Note: to be treated exactly as a internal face, should be a bending +// * between the two cells... 
*/ +// if (df_limiter != NULL && ircflp > 0) +// bldfrp = cs_math_fmax(df_limiter[jj], 0.); + +// /* Scaling due to mass balance in porous modelling */ +// if (b_f_face_factor != NULL) { +// cs_real_3_t n; +// cs_math_3_normalize(b_face_normal[face_id], n); + +// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); +// } + +// cs_b_cd_unsteady_vector(bldfrp, +// diipb[face_id], +// (const cs_real_3_t *)grad[jj], +// _pj, +// pip); + +// for (int k = 0; k < 3; k++) +// pvar_distant[ii][k] = pip[k]; +// } + +// /* Receive data */ +// BFT_MALLOC(pvar_local, n_local, cs_real_3_t); +// cs_internal_coupling_exchange_var(cpl, +// 3, /* Dimension */ +// (cs_real_t *)pvar_distant, +// (cs_real_t *)pvar_local); + +// if (df_limiter != NULL) { +// BFT_MALLOC(df_limiter_local, n_local, cs_real_t); +// cs_internal_coupling_exchange_var(cpl, +// 1, /* Dimension */ +// df_limiter, +// df_limiter_local); +// } + +// /* Flux contribution */ +// assert(f != NULL); +// cs_real_t *hintp = f->bc_coeffs->hint; +// cs_real_t *hextp = f->bc_coeffs->rcodcl2; +// for (cs_lnum_t ii = 0; ii < n_local; ii++) { +// cs_lnum_t face_id = faces_local[ii]; +// cs_lnum_t jj = b_face_cells[face_id]; +// cs_real_t surf = b_face_surf[face_id]; +// cs_real_t pip[3], pjp[3]; +// cs_real_t fluxi[3] = {0., 0., 0.}; +// cs_real_3_t _pj; + +// for (int i = 0; i < 3; i++) { +// _pj[i] = _pvar[jj][i]; +// } + +// /* Scaling due to mass balance in porous modelling */ +// if (b_f_face_factor != NULL) { +// cs_real_3_t n; +// cs_math_3_normalize(b_face_normal[face_id], n); + +// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); +// } + +// cs_real_t bldfrp = (cs_real_t) ircflp; +// /* Local limitation of the reconstruction */ +// if (df_limiter != NULL && ircflp > 0) +// bldfrp = cs_math_fmax(cs_math_fmin(df_limiter_local[ii], +// df_limiter[jj]), +// 0.); + +// cs_b_cd_unsteady_vector(bldfrp, +// diipb[face_id], +// (const cs_real_3_t *)grad[jj], +// _pj, +// pip); + +// for (int k = 0; k < 3; 
k++) +// pjp[k] = pvar_local[ii][k]; + +// cs_real_t hint = hintp[face_id]; +// cs_real_t hext = hextp[face_id]; +// cs_real_t heq = _calc_heq(hint, hext)*surf; + +// cs_b_diff_flux_coupling_vector(idiffp, +// pip, +// pjp, +// heq, +// fluxi); + +// for (int k = 0; k < 3; k++) +// rhs[jj][k] -= thetap * fluxi[k]; +// } + +// BFT_FREE(pvar_local); +// /* Sending structures are no longer needed */ +// BFT_FREE(pvar_distant); +// if (df_limiter != NULL) { +// BFT_FREE(df_limiter_local); +// } +// } -# pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) - for (int t_id = 0; t_id < n_b_threads; t_id++) { - for (cs_lnum_t face_id = b_group_index[t_id*2]; - face_id < b_group_index[t_id*2 + 1]; - face_id++) { +#if defined(HAVE_OPENMP_TARGET) +#pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + b_massflux[0:n_b_faces], \ + b_f_face_factor[0:n_b_faces], \ + b_face_normal[0:n_b_faces], \ + bc_type[0:n_b_faces], \ + b_visc[0:n_b_faces], \ + b_face_cells[0:n_b_faces], \ + b_face_surf[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + cofafv[0:n_b_faces], \ + cofbfv[0:n_b_faces], \ + diipb[0:n_b_faces], \ + b_pvar[0:n_b_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) +{ + #pragma omp target teams distribute parallel for \ + map(tofrom: rhs[0:n_cells_ext]) \ + map(to: b_face_cells[0:n_b_faces], \ + b_massflux[0:n_b_faces], \ + b_f_face_factor[0:n_b_faces], \ + b_face_normal[0:n_b_faces], \ + bc_type[0:n_b_faces], \ + b_visc[0:n_b_faces], \ + b_face_cells[0:n_b_faces], \ + b_face_surf[0:n_b_faces], \ + coefav[0:n_b_faces], \ + coefbv[0:n_b_faces], \ + cofafv[0:n_b_faces], \ + cofbfv[0:n_b_faces], \ + diipb[0:n_b_faces], \ + b_pvar[0:n_b_faces], \ + grad[0:n_cells_ext], \ + grdpa[0:n_cells_ext], \ + _pvar[0:n_cells_ext]) \ + private(pvar_distant, pvar_local, df_limiter_local) \ + firstprivate(iconvp, thetap, ischcp, blencp, blend_st, \ + imasac, idiffp, ircflp, inc, n_local, 
n_distant) \ + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) + for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { cs_lnum_t ii = b_face_cells[face_id]; @@ -5912,7 +6335,7 @@ res_cpu = !compute_cuda; /* Scaling due to mass balance in porous modelling */ if (b_f_face_factor != NULL) { cs_real_3_t n; - cs_math_3_normalize(b_face_normal[face_id], n); + cs_math_3_normalize_target_cd(b_face_normal[face_id], n); cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); } @@ -5924,7 +6347,7 @@ res_cpu = !compute_cuda; cs_b_cd_unsteady_vector(bldfrp, diipb[face_id], - (const cs_real_3_t *)grad[ii], + grad[ii], _pi, pip); cs_b_upwind_flux_vector(iconvp, @@ -5964,6 +6387,7 @@ res_cpu = !compute_cuda; fluxi); for(int isou = 0; isou < 3; isou++) { + #pragma omp atomic rhs[ii][isou] -= fluxi[isou]; } @@ -6004,7 +6428,7 @@ res_cpu = !compute_cuda; cs_b_cd_unsteady_vector(bldfrp, diipb[face_id], - (const cs_real_3_t *)grad[jj], + grad[jj], _pj, pip); @@ -6060,7 +6484,7 @@ res_cpu = !compute_cuda; cs_b_cd_unsteady_vector(bldfrp, diipb[face_id], - (const cs_real_3_t *)grad[jj], + grad[jj], _pj, pip); @@ -6078,6 +6502,7 @@ res_cpu = !compute_cuda; fluxi); for (int k = 0; k < 3; k++) + #pragma omp atomic rhs[jj][k] -= thetap * fluxi[k]; } @@ -6087,7 +6512,8 @@ res_cpu = !compute_cuda; if (df_limiter != NULL) { BFT_FREE(df_limiter_local); } - } + } // target data +#endif } /* idtvar */ /* Boundary convective flux imposed at some faces (tags in icvfli array) */ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 6c435c4621..44aae09b73 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -7315,7 +7315,7 @@ _lsq_vector_gradient(const cs_mesh_t *m, // compute_cpu = true; // res_cpu = false; // perf = false; - // accuracy = false; + // accuracy = true; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); BFT_MALLOC(rhs_cuda, n_cells_ext, cs_real_33_t); @@ -7527,61 +7527,61 @@ if(compute_cpu){ /* Compute gradient on boundary cells */ 
/*------------------------------------*/ - // #pragma omp parallel - // { - // cs_lnum_t t_s_id, t_e_id; - // cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); - - // /* Build indices bijection between [1-9] and [1-3]*[1-3] */ - - // cs_lnum_t _33_9_idx[9][2]; - // int nn = 0; - // for (int ll = 0; ll < 3; ll++) { - // for (int mm = 0; mm < 3; mm++) { - // _33_9_idx[nn][0] = ll; - // _33_9_idx[nn][1] = mm; - // nn++; - // } - // } - - // /* Loop on boundary cells */ - - // for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { - - // cs_lnum_t c_id = m->b_cells[b_c_id]; - - // cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; - - // _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); - - // _compute_cocgb_rhsb_lsq_v - // (c_id, - // inc, - // madj, - // fvq, - // _33_9_idx, - // (const cs_real_3_t *)pvar, - // (const cs_real_3_t *)coefav, - // (const cs_real_33_t *)coefbv, - // (const cs_real_3_t *)cocgb, - // (const cs_real_3_t *)rhs[c_id], - // cocgb_v, - // rhsb_v); - - // _fw_and_bw_ldtl_pp(cocgb_v, - // 9, - // x, - // rhsb_v); - - // for (int kk = 0; kk < 9; kk++) { - // int ii = _33_9_idx[kk][0]; - // int jj = _33_9_idx[kk][1]; - // gradv_cpu[c_id][ii][jj] = x[kk]; - // } - - // } - - // } + #pragma omp parallel + { + cs_lnum_t t_s_id, t_e_id; + cs_parall_thread_range(m->n_b_cells, sizeof(cs_real_t), &t_s_id, &t_e_id); + + /* Build indices bijection between [1-9] and [1-3]*[1-3] */ + + cs_lnum_t _33_9_idx[9][2]; + int nn = 0; + for (int ll = 0; ll < 3; ll++) { + for (int mm = 0; mm < 3; mm++) { + _33_9_idx[nn][0] = ll; + _33_9_idx[nn][1] = mm; + nn++; + } + } + + /* Loop on boundary cells */ + + for (cs_lnum_t b_c_id = t_s_id; b_c_id < t_e_id; b_c_id++) { + + cs_lnum_t c_id = m->b_cells[b_c_id]; + + cs_real_t cocgb[3][3], cocgb_v[45], rhsb_v[9], x[9]; + + _complete_cocg_lsq(c_id, madj, fvq, cocgb_s[b_c_id], cocgb); + + _compute_cocgb_rhsb_lsq_v + (c_id, + inc, + madj, + fvq, + _33_9_idx, + (const cs_real_3_t *)pvar, 
+ (const cs_real_3_t *)coefav, + (const cs_real_33_t *)coefbv, + (const cs_real_3_t *)cocgb, + (const cs_real_3_t *)rhs[c_id], + cocgb_v, + rhsb_v); + + _fw_and_bw_ldtl_pp(cocgb_v, + 9, + x, + rhsb_v); + + for (int kk = 0; kk < 9; kk++) { + int ii = _33_9_idx[kk][0]; + int jj = _33_9_idx[kk][1]; + gradv_cpu[c_id][ii][jj] = x[kk]; + } + + } + + } stop = std::chrono::high_resolution_clock::now(); elapsed = std::chrono::duration_cast(stop - start); } // end if COMPUTE_CPU From 068df55aa690f5f86b16905399cd3d60b7650cb7 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 5 Jan 2024 12:16:50 +0100 Subject: [PATCH 66/70] Clean up code --- src/alge/cs_convection_diffusion.c | 41 ++++------------------------- src/alge/cs_gradient.cxx | 42 +++++------------------------- 2 files changed, 12 insertions(+), 71 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index f30c2bfebd..2859773148 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -1373,14 +1373,7 @@ cs_slope_test_gradient_vector_target(const int inc, { if(scatter){ #pragma omp target teams distribute parallel for \ - map(tofrom: grdpa[0:n_cells_ext]) \ - map(to: grad[0:n_cells_ext], \ - i_face_cog[0:n_i_faces], \ - cell_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext], \ - i_massflux[0:n_i_faces], \ - i_f_face_normal[0:n_i_faces], \ - i_face_cells[0:n_i_faces]) schedule(static,1) + schedule(static,1) for (cs_lnum_t face_id = 0; face_id < n_i_faces; face_id++){ cs_real_t difv[3], djfv[3]; @@ -1422,12 +1415,7 @@ cs_slope_test_gradient_vector_target(const int inc, } #pragma omp target teams distribute parallel for \ - map(tofrom: grdpa[0:n_cells_ext]) \ - map(to: b_face_cells[0:n_b_faces], \ - coefb[0:n_b_faces], \ - coefa[0:n_b_faces], \ - grad[0:n_cells_ext]) schedule(static,1) \ - if(m->n_b_faces > CS_THR_MIN) + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { cs_real_t 
diipbv[3]; @@ -1459,17 +1447,7 @@ cs_slope_test_gradient_vector_target(const int inc, } else{ #pragma omp target teams distribute parallel for \ - map(tofrom: grdpa[0:n_cells_ext]) \ - map(to: grad[0:n_cells_ext], \ - i_face_cog[0:n_i_faces], \ - cell_i_faces_sgn[0:n_i_faces], \ - cell_i_faces[0:n_i_faces], \ - cell_cen[0:n_cells_ext], \ - cell_cells_idx[0:n_cells_ext], \ - cell_cells[0:n_cells_ext], \ - pvar[0:n_cells_ext], \ - i_massflux[0:n_i_faces], \ - i_f_face_normal[0:n_i_faces]) schedule(static,1) + schedule(static,1) for (cs_lnum_t ii = 0; ii < n_cells; ii++){ cs_lnum_t s_id = cell_cells_idx[ii]; @@ -1515,14 +1493,7 @@ cs_slope_test_gradient_vector_target(const int inc, } #pragma omp target teams distribute parallel for \ - map(tofrom: grdpa[0:n_cells_ext]) \ - map(to: b_face_cells[0:n_b_faces], \ - coefb[0:n_b_faces], \ - coefa[0:n_b_faces], \ - b_cells[0:n_cells], \ - cell_b_faces_idx[0:n_cells+1], \ - grad[0:n_cells_ext]) schedule(static,1)\ - if(m->n_b_faces > CS_THR_MIN) + schedule(static,1) if(m->n_b_faces > CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { cs_lnum_t ii = b_cells[c_idx]; @@ -1562,9 +1533,7 @@ cs_slope_test_gradient_vector_target(const int inc, } } - #pragma omp target teams distribute parallel for \ - map(tofrom: grdpa[0:n_cells_ext]) \ - map(to: cell_vol[0:n_cells_ext]) + #pragma omp target teams distribute parallel for for (cs_lnum_t cell_id = 0; cell_id < n_cells; cell_id++) { cs_real_t unsvol = 1./cell_vol[cell_id]; for (int isou = 0; isou < 3; isou++) { diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 44aae09b73..d21d7a11d3 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -6944,7 +6944,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, cocg[0:n_cells_ext]) { #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) schedule(static,1) + schedule(static,1) for (cs_lnum_t c_id = 0; c_id < n_cells_ext; c_id++) { for (cs_lnum_t i = 0; i < 3; 
i++){ for (cs_lnum_t j = 0; j < 3; j++){ @@ -6954,10 +6954,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } if(scatter){ #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: i_face_cells[0:n_i_faces], \ - cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) schedule(static,1) + schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_i_faces; f_id++) { cs_lnum_t c_id1 = i_face_cells[f_id][0]; @@ -6999,12 +6996,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } else{ #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: i_face_cells[0:n_i_faces], \ - cell_cells_idx[0:n_cells_ext], \ - cell_cells[0:n_cells_ext], \ - cell_f_cen[0:n_cells_ext], \ - pvar[0:n_cells_ext]) schedule(static,1) + schedule(static,1) for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { cs_lnum_t s_id = cell_cells_idx[c_id]; @@ -7066,10 +7058,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, if (halo_type == CS_HALO_EXTENDED) { #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: cell_f_cen[0:n_cells_ext], pvar[0:n_cells_ext],\ - cell_cells_idx[0:n_cells_ext], \ - cell_cells_lst[0:n_cells_ext]) schedule(static,1) + schedule(static,1) for (cs_lnum_t c_id1 = 0; c_id1 < n_cells; c_id1++) { for (cs_lnum_t cidx = cell_cells_idx[c_id1]; cidx < cell_cells_idx[c_id1+1]; @@ -7100,13 +7089,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, if(scatter){ #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: b_face_normal[0:n_b_faces], \ - coefav[0:n_b_faces], \ - coefbv[0:n_b_faces], \ - b_face_cells[0:n_b_faces], \ - pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) firstprivate(cs_math_zero_threshold) schedule(static,1) + schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { cs_lnum_t c_id1 = b_face_cells[f_id]; @@ -7139,15 +7122,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } else{ #pragma omp target teams 
distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: b_face_normal[0:n_b_faces], \ - cell_b_faces[0:n_b_faces], \ - coefav[0:n_b_faces], \ - coefbv[0:n_b_faces], \ - b_cells[0:n_cells], \ - cell_b_faces_idx[0:n_cells+1], \ - pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) firstprivate(cs_math_zero_threshold) schedule(static,1) + schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { cs_lnum_t c_id = b_cells[c_idx]; @@ -7190,10 +7165,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(from: gradv[0:n_cells_ext]) \ - map(to: pvar[0:n_cells_ext],\ - cocg[0:n_cells_ext]) schedule(static,1) + schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_cells*3*3; c_idx++) { size_t c_id = c_idx / (3*3); From 12a72981159c137b26da4fd679bb4bdcf48b98a3 Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Tue, 9 Jan 2024 10:41:01 +0100 Subject: [PATCH 67/70] Clean omp code --- src/alge/cs_convection_diffusion.c | 39 ++---------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 2859773148..8f83667f41 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -5673,24 +5673,7 @@ res_cpu = !compute_cuda; _pvar[0:n_cells_ext]) { #pragma omp target teams distribute parallel for reduction(+:n_upwind) \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: i_face_cells[0:n_i_faces], \ - i_massflux[0:n_i_faces], \ - i_f_face_factor[0:n_i_faces], \ - i_face_normal[0:n_i_faces], \ - i_visc[0:n_i_faces], \ - i_face_cog[0:n_i_faces], \ - i_face_surf[0:n_i_faces], \ - i_dist[0:n_i_faces], \ - weight[0:n_i_faces], \ - diipf[0:n_i_faces], \ - djjpf[0:n_i_faces], \ - i_pvar[0:n_i_faces], \ - grad[0:n_cells_ext], \ - grdpa[0:n_cells_ext], \ - cell_cen[0:n_cells_ext], \ - _pvar[0:n_cells_ext]) \ - firstprivate(cs_math_zero_threshold, \ + 
firstprivate(cs_math_zero_threshold, \ iconvp, thetap, ischcp, blencp, blend_st, \ imasac, idiffp, ircflp) \ schedule(static,1) @@ -6263,25 +6246,7 @@ res_cpu = !compute_cuda; _pvar[0:n_cells_ext]) { #pragma omp target teams distribute parallel for \ - map(tofrom: rhs[0:n_cells_ext]) \ - map(to: b_face_cells[0:n_b_faces], \ - b_massflux[0:n_b_faces], \ - b_f_face_factor[0:n_b_faces], \ - b_face_normal[0:n_b_faces], \ - bc_type[0:n_b_faces], \ - b_visc[0:n_b_faces], \ - b_face_cells[0:n_b_faces], \ - b_face_surf[0:n_b_faces], \ - coefav[0:n_b_faces], \ - coefbv[0:n_b_faces], \ - cofafv[0:n_b_faces], \ - cofbfv[0:n_b_faces], \ - diipb[0:n_b_faces], \ - b_pvar[0:n_b_faces], \ - grad[0:n_cells_ext], \ - grdpa[0:n_cells_ext], \ - _pvar[0:n_cells_ext]) \ - private(pvar_distant, pvar_local, df_limiter_local) \ + private(pvar_distant, pvar_local, df_limiter_local) \ firstprivate(iconvp, thetap, ischcp, blencp, blend_st, \ imasac, idiffp, ircflp, inc, n_local, n_distant) \ schedule(static,1) if(m->n_b_faces > CS_THR_MIN) From a6be843a4ab0f7b918fa8172fad77b2c2eff6cea Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Wed, 10 Jan 2024 10:30:56 +0100 Subject: [PATCH 68/70] CPU timing convection --- src/alge/cs_convection_diffusion.c | 757 ++++++++++++++++------------- src/alge/cs_gradient.cxx | 58 ++- 2 files changed, 440 insertions(+), 375 deletions(-) diff --git a/src/alge/cs_convection_diffusion.c b/src/alge/cs_convection_diffusion.c index 8f83667f41..d4c0661498 100644 --- a/src/alge/cs_convection_diffusion.c +++ b/src/alge/cs_convection_diffusion.c @@ -4656,8 +4656,8 @@ cs_convection_diffusion_vector(int idtvar, /* Timing the computation */ - clock_t start, stop; - unsigned long elapsed, elapsed_cuda; + clock_t start, stop, start_slope, stop_slope; + unsigned long elapsed, elapsed_cuda, elapsed_slope; cs_real_33_t *grad_cpu, *grad_gpu; cs_real_33_t *grdpa_cpu, *grdpa_gpu; @@ -4694,11 +4694,11 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles 
seront à enlever // compute_cuda = true; - // compute_cpu = true; + compute_cpu = true; // res_cpu = false; // A ne pas garder dans la version finale - // perf = false; + perf = true; // accuracy = false; #if defined(HAVE_CUDA) @@ -4856,25 +4856,46 @@ res_cpu = !compute_cuda; if (iconvp > 0 && iupwin == 0 && isstpp == 0) { - cs_slope_test_gradient_vector(inc, - halo_type, - (const cs_real_33_t *)grad_cpu, - grdpa_cpu, - _pvar, - coefav, - coefbv, - i_massflux); - // #if defined(HAVE_OPENMP_TARGET) - // cs_slope_test_gradient_vector_target(inc, - // halo_type, - // (const cs_real_33_t *)grad_cpu, - // grdpa_cpu, - // _pvar, - // coefav, - // coefbv, - // i_massflux); - // #endif + if(compute_cpu){ + if(perf){ + start_slope = clock(); + } + cs_slope_test_gradient_vector(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: CPU = %ld\n", elapsed_slope); + } + } + + #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } + cs_slope_test_gradient_vector_target(inc, + halo_type, + (const cs_real_33_t *)grad_cpu, + grdpa_cpu, + _pvar, + coefav, + coefbv, + i_massflux); + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("convection compute Slope time in us: OMP = %ld\n", elapsed_slope); + } + } + #endif } if(perf){ @@ -5528,132 +5549,146 @@ res_cpu = !compute_cuda; } else { // ---------------OMP and CUDA here --------------------- +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } + for (int g_id = 0; g_id < n_i_groups; g_id++) { +# pragma omp parallel for reduction(+:n_upwind) + for (int t_id = 0; t_id < n_i_threads; t_id++) { + for (cs_lnum_t face_id = i_group_index[(t_id*n_i_groups + g_id)*2]; + face_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; + 
face_id++) { + + cs_lnum_t ii = i_face_cells[face_id][0]; + cs_lnum_t jj = i_face_cells[face_id][1]; + + cs_real_t fluxi[3], fluxj[3] ; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + fluxj[isou] = 0; + } + cs_real_3_t pip, pjp; + cs_real_3_t pif, pjf; + bool upwind_switch = false; + cs_real_3_t _pi, _pj; + + for (int i = 0; i < 3; i++) { + _pi[i] = _pvar[ii][i]; + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (i_f_face_factor != NULL) { + cs_real_3_t n; + cs_math_3_normalize(i_face_normal[face_id], n); + + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); + cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), + 0.); + + cs_i_cd_unsteady_slope_test_vector(&upwind_switch, + iconvp, + bldfrp, + ischcp, + blencp, + blend_st, + weight[face_id], + i_dist[face_id], + i_face_surf[face_id], + cell_cen[ii], + cell_cen[jj], + i_face_normal[face_id], + i_face_cog[face_id], + diipf[face_id], + djjpf[face_id], + i_massflux[face_id], + (const cs_real_3_t *)grad[ii], + (const cs_real_3_t *)grad[jj], + (const cs_real_3_t *)grdpa[ii], + (const cs_real_3_t *)grdpa[jj], + _pi, + _pj, + pif, + pjf, + pip, + pjp); + + cs_i_conv_flux_vector(iconvp, + thetap, + imasac, + _pvar[ii], + _pvar[jj], + pif, + pif, /* no relaxation */ + pjf, + pjf, /* no relaxation */ + i_massflux[face_id], + fluxi, + fluxj); + + + cs_i_diff_flux_vector(idiffp, + thetap, + pip, + pjp, + pip, /* no relaxation */ + pjp, /* no relaxation */ + i_visc[face_id], + fluxi, + fluxj); + + if (upwind_switch) { + + /* in parallel, face will be counted by one and only one rank */ + if (ii < n_cells) + n_upwind++; + + if (v_slope_test != NULL) { + v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; + v_slope_test[jj] += 
fabs(i_massflux[face_id]) / cell_vol[jj]; + } + } + /* Saving velocity at internal faces, if needed */ + if (i_pvar != NULL) { + if (i_massflux[face_id] >= 0.) { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pif[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) + i_pvar[face_id][i] += thetap * pjf[i]; + } + } + + for (int isou = 0; isou < 3; isou++) { + + rhs[ii][isou] -= fluxi[isou]; + rhs[jj][isou] += fluxj[isou]; + + } /* isou */ + + } + } + } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: CPU = %ld\n", elapsed_slope); + } +} //compute_cpu -// for (int g_id = 0; g_id < n_i_groups; g_id++) { -// # pragma omp parallel for reduction(+:n_upwind) -// for (int t_id = 0; t_id < n_i_threads; t_id++) { -// for (cs_lnum_t face_id = i_group_index[(t_id*n_i_groups + g_id)*2]; -// face_id < i_group_index[(t_id*n_i_groups + g_id)*2 + 1]; -// face_id++) { - -// cs_lnum_t ii = i_face_cells[face_id][0]; -// cs_lnum_t jj = i_face_cells[face_id][1]; - -// cs_real_t fluxi[3], fluxj[3] ; -// for (int isou = 0; isou < 3; isou++) { -// fluxi[isou] = 0; -// fluxj[isou] = 0; -// } -// cs_real_3_t pip, pjp; -// cs_real_3_t pif, pjf; -// bool upwind_switch = false; -// cs_real_3_t _pi, _pj; - -// for (int i = 0; i < 3; i++) { -// _pi[i] = _pvar[ii][i]; -// _pj[i] = _pvar[jj][i]; -// } - -// /* Scaling due to mass balance in porous modelling */ -// if (i_f_face_factor != NULL) { -// cs_real_3_t n; -// cs_math_3_normalize(i_face_normal[face_id], n); - -// cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); -// cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); -// } - -// cs_real_t bldfrp = (cs_real_t) ircflp; -// /* Local limitation of the reconstruction */ -// if (df_limiter != NULL && ircflp > 0) -// bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), -// 0.); - -// cs_i_cd_unsteady_slope_test_vector(&upwind_switch, 
-// iconvp, -// bldfrp, -// ischcp, -// blencp, -// blend_st, -// weight[face_id], -// i_dist[face_id], -// i_face_surf[face_id], -// cell_cen[ii], -// cell_cen[jj], -// i_face_normal[face_id], -// i_face_cog[face_id], -// diipf[face_id], -// djjpf[face_id], -// i_massflux[face_id], -// (const cs_real_3_t *)grad[ii], -// (const cs_real_3_t *)grad[jj], -// (const cs_real_3_t *)grdpa[ii], -// (const cs_real_3_t *)grdpa[jj], -// _pi, -// _pj, -// pif, -// pjf, -// pip, -// pjp); - -// cs_i_conv_flux_vector(iconvp, -// thetap, -// imasac, -// _pvar[ii], -// _pvar[jj], -// pif, -// pif, /* no relaxation */ -// pjf, -// pjf, /* no relaxation */ -// i_massflux[face_id], -// fluxi, -// fluxj); - - -// cs_i_diff_flux_vector(idiffp, -// thetap, -// pip, -// pjp, -// pip, /* no relaxation */ -// pjp, /* no relaxation */ -// i_visc[face_id], -// fluxi, -// fluxj); - -// if (upwind_switch) { - -// /* in parallel, face will be counted by one and only one rank */ -// if (ii < n_cells) -// n_upwind++; - -// if (v_slope_test != NULL) { -// v_slope_test[ii] += fabs(i_massflux[face_id]) / cell_vol[ii]; -// v_slope_test[jj] += fabs(i_massflux[face_id]) / cell_vol[jj]; -// } -// } -// /* Saving velocity at internal faces, if needed */ -// if (i_pvar != NULL) { -// if (i_massflux[face_id] >= 0.) 
{ -// for (cs_lnum_t i = 0; i < 3; i++) -// i_pvar[face_id][i] += thetap * pif[i]; -// } -// else { -// for (cs_lnum_t i = 0; i < 3; i++) -// i_pvar[face_id][i] += thetap * pjf[i]; -// } -// } - -// for (int isou = 0; isou < 3; isou++) { - -// rhs[ii][isou] -= fluxi[isou]; -// rhs[jj][isou] += fluxj[isou]; - -// } /* isou */ - -// } -// } -// } #if defined(HAVE_OPENMP_TARGET) + if(compute_cuda){ + if(perf){ + start_slope = clock(); + } #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ map(to: i_face_cells[0:n_i_faces], \ i_massflux[0:n_i_faces], \ @@ -5700,7 +5735,7 @@ res_cpu = !compute_cuda; /* Scaling due to mass balance in porous modelling */ if (i_f_face_factor != NULL) { cs_real_3_t n; - cs_math_3_normalize_target_cd(i_face_normal[face_id], n); + cs_math_3_normalize(i_face_normal[face_id], n); cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); @@ -5797,6 +5832,12 @@ res_cpu = !compute_cuda; } } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady i_faces time in us: OMP = %ld\n", elapsed_slope); + } + } // compute_cuda #endif } /* idtvar */ @@ -6025,207 +6066,221 @@ res_cpu = !compute_cuda; } else { // ---------------OMP and CUDA here --------------------- -// # pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) -// for (int t_id = 0; t_id < n_b_threads; t_id++) { -// for (cs_lnum_t face_id = b_group_index[t_id*2]; -// face_id < b_group_index[t_id*2 + 1]; -// face_id++) { - -// cs_lnum_t ii = b_face_cells[face_id]; - -// cs_real_t fluxi[3]; -// for (int isou = 0; isou < 3; isou++) { -// fluxi[isou] = 0; -// } -// cs_real_3_t pip; -// cs_real_3_t _pi; -// cs_real_t pfac[3]; - -// for (int i = 0; i < 3; i++) { -// _pi[i] = _pvar[ii][i]; -// } - -// /* Scaling due to mass balance in porous modelling */ -// if (b_f_face_factor != NULL) { -// cs_real_3_t n; -// 
cs_math_3_normalize(b_face_normal[face_id], n); - -// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); -// } - -// cs_real_t bldfrp = (cs_real_t) ircflp; -// /* Local limitation of the reconstruction */ -// if (df_limiter != NULL && ircflp > 0) -// bldfrp = cs_math_fmax(df_limiter[ii], 0.); - -// cs_b_cd_unsteady_vector(bldfrp, -// diipb[face_id], -// (const cs_real_3_t *)grad[ii], -// _pi, -// pip); -// cs_b_upwind_flux_vector(iconvp, -// thetap, -// imasac, -// inc, -// bc_type[face_id], -// _pi, -// _pi, /* no relaxation */ -// pip, -// coefav[face_id], -// coefbv[face_id], -// b_massflux[face_id], -// pfac, -// fluxi); - -// /* Saving velocity on boundary faces */ -// if (b_pvar != NULL) { -// if (b_massflux[face_id] >= 0.) { -// for (cs_lnum_t i = 0; i < 3; i++) -// b_pvar[face_id][i] += thetap * _pi[i]; -// } -// else { -// for (cs_lnum_t i = 0; i < 3; i++) { -// b_pvar[face_id][i] += thetap * pfac[i]; -// } -// } -// } - -// cs_b_diff_flux_vector(idiffp, -// thetap, -// inc, -// pip, -// cofafv[face_id], -// cofbfv[face_id], -// b_visc[face_id], -// fluxi); - -// for(int isou = 0; isou < 3; isou++) { -// rhs[ii][isou] -= fluxi[isou]; -// } - -// } -// } - -// /* The variable is internally coupled and an implicit contribution -// * is required */ -// if (icoupl > 0) { -// /* Prepare data for sending */ -// BFT_MALLOC(pvar_distant, n_distant, cs_real_3_t); - -// for (cs_lnum_t ii = 0; ii < n_distant; ii++) { -// cs_lnum_t face_id = faces_distant[ii]; -// cs_lnum_t jj = b_face_cells[face_id]; - -// cs_real_3_t pip; -// cs_real_3_t _pj; - -// for (int i = 0; i < 3; i++) { -// _pj[i] = _pvar[jj][i]; -// } - -// cs_real_t bldfrp = (cs_real_t) ircflp; -// /* Local limitation of the reconstruction */ -// /* Note: to be treated exactly as a internal face, should be a bending -// * between the two cells... 
*/ -// if (df_limiter != NULL && ircflp > 0) -// bldfrp = cs_math_fmax(df_limiter[jj], 0.); - -// /* Scaling due to mass balance in porous modelling */ -// if (b_f_face_factor != NULL) { -// cs_real_3_t n; -// cs_math_3_normalize(b_face_normal[face_id], n); - -// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); -// } - -// cs_b_cd_unsteady_vector(bldfrp, -// diipb[face_id], -// (const cs_real_3_t *)grad[jj], -// _pj, -// pip); - -// for (int k = 0; k < 3; k++) -// pvar_distant[ii][k] = pip[k]; -// } - -// /* Receive data */ -// BFT_MALLOC(pvar_local, n_local, cs_real_3_t); -// cs_internal_coupling_exchange_var(cpl, -// 3, /* Dimension */ -// (cs_real_t *)pvar_distant, -// (cs_real_t *)pvar_local); - -// if (df_limiter != NULL) { -// BFT_MALLOC(df_limiter_local, n_local, cs_real_t); -// cs_internal_coupling_exchange_var(cpl, -// 1, /* Dimension */ -// df_limiter, -// df_limiter_local); -// } - -// /* Flux contribution */ -// assert(f != NULL); -// cs_real_t *hintp = f->bc_coeffs->hint; -// cs_real_t *hextp = f->bc_coeffs->rcodcl2; -// for (cs_lnum_t ii = 0; ii < n_local; ii++) { -// cs_lnum_t face_id = faces_local[ii]; -// cs_lnum_t jj = b_face_cells[face_id]; -// cs_real_t surf = b_face_surf[face_id]; -// cs_real_t pip[3], pjp[3]; -// cs_real_t fluxi[3] = {0., 0., 0.}; -// cs_real_3_t _pj; - -// for (int i = 0; i < 3; i++) { -// _pj[i] = _pvar[jj][i]; -// } - -// /* Scaling due to mass balance in porous modelling */ -// if (b_f_face_factor != NULL) { -// cs_real_3_t n; -// cs_math_3_normalize(b_face_normal[face_id], n); - -// cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); -// } - -// cs_real_t bldfrp = (cs_real_t) ircflp; -// /* Local limitation of the reconstruction */ -// if (df_limiter != NULL && ircflp > 0) -// bldfrp = cs_math_fmax(cs_math_fmin(df_limiter_local[ii], -// df_limiter[jj]), -// 0.); - -// cs_b_cd_unsteady_vector(bldfrp, -// diipb[face_id], -// (const cs_real_3_t *)grad[jj], -// _pj, -// pip); - -// for (int k = 0; k < 3; 
k++) -// pjp[k] = pvar_local[ii][k]; - -// cs_real_t hint = hintp[face_id]; -// cs_real_t hext = hextp[face_id]; -// cs_real_t heq = _calc_heq(hint, hext)*surf; - -// cs_b_diff_flux_coupling_vector(idiffp, -// pip, -// pjp, -// heq, -// fluxi); - -// for (int k = 0; k < 3; k++) -// rhs[jj][k] -= thetap * fluxi[k]; -// } - -// BFT_FREE(pvar_local); -// /* Sending structures are no longer needed */ -// BFT_FREE(pvar_distant); -// if (df_limiter != NULL) { -// BFT_FREE(df_limiter_local); -// } -// } +if(compute_cpu){ + if(perf){ + start_slope = clock(); + } +# pragma omp parallel for if(m->n_b_faces > CS_THR_MIN) + for (int t_id = 0; t_id < n_b_threads; t_id++) { + for (cs_lnum_t face_id = b_group_index[t_id*2]; + face_id < b_group_index[t_id*2 + 1]; + face_id++) { + + cs_lnum_t ii = b_face_cells[face_id]; + + cs_real_t fluxi[3]; + for (int isou = 0; isou < 3; isou++) { + fluxi[isou] = 0; + } + cs_real_3_t pip; + cs_real_3_t _pi; + cs_real_t pfac[3]; + + for (int i = 0; i < 3; i++) { + _pi[i] = _pvar[ii][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + cs_real_3_t n; + cs_math_3_normalize(b_face_normal[face_id], n); + + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[ii], 0.); + + cs_b_cd_unsteady_vector(bldfrp, + diipb[face_id], + (const cs_real_3_t *)grad[ii], + _pi, + pip); + cs_b_upwind_flux_vector(iconvp, + thetap, + imasac, + inc, + bc_type[face_id], + _pi, + _pi, /* no relaxation */ + pip, + coefav[face_id], + coefbv[face_id], + b_massflux[face_id], + pfac, + fluxi); + + /* Saving velocity on boundary faces */ + if (b_pvar != NULL) { + if (b_massflux[face_id] >= 0.) 
{ + for (cs_lnum_t i = 0; i < 3; i++) + b_pvar[face_id][i] += thetap * _pi[i]; + } + else { + for (cs_lnum_t i = 0; i < 3; i++) { + b_pvar[face_id][i] += thetap * pfac[i]; + } + } + } + + cs_b_diff_flux_vector(idiffp, + thetap, + inc, + pip, + cofafv[face_id], + cofbfv[face_id], + b_visc[face_id], + fluxi); + + for(int isou = 0; isou < 3; isou++) { + rhs[ii][isou] -= fluxi[isou]; + } + + } + } + + /* The variable is internally coupled and an implicit contribution + * is required */ + if (icoupl > 0) { + /* Prepare data for sending */ + BFT_MALLOC(pvar_distant, n_distant, cs_real_3_t); + + for (cs_lnum_t ii = 0; ii < n_distant; ii++) { + cs_lnum_t face_id = faces_distant[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + + cs_real_3_t pip; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + /* Note: to be treated exactly as a internal face, should be a bending + * between the two cells... 
*/ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(df_limiter[jj], 0.); + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + cs_real_3_t n; + cs_math_3_normalize(b_face_normal[face_id], n); + + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_b_cd_unsteady_vector(bldfrp, + diipb[face_id], + (const cs_real_3_t *)grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pvar_distant[ii][k] = pip[k]; + } + + /* Receive data */ + BFT_MALLOC(pvar_local, n_local, cs_real_3_t); + cs_internal_coupling_exchange_var(cpl, + 3, /* Dimension */ + (cs_real_t *)pvar_distant, + (cs_real_t *)pvar_local); + + if (df_limiter != NULL) { + BFT_MALLOC(df_limiter_local, n_local, cs_real_t); + cs_internal_coupling_exchange_var(cpl, + 1, /* Dimension */ + df_limiter, + df_limiter_local); + } + + /* Flux contribution */ + assert(f != NULL); + cs_real_t *hintp = f->bc_coeffs->hint; + cs_real_t *hextp = f->bc_coeffs->rcodcl2; + for (cs_lnum_t ii = 0; ii < n_local; ii++) { + cs_lnum_t face_id = faces_local[ii]; + cs_lnum_t jj = b_face_cells[face_id]; + cs_real_t surf = b_face_surf[face_id]; + cs_real_t pip[3], pjp[3]; + cs_real_t fluxi[3] = {0., 0., 0.}; + cs_real_3_t _pj; + + for (int i = 0; i < 3; i++) { + _pj[i] = _pvar[jj][i]; + } + + /* Scaling due to mass balance in porous modelling */ + if (b_f_face_factor != NULL) { + cs_real_3_t n; + cs_math_3_normalize(b_face_normal[face_id], n); + + cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); + } + + cs_real_t bldfrp = (cs_real_t) ircflp; + /* Local limitation of the reconstruction */ + if (df_limiter != NULL && ircflp > 0) + bldfrp = cs_math_fmax(cs_math_fmin(df_limiter_local[ii], + df_limiter[jj]), + 0.); + + cs_b_cd_unsteady_vector(bldfrp, + diipb[face_id], + (const cs_real_3_t *)grad[jj], + _pj, + pip); + + for (int k = 0; k < 3; k++) + pjp[k] = pvar_local[ii][k]; + + cs_real_t hint = hintp[face_id]; + cs_real_t hext = hextp[face_id]; + cs_real_t heq = 
_calc_heq(hint, hext)*surf; + + cs_b_diff_flux_coupling_vector(idiffp, + pip, + pjp, + heq, + fluxi); + + for (int k = 0; k < 3; k++) + rhs[jj][k] -= thetap * fluxi[k]; + } + + BFT_FREE(pvar_local); + /* Sending structures are no longer needed */ + BFT_FREE(pvar_distant); + if (df_limiter != NULL) { + BFT_FREE(df_limiter_local); + } + } + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: CPU = %ld\n", elapsed_slope); + } +} // compute_cpu #if defined(HAVE_OPENMP_TARGET) +if(compute_cuda){ + if(perf){ + start_slope = clock(); + } #pragma omp target data map(tofrom: rhs[0:n_cells_ext]) \ map(to: b_face_cells[0:n_b_faces], \ b_massflux[0:n_b_faces], \ @@ -6247,7 +6302,7 @@ res_cpu = !compute_cuda; { #pragma omp target teams distribute parallel for \ private(pvar_distant, pvar_local, df_limiter_local) \ - firstprivate(iconvp, thetap, ischcp, blencp, blend_st, \ + firstprivate(cs_math_zero_threshold, iconvp, thetap, ischcp, blencp, blend_st, \ imasac, idiffp, ircflp, inc, n_local, n_distant) \ schedule(static,1) if(m->n_b_faces > CS_THR_MIN) for (cs_lnum_t face_id = 0; face_id < n_b_faces; face_id++) { @@ -6269,7 +6324,7 @@ res_cpu = !compute_cuda; /* Scaling due to mass balance in porous modelling */ if (b_f_face_factor != NULL) { cs_real_3_t n; - cs_math_3_normalize_target_cd(b_face_normal[face_id], n); + cs_math_3_normalize(b_face_normal[face_id], n); cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); } @@ -6447,6 +6502,12 @@ res_cpu = !compute_cuda; BFT_FREE(df_limiter_local); } } // target data + if(perf){ + stop_slope = clock(); + elapsed_slope = (stop_slope - start_slope) * 1e6 / CLOCKS_PER_SEC; + printf("idtvar => 0 unsteady b_faces time in us: OMP = %ld\n", elapsed_slope); + } +} // compute_cuda #endif } /* idtvar */ diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index d21d7a11d3..6f9f5f280d 100644 --- a/src/alge/cs_gradient.cxx +++ 
b/src/alge/cs_gradient.cxx @@ -5591,11 +5591,11 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever // compute_cuda = true; - // compute_cpu = true; + compute_cpu = true; // res_cpu = false; // A ne pas garder dans la version finale - // perf = false; + perf = true; // accuracy = false; @@ -7089,7 +7089,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, if(scatter){ #pragma omp target teams distribute parallel for \ - schedule(static,1) + firstprivate(cs_math_zero_threshold) schedule(static,1) for (cs_lnum_t f_id = 0; f_id < n_b_faces; f_id++) { cs_lnum_t c_id1 = b_face_cells[f_id]; @@ -7122,7 +7122,7 @@ _lsq_vector_gradient_target(const cs_mesh_t *m, } else{ #pragma omp target teams distribute parallel for \ - schedule(static,1) + firstprivate(cs_math_zero_threshold) schedule(static,1) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { cs_lnum_t c_id = b_cells[c_idx]; @@ -7284,9 +7284,9 @@ _lsq_vector_gradient(const cs_mesh_t *m, // Pour l'instant ces lignes sont pour moi // Elles seront à enlever // compute_cuda = true; - // compute_cpu = true; + compute_cpu = true; // res_cpu = false; - // perf = false; + perf = true; // accuracy = true; BFT_MALLOC(rhs, n_cells_ext, cs_real_33_t); @@ -7325,8 +7325,10 @@ BFT_MALLOC(gradv_target, n_cells_ext, cs_real_33_t); } // end if compute_cuda #endif -start = std::chrono::high_resolution_clock::now(); #if defined(HAVE_OPENMP_TARGET) +if(perf){ + start = std::chrono::high_resolution_clock::now(); +} _lsq_vector_gradient_target(m, madj, fvq, @@ -7339,10 +7341,12 @@ _lsq_vector_gradient_target(m, gradv_target, cocg, rhs_target); +if(perf){ + stop = std::chrono::high_resolution_clock::now(); + elapsed_target = std::chrono::duration_cast(stop - start); + printf("OMP target lsq %ld\n", elapsed_target.count()); +} #endif -stop = std::chrono::high_resolution_clock::now(); -elapsed_target = std::chrono::duration_cast(stop - start); -printf("OMP target lsq %ld\n", 
elapsed_target.count()); if(compute_cpu){ if(perf){ @@ -8008,23 +8012,23 @@ _lsq_strided_gradient(const cs_mesh_t *m, cs_real_t c_norm, ref_norm; // #if defined(HAVE_CUDA) - cs_lsq_vector_gradient_strided_cuda - ( - m, - madj, - fvq, - halo_type, - inc, - coefav, - coefbv, - pvar, - c_weight, - cocg, - cocgb, - gradv, - rhs, - n_c_iter_max, - c_eps); + // cs_lsq_vector_gradient_strided_cuda + // ( + // m, + // madj, + // fvq, + // halo_type, + // inc, + // coefav, + // coefbv, + // pvar, + // c_weight, + // cocg, + // cocgb, + // gradv, + // rhs, + // n_c_iter_max, + // c_eps); // #else #pragma omp parallel for schedule(dynamic, CS_THR_MIN) for (cs_lnum_t c_idx = 0; c_idx < n_b_cells; c_idx++) { From 5246828a6f183c710fbca36bbac924c5ddc4939d Mon Sep 17 00:00:00 2001 From: ddiakiteaneo Date: Fri, 12 Jan 2024 11:19:51 +0100 Subject: [PATCH 69/70] Fix reconstruct cuda with stride --- src/alge/cs_convection_diffusion.cxx | 202 +++++++++--------- src/alge/cs_convection_diffusion_cuda.cu | 2 + src/alge/cs_gradient.cxx | 26 +-- src/alge/cs_gradient_cuda.cu | 25 +-- src/alge/cs_gradient_priv.h | 2 +- .../cs_reconstruct_vector_gradient_gather.cuh | 26 +-- ..._reconstruct_vector_gradient_gather_v2.cuh | 30 +-- ..._reconstruct_vector_gradient_gather_v3.cuh | 43 ++-- ..._reconstruct_vector_gradient_gather_v4.cuh | 30 +-- ...cs_reconstruct_vector_gradient_scatter.cuh | 2 +- ...reconstruct_vector_gradient_scatter_cf.cuh | 29 +-- ...reconstruct_vector_gradient_scatter_v2.cuh | 32 +-- ...onstruct_vector_gradient_scatter_v2_cf.cuh | 30 +-- 13 files changed, 237 insertions(+), 242 deletions(-) diff --git a/src/alge/cs_convection_diffusion.cxx b/src/alge/cs_convection_diffusion.cxx index 97624ffcee..3ebbad926a 100644 --- a/src/alge/cs_convection_diffusion.cxx +++ b/src/alge/cs_convection_diffusion.cxx @@ -5586,10 +5586,9 @@ if(compute_cpu){ map(to: i_face_cells[0:n_i_faces], \ i_massflux[0:n_i_faces], \ i_f_face_factor[0:n_i_faces], \ - i_face_normal[0:n_i_faces], \ + 
i_face_u_normal[0:n_i_faces], \ i_visc[0:n_i_faces], \ i_face_cog[0:n_i_faces], \ - i_face_surf[0:n_i_faces], \ i_dist[0:n_i_faces], \ weight[0:n_i_faces], \ diipf[0:n_i_faces], \ @@ -5627,9 +5626,7 @@ if(compute_cpu){ /* Scaling due to mass balance in porous modelling */ if (i_f_face_factor != NULL) { - cs_real_3_t n; - cs_math_3_normalize(i_face_normal[face_id], n); - + const cs_real_t *n = i_face_u_normal[face_id]; cs_math_3_normal_scaling(n, i_f_face_factor[face_id][0], _pi); cs_math_3_normal_scaling(n, i_f_face_factor[face_id][1], _pj); } @@ -5640,56 +5637,55 @@ if(compute_cpu){ bldfrp = cs_math_fmax(cs_math_fmin(df_limiter[ii], df_limiter[jj]), 0.); - cs_i_cd_unsteady_slope_test_vector(&upwind_switch, - iconvp, - bldfrp, - ischcp, - blencp, - blend_st, - weight[face_id], - i_dist[face_id], - i_face_surf[face_id], - cell_cen[ii], - cell_cen[jj], - i_face_normal[face_id], - i_face_cog[face_id], - diipf[face_id], - djjpf[face_id], - i_massflux[face_id], - grad[ii], - grad[jj], - grdpa[ii], - grdpa[jj], - _pi, - _pj, - pif, - pjf, - pip, - pjp); - - cs_i_conv_flux_vector(iconvp, - thetap, - imasac, - _pvar[ii], - _pvar[jj], - pif, - pif, /* no relaxation */ - pjf, - pjf, /* no relaxation */ - i_massflux[face_id], - fluxi, - fluxj); - - - cs_i_diff_flux_vector(idiffp, - thetap, - pip, - pjp, - pip, /* no relaxation */ - pjp, /* no relaxation */ - i_visc[face_id], - fluxi, - fluxj); + cs_i_cd_unsteady_slope_test_strided<3>(&upwind_switch, + iconvp, + bldfrp, + ischcp, + blencp, + blend_st, + weight[face_id], + i_dist[face_id], + cell_cen[ii], + cell_cen[jj], + i_face_u_normal[face_id], + i_face_cog[face_id], + diipf[face_id], + djjpf[face_id], + i_massflux[face_id], + grad[ii], + grad[jj], + grdpa[ii], + grdpa[jj], + _pi, + _pj, + pif, + pjf, + pip, + pjp); + + cs_i_conv_flux_strided<3>(iconvp, + thetap, + imasac, + _pvar[ii], + _pvar[jj], + pif, + pif, /* no relaxation */ + pjf, + pjf, /* no relaxation */ + i_massflux[face_id], + fluxi, + fluxj); + + + 
cs_i_diff_flux_strided<3>(idiffp, + thetap, + pip, + pjp, + pip, /* no relaxation */ + pjp, /* no relaxation */ + i_visc[face_id], + fluxi, + fluxj); if (upwind_switch) { @@ -6165,7 +6161,7 @@ if(compute_cuda){ map(to: b_face_cells[0:n_b_faces], \ b_massflux[0:n_b_faces], \ b_f_face_factor[0:n_b_faces], \ - b_face_normal[0:n_b_faces], \ + b_face_u_normal[0:n_b_faces], \ bc_type[0:n_b_faces], \ b_visc[0:n_b_faces], \ b_face_cells[0:n_b_faces], \ @@ -6203,9 +6199,7 @@ if(compute_cuda){ /* Scaling due to mass balance in porous modelling */ if (b_f_face_factor != NULL) { - cs_real_3_t n; - cs_math_3_normalize(b_face_normal[face_id], n); - + const cs_real_t *n = b_face_u_normal[face_id]; cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pi); } @@ -6214,24 +6208,24 @@ if(compute_cuda){ if (df_limiter != NULL && ircflp > 0) bldfrp = cs_math_fmax(df_limiter[ii], 0.); - cs_b_cd_unsteady_vector(bldfrp, - diipb[face_id], - grad[ii], - _pi, - pip); - cs_b_upwind_flux_vector(iconvp, - thetap, - imasac, - inc, - bc_type[face_id], - _pi, - _pi, /* no relaxation */ - pip, - coefav[face_id], - coefbv[face_id], - b_massflux[face_id], - pfac, - fluxi); + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[ii], + _pi, + pip); + cs_b_upwind_flux_strided<3>(iconvp, + thetap, + imasac, + inc, + bc_type[face_id], + _pi, + _pi, /* no relaxation */ + pip, + coefav[face_id], + coefbv[face_id], + b_massflux[face_id], + pfac, + fluxi); /* Saving velocity on boundary faces */ if (b_pvar != NULL) { @@ -6246,14 +6240,14 @@ if(compute_cuda){ } } - cs_b_diff_flux_vector(idiffp, - thetap, - inc, - pip, - cofafv[face_id], - cofbfv[face_id], - b_visc[face_id], - fluxi); + cs_b_diff_flux_strided<3>(idiffp, + thetap, + inc, + pip, + cofafv[face_id], + cofbfv[face_id], + b_visc[face_id], + fluxi); for(int isou = 0; isou < 3; isou++) { #pragma omp atomic @@ -6289,17 +6283,15 @@ if(compute_cuda){ /* Scaling due to mass balance in porous modelling */ if (b_f_face_factor != NULL) { - 
cs_real_3_t n; - cs_math_3_normalize(b_face_normal[face_id], n); - + const cs_real_t *n = b_face_u_normal[face_id]; cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); } - cs_b_cd_unsteady_vector(bldfrp, - diipb[face_id], - grad[jj], - _pj, - pip); + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); for (int k = 0; k < 3; k++) pvar_distant[ii][k] = pip[k]; @@ -6338,9 +6330,7 @@ if(compute_cuda){ /* Scaling due to mass balance in porous modelling */ if (b_f_face_factor != NULL) { - cs_real_3_t n; - cs_math_3_normalize(b_face_normal[face_id], n); - + const cs_real_t *n = b_face_u_normal[face_id]; cs_math_3_normal_scaling(n, b_f_face_factor[face_id], _pj); } @@ -6351,11 +6341,11 @@ if(compute_cuda){ df_limiter[jj]), 0.); - cs_b_cd_unsteady_vector(bldfrp, - diipb[face_id], - grad[jj], - _pj, - pip); + cs_b_cd_unsteady_strided<3>(bldfrp, + diipb[face_id], + grad[jj], + _pj, + pip); for (int k = 0; k < 3; k++) pjp[k] = pvar_local[ii][k]; @@ -6364,11 +6354,11 @@ if(compute_cuda){ cs_real_t hext = hextp[face_id]; cs_real_t heq = _calc_heq(hint, hext)*surf; - cs_b_diff_flux_coupling_vector(idiffp, - pip, - pjp, - heq, - fluxi); + cs_b_diff_flux_coupling_strided<3>(idiffp, + pip, + pjp, + heq, + fluxi); for (int k = 0; k < 3; k++) #pragma omp atomic diff --git a/src/alge/cs_convection_diffusion_cuda.cu b/src/alge/cs_convection_diffusion_cuda.cu index ff26fe7294..c110ababfd 100644 --- a/src/alge/cs_convection_diffusion_cuda.cu +++ b/src/alge/cs_convection_diffusion_cuda.cu @@ -280,10 +280,12 @@ cs_convection_diffusion_vector_cuda(const cs_mesh_t *mesh, if (_coefb_d != NULL) CS_CUDA_CHECK(cudaFree(_coefb_d)); + CS_CUDA_CHECK(cudaFree(grad_d)); CS_CUDA_CHECK(cudaFree(grdpa_d)); CS_CUDA_CHECK(cudaFree(i_massflux_d)); CS_CUDA_CHECK(cudaFree(i_f_face_normal)); CS_CUDA_CHECK(cudaFree(cell_vol)); CS_CUDA_CHECK(cudaFree(cell_i_faces)); CS_CUDA_CHECK(cudaFree(cell_i_faces_sgn)); + CS_CUDA_CHECK(cudaFree(i_face_cog)); } diff --git 
a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 61cf25cd5f..27862a2428 100644 --- a/src/alge/cs_gradient.cxx +++ b/src/alge/cs_gradient.cxx @@ -5576,19 +5576,19 @@ res_cpu = !compute_cuda; start = std::chrono::high_resolution_clock::now(); } - // cs_reconstruct_vector_gradient_cuda(m, - // madj, - // fvq, - // halo_type, - // inc, - // coefav, - // coefbv, - // pvar, - // c_weight, - // r_grad, - // grad_gpu, - // cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, - // perf); + cs_reconstruct_vector_gradient_cuda(m, + madj, + fvq, + halo_type, + inc, + coefav, + coefbv, + pvar, + c_weight, + r_grad, + grad_gpu, + cs_glob_mesh_quantities_flag & CS_BAD_CELLS_WARPED_CORRECTION, + perf); if(perf){ stop = std::chrono::high_resolution_clock::now(); elapsed_cuda = std::chrono::duration_cast(stop - start); diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 8c40ff1589..32e8b6abab 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -95,7 +95,7 @@ const cs_real_t (*restrict coefbv)[stride][stride], \ const cs_real_t (*restrict pvar)[stride], \ const cs_real_t *restrict c_weight, \ - cs_real_t (*restrict r_grad)[stride][3], \ + const cs_real_t (*restrict r_grad)[stride][3], \ cs_real_t (*restrict grad)[stride][3], \ bool test_bool, \ bool perf) @@ -1382,12 +1382,12 @@ cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, const cs_real_3_t *restrict diipb = (const cs_real_3_t *restrict)cs_get_device_ptr_const_pf(fvq->diipb); - _sync_or_copy_real_h2d(pvar, n_cells_ext*stride, device_id, stream, + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - _sync_or_copy_real_h2d(coefav, n_b_faces*stride, device_id, stream, + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_h2d(coefbv, n_b_faces*stride*stride, device_id, stream, + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); 
cs_cuda_copy_h2d(grad_d, gradv, sizeof(cs_real_t) * n_cells * stride * 3); @@ -1530,7 +1530,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_t (*restrict coefbv)[stride][stride], const cs_real_t (*restrict pvar)[stride], const cs_real_t *restrict c_weight, - cs_real_t (*restrict r_grad)[stride][3], + const cs_real_t (*restrict r_grad)[stride][3], cs_real_t (*restrict grad)[stride][3], bool test_bool, bool perf) @@ -1650,15 +1650,15 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, cs_cuda_copy_h2d(c_disable_flag, (void *)fvq->c_disable_flag, sizeof(int)*n_cells); - _sync_or_copy_real_h2d(pvar, n_cells_ext*stride, device_id, stream, + _sync_or_copy_real_h2d(pvar, n_cells_ext, device_id, stream, &pvar_d, &_pvar_d); - // _sync_or_copy_real_h2d(r_grad, n_cells_ext*stride*3, device_id, stream, - // &r_grad_d, &_r_grad_d); + _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, + &r_grad_d, &_r_grad_d); - _sync_or_copy_real_h2d(coefav, n_b_faces*stride, device_id, stream, + _sync_or_copy_real_h2d(coefav, n_b_faces, device_id, stream, &coefa_d, &_coefa_d); - _sync_or_copy_real_h2d(coefbv, n_b_faces*stride*stride, device_id, stream, + _sync_or_copy_real_h2d(coefbv, n_b_faces, device_id, stream, &coefb_d, &_coefb_d); @@ -1824,9 +1824,6 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cs_cuda_copy_h2d(grad_d, grad, n_cells_ext * sizeof(cs_real_33_t)); - // _sync_or_copy_real_h2d(r_grad, n_cells_ext, device_id, stream, - // &r_grad_d, &_r_grad_d); - CS_CUDA_CHECK(cudaEventRecord(b_faces_1, stream)); // ----------------------------Begin of Kernels part 2------------------------------------------- @@ -1988,7 +1985,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /* Sync to host */ if (grad_d != NULL) { - size_t size = n_cells_ext * sizeof(cs_real_t) * 3 * 3; + size_t size = n_cells_ext * sizeof(cs_real_t) * stride * 3; cs_cuda_copy_d2h(grad, grad_d, size); } else diff --git a/src/alge/cs_gradient_priv.h 
b/src/alge/cs_gradient_priv.h index 72527a6ea6..867db2cac8 100644 --- a/src/alge/cs_gradient_priv.h +++ b/src/alge/cs_gradient_priv.h @@ -192,7 +192,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, const cs_real_t (*restrict coefbv)[stride][stride], const cs_real_t (*restrict pvar)[stride], const cs_real_t *restrict c_weight, - cs_real_t (*restrict r_grad)[stride][3], + const cs_real_t (*restrict r_grad)[stride][3], cs_real_t (*restrict grad)[stride][3], bool test_bool, bool perf); diff --git a/src/alge/cs_reconstruct_vector_gradient_gather.cuh b/src/alge/cs_reconstruct_vector_gradient_gather.cuh index b7c375e9b9..c7262866b2 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather.cuh @@ -24,14 +24,14 @@ /*----------------------------------------------------------------------------*/ - +template __global__ static void _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal, const cs_lnum_t *restrict cell_cells_idx, @@ -63,7 +63,7 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); /* Reconstruction part */ @@ -83,16 +83,16 @@ _compute_reconstruct_v_i_face_gather(cs_lnum_t n_cells, - +template __global__ static void _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + 
const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_cells, const cs_lnum_t *restrict cell_b_faces, @@ -115,7 +115,7 @@ _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, for(cs_lnum_t index = s_id; index < e_id; index++){ f_id = cell_b_faces[index]; - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfac = inc*coefav[f_id][i]; @@ -127,7 +127,7 @@ _compute_reconstruct_v_b_face_gather(cs_lnum_t n_b_cells, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh index 404421b4c9..ff1723ba0f 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v2.cuh @@ -24,14 +24,14 @@ /*----------------------------------------------------------------------------*/ - +template __global__ static void _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal, const cs_lnum_t *restrict cell_cells_idx, @@ -49,8 +49,8 @@ _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, cs_lnum_t c_id2, f_id; cs_real_t pond, 
ktpond, pfaci, pfacj, rfac; - size_t c_idx = c_id1 / (3*3); - size_t i = (c_id1 / 3) % 3; + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; size_t j = c_id1 % 3; cs_lnum_t s_id = cell_cells_idx[c_idx]; @@ -84,16 +84,16 @@ _compute_reconstruct_v_i_face_gather_v2(cs_lnum_t n_cells, - +template __global__ static void _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_cells, const cs_lnum_t *restrict cell_b_faces, @@ -106,8 +106,8 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, return; } - size_t c_id1 = c_idx / 3; - size_t i = c_idx % 3; + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; cs_lnum_t c_id = b_cells[c_id1]; @@ -129,7 +129,7 @@ _compute_reconstruct_v_b_face_gather_v2(cs_lnum_t n_b_cells, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh index ab443b6f51..f4299bafb9 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh @@ -25,14 +25,14 @@ /*----------------------------------------------------------------------------*/ - +template __global__ static void 
_compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal, const cs_lnum_t *restrict cell_cells_idx, @@ -73,7 +73,7 @@ _compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, / ( pond * c_weight[c_id1] + (1.0-pond)* c_weight[c_id2]); - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfaci = (1.0-ktpond) * (_pvar2[i] - _pvar1[i]); /* Reconstruction part */ @@ -89,23 +89,25 @@ _compute_reconstruct_v_i_face_gather_v3(cs_lnum_t n_cells, } } } - grad[c_id1][0][0] = _grad[0][0]; grad[c_id1][0][1] = _grad[0][1]; grad[c_id1][0][2] = _grad[0][2]; - grad[c_id1][1][0] = _grad[1][0]; grad[c_id1][1][1] = _grad[1][1]; grad[c_id1][1][2] = _grad[1][2]; - grad[c_id1][2][0] = _grad[2][0]; grad[c_id1][2][1] = _grad[2][1]; grad[c_id1][2][2] = _grad[2][2]; + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[i][j]; + } + } } - +template __global__ static void _compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_cells, const cs_lnum_t *restrict cell_b_faces, @@ -137,7 +139,7 @@ 
_compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, auto _coefbv = coefbv[f_id]; auto _b_f_face_normal = b_f_face_normal[f_id]; - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfac = inc*_coefav[i]; @@ -149,7 +151,7 @@ _compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = _r_grad[k][0] * _diipb[0] + _r_grad[k][1] * _diipb[1] + _r_grad[k][2] * _diipb[2]; @@ -162,7 +164,10 @@ _compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, } } - grad[c_id][0][0] = _grad[0][0]; grad[c_id][0][1] = _grad[0][1]; grad[c_id][0][2] = _grad[0][2]; - grad[c_id][1][0] = _grad[1][0]; grad[c_id][1][1] = _grad[1][1]; grad[c_id][1][2] = _grad[1][2]; - grad[c_id][2][0] = _grad[2][0]; grad[c_id][2][1] = _grad[2][1]; grad[c_id][2][2] = _grad[2][2]; + for(cs_lnum_t i = 0; i < stride; i++){ + for(cs_lnum_t j = 0; j < 3; j++){ + grad[c_id1][i][j] = _grad[i][j]; + } + } + } diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh index 906374456d..35eed09b76 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v4.cuh @@ -24,14 +24,14 @@ /*----------------------------------------------------------------------------*/ - +template __global__ static void _compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal, const cs_lnum_t *restrict cell_cells_idx, @@ -49,8 +49,8 @@ _compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, 
cs_lnum_t c_id2, f_id; cs_real_t pond, ktpond, pfaci, pfacj, rfac; - size_t c_idx = c_id1 / (3*3); - size_t i = (c_id1 / 3) % 3; + size_t c_idx = c_id1 / (stride*3); + size_t i = (c_id1 / 3) % stride; size_t j = c_id1 % 3; cs_lnum_t s_id = cell_cells_idx[c_idx]; @@ -86,16 +86,16 @@ _compute_reconstruct_v_i_face_gather_v4(cs_lnum_t n_cells, - +template __global__ static void _compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_cells, const cs_lnum_t *restrict cell_b_faces, @@ -108,8 +108,8 @@ _compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, return; } - size_t c_id1 = c_idx / 3; - size_t i = c_idx % 3; + size_t c_id1 = c_idx / stride; + size_t i = c_idx % stride; cs_lnum_t c_id = b_cells[c_id1]; @@ -133,7 +133,7 @@ _compute_reconstruct_v_b_face_gather_v4(cs_lnum_t n_b_cells, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh index e664a5ff3e..a0d0f2b000 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -99,7 +99,7 @@ _compute_reconstruct_v_b_face(cs_lnum_t n_b_faces, c_id = b_face_cells[f_id]; - for (cs_lnum_t i = 0; i < 3; i++) { + for 
(cs_lnum_t i = 0; i < stride; i++) { pfac = inc*coefav[f_id][i]; diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh index f684409ffd..f68189bfca 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_cf.cuh @@ -24,14 +24,15 @@ /*----------------------------------------------------------------------------*/ +template __global__ static void _compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal) { @@ -54,11 +55,11 @@ _compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, + (1.0-pond)* c_weight[c_id2]); - using Cell = AtomicCell; + using Cell = AtomicCell; Cell grad_cf1, grad_cf2; - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfaci = (1.0-ktpond) * (pvar[c_id2][i] - pvar[c_id1][i]); pfacj = - ktpond * (pvar[c_id2][i] - pvar[c_id1][i]); @@ -80,16 +81,16 @@ _compute_reconstruct_v_i_face_cf(cs_lnum_t n_i_faces, } - +template __global__ static void _compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t 
*restrict b_face_cells) { @@ -103,10 +104,10 @@ _compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, c_id = b_face_cells[f_id]; - using Cell = AtomicCell; + using Cell = AtomicCell; Cell grad_cf; - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { pfac = inc*coefav[f_id][i]; @@ -118,7 +119,7 @@ _compute_reconstruct_v_b_face_cf(cs_lnum_t n_b_faces, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh index cf82af3054..a2dfcf3c4c 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2.cuh @@ -26,15 +26,15 @@ - +template __global__ static void _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal) { @@ -44,8 +44,8 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, return; } - size_t f_idt = f_id / 3; - size_t i = f_id % 3; + size_t f_idt = f_id / stride; + size_t i = f_id % stride; cs_lnum_t c_id1, c_id2; cs_real_t pond, ktpond, pfaci, pfacj, rfac; @@ -79,16 +79,16 @@ _compute_reconstruct_v_i_face_v2(cs_lnum_t n_i_faces, } - +template __global__ static void _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict 
coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_face_cells) { @@ -98,8 +98,8 @@ _compute_reconstruct_v_b_face_v2(cs_lnum_t n_b_faces, return; } - size_t f_idt = f_id / 3; - size_t i = f_id % 3; + size_t f_idt = f_id / stride; + size_t i = f_id % stride; cs_lnum_t c_id; cs_real_t pond, ktpond, pfac, rfac, vecfac; @@ -151,8 +151,8 @@ _compute_reconstruct_correction_v2( cs_lnum_t n_cells, return; } - size_t c_idt = c_id / 3; - size_t i = c_id % 3; + size_t c_idt = c_id / stride; + size_t i = c_id % stride; cs_real_t dvol; /* Is the cell disabled (for solid or porous)? Not the case if coupled */ diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh index c437fde30e..ae4dbd5092 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter_v2_cf.cuh @@ -25,15 +25,15 @@ /*----------------------------------------------------------------------------*/ - +template __global__ static void _compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, const cs_lnum_2_t *i_face_cells, - const cs_real_3_t *pvar, + const cs_real_t (*restrict pvar)[stride], const cs_real_t *weight, const cs_real_t *c_weight, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict dofij, const cs_real_3_t *restrict i_f_face_normal) { @@ -43,8 +43,8 @@ _compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, return; } - size_t f_idt = f_id / 3; - size_t i = f_id % 3; + size_t f_idt = f_id / stride; + 
size_t i = f_id % stride; cs_lnum_t c_id1, c_id2; cs_real_t pond, ktpond, pfaci, pfacj, rfac; @@ -86,16 +86,16 @@ _compute_reconstruct_v_i_face_v2_cf(cs_lnum_t n_i_faces, - +template __global__ static void _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, - const cs_real_33_t *restrict coefbv, - const cs_real_3_t *restrict coefav, - const cs_real_3_t *restrict pvar, + const cs_real_t (*restrict coefbv)[stride][stride], + const cs_real_t (*restrict coefav)[stride], + const cs_real_t (*restrict pvar)[stride], int inc, const cs_real_3_t *restrict diipb, - const cs_real_33_t *restrict r_grad, - cs_real_33_t *restrict grad, + const cs_real_t (*restrict r_grad)[stride][3], + cs_real_t (*restrict grad)[stride][3], const cs_real_3_t *restrict b_f_face_normal, const cs_lnum_t *restrict b_face_cells) { @@ -105,8 +105,8 @@ _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, return; } - size_t f_idt = f_id / 3; - size_t i = f_id % 3; + size_t f_idt = f_id / stride; + size_t i = f_id % stride; cs_lnum_t c_id; cs_real_t pond, ktpond, pfac, rfac, vecfac; @@ -123,7 +123,7 @@ _compute_reconstruct_v_b_face_v2_cf(cs_lnum_t n_b_faces, // /* Reconstruction part */ rfac = 0.; - for (cs_lnum_t k = 0; k < 3; k++) { + for (cs_lnum_t k = 0; k < stride; k++) { vecfac = r_grad[c_id][k][0] * diipb[f_idt][0] + r_grad[c_id][k][1] * diipb[f_idt][1] + r_grad[c_id][k][2] * diipb[f_idt][2]; From 0f82e6235138fae61d47cee154082604851c9fcc Mon Sep 17 00:00:00 2001 From: aneo-mderbane Date: Fri, 12 Jan 2024 17:11:03 +0100 Subject: [PATCH 70/70] Fix strided version and fix name and accuracy function --- src/alge/cs_gradient.cxx | 6 +-- src/alge/cs_gradient_cuda.cu | 46 +++++++++---------- ..._reconstruct_vector_gradient_gather_v3.cuh | 2 +- ..._reconstruct_vector_gradient_gather_v5.cuh | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/alge/cs_gradient.cxx b/src/alge/cs_gradient.cxx index 27862a2428..ece9a34171 100644 --- a/src/alge/cs_gradient.cxx +++ 
b/src/alge/cs_gradient.cxx @@ -5557,11 +5557,11 @@ res_cpu = !compute_cuda; // Pour l'instant ces lignes sont pour moi // Elles seront à enlever // compute_cuda = true; - compute_cpu = true; + // compute_cpu = true; // res_cpu = false; // A ne pas garder dans la version finale - perf = true; + // perf = false; // accuracy = false; @@ -5777,7 +5777,7 @@ res_cpu = !compute_cuda; if(compute_cuda){ if(compute_cpu){ for (cs_lnum_t c_id = 0; c_id < n_cells; c_id++) { - for (cs_lnum_t i = 0; i < 3; i++) { + for (cs_lnum_t i = 0; i < stride; i++) { for (int j =0; j < 3; ++j) { auto cpu = grad_cpu[c_id][i][j]; auto cuda = grad_gpu[c_id][i][j]; diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 32e8b6abab..df08ace6a0 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -70,7 +70,7 @@ * Recompute cocg at boundaries, using saved cocgb *----------------------------------------------------------------------------*/ -#define INSTANTIATE(name, stride) template void name (const cs_mesh_t *m,\ +#define INSTANTIATE_LSQ(name, stride) template void name (const cs_mesh_t *m,\ const cs_mesh_adjacencies_t *madj,\ const cs_mesh_quantities_t *fvq,\ const cs_halo_type_t halo_type,\ @@ -86,7 +86,7 @@ cs_lnum_t n_c_iter_max,\ cs_real_t c_eps) -#define INSTANTIATE1(name, stride) template void name (const cs_mesh_t *m, \ +#define INSTANTIATE_RECONSTRUCT(name, stride) template void name (const cs_mesh_t *m, \ const cs_mesh_adjacencies_t *madj, \ const cs_mesh_quantities_t *fvq, \ cs_halo_type_t halo_type, \ @@ -1496,9 +1496,9 @@ cs_lsq_vector_gradient_strided_cuda(const cs_mesh_t *m, } -INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 1); -INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 3); -INSTANTIATE(cs_lsq_vector_gradient_strided_cuda, 6); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 1); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 3); +INSTANTIATE_LSQ(cs_lsq_vector_gradient_strided_cuda, 6); @@ -1687,7 +1687,7 @@ 
cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, dofij, i_f_face_normal); - // _compute_reconstruct_v_i_face_v2<<>> + // _compute_reconstruct_v_i_face_v2<<>> // (n_i_faces * 3, // i_face_cells, // pvar_d, @@ -1699,7 +1699,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // i_f_face_normal); /*************************************Kernels Scatter conflict free**************************************/ - // _compute_reconstruct_v_i_face_cf<<>> + // _compute_reconstruct_v_i_face_cf<<>> // (n_i_faces, // i_face_cells, // pvar_d, @@ -1710,7 +1710,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // dofij, // i_f_face_normal); - // _compute_reconstruct_v_i_face_v2_cf<<>> + // _compute_reconstruct_v_i_face_v2_cf<<>> // (n_i_faces * 3, // i_face_cells, // pvar_d, @@ -1722,7 +1722,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // i_f_face_normal); /*************************************Kernels Gather**************************************************/ - // _compute_reconstruct_v_i_face_gather<<>> + // _compute_reconstruct_v_i_face_gather<<>> // ( n_cells, // pvar_d, // weight, @@ -1737,7 +1737,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces_sgn); - // _compute_reconstruct_v_i_face_gather_v2<<>> + // _compute_reconstruct_v_i_face_gather_v2<<>> // ( n_cells * 3 * 3, // pvar_d, // weight, @@ -1754,7 +1754,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather registers memory************************************/ - // _compute_reconstruct_v_i_face_gather_v3<<>> + // _compute_reconstruct_v_i_face_gather_v3<<>> // ( n_cells, // pvar_d, // weight, @@ -1769,7 +1769,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_i_faces_sgn); - // _compute_reconstruct_v_i_face_gather_v4<<>> + // _compute_reconstruct_v_i_face_gather_v4<<>> // ( n_cells * 3 * 3, // pvar_d, // weight, @@ -1843,7 +1843,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t 
*m, b_face_cells); - // _compute_reconstruct_v_b_face_v2<<>> + // _compute_reconstruct_v_b_face_v2<<>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1856,7 +1856,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); /*************************************Kernels Scatter conflict free************************************/ - // _compute_reconstruct_v_b_face_cf<<>> + // _compute_reconstruct_v_b_face_cf<<>> // ( n_b_faces, // coefb_d, // coefa_d, @@ -1868,7 +1868,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); - // _compute_reconstruct_v_b_face_v2_cf<<>> + // _compute_reconstruct_v_b_face_v2_cf<<>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1881,7 +1881,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); /*************************************Kernels Gather**************************************************/ - // _compute_reconstruct_v_b_face_gather<<>> + // _compute_reconstruct_v_b_face_gather<<>> // ( n_b_cells, // coefb_d, // coefa_d, @@ -1896,7 +1896,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); - // _compute_reconstruct_v_b_face_gather_v2<<>> + // _compute_reconstruct_v_b_face_gather_v2<<>> // ( n_b_cells * 3, // coefb_d, // coefa_d, @@ -1911,7 +1911,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); /*************************************Kernels Gather registers memory***************************************/ - // _compute_reconstruct_v_b_face_gather_v3<<>> + // _compute_reconstruct_v_b_face_gather_v3<<>> // ( n_b_cells, // coefb_d, // coefa_d, @@ -1926,7 +1926,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // cell_b_faces_idx); - // _compute_reconstruct_v_b_face_gather_v4<<>> + // _compute_reconstruct_v_b_face_gather_v4<<>> // ( n_b_cells * 3, // coefb_d, // coefa_d, @@ -1960,7 +1960,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, 
CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); - // _compute_reconstruct_correction<<>> + // _compute_reconstruct_correction<<>> // ( n_cells, // has_dc, // c_disable_flag, @@ -2182,6 +2182,6 @@ _gradient_vector_cuda(const cs_mesh_t *mesh, CS_CUDA_CHECK(cudaFree(_bc_coeff_b_d)); } -INSTANTIATE1(cs_reconstruct_vector_gradient_cuda, 1); -INSTANTIATE1(cs_reconstruct_vector_gradient_cuda, 3); -INSTANTIATE1(cs_reconstruct_vector_gradient_cuda, 6); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 1); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 3); +INSTANTIATE_RECONSTRUCT(cs_reconstruct_vector_gradient_cuda, 6); diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh index f4299bafb9..aa53aa9f9e 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v3.cuh @@ -166,7 +166,7 @@ _compute_reconstruct_v_b_face_gather_v3(cs_lnum_t n_b_cells, } for(cs_lnum_t i = 0; i < stride; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - grad[c_id1][i][j] = _grad[i][j]; + grad[c_id][i][j] = _grad[i][j]; } } diff --git a/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh index ee1bc03192..cd7ebe49e1 100644 --- a/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_gather_v5.cuh @@ -185,7 +185,7 @@ _compute_reconstruct_v_b_face_gather_v5(cs_lnum_t n_b_cells, for(cs_lnum_t i = 0; i < stride; i++){ for(cs_lnum_t j = 0; j < 3; j++){ - grad[c_id1][i][j] = _grad[lindex][i][j]; + grad[c_id][i][j] = _grad[lindex][i][j]; } } }