Describe the bug
I implemented the attached kernel for matrix multiplication. It works on an NVIDIA A30 and returns correct results when I compare them to a naive CPU implementation. On NVIDIA, performance reaches up to 5 TFLOPS depending on the matrix size.

If I compile the kernel for the AMD MI210, performance drops significantly, to ≈ 0.5 GFLOPS. Furthermore, the results of the kernel are wrong when compared to the naive CPU implementation.

A second issue: if I increase BLOCK_SIZE to 32, the AMD kernel crashes with a core dump, whereas on NVIDIA the kernel still executes correctly and, as expected, even faster with the larger block size.

My question is whether anyone has experienced the same problems when using the joint_matrix extension on AMD, especially the wrong results, or whether this is a user error.
To reproduce
```c++
// Includes/usings added for completeness; fill_matrix_random and
// matrix_multiply_CPU are helpers defined elsewhere.
#include <sycl/sycl.hpp>
#include <chrono>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

template <unsigned int matrix_Size, typename TOperand, typename TResult,
          size_t tileM, size_t tileN, size_t tileK, class kernel_name>
void compute_joint_fast(size_t sg_Size, queue &q, int testIterations) {
  double t;
  constexpr int TILE_SIZE = 16;
  constexpr int BLOCK_SIZE = 16;
  // Round matrix_Size up to the next multiple of BLOCK_SIZE,
  // e.g. matrix_Size = 1000 -> padded_N = 1008.
  const int padded_N =
      ((matrix_Size % BLOCK_SIZE) == 0)
          ? matrix_Size
          : matrix_Size + (BLOCK_SIZE - (matrix_Size % BLOCK_SIZE));
  TOperand *A = malloc_shared<TOperand>(padded_N * padded_N, q);
  TOperand *B = malloc_shared<TOperand>(padded_N * padded_N, q);
  TResult *C = malloc_shared<TResult>(padded_N * padded_N, q);
  // Initialize; fill matrices
  fill_matrix_random(padded_N, matrix_Size, A);
  fill_matrix_random(padded_N, matrix_Size, B);
#ifdef CHECKRESULTS
  TResult *refC = malloc_shared<TResult>(padded_N * padded_N, q);
  matrix_multiply_CPU(A, B, refC, padded_N);
#endif
  range<2> global{padded_N / TILE_SIZE, (padded_N / TILE_SIZE) * sg_Size};
  range<2> local{BLOCK_SIZE / TILE_SIZE, BLOCK_SIZE / TILE_SIZE * sg_Size};
  std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  for (int i = 0; i < testIterations; i++) {
    event mk = q.submit([&](handler &h) {
      h.parallel_for<kernel_name>(
          nd_range<2>{global, local},
          [=](nd_item<2> it) [[sycl::reqd_sub_group_size(64)]] {
            // Undecorated global-space pointers for joint_matrix_load/store.
            auto pA = address_space_cast<access::address_space::global_space,
                                         access::decorated::no>(A);
            auto pB = address_space_cast<access::address_space::global_space,
                                         access::decorated::no>(B);
            auto pC = address_space_cast<access::address_space::global_space,
                                         access::decorated::no>(C);
            auto m2 = it.get_group(0);
            auto n2 = it.get_group(1);
            auto m1 = it.get_local_id(0);
            auto n1 = it.get_local_id(1);
            auto sg = it.get_sub_group();
            // Each sub-group computes one tileM x tileN tile of C.
            int warpM = (n2 * it.get_local_range(1) + n1) / sg_Size;
            int warpN = (m2 * it.get_local_range(0) + m1);
            joint_matrix<sub_group, TResult, use::accumulator, tileM, tileN> tC;
            joint_matrix<sub_group, TOperand, use::a, tileM, tileK,
                         layout::row_major> tA;
            joint_matrix<sub_group, TOperand, use::b, tileK, tileN,
                         layout::row_major> tB;
            joint_matrix_fill(sg, tC, 0);
            // Walk along the K dimension in steps of tileK.
            for (int k = 0; k < padded_N; k += tileK) {
              int aCol = k;
              int aRow = warpM * tileM;
              int bCol = k;
              int bRow = warpN * tileN;
              joint_matrix_load(sg, tA, pA + aCol + aRow * padded_N, padded_N);
              joint_matrix_load(sg, tB, pB + bRow + bCol * padded_N, padded_N);
              joint_matrix_mad(sg, tC, tA, tB, tC);
            }
            int cCol = warpN * tileN;
            int cRow = warpM * tileM;
            joint_matrix_store(sg, tC, pC + cCol + cRow * padded_N, padded_N,
                               layout::row_major);
          });
    });
  }
  q.wait();
  // (timing with t/start, the result check against refC, and frees are
  //  omitted from this snippet)
}
```
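The CHECKRESULTS comparison itself is omitted above; a minimal sketch of such an element-wise check, assuming a simple absolute tolerance (the 1e-3 value is arbitrary, and `<cmath>`/`<iostream>` are needed), would be:

```c++
#ifdef CHECKRESULTS
  // Element-wise comparison of device result C against CPU reference refC.
  // The tolerance is an assumption; pick one that fits TOperand/TResult.
  const double tol = 1e-3;
  for (int row = 0; row < padded_N; ++row)
    for (int col = 0; col < padded_N; ++col) {
      double diff = std::abs(double(C[row * padded_N + col]) -
                             double(refC[row * padded_N + col]));
      if (diff > tol)
        std::cout << "mismatch at (" << row << "," << col << "): "
                  << C[row * padded_N + col] << " vs "
                  << refC[row * padded_N + col] << "\n";
    }
#endif
```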
Compile:

```sh
icpx -O3 -fsycl -fsycl-targets=amd_gpu_gfx90a translateCuda.cpp
```
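For the A30 comparison, the same file is built for CUDA. A sketch of that compile line, assuming the Codeplay oneAPI for NVIDIA GPUs plugin is installed and sm_80 for the A30 (note that `reqd_sub_group_size` has to be 32 on NVIDIA, since its sub-groups are 32 wide):

```sh
icpx -O3 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 translateCuda.cpp
```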
Environment
- OS: Linux
- Target device and vendor: AMD MI210
- DPC++ version: Intel(R) oneAPI DPC++/C++ Compiler 2025.0.0 (2025.0.0.20241008)
Additional context
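One thing that may help triage: the sycl_ext_oneapi_matrix extension exposes a `matrix_combinations` device query. A small standalone program along these lines (a sketch; the output formatting is mine) would show which tile shapes and sub-group sizes the runtime actually reports for the MI210, to rule out an unsupported tileM/tileN/tileK combination:

```c++
#include <sycl/sycl.hpp>
#include <iostream>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue q;
  sycl::device dev = q.get_device();
  std::cout << "Device: " << dev.get_info<sycl::info::device::name>() << "\n";

  // Sub-group sizes the device supports (the MI210 should report 64).
  for (size_t s : dev.get_info<sycl::info::device::sub_group_sizes>())
    std::cout << "  sub-group size: " << s << "\n";

  // joint_matrix tile shapes the device supports.
  for (const auto &c :
       dev.get_info<syclex::info::device::matrix_combinations>())
    std::cout << "  M=" << c.msize << " N=" << c.nsize << " K=" << c.ksize
              << "\n";
  return 0;
}
```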