Skip to content

Commit badf4c2

Browse files
committed
switch more block reads to the production versions
1 parent 62a1fd8 commit badf4c2

File tree

3 files changed

+23
-137
lines changed

3 files changed

+23
-137
lines changed

samples/99_matrixexperimentstf32/matrix_helpers_tf32.cl

-122
Original file line numberDiff line numberDiff line change
@@ -284,125 +284,3 @@ void store_c_rowmajor_fp32_8rNc(global float* C, float8 v, int rowStart, int col
284284
}
285285

286286
#endif // defined(cl_intel_subgroups)
287-
288-
#ifdef cl_intel_subgroup_2d_block_io
289-
290-
// Note for 2D block reads:
291-
// - the tile width and height are encoded into the function name.
292-
// - base_address is the byte address. Must be 64B aligned.
293-
// - width is the width of the entire matrix, in bytes. Must be >= 64B. Must be 4B aligned.
294-
// - height is the height of the entire matrix, or equivalently the number of rows.
295-
// - pitch is the number of bytes between rows of the entire matrix. Must be >= 64B. Must be a multiple of 8 bytes.
296-
// - coord is the element offset (x coord) and row (y coord) to read from. X coord must be a multiple of 4 for 1B data and a multiple of 2 for 2B data.
297-
298-
// For intrinsics, the pattern is:
299-
// - prefix: __builtin_IB_subgroup_block_read_flat or __builtin_IB_subgroup_block_write_flat
300-
// - operation (optional): _transpose or _transform
301-
// - for no transpose or transform:
302-
// - type / element size: _u8 or _u16 or _u32 or _u64
303-
// - number of tile rows: _m32 or _m16 or _m8 or _m4 or _m2 or _m1
304-
// - tile width: _k64 or _k32 or _k16 or _k8
305-
// - number of tiles: _v2 or _v1
306-
// - for transpose:
307-
// - type / element size: _u64 or _u32
308-
// - number of tile rows: subgroup size (16)
309-
// - tile width: _k4 (for _u64) or _k8 (for _u32)
310-
// - number of tiles: 1
311-
// - for transform:
312-
// - type / element size: _u16 or _u8
313-
// - number of tile rows: _k32 (for _u8) or _k16 (for _u16)
314-
// - tile width: subgroup size (16)
315-
// - number of tiles: 1
316-
317-
enum LSC_LDCC {
318-
LSC_LDCC_DEFAULT = 0,
319-
LSC_LDCC_L1UC_L3UC = 1, // Override to L1 uncached and L3 uncached
320-
LSC_LDCC_L1UC_L3C = 2, // Override to L1 uncached and L3 cached
321-
LSC_LDCC_L1C_L3UC = 3, // Override to L1 cached and L3 uncached
322-
LSC_LDCC_L1C_L3C = 4, // Override to L1 cached and L3 cached
323-
LSC_LDCC_L1S_L3UC = 5, // Override to L1 streaming load and L3 uncached
324-
LSC_LDCC_L1S_L3C = 6, // Override to L1 streaming load and L3 cached
325-
LSC_LDCC_L1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached
326-
};
327-
328-
// Define block reads, prefetches, and writes. These are supported by the hardware but are not in the headers:
329-
330-
uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
331-
uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
332-
uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
333-
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
334-
335-
uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
336-
uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
337-
uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
338-
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
339-
340-
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
341-
342-
void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint data);
343-
void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint2 data);
344-
void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint4 data);
345-
void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint8 data);
346-
347-
uint intel_sub_group_block_read_32b_1r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
348-
{
349-
return __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
350-
}
351-
uint intel_sub_group_block_read_32b_2r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
352-
{
353-
return __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
354-
}
355-
uint2 intel_sub_group_block_read_32b_4r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
356-
{
357-
return __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
358-
}
359-
uint4 intel_sub_group_block_read_32b_8r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
360-
{
361-
return __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
362-
}
363-
364-
uint intel_sub_group_block_read_32b_1r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
365-
{
366-
return __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
367-
}
368-
uint2 intel_sub_group_block_read_32b_2r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
369-
{
370-
return __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
371-
}
372-
uint4 intel_sub_group_block_read_32b_4r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
373-
{
374-
return __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
375-
}
376-
uint8 intel_sub_group_block_read_32b_8r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
377-
{
378-
return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
379-
}
380-
381-
uint8 intel_sub_group_block_read_32b_8r8x2c(const __global void* base_address, int width, int height, int pitch, int2 coord)
382-
{
383-
return __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
384-
}
385-
386-
387-
#if !defined(BLOCK_PREFETCH_CACHE_TYPE)
388-
#define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
389-
#endif
390-
391-
void intel_sub_group_block_write_32b_1r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
392-
{
393-
__builtin_IB_subgroup_block_write_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
394-
}
395-
void intel_sub_group_block_write_32b_2r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data)
396-
{
397-
__builtin_IB_subgroup_block_write_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
398-
}
399-
void intel_sub_group_block_write_32b_4r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data)
400-
{
401-
__builtin_IB_subgroup_block_write_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
402-
}
403-
void intel_sub_group_block_write_32b_8r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data)
404-
{
405-
__builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
406-
}
407-
408-
#endif // cl_intel_subgroup_2d_block_io

samples/99_matrixexperimentstf32/matrix_kernel_tiled_tf32.cl

+3-3
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ void HELPER_NAME(atile_block_load_rowmajor, MM, NN)(global float* A, int tM, int
147147
{
148148
for (int kk = 0; kk < KK; kk++) {
149149
for (int mm = 0; mm < MM; mm++) {
150-
aData[kk][mm] = as_float4(intel_sub_group_block_read_32b_8r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM)));
150+
intel_sub_group_2d_block_read_32b_8r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM), (uint*)&aData[kk][mm]);
151151
}
152152
}
153153
}
@@ -156,7 +156,7 @@ void HELPER_NAME(btile_block_load_rowmajor, MM, NN)(global float* B, int tN, int
156156
{
157157
for (int kk = 0; kk < KK; kk++) {
158158
for (int nn = 0; nn < NN; nn++) {
159-
bData[nn][kk] = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK)));
159+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK), (uint*)&bData[nn][kk]);
160160
}
161161
}
162162
}
@@ -217,7 +217,7 @@ kernel void MM_KERNEL_NAME(tf32_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(gl
217217
for (int mm = 0; mm < MM; mm++) {
218218
for (int nn = 0; nn < NN; nn++) {
219219
sum[nn][mm] = activation(sum[nn][mm]);
220-
intel_sub_group_block_write_32b_8r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[nn][mm]));
220+
intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), (uint*)&sum[nn][mm]);
221221
}
222222
}
223223
}

samples/99_matrixexperimentstf32/matrix_kernels_tf32.cl

+20-12
Original file line numberDiff line numberDiff line change
@@ -127,13 +127,15 @@ kernel void tf32_dpas_blockread_rowmajor_m1_n16(global float* C, global float* A
127127

128128
float sum = 0;
129129
for (int k = 0; k < K; k += tK) {
130-
float aData = as_float(intel_sub_group_block_read_32b_1r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
131-
float8 bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
130+
float aData;
131+
intel_sub_group_2d_block_read_32b_1r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
132+
float8 bData;
133+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
132134
sum = mat_mul_sg16(aData, bData, sum);
133135
}
134136

135137
sum = activation(sum);
136-
intel_sub_group_block_write_32b_1r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
138+
intel_sub_group_2d_block_write_32b_1r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
137139
}
138140

139141
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -149,13 +151,15 @@ kernel void tf32_dpas_blockread_rowmajor_m2_n16(global float* C, global float* A
149151

150152
float2 sum = 0;
151153
for (int k = 0; k < K; k += tK) {
152-
float aData = as_float(intel_sub_group_block_read_32b_2r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
153-
float8 bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
154+
float aData;
155+
intel_sub_group_2d_block_read_32b_2r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
156+
float8 bData;
157+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
154158
sum = mat_mul_sg16(aData, bData, sum);
155159
}
156160

157161
sum = activation(sum);
158-
intel_sub_group_block_write_32b_2r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
162+
intel_sub_group_2d_block_write_32b_2r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
159163
}
160164

161165
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -171,13 +175,15 @@ kernel void tf32_dpas_blockread_rowmajor_m4_n16(global float* C, global float* A
171175

172176
float4 sum = 0;
173177
for (int k = 0; k < K; k += tK) {
174-
float2 aData = as_float2(intel_sub_group_block_read_32b_4r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
175-
float8 bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
178+
float2 aData;
179+
intel_sub_group_2d_block_read_32b_4r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
180+
float8 bData;
181+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
176182
sum = mat_mul_sg16(aData, bData, sum);
177183
}
178184

179185
sum = activation(sum);
180-
intel_sub_group_block_write_32b_4r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
186+
intel_sub_group_2d_block_write_32b_4r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
181187
}
182188

183189
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -193,13 +199,15 @@ kernel void tf32_dpas_blockread_rowmajor_m8_n16(global float* C, global float* A
193199

194200
float8 sum = 0;
195201
for (int k = 0; k < K; k += tK) {
196-
float4 aData = as_float4(intel_sub_group_block_read_32b_8r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
197-
float8 bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
202+
float4 aData;
203+
intel_sub_group_2d_block_read_32b_8r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
204+
float8 bData;
205+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
198206
sum = mat_mul_sg16(aData, bData, sum);
199207
}
200208

201209
sum = activation(sum);
202-
intel_sub_group_block_write_32b_8r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
210+
intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
203211
}
204212

205213
#endif // cl_intel_subgroup_2d_block_io

0 commit comments

Comments
 (0)