switch more block reads to the production versions

bashbaug · bashbaug · commit badf4c2e5199 · 2025-05-03T21:44:34.000-07:00
diff --git a/samples/99_matrixexperimentstf32/matrix_helpers_tf32.cl b/samples/99_matrixexperimentstf32/matrix_helpers_tf32.cl
@@ -284,125 +284,3 @@ void store_c_rowmajor_fp32_8rNc(global float* C, float8 v, int rowStart, int col
 }
 
 #endif // defined(cl_intel_subgroups)
-
-#ifdef cl_intel_subgroup_2d_block_io
-
-// Note for 2D block reads:
-//  - the tile width and height are encoded into the function name.
-//  - base_address is the byte address.  Must be 64B aligned.
-//  - width is the width of the entire matrix, in bytes.  Must be >= 64B.  Must be 4B aligned.
-//  - height is the height of the entire matrix, or equivalently the number of rows.
-//  - pitch is the number of bytes between rows of the entire matrix.  Must be >= 64B.  Must be a multiple of 8 bytes.
-//  - coord is the number of elements (x coord) and rows (y coord) to read from.  X coord must be a multiple of 4 for 1B data and 2 for 2B data.
-
-// For intrinsics, the pattern is:
-//  - prefix: __builtin_IB_subgroup_block_read_flat or __builtin_IB_subgroup_block_write_flat
-//  - operation (optional): _transpose or _transform
-//  - for no transpose or transform:
-//      - type / elements size: _u8 or _u16 or _u32 or _u64
-//      - number of tile rows: _m32 or _m16 or _m8 or _m4 or _m2 or _m1
-//      - tile width: _k64 or _k32 or _k16 or _k8
-//      - number of tiles: _v2 or _v1
-//  - for transpose:
-//      - type / element size: _u64 or _u32
-//      - number of tile rows: subgroup size (16)
-//      - tile width: _k4 (for _u64) or _k8 (for _u32)
-//      - number of tiles: 1
-//  - for transform:
-//      - type / element size: _u16 or _u8
-//      - number of tile rows: _k32 (for _u8) or _k16 (for _u16)
-//      - tile width: subgroup size (16)
-//      - number of tiles: 1
-
-enum LSC_LDCC {
-    LSC_LDCC_DEFAULT      = 0,
-    LSC_LDCC_L1UC_L3UC    = 1,   // Override to L1 uncached and L3 uncached
-    LSC_LDCC_L1UC_L3C     = 2,   // Override to L1 uncached and L3 cached
-    LSC_LDCC_L1C_L3UC     = 3,   // Override to L1 cached and L3 uncached
-    LSC_LDCC_L1C_L3C      = 4,   // Override to L1 cached and L3 cached
-    LSC_LDCC_L1S_L3UC     = 5,   // Override to L1 streaming load and L3 uncached
-    LSC_LDCC_L1S_L3C      = 6,   // Override to L1 streaming load and L3 cached
-    LSC_LDCC_L1IAR_L3C    = 7,   // Override to L1 invalidate-after-read, and L3 cached
-};
-
-// Define block reads, prefetches, and writes.  These are supported by the hardware but are not in the headers:
-
-uint   __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint2  __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint4  __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint8  __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-
-uint   __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint2  __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint4  __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-uint8  __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-
-uint8  __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
-
-void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint  data);
-void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint2 data);
-void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint4 data);
-void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint8 data);
-
-uint   intel_sub_group_block_read_32b_1r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-uint  intel_sub_group_block_read_32b_2r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
-}
-uint2  intel_sub_group_block_read_32b_4r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
-}
-uint4  intel_sub_group_block_read_32b_8r8c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
-}
-
-uint   intel_sub_group_block_read_32b_1r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-uint2  intel_sub_group_block_read_32b_2r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-uint4  intel_sub_group_block_read_32b_4r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-uint8  intel_sub_group_block_read_32b_8r16c(const __global void *base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-
-uint8 intel_sub_group_block_read_32b_8r8x2c(const __global void* base_address, int width, int height, int pitch, int2 coord)
-{
-    return __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
-}
-
-
-#if !defined(BLOCK_PREFETCH_CACHE_TYPE)
-#define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
-#endif
-
-void intel_sub_group_block_write_32b_1r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
-{
-    __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
-}
-void intel_sub_group_block_write_32b_2r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data)
-{
-    __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
-}
-void intel_sub_group_block_write_32b_4r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data)
-{
-    __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
-}
-void intel_sub_group_block_write_32b_8r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data)
-{
-    __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
-}
-
-#endif // cl_intel_subgroup_2d_block_io
diff --git a/samples/99_matrixexperimentstf32/matrix_kernel_tiled_tf32.cl b/samples/99_matrixexperimentstf32/matrix_kernel_tiled_tf32.cl
@@ -147,7 +147,7 @@ void HELPER_NAME(atile_block_load_rowmajor, MM, NN)(global float* A, int tM, int
 {
     for (int kk = 0; kk < KK; kk++) {
         for (int mm = 0; mm < MM; mm++) {
-            aData[kk][mm] = as_float4(intel_sub_group_block_read_32b_8r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM)));
+            intel_sub_group_2d_block_read_32b_8r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM), (uint*)&aData[kk][mm]);
         }
     }
 }
@@ -156,7 +156,7 @@ void HELPER_NAME(btile_block_load_rowmajor, MM, NN)(global float* B, int tN, int
 {
     for (int kk = 0; kk < KK; kk++) {
         for (int nn = 0; nn < NN; nn++) {
-            bData[nn][kk] = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK)));
+            intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK), (uint*)&bData[nn][kk]);
         }
     }
 }
@@ -217,7 +217,7 @@ kernel void MM_KERNEL_NAME(tf32_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(gl
     for (int mm = 0; mm < MM; mm++) {
         for (int nn = 0; nn < NN; nn++) {
             sum[nn][mm] = activation(sum[nn][mm]);
-            intel_sub_group_block_write_32b_8r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[nn][mm]));
+            intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), (uint*)&sum[nn][mm]);
         }
     }
 }
diff --git a/samples/99_matrixexperimentstf32/matrix_kernels_tf32.cl b/samples/99_matrixexperimentstf32/matrix_kernels_tf32.cl
@@ -127,13 +127,15 @@ kernel void tf32_dpas_blockread_rowmajor_m1_n16(global float* C, global float* A
 
     float sum = 0;
     for (int k = 0; k < K; k += tK) {
-        float   aData = as_float(intel_sub_group_block_read_32b_1r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
-        float8  bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
+        float   aData;
+        intel_sub_group_2d_block_read_32b_1r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
+        float8  bData;
+        intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
         sum = mat_mul_sg16(aData, bData, sum);
     }
 
     sum = activation(sum);
-    intel_sub_group_block_write_32b_1r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
+    intel_sub_group_2d_block_write_32b_1r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
 }
 
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -149,13 +151,15 @@ kernel void tf32_dpas_blockread_rowmajor_m2_n16(global float* C, global float* A
 
     float2 sum = 0;
     for (int k = 0; k < K; k += tK) {
-        float   aData = as_float(intel_sub_group_block_read_32b_2r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
-        float8  bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
+        float   aData;
+        intel_sub_group_2d_block_read_32b_2r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
+        float8  bData;
+        intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
         sum = mat_mul_sg16(aData, bData, sum);
     }
 
     sum = activation(sum);
-    intel_sub_group_block_write_32b_2r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
+    intel_sub_group_2d_block_write_32b_2r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
 }
 
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -171,13 +175,15 @@ kernel void tf32_dpas_blockread_rowmajor_m4_n16(global float* C, global float* A
 
     float4 sum = 0;
     for (int k = 0; k < K; k += tK) {
-        float2  aData = as_float2(intel_sub_group_block_read_32b_4r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
-        float8  bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
+        float2  aData;
+        intel_sub_group_2d_block_read_32b_4r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
+        float8  bData;
+        intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
         sum = mat_mul_sg16(aData, bData, sum);
     }
 
     sum = activation(sum);
-    intel_sub_group_block_write_32b_4r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
+    intel_sub_group_2d_block_write_32b_4r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
 }
 
 __attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -193,13 +199,15 @@ kernel void tf32_dpas_blockread_rowmajor_m8_n16(global float* C, global float* A
 
     float8 sum = 0;
     for (int k = 0; k < K; k += tK) {
-        float4  aData = as_float4(intel_sub_group_block_read_32b_8r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m)));
-        float8  bData = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k)));
+        float4  aData;
+        intel_sub_group_2d_block_read_32b_8r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k, m), (uint*)&aData);
+        float8  bData;
+        intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n, k), (uint*)&bData);
         sum = mat_mul_sg16(aData, bData, sum);
     }
 
     sum = activation(sum);
-    intel_sub_group_block_write_32b_8r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
+    intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
 }
 
 #endif // cl_intel_subgroup_2d_block_io

Original file line number	Diff line number	Diff line change
`@@ -147,7 +147,7 @@ void HELPER_NAME(atile_block_load_rowmajor, MM, NN)(global float* A, int tM, int`
`147`	`147`	`{`
`148`	`148`	`for (int kk = 0; kk < KK; kk++) {`
`149`	`149`	`for (int mm = 0; mm < MM; mm++) {`
`150`		`- aData[kk][mm] = as_float4(intel_sub_group_block_read_32b_8r8c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM)));`
	`150`	`+ intel_sub_group_2d_block_read_32b_8r8x1c(A, K * sizeof(float), M, K * sizeof(float), (int2)(k + kk * tK, m + mm * tM), (uint*)&aData[kk][mm]);`
`151`	`151`	`}`
`152`	`152`	`}`
`153`	`153`	`}`
`@@ -156,7 +156,7 @@ void HELPER_NAME(btile_block_load_rowmajor, MM, NN)(global float* B, int tN, int`
`156`	`156`	`{`
`157`	`157`	`for (int kk = 0; kk < KK; kk++) {`
`158`	`158`	`for (int nn = 0; nn < NN; nn++) {`
`159`		`- bData[nn][kk] = as_float8(intel_sub_group_block_read_32b_8r16c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK)));`
	`159`	`+ intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(float), K, N * sizeof(float), (int2)(n + nn * tN, k + kk * tK), (uint*)&bData[nn][kk]);`
`160`	`160`	`}`
`161`	`161`	`}`
`162`	`162`	`}`
`@@ -217,7 +217,7 @@ kernel void MM_KERNEL_NAME(tf32_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN)(gl`
`217`	`217`	`for (int mm = 0; mm < MM; mm++) {`
`218`	`218`	`for (int nn = 0; nn < NN; nn++) {`
`219`	`219`	`sum[nn][mm] = activation(sum[nn][mm]);`
`220`		`- intel_sub_group_block_write_32b_8r16c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), as_uint8(sum[nn][mm]));`
	`220`	`+ intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n + nn * tN, m + mm * tM), (uint*)&sum[nn][mm]);`
`221`	`221`	`}`
`222`	`222`	`}`
`223`	`223`	`}`