@@ -284,125 +284,3 @@ void store_c_rowmajor_fp32_8rNc(global float* C, float8 v, int rowStart, int col
284
284
}
285
285
286
286
#endif // defined(cl_intel_subgroups)
287
-
288
- #ifdef cl_intel_subgroup_2d_block_io
289
-
290
- // Note for 2D block reads:
291
- // - the tile width and height are encoded into the function name.
292
- // - base_address is the byte address. Must be 64B aligned.
293
- // - width is the width of the entire matrix, in bytes. Must be >= 64B. Must be 4B aligned.
294
- // - height is the height of the entire matrix, or equivalently the number of rows.
295
- // - pitch is the number of bytes between rows of the entire matrix. Must be >= 64B. Must be a multiple of 8 bytes.
296
- // - coord is the number of elements (x coord) and row (y coord) to read from. X coord must be a multiple of 4 for 1B data and 2 for 2B data.
297
-
298
- // For intrinsics, the pattern is:
299
- // - prefix: __builtin_IB_subgroup_block_read_flat or __builtin_IB_subgroup_block_write_flat
300
- // - operation (optional): _transpose or _transform
301
- // - for no transpose or transform:
302
- // - type / elements size: _u8 or _u16 or _u32 or _u64
303
- // - number of tile rows: _m32 or _m16 or _m8 or _m4 or _m2 or _m1
304
- // - tile width: _k64 or _k32 or _k16 or _k8
305
- // - number of tiles: _v2 or _v1
306
- // - for transpose:
307
- // - type / element size: _u64 or _u32
308
- // - number of tile rows: subgroup size (16)
309
- // - tile width: _k4 (for _u64) or _k8 (for _u32)
310
- // - number of tiles: 1
311
- // - for transform:
312
- // - type / element size: _u16 or _u8
313
- // - number of tile rows: _k32 (for _u8) or _k16 (for _u16)
314
- // - tile width: subgroup size (16)
315
- // - number of tiles: 1
316
-
317
- enum LSC_LDCC { // Cache-control override selectors; every enumerator has an explicit numeric value (0..7).
318
- LSC_LDCC_DEFAULT = 0 , // NOTE(review): presumably means "no override" - confirm against the compiler documentation
319
- LSC_LDCC_L1UC_L3UC = 1 , // Override to L1 uncached and L3 uncached
320
- LSC_LDCC_L1UC_L3C = 2 , // Override to L1 uncached and L3 cached
321
- LSC_LDCC_L1C_L3UC = 3 , // Override to L1 cached and L3 uncached
322
- LSC_LDCC_L1C_L3C = 4 , // Override to L1 cached and L3 cached
323
- LSC_LDCC_L1S_L3UC = 5 , // Override to L1 streaming load and L3 uncached
324
- LSC_LDCC_L1S_L3C = 6 , // Override to L1 streaming load and L3 cached
325
- LSC_LDCC_L1IAR_L3C = 7 , // Override to L1 invalidate-after-read, and L3 cached
326
- };
327
-
328
- // Define block reads, prefetches, and writes. These are supported by the hardware but are not in the headers:
329
-
330
- uint __builtin_IB_subgroup_block_read_flat_u32_m1k8v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord ); // Reads, tile width 8 (_k8), one tile (_v1): 1/2/4/8 rows return uint/uint2/uint4/uint8.
331
- uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
332
- uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
333
- uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
334
-
335
- uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord ); // Reads, tile width 16 (_k16), one tile (_v1): 1/2/4/8 rows return uint/uint2/uint4/uint8.
336
- uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
337
- uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
338
- uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
339
-
340
- uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord ); // Read, 8 rows, tile width 8, two tiles (_v2).
341
-
342
- void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint data ); // Writes, tile width 16 (_k16), one tile (_v1): the vector width of `data` matches the row count (1/2/4/8).
343
- void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint2 data );
344
- void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint4 data );
345
- void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint8 data );
346
-
347
- uint intel_sub_group_block_read_32b_1r8c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m1k8v1 block-read builtin; returns its uint result unchanged.
348
- {
349
- return __builtin_IB_subgroup_block_read_flat_u32_m1k8v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
350
- }
351
- uint intel_sub_group_block_read_32b_2r8c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Wrapper over the m2k8v1 block-read builtin; returns a uint although the builtin yields a uint2.
352
- {
353
- return __builtin_IB_subgroup_block_read_flat_u32_m2k8v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ).lo ; // Keeps only the .lo half of the result. NOTE(review): presumably because an 8-wide tile fills half of a 16-wide subgroup - confirm.
354
- }
355
- uint2 intel_sub_group_block_read_32b_4r8c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Wrapper over the m4k8v1 block-read builtin; returns a uint2 although the builtin yields a uint4.
355
- {
356
- return __builtin_IB_subgroup_block_read_flat_u32_m4k8v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ).lo ; // Keeps only the .lo half of the result. NOTE(review): presumably because an 8-wide tile fills half of a 16-wide subgroup - confirm.
357
- }
359
- uint4 intel_sub_group_block_read_32b_8r8c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Wrapper over the m8k8v1 block-read builtin; returns a uint4 although the builtin yields a uint8.
358
- {
359
- return __builtin_IB_subgroup_block_read_flat_u32_m8k8v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ).lo ; // Keeps only the .lo half of the result. NOTE(review): presumably because an 8-wide tile fills half of a 16-wide subgroup - confirm.
360
- }
363
-
364
- uint intel_sub_group_block_read_32b_1r16c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m1k16v1 block-read builtin; returns its uint result unchanged.
364
- {
365
- return __builtin_IB_subgroup_block_read_flat_u32_m1k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
366
- }
368
- uint2 intel_sub_group_block_read_32b_2r16c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m2k16v1 block-read builtin; returns its uint2 result unchanged.
368
- {
369
- return __builtin_IB_subgroup_block_read_flat_u32_m2k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
370
- }
372
- uint4 intel_sub_group_block_read_32b_4r16c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m4k16v1 block-read builtin; returns its uint4 result unchanged.
372
- {
373
- return __builtin_IB_subgroup_block_read_flat_u32_m4k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
374
- }
376
- uint8 intel_sub_group_block_read_32b_8r16c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m8k16v1 block-read builtin; returns its uint8 result unchanged.
376
- {
377
- return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
378
- }
380
-
381
- uint8 intel_sub_group_block_read_32b_8r8x2c (const __global void * base_address , int width , int height , int pitch , int2 coord ) // Thin wrapper over the m8k8v2 (two-tile) block-read builtin; returns the full uint8 result, with no .lo truncation.
381
- {
382
- return __builtin_IB_subgroup_block_read_flat_u32_m8k8v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ); // The builtin takes the address as a long and width/height/pitch each minus one.
383
- }
385
-
386
-
387
- #if !defined(BLOCK_PREFETCH_CACHE_TYPE ) // Default applies only when the build has not already defined it: LSC_LDCC_L1C_L3C (L1 cached, L3 cached).
387
- #define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
388
- #endif
390
-
391
- void intel_sub_group_block_write_32b_1r16c (__global void * base_address , int width , int height , int pitch , int2 coord , uint data ) // Thin wrapper over the m1k16v1 block-write builtin; forwards the uint `data` unchanged.
391
- {
392
- __builtin_IB_subgroup_block_write_flat_u32_m1k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data ); // The builtin takes the address as a long and width/height/pitch each minus one.
393
- }
395
- void intel_sub_group_block_write_32b_2r16c (__global void * base_address , int width , int height , int pitch , int2 coord , uint2 data ) // Thin wrapper over the m2k16v1 block-write builtin; forwards the uint2 `data` unchanged.
395
- {
396
- __builtin_IB_subgroup_block_write_flat_u32_m2k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data ); // The builtin takes the address as a long and width/height/pitch each minus one.
397
- }
399
- void intel_sub_group_block_write_32b_4r16c (__global void * base_address , int width , int height , int pitch , int2 coord , uint4 data ) // Thin wrapper over the m4k16v1 block-write builtin; forwards the uint4 `data` unchanged.
399
- {
400
- __builtin_IB_subgroup_block_write_flat_u32_m4k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data ); // The builtin takes the address as a long and width/height/pitch each minus one.
401
- }
403
- void intel_sub_group_block_write_32b_8r16c (__global void * base_address , int width , int height , int pitch , int2 coord , uint8 data ) // Thin wrapper over the m8k16v1 block-write builtin; forwards the uint8 `data` unchanged.
403
- {
404
- __builtin_IB_subgroup_block_write_flat_u32_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data ); // The builtin takes the address as a long and width/height/pitch each minus one.
405
- }
407
-
408
- #endif // cl_intel_subgroup_2d_block_io
0 commit comments