Skip to content

Commit 08d5986

Browse files
[SYCL] Optimize mul_mat for Q4_0 on Intel GPU (ggml-org#12035)
* opt performance by reorder for Intel GPU * detect hw type and save opt feature, and print opt feature * correct name * support optimize graph once when compute graph, record the opt status in tensor->extra, make CI passed * add env variable GGML_SYCL_DISABLE_OPT for debug * use syclex::architecture replace the custom hw define, update the guide for GGML_SYCL_DISABLE_OPT * add performance data * mv getrows functions to separated files * fix global variables --------- Co-authored-by: arthw <[email protected]>
1 parent 651adf4 commit 08d5986

14 files changed

+803
-266
lines changed

docs/backend/SYCL.md

+14-2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ The following release is verified with good quality:
4242

4343
## News
4444

45+
- 2025.2
46+
- Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
47+
|GPU|Base tokens/s|Increased tokens/s|Percent|
48+
|-|-|-|-|
49+
|PVC 1550|39|73|+87%|
50+
|Flex 170|39|50|+28%|
51+
|Arc770|42|55|+30%|
52+
|MTL|13|16|+23%|
53+
|ARL-H|14|17|+21%|
54+
4555
- 2024.11
4656
- Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
4757

@@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family:
97107
| Intel Data Center Max Series | Support | Max 1550, 1100 |
98108
| Intel Data Center Flex Series | Support | Flex 170 |
99109
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
100-
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
101-
| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
110+
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake |
111+
| Intel iGPU | Support | iGPU in 13700k, iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
102112

103113
*Notes:*
104114

@@ -660,8 +670,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
660670
| Name | Value | Function |
661671
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
662672
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
673+
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimization features based on Intel GPU type, to compare the performance increase |
663674
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
664675

676+
665677
## Known Issues
666678

667679
- `Split-mode:[row]` is not supported.

examples/sycl/run-llama2.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# MIT license
44
# Copyright (C) 2024 Intel Corporation
55
# SPDX-License-Identifier: MIT
6-
6+
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
77
source /opt/intel/oneapi/setvars.sh
88

99
#export GGML_SYCL_DEBUG=1
@@ -13,7 +13,7 @@ source /opt/intel/oneapi/setvars.sh
1313
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
1414
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
1515
NGL=33
16-
CONEXT=8192
16+
CONEXT=4096
1717

1818
if [ $# -gt 0 ]; then
1919
GGML_SYCL_DEVICE=$1

ggml/src/ggml-sycl/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
2+
13
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
24
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
35
endif()

ggml/src/ggml-sycl/common.cpp

+17
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
9999
<< ", line:" << __LINE__ << std::endl;
100100
std::exit(1);
101101
}
102+
103+
104+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
105+
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
106+
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
107+
if (extra->events[i][is] != nullptr) {
108+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
109+
}
110+
}
111+
if (extra->data_device[i] != nullptr && streams.size()>0) {
112+
ggml_sycl_set_device(i);
113+
SYCL_CHECK(
114+
CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
115+
}
116+
}
117+
delete extra;
118+
}

ggml/src/ggml-sycl/common.hpp

+49-9
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
#include "dpct/helper.hpp"
2020
#include "ggml-sycl.h"
2121
#include "presets.hpp"
22+
#include "sycl_hw.hpp"
23+
24+
2225
#if GGML_SYCL_DNNL
2326
#include "dnnl.hpp"
2427
#include "dnnl_sycl.hpp"
@@ -35,7 +38,10 @@
3538
void* ggml_sycl_host_malloc(size_t size);
3639
void ggml_sycl_host_free(void* ptr);
3740

41+
3842
extern int g_ggml_sycl_debug;
43+
extern int g_ggml_sycl_disable_optimize;
44+
3945
#define GGML_SYCL_DEBUG(...) \
4046
do { \
4147
if (g_ggml_sycl_debug) \
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
182188
}
183189

184190
//////////////////////
191+
struct optimize_feature {
192+
bool reorder=false;
193+
};
194+
195+
struct sycl_device_info {
196+
int cc; // compute capability
197+
// int nsm; // number of streaming multiprocessors
198+
// size_t smpb; // max. shared memory per block
199+
bool vmm; // virtual memory support
200+
size_t total_vram;
201+
sycl_hw_info hw_info;
202+
optimize_feature opt_feature;
203+
};
204+
185205

186206
struct ggml_sycl_device_info {
187207
int device_count;
188208

189-
struct sycl_device_info {
190-
int cc; // compute capability
191-
// int nsm; // number of streaming multiprocessors
192-
// size_t smpb; // max. shared memory per block
193-
bool vmm; // virtual memory support
194-
size_t total_vram;
195-
};
196-
197209
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
198210

199211
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu {
260272
// tensors
261273
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
262274
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
275+
optimize_feature optimized_feature;
263276
};
264277

278+
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
279+
280+
inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
281+
optimize_feature opt;
282+
283+
opt.reorder =
284+
(arch == syclex::architecture::intel_gpu_dg1 ||
285+
arch == syclex::architecture::intel_gpu_acm_g10 ||
286+
arch == syclex::architecture::intel_gpu_acm_g11 ||
287+
arch == syclex::architecture::intel_gpu_acm_g12 ||
288+
arch == syclex::architecture::intel_gpu_pvc ||
289+
arch == syclex::architecture::intel_gpu_pvc_vg ||
290+
arch == syclex::architecture::intel_gpu_mtl_u ||
291+
arch == syclex::architecture::intel_gpu_mtl_s ||
292+
arch == syclex::architecture::intel_gpu_mtl_h ||
293+
arch == syclex::architecture::intel_gpu_arl_u ||
294+
arch == syclex::architecture::intel_gpu_arl_s ||
295+
arch == syclex::architecture::intel_gpu_arl_h ||
296+
arch == syclex::architecture::intel_gpu_bmg_g21 ||
297+
arch == syclex::architecture::intel_gpu_lnl_m
298+
);
299+
300+
return opt;
301+
}
302+
265303
struct ggml_backend_sycl_context {
266304
int device;
267305
std::string name;
306+
optimize_feature opt_feature;
307+
bool optimized_graph=false;
268308

269309
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
270310

271311
explicit ggml_backend_sycl_context(int device) :
272312
device(device),
273313
name(GGML_SYCL_NAME + std::to_string(device)) {
314+
opt_feature = ggml_sycl_info().devices[device].opt_feature;
274315
}
275316

276317
queue_ptr stream(int device, int stream) {
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev);
680721
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
681722
const ggml_tensor *src1, ggml_tensor *dst,
682723
const ggml_sycl_op_flatten_t op);
683-
684724
#endif // GGML_SYCL_COMMON_HPP

ggml/src/ggml-sycl/convert.cpp

+33-4
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
125125
}
126126
}
127127

128+
template <typename dst_t>
129+
static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
130+
dpct::queue_ptr stream) {
131+
132+
dpct::has_capability_or_fail(stream->get_device(),
133+
{sycl::aspect::fp16});
134+
135+
int constexpr WARP_K = WARP_SIZE * QK4_0;
136+
const int n_warp = (k + WARP_K - 1) / WARP_K;
137+
GGML_ASSERT(k % 2 == 0);
138+
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
139+
sycl::range<3>(1, 1, WARP_SIZE),
140+
sycl::range<3>(1, 1, WARP_SIZE)),
141+
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
142+
dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
143+
});
144+
145+
}
146+
128147
template <typename dst_t>
129148
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
130149
dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
452471
}
453472
}
454473

455-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
474+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
456475
switch (type) {
457476
case GGML_TYPE_Q4_0:
458-
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
477+
if (dst->src[0]->extra &&
478+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
479+
return dequantize_row_q4_0_sycl_reorder;
480+
} else {
481+
return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
482+
}
459483
case GGML_TYPE_Q4_1:
460484
return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
461485
case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
499523
}
500524
}
501525

502-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
526+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
503527
switch (type) {
504528
case GGML_TYPE_Q4_0:
505-
return dequantize_row_q4_0_sycl;
529+
if (dst->src[0]->extra &&
530+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
531+
return dequantize_row_q4_0_sycl_reorder;
532+
} else {
533+
return dequantize_row_q4_0_sycl;
534+
}
506535
case GGML_TYPE_Q4_1:
507536
return dequantize_row_q4_1_sycl;
508537
case GGML_TYPE_Q5_0:

ggml/src/ggml-sycl/convert.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
2121
typedef to_t_sycl_t<float> to_fp32_sycl_t;
2222
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
2323

24-
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
25-
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
24+
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
25+
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
2626

2727
#endif // GGML_SYCL_CONVERT_HPP

ggml/src/ggml-sycl/dequantize.hpp

+55
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "common.hpp"
1717

1818
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
19+
typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,
20+
const int iqs, dfloat2 &v);
1921

2022
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
2123
const int iqs, dfloat2 &v) {
@@ -40,6 +42,29 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
4042
#endif // GGML_SYCL_F16
4143
}
4244

45+
static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
46+
const int iqs, dfloat2 &v) {
47+
// const block_q4_0 * x = (const block_q4_0 *) vx;
48+
49+
const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
50+
51+
const int vui = *((const uint8_t *)qs+iqs);
52+
53+
v.x() = vui & 0xF;
54+
v.y() = vui >> 4;
55+
56+
#ifdef GGML_SYCL_F16
57+
// v = v - {8.0f, 8.0f};
58+
// v = v * {d, d};
59+
v.s0() = (v.s0() - 8.0f) * d;
60+
v.s1() = (v.s1() - 8.0f) * d;
61+
62+
#else
63+
v.x() = (v.x() - 8.0f) * d;
64+
v.y() = (v.y() - 8.0f) * d;
65+
#endif // GGML_SYCL_F16
66+
}
67+
4368
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
4469
const int iqs, dfloat2 &v) {
4570
const block_q4_1 * x = (const block_q4_1 *) vx;
@@ -167,6 +192,36 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri
167192
}
168193
}
169194

195+
template<typename dst_t>
196+
static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
197+
const sycl::nd_item<3> &item_ct1) {
198+
199+
const int64_t i = item_ct1.get_group(2);
200+
auto k=nb32;
201+
// assume 32 threads
202+
const int64_t tid = item_ct1.get_local_id(2);
203+
const int lane_ib = i * WARP_SIZE + tid;
204+
205+
if (lane_ib >= k / QK4_0) {
206+
return;
207+
}
208+
209+
dst_t * y_ptr = yy + lane_ib * QK4_0;
210+
211+
auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2;
212+
auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib;
213+
214+
const float d = float(*s_ptr);
215+
216+
#pragma unroll
217+
for (int l = 0; l < QK4_0 / 2; ++l) {
218+
int vq = qs[l];
219+
y_ptr[l + 0] = d * ((vq & 0xF) - 8);
220+
y_ptr[l + 16] = d * ((vq >> 4) - 8);
221+
}
222+
223+
}
224+
170225
template<typename dst_t>
171226
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
172227
const sycl::nd_item<3> &item_ct1) {

0 commit comments

Comments
 (0)