|
19 | 19 | #include "dpct/helper.hpp"
|
20 | 20 | #include "ggml-sycl.h"
|
21 | 21 | #include "presets.hpp"
|
| 22 | +#include "sycl_hw.hpp" |
| 23 | + |
| 24 | + |
22 | 25 | #if GGML_SYCL_DNNL
|
23 | 26 | #include "dnnl.hpp"
|
24 | 27 | #include "dnnl_sycl.hpp"
|
|
35 | 38 | void* ggml_sycl_host_malloc(size_t size);
|
36 | 39 | void ggml_sycl_host_free(void* ptr);
|
37 | 40 |
|
| 41 | + |
38 | 42 | extern int g_ggml_sycl_debug;
|
| 43 | +extern int g_ggml_sycl_disable_optimize; |
| 44 | + |
39 | 45 | #define GGML_SYCL_DEBUG(...) \
|
40 | 46 | do { \
|
41 | 47 | if (g_ggml_sycl_debug) \
|
@@ -182,18 +188,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
|
182 | 188 | }
|
183 | 189 |
|
184 | 190 | //////////////////////
|
// Per-device optimization switches detected once at backend init.
struct optimize_feature {
    // Whether the device should use the reorder optimization path
    // (presumably a reordered quantized-weight layout — confirm with callers).
    bool reorder = false;
};
| 194 | + |
| 195 | +struct sycl_device_info { |
| 196 | + int cc; // compute capability |
| 197 | + // int nsm; // number of streaming multiprocessors |
| 198 | + // size_t smpb; // max. shared memory per block |
| 199 | + bool vmm; // virtual memory support |
| 200 | + size_t total_vram; |
| 201 | + sycl_hw_info hw_info; |
| 202 | + optimize_feature opt_feature; |
| 203 | +}; |
| 204 | + |
185 | 205 |
|
186 | 206 | struct ggml_sycl_device_info {
|
187 | 207 | int device_count;
|
188 | 208 |
|
189 |
| - struct sycl_device_info { |
190 |
| - int cc; // compute capability |
191 |
| - // int nsm; // number of streaming multiprocessors |
192 |
| - // size_t smpb; // max. shared memory per block |
193 |
| - bool vmm; // virtual memory support |
194 |
| - size_t total_vram; |
195 |
| - }; |
196 |
| - |
197 | 209 | sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
|
198 | 210 |
|
199 | 211 | std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
|
@@ -260,17 +272,46 @@ struct ggml_tensor_extra_gpu {
|
260 | 272 | // tensors
|
261 | 273 | dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
|
262 | 274 | [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
|
| 275 | + optimize_feature optimized_feature; |
263 | 276 | };
|
264 | 277 |
|
| 278 | +void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={}); |
| 279 | + |
| 280 | +inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { |
| 281 | + optimize_feature opt; |
| 282 | + |
| 283 | + opt.reorder = |
| 284 | + (arch == syclex::architecture::intel_gpu_dg1 || |
| 285 | + arch == syclex::architecture::intel_gpu_acm_g10 || |
| 286 | + arch == syclex::architecture::intel_gpu_acm_g11 || |
| 287 | + arch == syclex::architecture::intel_gpu_acm_g12 || |
| 288 | + arch == syclex::architecture::intel_gpu_pvc || |
| 289 | + arch == syclex::architecture::intel_gpu_pvc_vg || |
| 290 | + arch == syclex::architecture::intel_gpu_mtl_u || |
| 291 | + arch == syclex::architecture::intel_gpu_mtl_s || |
| 292 | + arch == syclex::architecture::intel_gpu_mtl_h || |
| 293 | + arch == syclex::architecture::intel_gpu_arl_u || |
| 294 | + arch == syclex::architecture::intel_gpu_arl_s || |
| 295 | + arch == syclex::architecture::intel_gpu_arl_h || |
| 296 | + arch == syclex::architecture::intel_gpu_bmg_g21 || |
| 297 | + arch == syclex::architecture::intel_gpu_lnl_m |
| 298 | + ); |
| 299 | + |
| 300 | + return opt; |
| 301 | +} |
| 302 | + |
265 | 303 | struct ggml_backend_sycl_context {
|
266 | 304 | int device;
|
267 | 305 | std::string name;
|
| 306 | + optimize_feature opt_feature; |
| 307 | + bool optimized_graph=false; |
268 | 308 |
|
269 | 309 | queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
|
270 | 310 |
|
271 | 311 | explicit ggml_backend_sycl_context(int device) :
|
272 | 312 | device(device),
|
273 | 313 | name(GGML_SYCL_NAME + std::to_string(device)) {
|
| 314 | + opt_feature = ggml_sycl_info().devices[device].opt_feature; |
274 | 315 | }
|
275 | 316 |
|
276 | 317 | queue_ptr stream(int device, int stream) {
|
@@ -680,5 +721,4 @@ bool gpu_has_xmx(sycl::device &dev);
|
680 | 721 | void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
681 | 722 | const ggml_tensor *src1, ggml_tensor *dst,
|
682 | 723 | const ggml_sycl_op_flatten_t op);
|
683 |
| - |
684 | 724 | #endif // GGML_SYCL_COMMON_HPP
|
0 commit comments