@@ -157,10 +157,18 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely; memory is not
+  // guaranteed to be zero-initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t *bitfield = get_bitfield();
+    uint32_t workers = cpp::popcount(uniform);
+    for (uint32_t i = impl::lane_count(lane_mask & uniform);
+         i < bitfield_bytes(get_chunk_size()) / sizeof(uint32_t); i += workers)
+      bitfield[i] = 0;
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
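
For context, the new `initialize()` replaces the single-threaded memset with a lane-strided loop: each lane that shares the slab starts at its rank within the `uniform` mask and steps by the number of participating lanes, so together they touch every word of the bitfield exactly once. A minimal host-side sketch of the same striding pattern (plain C++; `rank` and `workers` stand in for `impl::lane_count(lane_mask & uniform)` and `cpp::popcount(uniform)`, and the loop over ranks models lanes that would run concurrently on the GPU):

```cpp
#include <cstdint>
#include <vector>

// Host-side sketch of the lane-strided zeroing in Slab::initialize():
// 'rank' stands in for impl::lane_count(lane_mask & uniform) and 'workers'
// for cpp::popcount(uniform); on the GPU every participating lane runs this
// loop concurrently instead of being iterated over sequentially.
static void zero_words_strided(uint32_t *words, uint32_t count, uint32_t rank,
                               uint32_t workers) {
  for (uint32_t i = rank; i < count; i += workers)
    words[i] = 0;
}

int main() {
  std::vector<uint32_t> bitfield(128, 0xffffffffu);
  const uint32_t workers = 32; // e.g. a full warp sharing one slab
  for (uint32_t rank = 0; rank < workers; ++rank)
    zero_words_strided(bitfield.data(),
                       static_cast<uint32_t>(bitfield.size()), rank, workers);
  return 0; // every word of 'bitfield' is now zero, each written exactly once
}
```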
@@ -283,7 +291,7 @@ struct Slab {
 
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,32 +347,25 @@ template <typename T> struct GuardPtr {
     cpp::Atomic<uint64_t> counter{0};
   };
 
-  cpp::Atomic<T *> ptr{nullptr};
+  cpp::Atomic<Slab *> ptr{nullptr};
   RefCounter ref{};
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
-  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
-    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
-                                    cpp::MemoryOrder::RELAXED,
-                                    cpp::MemoryOrder::RELAXED)) {
+        ptr.compare_exchange_strong(
+            expected, reinterpret_cast<Slab *>(SENTINEL),
+            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      void *raw = impl::rpc_allocate(sizeof(T));
+      void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      T *mem = new (raw) T(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
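
With this change `try_lock_impl()` only claims the slot and constructs the slab; publication moves to the new `finalize()` below. The claim itself is the usual lock-free "reserve with a sentinel, then publish" pattern. A simplified standalone sketch using `std::atomic` (the `Resource` and `SENTINEL` names here are illustrative, not from the patch):

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Sketch of the claim/construct/publish split used by try_lock_impl() and
// finalize(): one thread claims the empty slot by CASing in a sentinel value,
// constructs the object privately, and only publishes the real pointer later.
struct Resource {
  int value = 42;
};

static constexpr uintptr_t SENTINEL = ~uintptr_t(0);
static std::atomic<Resource *> slot{nullptr};

Resource *try_claim() {
  Resource *expected = nullptr;
  if (slot.compare_exchange_strong(expected,
                                   reinterpret_cast<Resource *>(SENTINEL),
                                   std::memory_order_relaxed))
    return new Resource(); // constructed, but not yet visible through 'slot'
  return nullptr; // the slot is already owned or being created by someone else
}

void publish(Resource *r) {
  slot.store(r, std::memory_order_release); // now other threads may observe it
}

int main() {
  if (Resource *r = try_claim()) {
    publish(r);
    std::printf("published resource with value %d\n",
                slot.load(std::memory_order_acquire)->value);
    delete slot.exchange(nullptr);
  }
}
```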
@@ -374,15 +375,25 @@ template <typename T> struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
-              Args &&...args) {
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+                 Args &&...args) {
     count = 0;
-    T *result = nullptr;
+    Slab *result = nullptr;
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
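
The leader election in `try_lock()` picks, for each group of lanes sharing the same pointer, the lane whose id equals the position of the lowest set bit in `uniform`. A small standalone illustration of that election (using C++20 `std::countr_zero` in place of `cpp::countr_zero`; the example mask is arbitrary):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Illustration of leader election over a 'uniform' mask: among the lanes set
// in the mask, only the one at the lowest set bit acts on behalf of the group.
int main() {
  uint64_t uniform = 0b10110100; // lanes 2, 4, 5, 7 share a pointer (example)
  for (uint32_t lane_id = 0; lane_id < 8; ++lane_id) {
    bool participates = (uniform >> lane_id) & 1;
    bool is_leader =
        participates && lane_id == uint32_t(std::countr_zero(uniform));
    std::printf("lane %u: %s\n", lane_id,
                is_leader ? "leader" : (participates ? "follower" : "idle"));
  }
}
```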
@@ -392,6 +403,14 @@ template <typename T> struct GuardPtr {
     if (!result)
       return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
 
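
The final count adjustment hands each lane in the group a distinct reference count. Assuming `impl::lane_count(uniform)` returns the calling lane's rank among the set bits of `uniform` (consistent with its use as a starting offset in `initialize()`), a worked example of the arithmetic:

```cpp
#include <cstdint>
#include <cstdio>

// Worked example of: count = count - popcount(uniform) + lane_count(uniform) + 1
// Assumption: lane_count(uniform) is the calling lane's rank within the mask.
// If the group of popcount(uniform) lanes was handed a total 'count', each
// lane derives a unique value ending at the group total.
int main() {
  uint64_t count = 10;     // example value shared by the whole group
  uint32_t group_size = 4; // cpp::popcount(uniform)
  for (uint32_t rank = 0; rank < group_size; ++rank)
    std::printf("lane rank %u -> count %llu\n", rank,
                (unsigned long long)(count - group_size + rank + 1));
  // prints 7, 8, 9, 10: one distinct count per lane in the group
}
```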
@@ -403,8 +422,8 @@ template <typename T> struct GuardPtr {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
-      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~T();
+      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~Slab();
       impl::rpc_free(p);
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
@@ -417,7 +436,7 @@ template <typename T> struct GuardPtr {
 };
 
 // The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};
 
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {