Commit 9bc554d

[libc] Perform bitfield zero initialization wave-parallel
Summary: We need to set the bitfield memory to zero because the system does not guarantee zeroed-out memory. Even if fresh pages are zero, the system allows re-use, so we would need a `kfd`-level API to skip this step. Because we can't, this patch updates the logic to perform the zero initialization wave-parallel. This reduces the amount of time it takes to allocate a fresh slab by up to a tenth. It has the unfortunate side effect that the control flow is more convoluted and we waste some extra registers, but it's worth it to reduce the slab allocation latency.
1 parent f5e499a commit 9bc554d
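
The core idea is a strided, wave-parallel memset: every lane that shares the newly created slab clears a disjoint subset of the bitfield words, starting at its rank among the participating lanes and striding by the number of participants. The host-side sketch below simulates that partitioning with plain C++20 bit operations; the outer loop over set bits stands in for the lanes that would run `initialize()` concurrently on the GPU, and all names here (`parallel_zero`, the vector sizes) are illustrative rather than taken from the patch.

#include <bit>
#include <cstdint>
#include <cstdio>
#include <vector>

// Host-side sketch of the wave-parallel zeroing pattern. On the GPU every
// lane runs the inner loop concurrently; here we iterate over the "lanes"
// selected by the 'uniform' mask to simulate that.
static void parallel_zero(std::vector<uint32_t> &bitfield, uint64_t uniform) {
  uint32_t workers = std::popcount(uniform); // lanes sharing this slab
  for (uint64_t mask = uniform; mask; mask &= mask - 1) {
    uint32_t lane = std::countr_zero(mask);
    // Rank of this lane inside the uniform mask, i.e. how many participating
    // lanes have a lower lane id (analogous to impl::lane_count in the patch).
    uint32_t rank = std::popcount(uniform & ((uint64_t(1) << lane) - 1));
    for (size_t i = rank; i < bitfield.size(); i += workers)
      bitfield[i] = 0; // each lane touches a disjoint, strided set of words
  }
}

int main() {
  std::vector<uint32_t> bitfield(1024, 0xffffffffu);
  parallel_zero(bitfield, 0x00000000000000ffu); // 8 "lanes" share the slab
  std::printf("first word: %u, last word: %u\n", bitfield.front(),
              bitfield.back());
}

Because the start offset is the lane's rank and the stride is the number of workers, every word index is covered exactly once and no two lanes write the same word.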

File tree: 1 file changed (+45, -26 lines)

libc/src/__support/GPU/allocator.cpp

Lines changed: 45 additions & 26 deletions
@@ -157,10 +157,18 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }

-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t *bitfield = get_bitfield();
+    uint32_t workers = cpp::popcount(uniform);
+    for (uint32_t i = impl::lane_count(lane_mask & uniform);
+         i < bitfield_bytes(get_chunk_size()) / sizeof(uint32_t); i += workers)
+      bitfield[i] = 0;
   }

   // Get the number of chunks that can theoretically fit inside this slab.
@@ -283,7 +291,7 @@ struct Slab {

 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,32 +347,25 @@ template <typename T> struct GuardPtr {
     cpp::Atomic<uint64_t> counter{0};
   };

-  cpp::Atomic<T *> ptr{nullptr};
+  cpp::Atomic<Slab *> ptr{nullptr};
   RefCounter ref{};

   // Should be called be a single lane for each different pointer.
   template <typename... Args>
-  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
-    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
-                                    cpp::MemoryOrder::RELAXED,
-                                    cpp::MemoryOrder::RELAXED)) {
+        ptr.compare_exchange_strong(
+            expected, reinterpret_cast<Slab *>(SENTINEL),
+            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      void *raw = impl::rpc_allocate(sizeof(T));
+      void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      T *mem = new (raw) T(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }

-    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
       return nullptr;

     if (!ref.acquire(n, count))
@@ -374,15 +375,25 @@ template <typename T> struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }

+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
-              Args &&...args) {
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+                 Args &&...args) {
     count = 0;
-    T *result = nullptr;
+    Slab *result = nullptr;
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
@@ -392,6 +403,14 @@ template <typename T> struct GuardPtr {
     if (!result)
       return nullptr;

+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;

@@ -403,8 +422,8 @@ template <typename T> struct GuardPtr {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
-      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~T();
+      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~Slab();
       impl::rpc_free(p);
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
@@ -417,7 +436,7 @@ template <typename T> struct GuardPtr {
 };

 // The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};

 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {