
Commit dc4335a

[libc] Perform bitfield zero initialization wave-parallel (#143607)
Summary: We need to set the bitfield memory to zero because the system does not guarantee zeroed-out memory. Even if fresh pages are zero, the system allows re-use, so we would need a `kfd`-level API to skip this step. Because we can't, this patch updates the logic to perform the zero initialization wave-parallel. This reduces the time it takes to allocate a fresh slab by up to a tenth. The unfortunate side effect is that the control flow is more convoluted and we waste some extra registers, but it's worth it to reduce the slab allocation latency.
1 parent ee35e34 commit dc4335a

File tree

1 file changed (+35, -11)

libc/src/__support/GPU/allocator.cpp

Lines changed: 35 additions & 11 deletions
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
   return (x + N) & ~(N - 1);
 }
 
+// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t workers = cpp::popcount(uniform);
+  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+    s[i] = c;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
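
The indexing here assigns each active lane a rank among the lanes set in `uniform` and strides by the total worker count, so the lanes cover the buffer cooperatively without overlapping writes. A minimal host-side C++ sketch of the same scheme, using plain loop variables in place of the GPU intrinsics (`rank` and `workers` stand in for `impl::lane_count(mask & uniform)` and `cpp::popcount(uniform)`):

#include <cstdint>
#include <vector>

// One lane's share of the strided memset: write every 'workers'-th word
// starting at this lane's rank among the active lanes.
void strided_memset(uint32_t *s, uint32_t c, uint32_t n, uint32_t rank,
                    uint32_t workers) {
  for (uint32_t i = rank; i < n; i += workers)
    s[i] = c;
}

int main() {
  std::vector<uint32_t> buf(10, 0xdeadbeef);
  // Simulate three active lanes zeroing the buffer cooperatively.
  for (uint32_t rank = 0; rank < 3; ++rank)
    strided_memset(buf.data(), 0, buf.size(), rank, 3);
  // buf is now fully zeroed; each 'lane' wrote a disjoint set of indices.
}

On the GPU the per-rank iterations run concurrently, one per lane, which is where the latency win over a single-lane `__builtin_memset` comes from.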
@@ -157,10 +165,15 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                    sizeof(uint32_t);
+    impl::uniform_memset(get_bitfield(), 0, size, uniform);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
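
Since `uniform_memset` writes whole `uint32_t` words, `initialize` rounds the bitfield's byte count up to a word count so a trailing partial word is still cleared. A hypothetical `words_for_bytes` helper, shown only to illustrate the arithmetic:

#include <cstdint>

// Round a byte count up to the number of uint32_t words covering it.
constexpr uint32_t words_for_bytes(uint32_t bytes) {
  return (bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t);
}

static_assert(words_for_bytes(10) == 3, "10 bytes need 3 words");
static_assert(words_for_bytes(12) == 3, "12 bytes fit exactly in 3 words");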
@@ -354,14 +367,7 @@ struct GuardPtr {
     void *raw = impl::rpc_allocate(sizeof(Slab));
     if (!raw)
       return nullptr;
-    Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
-
-    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-    ptr.store(mem, cpp::MemoryOrder::RELAXED);
-    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-    if (!ref.acquire(n, count))
-      ref.reset(n, count);
-    return mem;
+    return new (raw) Slab(cpp::forward<Args>(args)...);
   }
 
   if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +380,16 @@ struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
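
The fences in `finalize` follow the fence-based publication idiom: the release fence orders the slab's initialization before the relaxed pointer store, and a reader that observes the pointer synchronizes via a matching acquire. A standalone sketch of that idiom with standard C++ atomics; the `Payload`/`publish`/`consume` names are illustrative, not part of the allocator:

#include <atomic>

struct Payload { int data = 0; };

std::atomic<Payload *> published{nullptr};

void publish(Payload *p) {
  p->data = 42; // Initialization that must be visible to readers.
  // Release fence: orders the write above before the store below.
  std::atomic_thread_fence(std::memory_order_release);
  published.store(p, std::memory_order_relaxed);
}

Payload *consume() {
  Payload *p = published.load(std::memory_order_relaxed);
  if (p) // Acquire fence pairs with the release fence in publish().
    std::atomic_thread_fence(std::memory_order_acquire);
  return p; // If non-null, p->data is guaranteed to read 42.
}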
@@ -392,6 +408,14 @@ struct GuardPtr {
     if (!result)
       return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
