Skip to content

Commit 739e997

Browse files
authored
[libc] Remove ballot on slab find (#176606)
Summary: The ballot on slab find negatively impacts performance, while the other changes in the initial PR slightly improved it. The ballot was originally added to make Volta independent thread scheduling work, but that doesn't seem to work correctly all the time either, so we should remove it and make this path faster.
1 parent 49cd842 commit 739e997

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -498,20 +498,21 @@ struct GuardPtr {
498498
result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
499499
count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
500500

501+
if (!result)
502+
return nullptr;
503+
501504
// We defer storing the newly allocated slab until now so that we can use
502505
// multiple lanes to initialize it and release it for use.
503-
uint64_t slab_mask =
504-
gpu::ballot(lane_mask, result && impl::is_sentinel(count));
505-
if (slab_mask & impl::id_in_mask()) {
506-
result->initialize(slab_mask, uniform);
506+
if (impl::is_sentinel(count)) {
507+
uint64_t count_mask = gpu::get_lane_mask();
508+
result->initialize(count_mask, uniform);
507509
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
508510
finalize(result, cpp::popcount(uniform), count);
509-
count = gpu::shuffle(slab_mask, cpp::countr_zero(uniform), count);
511+
count = gpu::shuffle(count_mask, cpp::countr_zero(uniform), count);
510512
}
511513

512-
if (result)
513-
count = count - cpp::popcount(uniform) +
514-
impl::lane_count(uniform, gpu::get_lane_id());
514+
count = count - cpp::popcount(uniform) +
515+
impl::lane_count(uniform, gpu::get_lane_id());
515516

516517
return result;
517518
}

0 commit comments

Comments
 (0)