Commit 329f86a
Auto merge of #277 - Zoxc:opt-insert, r=Amanieu

Optimize insertion to only use a single lookup

This changes the `insert` method to use a single lookup for insertion instead of two separate lookups. This reduces the runtime of rustc on check builds by an average of 0.5% on [local benchmarks](https://github.com/Zoxc/rcb/tree/93790089032fc3fb4a4d708fb0adee9551125916/benchs) (1% for `winapi`). The compiler size is reduced by 0.45%.

`unwrap_unchecked` is lifted from `libstd`, since it's currently unstable and that probably requires some attribution.

2 parents dcb1de1 + b9d97a8, commit 329f86a
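
The `unwrap_unchecked` helper mentioned above (used below as `insert_slot.unwrap_unchecked()` and `index.unwrap_unchecked()`) is not defined in any of the hunks in this commit. As a rough sketch of what a helper lifted from `libstd` looks like, mirroring the then-unstable `Option::unwrap_unchecked` (the trait name here is assumed, not taken from the commit):

```rust
use core::hint::unreachable_unchecked;

/// Hypothetical extension trait mirroring the unstable `Option::unwrap_unchecked`.
trait UnwrapUncheckedExt<T> {
    /// # Safety
    /// The caller must guarantee that the value is `Some`.
    unsafe fn unwrap_unchecked(self) -> T;
}

impl<T> UnwrapUncheckedExt<T> for Option<T> {
    #[inline]
    unsafe fn unwrap_unchecked(self) -> T {
        match self {
            Some(value) => value,
            // SAFETY: the caller guarantees the option is `Some`.
            None => unreachable_unchecked(),
        }
    }
}
```
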

3 files changed (+137, -50 lines)
benches/bench.rs (+1, -1)

@@ -311,7 +311,7 @@ fn rehash_in_place(b: &mut Bencher) {
 
         // Each loop triggers one rehash
         for _ in 0..10 {
-            for i in 0..224 {
+            for i in 0..223 {
                 set.insert(i);
             }
 
src/map.rs (+15, -6)

@@ -1787,12 +1787,21 @@ where
     #[cfg_attr(feature = "inline-more", inline)]
     pub fn insert(&mut self, k: K, v: V) -> Option<V> {
         let hash = make_insert_hash::<K, S>(&self.hash_builder, &k);
-        if let Some((_, item)) = self.table.get_mut(hash, equivalent_key(&k)) {
-            Some(mem::replace(item, v))
-        } else {
-            self.table
-                .insert(hash, (k, v), make_hasher::<_, V, S>(&self.hash_builder));
-            None
+        self.table
+            .reserve(1, make_hasher::<_, V, S>(&self.hash_builder));
+
+        unsafe {
+            let (index, found) = self.table.find_potential(hash, equivalent_key(&k));
+
+            let bucket = self.table.bucket(index);
+
+            if found {
+                Some(mem::replace(&mut bucket.as_mut().1, v))
+            } else {
+                self.table.mark_inserted(index, hash);
+                bucket.write((k, v));
+                None
+            }
         }
     }
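
The observable behavior of `insert` is unchanged by this rewrite; only the probing work per call shrinks. A quick sanity check against the public API (assuming the crate is consumed as `hashbrown::HashMap`):

```rust
use hashbrown::HashMap;

fn main() {
    let mut map = HashMap::new();

    // New key: the single lookup lands on an empty slot and `insert` returns None.
    assert_eq!(map.insert("key", 1), None);

    // Existing key: the same lookup finds the occupied bucket and the old value
    // is returned, exactly as with the previous two-lookup implementation.
    assert_eq!(map.insert("key", 2), Some(1));
    assert_eq!(map["key"], 2);
}
```
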

src/raw/mod.rs (+121, -43)

@@ -443,7 +443,7 @@ impl<T> Bucket<T> {
 // | (to the end of T5)
 // | | `base: NonNull<T>` must point here
 // v | (to the end of T0 or to the start of C0)
-// /‾‾‾\ v v
+// /???\ v v
 // [Padding], Tlast, ..., |T10|, ..., T5|, T4, T3, T2, T1, T0, |C0, C1, C2, C3, C4, C5, ..., C10, ..., Clast
 // \__________ __________/
 // \/
@@ -1083,7 +1083,7 @@ impl<T, A: Allocator + Clone> RawTable<T, A> {
     /// without reallocation.
     #[cfg_attr(feature = "inline-more", inline)]
     pub fn reserve(&mut self, additional: usize, hasher: impl Fn(&T) -> u64) {
-        if additional > self.table.growth_left {
+        if unlikely(additional > self.table.growth_left) {
             // Avoid `Result::unwrap_or_else` because it bloats LLVM IR.
             if self
                 .reserve_rehash(additional, hasher, Fallibility::Infallible)
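
`unlikely` here (and `likely` in the hunks below) is hashbrown's branch-prediction hint: a nightly intrinsic when available, otherwise a stable fallback built around `#[cold]`. A minimal sketch of that stable pattern, with illustrative names rather than the crate's own definitions:

```rust
/// Marks the slow path; the optimizer treats calls to `#[cold]` functions
/// as unlikely to be taken.
#[cold]
#[inline]
fn cold_path() {}

#[inline]
fn unlikely(b: bool) -> bool {
    // Touching the cold function on the `true` side nudges codegen into
    // laying out the `false` side as the fall-through (likely) branch.
    if b {
        cold_path();
    }
    b
}

fn main() {
    let (additional, growth_left) = (1usize, 3usize);
    if unlikely(additional > growth_left) {
        // Slow path: grow and rehash the table.
        unreachable!("not taken in this toy example");
    }
}
```
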
@@ -1252,6 +1252,22 @@ impl<T, A: Allocator + Clone> RawTable<T, A> {
         }
     }
 
+    /// Searches for an element in the table,
+    /// or a potential slot where that element could be inserted.
+    #[inline]
+    pub fn find_potential(&self, hash: u64, mut eq: impl FnMut(&T) -> bool) -> (usize, bool) {
+        self.table.find_potential_inner(hash, &mut |index| unsafe {
+            eq(self.bucket(index).as_ref())
+        })
+    }
+
+    /// Marks an element in the table as inserted.
+    #[inline]
+    pub unsafe fn mark_inserted(&mut self, index: usize, hash: u64) {
+        let old_ctrl = *self.table.ctrl(index);
+        self.table.record_item_insert_at(index, old_ctrl, hash);
+    }
+
     /// Searches for an element in the table.
     #[inline]
     pub fn find(&self, hash: u64, mut eq: impl FnMut(&T) -> bool) -> Option<Bucket<T>> {
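
Together, `find_potential` and `mark_inserted` split the old combined lookup-and-insert into a "locate" step and a "commit" step. A hedged sketch of the intended calling pattern, written as if inside `src/raw/mod.rs` (the function `find_or_insert_with` is illustrative and not part of the diff; the individual calls mirror the `map.rs` hunk above):

```rust
/// Illustrative only: returns the bucket for `hash`/`eq`, inserting a fresh
/// entry produced by `make_entry` if no matching element exists yet.
///
/// # Safety
/// `hash` must be the hash of the entry that `eq` matches and `make_entry`
/// produces, and `hasher` must hash entries consistently with it.
unsafe fn find_or_insert_with<T>(
    table: &mut RawTable<T>,
    hash: u64,
    eq: impl FnMut(&T) -> bool,
    hasher: impl Fn(&T) -> u64,
    make_entry: impl FnOnce() -> T,
) -> Bucket<T> {
    // One element of headroom, so the slot we find stays valid for a write.
    table.reserve(1, hasher);

    // Single probe: either the matching bucket, or an empty/deleted slot.
    let (index, found) = table.find_potential(hash, eq);
    let bucket = table.bucket(index);

    if !found {
        // Commit the slot: update the control bytes, then write the payload.
        table.mark_inserted(index, hash);
        bucket.write(make_entry());
    }
    bucket
}
```
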
@@ -1585,6 +1601,106 @@ impl<A: Allocator + Clone> RawTableInner<A> {
         }
     }
 
+    /// Fixes up an insertion slot due to false positives for groups smaller than the group width.
+    /// This must only be used on insertion slots found by `find_insert_slot_in_group`.
+    #[inline]
+    unsafe fn fix_insert_slot(&self, index: usize) -> usize {
+        // In tables smaller than the group width
+        // (self.buckets() < Group::WIDTH), trailing control
+        // bytes outside the range of the table are filled with
+        // EMPTY entries. These will unfortunately trigger a
+        // match, but once masked may point to a full bucket that
+        // is already occupied. We detect this situation here and
+        // perform a second scan starting at the beginning of the
+        // table. This second scan is guaranteed to find an empty
+        // slot (due to the load factor) before hitting the trailing
+        // control bytes (containing EMPTY).
+        if unlikely(self.is_bucket_full(index)) {
+            debug_assert!(self.bucket_mask < Group::WIDTH);
+            // SAFETY:
+            //
+            // * We are in range and `ptr = self.ctrl(0)` are valid for reads
+            //   and properly aligned, because the table is already allocated
+            //   (see `TableLayout::calculate_layout_for` and `ptr::read`);
+            //
+            // * For tables larger than the group width (self.buckets() >= Group::WIDTH),
+            //   we will never end up in the given branch, since
+            //   `(probe_seq.pos + bit) & self.bucket_mask` in `find_insert_slot_in_group` cannot
+            //   return a full bucket index. For tables smaller than the group width, calling the
+            //   `lowest_set_bit_nonzero` function (when `nightly` feature enabled) is also
+            //   safe, as the trailing control bytes outside the range of the table are filled
+            //   with EMPTY bytes, so this second scan either finds an empty slot (due to the
+            //   load factor) or hits the trailing control bytes (containing EMPTY). See
+            //   `intrinsics::cttz_nonzero` for more information.
+            Group::load_aligned(self.ctrl(0))
+                .match_empty_or_deleted()
+                .lowest_set_bit_nonzero()
+        } else {
+            index
+        }
+    }
+
+    /// Finds the position to insert something in a group.
+    /// This may have false positives and must be fixed up with `fix_insert_slot` before it's used.
+    #[inline]
+    fn find_insert_slot_in_group(&self, group: &Group, probe_seq: &ProbeSeq) -> Option<usize> {
+        let bit = group.match_empty_or_deleted().lowest_set_bit();
+
+        if likely(bit.is_some()) {
+            Some((probe_seq.pos + bit.unwrap()) & self.bucket_mask)
+        } else {
+            None
+        }
+    }
+
+    /// Searches for an element in the table, or a potential slot where that element could be
+    /// inserted.
+    ///
+    /// This uses dynamic dispatch to reduce the amount of code generated, but that is
+    /// eliminated by LLVM optimizations.
+    #[inline]
+    pub fn find_potential_inner(
+        &self,
+        hash: u64,
+        eq: &mut dyn FnMut(usize) -> bool,
+    ) -> (usize, bool) {
+        let mut insert_slot = None;
+
+        let h2_hash = h2(hash);
+        let mut probe_seq = self.probe_seq(hash);
+
+        loop {
+            let group = unsafe { Group::load(self.ctrl(probe_seq.pos)) };
+
+            for bit in group.match_byte(h2_hash) {
+                let index = (probe_seq.pos + bit) & self.bucket_mask;
+
+                if likely(eq(index)) {
+                    return (index, true);
+                }
+            }
+
+            // We didn't find the element we were looking for in the group, try to get an
+            // insertion slot from the group if we don't have one yet.
+            if likely(insert_slot.is_none()) {
+                insert_slot = self.find_insert_slot_in_group(&group, &probe_seq);
+            }
+
+            // Only stop the search if the group contains at least one empty element.
+            // Otherwise, the element that we are looking for might be in a following group.
+            if likely(group.match_empty().any_bit_set()) {
+                // We must have found an insert slot by now, since the current group contains at
+                // least one empty element. For tables smaller than the group width, there will
+                // still be an empty element in the current (and only) group due to the load factor.
+                unsafe {
+                    return (self.fix_insert_slot(insert_slot.unwrap_unchecked()), false);
+                }
+            }
+
+            probe_seq.move_next(self.bucket_mask);
+        }
+    }
+
     /// Searches for an empty or deleted bucket which is suitable for inserting
     /// a new element and sets the hash for that slot.
     ///
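
`find_potential_inner` is the core of the change: a single probe walk that simultaneously looks for the key and remembers the first slot where it could be inserted, stopping at the first group containing an EMPTY control byte. The following is a deliberately simplified scalar model of that walk (one control byte per "group", linear probing, toy hashing); it is not hashbrown's layout, only the shape of the algorithm:

```rust
// Toy control-byte values; hashbrown uses the same sentinels, but its h2 is the
// top 7 bits of the hash and therefore can never collide with them.
const EMPTY: u8 = 0xFF;
const DELETED: u8 = 0x80;

struct ToyTable {
    ctrl: Vec<u8>,          // one control byte per bucket: EMPTY, DELETED, or an h2 tag
    keys: Vec<Option<u64>>, // the stored keys (values omitted for brevity)
}

impl ToyTable {
    /// Returns `(index, found)`: either the bucket holding `key`, or the first
    /// empty/deleted slot along the probe sequence where it could be inserted.
    fn find_potential(&self, key: u64, h2: u8) -> (usize, bool) {
        let mask = self.ctrl.len() - 1; // table size is a power of two
        let mut pos = (key as usize) & mask; // toy probe start
        let mut insert_slot = None;

        loop {
            let ctrl = self.ctrl[pos];

            // A control byte matching h2 means "possibly equal": confirm with the key.
            if ctrl == h2 && self.keys[pos] == Some(key) {
                return (pos, true);
            }

            // Remember the first slot we could insert into (empty or deleted).
            if insert_slot.is_none() && (ctrl == EMPTY || ctrl == DELETED) {
                insert_slot = Some(pos);
            }

            // An EMPTY byte ends the probe: the key cannot appear further along,
            // and by this point `insert_slot` is guaranteed to be set.
            if ctrl == EMPTY {
                return (insert_slot.unwrap(), false);
            }

            // Linear probing stands in for hashbrown's group-by-group probe sequence.
            pos = (pos + 1) & mask;
        }
    }
}

fn main() {
    let mut t = ToyTable { ctrl: vec![EMPTY; 8], keys: vec![None; 8] };
    let (key, h2) = (42u64, 0x2A);

    // First call: key absent, so we get an insertion slot back.
    let (idx, found) = t.find_potential(key, h2);
    assert!(!found);
    t.ctrl[idx] = h2;
    t.keys[idx] = Some(key);

    // Second call: the very same probe now reports the occupied bucket.
    assert_eq!(t.find_potential(key, h2), (idx, true));
}
```
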
@@ -1637,48 +1753,10 @@ impl<A: Allocator + Clone> RawTableInner<A> {
             // bytes, which is safe (see RawTableInner::new_in).
             unsafe {
                 let group = Group::load(self.ctrl(probe_seq.pos));
-                if let Some(bit) = group.match_empty_or_deleted().lowest_set_bit() {
-                    // This is the same as `(probe_seq.pos + bit) % self.buckets()` because the number
-                    // of buckets is a power of two, and `self.bucket_mask = self.buckets() - 1`.
-                    let result = (probe_seq.pos + bit) & self.bucket_mask;
-
-                    // In tables smaller than the group width
-                    // (self.buckets() < Group::WIDTH), trailing control
-                    // bytes outside the range of the table are filled with
-                    // EMPTY entries. These will unfortunately trigger a
-                    // match, but once masked may point to a full bucket that
-                    // is already occupied. We detect this situation here and
-                    // perform a second scan starting at the beginning of the
-                    // table. This second scan is guaranteed to find an empty
-                    // slot (due to the load factor) before hitting the trailing
-                    // control bytes (containing EMPTY).
-                    //
-                    // SAFETY: The `result` is guaranteed to be in range `0..self.bucket_mask`
-                    // due to masking with `self.bucket_mask`
-                    if unlikely(self.is_bucket_full(result)) {
-                        debug_assert!(self.bucket_mask < Group::WIDTH);
-                        debug_assert_ne!(probe_seq.pos, 0);
-                        // SAFETY:
-                        //
-                        // * We are in range and `ptr = self.ctrl(0)` are valid for reads
-                        //   and properly aligned, because the table is already allocated
-                        //   (see `TableLayout::calculate_layout_for` and `ptr::read`);
-                        //
-                        // * For tables larger than the group width (self.buckets() >= Group::WIDTH),
-                        //   we will never end up in the given branch, since
-                        //   `(probe_seq.pos + bit) & self.bucket_mask` cannot return a
-                        //   full bucket index. For tables smaller than the group width, calling the
-                        //   `lowest_set_bit_nonzero` function (when `nightly` feature enabled) is also
-                        //   safe, as the trailing control bytes outside the range of the table are filled
-                        //   with EMPTY bytes, so this second scan either finds an empty slot (due to the
-                        //   load factor) or hits the trailing control bytes (containing EMPTY). See
-                        //   `intrinsics::cttz_nonzero` for more information.
-                        return Group::load_aligned(self.ctrl(0))
-                            .match_empty_or_deleted()
-                            .lowest_set_bit_nonzero();
-                    }
+                let index = self.find_insert_slot_in_group(&group, &probe_seq);
 
-                    return result;
+                if likely(index.is_some()) {
+                    return self.fix_insert_slot(index.unwrap_unchecked());
                 }
             }
             probe_seq.move_next(self.bucket_mask);
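
The false positive that `fix_insert_slot` guards against (for both `find_potential_inner` and the rewritten `find_insert_slot`) only arises in tables smaller than the group width. A toy arithmetic illustration with assumed sizes (4 buckets, 16-byte groups), not taken from the crate:

```rust
fn main() {
    // Assumed toy sizes: a 4-bucket table probed with a 16-byte group,
    // i.e. self.buckets() < Group::WIDTH.
    let buckets = 4usize;
    let bucket_mask = buckets - 1; // 3
    let probe_pos = 2usize;        // the probe sequence starts at bucket 2

    // Suppose buckets 0, 2 and 3 are full (the load factor still leaves
    // bucket 1 empty). A group loaded at position 2 sees the real control
    // bytes for buckets 2 and 3 followed by the trailing EMPTY bytes that
    // pad the control array, so its first empty/deleted match is at bit 2,
    // which corresponds to control byte index 4, outside the real table.
    let bit = 2usize;

    // Masking the match wraps it back onto bucket 0, which is full:
    let index = (probe_pos + bit) & bucket_mask;
    assert_eq!(index, 0);

    // This is exactly the false positive that `fix_insert_slot` detects with
    // `is_bucket_full(index)` before rescanning from the start of the table,
    // where the load factor guarantees a genuinely empty slot (bucket 1 here).
}
```
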
