Skip to content

Commit 9805437

Browse files
committed
Add a dedicated length-prefixing method to Hasher
This accomplishes two main goals: (1) make it clear who is responsible for prefix-freedom, including how they should do it; and (2) make it feasible for a `Hasher` that *doesn't* care about Hash-DoS resistance to get better performance by not hashing lengths. This does not change rustc-hash, since that's in an external crate, but that crate could potentially use this method in the future.
1 parent 086bf7a commit 9805437

File tree

14 files changed

+166
-6
lines changed

14 files changed

+166
-6
lines changed

compiler/rustc_data_structures/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#![feature(generators)]
1919
#![feature(let_else)]
2020
#![feature(hash_raw_entry)]
21+
#![feature(hasher_prefixfree_extras)]
2122
#![feature(maybe_uninit_uninit_array)]
2223
#![feature(min_specialization)]
2324
#![feature(never_type)]

compiler/rustc_data_structures/src/sip128.rs

+8
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,14 @@ impl Hasher for SipHasher128 {
462462
self.slice_write(msg);
463463
}
464464

465+
#[inline]
466+
fn write_str(&mut self, s: &str) {
467+
// This hasher works byte-wise, and `0xFF` cannot show up in a `str`,
468+
// so just hashing the one extra byte is enough to be prefix-free.
469+
self.write(s.as_bytes());
470+
self.write_u8(0xFF);
471+
}
472+
465473
fn finish(&self) -> u64 {
466474
panic!("SipHasher128 cannot provide valid 64 bit hashes")
467475
}

compiler/rustc_data_structures/src/stable_hasher.rs

+11
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,17 @@ impl Hasher for StableHasher {
7373
self.state.write(bytes);
7474
}
7575

76+
#[inline]
77+
fn write_str(&mut self, s: &str) {
78+
self.state.write_str(s);
79+
}
80+
81+
#[inline]
82+
fn write_length_prefix(&mut self, len: usize) {
83+
// Our impl for `usize` will extend it if needed.
84+
self.write_usize(len);
85+
}
86+
7687
#[inline]
7788
fn write_u8(&mut self, i: u8) {
7889
self.state.write_u8(i);

library/alloc/src/boxed.rs

+6
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,12 @@ impl<T: ?Sized + Hasher, A: Allocator> Hasher for Box<T, A> {
13691369
fn write_isize(&mut self, i: isize) {
13701370
(**self).write_isize(i)
13711371
}
1372+
fn write_length_prefix(&mut self, len: usize) {
1373+
(**self).write_length_prefix(len)
1374+
}
1375+
fn write_str(&mut self, s: &str) {
1376+
(**self).write_str(s)
1377+
}
13721378
}
13731379

13741380
#[cfg(not(no_global_oom_handling))]

library/alloc/src/collections/btree/map.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1990,7 +1990,7 @@ impl<'a, K: Ord + Copy, V: Copy> Extend<(&'a K, &'a V)> for BTreeMap<K, V> {
19901990
#[stable(feature = "rust1", since = "1.0.0")]
19911991
impl<K: Hash, V: Hash> Hash for BTreeMap<K, V> {
19921992
fn hash<H: Hasher>(&self, state: &mut H) {
1993-
self.len().hash(state);
1993+
state.write_length_prefix(self.len());
19941994
for elt in self {
19951995
elt.hash(state);
19961996
}

library/alloc/src/collections/linked_list.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1944,7 +1944,7 @@ impl<T: fmt::Debug> fmt::Debug for LinkedList<T> {
19441944
#[stable(feature = "rust1", since = "1.0.0")]
19451945
impl<T: Hash> Hash for LinkedList<T> {
19461946
fn hash<H: Hasher>(&self, state: &mut H) {
1947-
self.len().hash(state);
1947+
state.write_length_prefix(self.len());
19481948
for elt in self {
19491949
elt.hash(state);
19501950
}

library/alloc/src/collections/vec_deque/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2899,7 +2899,7 @@ impl<T: Ord, A: Allocator> Ord for VecDeque<T, A> {
28992899
#[stable(feature = "rust1", since = "1.0.0")]
29002900
impl<T: Hash, A: Allocator> Hash for VecDeque<T, A> {
29012901
fn hash<H: Hasher>(&self, state: &mut H) {
2902-
self.len().hash(state);
2902+
state.write_length_prefix(self.len());
29032903
// It's not possible to use Hash::hash_slice on slices
29042904
// returned by as_slices method as their length can vary
29052905
// in otherwise identical deques.

library/alloc/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@
117117
#![feature(extend_one)]
118118
#![feature(fmt_internals)]
119119
#![feature(fn_traits)]
120+
#![feature(hasher_prefixfree_extras)]
120121
#![feature(inplace_iteration)]
121122
#![feature(iter_advance_by)]
122123
#![feature(layout_for_ptr)]

library/core/src/hash/mod.rs

+104-3
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,12 @@ pub trait Hasher {
333333
///
334334
/// println!("Hash is {:x}!", hasher.finish());
335335
/// ```
336+
///
337+
/// # Note to Implementers
338+
///
339+
/// You generally should not do length-prefixing as part of implementing
340+
/// this method. It's up to the [`Hash`] implementation to call
341+
/// [`Hasher::write_length_prefix`] before sequences that need it.
336342
#[stable(feature = "rust1", since = "1.0.0")]
337343
fn write(&mut self, bytes: &[u8]);
338344

@@ -409,6 +415,96 @@ pub trait Hasher {
409415
fn write_isize(&mut self, i: isize) {
410416
self.write_usize(i as usize)
411417
}
418+
419+
/// Writes a length prefix into this hasher, as part of being prefix-free.
420+
///
421+
/// If you're implementing [`Hash`] for a custom collection, call this before
422+
/// writing its contents to this `Hasher`. That way
423+
/// `(collection![1, 2, 3], collection![4, 5])` and
424+
/// `(collection![1, 2], collection![3, 4, 5])` will provide different
425+
/// sequences of values to the `Hasher`.
426+
///
427+
/// The `impl<T> Hash for [T]` includes a call to this method, so if you're
428+
/// hashing a slice (or array or vector) via its `Hash::hash` method,
429+
/// you should **not** call this yourself.
430+
///
431+
/// This method is only for providing domain separation. If you want to
432+
/// hash a `usize` that represents part of the *data*, then it's important
433+
/// that you pass it to [`Hasher::write_usize`] instead of to this method.
434+
///
435+
/// # Examples
436+
///
437+
/// ```
438+
/// #![feature(hasher_prefixfree_extras)]
439+
/// # // Stubs to make the `impl` below pass the compiler
440+
/// # struct MyCollection<T>(Option<T>);
441+
/// # impl<T> MyCollection<T> {
442+
/// # fn len(&self) -> usize { todo!() }
443+
/// # }
444+
/// # impl<'a, T> IntoIterator for &'a MyCollection<T> {
445+
/// # type Item = T;
446+
/// # type IntoIter = std::iter::Empty<T>;
447+
/// # fn into_iter(self) -> Self::IntoIter { todo!() }
448+
/// # }
449+
///
450+
/// use std::hash::{Hash, Hasher};
451+
/// impl<T: Hash> Hash for MyCollection<T> {
452+
/// fn hash<H: Hasher>(&self, state: &mut H) {
453+
/// state.write_length_prefix(self.len());
454+
/// for elt in self {
455+
/// elt.hash(state);
456+
/// }
457+
/// }
458+
/// }
459+
/// ```
460+
///
461+
/// # Note to Implementers
462+
///
463+
/// If you've decided that your `Hasher` is willing to be susceptible to
464+
/// Hash-DoS attacks, then you might consider skipping hashing some or all
465+
/// of the `len` provided in the name of increased performance.
466+
#[inline]
467+
#[unstable(feature = "hasher_prefixfree_extras", issue = "96762")]
468+
fn write_length_prefix(&mut self, len: usize) {
469+
self.write_usize(len);
470+
}
471+
472+
/// Writes a single `str` into this hasher.
473+
///
474+
/// If you're implementing [`Hash`], you generally do not need to call this,
475+
/// as the `impl Hash for str` does, so you should prefer that instead.
476+
///
477+
/// This includes the domain separator for prefix-freedom, so you should
478+
/// **not** call `Self::write_length_prefix` before calling this.
479+
///
480+
/// # Note to Implementers
481+
///
482+
/// The default implementation of this method includes a call to
483+
/// [`Self::write_length_prefix`], so if your implementation of `Hasher`
484+
/// doesn't care about prefix-freedom and you've thus overridden
485+
/// that method to do nothing, there's no need to override this one.
486+
///
487+
/// This method is available to be overridden separately from the others
488+
/// as `str` being UTF-8 means that it never contains `0xFF` bytes, which
489+
/// can be used to provide prefix-freedom cheaper than hashing a length.
490+
///
491+
/// For example, if your `Hasher` works byte-by-byte (perhaps by accumulating
492+
/// them into a buffer), then you can hash the bytes of the `str` followed
493+
/// by a single `0xFF` byte.
494+
///
495+
/// If your `Hasher` works in chunks, you can also do this by being careful
496+
/// about how you pad partial chunks. If the chunks are padded with `0x00`
497+
/// bytes then just hashing an extra `0xFF` byte doesn't necessarily
498+
/// provide prefix-freedom, as `"ab"` and `"ab\u{0}"` would likely hash
499+
/// the same sequence of chunks. But if you pad with `0xFF` bytes instead,
500+
/// ensuring at least one padding byte, then it can often provide
501+
/// prefix-freedom cheaper than hashing the length would.
502+
#[inline]
503+
#[unstable(feature = "hasher_prefixfree_extras", issue = "96762")]
504+
fn write_str(&mut self, s: &str) {
505+
self.write_length_prefix(s.len());
506+
self.write(s.as_bytes());
507+
}
412508
}
413509

414510
#[stable(feature = "indirect_hasher_impl", since = "1.22.0")]
@@ -455,6 +551,12 @@ impl<H: Hasher + ?Sized> Hasher for &mut H {
455551
fn write_isize(&mut self, i: isize) {
456552
(**self).write_isize(i)
457553
}
554+
fn write_length_prefix(&mut self, len: usize) {
555+
(**self).write_length_prefix(len)
556+
}
557+
fn write_str(&mut self, s: &str) {
558+
(**self).write_str(s)
559+
}
458560
}
459561

460562
/// A trait for creating instances of [`Hasher`].
@@ -709,8 +811,7 @@ mod impls {
709811
impl Hash for str {
710812
#[inline]
711813
fn hash<H: Hasher>(&self, state: &mut H) {
712-
state.write(self.as_bytes());
713-
state.write_u8(0xff)
814+
state.write_str(self);
714815
}
715816
}
716817

@@ -767,7 +868,7 @@ mod impls {
767868
impl<T: Hash> Hash for [T] {
768869
#[inline]
769870
fn hash<H: Hasher>(&self, state: &mut H) {
770-
self.len().hash(state);
871+
state.write_length_prefix(self.len());
771872
Hash::hash_slice(self, state)
772873
}
773874
}

library/core/src/hash/sip.rs

+18
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,11 @@ impl super::Hasher for SipHasher {
233233
self.0.hasher.write(msg)
234234
}
235235

236+
#[inline]
237+
fn write_str(&mut self, s: &str) {
238+
self.0.hasher.write_str(s);
239+
}
240+
236241
#[inline]
237242
fn finish(&self) -> u64 {
238243
self.0.hasher.finish()
@@ -246,6 +251,11 @@ impl super::Hasher for SipHasher13 {
246251
self.hasher.write(msg)
247252
}
248253

254+
#[inline]
255+
fn write_str(&mut self, s: &str) {
256+
self.hasher.write_str(s);
257+
}
258+
249259
#[inline]
250260
fn finish(&self) -> u64 {
251261
self.hasher.finish()
@@ -307,6 +317,14 @@ impl<S: Sip> super::Hasher for Hasher<S> {
307317
self.ntail = left;
308318
}
309319

320+
#[inline]
321+
fn write_str(&mut self, s: &str) {
322+
// This hasher works byte-wise, and `0xFF` cannot show up in a `str`,
323+
// so just hashing the one extra byte is enough to be prefix-free.
324+
self.write(s.as_bytes());
325+
self.write_u8(0xFF);
326+
}
327+
310328
#[inline]
311329
fn finish(&self) -> u64 {
312330
let mut state = self.state;

library/core/tests/hash/mod.rs

+4
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ impl Hasher for MyHasher {
2020
self.hash += *byte as u64;
2121
}
2222
}
23+
fn write_str(&mut self, s: &str) {
24+
self.write(s.as_bytes());
25+
self.write_u8(0xFF);
26+
}
2327
fn finish(&self) -> u64 {
2428
self.hash
2529
}

library/core/tests/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#![feature(future_join)]
3838
#![feature(future_poll_fn)]
3939
#![feature(array_from_fn)]
40+
#![feature(hasher_prefixfree_extras)]
4041
#![feature(hashmap_internals)]
4142
#![feature(try_find)]
4243
#![feature(inline_const)]

library/std/src/collections/hash/map.rs

+8
Original file line numberDiff line numberDiff line change
@@ -3006,11 +3006,19 @@ impl Default for DefaultHasher {
30063006

30073007
#[stable(feature = "hashmap_default_hasher", since = "1.13.0")]
30083008
impl Hasher for DefaultHasher {
3009+
// The underlying `SipHasher13` doesn't override the other
3010+
// `write_*` methods, so it's ok not to forward them here.
3011+
30093012
#[inline]
30103013
fn write(&mut self, msg: &[u8]) {
30113014
self.0.write(msg)
30123015
}
30133016

3017+
#[inline]
3018+
fn write_str(&mut self, s: &str) {
3019+
self.0.write_str(s);
3020+
}
3021+
30143022
#[inline]
30153023
fn finish(&self) -> u64 {
30163024
self.0.finish()

library/std/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@
270270
#![feature(exact_size_is_empty)]
271271
#![feature(extend_one)]
272272
#![feature(float_minimum_maximum)]
273+
#![feature(hasher_prefixfree_extras)]
273274
#![feature(hashmap_internals)]
274275
#![feature(int_error_internals)]
275276
#![feature(maybe_uninit_slice)]

0 commit comments

Comments
 (0)