Skip to content

Commit e8695f9

Browse files
committed
Hasher and Hash updates
- str now emits a delimiter of its own length
- str and [u8] hash the same
- Hasher::delimit customizes how a delimiter is handled

Add method `fn delimit(&mut self, len: usize)` to Hasher. This method makes the hasher emit a delimiter for a chunk of length `len`. For example, str and slices both emit a delimiter for their length during hashing.

The Hasher impl decides how to implement the delimiter. By default it emits the whole `usize` as data to the hashing stream.

SipHash will ignore the first delimiter and hash the others as data. Since it hashes in the total length, hashing all but one delimiter is equivalent to hashing all lengths.

For the next example, take something like farmhash that is not designed for streaming hashing. It could be implemented like this:

- Every call to Hasher::write runs the whole hashing algorithm. The previous hash is XORed together with the new result.
- Delimiters are ignored, since the length of each chunk to write is already hashed in.

What follows is a sketch of how siphash and farmhash could work with this change:

When hashing a: &[u8]
- SipHash: `write(a); finish();`
- Farmhash: `hash = write(a); hash`

Both SipHash and Farmhash will hash just the bytes of a string in a single Hasher::write and a single Hasher::finish.

When hashing (a: &[u8], b: &[u8]):
- SipHash: `write(a); write(b.len()); write(b); finish();`
- Farmhash: `hash = write(a); hash ^= write(b); hash`
1 parent 2375743 commit e8695f9

File tree

3 files changed

+20
-19
lines changed

3 files changed

+20
-19
lines changed

src/libcore/hash/mod.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@ pub trait Hasher {
118118
#[stable(feature = "rust1", since = "1.0.0")]
119119
fn write(&mut self, bytes: &[u8]);
120120

121+
/// Emit a delimiter for a chunk of data of length `len`; by default the whole `usize` is written to the stream as data
122+
#[inline]
123+
#[unstable(feature = "hash_delimit", since = "1.4.0", issue="0")]
124+
fn delimit(&mut self, len: usize) {
125+
self.write_usize(len);
126+
}
127+
121128
/// Write a single `u8` into this hasher
122129
#[inline]
123130
#[stable(feature = "hasher_write", since = "1.3.0")]
@@ -230,8 +237,9 @@ mod impls {
230237
#[stable(feature = "rust1", since = "1.0.0")]
231238
impl Hash for str {
232239
fn hash<H: Hasher>(&self, state: &mut H) {
240+
// Emit a length delimiter, as the `[T]` impl does, so that `str` and `[u8]` hash the same
241+
state.delimit(self.len());
233242
state.write(self.as_bytes());
234-
state.write_u8(0xff)
235243
}
236244
}
237245

@@ -272,7 +280,7 @@ mod impls {
272280
#[stable(feature = "rust1", since = "1.0.0")]
273281
impl<T: Hash> Hash for [T] {
274282
fn hash<H: Hasher>(&self, state: &mut H) {
275-
self.len().hash(state);
283+
state.delimit(self.len());
276284
Hash::hash_slice(self, state)
277285
}
278286
}

src/libcore/hash/sip.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,14 @@ impl Hasher for SipHasher {
192192
self.write(msg)
193193
}
194194

195+
#[inline]
196+
fn delimit(&mut self, len: usize) {
197+
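// Skip the first delimiter: the total length is hashed in anyway, so hashing all but one delimiter is equivalent to hashing every length. NOTE(review): assuming `self.length` counts bytes written so far, this skips every delimiter emitted before the first non-empty write, not just the first one — confirm that is intended.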
198+
if self.length > 0 {
199+
self.write_usize(len);
200+
}
201+
}
202+
195203
#[inline]
196204
fn finish(&self) -> u64 {
197205
let mut v0 = self.v0;

src/libstd/sys/common/wtf8.rs

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ impl CodePoint {
124124
///
125125
/// Similar to `String`, but can additionally contain surrogate code points
126126
/// if they’re not in a surrogate pair.
127-
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
127+
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Hash)]
128128
pub struct Wtf8Buf {
129129
bytes: Vec<u8>
130130
}
@@ -382,6 +382,7 @@ impl Extend<CodePoint> for Wtf8Buf {
382382
///
383383
/// Similar to `&str`, but can additionally contain surrogate code points
384384
/// if they’re not in a surrogate pair.
385+
#[derive(Hash)]
385386
pub struct Wtf8 {
386387
bytes: [u8]
387388
}
@@ -796,22 +797,6 @@ impl Hash for CodePoint {
796797
}
797798
}
798799

799-
impl Hash for Wtf8Buf {
800-
#[inline]
801-
fn hash<H: Hasher>(&self, state: &mut H) {
802-
state.write(&self.bytes);
803-
0xfeu8.hash(state)
804-
}
805-
}
806-
807-
impl Hash for Wtf8 {
808-
#[inline]
809-
fn hash<H: Hasher>(&self, state: &mut H) {
810-
state.write(&self.bytes);
811-
0xfeu8.hash(state)
812-
}
813-
}
814-
815800
impl AsciiExt for Wtf8 {
816801
type Owned = Wtf8Buf;
817802

0 commit comments

Comments
 (0)