|
1 |
| -// Sadly, all of the crc crates in Rust appear to be insufficient for high |
2 |
| -// performance algorithms. In fact, the implementation here is insufficient! |
3 |
| -// The best implementation uses the CRC32 instruction from SSE 4.2, but using |
4 |
| -// specialty instructions on stable Rust is an absolute nightmare at the |
5 |
| -// moment. The only way I can think to do it is to write some Assembly in a |
6 |
| -// separate source file and run an Assembler from build.rs. But we'd need to be |
7 |
| -// careful to only do this on platforms that support the CRC32 instruction, |
8 |
| -// which means we'd need to query CPUID. There are a couple Rust crates for |
9 |
| -// that, but of course, they only work on unstable Rust so we'd need to hand |
10 |
| -// roll that too. |
11 |
| -// |
12 |
| -// As a stopgap, we implement the fastest (slicing-by-16) algorithm described |
13 |
| -// here: http://create.stephan-brumme.com/crc32/ |
14 |
| -// (Actually, slicing-by-16 doesn't seem to be measurably faster than |
15 |
| -// slicing-by-8 on my i7-6900K, so use slicing-by-8.) |
16 |
| -// |
17 |
| -// For Snappy, we only care about CRC32C (32 bit, Castagnoli). |
18 |
| -// |
19 |
| -// ---AG |
| 1 | +use crate::bytes; |
| 2 | +use crate::crc32_table::{TABLE, TABLE16}; |
20 | 3 |
|
21 |
| -// We have a bunch of different implementations of crc32 below that seem |
22 |
| -// somewhat useful to leave around for easy benchmarking. |
23 |
| -#![allow(dead_code)] |
| 4 | +/// Provides a simple API to generate "masked" CRC32C checksums specifically |
| 5 | +/// for use in Snappy. When available, this will make use of SSE 4.2 to compute |
| 6 | +/// checksums. Otherwise, it falls back to the only-marginally-slower |
| 7 | +/// "slicing by 16" technique. |
| 8 | +/// |
| 9 | +/// The main purpose of this type is to cache the CPU feature check and expose |
| 10 | +/// a safe API. |
| 11 | +#[derive(Clone, Copy, Debug)] |
| 12 | +pub struct CheckSummer { |
| 13 | + sse42: bool, |
| 14 | +} |
24 | 15 |
|
25 |
| -use crate::bytes; |
26 |
| -use crate::crc32_table::{CASTAGNOLI_POLY, TABLE, TABLE16}; |
| 16 | +impl CheckSummer { |
| 17 | + /// Create a new checksummer that can compute CRC32C checksums on arbitrary |
| 18 | + /// bytes. |
| 19 | + #[cfg(not(target_arch = "x86_64"))] |
| 20 | + pub fn new() -> CheckSummer { |
| 21 | + CheckSummer { sse42: false } |
| 22 | + } |
27 | 23 |
|
28 |
| -/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
29 |
| -pub fn crc32c(buf: &[u8]) -> u32 { |
30 |
| - // I can't measure any difference between slice8 and slice16. |
31 |
| - crc32c_slice8(buf) |
| 24 | + /// Create a new checksummer that can compute CRC32C checksums on arbitrary |
| 25 | + /// bytes. |
| 26 | + #[cfg(target_arch = "x86_64")] |
| 27 | + pub fn new() -> CheckSummer { |
| 28 | + CheckSummer { sse42: is_x86_feature_detected!("sse4.2") } |
| 29 | + } |
| 30 | + |
| 31 | + /// Returns the "masked" CRC32 checksum of `buf` using the Castagnoli |
| 32 | + /// polynomial. This "masked" checksum is defined by the Snappy frame |
| 33 | + /// format. Masking is supposed to make the checksum robust with respect to |
| 34 | + /// the data that contains the checksum itself. |
| 35 | + pub fn crc32c_masked(&self, buf: &[u8]) -> u32 { |
| 36 | + let sum = self.crc32c(buf); |
| 37 | + (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8) |
| 38 | + } |
| 39 | + |
| 40 | + /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
| 41 | + #[cfg(not(target_arch = "x86_64"))] |
| 42 | + fn crc32c(&self, buf: &[u8]) -> u32 { |
| 43 | + crc32c_slice16(buf) |
| 44 | + } |
| 45 | + |
| 46 | + /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
| 47 | + #[cfg(target_arch = "x86_64")] |
| 48 | + fn crc32c(&self, buf: &[u8]) -> u32 { |
| 49 | + if self.sse42 { |
| 50 | + // SAFETY: When sse42 is true, we are guaranteed to be running on |
| 51 | + // a CPU that supports SSE 4.2. |
| 52 | + unsafe { crc32c_sse(buf) } |
| 53 | + } else { |
| 54 | + crc32c_slice16(buf) |
| 55 | + } |
| 56 | + } |
| 57 | +} |
| 58 | + |
| 59 | +#[cfg(target_arch = "x86_64")] |
| 60 | +#[target_feature(enable = "sse4.2")] |
| 61 | +unsafe fn crc32c_sse(buf: &[u8]) -> u32 { |
| 62 | + use std::arch::x86_64::*; |
| 63 | + |
| 64 | + let mut crc = !0u32; |
| 65 | + // SAFETY: This is safe since alignment is handled by `align_to` (oh how I |
| 66 | + // love you) and since 8 adjacent u8s are guaranteed to have the same |
| 67 | + // in-memory representation as a u64 for all possible values. |
| 68 | + let (prefix, u64s, suffix) = buf.align_to::<u64>(); |
| 69 | + for &b in prefix { |
| 70 | + // SAFETY: Safe since we have sse4.2 enabled. |
| 71 | + crc = _mm_crc32_u8(crc, b); |
| 72 | + } |
| 73 | + for &n in u64s { |
| 74 | + // SAFETY: Safe since we have sse4.2 enabled. |
| 75 | + crc = _mm_crc32_u64(crc as u64, n) as u32; |
| 76 | + } |
| 77 | + for &b in suffix { |
| 78 | + // SAFETY: Safe since we have sse4.2 enabled. |
| 79 | + crc = _mm_crc32_u8(crc, b); |
| 80 | + } |
| 81 | + !crc |
32 | 82 | }
|
33 | 83 |
|
34 | 84 | /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
|
@@ -59,62 +109,3 @@ fn crc32c_slice16(mut buf: &[u8]) -> u32 {
|
59 | 109 | }
|
60 | 110 | !crc
|
61 | 111 | }
|
62 |
| - |
63 |
| -/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
64 |
| -fn crc32c_slice8(mut buf: &[u8]) -> u32 { |
65 |
| - let mut crc: u32 = !0; |
66 |
| - while buf.len() >= 8 { |
67 |
| - crc ^= bytes::read_u32_le(buf); |
68 |
| - crc = TABLE16[0][buf[7] as usize] |
69 |
| - ^ TABLE16[1][buf[6] as usize] |
70 |
| - ^ TABLE16[2][buf[5] as usize] |
71 |
| - ^ TABLE16[3][buf[4] as usize] |
72 |
| - ^ TABLE16[4][(crc >> 24) as u8 as usize] |
73 |
| - ^ TABLE16[5][(crc >> 16) as u8 as usize] |
74 |
| - ^ TABLE16[6][(crc >> 8) as u8 as usize] |
75 |
| - ^ TABLE16[7][(crc) as u8 as usize]; |
76 |
| - buf = &buf[8..]; |
77 |
| - } |
78 |
| - for &b in buf { |
79 |
| - crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8); |
80 |
| - } |
81 |
| - !crc |
82 |
| -} |
83 |
| - |
84 |
| -/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
85 |
| -fn crc32c_slice4(mut buf: &[u8]) -> u32 { |
86 |
| - let mut crc: u32 = !0; |
87 |
| - while buf.len() >= 4 { |
88 |
| - crc ^= bytes::read_u32_le(buf); |
89 |
| - crc = TABLE16[0][(crc >> 24) as u8 as usize] |
90 |
| - ^ TABLE16[1][(crc >> 16) as u8 as usize] |
91 |
| - ^ TABLE16[2][(crc >> 8) as u8 as usize] |
92 |
| - ^ TABLE16[3][(crc) as u8 as usize]; |
93 |
| - buf = &buf[4..]; |
94 |
| - } |
95 |
| - for &b in buf { |
96 |
| - crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8); |
97 |
| - } |
98 |
| - !crc |
99 |
| -} |
100 |
| - |
101 |
| -/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
102 |
| -fn crc32c_multiple(buf: &[u8]) -> u32 { |
103 |
| - let mut crc: u32 = !0; |
104 |
| - for &b in buf { |
105 |
| - crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8); |
106 |
| - } |
107 |
| - !crc |
108 |
| -} |
109 |
| - |
110 |
| -/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. |
111 |
| -fn crc32c_bitwise(buf: &[u8]) -> u32 { |
112 |
| - let mut crc: u32 = !0; |
113 |
| - for &b in buf { |
114 |
| - crc ^= b as u32; |
115 |
| - for _ in 0..8 { |
116 |
| - crc = (crc >> 1) ^ ((crc & 1) * CASTAGNOLI_POLY); |
117 |
| - } |
118 |
| - } |
119 |
| - !crc |
120 |
| -} |
0 commit comments