perf: accelerate CRC32C with SSE4.2 when possible

BurntSushi · BurntSushi · commit a746ec8ba37e · 2020-02-14T11:16:07.000-05:00
We avoid bringing in another crate since the code to do this is so small and reasonably simple. Surprisingly, this results in fairly marginal performance gains. Closes #19, Closes #20
diff --git a/build.rs b/build.rs
@@ -73,8 +73,6 @@ fn write_crc_tables(out_dir: &Path) -> Result<()> {
     let table = make_table(CASTAGNOLI_POLY);
     let table16 = make_table16(CASTAGNOLI_POLY);
 
-    writeln!(out, "pub const CASTAGNOLI_POLY: u32 = {};\n", CASTAGNOLI_POLY)?;
-
     writeln!(out, "pub const TABLE: [u32; 256] = [")?;
     for &x in table.iter() {
         writeln!(out, "    {},", x)?;
diff --git a/src/crc32.rs b/src/crc32.rs
@@ -1,34 +1,84 @@
-// Sadly, all of the crc crates in Rust appear to be insufficient for high
-// performance algorithms. In fact, the implementation here is insufficient!
-// The best implementation uses the CRC32 instruction from SSE 4.2, but using
-// specialty instructions on stable Rust is an absolute nightmare at the
-// moment. The only way I can think to do it is to write some Assembly in a
-// separate source file and run an Assembler from build.rs. But we'd need to be
-// careful to only do this on platforms that support the CRC32 instruction,
-// which means we'd need to query CPUID. There are a couple Rust crates for
-// that, but of course, they only work on unstable Rust so we'd need to hand
-// roll that too.
-//
-// As a stopgap, we implement the fastest (slicing-by-16) algorithm described
-// here: http://create.stephan-brumme.com/crc32/
-// (Actually, slicing-by-16 doesn't seem to be measurably faster than
-// slicing-by-8 on my i7-6900K, so use slicing-by-8.)
-//
-// For Snappy, we only care about CRC32C (32 bit, Castagnoli).
-//
-// ---AG
+use crate::bytes;
+use crate::crc32_table::{TABLE, TABLE16};
 
-// We have a bunch of different implementations of crc32 below that seem
-// somewhat useful to leave around for easy benchmarking.
-#![allow(dead_code)]
+/// Provides a simple API to generate "masked" CRC32C checksums specifically
+/// for use in Snappy. When available, this will make use of SSE 4.2 to compute
+/// checksums. Otherwise, it falls back to the only-marginally-slower
+/// "slicing by 16" technique.
+///
+/// The main purpose of this type is to cache the CPU feature check and expose
+/// a safe API.
+#[derive(Clone, Copy, Debug)]
+pub struct CheckSummer {
+    sse42: bool,
+}
 
-use crate::bytes;
-use crate::crc32_table::{CASTAGNOLI_POLY, TABLE, TABLE16};
+impl CheckSummer {
+    /// Create a new checksummer that can compute CRC32C checksums on arbitrary
+    /// bytes.
+    #[cfg(not(target_arch = "x86_64"))]
+    pub fn new() -> CheckSummer {
+        CheckSummer { sse42: false }
+    }
 
-/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
-pub fn crc32c(buf: &[u8]) -> u32 {
-    // I can't measure any difference between slice8 and slice16.
-    crc32c_slice8(buf)
+    /// Create a new checksummer that can compute CRC32C checksums on arbitrary
+    /// bytes.
+    #[cfg(target_arch = "x86_64")]
+    pub fn new() -> CheckSummer {
+        CheckSummer { sse42: is_x86_feature_detected!("sse4.2") }
+    }
+
+    /// Returns the "masked" CRC32 checksum of `buf` using the Castagnoli
+    /// polynomial. This "masked" checksum is defined by the Snappy frame
+    /// format. Masking is supposed to make the checksum robust with respect to
+    /// the data that contains the checksum itself.
+    pub fn crc32c_masked(&self, buf: &[u8]) -> u32 {
+        let sum = self.crc32c(buf);
+        (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
+    }
+
+    /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
+    #[cfg(not(target_arch = "x86_64"))]
+    fn crc32c(&self, buf: &[u8]) -> u32 {
+        crc32c_slice16(buf)
+    }
+
+    /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
+    #[cfg(target_arch = "x86_64")]
+    fn crc32c(&self, buf: &[u8]) -> u32 {
+        if self.sse42 {
+            // SAFETY: When sse42 is true, we are guaranteed to be running on
+            // a CPU that supports SSE 4.2.
+            unsafe { crc32c_sse(buf) }
+        } else {
+            crc32c_slice16(buf)
+        }
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse4.2")]
+unsafe fn crc32c_sse(buf: &[u8]) -> u32 {
+    use std::arch::x86_64::*;
+
+    let mut crc = !0u32;
+    // SAFETY: This is safe since alignment is handled by align_to (oh how I
+    // love you) and since 8 adjacent u8's are guaranteed to have the same
+    // in-memory representation as u64 for all possible values.
+    let (prefix, u64s, suffix) = buf.align_to::<u64>();
+    for &b in prefix {
+        // SAFETY: Safe since we have sse4.2 enabled.
+        crc = _mm_crc32_u8(crc, b);
+    }
+    for &n in u64s {
+        // SAFETY: Safe since we have sse4.2 enabled.
+        crc = _mm_crc32_u64(crc as u64, n) as u32;
+    }
+    for &b in suffix {
+        // SAFETY: Safe since we have sse4.2 enabled.
+        crc = _mm_crc32_u8(crc, b);
+    }
+    !crc
 }
 
 /// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
@@ -59,62 +109,3 @@ fn crc32c_slice16(mut buf: &[u8]) -> u32 {
     }
     !crc
 }
-
-/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
-fn crc32c_slice8(mut buf: &[u8]) -> u32 {
-    let mut crc: u32 = !0;
-    while buf.len() >= 8 {
-        crc ^= bytes::read_u32_le(buf);
-        crc = TABLE16[0][buf[7] as usize]
-            ^ TABLE16[1][buf[6] as usize]
-            ^ TABLE16[2][buf[5] as usize]
-            ^ TABLE16[3][buf[4] as usize]
-            ^ TABLE16[4][(crc >> 24) as u8 as usize]
-            ^ TABLE16[5][(crc >> 16) as u8 as usize]
-            ^ TABLE16[6][(crc >> 8) as u8 as usize]
-            ^ TABLE16[7][(crc) as u8 as usize];
-        buf = &buf[8..];
-    }
-    for &b in buf {
-        crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
-    }
-    !crc
-}
-
-/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
-fn crc32c_slice4(mut buf: &[u8]) -> u32 {
-    let mut crc: u32 = !0;
-    while buf.len() >= 4 {
-        crc ^= bytes::read_u32_le(buf);
-        crc = TABLE16[0][(crc >> 24) as u8 as usize]
-            ^ TABLE16[1][(crc >> 16) as u8 as usize]
-            ^ TABLE16[2][(crc >> 8) as u8 as usize]
-            ^ TABLE16[3][(crc) as u8 as usize];
-        buf = &buf[4..];
-    }
-    for &b in buf {
-        crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
-    }
-    !crc
-}
-
-/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
-fn crc32c_multiple(buf: &[u8]) -> u32 {
-    let mut crc: u32 = !0;
-    for &b in buf {
-        crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
-    }
-    !crc
-}
-
-/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
-fn crc32c_bitwise(buf: &[u8]) -> u32 {
-    let mut crc: u32 = !0;
-    for &b in buf {
-        crc ^= b as u32;
-        for _ in 0..8 {
-            crc = (crc >> 1) ^ ((crc & 1) * CASTAGNOLI_POLY);
-        }
-    }
-    !crc
-}
diff --git a/src/frame.rs b/src/frame.rs
@@ -1,6 +1,6 @@
 use crate::bytes;
 use crate::compress::{max_compress_len, Encoder};
-use crate::crc32::crc32c;
+use crate::crc32::CheckSummer;
 use crate::error::Error;
 use crate::MAX_BLOCK_SIZE;
 
@@ -49,13 +49,6 @@ impl ChunkType {
     }
 }
 
-pub fn crc32c_masked(buf: &[u8]) -> u32 {
-    // TODO(burntsushi): SSE 4.2 has a CRC32 instruction that is probably
-    // faster. Oh how I long for you, SIMD. See src/crc32.rs for a lamentation.
-    let sum = crc32c(buf);
-    (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
-}
-
 /// Compress a single frame (or decide to pass it through uncompressed). This
 /// will output a frame header in `dst_chunk_header`, and it will return a slice
 /// pointing to the data to use in the frame. The `dst_chunk_header` array must
@@ -68,6 +61,7 @@ pub fn crc32c_masked(buf: &[u8]) -> u32 {
 /// for a single function to always be in charge of writing to `dst`.
 pub fn compress_frame<'a>(
     enc: &mut Encoder,
+    checksummer: CheckSummer,
     src: &'a [u8],
     dst_chunk_header: &mut [u8],
     dst: &'a mut [u8],
@@ -79,7 +73,7 @@ pub fn compress_frame<'a>(
     assert_eq!(dst_chunk_header.len(), CHUNK_HEADER_AND_CRC_SIZE);
 
     // Build a checksum of our _uncompressed_ data.
-    let checksum = crc32c_masked(src);
+    let checksum = checksummer.crc32c_masked(src);
 
     // Compress the buffer. If compression sucked, throw it out and
     // write uncompressed bytes instead. Since our buffer is at most
diff --git a/src/read.rs b/src/read.rs
@@ -19,10 +19,11 @@ use std::io;
 
 use crate::bytes;
 use crate::compress::Encoder;
+use crate::crc32::CheckSummer;
 use crate::decompress::{decompress_len, Decoder};
 use crate::error::Error;
 use crate::frame::{
-    compress_frame, crc32c_masked, ChunkType, CHUNK_HEADER_AND_CRC_SIZE,
+    compress_frame, ChunkType, CHUNK_HEADER_AND_CRC_SIZE,
     MAX_COMPRESS_BLOCK_SIZE, STREAM_BODY, STREAM_IDENTIFIER,
 };
 use crate::MAX_BLOCK_SIZE;
@@ -49,6 +50,10 @@ pub struct FrameDecoder<R: io::Read> {
     /// A Snappy decoder that we reuse that does the actual block based
     /// decompression.
     dec: Decoder,
+    /// A CRC32 checksummer that is configured to either use the portable
+    /// fallback version or the SSE4.2 accelerated version when the right CPU
+    /// features are available.
+    checksummer: CheckSummer,
     /// The compressed bytes buffer, taken from the underlying reader.
     src: Vec<u8>,
     /// The decompressed bytes buffer. Bytes are decompressed from src to dst
@@ -68,6 +73,7 @@ impl<R: io::Read> FrameDecoder<R> {
         FrameDecoder {
             r: rdr,
             dec: Decoder::new(),
+            checksummer: CheckSummer::new(),
             src: vec![0; MAX_COMPRESS_BLOCK_SIZE],
             dst: vec![0; MAX_BLOCK_SIZE],
             dsts: 0,
@@ -161,7 +167,8 @@ impl<R: io::Read> io::Read for FrameDecoder<R> {
                         });
                     }
                     self.r.read_exact(&mut self.dst[0..n])?;
-                    let got_sum = crc32c_masked(&self.dst[0..n]);
+                    let got_sum =
+                        self.checksummer.crc32c_masked(&self.dst[0..n]);
                     if expected_sum != got_sum {
                         fail!(Error::Checksum {
                             expected: expected_sum,
@@ -190,7 +197,8 @@ impl<R: io::Read> io::Read for FrameDecoder<R> {
                     }
                     self.dec
                         .decompress(&self.src[0..sn], &mut self.dst[0..dn])?;
-                    let got_sum = crc32c_masked(&self.dst[0..dn]);
+                    let got_sum =
+                        self.checksummer.crc32c_masked(&self.dst[0..dn]);
                     if expected_sum != got_sum {
                         fail!(Error::Checksum {
                             expected: expected_sum,
@@ -210,6 +218,7 @@ impl<R: fmt::Debug + io::Read> fmt::Debug for FrameDecoder<R> {
         f.debug_struct("FrameDecoder")
             .field("r", &self.r)
             .field("dec", &self.dec)
+            .field("checksummer", &self.checksummer)
             .field("src", &"[...]")
             .field("dst", &"[...]")
             .field("dsts", &self.dsts)
@@ -253,6 +262,10 @@ struct Inner<R: io::Read> {
     r: R,
     /// An encoder that we reuse that does the actual block based compression.
     enc: Encoder,
+    /// A CRC32 checksummer that is configured to either use the portable
+    /// fallback version or the SSE4.2 accelerated version when the right CPU
+    /// features are available.
+    checksummer: CheckSummer,
     /// Data taken from the underlying `r`, and not yet compressed.
     src: Vec<u8>,
     /// Have we written the standard snappy header to `dst` yet?
@@ -266,6 +279,7 @@ impl<R: io::Read> FrameEncoder<R> {
             inner: Inner {
                 r: rdr,
                 enc: Encoder::new(),
+                checksummer: CheckSummer::new(),
                 src: vec![0; MAX_BLOCK_SIZE],
                 wrote_stream_ident: false,
             },
@@ -352,6 +366,7 @@ impl<R: io::Read> Inner<R> {
         // put the output in `dst`.
         let frame_data = compress_frame(
             &mut self.enc,
+            self.checksummer,
             &self.src[..nread],
             chunk_header,
             remaining_dst,
@@ -377,6 +392,7 @@ impl<R: fmt::Debug + io::Read> fmt::Debug for Inner<R> {
         f.debug_struct("Inner")
             .field("r", &self.r)
             .field("enc", &self.enc)
+            .field("checksummer", &self.checksummer)
             .field("src", &"[...]")
             .field("wrote_stream_ident", &self.wrote_stream_ident)
             .finish()
diff --git a/src/write.rs b/src/write.rs
@@ -13,6 +13,7 @@ use std::fmt;
 use std::io::{self, Write};
 
 use crate::compress::Encoder;
+use crate::crc32::CheckSummer;
 pub use crate::error::IntoInnerError;
 use crate::frame::{
     compress_frame, CHUNK_HEADER_AND_CRC_SIZE, MAX_COMPRESS_BLOCK_SIZE,
@@ -52,6 +53,10 @@ struct Inner<W> {
     w: W,
     /// An encoder that we reuse that does the actual block based compression.
     enc: Encoder,
+    /// A CRC32 checksummer that is configured to either use the portable
+    /// fallback version or the SSE4.2 accelerated version when the right CPU
+    /// features are available.
+    checksummer: CheckSummer,
     /// The compressed bytes buffer. Bytes are compressed from src (usually)
     /// to dst before being written to w.
     dst: Vec<u8>,
@@ -70,6 +75,7 @@ impl<W: io::Write> FrameEncoder<W> {
             inner: Some(Inner {
                 w: wtr,
                 enc: Encoder::new(),
+                checksummer: CheckSummer::new(),
                 dst: vec![0; MAX_COMPRESS_BLOCK_SIZE],
                 wrote_stream_ident: false,
                 chunk_header: [0; CHUNK_HEADER_AND_CRC_SIZE],
@@ -164,6 +170,7 @@ impl<W: io::Write> Inner<W> {
 
             let frame_data = compress_frame(
                 &mut self.enc,
+                self.checksummer,
                 src,
                 &mut self.chunk_header,
                 &mut self.dst,
@@ -191,6 +198,7 @@ impl<W: fmt::Debug + io::Write> fmt::Debug for Inner<W> {
         f.debug_struct("Inner")
             .field("w", &self.w)
             .field("enc", &self.enc)
+            .field("checksummer", &self.checksummer)
             .field("dst", &"[...]")
             .field("wrote_stream_ident", &self.wrote_stream_ident)
             .field("chunk_header", &self.chunk_header)