Skip to content

Commit a746ec8

Browse files
committed
perf: accelerate CRC32C with SSE4.2 when possible
We avoid bringing in another crate since the code to do this is so small and reasonably simple. Surprisingly, this results in fairly marginal performance gains. Closes #19, Closes #20
1 parent 739a556 commit a746ec8

File tree

5 files changed

+108
-101
lines changed

5 files changed

+108
-101
lines changed

build.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ fn write_crc_tables(out_dir: &Path) -> Result<()> {
7373
let table = make_table(CASTAGNOLI_POLY);
7474
let table16 = make_table16(CASTAGNOLI_POLY);
7575

76-
writeln!(out, "pub const CASTAGNOLI_POLY: u32 = {};\n", CASTAGNOLI_POLY)?;
77-
7876
writeln!(out, "pub const TABLE: [u32; 256] = [")?;
7977
for &x in table.iter() {
8078
writeln!(out, " {},", x)?;

src/crc32.rs

Lines changed: 78 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,84 @@
1-
// Sadly, all of the crc crates in Rust appear to be insufficient for high
2-
// performance algorithms. In fact, the implementation here is insufficient!
3-
// The best implementation uses the CRC32 instruction from SSE 4.2, but using
4-
// specialty instructions on stable Rust is an absolute nightmare at the
5-
// moment. The only way I can think to do it is to write some Assembly in a
6-
// separate source file and run an Assembler from build.rs. But we'd need to be
7-
// careful to only do this on platforms that support the CRC32 instruction,
8-
// which means we'd need to query CPUID. There are a couple Rust crates for
9-
// that, but of course, they only work on unstable Rust so we'd need to hand
10-
// roll that too.
11-
//
12-
// As a stopgap, we implement the fastest (slicing-by-16) algorithm described
13-
// here: http://create.stephan-brumme.com/crc32/
14-
// (Actually, slicing-by-16 doesn't seem to be measurably faster than
15-
// slicing-by-8 on my i7-6900K, so use slicing-by-8.)
16-
//
17-
// For Snappy, we only care about CRC32C (32 bit, Castagnoli).
18-
//
19-
// ---AG
1+
use crate::bytes;
2+
use crate::crc32_table::{TABLE, TABLE16};
203

21-
// We have a bunch of different implementations of crc32 below that seem
22-
// somewhat useful to leave around for easy benchmarking.
23-
#![allow(dead_code)]
4+
/// Provides a simple API to generate "masked" CRC32C checksums specifically
5+
/// for use in Snappy. When available, this will make use of SSE 4.2 to compute
6+
/// checksums. Otherwise, it falls back to an only-marginally-slower "slicing by
7+
/// 16" technique.
8+
///
9+
/// The main purpose of this type is to cache the CPU feature check and expose
10+
/// a safe API.
11+
#[derive(Clone, Copy, Debug)]
12+
pub struct CheckSummer {
13+
sse42: bool,
14+
}
2415

25-
use crate::bytes;
26-
use crate::crc32_table::{CASTAGNOLI_POLY, TABLE, TABLE16};
16+
impl CheckSummer {
17+
/// Create a new checksummer that can compute CRC32C checksums on arbitrary
18+
/// bytes.
19+
#[cfg(not(target_arch = "x86_64"))]
20+
pub fn new() -> CheckSummer {
21+
CheckSummer { sse42: false }
22+
}
2723

28-
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
29-
pub fn crc32c(buf: &[u8]) -> u32 {
30-
// I can't measure any difference between slice8 and slice16.
31-
crc32c_slice8(buf)
24+
/// Create a new checksummer that can compute CRC32C checksums on arbitrary
25+
/// bytes.
26+
#[cfg(target_arch = "x86_64")]
27+
pub fn new() -> CheckSummer {
28+
CheckSummer { sse42: is_x86_feature_detected!("sse4.2") }
29+
}
30+
31+
/// Returns the "masked" CRC32 checksum of `buf` using the Castagnoli
32+
/// polynomial. This "masked" checksum is defined by the Snappy frame
33+
/// format. Masking is supposed to make the checksum robust with respect to
34+
/// the data that contains the checksum itself.
35+
pub fn crc32c_masked(&self, buf: &[u8]) -> u32 {
36+
let sum = self.crc32c(buf);
37+
(sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
38+
}
39+
40+
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
41+
#[cfg(not(target_arch = "x86_64"))]
42+
fn crc32c(&self, buf: &[u8]) -> u32 {
43+
crc32c_slice16(buf)
44+
}
45+
46+
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
47+
#[cfg(target_arch = "x86_64")]
48+
fn crc32c(&self, buf: &[u8]) -> u32 {
49+
if self.sse42 {
50+
// SAFETY: When sse42 is true, we are guaranteed to be running on
51+
// a CPU that supports SSE 4.2.
52+
unsafe { crc32c_sse(buf) }
53+
} else {
54+
crc32c_slice16(buf)
55+
}
56+
}
57+
}
58+
59+
#[cfg(target_arch = "x86_64")]
60+
#[target_feature(enable = "sse4.2")]
61+
unsafe fn crc32c_sse(buf: &[u8]) -> u32 {
62+
use std::arch::x86_64::*;
63+
64+
let mut crc = !0u32;
65+
// SAFETY: This is safe since alignment is handled by align_to (oh how I
66+
// love you) and since 8 adjacent u8's are guaranteed to have the same
67+
// in-memory representation as u64 for all possible values.
68+
let (prefix, u64s, suffix) = buf.align_to::<u64>();
69+
for &b in prefix {
70+
// SAFETY: Safe since we have sse4.2 enabled.
71+
crc = _mm_crc32_u8(crc, b);
72+
}
73+
for &n in u64s {
74+
// SAFETY: Safe since we have sse4.2 enabled.
75+
crc = _mm_crc32_u64(crc as u64, n) as u32;
76+
}
77+
for &b in suffix {
78+
// SAFETY: Safe since we have sse4.2 enabled.
79+
crc = _mm_crc32_u8(crc, b);
80+
}
81+
!crc
3282
}
3383

3484
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
@@ -59,62 +109,3 @@ fn crc32c_slice16(mut buf: &[u8]) -> u32 {
59109
}
60110
!crc
61111
}
62-
63-
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
64-
fn crc32c_slice8(mut buf: &[u8]) -> u32 {
65-
let mut crc: u32 = !0;
66-
while buf.len() >= 8 {
67-
crc ^= bytes::read_u32_le(buf);
68-
crc = TABLE16[0][buf[7] as usize]
69-
^ TABLE16[1][buf[6] as usize]
70-
^ TABLE16[2][buf[5] as usize]
71-
^ TABLE16[3][buf[4] as usize]
72-
^ TABLE16[4][(crc >> 24) as u8 as usize]
73-
^ TABLE16[5][(crc >> 16) as u8 as usize]
74-
^ TABLE16[6][(crc >> 8) as u8 as usize]
75-
^ TABLE16[7][(crc) as u8 as usize];
76-
buf = &buf[8..];
77-
}
78-
for &b in buf {
79-
crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
80-
}
81-
!crc
82-
}
83-
84-
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
85-
fn crc32c_slice4(mut buf: &[u8]) -> u32 {
86-
let mut crc: u32 = !0;
87-
while buf.len() >= 4 {
88-
crc ^= bytes::read_u32_le(buf);
89-
crc = TABLE16[0][(crc >> 24) as u8 as usize]
90-
^ TABLE16[1][(crc >> 16) as u8 as usize]
91-
^ TABLE16[2][(crc >> 8) as u8 as usize]
92-
^ TABLE16[3][(crc) as u8 as usize];
93-
buf = &buf[4..];
94-
}
95-
for &b in buf {
96-
crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
97-
}
98-
!crc
99-
}
100-
101-
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
102-
fn crc32c_multiple(buf: &[u8]) -> u32 {
103-
let mut crc: u32 = !0;
104-
for &b in buf {
105-
crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
106-
}
107-
!crc
108-
}
109-
110-
/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
111-
fn crc32c_bitwise(buf: &[u8]) -> u32 {
112-
let mut crc: u32 = !0;
113-
for &b in buf {
114-
crc ^= b as u32;
115-
for _ in 0..8 {
116-
crc = (crc >> 1) ^ ((crc & 1) * CASTAGNOLI_POLY);
117-
}
118-
}
119-
!crc
120-
}

src/frame.rs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::bytes;
22
use crate::compress::{max_compress_len, Encoder};
3-
use crate::crc32::crc32c;
3+
use crate::crc32::CheckSummer;
44
use crate::error::Error;
55
use crate::MAX_BLOCK_SIZE;
66

@@ -49,13 +49,6 @@ impl ChunkType {
4949
}
5050
}
5151

52-
pub fn crc32c_masked(buf: &[u8]) -> u32 {
53-
// TODO(burntsushi): SSE 4.2 has a CRC32 instruction that is probably
54-
// faster. Oh how I long for you, SIMD. See src/crc32.rs for a lamentation.
55-
let sum = crc32c(buf);
56-
(sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
57-
}
58-
5952
/// Compress a single frame (or decide to pass it through uncompressed). This
6053
/// will output a frame header in `dst_chunk_header`, and it will return a slice
6154
/// pointing to the data to use in the frame. The `dst_chunk_header` array must
@@ -68,6 +61,7 @@ pub fn crc32c_masked(buf: &[u8]) -> u32 {
6861
/// for a single function to always be in charge of writing to `dst`.
6962
pub fn compress_frame<'a>(
7063
enc: &mut Encoder,
64+
checksummer: CheckSummer,
7165
src: &'a [u8],
7266
dst_chunk_header: &mut [u8],
7367
dst: &'a mut [u8],
@@ -79,7 +73,7 @@ pub fn compress_frame<'a>(
7973
assert_eq!(dst_chunk_header.len(), CHUNK_HEADER_AND_CRC_SIZE);
8074

8175
// Build a checksum of our _uncompressed_ data.
82-
let checksum = crc32c_masked(src);
76+
let checksum = checksummer.crc32c_masked(src);
8377

8478
// Compress the buffer. If compression sucked, throw it out and
8579
// write uncompressed bytes instead. Since our buffer is at most

src/read.rs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@ use std::io;
1919

2020
use crate::bytes;
2121
use crate::compress::Encoder;
22+
use crate::crc32::CheckSummer;
2223
use crate::decompress::{decompress_len, Decoder};
2324
use crate::error::Error;
2425
use crate::frame::{
25-
compress_frame, crc32c_masked, ChunkType, CHUNK_HEADER_AND_CRC_SIZE,
26+
compress_frame, ChunkType, CHUNK_HEADER_AND_CRC_SIZE,
2627
MAX_COMPRESS_BLOCK_SIZE, STREAM_BODY, STREAM_IDENTIFIER,
2728
};
2829
use crate::MAX_BLOCK_SIZE;
@@ -49,6 +50,10 @@ pub struct FrameDecoder<R: io::Read> {
4950
/// A Snappy decoder that we reuse that does the actual block based
5051
/// decompression.
5152
dec: Decoder,
53+
/// A CRC32 checksummer that is configured to either use the portable
54+
/// fallback version or the SSE4.2 accelerated version when the right CPU
55+
/// features are available.
56+
checksummer: CheckSummer,
5257
/// The compressed bytes buffer, taken from the underlying reader.
5358
src: Vec<u8>,
5459
/// The decompressed bytes buffer. Bytes are decompressed from src to dst
@@ -68,6 +73,7 @@ impl<R: io::Read> FrameDecoder<R> {
6873
FrameDecoder {
6974
r: rdr,
7075
dec: Decoder::new(),
76+
checksummer: CheckSummer::new(),
7177
src: vec![0; MAX_COMPRESS_BLOCK_SIZE],
7278
dst: vec![0; MAX_BLOCK_SIZE],
7379
dsts: 0,
@@ -161,7 +167,8 @@ impl<R: io::Read> io::Read for FrameDecoder<R> {
161167
});
162168
}
163169
self.r.read_exact(&mut self.dst[0..n])?;
164-
let got_sum = crc32c_masked(&self.dst[0..n]);
170+
let got_sum =
171+
self.checksummer.crc32c_masked(&self.dst[0..n]);
165172
if expected_sum != got_sum {
166173
fail!(Error::Checksum {
167174
expected: expected_sum,
@@ -190,7 +197,8 @@ impl<R: io::Read> io::Read for FrameDecoder<R> {
190197
}
191198
self.dec
192199
.decompress(&self.src[0..sn], &mut self.dst[0..dn])?;
193-
let got_sum = crc32c_masked(&self.dst[0..dn]);
200+
let got_sum =
201+
self.checksummer.crc32c_masked(&self.dst[0..dn]);
194202
if expected_sum != got_sum {
195203
fail!(Error::Checksum {
196204
expected: expected_sum,
@@ -210,6 +218,7 @@ impl<R: fmt::Debug + io::Read> fmt::Debug for FrameDecoder<R> {
210218
f.debug_struct("FrameDecoder")
211219
.field("r", &self.r)
212220
.field("dec", &self.dec)
221+
.field("checksummer", &self.checksummer)
213222
.field("src", &"[...]")
214223
.field("dst", &"[...]")
215224
.field("dsts", &self.dsts)
@@ -253,6 +262,10 @@ struct Inner<R: io::Read> {
253262
r: R,
254263
/// An encoder that we reuse that does the actual block based compression.
255264
enc: Encoder,
265+
/// A CRC32 checksummer that is configured to either use the portable
266+
/// fallback version or the SSE4.2 accelerated version when the right CPU
267+
/// features are available.
268+
checksummer: CheckSummer,
256269
/// Data taken from the underlying `r`, and not yet compressed.
257270
src: Vec<u8>,
258271
/// Have we written the standard snappy header to `dst` yet?
@@ -266,6 +279,7 @@ impl<R: io::Read> FrameEncoder<R> {
266279
inner: Inner {
267280
r: rdr,
268281
enc: Encoder::new(),
282+
checksummer: CheckSummer::new(),
269283
src: vec![0; MAX_BLOCK_SIZE],
270284
wrote_stream_ident: false,
271285
},
@@ -352,6 +366,7 @@ impl<R: io::Read> Inner<R> {
352366
// put the output in `dst`.
353367
let frame_data = compress_frame(
354368
&mut self.enc,
369+
self.checksummer,
355370
&self.src[..nread],
356371
chunk_header,
357372
remaining_dst,
@@ -377,6 +392,7 @@ impl<R: fmt::Debug + io::Read> fmt::Debug for Inner<R> {
377392
f.debug_struct("Inner")
378393
.field("r", &self.r)
379394
.field("enc", &self.enc)
395+
.field("checksummer", &self.checksummer)
380396
.field("src", &"[...]")
381397
.field("wrote_stream_ident", &self.wrote_stream_ident)
382398
.finish()

src/write.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use std::fmt;
1313
use std::io::{self, Write};
1414

1515
use crate::compress::Encoder;
16+
use crate::crc32::CheckSummer;
1617
pub use crate::error::IntoInnerError;
1718
use crate::frame::{
1819
compress_frame, CHUNK_HEADER_AND_CRC_SIZE, MAX_COMPRESS_BLOCK_SIZE,
@@ -52,6 +53,10 @@ struct Inner<W> {
5253
w: W,
5354
/// An encoder that we reuse that does the actual block based compression.
5455
enc: Encoder,
56+
/// A CRC32 checksummer that is configured to either use the portable
57+
/// fallback version or the SSE4.2 accelerated version when the right CPU
58+
/// features are available.
59+
checksummer: CheckSummer,
5560
/// The compressed bytes buffer. Bytes are compressed from src (usually)
5661
/// to dst before being written to w.
5762
dst: Vec<u8>,
@@ -70,6 +75,7 @@ impl<W: io::Write> FrameEncoder<W> {
7075
inner: Some(Inner {
7176
w: wtr,
7277
enc: Encoder::new(),
78+
checksummer: CheckSummer::new(),
7379
dst: vec![0; MAX_COMPRESS_BLOCK_SIZE],
7480
wrote_stream_ident: false,
7581
chunk_header: [0; CHUNK_HEADER_AND_CRC_SIZE],
@@ -164,6 +170,7 @@ impl<W: io::Write> Inner<W> {
164170

165171
let frame_data = compress_frame(
166172
&mut self.enc,
173+
self.checksummer,
167174
src,
168175
&mut self.chunk_header,
169176
&mut self.dst,
@@ -191,6 +198,7 @@ impl<W: fmt::Debug + io::Write> fmt::Debug for Inner<W> {
191198
f.debug_struct("Inner")
192199
.field("w", &self.w)
193200
.field("enc", &self.enc)
201+
.field("checksummer", &self.checksummer)
194202
.field("dst", &"[...]")
195203
.field("wrote_stream_ident", &self.wrote_stream_ident)
196204
.field("chunk_header", &self.chunk_header)

0 commit comments

Comments
 (0)