Skip to content

Commit 03a3ebd

Browse files
committed
sha2: Reduce memory pressure
1 parent b2f6d86 commit 03a3ebd

File tree

2 files changed

+14
-66
lines changed

2 files changed

+14
-66
lines changed

sha2/src/consts.rs

Lines changed: 0 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -163,56 +163,6 @@ pub const K64X2: [[u64; 2]; 40] = [
163163
[K64[79], K64[78]],
164164
];
165165

166-
macro_rules! dup_array {
167-
([$([$a:expr, $b:expr]),*,]) => {[
168-
$($b, $a, $b, $a),*,
169-
]}
170-
}
171-
172-
/// Constants necessary for SHA-512 family of digests.
173-
pub const K64X4: [u64; 160] = dup_array!([
174-
[K64[1], K64[0]],
175-
[K64[3], K64[2]],
176-
[K64[5], K64[4]],
177-
[K64[7], K64[6]],
178-
[K64[9], K64[8]],
179-
[K64[11], K64[10]],
180-
[K64[13], K64[12]],
181-
[K64[15], K64[14]],
182-
[K64[17], K64[16]],
183-
[K64[19], K64[18]],
184-
[K64[21], K64[20]],
185-
[K64[23], K64[22]],
186-
[K64[25], K64[24]],
187-
[K64[27], K64[26]],
188-
[K64[29], K64[28]],
189-
[K64[31], K64[30]],
190-
[K64[33], K64[32]],
191-
[K64[35], K64[34]],
192-
[K64[37], K64[36]],
193-
[K64[39], K64[38]],
194-
[K64[41], K64[40]],
195-
[K64[43], K64[42]],
196-
[K64[45], K64[44]],
197-
[K64[47], K64[46]],
198-
[K64[49], K64[48]],
199-
[K64[51], K64[50]],
200-
[K64[53], K64[52]],
201-
[K64[55], K64[54]],
202-
[K64[57], K64[56]],
203-
[K64[59], K64[58]],
204-
[K64[61], K64[60]],
205-
[K64[63], K64[62]],
206-
[K64[65], K64[64]],
207-
[K64[67], K64[66]],
208-
[K64[69], K64[68]],
209-
[K64[71], K64[70]],
210-
[K64[73], K64[72]],
211-
[K64[75], K64[74]],
212-
[K64[77], K64[76]],
213-
[K64[79], K64[78]],
214-
]);
215-
216166
pub static H224: [u32; STATE_LEN] = [
217167
0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
218168
];

sha2/src/sha512/x86.rs

Lines changed: 14 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,9 @@ use core::arch::x86::*;
99
#[cfg(target_arch = "x86_64")]
1010
use core::arch::x86_64::*;
1111

12-
use crate::consts::{K64, K64X4};
12+
use crate::consts::K64;
1313

14-
cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3");
14+
cpufeatures::new!(avx2_cpuid, "avx2");
1515

1616
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
1717
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
@@ -25,7 +25,7 @@ pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
2525
}
2626
}
2727

28-
#[target_feature(enable = "avx,avx2,sse2,sse3")]
28+
#[target_feature(enable = "avx2")]
2929
unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
3030
let mut start_block = 0;
3131

@@ -110,10 +110,9 @@ unsafe fn load_data_avx2(
110110
x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _), 0);
111111

112112
x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
113-
let y = _mm256_add_epi64(
114-
x[$i],
115-
_mm256_loadu_si256(&K64X4[4 * $i] as *const u64 as *const _),
116-
);
113+
114+
let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
115+
let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
117116

118117
_mm_store_si128(
119118
&mut ms[2 * $i] as *mut u64 as *mut _,
@@ -135,7 +134,8 @@ unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &
135134

136135
for _ in 0..4 {
137136
for j in 0..8 {
138-
let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _);
137+
let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
138+
let y = sha512_update_x_avx(x, k64);
139139

140140
sha_round(current_state, ms[2 * j]);
141141
sha_round(current_state, ms[2 * j + 1]);
@@ -153,11 +153,12 @@ unsafe fn rounds_0_63_avx2(
153153
ms: &mut MsgSchedule,
154154
t2: &mut RoundStates,
155155
) {
156-
let mut k64x4_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM;
156+
let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;
157157

158158
for i in 1..5 {
159159
for j in 0..8 {
160-
let y = sha512_update_x_avx2(x, &K64X4[k64x4_idx] as *const u64 as *const _);
160+
let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
161+
let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));
161162

162163
sha_round(current_state, ms[2 * j]);
163164
sha_round(current_state, ms[2 * j + 1]);
@@ -171,7 +172,7 @@ unsafe fn rounds_0_63_avx2(
171172
_mm256_extracti128_si256(y, 1),
172173
);
173174

174-
k64x4_idx += 4;
175+
k64x4_idx += 2;
175176
}
176177
}
177178
}
@@ -249,14 +250,13 @@ unsafe fn accumulate_state(dst: &mut State, src: &State) {
249250

250251
macro_rules! fn_sha512_update_x {
251252
($name:ident, $ty:ident, {
252-
LOAD = $LOAD:ident,
253253
ADD64 = $ADD64:ident,
254254
ALIGNR8 = $ALIGNR8:ident,
255255
SRL64 = $SRL64:ident,
256256
SLL64 = $SLL64:ident,
257257
XOR = $XOR:ident,
258258
}) => {
259-
unsafe fn $name(x: &mut [$ty; 8], k64_p: *const $ty) -> $ty {
259+
unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
260260
// q[2:1]
261261
let mut t0 = $ALIGNR8(x[1], x[0], 8);
262262
// q[10:9]
@@ -320,13 +320,12 @@ macro_rules! fn_sha512_update_x {
320320
x[6] = x[7];
321321
x[7] = temp;
322322

323-
$ADD64(x[7], $LOAD(k64_p))
323+
$ADD64(x[7], k64)
324324
}
325325
};
326326
}
327327

328328
fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
329-
LOAD = _mm_loadu_si128,
330329
ADD64 = _mm_add_epi64,
331330
ALIGNR8 = _mm_alignr_epi8,
332331
SRL64 = _mm_srli_epi64,
@@ -335,7 +334,6 @@ fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
335334
});
336335

337336
fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
338-
LOAD = _mm256_loadu_si256,
339337
ADD64 = _mm256_add_epi64,
340338
ALIGNR8 = _mm256_alignr_epi8,
341339
SRL64 = _mm256_srli_epi64,

0 commit comments

Comments
 (0)