Skip to content

Commit 72d3f4b

Browse files
committed
Revert "std.crypto.onetimeauth.ghash: faster GHASH on modern CPUs (ziglang#13566)"
This reverts commit 7cfeae1, which is causing std lib tests to fail on wasm32-wasi.
1 parent 88a0f3d commit 72d3f4b

File tree

1 file changed

+74
-124
lines changed

1 file changed

+74
-124
lines changed

lib/std/crypto/ghash.zig

Lines changed: 74 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,12 @@ pub const Ghash = struct {
1818
pub const mac_length = 16;
1919
pub const key_length = 16;
2020

21-
const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 2;
21+
const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 4;
22+
const agg_2_treshold = 5;
2223
const agg_4_treshold = 22;
2324
const agg_8_treshold = 84;
2425
const agg_16_treshold = 328;
2526

26-
// Before the Haswell architecture, the carryless multiplication instruction was
27-
// extremely slow. Even with 128-bit operands, using Karatsuba multiplication was
28-
// thus faster than a schoolbook multiplication.
29-
// This is no longer the case -- Modern CPUs, including ARM-based ones, have a fast
30-
// carryless multiplication instruction; using 4 multiplications is now faster than
31-
// 3 multiplications with extra shifts and additions.
32-
const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
33-
3427
hx: [pc_count]Precomp,
3528
acc: u128 = 0,
3629

@@ -50,10 +43,10 @@ pub const Ghash = struct {
5043
var hx: [pc_count]Precomp = undefined;
5144
hx[0] = h;
5245
hx[1] = gcmReduce(clsq128(hx[0])); // h^2
46+
hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
47+
hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
5348

5449
if (builtin.mode != .ReleaseSmall) {
55-
hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
56-
hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
5750
if (block_count >= agg_8_treshold) {
5851
hx[4] = gcmReduce(clmul128(hx[3], h)); // h^5
5952
hx[5] = gcmReduce(clsq128(hx[2])); // h^6 = h^3^2
@@ -76,71 +69,47 @@ pub const Ghash = struct {
7669
return Ghash.initForBlockCount(key, math.maxInt(usize));
7770
}
7871

79-
const Selector = enum { lo, hi, hi_lo };
72+
const Selector = enum { lo, hi };
8073

8174
// Carryless multiplication of two 64-bit integers for x86_64.
8275
inline fn clmulPclmul(x: u128, y: u128, comptime half: Selector) u128 {
83-
switch (half) {
84-
.hi => {
85-
const product = asm (
86-
\\ vpclmulqdq $0x11, %[x], %[y], %[out]
87-
: [out] "=x" (-> @Vector(2, u64)),
88-
: [x] "x" (@bitCast(@Vector(2, u64), x)),
89-
[y] "x" (@bitCast(@Vector(2, u64), y)),
90-
);
91-
return @bitCast(u128, product);
92-
},
93-
.lo => {
94-
const product = asm (
95-
\\ vpclmulqdq $0x00, %[x], %[y], %[out]
96-
: [out] "=x" (-> @Vector(2, u64)),
97-
: [x] "x" (@bitCast(@Vector(2, u64), x)),
98-
[y] "x" (@bitCast(@Vector(2, u64), y)),
99-
);
100-
return @bitCast(u128, product);
101-
},
102-
.hi_lo => {
103-
const product = asm (
104-
\\ vpclmulqdq $0x10, %[x], %[y], %[out]
105-
: [out] "=x" (-> @Vector(2, u64)),
106-
: [x] "x" (@bitCast(@Vector(2, u64), x)),
107-
[y] "x" (@bitCast(@Vector(2, u64), y)),
108-
);
109-
return @bitCast(u128, product);
110-
},
76+
if (half == .hi) {
77+
const product = asm (
78+
\\ vpclmulqdq $0x11, %[x], %[y], %[out]
79+
: [out] "=x" (-> @Vector(2, u64)),
80+
: [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
81+
[y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
82+
);
83+
return @bitCast(u128, product);
84+
} else {
85+
const product = asm (
86+
\\ vpclmulqdq $0x00, %[x], %[y], %[out]
87+
: [out] "=x" (-> @Vector(2, u64)),
88+
: [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
89+
[y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
90+
);
91+
return @bitCast(u128, product);
11192
}
11293
}
11394

11495
// Carryless multiplication of two 64-bit integers for ARM crypto.
11596
inline fn clmulPmull(x: u128, y: u128, comptime half: Selector) u128 {
116-
switch (half) {
117-
.hi => {
118-
const product = asm (
119-
\\ pmull2 %[out].1q, %[x].2d, %[y].2d
120-
: [out] "=w" (-> @Vector(2, u64)),
121-
: [x] "w" (@bitCast(@Vector(2, u64), x)),
122-
[y] "w" (@bitCast(@Vector(2, u64), y)),
123-
);
124-
return @bitCast(u128, product);
125-
},
126-
.lo => {
127-
const product = asm (
128-
\\ pmull %[out].1q, %[x].1d, %[y].1d
129-
: [out] "=w" (-> @Vector(2, u64)),
130-
: [x] "w" (@bitCast(@Vector(2, u64), x)),
131-
[y] "w" (@bitCast(@Vector(2, u64), y)),
132-
);
133-
return @bitCast(u128, product);
134-
},
135-
.hi_lo => {
136-
const product = asm (
137-
\\ pmull %[out].1q, %[x].1d, %[y].1d
138-
: [out] "=w" (-> @Vector(2, u64)),
139-
: [x] "w" (@bitCast(@Vector(2, u64), x >> 64)),
140-
[y] "w" (@bitCast(@Vector(2, u64), y)),
141-
);
142-
return @bitCast(u128, product);
143-
},
97+
if (half == .hi) {
98+
const product = asm (
99+
\\ pmull2 %[out].1q, %[x].2d, %[y].2d
100+
: [out] "=w" (-> @Vector(2, u64)),
101+
: [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
102+
[y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
103+
);
104+
return @bitCast(u128, product);
105+
} else {
106+
const product = asm (
107+
\\ pmull %[out].1q, %[x].1d, %[y].1d
108+
: [out] "=w" (-> @Vector(2, u64)),
109+
: [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
110+
[y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
111+
);
112+
return @bitCast(u128, product);
144113
}
145114
}
146115

@@ -175,63 +144,38 @@ pub const Ghash = struct {
175144
(z3 & 0x88888888888888888888888888888888) ^ extra;
176145
}
177146

178-
const I256 = struct {
179-
hi: u128,
180-
lo: u128,
181-
mid: u128,
182-
};
183-
184-
inline fn xor256(x: *I256, y: I256) void {
185-
x.* = I256{
186-
.hi = x.hi ^ y.hi,
187-
.lo = x.lo ^ y.lo,
188-
.mid = x.mid ^ y.mid,
189-
};
190-
}
191-
192147
// Square a 128-bit integer in GF(2^128).
193-
fn clsq128(x: u128) I256 {
194-
return .{
195-
.hi = clmul(x, x, .hi),
196-
.lo = clmul(x, x, .lo),
197-
.mid = 0,
198-
};
148+
fn clsq128(x: u128) u256 {
149+
const lo = @truncate(u64, x);
150+
const hi = @truncate(u64, x >> 64);
151+
const mid = lo ^ hi;
152+
const r_lo = clmul(x, x, .lo);
153+
const r_hi = clmul(x, x, .hi);
154+
const r_mid = clmul(mid, mid, .lo) ^ r_lo ^ r_hi;
155+
return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
199156
}
200157

201158
// Multiply two 128-bit integers in GF(2^128).
202-
inline fn clmul128(x: u128, y: u128) I256 {
203-
if (mul_algorithm == .karatsuba) {
204-
const x_hi = @truncate(u64, x >> 64);
205-
const y_hi = @truncate(u64, y >> 64);
206-
const r_lo = clmul(x, y, .lo);
207-
const r_hi = clmul(x, y, .hi);
208-
const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
209-
return .{
210-
.hi = r_hi,
211-
.lo = r_lo,
212-
.mid = r_mid,
213-
};
214-
} else {
215-
return .{
216-
.hi = clmul(x, y, .hi),
217-
.lo = clmul(x, y, .lo),
218-
.mid = clmul(x, y, .hi_lo) ^ clmul(y, x, .hi_lo),
219-
};
220-
}
159+
inline fn clmul128(x: u128, y: u128) u256 {
160+
const x_hi = @truncate(u64, x >> 64);
161+
const y_hi = @truncate(u64, y >> 64);
162+
const r_lo = clmul(x, y, .lo);
163+
const r_hi = clmul(x, y, .hi);
164+
const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
165+
return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
221166
}
222167

223168
// Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
224169
// This is done *without reversing the bits*, using Shay Gueron's black magic demystified here:
225170
// https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
226-
inline fn gcmReduce(x: I256) u128 {
227-
const hi = x.hi ^ (x.mid >> 64);
228-
const lo = x.lo ^ (x.mid << 64);
171+
inline fn gcmReduce(x: u256) u128 {
229172
const p64 = (((1 << 121) | (1 << 126) | (1 << 127)) >> 64);
173+
const lo = @truncate(u128, x);
230174
const a = clmul(lo, p64, .lo);
231175
const b = ((lo << 64) | (lo >> 64)) ^ a;
232176
const c = clmul(b, p64, .lo);
233177
const d = ((b << 64) | (b >> 64)) ^ c;
234-
return d ^ hi;
178+
return d ^ @truncate(u128, x >> 128);
235179
}
236180

237181
const has_pclmul = std.Target.x86.featureSetHas(builtin.cpu.features, .pclmul);
@@ -258,7 +202,7 @@ pub const Ghash = struct {
258202
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[15 - 0]);
259203
comptime var j = 1;
260204
inline while (j < 16) : (j += 1) {
261-
xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]));
205+
u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]);
262206
}
263207
acc = gcmReduce(u);
264208
}
@@ -268,7 +212,7 @@ pub const Ghash = struct {
268212
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[7 - 0]);
269213
comptime var j = 1;
270214
inline while (j < 8) : (j += 1) {
271-
xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]));
215+
u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]);
272216
}
273217
acc = gcmReduce(u);
274218
}
@@ -278,25 +222,31 @@ pub const Ghash = struct {
278222
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[3 - 0]);
279223
comptime var j = 1;
280224
inline while (j < 4) : (j += 1) {
281-
xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]));
225+
u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]);
282226
}
283227
acc = gcmReduce(u);
284228
}
285-
}
286-
// 2-blocks aggregated reduction
287-
while (i + 32 <= msg.len) : (i += 32) {
288-
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
289-
comptime var j = 1;
290-
inline while (j < 2) : (j += 1) {
291-
xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]));
229+
} else if (msg.len >= agg_2_treshold * block_length) {
230+
// 2-blocks aggregated reduction
231+
while (i + 32 <= msg.len) : (i += 32) {
232+
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
233+
comptime var j = 1;
234+
inline while (j < 2) : (j += 1) {
235+
u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]);
236+
}
237+
acc = gcmReduce(u);
292238
}
293-
acc = gcmReduce(u);
294239
}
295240
// remaining blocks
296241
if (i < msg.len) {
297-
const u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[0]);
242+
const n = (msg.len - i) / 16;
243+
var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[n - 1 - 0]);
244+
var j: usize = 1;
245+
while (j < n) : (j += 1) {
246+
u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[n - 1 - j]);
247+
}
248+
i += n * 16;
298249
acc = gcmReduce(u);
299-
i += 16;
300250
}
301251
assert(i == msg.len);
302252
st.acc = acc;

0 commit comments

Comments
 (0)