@@ -18,19 +18,12 @@ pub const Ghash = struct {
     pub const mac_length = 16;
     pub const key_length = 16;
 
-    const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 2;
+    const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 4;
+    const agg_2_treshold = 5;
     const agg_4_treshold = 22;
     const agg_8_treshold = 84;
     const agg_16_treshold = 328;
 
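+    // The agg_N_treshold constants are the message sizes, in 16-byte blocks,
+    // above which update() switches to N-block aggregated reduction.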
-    // Before the Haswell architecture, the carryless multiplication instruction was
-    // extremely slow. Even with 128-bit operands, using Karatsuba multiplication was
-    // thus faster than a schoolbook multiplication.
-    // This is no longer the case -- Modern CPUs, including ARM-based ones, have a fast
-    // carryless multiplication instruction; using 4 multiplications is now faster than
-    // 3 multiplications with extra shifts and additions.
-    const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
-
     hx: [pc_count]Precomp,
     acc: u128 = 0,
@@ -50,10 +43,10 @@ pub const Ghash = struct {
         var hx: [pc_count]Precomp = undefined;
         hx[0] = h;
         hx[1] = gcmReduce(clsq128(hx[0])); // h^2
+        hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
+        hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
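+        // Aggregated reduction evaluates m1*h^n ^ m2*h^(n-1) ^ ... ^ mn*h with a
+        // single modular reduction, so powers of h are precomputed once per key.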
 
         if (builtin.mode != .ReleaseSmall) {
-            hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
-            hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
             if (block_count >= agg_8_treshold) {
                 hx[4] = gcmReduce(clmul128(hx[3], h)); // h^5
                 hx[5] = gcmReduce(clsq128(hx[2])); // h^6 = h^3^2
@@ -76,71 +69,47 @@ pub const Ghash = struct {
         return Ghash.initForBlockCount(key, math.maxInt(usize));
     }
 
-    const Selector = enum { lo, hi, hi_lo };
+    const Selector = enum { lo, hi };
 
     // Carryless multiplication of two 64-bit integers for x86_64.
     inline fn clmulPclmul(x: u128, y: u128, comptime half: Selector) u128 {
-        switch (half) {
-            .hi => {
-                const product = asm (
-                    \\ vpclmulqdq $0x11, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .lo => {
-                const product = asm (
-                    \\ vpclmulqdq $0x00, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .hi_lo => {
-                const product = asm (
-                    \\ vpclmulqdq $0x10, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
+        if (half == .hi) {
+            const product = asm (
+                \\ vpclmulqdq $0x11, %[x], %[y], %[out]
+                : [out] "=x" (-> @Vector(2, u64)),
+                : [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
+                  [y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
+            );
+            return @bitCast(u128, product);
+        } else {
+            const product = asm (
+                \\ vpclmulqdq $0x00, %[x], %[y], %[out]
+                : [out] "=x" (-> @Vector(2, u64)),
+                : [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
+                  [y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
+            );
+            return @bitCast(u128, product);
         }
     }
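+    // vpclmulqdq's immediate picks the 64-bit halves to multiply:
+    // $0x11 takes the high half of both operands, $0x00 the low half of both.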
 
     // Carryless multiplication of two 64-bit integers for ARM crypto.
     inline fn clmulPmull(x: u128, y: u128, comptime half: Selector) u128 {
-        switch (half) {
-            .hi => {
-                const product = asm (
-                    \\ pmull2 %[out].1q, %[x].2d, %[y].2d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .lo => {
-                const product = asm (
-                    \\ pmull %[out].1q, %[x].1d, %[y].1d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .hi_lo => {
-                const product = asm (
-                    \\ pmull %[out].1q, %[x].1d, %[y].1d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x >> 64)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
+        if (half == .hi) {
+            const product = asm (
+                \\ pmull2 %[out].1q, %[x].2d, %[y].2d
+                : [out] "=w" (-> @Vector(2, u64)),
+                : [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
+                  [y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
+            );
+            return @bitCast(u128, product);
+        } else {
+            const product = asm (
+                \\ pmull %[out].1q, %[x].1d, %[y].1d
+                : [out] "=w" (-> @Vector(2, u64)),
+                : [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
+                  [y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
+            );
+            return @bitCast(u128, product);
+        }
     }
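+    // pmull multiplies the low 64-bit lanes and pmull2 the high lanes, each
+    // producing a 128-bit carryless product; this mirrors the x86 path above.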
@@ -175,63 +144,38 @@ pub const Ghash = struct {
             (z3 & 0x88888888888888888888888888888888) ^ extra;
     }
 
-    const I256 = struct {
-        hi: u128,
-        lo: u128,
-        mid: u128,
-    };
-
-    inline fn xor256(x: *I256, y: I256) void {
-        x.* = I256{
-            .hi = x.hi ^ y.hi,
-            .lo = x.lo ^ y.lo,
-            .mid = x.mid ^ y.mid,
-        };
-    }
-
     // Square a 128-bit integer in GF(2^128).
-    fn clsq128(x: u128) I256 {
-        return .{
-            .hi = clmul(x, x, .hi),
-            .lo = clmul(x, x, .lo),
-            .mid = 0,
-        };
+    fn clsq128(x: u128) u256 {
+        const lo = @truncate(u64, x);
+        const hi = @truncate(u64, x >> 64);
+        const mid = lo ^ hi;
+        const r_lo = clmul(x, x, .lo);
+        const r_hi = clmul(x, x, .hi);
+        const r_mid = clmul(mid, mid, .lo) ^ r_lo ^ r_hi;
+        return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
     }
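+    // The three 128-bit partial products are now packed into one u256 as
+    // hi*x^128 ^ mid*x^64 ^ lo, replacing the removed I256 struct and xor256().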
 
     // Multiply two 128-bit integers in GF(2^128).
-    inline fn clmul128(x: u128, y: u128) I256 {
-        if (mul_algorithm == .karatsuba) {
-            const x_hi = @truncate(u64, x >> 64);
-            const y_hi = @truncate(u64, y >> 64);
-            const r_lo = clmul(x, y, .lo);
-            const r_hi = clmul(x, y, .hi);
-            const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
-            return .{
-                .hi = r_hi,
-                .lo = r_lo,
-                .mid = r_mid,
-            };
-        } else {
-            return .{
-                .hi = clmul(x, y, .hi),
-                .lo = clmul(x, y, .lo),
-                .mid = clmul(x, y, .hi_lo) ^ clmul(y, x, .hi_lo),
-            };
-        }
+    inline fn clmul128(x: u128, y: u128) u256 {
+        const x_hi = @truncate(u64, x >> 64);
+        const y_hi = @truncate(u64, y >> 64);
+        const r_lo = clmul(x, y, .lo);
+        const r_hi = clmul(x, y, .hi);
+        const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
+        return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
     }
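+    // Karatsuba: the middle coefficient of x*y is
+    // (x_lo ^ x_hi)*(y_lo ^ y_hi) ^ r_lo ^ r_hi, so three carryless
+    // multiplications suffice where schoolbook needs four.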
 
     // Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
     // This is done *without reversing the bits*, using Shay Gueron's black magic demysticated here:
     // https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
-    inline fn gcmReduce(x: I256) u128 {
-        const hi = x.hi ^ (x.mid >> 64);
-        const lo = x.lo ^ (x.mid << 64);
+    inline fn gcmReduce(x: u256) u128 {
         const p64 = (((1 << 121) | (1 << 126) | (1 << 127)) >> 64);
+        const lo = @truncate(u128, x);
         const a = clmul(lo, p64, .lo);
         const b = ((lo << 64) | (lo >> 64)) ^ a;
         const c = clmul(b, p64, .lo);
         const d = ((b << 64) | (b >> 64)) ^ c;
-        return d ^ hi;
+        return d ^ @truncate(u128, x >> 128);
    }
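+    // Gueron's trick folds the low 128 bits twice through p64 (the high half
+    // of the reduction polynomial), then XORs in the upper 128 bits of x.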
 
     const has_pclmul = std.Target.x86.featureSetHas(builtin.cpu.features, .pclmul);
@@ -258,7 +202,7 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[15 - 0]);
                 comptime var j = 1;
                 inline while (j < 16) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]));
+                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]);
                 }
                 acc = gcmReduce(u);
             }
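+            // Only one gcmReduce call per 16 blocks: the u256 partial products
+            // are XOR-accumulated and reduced once at the end of each iteration.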
@@ -268,7 +212,7 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[7 - 0]);
                 comptime var j = 1;
                 inline while (j < 8) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]));
+                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]);
                 }
                 acc = gcmReduce(u);
             }
@@ -278,25 +222,31 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[3 - 0]);
                 comptime var j = 1;
                 inline while (j < 4) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]));
+                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]);
                 }
                 acc = gcmReduce(u);
             }
-        }
-        // 2-blocks aggregated reduction
-        while (i + 32 <= msg.len) : (i += 32) {
-            var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
-            comptime var j = 1;
-            inline while (j < 2) : (j += 1) {
-                xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]));
+        } else if (msg.len >= agg_2_treshold * block_length) {
+            // 2-blocks aggregated reduction
+            while (i + 32 <= msg.len) : (i += 32) {
+                var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
+                comptime var j = 1;
+                inline while (j < 2) : (j += 1) {
+                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]);
+                }
+                acc = gcmReduce(u);
             }
-            acc = gcmReduce(u);
         }
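+        // Gating 2-block aggregation behind agg_2_treshold keeps very short
+        // inputs on the cheap per-block tail path below.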
         // remaining blocks
         if (i < msg.len) {
-            const u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[0]);
+            const n = (msg.len - i) / 16;
+            var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[n - 1 - 0]);
+            var j: usize = 1;
+            while (j < n) : (j += 1) {
+                u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[n - 1 - j]);
+            }
+            i += n * 16;
             acc = gcmReduce(u);
-            i += 16;
         }
         assert(i == msg.len);
         st.acc = acc;
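For context, a minimal usage sketch of this type. It assumes the `std.crypto.onetimeauth.Ghash` export path and the `init`/`update`/`final` API that surrounds this diff; it is not part of the commit, and GHASH on its own is only a building block of AES-GCM, not a general-purpose MAC.

const std = @import("std");
const Ghash = std.crypto.onetimeauth.Ghash;

pub fn main() void {
    // 16-byte GHASH key (key_length above); a fixed value, for illustration only.
    const key = [_]u8{0x42} ** Ghash.key_length;
    var st = Ghash.init(&key);
    // Long inputs take the aggregated-reduction paths patched above;
    // short ones fall through to the per-block tail loop.
    st.update("a message that is longer than a single 16-byte block");
    var tag: [Ghash.mac_length]u8 = undefined;
    st.final(&tag);
    std.debug.print("tag = {s}\n", .{std.fmt.fmtSliceHexLower(&tag)});
}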