@@ -74,12 +74,34 @@ maybe_small_long(PyLongObject *v)
74
74
#define KARATSUBA_CUTOFF 70
75
75
#define KARATSUBA_SQUARE_CUTOFF (2 * KARATSUBA_CUTOFF)
76
76
77
- /* For exponentiation, use the binary left-to-right algorithm
78
- * unless the exponent contains more than FIVEARY_CUTOFF digits.
79
- * In that case, do 5 bits at a time. The potential drawback is that
80
- * a table of 2**5 intermediate results is computed.
77
+ /* For exponentiation, use the binary left-to-right algorithm unless the
78
+ ^ exponent contains more than HUGE_EXP_CUTOFF bits. In that case, do
79
+ * (no more than) EXP_WINDOW_SIZE bits at a time. The potential drawback is
80
+ * that a table of 2**(EXP_WINDOW_SIZE - 1) intermediate results is
81
+ * precomputed.
81
82
*/
82
- #define FIVEARY_CUTOFF 8
83
+ #define EXP_WINDOW_SIZE 5
84
+ #define EXP_TABLE_LEN (1 << (EXP_WINDOW_SIZE - 1))
85
+ /* Suppose the exponent has bit length e. All ways of doing this
86
+ * need e squarings. The binary method also needs a multiply for
87
+ * each bit set. In a k-ary method with window width w, a multiply
88
+ * for each non-zero window, so at worst (and likely!)
89
+ * ceiling(e/w). The k-ary sliding window method has the same
90
+ * worst case, but the window slides so it can sometimes skip
91
+ * over an all-zero window that the fixed-window method can't
92
+ * exploit. In addition, the windowing methods need multiplies
93
+ * to precompute a table of small powers.
94
+ *
95
+ * For the sliding window method with width 5, 16 precomputation
96
+ * multiplies are needed. Assuming about half the exponent bits
97
+ * are set, then, the binary method needs about e/2 extra mults
98
+ * and the window method about 16 + e/5.
99
+ *
100
+ * The latter is smaller for e > 53 1/3. We don't have direct
101
+ * access to the bit length, though, so call it 60, which is a
102
+ * multiple of a long digit's max bit length (15 or 30 so far).
103
+ */
104
+ #define HUGE_EXP_CUTOFF 60
83
105
84
106
#define SIGCHECK (PyTryBlock ) \
85
107
do { \
@@ -4172,14 +4194,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
4172
4194
int negativeOutput = 0 ; /* if x<0 return negative output */
4173
4195
4174
4196
PyLongObject * z = NULL ; /* accumulated result */
4175
- Py_ssize_t i , j , k ; /* counters */
4197
+ Py_ssize_t i , j ; /* counters */
4176
4198
PyLongObject * temp = NULL ;
4199
+ PyLongObject * a2 = NULL ; /* may temporarily hold a**2 % c */
4177
4200
4178
- /* 5-ary values. If the exponent is large enough, table is
4179
- * precomputed so that table[i] == a**i % c for i in range(32).
4201
+ /* k-ary values. If the exponent is large enough, table is
4202
+ * precomputed so that table[i] == a**(2*i+1) % c for i in
4203
+ * range(EXP_TABLE_LEN).
4180
4204
*/
4181
- PyLongObject * table [32 ] = {0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
4182
- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 };
4205
+ PyLongObject * table [EXP_TABLE_LEN ] = {0 };
4183
4206
4184
4207
/* a, b, c = v, w, x */
4185
4208
CHECK_BINOP (v , w );
@@ -4332,7 +4355,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
4332
4355
}
4333
4356
/* else bi is 0, and z==1 is correct */
4334
4357
}
4335
- else if (i <= FIVEARY_CUTOFF ) {
4358
+ else if (i <= HUGE_EXP_CUTOFF / PyLong_SHIFT ) {
4336
4359
/* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
4337
4360
/* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */
4338
4361
@@ -4366,23 +4389,59 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
4366
4389
}
4367
4390
}
4368
4391
else {
4369
- /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */
4370
- Py_INCREF (z ); /* still holds 1L */
4371
- table [0 ] = z ;
4372
- for (i = 1 ; i < 32 ; ++ i )
4373
- MULT (table [i - 1 ], a , table [i ]);
4392
+ /* Left-to-right k-ary sliding window exponentiation
4393
+ * (Handbook of Applied Cryptography (HAC) Algorithm 14.85)
4394
+ */
4395
+ Py_INCREF (a );
4396
+ table [0 ] = a ;
4397
+ MULT (a , a , a2 );
4398
+ /* table[i] == a**(2*i + 1) % c */
4399
+ for (i = 1 ; i < EXP_TABLE_LEN ; ++ i )
4400
+ MULT (table [i - 1 ], a2 , table [i ]);
4401
+ Py_CLEAR (a2 );
4402
+
4403
+ /* Repeatedly extract the next (no more than) EXP_WINDOW_SIZE bits
4404
+ * into `pending`, starting with the next 1 bit. The current bit
4405
+ * length of `pending` is `blen`.
4406
+ */
4407
+ int pending = 0 , blen = 0 ;
4408
+ #define ABSORB_PENDING do { \
4409
+ int ntz = 0; /* number of trailing zeroes in `pending` */ \
4410
+ assert (pending && blen ); \
4411
+ assert (pending >> (blen - 1 )); \
4412
+ assert (pending >> blen == 0 ); \
4413
+ while ((pending & 1 ) == 0 ) { \
4414
+ ++ ntz ; \
4415
+ pending >>= 1 ; \
4416
+ } \
4417
+ assert (ntz < blen ); \
4418
+ blen -= ntz ; \
4419
+ do { \
4420
+ MULT (z , z , z ); \
4421
+ } while (-- blen ); \
4422
+ MULT (z , table [pending >> 1 ], z ); \
4423
+ while (ntz -- > 0 ) \
4424
+ MULT (z , z , z ); \
4425
+ assert (blen == 0 ); \
4426
+ pending = 0 ; \
4427
+ } while (0 )
4374
4428
4375
4429
for (i = Py_SIZE (b ) - 1 ; i >= 0 ; -- i ) {
4376
4430
const digit bi = b -> ob_digit [i ];
4377
-
4378
- for (j = PyLong_SHIFT - 5 ; j >= 0 ; j -= 5 ) {
4379
- const int index = (bi >> j ) & 0x1f ;
4380
- for (k = 0 ; k < 5 ; ++ k )
4431
+ for (j = PyLong_SHIFT - 1 ; j >= 0 ; -- j ) {
4432
+ const int bit = (bi >> j ) & 1 ;
4433
+ pending = (pending << 1 ) | bit ;
4434
+ if (pending ) {
4435
+ ++ blen ;
4436
+ if (blen == EXP_WINDOW_SIZE )
4437
+ ABSORB_PENDING ;
4438
+ }
4439
+ else /* absorb strings of 0 bits */
4381
4440
MULT (z , z , z );
4382
- if (index )
4383
- MULT (z , table [index ], z );
4384
4441
}
4385
4442
}
4443
+ if (pending )
4444
+ ABSORB_PENDING ;
4386
4445
}
4387
4446
4388
4447
if (negativeOutput && (Py_SIZE (z ) != 0 )) {
@@ -4399,13 +4458,14 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
4399
4458
Py_CLEAR (z );
4400
4459
/* fall through */
4401
4460
Done :
4402
- if (Py_SIZE (b ) > FIVEARY_CUTOFF ) {
4403
- for (i = 0 ; i < 32 ; ++ i )
4461
+ if (Py_SIZE (b ) > HUGE_EXP_CUTOFF / PyLong_SHIFT ) {
4462
+ for (i = 0 ; i < EXP_TABLE_LEN ; ++ i )
4404
4463
Py_XDECREF (table [i ]);
4405
4464
}
4406
4465
Py_DECREF (a );
4407
4466
Py_DECREF (b );
4408
4467
Py_XDECREF (c );
4468
+ Py_XDECREF (a2 );
4409
4469
Py_XDECREF (temp );
4410
4470
return (PyObject * )z ;
4411
4471
}
0 commit comments