Merge #328

bors[bot] · bors[bot] · commit 4896746fec21 · 2017-09-20T20:53:40.000Z
328: Optimizing BigUint and Bigint multiplication with the Toom-3 algorithm r=cuviper a=kompass

Hi !

I finally implemented the Toom-3 algorithm ! I first tried to minimize the memory allocations by allocating the `Vec&lt;BigDigit&gt;` myself, as was done for Toom-2, but Toom-3 needs more complex calculations, with negative numbers. So I gave up this method, to use `BigInt` directly, and it's already faster ! I also chose a better threshold for the Toom-2 algorithm.

Before any modification :
```
running 4 tests
test multiply_0        ... bench:         257 ns/iter (+/- 25)
test multiply_1        ... bench:      30,240 ns/iter (+/- 1,651)
test multiply_2        ... bench:   2,752,360 ns/iter (+/- 52,102)
test multiply_3        ... bench:  11,618,575 ns/iter (+/- 266,286)
```

With a better Toom-2 threshold (16 instead of 4) :
```
running 4 tests
test multiply_0        ... bench:         130 ns/iter (+/- 8)
test multiply_1        ... bench:      19,772 ns/iter (+/- 1,083)
test multiply_2        ... bench:   1,340,644 ns/iter (+/- 17,987)
test multiply_3        ... bench:   7,302,854 ns/iter (+/- 82,060)
```

With the Toom-3 algorithm (with a threshold of 300):
```
running 4 tests
test multiply_0        ... bench:         123 ns/iter (+/- 3)
test multiply_1        ... bench:      19,689 ns/iter (+/- 837)
test multiply_2        ... bench:   1,189,589 ns/iter (+/- 29,101)
test multiply_3        ... bench:   3,014,225 ns/iter (+/- 61,222)
```

I think this could be optimized, but it's a first step !
diff --git a/benches/bigint.rs b/benches/bigint.rs
@@ -78,6 +78,11 @@ fn multiply_2(b: &mut Bencher) {
     multiply_bench(b, 1 << 16, 1 << 16);
 }
 
+#[bench]
+fn multiply_3(b: &mut Bencher) {
+    multiply_bench(b, 1 << 16, 1 << 17);
+}
+
 #[bench]
 fn divide_0(b: &mut Bencher) {
     divide_bench(b, 1 << 8, 1 << 6);
diff --git a/bigint/src/algorithms.rs b/bigint/src/algorithms.rs
@@ -8,6 +8,7 @@ use traits::{Zero, One};
 
 use biguint::BigUint;
 
+use bigint::BigInt;
 use bigint::Sign;
 use bigint::Sign::{Minus, NoSign, Plus};
 
@@ -225,20 +226,18 @@ fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
         return;
     }
 
-    let mut b_iter = b.iter();
     let mut carry = 0;
+    let (a_lo, a_hi) = acc.split_at_mut(b.len());
 
-    for ai in acc.iter_mut() {
-        if let Some(bi) = b_iter.next() {
-            *ai = mac_with_carry(*ai, *bi, c, &mut carry);
-        } else if carry != 0 {
-            *ai = mac_with_carry(*ai, 0, c, &mut carry);
-        } else {
-            break;
-        }
+    for (a, &b) in a_lo.iter_mut().zip(b) {
+        *a = mac_with_carry(*a, b, c, &mut carry);
     }
 
-    assert!(carry == 0);
+    let mut a = a_hi.iter_mut();
+    while carry != 0 {
+        let a = a.next().expect("carry overflow during multiplication!");
+        *a = adc(*a, 0, &mut carry);
+    }
 }
 
 /// Three argument multiply accumulate:
@@ -250,13 +249,23 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         (c, b)
     };
 
-    // Karatsuba multiplication is slower than long multiplication for small x and y:
+    // We use three algorithms for different input sizes.
     //
-    if x.len() <= 4 {
+    // - For small inputs, long multiplication is fastest.
+    // - Next we use Karatsuba multiplication (Toom-2), which we have optimized
+    //   to avoid unnecessary allocations for intermediate values.
+    // - For the largest inputs we use Toom-3, which better optimizes the
+    //   number of operations, but uses more temporary allocations.
+    //
+    // The thresholds are somewhat arbitrary, chosen by evaluating the results
+    // of `cargo bench --bench bigint multiply`.
+
+    if x.len() <= 32 {
+        // Long multiplication:
         for (i, xi) in x.iter().enumerate() {
             mac_digit(&mut acc[i..], y, *xi);
         }
-    } else {
+    } else if x.len() <= 256 {
         /*
          * Karatsuba multiplication:
          *
@@ -375,6 +384,53 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
             },
             NoSign  => (),
         }
+
+    } else {
+        // Toom-3 multiplication:
+        //
+        // Toom-3 is like Karatsuba above, but dividing the inputs into three parts.
+        // Both are instances of Toom-Cook, using `k=3` and `k=2` respectively.
+        //
+        // FIXME: It would be nice to have comments breaking down the operations below.
+
+        let i = y.len()/3 + 1;
+
+        let x0_len = cmp::min(x.len(), i);
+        let x1_len = cmp::min(x.len() - x0_len, i);
+
+        let y0_len = i;
+        let y1_len = cmp::min(y.len() - y0_len, i);
+
+        let x0 = BigInt::from_slice(Plus, &x[..x0_len]);
+        let x1 = BigInt::from_slice(Plus, &x[x0_len..x0_len + x1_len]);
+        let x2 = BigInt::from_slice(Plus, &x[x0_len + x1_len..]);
+
+        let y0 = BigInt::from_slice(Plus, &y[..y0_len]);
+        let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]);
+        let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]);
+
+        let p = &x0 + &x2;
+        let q = &y0 + &y2;
+
+        let p2 = &p - &x1;
+        let q2 = &q - &y1;
+
+        let r0 = &x0 * &y0;
+        let r4 = &x2 * &y2;
+        let r1 = (p + x1) * (q + y1);
+        let r2 = &p2 * &q2;
+        let r3 = ((p2 + x2)*2 - x0) * ((q2 + y2)*2 - y0);
+
+        let mut comp3: BigInt = (r3 - &r1) / 3;
+        let mut comp1: BigInt = (r1 - &r2) / 2;
+        let mut comp2: BigInt = r2 - &r0;
+        comp3 = (&comp2 - comp3)/2 + &r4*2;
+        comp2 = comp2 + &comp1 - &r4;
+        comp1 = comp1 - &comp3;
+
+        let result = r0 + (comp1 << 32*i) + (comp2 << 2*32*i) + (comp3 << 3*32*i) + (r4 << 4*32*i);
+        let result_pos = result.to_biguint().unwrap();
+        add2(&mut acc[..], &result_pos.data);
     }
 }