lift comba limit for s_mp_mul_comba

minad · minad · commit 1bec79e51e51 · 2019-11-09T06:46:46.000+01:00
This is how it is done in TomsFastMath (tfm).
diff --git a/etc/tune.c b/etc/tune.c
@@ -60,8 +60,7 @@ static int s_offset = 1;
 
 static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   if (MP_HAS(S_MP_MUL_HIGH_COMBA)
-       && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {
+   if (MP_HAS(S_MP_MUL_COMBA)) {
       return s_mp_mul_comba(a, b, c, a->used + b->used + 1);
    }
    return s_mp_mul(a, b, c, a->used + b->used + 1);
diff --git a/mp_mul.c b/mp_mul.c
@@ -31,8 +31,7 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
    } else if (MP_HAS(S_MP_MUL_KARATSUBA) &&
               (min >= MP_MUL_KARATSUBA_CUTOFF)) {
       err = s_mp_mul_karatsuba(a, b, c);
-   } else if (MP_HAS(S_MP_MUL_COMBA) && /* can we use the fast multiplier? */
-              (min <= MP_MAX_COMBA)) {
+   } else if (MP_HAS(S_MP_MUL_COMBA)) {
       err = s_mp_mul_comba(a, b, c, digs);
    } else if (MP_HAS(S_MP_MUL)) {
       err = s_mp_mul(a, b, c, digs);
diff --git a/mp_reduce.c b/mp_reduce.c
@@ -40,8 +40,7 @@ mp_err mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
    if ((err = mp_mod_2d(x, MP_DIGIT_BIT * (um + 1), x)) != MP_OKAY) goto LBL_ERR;
 
    /* q = q * m mod b**(k+1), quick (no division) */
-   if (MP_HAS(S_MP_MUL_COMBA)
-       && (MP_MIN(q.used, m->used) < MP_MAX_COMBA)) {
+   if (MP_HAS(S_MP_MUL_COMBA)) {
       if ((err = s_mp_mul_comba(&q, m, &q, um + 1)) != MP_OKAY)     goto LBL_ERR;
    } else {
       if ((err = s_mp_mul(&q, m, &q, um + 1)) != MP_OKAY)           goto LBL_ERR;
diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c
@@ -23,7 +23,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
    int      oldused, pa, ix;
    mp_err   err;
-   mp_word  W;
+   mp_digit c0, c1, c2;
    mp_int   tmp, *c_;
 
    /* prepare the destination */
@@ -38,7 +38,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
    pa = MP_MIN(digs, a->used + b->used);
 
    /* clear the carry */
-   W = 0;
+   c0 = c1 = c2 = 0;
    for (ix = 0; ix < pa; ix++) {
       int tx, ty, iy, iz;
 
@@ -51,16 +51,59 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
        */
       iy = MP_MIN(a->used-tx, ty+1);
 
-      /* execute loop */
-      for (iz = 0; iz < iy; ++iz) {
-         W += (mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz];
+      /* execute loop
+       *
+       * Give the autovectorizer a hint! This might not be necessary.
+       * I don't think the generated code will be particularly good here;
+       * if we use full-width digits, the masks will go away.
+       */
+      for (iz = 0; iz + 3 < iy;) {
+         mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+
+         w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
+      }
+
+      /* execute rest of loop */
+      for (; iz < iy;) {
+         mp_word w = (mp_word)c0 + ((mp_word)a->dp[tx + iz] * (mp_word)b->dp[ty - iz]);
+         c0 = (mp_digit)(w & MP_MASK);
+         w = (mp_word)c1 + (w >> MP_DIGIT_BIT);
+         c1 = (mp_digit)(w & MP_MASK);
+         c2 += (mp_digit)(w >> MP_DIGIT_BIT);
+         ++iz;
       }
 
       /* store term */
-      c_->dp[ix] = (mp_digit)W & MP_MASK;
+      c_->dp[ix] = c0;
 
       /* make next carry */
-      W = W >> (mp_word)MP_DIGIT_BIT;
+      c0 = c1;
+      c1 = c2;
+      c2 = 0;
    }
 
    /* setup dest */

Original file line number	Diff line number	Diff line change
`@@ -60,8 +60,7 @@ static int s_offset = 1;`
`60`	`60`
`61`	`61`	`static mp_err s_mul_full(const mp_int *a, const mp_int *b, mp_int *c)`
`62`	`62`	`{`
`63`		`- if (MP_HAS(S_MP_MUL_HIGH_COMBA)`
`64`		`- && (MP_MIN(a->used, b->used) < MP_MAX_COMBA)) {`
	`63`	`+ if (MP_HAS(S_MP_MUL_COMBA)) {`
`65`	`64`	`return s_mp_mul_comba(a, b, c, a->used + b->used + 1);`
`66`	`65`	`}`
`67`	`66`	`return s_mp_mul(a, b, c, a->used + b->used + 1);`