@@ -23,7 +23,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
23
23
{
24
24
int oldused , pa , ix ;
25
25
mp_err err ;
26
- mp_word W ;
26
+ mp_digit c0 , c1 , c2 ;
27
27
mp_int tmp , * c_ ;
28
28
29
29
/* prepare the destination */
@@ -38,7 +38,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
38
38
pa = MP_MIN (digs , a -> used + b -> used );
39
39
40
40
/* clear the carry */
41
- W = 0 ;
41
+ c0 = c1 = c2 = 0 ;
42
42
for (ix = 0 ; ix < pa ; ix ++ ) {
43
43
int tx , ty , iy , iz ;
44
44
@@ -51,16 +51,59 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
51
51
*/
52
52
iy = MP_MIN (a -> used - tx , ty + 1 );
53
53
54
- /* execute loop */
55
- for (iz = 0 ; iz < iy ; ++ iz ) {
56
- W += (mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ];
54
+ /* execute loop
55
+ *
56
+ * Give the autovectorizer a hint! This might not be necessary.
57
+ * I don't think the generated code will be particularly good here,
58
+ * if we use full-width digits the masks will go away.
59
+ */
60
+ for (iz = 0 ; iz + 3 < iy ;) {
61
+ mp_word w = (mp_word )c0 + ((mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ]);
62
+ c0 = (mp_digit )(w & MP_MASK );
63
+ w = (mp_word )c1 + (w >> MP_DIGIT_BIT );
64
+ c1 = (mp_digit )(w & MP_MASK );
65
+ c2 += (mp_digit )(w >> MP_DIGIT_BIT );
66
+ ++ iz ;
67
+
68
+ w = (mp_word )c0 + ((mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ]);
69
+ c0 = (mp_digit )(w & MP_MASK );
70
+ w = (mp_word )c1 + (w >> MP_DIGIT_BIT );
71
+ c1 = (mp_digit )(w & MP_MASK );
72
+ c2 += (mp_digit )(w >> MP_DIGIT_BIT );
73
+ ++ iz ;
74
+
75
+ w = (mp_word )c0 + ((mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ]);
76
+ c0 = (mp_digit )(w & MP_MASK );
77
+ w = (mp_word )c1 + (w >> MP_DIGIT_BIT );
78
+ c1 = (mp_digit )(w & MP_MASK );
79
+ c2 += (mp_digit )(w >> MP_DIGIT_BIT );
80
+ ++ iz ;
81
+
82
+ w = (mp_word )c0 + ((mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ]);
83
+ c0 = (mp_digit )(w & MP_MASK );
84
+ w = (mp_word )c1 + (w >> MP_DIGIT_BIT );
85
+ c1 = (mp_digit )(w & MP_MASK );
86
+ c2 += (mp_digit )(w >> MP_DIGIT_BIT );
87
+ ++ iz ;
88
+ }
89
+
90
+ /* execute rest of loop */
91
+ for (; iz < iy ;) {
92
+ mp_word w = (mp_word )c0 + ((mp_word )a -> dp [tx + iz ] * (mp_word )b -> dp [ty - iz ]);
93
+ c0 = (mp_digit )(w & MP_MASK );
94
+ w = (mp_word )c1 + (w >> MP_DIGIT_BIT );
95
+ c1 = (mp_digit )(w & MP_MASK );
96
+ c2 += (mp_digit )(w >> MP_DIGIT_BIT );
97
+ ++ iz ;
57
98
}
58
99
59
100
/* store term */
60
- c_ -> dp [ix ] = ( mp_digit ) W & MP_MASK ;
101
+ c_ -> dp [ix ] = c0 ;
61
102
62
103
/* make next carry */
63
- W = W >> (mp_word )MP_DIGIT_BIT ;
104
+ c0 = c1 ;
105
+ c1 = c2 ;
106
+ c2 = 0 ;
64
107
}
65
108
66
109
/* setup dest */
0 commit comments