diff --git a/quickjs.c b/quickjs.c index 3d1f7229c..3be5b15d2 100644 --- a/quickjs.c +++ b/quickjs.c @@ -44727,7 +44727,7 @@ static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val, JSValue iter, next, item, ret; int done; double d; - xsum_large_accumulator lacc; + xsum_small_accumulator acc; SumPreciseStateEnum state; iter = JS_GetIterator(ctx, argv[0], /*async*/false); @@ -44737,7 +44737,7 @@ static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val, next = JS_GetProperty(ctx, iter, JS_ATOM_next); if (JS_IsException(next)) goto fail; - xsum_large_init(&lacc); + xsum_small_init(&acc); state = SUM_PRECISE_STATE_MINUS_ZERO; for (;;) { item = JS_IteratorNext(ctx, iter, next, 0, NULL, &done); @@ -44773,7 +44773,7 @@ static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val, state = SUM_PRECISE_STATE_MINUS_INFINITY; else if (!(d == 0.0 && signbit(d)) && (state == SUM_PRECISE_STATE_MINUS_ZERO || state == SUM_PRECISE_STATE_FINITE)) { state = SUM_PRECISE_STATE_FINITE; - xsum_large_add1(&lacc, d); + xsum_small_add1(&acc, d); } } } @@ -44792,7 +44792,7 @@ static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val, d = -0.0; break; case SUM_PRECISE_STATE_FINITE: - d = xsum_large_round(&lacc); + d = xsum_small_round(&acc); break; default: abort(); diff --git a/xsum.c b/xsum.c index 0bbf4d9ee..98d970b87 100644 --- a/xsum.c +++ b/xsum.c @@ -22,8 +22,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - -#include #include #include #include "xsum.h" @@ -58,26 +56,13 @@ #define USE_SIMD 1 /* Use SIMD intrinsics (SSE2/AVX) if available? */ #define USE_MEMSET_SMALL 1 /* Use memset rather than a loop (for small mem)? */ -#define USE_MEMSET_LARGE 1 /* Use memset rather than a loop (for large mem)? */ -#define USE_USED_LARGE 1 /* Use the used flags in a large accumulator? */ #define OPT_SMALL 0 /* Class of manual optimization for operations on */ /* small accumulator: 0 (none), 1, 2, 3 (SIMD) */ #define OPT_CARRY 1 /* Use manually optimized carry propagation? */ -#define OPT_LARGE_SUM 1 /* Should manually optimized routines be used for */ -#define OPT_LARGE_SQNORM 1 /* operations using the large accumulator? */ -#define OPT_LARGE_DOT 1 - -#define OPT_SIMPLE_SUM 1 /* Should manually optimized routines be used for */ -#define OPT_SIMPLE_SQNORM 1 /* operations done with simple FP arithmetic? */ -#define OPT_SIMPLE_DOT 1 - -#define OPT_KAHAN_SUM 0 /* Use manually optimized routine for Kahan sum? */ - #define INLINE_SMALL 1 /* Inline more of the small accumulator routines? */ /* (Not currently used) */ -#define INLINE_LARGE 1 /* Inline more of the large accumulator routines? */ /* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */ @@ -441,315 +426,6 @@ static NOINLINE int xsum_carry_propagate (xsum_small_accumulator *restrict sacc) } -/* INITIALIZE LARGE ACCUMULATOR CHUNKS. Sets all counts to -1. */ - -static void xsum_large_init_chunks (xsum_large_accumulator *restrict lacc) -{ -# if USE_MEMSET_LARGE - { - /* Since in two's complement representation, -1 consists of all 1 bits, - we can initialize 16-bit values to -1 by initializing their component - bytes to 0xff. */ - - memset (lacc->count, 0xff, XSUM_LCHUNKS * sizeof *lacc->count); - } -# else - { xsum_lcount *p; - int n; - p = lacc->count; - n = XSUM_LCHUNKS; - do { *p++ = -1; n -= 1; } while (n > 0); - } -# endif - -# if USE_USED_LARGE -# if USE_MEMSET_SMALL - { memset(lacc->chunks_used, 0, XSUM_LCHUNKS/64 * sizeof *lacc->chunks_used); - } -# elif USE_SIMD && __AVX__ && XSUM_LCHUNKS/64==64 - { xsum_used *ch = lacc->chunks_used; - __m256i z = _mm256_setzero_si256(); - _mm256_storeu_si256 ((__m256i *)(ch+0), z); - _mm256_storeu_si256 ((__m256i *)(ch+4), z); - _mm256_storeu_si256 ((__m256i *)(ch+8), z); - _mm256_storeu_si256 ((__m256i *)(ch+12), z); - _mm256_storeu_si256 ((__m256i *)(ch+16), z); - _mm256_storeu_si256 ((__m256i *)(ch+20), z); - _mm256_storeu_si256 ((__m256i *)(ch+24), z); - _mm256_storeu_si256 ((__m256i *)(ch+28), z); - _mm256_storeu_si256 ((__m256i *)(ch+32), z); - _mm256_storeu_si256 ((__m256i *)(ch+36), z); - _mm256_storeu_si256 ((__m256i *)(ch+40), z); - _mm256_storeu_si256 ((__m256i *)(ch+44), z); - _mm256_storeu_si256 ((__m256i *)(ch+48), z); - _mm256_storeu_si256 ((__m256i *)(ch+52), z); - _mm256_storeu_si256 ((__m256i *)(ch+56), z); - _mm256_storeu_si256 ((__m256i *)(ch+60), z); - } -# else - { xsum_lchunk *p; - int n; - p = lacc->chunks_used; - n = XSUM_LCHUNKS/64; - do { *p++ = 0; n -= 1; } while (n > 0); - } -# endif - lacc->used_used = 0; -# endif -} - - -/* ADD CHUNK FROM A LARGE ACCUMULATOR TO THE SMALL ACCUMULATOR WITHIN IT. - The large accumulator chunk to add is indexed by ix. This chunk will - be cleared to zero and its count reset after it has been added to the - small accumulator (except no add is done for a new chunk being initialized). - This procedure should not be called for the special chunks correspnding to - Inf or NaN, whose counts should always remain at -1. */ - -#if INLINE_LARGE - INLINE -#endif -static void xsum_add_lchunk_to_small (xsum_large_accumulator *restrict lacc, - xsum_expint ix) -{ - xsum_expint exp, low_exp, high_exp; - xsum_uint low_chunk, mid_chunk, high_chunk; - xsum_lchunk chunk; - - const xsum_expint count = lacc->count[ix]; - - /* Add to the small accumulator only if the count is not -1, which - indicates a chunk that contains nothing yet. */ - - if (count >= 0) - { - /* Propagate carries in the small accumulator if necessary. */ - - if (lacc->sacc.adds_until_propagate == 0) - { (void) xsum_carry_propagate(&lacc->sacc); - } - - /* Get the chunk we will add. Note that this chunk is the integer sum - of entire 64-bit floating-point representations, with sign, exponent, - and mantissa, but we want only the sum of the mantissas. */ - - chunk = lacc->chunk[ix]; - - /* If we added the maximum number of values to 'chunk', the sum of - the sign and exponent parts (all the same, equal to the index) will - have overflowed out the top, leaving only the sum of the mantissas. - If the count of how many more terms we could have summed is greater - than zero, we therefore add this count times the index (shifted to - the position of the sign and exponent) to get the unwanted bits to - overflow out the top. */ - - if (count > 0) - { chunk += (xsum_lchunk)(count*ix) << XSUM_MANTISSA_BITS; - } - - /* Find the exponent for this chunk from the low bits of the index, - and split it into low and high parts, for accessing the small - accumulator. Noting that for denormalized numbers where the - exponent part is zero, the actual exponent is 1 (before subtracting - the bias), not zero. */ - - exp = ix & XSUM_EXP_MASK; - if (exp == 0) - { low_exp = 1; - high_exp = 0; - } - else - { low_exp = exp & XSUM_LOW_EXP_MASK; - high_exp = exp >> XSUM_LOW_EXP_BITS; - } - - /* Split the mantissa into three parts, for three consecutive chunks in - the small accumulator. Except for denormalized numbers, add in the sum - of all the implicit 1 bits that are above the actual mantissa bits. */ - - low_chunk = (chunk << low_exp) & XSUM_LOW_MANTISSA_MASK; - mid_chunk = chunk >> (XSUM_LOW_MANTISSA_BITS - low_exp); - if (exp != 0) /* normalized */ - { mid_chunk += (xsum_lchunk)((1 << XSUM_LCOUNT_BITS) - count) - << (XSUM_MANTISSA_BITS - XSUM_LOW_MANTISSA_BITS + low_exp); - } - high_chunk = mid_chunk >> XSUM_LOW_MANTISSA_BITS; - mid_chunk &= XSUM_LOW_MANTISSA_MASK; - - /* Add or subtract the three parts of the mantissa from three small - accumulator chunks, according to the sign that is part of the index. */ - - if (ix & (1 << XSUM_EXP_BITS)) - { lacc->sacc.chunk[high_exp] -= low_chunk; - lacc->sacc.chunk[high_exp+1] -= mid_chunk; - lacc->sacc.chunk[high_exp+2] -= high_chunk; - } - else - { lacc->sacc.chunk[high_exp] += low_chunk; - lacc->sacc.chunk[high_exp+1] += mid_chunk; - lacc->sacc.chunk[high_exp+2] += high_chunk; - } - - /* The above additions/subtractions reduce by one the number we can - do before we need to do carry propagation again. */ - - lacc->sacc.adds_until_propagate -= 1; - } - - /* We now clear the chunk to zero, and set the count to the number - of adds we can do before the mantissa would overflow. We also - set the bit in chunks_used to indicate that this chunk is in use - (if that is enabled). */ - - lacc->chunk[ix] = 0; - lacc->count[ix] = 1 << XSUM_LCOUNT_BITS; - -# if USE_USED_LARGE - lacc->chunks_used[ix>>6] |= (xsum_used)1 << (ix & 0x3f); - lacc->used_used |= (xsum_used)1 << (ix>>6); -# endif -} - - -/* ADD A CHUNK TO THE LARGE ACCUMULATOR OR PROCESS NAN OR INF. This routine - is called when the count for a chunk is negative after decrementing, which - indicates either inf/nan, or that the chunk has not been initialized, or - that the chunk needs to be transferred to the small accumulator. */ - -#if INLINE_LARGE - INLINE -#endif -static void xsum_large_add_value_inf_nan (xsum_large_accumulator *restrict lacc, - xsum_expint ix, xsum_lchunk uintv) -{ - if ((ix & XSUM_EXP_MASK) == XSUM_EXP_MASK) - { xsum_small_add_inf_nan (&lacc->sacc, uintv); - } - else - { xsum_add_lchunk_to_small (lacc, ix); - lacc->count[ix] -= 1; - lacc->chunk[ix] += uintv; - } -} - - -/* TRANSFER ALL CHUNKS IN LARGE ACCUMULATOR TO ITS SMALL ACCUMULATOR. */ - -static void xsum_large_transfer_to_small (xsum_large_accumulator *restrict lacc) -{ -# if USE_USED_LARGE - { - xsum_used *p, *e; - xsum_used u, uu; - int ix; - - p = lacc->chunks_used; - e = p + XSUM_LCHUNKS/64; - - /* Very quickly skip some unused low-order blocks of chunks by looking - at the used_used flags. */ - - uu = lacc->used_used; - if ((uu & 0xffffffff) == 0) - { uu >>= 32; - p += 32; - } - if ((uu & 0xffff) == 0) - { uu >>= 16; - p += 16; - } - if ((uu & 0xff) == 0) - { p += 8; - } - - /* Loop over remaining blocks of chunks. */ - - do - { - /* Loop to quickly find the next non-zero block of used flags, or finish - up if we've added all the used blocks to the small accumulator. */ - - for (;;) - { u = *p; - if (u != 0) - { break; - } - p += 1; - if (p == e) - { return; - } - u = *p; - if (u != 0) - { break; - } - p += 1; - if (p == e) - { return; - } - u = *p; - if (u != 0) - { break; - } - p += 1; - if (p == e) - { return; - } - u = *p; - if (u != 0) - { break; - } - p += 1; - if (p == e) - { return; - } - } - - /* Find and process the chunks in this block that are used. We skip - forward based on the chunks_used flags until we're within eight - bits of a chunk that is in use. */ - - ix = (p - lacc->chunks_used) << 6; - if ((u & 0xffffffff) == 0) - { u >>= 32; - ix += 32; - } - if ((u & 0xffff) == 0) - { u >>= 16; - ix += 16; - } - if ((u & 0xff) == 0) - { u >>= 8; - ix += 8; - } - - do - { if (lacc->count[ix] >= 0) - { xsum_add_lchunk_to_small (lacc, ix); - } - ix += 1; - u >>= 1; - } while (u != 0); - - p += 1; - - } while (p != e); - } -# else - { xsum_expint ix; - - /* When there are no used flags, we scan sequentially for chunks that - need to be added to the small accumulator. */ - - for (ix = 0; ix < XSUM_LCHUNKS; ix++) - { if (lacc->count[ix] >= 0) - { xsum_add_lchunk_to_small (lacc, ix); - } - } - } -# endif -} - - /* ------------------------ EXTERNAL ROUTINES ------------------------------- */ @@ -1312,549 +988,6 @@ done_rounding: ; } -/* INITIALIZE A LARGE ACCUMULATOR TO ZERO. */ - -void xsum_large_init (xsum_large_accumulator *restrict lacc) -{ - xsum_large_init_chunks (lacc); - xsum_small_init (&lacc->sacc); -} - - -/* ADD A VECTOR OF FLOATING-POINT NUMBERS TO A LARGE ACCUMULATOR. */ - -void xsum_large_addv (xsum_large_accumulator *restrict lacc, - const xsum_flt *restrict vec, - xsum_length n) -{ -# if OPT_LARGE_SUM - { - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - - while (n > 3) - { - COPY64 (uintv, *vec); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - COPY64 (uintv, *vec); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - COPY64 (uintv, *vec); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - COPY64 (uintv, *vec); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 4; - } - - while (n > 0) - { - COPY64 (uintv, *vec); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - } - } -# else - { - /* Version not manually optimized - maybe the compiler can do better. */ - - if (n == 0) - { return; - } - - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - - do - { - /* Fetch the next number, and convert to integer form in uintv. */ - - COPY64 (uintv, *vec); - vec += 1; - - /* Isolate the upper sign+exponent bits that index the chunk. */ - - ix = uintv >> XSUM_MANTISSA_BITS; - - /* Find the count for this chunk, and subtract one. */ - - count = lacc->count[ix] - 1; - - if (count < 0) - { - /* If the decremented count is negative, it's either a special - Inf/NaN chunk (in which case count will stay at -1), or one that - needs to be transferred to the small accumulator, or one that - has never been used before and needs to be initialized. */ - - xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { - /* Store the decremented count of additions allowed before transfer, - and add this value to the chunk. */ - - lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - - } while (n > 0); - } -# endif -} - - -/* ADD ONE DOUBLE TO A LARGE ACCUMULATOR. Just calls xsum_large_addv. */ - -void xsum_large_add1 (xsum_large_accumulator *restrict lacc, xsum_flt value) -{ - xsum_large_addv (lacc, &value, 1); -} - - -/* ADD SQUARED NORM OF VECTOR OF FLOATING-POINT NUMBERS TO LARGE ACCUMULATOR. */ - -void xsum_large_add_sqnorm (xsum_large_accumulator *restrict lacc, - const xsum_flt *restrict vec, - xsum_length n) -{ -# if OPT_LARGE_SQNORM - { - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - double fltv; - - while (n > 3) - { - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 4; - } - - while (n > 0) - { - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - } - } -# else - { - /* Version not manually optimized - maybe the compiler can do better. */ - - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - double fltv; - - if (n == 0) - { return; - } - - do - { - /* Fetch the next number, square it, and convert to integer form in - uintv. */ - - fltv = *vec * *vec; - COPY64 (uintv, fltv); - vec += 1; - - /* Isolate the upper sign+exponent bits that index the chunk. */ - - ix = uintv >> XSUM_MANTISSA_BITS; - - /* Find the count for this chunk, and subtract one. */ - - count = lacc->count[ix] - 1; - - if (count < 0) - { - /* If the decremented count is negative, it's either a special - Inf/NaN chunk (in which case count will stay at -1), or one that - needs to be transferred to the small accumulator, or one that - has never been used before and needs to be initialized. */ - - xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { - /* Store the decremented count of additions allowed before transfer, - and add this value to the chunk. */ - - lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - - } while (n > 0); - } -# endif -} - - -/* ADD DOT PRODUCT OF VECTORS OF FLOATING-POINT NUMBERS TO LARGE ACCUMULATOR. */ - -void xsum_large_add_dot (xsum_large_accumulator *restrict lacc, - const xsum_flt *vec1, - const xsum_flt *vec2, - xsum_length n) -{ -# if OPT_LARGE_DOT - { - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - double fltv; - - while (n > 3) - { - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 4; - } - - while (n > 0) - { - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - ix = uintv >> XSUM_MANTISSA_BITS; - - count = lacc->count[ix] - 1; - - if (count < 0) - { xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - } - } -# else - { - /* Version not manually optimized - maybe the compiler can do better. */ - - xsum_lcount count; - xsum_expint ix; - xsum_uint uintv; - double fltv; - - if (n == 0) - { return; - } - - do - { - /* Fetch the next numbers, multiply them, and convert the result to - integer form in uintv. */ - - fltv = *vec1 * *vec2; - COPY64 (uintv, fltv); - vec1 += 1; vec2 += 1; - - /* Isolate the upper sign+exponent bits that index the chunk. */ - - ix = uintv >> XSUM_MANTISSA_BITS; - - /* Find the count for this chunk, and subtract one. */ - - count = lacc->count[ix] - 1; - - if (count < 0) - { - /* If the decremented count is negative, it's either a special - Inf/NaN chunk (in which case count will stay at -1), or one that - needs to be transferred to the small accumulator, or one that - has never been used before and needs to be initialized. */ - - xsum_large_add_value_inf_nan (lacc, ix, uintv); - } - else - { - /* Store the decremented count of additions allowed before transfer, - and add this value to the chunk. */ - - lacc->count[ix] = count; - lacc->chunk[ix] += uintv; - } - - n -= 1; - - } while (n > 0); - } -# endif -} - - -/* ADD A LARGE ACCUMULATOR TO ANOTHER LARGE ACCUMULATOR. The first argument - is the destination, which is modified. The second is the accumulator to - add, which may also be modified, but should still represent the same - number. Source and destination may be the same. */ - -void xsum_large_add_accumulator (xsum_large_accumulator *dst_lacc, - xsum_large_accumulator *src_lacc) -{ - xsum_large_transfer_to_small (src_lacc); - xsum_small_add_accumulator (&dst_lacc->sacc, &src_lacc->sacc); -} - - -/* NEGATE THE VALUE IN A LARGE ACCUMULATOR. */ - -void xsum_large_negate (xsum_large_accumulator *restrict lacc) -{ - xsum_large_transfer_to_small (lacc); - xsum_small_negate (&lacc->sacc); -} - - - -/* RETURN RESULT OF ROUNDING A LARGE ACCUMULATOR. Rounding mode is to nearest, - with ties to even. - - This is done by adding all the chunks in the large accumulator to the - small accumulator, and then calling its rounding procedure. */ - -xsum_flt xsum_large_round (xsum_large_accumulator *restrict lacc) -{ - xsum_large_transfer_to_small (lacc); - - return xsum_small_round (&lacc->sacc); -} - - -/* TRANSFER NUMBER FROM A LARGE ACCUMULATOR TO A SMALL ACCUMULATOR. */ - -void xsum_large_to_small_accumulator (xsum_small_accumulator *restrict sacc, - xsum_large_accumulator *restrict lacc) -{ - xsum_large_transfer_to_small (lacc); - *sacc = lacc->sacc; -} - - -/* TRANSFER NUMBER FROM A SMALL ACCUMULATOR TO A LARGE ACCUMULATOR. */ - -void xsum_small_to_large_accumulator (xsum_large_accumulator *restrict lacc, - xsum_small_accumulator *restrict sacc) -{ - xsum_large_init_chunks (lacc); - lacc->sacc = *sacc; -} - - /* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY UNSIGNED INTEGER. */ xsum_flt xsum_small_div_unsigned @@ -1987,236 +1120,3 @@ xsum_flt xsum_small_div_int { return xsum_small_div_unsigned (sacc, (unsigned) div); } } - - -/* FIND RESULT OF DIVIDING LARGE ACCUMULATOR BY UNSIGNED INTEGER. */ - -xsum_flt xsum_large_div_unsigned - (xsum_large_accumulator *restrict lacc, unsigned div) -{ - xsum_large_transfer_to_small (lacc); - return xsum_small_div_unsigned (&lacc->sacc, div); -} - - -/* FIND RESULT OF DIVIDING LARGE ACCUMULATOR BY SIGNED INTEGER. */ - -xsum_flt xsum_large_div_int - (xsum_large_accumulator *restrict lacc, int div) -{ - xsum_large_transfer_to_small (lacc); - return xsum_small_div_int (&lacc->sacc, div); -} - - -/* ------------------- ROUTINES FOR NON-EXACT SUMMATION --------------------- */ - - -/* SUM A VECTOR WITH DOUBLE FP ACCUMULATOR. */ - -xsum_flt xsum_sum_double (const xsum_flt *restrict vec, - xsum_length n) -{ double s; - xsum_length j; - s = 0.0; -# if OPT_SIMPLE_SUM - { for (j = 3; j < n; j += 4) - { s += vec[j-3]; - s += vec[j-2]; - s += vec[j-1]; - s += vec[j]; - } - for (j = j-3; j < n; j++) - { s += vec[j]; - } - } -# else - { for (j = 0; j < n; j++) - { s += vec[j]; - } - } -# endif - return (xsum_flt) s; -} - - -/* SUM A VECTOR WITH FLOAT128 ACCUMULATOR. */ - -#ifdef FLOAT128 - -#include - -xsum_flt xsum_sum_float128 (const xsum_flt *restrict vec, - xsum_length n) -{ __float128 s; - xsum_length j; - s = 0.0; - for (j = 0; j < n; j++) - { s += vec[j]; - } - return (xsum_flt) s; -} - -#endif - - -/* SUM A VECTOR WITH DOUBLE FP, NOT IN ORDER. */ - -xsum_flt xsum_sum_double_not_ordered (const xsum_flt *restrict vec, - xsum_length n) -{ double s[2] = { 0, 0 }; - xsum_length j; - for (j = 1; j < n; j += 2) - { s[0] += vec[j-1]; - s[1] += vec[j]; - } - if (j == n) - { s[0] += vec[j-1]; - } - return (xsum_flt) (s[0]+s[1]); -} - - -/* SUM A VECTOR WITH KAHAN'S METHOD. */ - -xsum_flt xsum_sum_kahan (const xsum_flt *restrict vec, - xsum_length n) -{ double s, t, c, y; - xsum_length j; - s = 0.0; - c = 0.0; -# if OPT_KAHAN_SUM - { for (j = 1; j < n; j += 2) - { y = vec[j-1] - c; - t = s; - s += y; - c = (s - t) - y; - y = vec[j] - c; - t = s; - s += y; - c = (s - t) - y; - } - for (j = j-1; j < n; j++) - { y = vec[j] - c; - t = s; - s += y; - c = (s - t) - y; - } - } -# else - { for (j = 0; j < n; j++) - { y = vec[j] - c; - t = s; - s += y; - c = (s - t) - y; - } - } -# endif - return (xsum_flt) s; -} - - -/* SQUARED NORM OF A VECTOR WITH DOUBLE FP ACCUMULATOR. */ - -xsum_flt xsum_sqnorm_double (const xsum_flt *restrict vec, - xsum_length n) -{ double s; - xsum_length j; - - s = 0.0; -# if OPT_SIMPLE_SQNORM - { double a, b, c, d; - for (j = 3; j < n; j += 4) - { a = vec[j-3]; - b = vec[j-2]; - c = vec[j-1]; - d = vec[j]; - s += a*a; - s += b*b; - s += c*c; - s += d*d; - } - for (j = j-3; j < n; j++) - { a = vec[j]; - s += a*a; - } - } -# else - { double a; - for (j = 0; j < n; j++) - { a = vec[j]; - s += a*a; - } - } -# endif - return (xsum_flt) s; -} - - -/* SQUARED NORM OF A VECTOR WITH DOUBLE FP, NOT IN ORDER. */ - -xsum_flt xsum_sqnorm_double_not_ordered (const xsum_flt *restrict vec, - xsum_length n) -{ double s[2] = { 0, 0 }; - double a[2]; - xsum_length j; - for (j = 1; j < n; j += 2) - { a[0] = vec[j-1]; - a[1] = vec[j]; - s[0] += a[0]*a[0]; - s[1] += a[1]*a[1]; - } - if (j == n) - { a[0] = vec[j-1]; - s[0] += a[0]*a[0]; - } - return (xsum_flt) (s[0]+s[1]); -} - - -/* DOT PRODUCT OF VECTORS WITH DOUBLE FP ACCUMULATOR. */ - -xsum_flt xsum_dot_double (const xsum_flt *vec1, - const xsum_flt *vec2, - xsum_length n) -{ double s; - xsum_length j; - - s = 0.0; -# if OPT_SIMPLE_DOT - { for (j = 3; j < n; j += 4) - { s += vec1[j-3] * vec2[j-3]; - s += vec1[j-2] * vec2[j-2]; - s += vec1[j-1] * vec2[j-1]; - s += vec1[j] * vec2[j]; - } - for (j = j-3; j < n; j++) - { s += vec1[j] * vec2[j]; - } - } -# else - { for (j = 0; j < n; j++) - { s += vec1[j] * vec2[j]; - } - } -# endif - return (xsum_flt) s; -} - - -/* DOT PRODUCT OF VECTORS WITH DOUBLE FP, NOT IN ORDER. */ - -xsum_flt xsum_dot_double_not_ordered (const xsum_flt *vec1, - const xsum_flt *vec2, - xsum_length n) -{ double s[2] = { 0, 0 }; - xsum_length j; - for (j = 1; j < n; j += 2) - { s[0] += vec1[j-1] * vec2[j-1]; - s[1] += vec1[j] * vec2[j]; - } - if (j == n) - { s[0] += vec1[j-1] * vec2[j-1]; - } - return (xsum_flt) (s[0]+s[1]); -} diff --git a/xsum.h b/xsum.h index 16522e3cf..2372cac61 100644 --- a/xsum.h +++ b/xsum.h @@ -99,30 +99,6 @@ typedef struct } xsum_small_accumulator; /* propagation must be done again */ -/* CONSTANTS DEFINING THE LARGE ACCUMULATOR FORMAT. */ - -#define XSUM_LCHUNK_BITS 64 /* Bits in chunk of the large accumulator */ -typedef uint64_t xsum_lchunk; /* Integer type of large accumulator chunk, - must be EXACTLY 64 bits in size */ - -#define XSUM_LCOUNT_BITS (64-XSUM_MANTISSA_BITS) /* # of bits in count */ -typedef int_least16_t xsum_lcount; /* Signed int type of counts for large acc.*/ - -#define XSUM_LCHUNKS \ - (1 << (XSUM_EXP_BITS+1)) /* # of chunks in large accumulator */ - -typedef uint64_t xsum_used; /* Unsigned type for holding used flags */ - -typedef struct -{ xsum_lchunk chunk[XSUM_LCHUNKS]; /* Chunks making up large accumulator */ - xsum_lcount count[XSUM_LCHUNKS]; /* Counts of # adds remaining for chunks, - or -1 if not used yet or special. */ - xsum_used chunks_used[XSUM_LCHUNKS/64]; /* Bits indicate chunks in use */ - xsum_used used_used; /* Bits indicate chunk_used entries not 0 */ - xsum_small_accumulator sacc; /* The small accumulator to condense into */ -} xsum_large_accumulator; - - /* TYPE FOR LENGTHS OF ARRAYS. Must be a signed integer type. Set to ptrdiff_t here on the assumption that this will be big enough, but not unnecessarily big, which seems to be true. */ @@ -145,41 +121,8 @@ void xsum_small_add_accumulator (xsum_small_accumulator *, void xsum_small_negate (xsum_small_accumulator *restrict); xsum_flt xsum_small_round (xsum_small_accumulator *restrict); -void xsum_large_init (xsum_large_accumulator *restrict); -void xsum_large_add1 (xsum_large_accumulator *restrict, xsum_flt); -void xsum_large_addv (xsum_large_accumulator *restrict, - const xsum_flt *restrict, xsum_length); -void xsum_large_add_sqnorm (xsum_large_accumulator *restrict, - const xsum_flt *restrict, xsum_length); -void xsum_large_add_dot (xsum_large_accumulator *restrict, - const xsum_flt *, const xsum_flt *, xsum_length); -void xsum_large_add_accumulator (xsum_large_accumulator *, - xsum_large_accumulator *); -void xsum_large_negate (xsum_large_accumulator *restrict); -xsum_flt xsum_large_round (xsum_large_accumulator *restrict); - -void xsum_large_to_small_accumulator (xsum_small_accumulator *restrict, - xsum_large_accumulator *restrict); -void xsum_small_to_large_accumulator (xsum_large_accumulator *restrict, - xsum_small_accumulator *restrict); - xsum_flt xsum_small_div_unsigned (xsum_small_accumulator *restrict, unsigned); xsum_flt xsum_small_div_int (xsum_small_accumulator *restrict, int); -xsum_flt xsum_large_div_unsigned (xsum_large_accumulator *restrict, unsigned); -xsum_flt xsum_large_div_int (xsum_large_accumulator *restrict, int); - - -/* FUNCTIONS FOR DOUBLE AND OTHER INEXACT SUMMATION. Used for testing. */ - -xsum_flt xsum_sum_double (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_sum_double_not_ordered (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_sum_float128 (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_sum_kahan (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_sqnorm_double (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_sqnorm_double_not_ordered (const xsum_flt *restrict, xsum_length); -xsum_flt xsum_dot_double (const xsum_flt *, const xsum_flt *, xsum_length); -xsum_flt xsum_dot_double_not_ordered (const xsum_flt *, const xsum_flt *, - xsum_length); /* DEBUG FLAG. Set to non-zero for debug ouptut. Ignored unless xsum.c