Skip to content

Commit 7ed954f

Browse files
committed
icbrt(n) no longer inlined. Faster icbrt(n). Faster rootint(n,k).
1 parent 01cf24f commit 7ed954f

File tree

7 files changed

+133
-50
lines changed

7 files changed

+133
-50
lines changed

factor.c

-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <math.h>
55

66
#define FUNC_isqrt 1
7-
#define FUNC_icbrt 1
87
#define FUNC_gcd_ui 1
98
#define FUNC_is_perfect_square 1
109
#define FUNC_clz 1

lehmer.c

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
#define USE_PHI_CACHE 1
77
#define FUNC_isqrt 1
8-
#define FUNC_icbrt 1
98
#include "lehmer.h"
109
#include "util.h"
1110
#include "cache.h"

lmo.c

-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
#define PHI_SIEVE_MULT 13
6262

6363
#define FUNC_isqrt 1
64-
#define FUNC_icbrt 1
6564
#include "util.h"
6665
#include "constants.h"
6766
#include "prime_counts.h"

semi_primes.c

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "ptypes.h"
66
#include "constants.h"
77
#define FUNC_isqrt 1
8-
#define FUNC_icbrt 1
98
#define FUNC_ctz 1
109
#include "util.h"
1110
#include "cache.h"

totients.c

-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
#include "ptypes.h"
66
#define FUNC_isqrt 1
7-
#define FUNC_icbrt 1
87
#define FUNC_ctz 1
98
#define FUNC_ipow 1
109
#include "util.h"

util.c

+131-18
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
#include "ptypes.h"
88
#define FUNC_isqrt 1
9-
#define FUNC_icbrt 1
109
#define FUNC_lcm_ui 1
1110
#define FUNC_ctz 1
1211
#define FUNC_log2floor 1
@@ -726,31 +725,141 @@ int is_power(UV n, UV a)
726725
return (ret == 1) ? 0 : ret;
727726
}
728727

728+
/******************************************************************************/
729+
730+
static float _cbrtf(float x)
731+
{
732+
float t, r;
733+
union { float f; uint32_t i; } xx = { x };
734+
xx.i = (xx.i + 2129874493)/3;
735+
t = xx.f;
736+
/* One round of Halley's method gets to 15.53 bits */
737+
r = t * t * t;
738+
t *= (x + (x + r)) / ((x + r) + r);
739+
#if BITS_PER_WORD > 45
740+
/* A second round gets us the 21.5 bits we need. */
741+
r = t * t * t;
742+
t += t * (x - r) / (x + (r + r));
743+
#endif
744+
return t;
745+
}
746+
uint32_t icbrt(UV n) {
747+
if (n > 0) {
748+
uint32_t root = (float)(_cbrtf((float)n) + 0.375f);
749+
UV rem = n - (UV)root * root * root;
750+
return root - ((IV)rem < 0);
751+
}
752+
return 0;
753+
}
754+
755+
/******************************************************************************/
756+
757+
static UV _ipow(unsigned b, unsigned e, unsigned bit)
758+
{
759+
UV r = b;
760+
while (bit >>= 1) {
761+
r *= r;
762+
if (e & bit)
763+
r *= b;
764+
}
765+
return r;
766+
}
767+
/* Estimate the kth root of n.
768+
* Correct (+/- 0) if n is a perfect power, otherwise +/- 1.
769+
* Requires k >= 3 so a float can exactly represent the kth root.
770+
*/
771+
static unsigned _est_root(UV n, unsigned k, unsigned msbit)
772+
{
773+
const int iterations = (k <= 6) ? 2 : (k <= 19) ? 1 : 0;
774+
const float y = n, fk = (int)k;
775+
union { float f; uint32_t i; } both32 = { y };
776+
const uint32_t float_one = (uint32_t)127 << 23;
777+
float x;
778+
int i;
779+
if (n == 0) return 0;
780+
if (k >= BITS_PER_WORD/2) {
781+
if (k >= BITS_PER_WORD || n < (UV)1 << k) return 1;
782+
return 2 + (n > (UV)1 << k && k <= MPU_MAX_POW3);
783+
}
784+
/* Get a simple initial estimte */
785+
both32.i = (both32.i - float_one) / k + float_one;
786+
x = both32.f;
787+
/*
788+
* Improve it with up to two rounds of Halley's method.
789+
* Newton's (quadratic) method for a root of x^k - y == 0 is
790+
* x += (y - x^k) / (k * x^(k-1))
791+
* which simplifies a lot for fixed k, but for variable k,
792+
* Halley's (cubic) method is not much more complex:
793+
* x += 2*x*(y-x^k) / (k*(x^k+y) - (y - x^k))
794+
*/
795+
for (i = 0; i < iterations; i++) {
796+
float err, xk = x;
797+
unsigned j = msbit;
798+
while (j >>= 1) {
799+
xk *= xk;
800+
if (j & k)
801+
xk *= x;
802+
}
803+
err = xk - y;
804+
x -= 2.0f*x*err / (fk*(xk+y) + err);
805+
}
806+
return (int)(x + 0.375);
807+
}
808+
/* Internal use only, k MUST be between 4 and 15, n > 0 */
809+
/* This trims a lot over the generic version */
810+
# define MAX_IROOTN ((BITS_PER_WORD == 64) ? 15 : 10)
811+
static uint32_t _irootn(UV n, uint32_t k)
812+
{
813+
uint32_t const msb = 4 << (k >= 8);
814+
uint32_t const r = _est_root(n,k,msb);
815+
return r - ((IV)(n - _ipow(r,k,msb)) < 0);
816+
}
817+
818+
/******************************************************************************/
819+
729820
#if BITS_PER_WORD == 64
730821
static const uint32_t root_max[1+MPU_MAX_POW3] = {0,0,4294967295U,2642245,65535,7131,1625,565,255,138,84,56,40,30,23,19,15,13,11,10,9,8,7,6,6,5,5,5,4,4,4,4,3,3,3,3,3,3,3,3,3};
731822
#else
732823
static const uint32_t root_max[1+MPU_MAX_POW3] = {0,0,65535,1625,255,84,40,23,15,11,9,7,6,5,4,4,3,3,3,3,3};
733824
#endif
734825

735-
UV rootint(UV n, UV k) {
736-
UV lo, hi, max;
737-
if (k == 0) return 0;
738-
if (k == 1) return n;
739-
if (k == 2) return isqrt(n);
740-
if (k == 3) return icbrt(n);
741-
742-
/* Bracket between powers of 2, but never exceed max power so ipow works */
743-
max = 1 + ((k > MPU_MAX_POW3) ? 2 : root_max[k]);
744-
lo = UVCONST(1) << (log2floor(n)/k);
745-
hi = ((lo*2) < max) ? lo*2 : max;
826+
UV rootint(UV n, uint32_t k)
827+
{
828+
if (n <= 1) return (k != 0 && n != 0);
829+
830+
/* Switch from 0 to M makes significantly faster dispatch */
831+
switch (k) {
832+
case 0: return 0;
833+
case 1: return n;
834+
case 2: return isqrt(n);
835+
case 3: return icbrt(n);
836+
case 4: return isqrt(isqrt(n));
837+
case 5: return _irootn(n,5);
838+
case 6: return _irootn(n,6);
839+
case 7: return _irootn(n,7);
840+
case 8: return _irootn(n,8);
841+
default: break;
842+
}
843+
/* Be careful of the order these are done. */
844+
if (n >> k == 0) return 1;
845+
if (k > MPU_MAX_POW3) return 1 + (k < BITS_PER_WORD);
846+
847+
if (k <= MAX_IROOTN) /* Faster up to about k=24 */
848+
return _irootn(n,k);
849+
850+
/* Binary search for root */
851+
{
852+
uint32_t lo = 1U << (log2floor(n)/k);
853+
uint32_t hi = root_max[k];
854+
if (hi >= lo*2) hi = lo*2 - 1;
746855

747-
/* Binary search */
748-
while (lo < hi) {
749-
UV mid = lo + (hi-lo)/2;
750-
if (ipow(mid,k) <= n) lo = mid+1;
751-
else hi = mid;
856+
while (lo < hi) {
857+
uint32_t mid = lo + (hi-lo+1)/2;
858+
if (ipow(mid,k) > n) hi = mid-1;
859+
else lo = mid;
860+
}
861+
return lo;
752862
}
753-
return lo-1;
754863
}
755864

756865
/* Like ipow but returns UV_MAX if overflow */
@@ -771,6 +880,10 @@ UV ipowsafe(UV n, UV k) {
771880
return p;
772881
}
773882

883+
884+
/******************************************************************************/
885+
886+
774887
/* Like lcm_ui, but returns 0 if overflow */
775888
UV lcmsafe(UV x, UV y) {
776889
y /= gcd_ui(x,y);

util.h

+2-27
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ extern void print_primes(UV low, UV high, int fd);
3434
extern uint32_t powerof(UV n);
3535
extern int is_power(UV n, UV a);
3636

37-
extern UV rootint(UV n, UV k);
37+
extern uint32_t icbrt(UV n);
38+
extern UV rootint(UV n, uint32_t k);
3839
extern UV ipowsafe(UV n, UV k); /* returns UV_MAX if overflows */
3940
extern UV lcmsafe(UV x, UV u); /* returns 0 if overflows */
4041
extern UV valuation(UV n, UV k);
@@ -186,9 +187,6 @@ extern UV gcdz(UV x, UV y);
186187
#if defined(FUNC_is_perfect_square) && !defined(FUNC_isqrt)
187188
#define FUNC_isqrt 1
188189
#endif
189-
#if defined(FUNC_is_perfect_cube) && !defined(FUNC_icbrt)
190-
#define FUNC_icbrt 1
191-
#endif
192190
#if defined(FUNC_lcm_ui) && !defined(FUNC_gcd_ui)
193191
#define FUNC_gcd_ui 1
194192
#endif
@@ -340,29 +338,6 @@ static uint32_t isqrt(UV n) {
340338
}
341339
#endif
342340

343-
#if defined(FUNC_icbrt)
344-
/* Rather slow method from Hacker's Delight */
345-
static uint32_t icbrt(UV n) {
346-
UV b, root = 0;
347-
#if BITS_PER_WORD == 32
348-
int s = 30;
349-
if (n >= UVCONST(4291015625)) return UVCONST(1625);
350-
#else
351-
int s = 63;
352-
if (n >= UVCONST(18446724184312856125)) return UVCONST(2642245);
353-
#endif
354-
for ( ; s >= 0; s -= 3) {
355-
root += root;
356-
b = 3*root*(root+1)+1;
357-
if ((n >> s) >= b) {
358-
n -= b << s;
359-
root++;
360-
}
361-
}
362-
return root;
363-
}
364-
#endif
365-
366341
#if defined(FUNC_gcd_ui)
367342
/* If we have a very fast ctz, then use the fast FLINT version of gcd */
368343
#if defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))

0 commit comments

Comments
 (0)