6
6
7
7
#include "ptypes.h"
8
8
#define FUNC_isqrt 1
9
- #define FUNC_icbrt 1
10
9
#define FUNC_lcm_ui 1
11
10
#define FUNC_ctz 1
12
11
#define FUNC_log2floor 1
@@ -726,31 +725,141 @@ int is_power(UV n, UV a)
726
725
return (ret == 1 ) ? 0 : ret ;
727
726
}
728
727
728
+ /******************************************************************************/
729
+
730
+ static float _cbrtf (float x )
731
+ {
732
+ float t , r ;
733
+ union { float f ; uint32_t i ; } xx = { x };
734
+ xx .i = (xx .i + 2129874493 )/3 ;
735
+ t = xx .f ;
736
+ /* One round of Halley's method gets to 15.53 bits */
737
+ r = t * t * t ;
738
+ t *= (x + (x + r )) / ((x + r ) + r );
739
+ #if BITS_PER_WORD > 45
740
+ /* A second round gets us the 21.5 bits we need. */
741
+ r = t * t * t ;
742
+ t += t * (x - r ) / (x + (r + r ));
743
+ #endif
744
+ return t ;
745
+ }
746
+ uint32_t icbrt (UV n ) {
747
+ if (n > 0 ) {
748
+ uint32_t root = (float )(_cbrtf ((float )n ) + 0.375f );
749
+ UV rem = n - (UV )root * root * root ;
750
+ return root - ((IV )rem < 0 );
751
+ }
752
+ return 0 ;
753
+ }
754
+
755
+ /******************************************************************************/
756
+
757
+ static UV _ipow (unsigned b , unsigned e , unsigned bit )
758
+ {
759
+ UV r = b ;
760
+ while (bit >>= 1 ) {
761
+ r *= r ;
762
+ if (e & bit )
763
+ r *= b ;
764
+ }
765
+ return r ;
766
+ }
767
+ /* Estimate the kth root of n.
768
+ * Correct (+/- 0) if n is a perfect power, otherwise +/- 1.
769
+ * Requires k >= 3 so a float can exactly represent the kth root.
770
+ */
771
+ static unsigned _est_root (UV n , unsigned k , unsigned msbit )
772
+ {
773
+ const int iterations = (k <= 6 ) ? 2 : (k <= 19 ) ? 1 : 0 ;
774
+ const float y = n , fk = (int )k ;
775
+ union { float f ; uint32_t i ; } both32 = { y };
776
+ const uint32_t float_one = (uint32_t )127 << 23 ;
777
+ float x ;
778
+ int i ;
779
+ if (n == 0 ) return 0 ;
780
+ if (k >= BITS_PER_WORD /2 ) {
781
+ if (k >= BITS_PER_WORD || n < (UV )1 << k ) return 1 ;
782
+ return 2 + (n > (UV )1 << k && k <= MPU_MAX_POW3 );
783
+ }
784
+ /* Get a simple initial estimte */
785
+ both32 .i = (both32 .i - float_one ) / k + float_one ;
786
+ x = both32 .f ;
787
+ /*
788
+ * Improve it with up to two rounds of Halley's method.
789
+ * Newton's (quadratic) method for a root of x^k - y == 0 is
790
+ * x += (y - x^k) / (k * x^(k-1))
791
+ * which simplifies a lot for fixed k, but for variable k,
792
+ * Halley's (cubic) method is not much more complex:
793
+ * x += 2*x*(y-x^k) / (k*(x^k+y) - (y - x^k))
794
+ */
795
+ for (i = 0 ; i < iterations ; i ++ ) {
796
+ float err , xk = x ;
797
+ unsigned j = msbit ;
798
+ while (j >>= 1 ) {
799
+ xk *= xk ;
800
+ if (j & k )
801
+ xk *= x ;
802
+ }
803
+ err = xk - y ;
804
+ x -= 2.0f * x * err / (fk * (xk + y ) + err );
805
+ }
806
+ return (int )(x + 0.375 );
807
+ }
808
+ /* Internal use only, k MUST be between 4 and 15, n > 0 */
809
+ /* This trims a lot over the generic version */
810
+ # define MAX_IROOTN ((BITS_PER_WORD == 64) ? 15 : 10)
811
+ static uint32_t _irootn (UV n , uint32_t k )
812
+ {
813
+ uint32_t const msb = 4 << (k >= 8 );
814
+ uint32_t const r = _est_root (n ,k ,msb );
815
+ return r - ((IV )(n - _ipow (r ,k ,msb )) < 0 );
816
+ }
817
+
818
+ /******************************************************************************/
819
+
729
820
#if BITS_PER_WORD == 64
730
821
static const uint32_t root_max [1 + MPU_MAX_POW3 ] = {0 ,0 ,4294967295U ,2642245 ,65535 ,7131 ,1625 ,565 ,255 ,138 ,84 ,56 ,40 ,30 ,23 ,19 ,15 ,13 ,11 ,10 ,9 ,8 ,7 ,6 ,6 ,5 ,5 ,5 ,4 ,4 ,4 ,4 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 };
731
822
#else
732
823
static const uint32_t root_max [1 + MPU_MAX_POW3 ] = {0 ,0 ,65535 ,1625 ,255 ,84 ,40 ,23 ,15 ,11 ,9 ,7 ,6 ,5 ,4 ,4 ,3 ,3 ,3 ,3 ,3 };
733
824
#endif
734
825
735
- UV rootint (UV n , UV k ) {
736
- UV lo , hi , max ;
737
- if (k == 0 ) return 0 ;
738
- if (k == 1 ) return n ;
739
- if (k == 2 ) return isqrt (n );
740
- if (k == 3 ) return icbrt (n );
741
-
742
- /* Bracket between powers of 2, but never exceed max power so ipow works */
743
- max = 1 + ((k > MPU_MAX_POW3 ) ? 2 : root_max [k ]);
744
- lo = UVCONST (1 ) << (log2floor (n )/k );
745
- hi = ((lo * 2 ) < max ) ? lo * 2 : max ;
826
+ UV rootint (UV n , uint32_t k )
827
+ {
828
+ if (n <= 1 ) return (k != 0 && n != 0 );
829
+
830
+ /* Switch from 0 to M makes significantly faster dispatch */
831
+ switch (k ) {
832
+ case 0 : return 0 ;
833
+ case 1 : return n ;
834
+ case 2 : return isqrt (n );
835
+ case 3 : return icbrt (n );
836
+ case 4 : return isqrt (isqrt (n ));
837
+ case 5 : return _irootn (n ,5 );
838
+ case 6 : return _irootn (n ,6 );
839
+ case 7 : return _irootn (n ,7 );
840
+ case 8 : return _irootn (n ,8 );
841
+ default : break ;
842
+ }
843
+ /* Be careful of the order these are done. */
844
+ if (n >> k == 0 ) return 1 ;
845
+ if (k > MPU_MAX_POW3 ) return 1 + (k < BITS_PER_WORD );
846
+
847
+ if (k <= MAX_IROOTN ) /* Faster up to about k=24 */
848
+ return _irootn (n ,k );
849
+
850
+ /* Binary search for root */
851
+ {
852
+ uint32_t lo = 1U << (log2floor (n )/k );
853
+ uint32_t hi = root_max [k ];
854
+ if (hi >= lo * 2 ) hi = lo * 2 - 1 ;
746
855
747
- /* Binary search */
748
- while (lo < hi ) {
749
- UV mid = lo + (hi - lo )/2 ;
750
- if (ipow (mid ,k ) <= n ) lo = mid + 1 ;
751
- else hi = mid ;
856
+ while (lo < hi ) {
857
+ uint32_t mid = lo + (hi - lo + 1 )/2 ;
858
+ if (ipow (mid ,k ) > n ) hi = mid - 1 ;
859
+ else lo = mid ;
860
+ }
861
+ return lo ;
752
862
}
753
- return lo - 1 ;
754
863
}
755
864
756
865
/* Like ipow but returns UV_MAX if overflow */
@@ -771,6 +880,10 @@ UV ipowsafe(UV n, UV k) {
771
880
return p ;
772
881
}
773
882
883
+
884
+ /******************************************************************************/
885
+
886
+
774
887
/* Like lcm_ui, but returns 0 if overflow */
775
888
UV lcmsafe (UV x , UV y ) {
776
889
y /= gcd_ui (x ,y );
0 commit comments