|
| 1 | +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT |
| 2 | +// file at the top-level directory of this distribution and at |
| 3 | +// http://rust-lang.org/COPYRIGHT. |
| 4 | +// |
| 5 | +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 8 | +// option. This file may not be copied, modified, or distributed |
| 9 | +// except according to those terms. |
| 10 | + |
| 11 | +//! The various algorithms from the paper. |
| 12 | +
|
| 13 | +use num::flt2dec::strategy::grisu::Fp; |
| 14 | +use prelude::v1::*; |
| 15 | +use cmp::min; |
| 16 | +use cmp::Ordering::{Less, Equal, Greater}; |
| 17 | +use super::table; |
| 18 | +use super::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float}; |
| 19 | +use super::num::{self, Big}; |
| 20 | + |
| 21 | +/// Number of significand bits in Fp |
| 22 | +const P: u32 = 64; |
| 23 | + |
| 24 | +// We simply store the best approximation for *all* exponents, so |
| 25 | +// the variable "h" and the associated conditions can be omitted. |
| 26 | +// This trades performance for space (11 KiB versus... 5 KiB or so?) |
| 27 | + |
| 28 | +fn power_of_ten(e: i16) -> Fp { |
| 29 | + assert!(e >= table::MIN_E); |
| 30 | + let i = e - table::MIN_E; |
| 31 | + let sig = table::POWERS.0[i as usize]; |
| 32 | + let exp = table::POWERS.1[i as usize]; |
| 33 | + Fp { f: sig, e: exp } |
| 34 | +} |
| 35 | + |
| 36 | +/// The fast path of Bellerophon using machine-sized integers and floats. |
| 37 | +/// |
| 38 | +/// This is extracted into a separate function so that it can be attempted before constructing |
| 39 | +/// a bignum. |
| 40 | +pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> { |
| 41 | + let num_digits = integral.len() + fractional.len(); |
| 42 | + // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end, |
| 43 | + // this is just a quick, cheap rejection (and also frees the rest of the code from |
| 44 | + // worrying about underflow). |
| 45 | + if num_digits > 16 { |
| 46 | + return None; |
| 47 | + } |
| 48 | + if e.abs() >= T::ceil_log5_of_max_sig() as i64 { |
| 49 | + return None; |
| 50 | + } |
| 51 | + let f = num::from_str_unchecked(integral.iter().chain(fractional.iter())); |
| 52 | + if f > T::max_sig() { |
| 53 | + return None; |
| 54 | + } |
| 55 | + let e = e as i16; // Can't overflow because e.abs() <= LOG5_OF_EXP_N |
| 56 | + // The case e < 0 cannot be folded into the other branch. Negative powers result in |
| 57 | + // a repeating fractional part in binary, which are rounded, which causes real |
| 58 | + // (and occasioally quite significant!) errors in the final result. |
| 59 | + // The case `e == 0`, however, is unnecessary for correctness. It's just measurably faster. |
| 60 | + if e == 0 { |
| 61 | + Some(T::from_int(f)) |
| 62 | + } else if e > 0 { |
| 63 | + Some(T::from_int(f) * fp_to_float(power_of_ten(e))) |
| 64 | + } else { |
| 65 | + Some(T::from_int(f) / fp_to_float(power_of_ten(-e))) |
| 66 | + } |
| 67 | +} |
| 68 | + |
| 69 | +/// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis. |
| 70 | +/// |
| 71 | +/// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation |
| 72 | +/// of `10^e` (in the same floating point format). This is often enough to get the correct result. |
| 73 | +/// However, when the result is close to halfway between two adjecent (ordinary) floats, the |
| 74 | +/// compound rounding error from multiplying two approximation means the result may be off by a |
| 75 | +/// few bits. When this happens, the iterative Algorithm R fixes things up. |
| 76 | +/// |
| 77 | +/// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper. |
| 78 | +/// In the words of Clinger: |
| 79 | +/// |
| 80 | +/// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error |
| 81 | +/// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is |
| 82 | +/// > not a bound for the true error, but bounds the difference between the approximation z and |
| 83 | +/// > the best possible approximation that uses p bits of significand.) |
| 84 | +pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T { |
| 85 | + let slop; |
| 86 | + if f <= &Big::from_u64(T::max_sig()) { |
| 87 | + // The cases abs(e) < log5(2^N) are in fast_path() |
| 88 | + slop = if e >= 0 { 0 } else { 3 }; |
| 89 | + } else { |
| 90 | + slop = if e >= 0 { 1 } else { 4 }; |
| 91 | + } |
| 92 | + let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize(); |
| 93 | + let exp_p_n = 1 << (P - T::sig_bits() as u32); |
| 94 | + let lowbits: i64 = (z.f % exp_p_n) as i64; |
| 95 | + // Is the slop large enough to make a difference when |
| 96 | + // rounding to n bits? |
| 97 | + if (lowbits - exp_p_n as i64 / 2).abs() <= slop { |
| 98 | + algorithm_r(f, e, fp_to_float(z)) |
| 99 | + } else { |
| 100 | + fp_to_float(z) |
| 101 | + } |
| 102 | +} |
| 103 | + |
| 104 | +/// An iterative algorithm that improves a floating point approximation of `f * 10^e`. |
| 105 | +/// |
| 106 | +/// Each iteration gets one unit in the last place closer, which of course takes terribly long to |
| 107 | +/// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the |
| 108 | +/// starting approximation is off by at most one ULP. |
| 109 | +fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T { |
| 110 | + let mut z = z0; |
| 111 | + loop { |
| 112 | + let raw = z.unpack(); |
| 113 | + let (m, k) = (raw.sig, raw.k); |
| 114 | + let mut x = f.clone(); |
| 115 | + let mut y = Big::from_u64(m); |
| 116 | + |
| 117 | + // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`. |
| 118 | + // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the |
| 119 | + // power of two common to `10^e` and `2^k` to make the numbers smaller. |
| 120 | + make_ratio(&mut x, &mut y, e, k); |
| 121 | + |
| 122 | + let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32]; |
| 123 | + // This is written a bit awkwardly because our bignums don't support |
| 124 | + // negative numbers, so we use the absolute value + sign information. |
| 125 | + // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that |
| 126 | + // we need to worry about overflow, then they are also large enough that`make_ratio` has |
| 127 | + // reduced the fraction by a factor of 2^64 or more. |
| 128 | + let (d2, d_negative) = if x >= y { |
| 129 | + // Don't need x any more, save a clone(). |
| 130 | + x.sub(&y).mul_pow2(1).mul_digits(&m_digits); |
| 131 | + (x, false) |
| 132 | + } else { |
| 133 | + // Still need y - make a copy. |
| 134 | + let mut y = y.clone(); |
| 135 | + y.sub(&x).mul_pow2(1).mul_digits(&m_digits); |
| 136 | + (y, true) |
| 137 | + }; |
| 138 | + |
| 139 | + if d2 < y { |
| 140 | + let mut d2_double = d2; |
| 141 | + d2_double.mul_pow2(1); |
| 142 | + if m == T::min_sig() && d_negative && d2_double > y { |
| 143 | + z = prev_float(z); |
| 144 | + } else { |
| 145 | + return z; |
| 146 | + } |
| 147 | + } else if d2 == y { |
| 148 | + if m % 2 == 0 { |
| 149 | + if m == T::min_sig() && d_negative { |
| 150 | + z = prev_float(z); |
| 151 | + } else { |
| 152 | + return z; |
| 153 | + } |
| 154 | + } else if d_negative { |
| 155 | + z = prev_float(z); |
| 156 | + } else { |
| 157 | + z = next_float(z); |
| 158 | + } |
| 159 | + } else if d_negative { |
| 160 | + z = prev_float(z); |
| 161 | + } else { |
| 162 | + z = next_float(z); |
| 163 | + } |
| 164 | + } |
| 165 | +} |
| 166 | + |
| 167 | +/// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the |
| 168 | +/// significand of a floating point approximation, make the ratio `x / y` equal to |
| 169 | +/// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common. |
| 170 | +fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) { |
| 171 | + let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize); |
| 172 | + if e >= 0 { |
| 173 | + if k >= 0 { |
| 174 | + // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two. |
| 175 | + let common = min(e_abs, k_abs); |
| 176 | + x.mul_pow5(e_abs).mul_pow2(e_abs - common); |
| 177 | + y.mul_pow2(k_abs - common); |
| 178 | + } else { |
| 179 | + // x = f * 10^e * 2^abs(k), y = m |
| 180 | + // This can't overflow because it requires positive `e` and negative `k`, which can |
| 181 | + // only happen for values extremely close to 1, which means that `e` and `k` will be |
| 182 | + // comparatively tiny. |
| 183 | + x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs); |
| 184 | + } |
| 185 | + } else { |
| 186 | + if k >= 0 { |
| 187 | + // x = f, y = m * 10^abs(e) * 2^k |
| 188 | + // This can't overflow either, see above. |
| 189 | + y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs); |
| 190 | + } else { |
| 191 | + // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two. |
| 192 | + let common = min(e_abs, k_abs); |
| 193 | + x.mul_pow2(k_abs - common); |
| 194 | + y.mul_pow5(e_abs).mul_pow2(e_abs - common); |
| 195 | + } |
| 196 | + } |
| 197 | +} |
| 198 | + |
| 199 | +/// Conceptually, Algorithm M is the simplest way to convert a decimal to a float. |
| 200 | +/// |
| 201 | +/// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives |
| 202 | +/// a valid float significand. The binary exponent `k` is the number of times we multiplied |
| 203 | +/// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`. |
| 204 | +/// When we have found out significand, we only need to round by inspecting the remainder of the |
| 205 | +/// division, which is done in helper functions further below. |
| 206 | +/// |
| 207 | +/// This algorithm is super slow, even with the optimization described in `quick_start()`. |
| 208 | +/// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal |
| 209 | +/// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed. |
| 210 | +/// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand, |
| 211 | +/// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return |
| 212 | +/// infinity. |
| 213 | +/// |
| 214 | +/// Handling underflow and subnormals is trickier. One big problem is that, with the minimum |
| 215 | +/// exponent, the ratio might still be too large for a significand. See underflow() for details. |
| 216 | +pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T { |
| 217 | + let mut u; |
| 218 | + let mut v; |
| 219 | + let e_abs = e.abs() as usize; |
| 220 | + let mut k = 0; |
| 221 | + if e < 0 { |
| 222 | + u = f.clone(); |
| 223 | + v = Big::from_small(1); |
| 224 | + v.mul_pow5(e_abs).mul_pow2(e_abs); |
| 225 | + } else { |
| 226 | + // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of |
| 227 | + // fp_to_float(big_to_fp(u)) here, only without the double rounding. |
| 228 | + u = f.clone(); |
| 229 | + u.mul_pow5(e_abs).mul_pow2(e_abs); |
| 230 | + v = Big::from_small(1); |
| 231 | + } |
| 232 | + quick_start::<T>(&mut u, &mut v, &mut k); |
| 233 | + let mut rem = Big::from_small(0); |
| 234 | + let mut x = Big::from_small(0); |
| 235 | + let min_sig = Big::from_u64(T::min_sig()); |
| 236 | + let max_sig = Big::from_u64(T::max_sig()); |
| 237 | + loop { |
| 238 | + u.div_rem(&v, &mut x, &mut rem); |
| 239 | + if k == T::min_exp_int() { |
| 240 | + // We have to stop at the minimum exponent, if we wait until `k < T::min_exp_int()`, |
| 241 | + // then we'd be off by a factor of two. Unfortunately this means we have to special- |
| 242 | + // case normal numbers with the minimum exponent. |
| 243 | + // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure |
| 244 | + // that it's actually correct! |
| 245 | + if x >= min_sig && x <= max_sig { |
| 246 | + break; |
| 247 | + } |
| 248 | + return underflow(x, v, rem); |
| 249 | + } |
| 250 | + if k > T::max_exp_int() { |
| 251 | + return T::infinity(); |
| 252 | + } |
| 253 | + if x < min_sig { |
| 254 | + u.mul_pow2(1); |
| 255 | + k -= 1; |
| 256 | + } else if x > max_sig { |
| 257 | + v.mul_pow2(1); |
| 258 | + k += 1; |
| 259 | + } else { |
| 260 | + break; |
| 261 | + } |
| 262 | + } |
| 263 | + let q = num::to_u64(&x); |
| 264 | + let z = rawfp::encode_normal(Unpacked::new(q, k)); |
| 265 | + round_by_remainder(v, rem, q, z) |
| 266 | +} |
| 267 | + |
| 268 | +/// Skip over most AlgorithmM iterations by checking the bit length. |
| 269 | +fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) { |
| 270 | + // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v). |
| 271 | + // The estimate is off by at most 1, but always an under-estimate, so the error on log(u) |
| 272 | + // and log(v) are of the same sign and cancel out (if both are large). Therefore the error |
| 273 | + // for log(u / v) is at most one as well. |
| 274 | + // The target ratio is one where u/v is in an in-range significand. Thus our termination |
| 275 | + // condition is log2(u / v) being the significand bits, plus/minus one. |
| 276 | + // FIXME Looking at the second bit could improve the estimate and avoid some more divisions. |
| 277 | + let target_ratio = f64::sig_bits() as i16; |
| 278 | + let log2_u = u.bit_length() as i16; |
| 279 | + let log2_v = v.bit_length() as i16; |
| 280 | + let mut u_shift: i16 = 0; |
| 281 | + let mut v_shift: i16 = 0; |
| 282 | + assert!(*k == 0); |
| 283 | + loop { |
| 284 | + if *k == T::min_exp_int() { |
| 285 | + // Underflow or subnormal. Leave it to the main function. |
| 286 | + break; |
| 287 | + } |
| 288 | + if *k == T::max_exp_int() { |
| 289 | + // Overflow. Leave it to the main function. |
| 290 | + break; |
| 291 | + } |
| 292 | + let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift); |
| 293 | + if log2_ratio < target_ratio - 1 { |
| 294 | + u_shift += 1; |
| 295 | + *k -= 1; |
| 296 | + } else if log2_ratio > target_ratio + 1 { |
| 297 | + v_shift += 1; |
| 298 | + *k += 1; |
| 299 | + } else { |
| 300 | + break; |
| 301 | + } |
| 302 | + } |
| 303 | + u.mul_pow2(u_shift as usize); |
| 304 | + v.mul_pow2(v_shift as usize); |
| 305 | +} |
| 306 | + |
| 307 | +fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T { |
| 308 | + if x < Big::from_u64(T::min_sig()) { |
| 309 | + let q = num::to_u64(&x); |
| 310 | + let z = rawfp::encode_subnormal(q); |
| 311 | + return round_by_remainder(v, rem, q, z); |
| 312 | + } |
| 313 | + // Ratio isn't an in-range significand with the minimum exponent, so we need to round off |
| 314 | + // excess bits and adjust the exponent accordingly. The real value now looks like this: |
| 315 | + // |
| 316 | + // x lsb |
| 317 | + // /--------------\/ |
| 318 | + // 1010101010101010.10101010101010 * 2^k |
| 319 | + // \-----/\-------/ \------------/ |
| 320 | + // q trunc. (represented by rem) |
| 321 | + // |
| 322 | + // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding |
| 323 | + // on their own. When they are equal and the remainder is non-zero, the value still |
| 324 | + // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainer |
| 325 | + // is zero, we have a half-to-even situation. |
| 326 | + let bits = x.bit_length(); |
| 327 | + let lsb = bits - T::sig_bits() as usize; |
| 328 | + let q = num::get_bits(&x, lsb, bits); |
| 329 | + let k = T::min_exp_int() + lsb as i16; |
| 330 | + let z = rawfp::encode_normal(Unpacked::new(q, k)); |
| 331 | + let q_even = q % 2 == 0; |
| 332 | + match num::compare_with_half_ulp(&x, lsb) { |
| 333 | + Greater => next_float(z), |
| 334 | + Less => z, |
| 335 | + Equal if rem.is_zero() && q_even => z, |
| 336 | + Equal => next_float(z), |
| 337 | + } |
| 338 | +} |
| 339 | + |
| 340 | +/// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division. |
| 341 | +fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T { |
| 342 | + let mut v_minus_r = v; |
| 343 | + v_minus_r.sub(&r); |
| 344 | + if r < v_minus_r { |
| 345 | + z |
| 346 | + } else if r > v_minus_r { |
| 347 | + next_float(z) |
| 348 | + } else if q % 2 == 0 { |
| 349 | + z |
| 350 | + } else { |
| 351 | + next_float(z) |
| 352 | + } |
| 353 | +} |
0 commit comments