Improve accuracy of conversions from floating-point numbers

kimikage · kimikage · commit a41e65c0399e · 2019-10-29T09:45:13.000+09:00
This fixes a problem with the input range checking.
diff --git a/src/normed.jl b/src/normed.jl
@@ -41,27 +41,63 @@ rawone(v) = reinterpret(one(v))
 function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
     U = Normed{T,f}
     y = round((rawone(U)/rawone(x))*reinterpret(x))
-    (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
+    0 <= y <= typemax(T) || throw_converterror(U, x)
     reinterpret(U, _unsafe_trunc(T, y))
 end
 N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))
 
-(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)
+(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)
 
-function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
-    y = round(widen1(rawone(U))*x)
-    (0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
-    U(_unsafe_trunc(T, y), 0)
+function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
+    if T == UInt128 # for UInt128, we can't widen
+        # the following range is not exact.
+        0 <= x <= ((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
+        y = round(rawone(U)*x)
+    else
+        y = round(widen1(rawone(U))*x)
+        0 <= y <= typemax(T) || throw_converterror(U, x)
+    end
+    reinterpret(U, _unsafe_trunc(T, y))
 end
 # Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
-_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
-    _convert(U, T, Float32(x))
-_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
-    _convert(U, UInt128, Float32(x))
-function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
-    y = round(rawone(U)*x)   # for UInt128, we can't widen
-    (0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
-    U(_unsafe_trunc(UInt128, y), 0)
+function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
+    if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
+        x == Float16(typemax(T)/rawone(U)) && return typemax(U)
+    end
+    return _convert(U, Float32(x))
+end
+function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
+    if T == UInt128 && f == 53
+        0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
+    else
+        0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
+    end
+
+    significand_bits = Tf == Float64 ? 52 : 23
+    if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
+        return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
+    end
+    # cf. the implementation of `frexp`
+    Tw = f < sizeof(T) * 8 ? T : widen1(T)
+    bits = sizeof(Tw) * 8 - 1
+    xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
+    k = Int(xu >> significand_bits)
+    k == 0 && return zero(U) # neglect subnormal numbers
+    significand = xu | (one(xu) << significand_bits)
+    yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
+    exponent_bias = Tf == Float64 ? 1023 : 127
+    ex = exponent_bias - k + bits - f
+    if ex <= 0
+        yi = yh - (yh >> f)
+        ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
+        if ex == -1
+            (yi & (one(Tw) << bits)) != 0 && return typemax(U)
+            return reinterpret(U, unsafe_trunc(T, yi + yi))
+        end
+        return typemax(U)
+    end
+    yi = bits >= f ? yh - (yh >> f) + (one(Tw)<<(ex - 1)) : yh
+    return reinterpret(U, unsafe_trunc(T, yi >> ex))
 end
 
 rem(x::T, ::Type{T}) where {T <: Normed} = x
diff --git a/test/normed.jl b/test/normed.jl
@@ -102,6 +102,36 @@ end
     @test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
 end
 
+@testset "conversion from float" begin
+    # issue 102
+    for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
+        for Tf in (Float16, Float32, Float64)
+            @testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
+                U = Normed{T,f}
+                r = FixedPointNumbers.rawone(U)
+
+                @test reinterpret(U(zero(Tf))) == 0x0
+
+                # TODO: fix issue #129
+                # input_typemax = Tf(typemax(U))
+                input_typemax = Tf(BigFloat(typemax(T)) / r)
+                if isinf(input_typemax)
+                    @test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
+                else
+                    @test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
+                end
+
+                input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
+                @test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))
+
+                input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
+                isinf(input_exp2) && continue
+                @test reinterpret(U(input_exp2)) == T(input_exp2) * r
+            end
+        end
+    end
+end
+
 @testset "modulus" begin
     @test  N0f8(0.2) % N0f8  === N0f8(0.2)
     @test N2f14(1.2) % N0f16 === N0f16(0.20002)