Skip to content

Commit 547eb95

Browse files
committed
Improve accuracy of conversions from floating-point numbers
This fixes a problem with the input range checking.
1 parent 2886070 commit 547eb95

File tree

2 files changed

+82
-14
lines changed

2 files changed

+82
-14
lines changed

src/normed.jl

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
4646
end
4747
N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))
4848

49-
(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)
50-
51-
function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
52-
y = round(widen1(rawone(U))*x)
53-
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
54-
U(_unsafe_trunc(T, y), 0)
49+
(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)
50+
51+
function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
52+
if T == UInt128 # for UInt128, we can't widen
53+
# the upper limit is not exact
54+
(0 <= x) & (x <= (typemax(T)/rawone(U))) || throw_converterror(U, x)
55+
y = round(rawone(U)*x)
56+
else
57+
y = round(widen1(rawone(U))*x)
58+
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
59+
end
60+
reinterpret(U, _unsafe_trunc(T, y))
5561
end
5662
# Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
57-
_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
58-
_convert(U, T, Float32(x))
59-
_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
60-
_convert(U, UInt128, Float32(x))
61-
function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
62-
y = round(rawone(U)*x) # for UInt128, we can't widen
63-
(0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
64-
U(_unsafe_trunc(UInt128, y), 0)
63+
function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
64+
if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
65+
x == Float16(typemax(T)/rawone(U)) && return typemax(U)
66+
end
67+
return _convert(U, Float32(x))
68+
end
69+
function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
70+
if T == UInt128 && f == 53
71+
0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
72+
else
73+
0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
74+
end
75+
76+
significand_bits = Tf == Float64 ? 52 : 23
77+
if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
78+
return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
79+
end
80+
# cf. the implementation of `frexp`
81+
Tw = f < sizeof(T) * 8 ? T : widen1(T)
82+
bits = sizeof(Tw) * 8 - 1
83+
xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
84+
k = Int(xu >> significand_bits)
85+
k == 0 && return zero(U) # neglect subnormal numbers
86+
significand = xu | (one(xu) << significand_bits)
87+
yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
88+
exponent_bias = Tf == Float64 ? 1023 : 127
89+
ex = exponent_bias - k + bits - f
90+
yi = bits >= f ? yh - (yh >> f) : yh
91+
if ex <= 0
92+
ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
93+
ex != -1 || signbit(signed(yi)) && return typemax(U)
94+
return reinterpret(U, unsafe_trunc(T, yi + yi))
95+
end
96+
ex > bits && return reinterpret(U, ex == bits + 1 ? one(T) : zero(T))
97+
yi += one(Tw)<<((ex - 1) & bits) # RoundNearestTiesUp
98+
return reinterpret(U, unsafe_trunc(T, yi >> (ex & bits)))
6599
end
66100

67101
rem(x::T, ::Type{T}) where {T <: Normed} = x

test/normed.jl

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,40 @@ end
102102
@test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
103103
end
104104

105+
@testset "conversion from float" begin
106+
# issue 102
107+
for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
108+
for Tf in (Float16, Float32, Float64)
109+
@testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
110+
U = Normed{T,f}
111+
r = FixedPointNumbers.rawone(U)
112+
113+
@test reinterpret(U(zero(Tf))) == 0x0
114+
115+
# TODO: fix issue #129
116+
# input_typemax = Tf(typemax(U))
117+
input_typemax = Tf(BigFloat(typemax(T)) / r)
118+
if isinf(input_typemax)
119+
@test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
120+
else
121+
@test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
122+
end
123+
124+
input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
125+
isinf(input_upper) && continue # for Julia v0.7
126+
@test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))
127+
128+
input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
129+
isinf(input_exp2) && continue
130+
@test reinterpret(U(input_exp2)) == T(input_exp2) * r
131+
end
132+
end
133+
end
134+
@test N0f32(Float32(0x0.7FFFFFp-32)) == zero(N0f32)
135+
@test N0f32(Float32(0x0.800000p-32)) <= eps(N0f32) # should be zero in RoundNearest mode
136+
@test N0f32(Float32(0x0.800001p-32)) == eps(N0f32)
137+
end
138+
105139
@testset "modulus" begin
106140
@test N0f8(0.2) % N0f8 === N0f8(0.2)
107141
@test N2f14(1.2) % N0f16 === N0f16(0.20002)

0 commit comments

Comments
 (0)