Skip to content

Commit 2acf47d

Browse files
authored
Merge pull request #18287 from pkofod/master
Move Float16 functions away from float16.jl to places where other flo…
2 parents 713e995 + 9356115 commit 2acf47d

File tree

8 files changed

+158
-171
lines changed

8 files changed

+158
-171
lines changed

base/bool.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## boolean conversions ##
44

55
# Bool -> Bool is the identity.
convert(::Type{Bool}, x::Bool) = x
6+
# Float16 -> Bool: exactly 0 maps to false, exactly 1 maps to true, and any
# other value throws InexactError.
# NOTE(review): the body is identical to the Real method below; presumably the
# separate Float16 method exists for dispatch reasons — confirm.
convert(::Type{Bool}, x::Float16) = x==0 ? false : x==1 ? true : throw(InexactError())
67
# Generic Real -> Bool: only 0 and 1 convert; everything else throws InexactError.
convert(::Type{Bool}, x::Real) = x==0 ? false : x==1 ? true : throw(InexactError())
78

89
# promote Bool to any other numeric type

base/essentials.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ unsafe_convert{T}(::Type{T}, x::T) = x # unsafe_convert (like convert) defaults
8787
unsafe_convert{P<:Ptr}(::Type{P}, x::Ptr) = convert(P, x)
8888

8989
# Generic reinterpret: hand `x` to the `box` intrinsic with target type `T`.
reinterpret{T}(::Type{T}, x) = box(T, x)
90+
# Asking for the abstract `Unsigned` view of a Float16 yields its bits as UInt16.
reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16,x)
91+
# Asking for the abstract `Signed` view of a Float16 yields its bits as Int16.
reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16,x)
9092

9193
sizeof(x) = Core.sizeof(x)
9294

base/float.jl

Lines changed: 149 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ for t1 in (Float32,Float64)
6363
end
6464
end
6565
end
66+
# Float16 -> Integer: widen to Float32 first, then reuse the Float32 -> T
# conversion.
convert{T<:Integer}(::Type{T}, x::Float16) = convert(T, Float32(x))
67+
6668

6769
promote_rule(::Type{Float64}, ::Type{UInt128}) = Float64
6870
promote_rule(::Type{Float64}, ::Type{Int128}) = Float64
@@ -129,13 +131,110 @@ function convert(::Type{Float32}, x::Int128)
129131
reinterpret(Float32, s | d + y)
130132
end
131133

134+
# Narrow a Float32 to Float16, rounding to nearest with ties to even.
#
# The 9-bit sign+exponent field of `val` (plus one, for 1-based indexing)
# selects an entry in `basetable` (the Float16 sign/exponent bits) and in
# `shifttable` (how far the 23-bit significand is shifted right).
#
# NaN inputs are handled up front rather than by the table/rounding path,
# which mishandles them in three ways:
#   * a payload that lives only in the discarded low bits truncates to the
#     Inf bit pattern;
#   * rounding an all-ones payload carries out of the significand, turning a
#     positive NaN into -0.0;
#   * the same carry on a negative NaN overflows the UInt16 result.
# A NaN result keeps the input's sign and top ten payload bits and always has
# the quiet bit set, so its significand is never zero.
function convert(::Type{Float16}, val::Float32)
    f = reinterpret(UInt32, val)
    if (f & 0x7f800000) == 0x7f800000 && (f & 0x007fffff) != 0
        # sign (bit 31 -> bit 15) | all-ones exponent + quiet bit | top payload bits
        nanbits = ((f >> 16) & 0x8000) | 0x7e00 | ((f >> 13) & 0x03ff)
        return reinterpret(Float16, UInt16(nanbits))
    end
    i = (f >> 23) & 0x1ff + 1
    sh = shifttable[i]
    f &= 0x007fffff
    h::UInt16 = basetable[i] + (f >> sh)
    # round: `nextbit` is the most significant bit that was shifted out
    nextbit = (f >> (sh-1)) & 1
    if nextbit != 0
        # Round halfway to even or check lower bits
        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
            h += 1
        end
    end
    return reinterpret(Float16, h)
end
152+
153+
# Widen a Float16 to Float32 by splitting the 16-bit pattern into sign,
# exponent and significand fields and reassembling them in the
# single-precision layout.
#
# Signed zeros keep their sign, subnormal halves are renormalized, infinities
# keep their sign, and every NaN becomes the quiet NaN pattern 0x7fc00000
# combined with the input's sign bit (the half-precision payload is dropped).
function convert(::Type{Float32}, val::Float16)
    bits::UInt32 = reinterpret(UInt16, val)
    signbit::UInt32 = ((bits & 0x8000) >> 15) << 31
    expfield::UInt32 = (bits & 0x7c00) >> 10
    frac::UInt32 = bits & 0x03ff
    out::UInt32 = signbit

    if expfield == 0x1f
        # all-ones exponent: Inf when the significand is empty, NaN otherwise
        out |= (frac == 0 ? 0x7f800000 : 0x7fc00000)
    elseif expfield != 0
        # normal number: rebias the exponent (15 -> 127), left-align the significand
        out |= UInt32((expfield - 15 + 127) << 23) | (frac << (23 - 10))
    elseif frac != 0
        # subnormal: find the leading set bit of the significand; `shift`
        # counts how far it sits below the implicit-one position
        shift = 1
        probe = 0x0200
        while (probe & frac) == 0
            shift += 1
            probe >>= 1
        end
        # the leading bit becomes the implicit one, the rest is realigned
        out |= UInt32((-14 - shift + 127) << 23) |
               (((frac & (~probe)) << shift) << (23 - 10))
    end
    # a zero exponent with a zero significand leaves only the sign bit set
    return reinterpret(Float32, out)
end
194+
195+
# Float32 -> Float16 algorithm from:
196+
# "Fast Half Float Conversion" by Jeroen van der Zijp
197+
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
198+
199+
# Lookup tables for the Float32 -> Float16 conversion. Both have 512 slots:
# one per combination of the 8-bit Float32 exponent field and the sign bit.
# `basetable` holds the Float16 sign/exponent bits for a slot and `shifttable`
# holds how far the Float32 significand is shifted right for that slot.
const basetable = Array{UInt16}(512)
200+
const shifttable = Array{UInt8}(512)
201+
202+
# `i` walks every exponent field value. Slot `i+1` is filled for the positive
# sign and slot `(i|0x100)+1` for the negative sign; the negative base entries
# carry the Float16 sign bit 0x8000.
for i = 0:255
203+
# exponent field with the bias of 127 removed
e = i - 127
204+
if e < -24 # Very small numbers map to zero
205+
basetable[i|0x000+1] = 0x0000
206+
basetable[i|0x100+1] = 0x8000
207+
# a shift of 24 moves all 23 significand bits out
shifttable[i|0x000+1] = 24
208+
shifttable[i|0x100+1] = 24
209+
elseif e < -14 # Small numbers map to denorms
210+
# the leading one is stored explicitly, moved right as `e` decreases
basetable[i|0x000+1] = (0x0400>>(-e-14))
211+
basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
212+
shifttable[i|0x000+1] = -e-1
213+
shifttable[i|0x100+1] = -e-1
214+
elseif e <= 15 # Normal numbers just lose precision
215+
# re-bias the exponent by 15 and place it above the 10 significand bits
basetable[i|0x000+1] = ((e+15)<<10)
216+
basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
217+
# 23 - 10 = 13 low significand bits are dropped
shifttable[i|0x000+1] = 13
218+
shifttable[i|0x100+1] = 13
219+
elseif e < 128 # Large numbers map to Infinity
220+
basetable[i|0x000+1] = 0x7C00
221+
basetable[i|0x100+1] = 0xFC00
222+
shifttable[i|0x000+1] = 24
223+
shifttable[i|0x100+1] = 24
224+
else # Infinity and NaN's stay Infinity and NaN's
225+
basetable[i|0x000+1] = 0x7C00
226+
basetable[i|0x100+1] = 0xFC00
227+
# keep the top 10 significand bits so that a NaN payload can survive
shifttable[i|0x000+1] = 13
228+
shifttable[i|0x100+1] = 13
229+
end
230+
end
132231
#convert(::Type{Float16}, x::Float32) = box(Float16,fptrunc(Float16,x))
133-
convert(::Type{Float16}, x::Float64) = convert(Float16, convert(Float32,x))
134232
convert(::Type{Float32}, x::Float64) = box(Float32,fptrunc(Float32,unbox(Float64,x)))
233+
# Float64 -> Float16 goes through Float32: narrow to Float32 first, then to
# Float16.
# NOTE(review): this rounds twice, so the result may differ from a single
# correctly rounded Float64 -> Float16 conversion for some inputs — confirm
# whether that is acceptable.
convert(::Type{Float16}, x::Float64) = convert(Float16, convert(Float32,x))
135234

136235
#convert(::Type{Float32}, x::Float16) = box(Float32,fpext(Float32,x))
137-
convert(::Type{Float64}, x::Float16) = convert(Float64, convert(Float32,x))
138236
convert(::Type{Float64}, x::Float32) = box(Float64,fpext(Float64,unbox(Float32,x)))
237+
convert(::Type{Float64}, x::Float16) = convert(Float64, convert(Float32,x))
139238

140239
convert(::Type{AbstractFloat}, x::Bool) = convert(Float64, x)
141240
convert(::Type{AbstractFloat}, x::Int8) = convert(Float64, x)
@@ -204,23 +303,31 @@ trunc(::Type{Unsigned}, x::Float32) = trunc(UInt,x)
204303
trunc(::Type{Unsigned}, x::Float64) = trunc(UInt,x)
205304
trunc(::Type{Integer}, x::Float32) = trunc(Int,x)
206305
trunc(::Type{Integer}, x::Float64) = trunc(Int,x)
306+
# Float16 -> Integer truncation: widen to Float32 and reuse its method.
trunc{T<:Integer}(::Type{T}, x::Float16) = trunc(T, Float32(x))
207307

208308
# fallbacks
209309
# Generic floor to an Integer type: floor in floating point, then truncate to T.
floor{T<:Integer}(::Type{T}, x::AbstractFloat) = trunc(T,floor(x))
310+
# Float16 overrides the fallback by widening to Float32 first.
floor{T<:Integer}(::Type{T}, x::Float16) = floor(T, Float32(x))
210311
# Generic ceil to an Integer type: ceil in floating point, then truncate to T.
ceil{ T<:Integer}(::Type{T}, x::AbstractFloat) = trunc(T,ceil(x))
312+
# Float16 overrides the fallback by widening to Float32 first.
ceil{ T<:Integer}(::Type{T}, x::Float16) = ceil(T, Float32(x))
211313
# Generic round to an Integer type: round in floating point, then truncate to T.
round{T<:Integer}(::Type{T}, x::AbstractFloat) = trunc(T,round(x))
314+
# Float16 overrides the fallback by widening to Float32 first.
round{T<:Integer}(::Type{T}, x::Float16) = round(T, Float32(x))
212315

213316
# Same-type rounding. Float64 and Float32 call the matching *_llvm intrinsic
# on the unboxed value; Float16 widens to Float32, rounds there, and converts
# the result back to Float16.
trunc(x::Float64) = box(Float64,trunc_llvm(unbox(Float64,x)))
214317
trunc(x::Float32) = box(Float32,trunc_llvm(unbox(Float32,x)))
318+
trunc(x::Float16) = Float16(trunc(Float32(x)))
215319

216320
floor(x::Float64) = box(Float64,floor_llvm(unbox(Float64,x)))
217321
floor(x::Float32) = box(Float32,floor_llvm(unbox(Float32,x)))
322+
floor(x::Float16) = Float16(floor(Float32(x)))
218323

219324
ceil(x::Float64) = box(Float64,ceil_llvm(unbox(Float64,x)))
220325
ceil(x::Float32) = box(Float32,ceil_llvm(unbox(Float32,x)))
326+
ceil(x::Float16) = Float16( ceil(Float32(x)))
221327

222328
# `round` is implemented with the rint_llvm intrinsic.
round(x::Float64) = box(Float64,rint_llvm(unbox(Float64,x)))
223329
round(x::Float32) = box(Float32,rint_llvm(unbox(Float32,x)))
330+
round(x::Float16) = Float16(round(Float32(x)))
224331

225332
## floating point promotions ##
226333
promote_rule(::Type{Float32}, ::Type{Float16}) = Float32
@@ -233,9 +340,13 @@ widen(::Type{Float32}) = Float64
233340
_default_type(T::Union{Type{Real},Type{AbstractFloat}}) = Float64
234341

235342
## floating point arithmetic ##
236-
-(x::Float32) = box(Float32,neg_float(unbox(Float32,x)))
237343
-(x::Float64) = box(Float64,neg_float(unbox(Float64,x)))
344+
-(x::Float32) = box(Float32,neg_float(unbox(Float32,x)))
345+
# Float16 negation flips the sign bit (0x8000) of the raw bits; `$` is the
# bitwise xor operator in this version of the language.
-(x::Float16) = reinterpret(Float16, reinterpret(UInt16,x) $ 0x8000)
238346

347+
# Binary Float16 arithmetic: widen both operands to Float32, apply the
# operator there, and convert the result back to Float16.
for f in (:+, :-, :*, :/, :\, :^)
    @eval begin
        ($f)(a::Float16, b::Float16) = Float16(($f)(Float32(a), Float32(b)))
    end
end
239350
+(x::Float32, y::Float32) = box(Float32,add_float(unbox(Float32,x),unbox(Float32,y)))
240351
+(x::Float64, y::Float64) = box(Float64,add_float(unbox(Float64,x),unbox(Float64,y)))
241352
-(x::Float32, y::Float32) = box(Float32,sub_float(unbox(Float32,x),unbox(Float32,y)))
@@ -247,10 +358,20 @@ _default_type(T::Union{Type{Real},Type{AbstractFloat}}) = Float64
247358

248359
muladd(x::Float32, y::Float32, z::Float32) = box(Float32,muladd_float(unbox(Float32,x),unbox(Float32,y),unbox(Float32,z)))
249360
muladd(x::Float64, y::Float64, z::Float64) = box(Float64,muladd_float(unbox(Float64,x),unbox(Float64,y),unbox(Float64,z)))
361+
# Float16 muladd: widen all three operands to Float32, call the Float32
# muladd, and convert the result back to Float16.
muladd(a::Float16, b::Float16, c::Float16) =
    Float16(muladd(Float32(a), Float32(b), Float32(c)))
250364

251365
# TODO: faster floating point div?
252366
# TODO: faster floating point fld?
253367
# TODO: faster floating point mod?
368+
369+
# Float16 division-family functions: evaluate in Float32 and convert the
# result back to Float16.
for op in (:div, :fld, :cld, :rem, :mod)
    @eval ($op)(a::Float16, b::Float16) = Float16(($op)(Float32(a), Float32(b)))
end
374+
254375
rem(x::Float32, y::Float32) = box(Float32,rem_float(unbox(Float32,x),unbox(Float32,y)))
255376
rem(x::Float64, y::Float64) = box(Float64,rem_float(unbox(Float64,x),unbox(Float64,y)))
256377

@@ -268,6 +389,17 @@ function mod{T<:AbstractFloat}(x::T, y::T)
268389
end
269390

270391
## floating point comparisons ##
392+
# Equality of two Float16 values, decided directly on their bit patterns.
function ==(x::Float16, y::Float16)
    xbits = reinterpret(UInt16, x)
    ybits = reinterpret(UInt16, y)
    # OR of both patterns with the sign bit masked off
    magnitude = (xbits | ybits) & 0x7fff
    # Anything above the Inf pattern means the operands cannot be equal: this
    # is the case whenever either one is NaN (it also fires for some unequal
    # non-NaN pairs, for which `false` is the right answer as well).
    magnitude > 0x7c00 && return false
    # Both operands are zeros, possibly of different sign.
    magnitude == 0x0000 && return true
    return xbits == ybits
end
271403
==(x::Float32, y::Float32) = eq_float(unbox(Float32,x),unbox(Float32,y))
272404
==(x::Float64, y::Float64) = eq_float(unbox(Float64,x),unbox(Float64,y))
273405
!=(x::Float32, y::Float32) = ne_float(unbox(Float32,x),unbox(Float32,y))
@@ -281,6 +413,9 @@ isequal(x::Float32, y::Float32) = fpiseq(unbox(Float32,x),unbox(Float32,y))
281413
isequal(x::Float64, y::Float64) = fpiseq(unbox(Float64,x),unbox(Float64,y))
282414
isless( x::Float32, y::Float32) = fpislt(unbox(Float32,x),unbox(Float32,y))
283415
isless( x::Float64, y::Float64) = fpislt(unbox(Float64,x),unbox(Float64,y))
416+
# Float16 ordering comparisons are answered by comparing the operands after
# widening both to Float32.
for cmpop in (:<, :<=, :isless)
    @eval begin
        ($cmpop)(a::Float16, b::Float16) = ($cmpop)(Float32(a), Float32(b))
    end
end
284419

285420
function cmp(x::AbstractFloat, y::AbstractFloat)
286421
(isnan(x) || isnan(y)) && throw(DomainError())
@@ -349,18 +484,22 @@ end
349484
<=(x::Float32, y::Union{Int32,UInt32}) = Float64(x)<=Float64(y)
350485
<=(x::Union{Int32,UInt32}, y::Float32) = Float64(x)<=Float64(y)
351486

352-
abs(x::Float64) = box(Float64,abs_float(unbox(Float64,x)))
487+
488+
# Float16 absolute value: clear the sign bit (keep the low 15 bits) of the raw
# bit pattern.
abs(x::Float16) = reinterpret(Float16, reinterpret(UInt16,x) & 0x7fff)
353489
abs(x::Float32) = box(Float32,abs_float(unbox(Float32,x)))
490+
abs(x::Float64) = box(Float64,abs_float(unbox(Float64,x)))
354491

355492
"""
356493
isnan(f) -> Bool
357494
358495
Test whether a floating point number is not a number (NaN).
359496
"""
360497
isnan(x::AbstractFloat) = x != x
498+
# Float16 NaN test on the raw bits: with the sign bit masked off, a NaN is any
# pattern strictly above 0x7c00 (all-ones exponent with a non-zero significand).
isnan(x::Float16) = reinterpret(UInt16,x)&0x7fff > 0x7c00
361499
isnan(x::Real) = false
362500

363501
isfinite(x::AbstractFloat) = x - x == 0
502+
# Float16 finiteness test on the raw bits: finite unless all five exponent
# bits (mask 0x7c00) are set, which is the case for Inf and NaN.
isfinite(x::Float16) = reinterpret(UInt16,x)&0x7c00 != 0x7c00
364503
isfinite(x::Real) = decompose(x)[3] != 0
365504
isfinite(x::Integer) = true
366505

@@ -526,6 +665,12 @@ exponent_one(::Type{Float32}) = 0x3f80_0000
526665
exponent_half(::Type{Float32}) = 0x3f00_0000
527666
significand_mask(::Type{Float32}) = 0x007f_ffff
528667

668+
# Bit-layout constants for Float16 (1 sign bit, 5 exponent bits, 10
# significand bits), parallel to the Float32 definitions above.
# bit 15: sign
sign_mask(::Type{Float16}) = 0x8000
669+
# bits 10-14: exponent field
exponent_mask(::Type{Float16}) = 0x7c00
670+
# bit pattern of Float16 1.0
exponent_one(::Type{Float16}) = 0x3c00
671+
# bit pattern of Float16 0.5
exponent_half(::Type{Float16}) = 0x3800
672+
# bits 0-9: significand field
significand_mask(::Type{Float16}) = 0x03ff
673+
529674
@pure significand_bits{T<:AbstractFloat}(::Type{T}) = trailing_ones(significand_mask(T))
530675
@pure exponent_bits{T<:AbstractFloat}(::Type{T}) = sizeof(T)*8 - significand_bits(T) - 1
531676
@pure exponent_bias{T<:AbstractFloat}(::Type{T}) = Int(exponent_one(T) >> significand_bits(T))

0 commit comments

Comments
 (0)