@@ -63,6 +63,8 @@ for t1 in (Float32,Float64)
63
63
end
64
64
end
65
65
end
66
# Float16 -> integer conversion: widen the value to Float32 and defer to the
# Float32 -> `T` conversion.
function convert{T<:Integer}(::Type{T}, x::Float16)
    widened = Float32(x)
    return convert(T, widened)
end
67
+
66
68
67
69
promote_rule (:: Type{Float64} , :: Type{UInt128} ) = Float64
68
70
promote_rule (:: Type{Float64} , :: Type{Int128} ) = Float64
@@ -129,13 +131,110 @@ function convert(::Type{Float32}, x::Int128)
129
131
reinterpret (Float32, s | d + y)
130
132
end
131
133
134
# Convert a Float32 to Float16, rounding to nearest with ties to even.
#
# Finite values and infinities are handled by a lookup in `basetable` /
# `shifttable` (indexed by the sign and exponent bits of `val`) followed by a
# manual rounding step.  Two kinds of input are resolved before the lookup
# because the table path cannot represent them correctly:
#
#   * NaN: adding the truncated payload to the table entry yields an infinity
#     when only low payload bits are set, and the rounding increment can carry
#     out of the payload.  NaNs are therefore rebuilt directly from the sign,
#     a forced quiet bit and the top ten payload bits.
#   * Magnitudes strictly between 2^-25 and 2^-24: the table flushes this
#     whole binade to zero, but everything above the midpoint 2^-25 is closer
#     to the smallest subnormal.  (Exactly 2^-25 is a tie and still goes to
#     zero through the table path.)
function convert(::Type{Float16}, val::Float32)
    f = reinterpret(UInt32, val)
    sign = ((f >> 16) & 0x8000) % UInt16   # Float32 sign moved to bit 15
    mant = f & 0x007fffff                  # 23 explicit significand bits

    if (f & 0x7fffffff) > 0x7f800000
        # NaN: keep the sign and the leading payload bits, always quiet.
        return reinterpret(Float16, sign | 0x7e00 | ((mant >> 13) % UInt16))
    elseif (f & 0x7f800000) == 0x33000000 && mant != 0
        # Biased exponent 102 (2^-25) with a nonzero fraction: round up to
        # the smallest Float16 subnormal instead of flushing to zero.
        return reinterpret(Float16, sign | 0x0001)
    end

    i = (f >> 23) & 0x1ff + 1              # 1-based index: sign + exponent bits
    sh = shifttable[i]
    h::UInt16 = basetable[i] + (mant >> sh)
    # First discarded bit decides whether rounding is needed at all.
    nextbit = (mant >> (sh-1)) & 1
    if nextbit != 0
        # Round halfway to even or check lower bits
        if h&1 == 1 || (mant & ((1 << (sh-1)) - 1)) != 0
            h += 1
        end
    end
    return reinterpret(Float16, h)
end
152
+
153
# Widen a Float16 to Float32 by rebuilding the bit pattern field by field.
# Zeros, subnormals, infinities and NaNs each get their own branch; every NaN
# maps to the quiet pattern 0x7fc00000 with the original sign bit.
function convert(::Type{Float32}, val::Float16)
    bits = UInt32(reinterpret(UInt16, val))
    signbit = (bits & 0x8000) << 16        # sign moved up to bit 31
    expo = (bits >> 10) & 0x1f             # 5-bit biased exponent
    frac = bits & 0x03ff                   # 10-bit fraction

    if expo == 0x1f
        # All-ones exponent: empty fraction is Inf, anything else is NaN.
        if frac == 0
            return reinterpret(Float32, signbit | 0x7f800000)
        end
        return reinterpret(Float32, signbit | 0x7fc00000)
    elseif expo != 0
        # Normal number: rebias the exponent (127 - 15 = 112 = 0x70) and
        # left-align the fraction in the 23-bit field.
        return reinterpret(Float32, signbit | ((expo + 0x00000070) << 23) | (frac << 13))
    elseif frac == 0
        # Signed zero.
        return reinterpret(Float32, signbit)
    end

    # Subnormal: shift the fraction left until its leading one reaches the
    # implicit-bit position (bit 10), counting the shifts.
    shifts = 0
    while (frac & 0x0400) == 0
        frac <<= 1
        shifts += 1
    end
    expfield = UInt32(-14 - shifts + 127) << 23
    return reinterpret(Float32, signbit | expfield | ((frac & 0x03ff) << 13))
end
194
+
195
+ # Float32 -> Float16 algorithm from:
196
+ # "Fast Half Float Conversion" by Jeroen van der Zijp
197
+ # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
198
+
199
# Lookup tables for the Float32 -> Float16 conversion.  Both have 512 entries,
# addressed (1-based) by the 9 sign+exponent bits of the Float32: entries
# 1..256 are for a clear sign bit, 257..512 for a set sign bit.
#   basetable[k]  : Float16 bits contributed by the sign and exponent
#   shifttable[k] : right shift applied to the Float32 significand
const basetable = Array{UInt16}(512)
const shifttable = Array{UInt8}(512)

for biased = 0:255
    unbiased = biased - 127
    pos = biased + 1      # slot for the non-negative value
    neg = biased + 257    # slot for the negative value (sign bit 0x100 set)
    if unbiased < -24
        # Very small numbers map to zero
        basetable[pos] = 0x0000
        basetable[neg] = 0x8000
        nshift = 24
    elseif unbiased < -14
        # Small numbers map to denorms
        denorm = 0x0400 >> (-unbiased - 14)
        basetable[pos] = denorm
        basetable[neg] = denorm | 0x8000
        nshift = -unbiased - 1
    elseif unbiased <= 15
        # Normal numbers just lose precision
        normal = (unbiased + 15) << 10
        basetable[pos] = normal
        basetable[neg] = normal | 0x8000
        nshift = 13
    elseif unbiased < 128
        # Large numbers map to Infinity
        basetable[pos] = 0x7C00
        basetable[neg] = 0xFC00
        nshift = 24
    else
        # Infinity and NaN's stay Infinity and NaN's
        basetable[pos] = 0x7C00
        basetable[neg] = 0xFC00
        nshift = 13
    end
    shifttable[pos] = nshift
    shifttable[neg] = nshift
end
132
231
# convert(::Type{Float16}, x::Float32) = box(Float16,fptrunc(Float16,x))
133
- convert (:: Type{Float16} , x:: Float64 ) = convert (Float16, convert (Float32,x))
134
232
convert (:: Type{Float32} , x:: Float64 ) = box (Float32,fptrunc (Float32,unbox (Float64,x)))
233
# Float64 -> Float16 is done in two steps: narrow to Float32, then to Float16.
# NOTE(review): two successive roundings can differ from a single direct
# rounding for some inputs - confirm this is acceptable.
function convert(::Type{Float16}, x::Float64)
    narrowed = convert(Float32, x)
    return convert(Float16, narrowed)
end
135
234
136
235
# convert(::Type{Float32}, x::Float16) = box(Float32,fpext(Float32,x))
137
- convert (:: Type{Float64} , x:: Float16 ) = convert (Float64, convert (Float32,x))
138
236
convert (:: Type{Float64} , x:: Float32 ) = box (Float64,fpext (Float64,unbox (Float32,x)))
237
# Float16 -> Float64 is done in two steps: widen to Float32, then to Float64.
function convert(::Type{Float64}, x::Float16)
    widened = convert(Float32, x)
    return convert(Float64, widened)
end
139
238
140
239
convert (:: Type{AbstractFloat} , x:: Bool ) = convert (Float64, x)
141
240
convert (:: Type{AbstractFloat} , x:: Int8 ) = convert (Float64, x)
@@ -204,23 +303,31 @@ trunc(::Type{Unsigned}, x::Float32) = trunc(UInt,x)
204
303
trunc (:: Type{Unsigned} , x:: Float64 ) = trunc (UInt,x)
205
304
trunc (:: Type{Integer} , x:: Float32 ) = trunc (Int,x)
206
305
trunc (:: Type{Integer} , x:: Float64 ) = trunc (Int,x)
306
# Integer-typed `trunc` for Float16: widen to Float32 and defer to that method.
function trunc{T<:Integer}(::Type{T}, x::Float16)
    return trunc(T, Float32(x))
end
207
307
208
308
# fallbacks
209
309
floor {T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,floor (x))
310
# Integer-typed `floor` for Float16: widen to Float32 and defer to that method.
function floor{T<:Integer}(::Type{T}, x::Float16)
    return floor(T, Float32(x))
end
210
311
ceil { T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,ceil (x))
312
# Integer-typed `ceil` for Float16: widen to Float32 and defer to that method.
function ceil{T<:Integer}(::Type{T}, x::Float16)
    return ceil(T, Float32(x))
end
211
313
round {T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,round (x))
314
# Integer-typed `round` for Float16: widen to Float32 and defer to that method.
function round{T<:Integer}(::Type{T}, x::Float16)
    return round(T, Float32(x))
end
212
315
213
316
trunc (x:: Float64 ) = box (Float64,trunc_llvm (unbox (Float64,x)))
214
317
trunc (x:: Float32 ) = box (Float32,trunc_llvm (unbox (Float32,x)))
318
# `trunc` on Float16: computed in Float32, result converted back to Float16.
function trunc(x::Float16)
    wide = trunc(Float32(x))
    return Float16(wide)
end
215
319
216
320
floor (x:: Float64 ) = box (Float64,floor_llvm (unbox (Float64,x)))
217
321
floor (x:: Float32 ) = box (Float32,floor_llvm (unbox (Float32,x)))
322
# `floor` on Float16: computed in Float32, result converted back to Float16.
function floor(x::Float16)
    wide = floor(Float32(x))
    return Float16(wide)
end
218
323
219
324
ceil (x:: Float64 ) = box (Float64,ceil_llvm (unbox (Float64,x)))
220
325
ceil (x:: Float32 ) = box (Float32,ceil_llvm (unbox (Float32,x)))
326
# `ceil` on Float16: computed in Float32, result converted back to Float16.
function ceil(x::Float16)
    wide = ceil(Float32(x))
    return Float16(wide)
end
221
327
222
328
round (x:: Float64 ) = box (Float64,rint_llvm (unbox (Float64,x)))
223
329
round (x:: Float32 ) = box (Float32,rint_llvm (unbox (Float32,x)))
330
# `round` on Float16: computed in Float32, result converted back to Float16.
function round(x::Float16)
    wide = round(Float32(x))
    return Float16(wide)
end
224
331
225
332
# # floating point promotions ##
226
333
promote_rule (:: Type{Float32} , :: Type{Float16} ) = Float32
@@ -233,9 +340,13 @@ widen(::Type{Float32}) = Float64
233
340
_default_type (T:: Union{Type{Real},Type{AbstractFloat}} ) = Float64
234
341
235
342
# # floating point arithmetic ##
236
- - (x:: Float32 ) = box (Float32,neg_float (unbox (Float32,x)))
237
343
- (x:: Float64 ) = box (Float64,neg_float (unbox (Float64,x)))
344
+ - (x:: Float32 ) = box (Float32,neg_float (unbox (Float32,x)))
345
# Negate a Float16 by flipping bit 15 (the sign bit) of its representation.
function -(x::Float16)
    flipped = reinterpret(UInt16, x) $ 0x8000
    return reinterpret(Float16, flipped)
end
238
346
347
# Binary arithmetic on Float16: both operands are widened to Float32, the
# operation is carried out there, and the result is converted back to Float16.
+(a::Float16, b::Float16) = Float16(Float32(a) + Float32(b))
-(a::Float16, b::Float16) = Float16(Float32(a) - Float32(b))
*(a::Float16, b::Float16) = Float16(Float32(a) * Float32(b))
/(a::Float16, b::Float16) = Float16(Float32(a) / Float32(b))
\(a::Float16, b::Float16) = Float16(Float32(a) \ Float32(b))
^(a::Float16, b::Float16) = Float16(Float32(a) ^ Float32(b))
239
350
+ (x:: Float32 , y:: Float32 ) = box (Float32,add_float (unbox (Float32,x),unbox (Float32,y)))
240
351
+ (x:: Float64 , y:: Float64 ) = box (Float64,add_float (unbox (Float64,x),unbox (Float64,y)))
241
352
- (x:: Float32 , y:: Float32 ) = box (Float32,sub_float (unbox (Float32,x),unbox (Float32,y)))
@@ -247,10 +358,20 @@ _default_type(T::Union{Type{Real},Type{AbstractFloat}}) = Float64
247
358
248
359
muladd (x:: Float32 , y:: Float32 , z:: Float32 ) = box (Float32,muladd_float (unbox (Float32,x),unbox (Float32,y),unbox (Float32,z)))
249
360
muladd (x:: Float64 , y:: Float64 , z:: Float64 ) = box (Float64,muladd_float (unbox (Float64,x),unbox (Float64,y),unbox (Float64,z)))
361
# `muladd` on Float16: evaluated on the Float32-widened operands, with the
# result converted back to Float16.
muladd(a::Float16, b::Float16, c::Float16) =
    Float16(muladd(Float32(a), Float32(b), Float32(c)))
250
364
251
365
# TODO : faster floating point div?
252
366
# TODO : faster floating point fld?
253
367
# TODO : faster floating point mod?
368
+
369
# Division-like operations on Float16: each is evaluated on the
# Float32-widened operands and the result converted back to Float16.
for divop in (:div, :fld, :cld, :rem, :mod)
    @eval $divop(a::Float16, b::Float16) = Float16($divop(Float32(a), Float32(b)))
end
374
+
254
375
rem (x:: Float32 , y:: Float32 ) = box (Float32,rem_float (unbox (Float32,x),unbox (Float32,y)))
255
376
rem (x:: Float64 , y:: Float64 ) = box (Float64,rem_float (unbox (Float64,x),unbox (Float64,y)))
256
377
@@ -268,6 +389,17 @@ function mod{T<:AbstractFloat}(x::T, y::T)
268
389
end
269
390
270
391
# # floating point comparisons ##
392
# Equality on Float16, decided from the raw bit patterns.
function ==(x::Float16, y::Float16)
    xbits = reinterpret(UInt16, x)
    ybits = reinterpret(UInt16, y)
    magnitude = (xbits | ybits) & 0x7fff
    # A NaN operand pushes the OR-ed magnitude above the Inf pattern 0x7c00.
    # Any other pair that trips this test has differing bits and is unequal
    # anyway, so answering `false` is always right here.
    magnitude > 0x7c00 && return false
    # Both magnitudes are zero: +0.0 and -0.0 compare equal.
    magnitude == 0x0000 && return true
    return xbits == ybits
end
271
403
== (x:: Float32 , y:: Float32 ) = eq_float (unbox (Float32,x),unbox (Float32,y))
272
404
== (x:: Float64 , y:: Float64 ) = eq_float (unbox (Float64,x),unbox (Float64,y))
273
405
!= (x:: Float32 , y:: Float32 ) = ne_float (unbox (Float32,x),unbox (Float32,y))
@@ -281,6 +413,9 @@ isequal(x::Float32, y::Float32) = fpiseq(unbox(Float32,x),unbox(Float32,y))
281
413
isequal (x:: Float64 , y:: Float64 ) = fpiseq (unbox (Float64,x),unbox (Float64,y))
282
414
isless ( x:: Float32 , y:: Float32 ) = fpislt (unbox (Float32,x),unbox (Float32,y))
283
415
isless ( x:: Float64 , y:: Float64 ) = fpislt (unbox (Float64,x),unbox (Float64,y))
416
# Ordering comparisons on Float16 are answered by comparing the
# Float32-widened operands.
<(a::Float16, b::Float16) = Float32(a) < Float32(b)
<=(a::Float16, b::Float16) = Float32(a) <= Float32(b)
isless(a::Float16, b::Float16) = isless(Float32(a), Float32(b))
284
419
285
420
function cmp (x:: AbstractFloat , y:: AbstractFloat )
286
421
(isnan (x) || isnan (y)) && throw (DomainError ())
@@ -349,18 +484,22 @@ end
349
484
<= (x:: Float32 , y:: Union{Int32,UInt32} ) = Float64 (x)<= Float64 (y)
350
485
<= (x:: Union{Int32,UInt32} , y:: Float32 ) = Float64 (x)<= Float64 (y)
351
486
352
- abs (x:: Float64 ) = box (Float64,abs_float (unbox (Float64,x)))
487
+
488
# Absolute value of a Float16: clear bit 15 (the sign bit) of its representation.
function abs(x::Float16)
    magnitude = reinterpret(UInt16, x) & 0x7fff
    return reinterpret(Float16, magnitude)
end
353
489
abs (x:: Float32 ) = box (Float32,abs_float (unbox (Float32,x)))
490
+ abs (x:: Float64 ) = box (Float64,abs_float (unbox (Float64,x)))
354
491
355
492
"""
356
493
isnan(f) -> Bool
357
494
358
495
Test whether a floating point number is not a number (NaN).
359
496
"""
360
497
isnan (x:: AbstractFloat ) = x != x
498
# A Float16 is NaN when its magnitude bits lie above the Inf pattern 0x7c00,
# i.e. the exponent field is all ones and the fraction is nonzero.
function isnan(x::Float16)
    magnitude = reinterpret(UInt16, x) & 0x7fff
    return magnitude > 0x7c00
end
361
499
isnan (x:: Real ) = false
362
500
363
501
isfinite (x:: AbstractFloat ) = x - x == 0
502
# A Float16 is finite unless every bit of its exponent field (0x7c00) is set.
function isfinite(x::Float16)
    expfield = reinterpret(UInt16, x) & 0x7c00
    return expfield != 0x7c00
end
364
503
isfinite (x:: Real ) = decompose (x)[3 ] != 0
365
504
isfinite (x:: Integer ) = true
366
505
@@ -526,6 +665,12 @@ exponent_one(::Type{Float32}) = 0x3f80_0000
526
665
exponent_half (:: Type{Float32} ) = 0x3f00_0000
527
666
significand_mask (:: Type{Float32} ) = 0x007f_ffff
528
667
668
# Float16 bit-field constants, written as 16-digit binary literals grouped as
# sign | exponent (5 bits) | significand (10 bits).
sign_mask(::Type{Float16})        = 0b1_00000_0000000000  # 0x8000
exponent_mask(::Type{Float16})    = 0b0_11111_0000000000  # 0x7c00
exponent_one(::Type{Float16})     = 0b0_01111_0000000000  # 0x3c00, bits of 1.0
exponent_half(::Type{Float16})    = 0b0_01110_0000000000  # 0x3800, bits of 0.5
significand_mask(::Type{Float16}) = 0b0_00000_1111111111  # 0x03ff
673
+
529
674
@pure significand_bits {T<:AbstractFloat} (:: Type{T} ) = trailing_ones (significand_mask (T))
530
675
@pure exponent_bits {T<:AbstractFloat} (:: Type{T} ) = sizeof (T)* 8 - significand_bits (T) - 1
531
676
@pure exponent_bias {T<:AbstractFloat} (:: Type{T} ) = Int (exponent_one (T) >> significand_bits (T))
0 commit comments