@@ -106,36 +106,62 @@ floatmax(::Type{T}) where {T <: FixedPoint} = typemax(T)
106
106
107
107
108
108
"""
109
- floattype(::Type{T})
109
+ floattype(::Type{T})::Type{<:AbstractFloat}
110
110
111
- Return the minimum float type that represents `T` without overflow to `Inf` .
111
+ Return a minimal type suitable for performing computations with instances of type `T` without integer overflow.
112
112
113
- # Example
113
+ The fallback definition of `floattype(T)` applies only to `T<:AbstractFloat`.
114
+ However, it is permissible to extend `floattype` to return types that are not subtypes of
115
+ `AbstractFloat`; the key characteristic is that the return type should support computation without integer overflow.
116
+
117
+ In general, the returned type should have the minimum bitwidth needed to encode the full precision of the input type.
118
+ However, priority should be placed on computational efficiency; consequently, types like `Float16` should be avoided
119
+ except in scenarios where they are guaranteed to have hardware support.
120
+
121
+ # Examples
114
122
115
123
A classic usage is to avoid overflow behavior by promoting `FixedPoint` to `AbstractFloat`
116
124
117
- ```julia
125
+ ```jldoctest
118
126
julia> x = N0f8(1.0)
119
127
1.0N0f8
120
128
121
129
julia> x + x # overflow
122
130
0.996N0f8
123
131
124
- julia> float_x = floattype(eltype(x)) (x)
125
- 1.0f0
132
+ julia> T = floattype(x)
133
+ Float32
126
134
127
- julia> float_x + float_x
135
+ julia> T(x) + T(x)
128
136
2.0f0
129
137
```
138
+
139
+ The following represents a valid extension of `floattype` to non-AbstractFloats:
140
+
141
+ ```julia
142
+ julia> using FixedPointNumbers, ColorTypes
143
+
144
+ julia> floattype(RGB{N0f8})
145
+ RGB{Float32}
146
+ ```
147
+
148
+ `RGB` itself is not a subtype of `AbstractFloat`, but unlike `RGB{N0f8}`, operations with `RGB{Float32}` are not subject to integer overflow.
130
149
"""
131
- floattype (:: Type{T} ) where {T <: Real } = T # fallback
150
+ floattype (:: Type{T} ) where {T <: AbstractFloat } = T # fallback (we want a MethodError if no method producing AbstractFloat is defined)
132
151
floattype (:: Type{T} ) where {T <: Union{ShortInts, Bool} } = Float32
133
152
floattype (:: Type{T} ) where {T <: Integer } = Float64
134
153
floattype (:: Type{T} ) where {T <: LongInts } = BigFloat
154
+ floattype (:: Type{T} ) where {I <: Integer , T <: Rational{I} } = typeof (zero (I)/ oneunit (I))
155
+ floattype (:: Type{<:AbstractIrrational} ) = Float64
135
156
floattype (:: Type{X} ) where {T <: ShortInts , X <: FixedPoint{T} } = Float32
136
157
floattype (:: Type{X} ) where {T <: Integer , X <: FixedPoint{T} } = Float64
137
158
floattype (:: Type{X} ) where {T <: LongInts , X <: FixedPoint{T} } = BigFloat
138
159
160
+ # Non-Real types
161
+ floattype (:: Type{Complex{T}} ) where T = Complex{floattype (T)}
162
+ floattype (:: Type{Base.TwicePrecision{Float64}} ) = Float64 # wider would be nice, but hardware support is paramount
163
+ floattype (:: Type{Base.TwicePrecision{T}} ) where T<: Union{Float16,Float32} = widen (T)
164
+
139
165
float (x:: FixedPoint ) = convert (floattype (x), x)
140
166
141
167
function minmax (x:: X , y:: X ) where {X <: FixedPoint }
0 commit comments