Skip to content

Commit 49627e2

Browse files
authored
Fix performance regression with inlining failure on Julia v1.3.0 (Fixes #144) (#145)
Julia 1.3.0 generates more redundant intermediate code, which is eventually optimized away. This trivial change reduces the redundant intermediate code so as to promote inlining.
1 parent 4b9142b commit 49627e2

File tree

1 file changed

+31
-31
lines changed

1 file changed

+31
-31
lines changed

src/normed.jl

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -159,10 +159,10 @@ function Base.Float32(x::Normed{UInt32,f}) where f
159159
f == 1 && return Float32(x.i)
160160
i32 = unsafe_trunc(Int32, x.i)
161161
if f == 32
162-
rh, rl = Float32(i32>>>16), Float32((i32&0xFFFF)<<8 | (i32>>>24))
162+
rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0x8 | i32>>>0x18)
163163
return muladd(rh, @f32(0x1p-16), rl * @f32(0x1p-40))
164164
elseif f >= 25
165-
rh, rl = Float32(i32>>>16),Float32(((i32&0xFFFF)<<14) + (i32>>>(f-14)))
165+
rh, rl = Float32(i32>>>0x10), Float32((i32&0xFFFF)<<0xE + i32>>>UInt8(f-14))
166166
return muladd(rh, Float32(@exp2(16-f)), rl * Float32(@exp2(-14-f)))
167167
end
168168
# FIXME: avoid the branch in native x86_64 (non-SIMD) codes
@@ -179,49 +179,49 @@ end
179179
function Base.Float64(x::Normed{UInt32,f}) where f
180180
f64 = Float64(x.i)
181181
f == 1 && return f64
182-
f == 2 && return (f64 * 0x040001) * 0x15555000015555p-72
183-
f == 3 && return (f64 * 0x108421) * 0x11b6db76924929p-75
184-
f == 4 && return (f64 * 0x010101) * 0x11000011000011p-72
185-
f == 5 && return (f64 * 0x108421) * 0x04000002000001p-75
186-
f == 6 && return (f64 * 0x09dfb1) * 0x1a56b8e38e6d91p-78
187-
f == 7 && return (f64 * 0x000899) * 0x0f01480001e029p-70
188-
f == 8 && return (f64 * 0x0a5a5b) * 0x18d300000018d3p-80
189-
f == 9 && return (f64 * 0x001001) * 0x080381c8e3f201p-72
190-
f == 10 && return (f64 * 0x100001) * 0x04010000000401p-80
191-
f == 11 && return (f64 * 0x000009) * 0x0e3aaae3955639p-66
192-
f == 12 && return (f64 * 0x0a8055) * 0x186246e46e4cfdp-84
193-
f == 13 && return (f64 * 0x002001) * 0x10000004000001p-78
194-
f == 14 && return (f64 * 0x03400d) * 0x13b13b14ec4ec5p-84
195-
f == 15 && return (f64 * 0x000259) * 0x06d0c5a4f3a5e9p-75
196-
f == 16 && return (f64 * 0x011111) * 0x00f000ff00fff1p-80
197-
f == 18 && return (f64 * 0x0b06d1) * 0x17377445dd1231p-90
198-
f == 19 && return (f64 * 0x080001) * 0x00004000000001p-76
199-
f == 20 && return (f64 * 0x000101) * 0x0ff010ef10ff01p-80
200-
f == 21 && return (f64 * 0x004001) * 0x01fff8101fc001p-84
201-
f == 22 && return (f64 * 0x002945) * 0x18d0000000018dp-88
202-
f == 23 && return (f64 * 0x044819) * 0x07794a23729429p-92
203-
f == 27 && return (f64 * 0x000a21) * 0x0006518c7df9e1p-81
204-
f == 28 && return (f64 * 0x00000d) * 0x13b13b14ec4ec5p-84
205-
f == 30 && return (f64 * 0x001041) * 0x00fc003f03ffc1p-90
206-
f == 32 && return (f64 * 0x010101) * 0x00ff0000ffff01p-96
182+
f == 2 && return (f64 * 0x040001p0) * 0x15555000015555p-72
183+
f == 3 && return (f64 * 0x108421p0) * 0x11b6db76924929p-75
184+
f == 4 && return (f64 * 0x010101p0) * 0x11000011000011p-72
185+
f == 5 && return (f64 * 0x108421p0) * 0x04000002000001p-75
186+
f == 6 && return (f64 * 0x09dfb1p0) * 0x1a56b8e38e6d91p-78
187+
f == 7 && return (f64 * 0x000899p0) * 0x0f01480001e029p-70
188+
f == 8 && return (f64 * 0x0a5a5bp0) * 0x18d300000018d3p-80
189+
f == 9 && return (f64 * 0x001001p0) * 0x080381c8e3f201p-72
190+
f == 10 && return (f64 * 0x100001p0) * 0x04010000000401p-80
191+
f == 11 && return (f64 * 0x000009p0) * 0x0e3aaae3955639p-66
192+
f == 12 && return (f64 * 0x0a8055p0) * 0x186246e46e4cfdp-84
193+
f == 13 && return (f64 * 0x002001p0) * 0x10000004000001p-78
194+
f == 14 && return (f64 * 0x03400dp0) * 0x13b13b14ec4ec5p-84
195+
f == 15 && return (f64 * 0x000259p0) * 0x06d0c5a4f3a5e9p-75
196+
f == 16 && return (f64 * 0x011111p0) * 0x00f000ff00fff1p-80
197+
f == 18 && return (f64 * 0x0b06d1p0) * 0x17377445dd1231p-90
198+
f == 19 && return (f64 * 0x080001p0) * 0x00004000000001p-76
199+
f == 20 && return (f64 * 0x000101p0) * 0x0ff010ef10ff01p-80
200+
f == 21 && return (f64 * 0x004001p0) * 0x01fff8101fc001p-84
201+
f == 22 && return (f64 * 0x002945p0) * 0x18d0000000018dp-88
202+
f == 23 && return (f64 * 0x044819p0) * 0x07794a23729429p-92
203+
f == 27 && return (f64 * 0x000a21p0) * 0x0006518c7df9e1p-81
204+
f == 28 && return (f64 * 0x00000dp0) * 0x13b13b14ec4ec5p-84
205+
f == 30 && return (f64 * 0x001041p0) * 0x00fc003f03ffc1p-90
206+
f == 32 && return (f64 * 0x010101p0) * 0x00ff0000ffff01p-96
207207
f64 / rawone(x)
208208
end
209209
function Base.Float64(x::Normed{UInt64,f}) where f
210210
f == 1 && return Float64(x.i)
211211
if f >= 53
212-
rh = Float64(unsafe_trunc(Int64, x.i >> 16)) * @exp2(16-f) # upper 48 bits
212+
rh = Float64(unsafe_trunc(Int64, x.i>>0x10)) * @exp2(16-f) # upper 48 bits
213213
rl = Float64(unsafe_trunc(Int32, x.i&0xFFFF)) * @exp2(-f) # lower 16 bits
214214
return rh + muladd(rh, @exp2(-f), rl)
215215
end
216216
x.i / rawone(x)
217217
end
218218
function Base.Float64(x::Normed{UInt128,f}) where f
219219
f == 1 && return Float64(x.i)
220-
ih, il = unsafe_trunc(Int64, x.i>>64), unsafe_trunc(Int64, x.i)
221-
rh = Float64(ih>>>16) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
220+
ih, il = unsafe_trunc(Int64, x.i>>0x40), unsafe_trunc(Int64, x.i)
221+
rh = Float64(ih>>>0x10) * @exp2(f <= 53 ? 80 : 80 - f) # upper 48 bits
222222
km = @exp2(f <= 53 ? 48 : 48 - f) # for middle 32 bits
223223
rm = Float64(unsafe_trunc(Int32, ih&0xFFFF)) * (0x1p16 * km) +
224-
Float64(unsafe_trunc(Int32, il>>>48)) * km
224+
Float64(unsafe_trunc(Int32, il>>>0x30)) * km
225225
rl = Float64(il&0xFFFFFFFFFFFF) * @exp2(f <= 53 ? 0 : -f) # lower 48 bits
226226
if f <= 53
227227
return (rh + (rm + rl)) / unsafe_trunc(Int64, rawone(x))

0 commit comments

Comments
 (0)