Skip to content

Commit bc07c15

Browse files
committed
Fix JuliaLang#10959 UTF-32 conversion errors
Added new `convert` methods that use the `checkstring` function to validate input. Added tests for many sorts of valid/invalid data. Depends on PR JuliaLang#11551 and JuliaLang#11575. Updated to use `unsafe_checkstring`, fix comments. Remove conversions from `Vector{UInt32}`. Move some code from utf32.jl to utf16.jl and utf8.jl, hopefully more logical.
1 parent 4fc9bb6 commit bc07c15

File tree

4 files changed

+288
-48
lines changed

4 files changed

+288
-48
lines changed

base/utf16.jl

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -195,44 +195,52 @@ function convert(::Type{UTF8String}, str::UTF16String)
195195
end
196196

197197
"
198-
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
198+
Converts a vector of `Char` to a `UTF16String`
199+
200+
### Returns:
201+
* `::UTF16String`
202+
203+
### Throws:
204+
* `UnicodeError`
205+
"
206+
function convert(::Type{UTF16String}, chrs::Vector{Char})
207+
len = sizeof(chrs)
208+
# handle zero length string quickly
209+
len == 0 && return empty_utf16
210+
dat = reinterpret(UInt32, chrs)
211+
# get number of words to allocate
212+
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
213+
len += num4byte + 1
214+
# optimized path, no surrogates
215+
num4byte == 0 && @inbounds return fast_utf_copy(UTF16String, UInt16, len, dat)
216+
return encode_to_utf16(dat, len)
217+
end
218+
219+
"
220+
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`
199221
200222
### Input Arguments:
201-
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
202-
* `len` length of output in bytes
223+
* `dat::Vector{UInt32}` UTF-32 encoded data
224+
* `len` length of output in 16-bit words
203225
204226
### Returns:
205-
* `UTF8String`
227+
* `::UTF16String`
206228
"
207-
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
208-
buf = Vector{UInt8}(len)
229+
function encode_to_utf16(dat, len)
230+
buf = Vector{UInt16}(len)
231+
@inbounds buf[len] = 0 # NULL termination
209232
out = 0
210233
pos = 0
211234
@inbounds while out < len
212-
ch::UInt32 = dat[pos += 1]
213-
# Handle ASCII characters
214-
if ch <= 0x7f
215-
buf[out += 1] = ch
216-
# Handle 0x80-0x7ff
217-
elseif ch < 0x800
218-
buf[out += 1] = 0xc0 | (ch >>> 6)
219-
buf[out += 1] = 0x80 | (ch & 0x3f)
220-
# Handle 0x10000-0x10ffff (if input is UInt32)
221-
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
222-
output_utf8_4byte!(buf, out, ch)
223-
out += 4
224-
# Handle surrogate pairs
225-
elseif is_surrogate_codeunit(ch)
226-
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
227-
out += 4
228-
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
229-
else
230-
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
231-
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
232-
buf[out += 1] = 0x80 | (ch & 0x3f)
235+
ch = UInt32(dat[pos += 1])
236+
if ch > 0xffff
237+
# Output surrogate pair for 0x10000-0x10ffff
238+
buf[out += 1] = 0xd7c0 + (ch >>> 10)
239+
ch = 0xdc00 + (ch & 0x3ff)
233240
end
241+
buf[out += 1] = ch
234242
end
235-
UTF8String(buf)
243+
UTF16String(buf)
236244
end
237245

238246
function convert(::Type{UTF16String}, str::ASCIIString)

base/utf32.jl

Lines changed: 159 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,169 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
55
endof(s::UTF32String) = length(s.data) - 1
66
length(s::UTF32String) = length(s.data) - 1
77

8+
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
9+
10+
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
11+
12+
const empty_utf32 = UTF32String(UInt32[0])
13+
814
utf32(x) = convert(UTF32String, x)
915
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
1016
convert(::Type{UTF32String}, s::UTF32String) = s
1117

12-
function convert(::Type{UTF32String}, s::AbstractString)
13-
a = Array(Char, length(s) + 1)
14-
i = 0
15-
for c in s
16-
a[i += 1] = c
18+
"
19+
Converts an `AbstractString` to a `UTF32String`
20+
21+
### Returns:
22+
* `UTF32String`
23+
24+
### Throws:
25+
* `UnicodeError`
26+
"
27+
function convert(::Type{UTF32String}, str::AbstractString)
28+
len, flags = unsafe_checkstring(str)
29+
buf = Vector{Char}(len+1)
30+
out = 0
31+
@inbounds for ch in str ; buf[out += 1] = ch ; end
32+
@inbounds buf[out + 1] = 0 # NULL termination
33+
UTF32String(buf)
34+
end
35+
36+
"
37+
Converts a `UTF32String` to a `UTF8String`
38+
39+
### Returns:
40+
* `UTF8String`
41+
42+
### Throws:
43+
* `UnicodeError`
44+
"
45+
function convert(::Type{UTF8String}, str::UTF32String)
46+
dat = reinterpret(UInt32, str.data)
47+
len = sizeof(dat) >>> 2
48+
# handle zero length string quickly
49+
len <= 1 && return empty_utf8
50+
# get number of bytes to allocate
51+
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
52+
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
53+
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
54+
end
55+
56+
"
57+
Converts a `UTF8String` to a `UTF32String`
58+
59+
### Returns:
60+
* `::UTF32String`
61+
62+
### Throws:
63+
* `UnicodeError`
64+
"
65+
function convert(::Type{UTF32String}, str::UTF8String)
66+
dat = str.data
67+
# handle zero length string quickly
68+
sizeof(dat) == 0 && return empty_utf32
69+
# Validate UTF-8 encoding, and get number of words to create
70+
len, flags = unsafe_checkstring(dat)
71+
# Optimize case where no characters > 0x7f
72+
flags == 0 && @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true)
73+
# has multi-byte UTF-8 sequences
74+
buf = Vector{Char}(len+1)
75+
@inbounds buf[len+1] = 0 # NULL termination
76+
local ch::UInt32, surr::UInt32
77+
out = 0
78+
pos = 0
79+
@inbounds while out < len
80+
ch = dat[pos += 1]
81+
# Handle ASCII characters
82+
if ch <= 0x7f
83+
buf[out += 1] = ch
84+
# Handle range 0x80-0x7ff
85+
elseif ch < 0xe0
86+
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
87+
# Handle range 0x800-0xffff
88+
elseif ch < 0xf0
89+
pos += 2
90+
ch = get_utf8_3byte(dat, pos, ch)
91+
# Handle surrogate pairs (should have been encoded in 4 bytes)
92+
if is_surrogate_lead(ch)
93+
# Build up 32-bit character from ch and trailing surrogate in next 3 bytes
94+
pos += 3
95+
surr = ((UInt32(dat[pos-2] & 0xf) << 12)
96+
| (UInt32(dat[pos-1] & 0x3f) << 6)
97+
| (dat[pos] & 0x3f))
98+
ch = get_supplementary(ch, surr)
99+
end
100+
buf[out += 1] = ch
101+
# Handle range 0x10000-0x10ffff
102+
else
103+
pos += 3
104+
buf[out += 1] = get_utf8_4byte(dat, pos, ch)
105+
end
17106
end
18-
a[end] = Char(0) # NULL terminate
19-
UTF32String(a)
107+
UTF32String(buf)
20108
end
21109

22-
function convert(::Type{UTF32String}, data::AbstractVector{Char})
23-
len = length(data)
24-
d = Array(Char, len + 1)
25-
d[end] = Char(0) # NULL terminate
26-
UTF32String(copy!(d,1, data,1, len))
110+
"
111+
Converts a `UTF16String` to `UTF32String`
112+
113+
### Returns:
114+
* `::UTF32String`
115+
116+
### Throws:
117+
* `UnicodeError`
118+
"
119+
function convert(::Type{UTF32String}, str::UTF16String)
120+
dat = str.data
121+
len = sizeof(dat)
122+
# handle zero length string quickly (account for trailing \0)
123+
len <= 2 && return empty_utf32
124+
# get number of words to create
125+
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1)
126+
# No surrogate pairs, do optimized copy
127+
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
128+
local ch::UInt32
129+
buf = Vector{Char}(len)
130+
out = 0
131+
pos = 0
132+
@inbounds while out < len
133+
ch = dat[pos += 1]
134+
# check for surrogate pair
135+
if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
136+
buf[out += 1] = ch
137+
end
138+
UTF32String(buf)
139+
end
140+
141+
"
142+
Converts a `UTF32String` to `UTF16String`
143+
144+
### Returns:
145+
* `::UTF16String`
146+
147+
### Throws:
148+
* `UnicodeError`
149+
"
150+
function convert(::Type{UTF16String}, str::UTF32String)
151+
dat = reinterpret(UInt32, str.data)
152+
len = sizeof(dat)
153+
# handle zero length string quickly
154+
len <= 4 && return empty_utf16
155+
# get number of words to allocate
156+
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
157+
# optimized path, no surrogates
158+
num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
159+
return encode_to_utf16(dat, len + num4byte)
160+
end
161+
162+
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
163+
164+
function convert(::Type{UTF32String}, str::ASCIIString)
165+
dat = str.data
166+
@inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
167+
end
168+
169+
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
170+
@inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
27171
end
28172

29173
convert{T<:Union{Int32,UInt32}}(::Type{UTF32String}, data::AbstractVector{T}) =
@@ -46,12 +190,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46190

47191
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
48192

49-
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
50193
unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
51194
convert(Ptr{T}, pointer(s))
52195

53196
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
54-
isempty(bytes) && return UTF32String(Char[0])
197+
isempty(bytes) && return empty_utf32
55198
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
56199
data = reinterpret(Char, bytes)
57200
# check for byte-order mark (BOM):
@@ -79,6 +222,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79222
end
80223
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
81224

225+
utf32(x) = convert(UTF32String, x)
226+
82227
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
83228
utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len)
84229
function utf32(p::Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}})

base/utf8.jl

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,67 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr
238238
end
239239
convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s))
240240

241+
"
242+
Converts a vector of `Char` to a `UTF8String`
243+
244+
### Returns:
245+
* `UTF8String`
246+
247+
### Throws:
248+
* `UnicodeError`
249+
"
250+
function convert(::Type{UTF8String}, chrs::Vector{Char})
251+
len = sizeof(chrs)
252+
# handle zero length string quickly
253+
len == 0 && return empty_utf8
254+
dat = reinterpret(UInt32, chrs)
255+
# get number of bytes to allocate
256+
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>2)
257+
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
258+
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
259+
end
260+
261+
"
262+
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
263+
264+
### Input Arguments:
265+
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
266+
* `len` length of output in bytes
267+
268+
### Returns:
269+
* `UTF8String`
270+
"
271+
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
272+
buf = Vector{UInt8}(len)
273+
out = 0
274+
pos = 0
275+
@inbounds while out < len
276+
ch::UInt32 = dat[pos += 1]
277+
# Handle ASCII characters
278+
if ch <= 0x7f
279+
buf[out += 1] = ch
280+
# Handle 0x80-0x7ff
281+
elseif ch < 0x800
282+
buf[out += 1] = 0xc0 | (ch >>> 6)
283+
buf[out += 1] = 0x80 | (ch & 0x3f)
284+
# Handle 0x10000-0x10ffff (if input is UInt32)
285+
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
286+
output_utf8_4byte!(buf, out, ch)
287+
out += 4
288+
# Handle surrogate pairs
289+
elseif is_surrogate_codeunit(ch)
290+
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
291+
out += 4
292+
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
293+
else
294+
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
295+
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
296+
buf[out += 1] = 0x80 | (ch & 0x3f)
297+
end
298+
end
299+
UTF8String(buf)
300+
end
301+
241302
utf8(p::Ptr{UInt8}) = UTF8String(bytestring(p))
242303
utf8(p::Ptr{UInt8}, len::Integer) = utf8(pointer_to_array(p, len))
243304

0 commit comments

Comments
 (0)