@@ -5,25 +5,169 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
5
5
# Last valid index of the string: the element count of `s.data` minus one
# (the data vector carries a trailing NUL terminator that is not a character).
function endof(s::UTF32String)
    return length(s.data) - 1
end
6
6
# Number of characters: one less than the number of elements stored in
# `s.data`, because the final element is the NUL terminator.
function length(s::UTF32String)
    return length(s.data) - 1
end
7
7
8
# NOTE: `reverse(s::UTF32String)` is intentionally not defined here. The
# identical method is defined further down in this file (just before
# `unsafe_convert`); defining it twice only overwrites the same method.
9
+
10
# Size in bytes of the string contents: the byte size of the backing vector
# minus the size of one `Char` (the trailing NUL terminator).
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end
11
+
12
# Shared empty string: its data consists of a single zero word (the terminator).
# It is returned by the conversions in this file whenever the input is empty.
const empty_utf32 = UTF32String(UInt32[0])
13
+
8
14
# Generic entry point: delegate to the matching `convert(UTF32String, x)` method.
function utf32(x)
    return convert(UTF32String, x)
end
9
15
# Build a one-character string: the character followed by the NUL terminator.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end
10
16
# Identity conversion: the argument is returned as-is (no copy is made).
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
11
17
12
- function convert (:: Type{UTF32String} , s:: AbstractString )
13
- a = Array (Char, length (s) + 1 )
14
- i = 0
15
- for c in s
16
- a[i += 1 ] = c
18
+ "
19
+ Converts an `AbstractString` to a `UTF32String`
20
+
21
+ ### Returns:
22
+ * `UTF32String`
23
+
24
+ ### Throws:
25
+ * `UnicodeError`
26
+ "
27
function convert(::Type{UTF32String}, str::AbstractString)
    # `unsafe_checkstring` reports the character count first; only that
    # value is needed here.
    nchars = unsafe_checkstring(str)[1]
    # One extra slot for the terminator.
    chars = Vector{Char}(nchars + 1)
    idx = 0
    @inbounds begin
        for c in str
            idx += 1
            chars[idx] = c
        end
        # NULL termination, placed right after the last character written
        chars[idx + 1] = 0
    end
    return UTF32String(chars)
end
35
+
36
+ "
37
+ Converts a `UTF32String` to a `UTF8String`
38
+
39
+ ### Returns:
40
+ * `UTF8String`
41
+
42
+ ### Throws:
43
+ * `UnicodeError`
44
+ "
45
function convert(::Type{UTF8String}, str::UTF32String)
    # View the character data as raw 32-bit words; sizeof >>> 2 is the word count.
    words = reinterpret(UInt32, str.data)
    nwords = sizeof(words) >>> 2
    # At most the terminator is present: return the shared empty string.
    if nwords <= 1
        return empty_utf8
    end
    # Check everything except the final (terminator) word and collect the
    # counts needed to size the output.
    nchars, flags, num4byte, num3byte, num2byte = unsafe_checkstring(words, 1, nwords - 1)
    if flags == 0
        # No flags were raised: each word is copied into a single output byte.
        bytes = Vector{UInt8}(nchars)
        @inbounds copy!(bytes, 1, words, 1, nchars)
        return UTF8String(bytes)
    end
    # One byte per character, plus 1/2/3 extra bytes for each 2/3/4-byte sequence.
    nbytes = nchars + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, words, nbytes)
end
55
+
56
+ "
57
+ Converts a `UTF8String` to a `UTF32String`
58
+
59
+ ### Returns:
60
+ * `::UTF32String`
61
+
62
+ ### Throws:
63
+ * `UnicodeError`
64
+ "
65
function convert(::Type{UTF32String}, str::UTF8String)
    bytes = str.data
    # Empty input: return the shared empty string.
    if sizeof(bytes) == 0
        return empty_utf32
    end
    # Validate the UTF-8 encoding and get the number of characters to create.
    nchars, flags = unsafe_checkstring(bytes)
    # No characters > 0x7f: use the straight copy helper.
    if flags == 0
        @inbounds return fast_utf_copy(UTF32String, Char, nchars, bytes, true)
    end
    # Multi-byte sequences are present: decode one character per iteration.
    chars = Vector{Char}(nchars + 1)
    local cp::UInt32, trail::UInt32
    written = 0   # characters stored so far
    idx = 0       # index of the last byte consumed
    @inbounds while written < nchars
        idx += 1
        cp = bytes[idx]
        if cp <= 0x7f
            # ASCII: stored unchanged
            written += 1
            chars[written] = cp
        elseif cp < 0xe0
            # Range 0x80-0x7ff: lead byte plus one continuation byte
            idx += 1
            written += 1
            chars[written] = ((cp & 0x1f) << 6) | (bytes[idx] & 0x3f)
        elseif cp < 0xf0
            # Range 0x800-0xffff: lead byte plus two continuation bytes
            idx += 2
            cp = get_utf8_3byte(bytes, idx, cp)
            # A surrogate lead here (should have been encoded in 4 bytes) is
            # combined with the trailing surrogate held in the next 3 bytes.
            if is_surrogate_lead(cp)
                idx += 3
                trail = ((UInt32(bytes[idx-2] & 0xf) << 12)
                         | (UInt32(bytes[idx-1] & 0x3f) << 6)
                         | (bytes[idx] & 0x3f))
                cp = get_supplementary(cp, trail)
            end
            written += 1
            chars[written] = cp
        else
            # Range 0x10000-0x10ffff: lead byte plus three continuation bytes
            idx += 3
            written += 1
            chars[written] = get_utf8_4byte(bytes, idx, cp)
        end
    end
    @inbounds chars[nchars + 1] = 0 # NULL termination
    return UTF32String(chars)
end
21
109
22
- function convert (:: Type{UTF32String} , data:: AbstractVector{Char} )
23
- len = length (data)
24
- d = Array (Char, len + 1 )
25
- d[end ] = Char (0 ) # NULL terminate
26
- UTF32String (copy! (d,1 , data,1 , len))
110
+ "
111
+ Converts a `UTF16String` to `UTF32String`
112
+
113
+ ### Returns:
114
+ * `::UTF32String`
115
+
116
+ ### Throws:
117
+ * `UnicodeError`
118
+ "
119
function convert(::Type{UTF32String}, str::UTF16String)
    units = str.data
    nbytes = sizeof(units)
    # Two bytes or fewer means only the trailing \0 is present.
    if nbytes <= 2
        return empty_utf32
    end
    # Check the data (nbytes >>> 1 code units) and get the number of
    # characters to create.
    nchars, flags, num4byte = unsafe_checkstring(units, 1, nbytes >>> 1)
    if (flags & UTF_UNICODE4) == 0
        # No surrogate pairs: every code unit maps to exactly one character.
        @inbounds return UTF32String(copy!(Vector{Char}(nchars), units))
    end
    chars = Vector{Char}(nchars)
    local cp::UInt32
    idx = 0   # index of the last code unit consumed
    @inbounds for k in 1:nchars
        idx += 1
        cp = units[idx]
        # A surrogate lead consumes the following unit as its trail.
        if is_surrogate_lead(cp)
            idx += 1
            cp = get_supplementary(cp, units[idx])
        end
        chars[k] = cp
    end
    return UTF32String(chars)
end
140
+
141
+ "
142
+ Converts a `UTF32String` to `UTF16String`
143
+
144
+ ### Returns:
145
+ * `::UTF16String`
146
+
147
+ ### Throws:
148
+ * `UnicodeError`
149
+ "
150
function convert(::Type{UTF16String}, str::UTF32String)
    words = reinterpret(UInt32, str.data)
    nbytes = sizeof(words)
    # Four bytes or fewer means at most the terminator word is present.
    if nbytes <= 4
        return empty_utf16
    end
    # Check the data (nbytes >>> 2 words) and get the counts needed to size
    # the output.
    nwords, flags, num4byte = unsafe_checkstring(words, 1, nbytes >>> 2)
    if num4byte == 0
        # Nothing needs a surrogate pair: copy each word into one 16-bit unit.
        @inbounds return UTF16String(copy!(Vector{UInt16}(nwords), words))
    end
    # Each character counted in num4byte takes one additional 16-bit unit.
    return encode_to_utf16(words, nwords + num4byte)
end
161
+
162
# NOTE: `convert(::Type{UTF32String}, c::Char)` is intentionally not repeated
# here. The identical method is already defined near the top of this file
# (right after `utf32(x)`); a second definition only overwrites the first.
163
+
164
function convert(::Type{UTF32String}, str::ASCIIString)
    # Hand the raw bytes to the shared copy helper together with their count.
    # NOTE(review): the trailing `true` presumably asks the helper to append
    # the terminator - confirm against `fast_utf_copy`.
    bytes = str.data
    nbytes = length(bytes)
    @inbounds return fast_utf_copy(UTF32String, Char, nbytes, bytes, true)
end
168
+
169
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
    # Copy all characters of `dat` through the shared helper.
    # NOTE(review): the trailing `true` presumably asks the helper to append
    # the terminator - confirm against `fast_utf_copy`.
    nchars = length(dat)
    @inbounds return fast_utf_copy(UTF32String, Char, nchars, dat, true)
end
28
172
29
173
convert {T<:Union{Int32,UInt32}} (:: Type{UTF32String} , data:: AbstractVector{T} ) =
@@ -46,12 +190,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46
190
47
191
# Return a new string with the characters in reverse order. Only the first
# `length(s)` elements of the copied data are reversed, so the trailing
# terminator stays in the last position.
function reverse(s::UTF32String)
    chars = copy(s.data)
    reverse!(chars, 1, length(s))
    return UTF32String(chars)
end
48
192
49
- sizeof (s:: UTF32String ) = sizeof (s. data) - sizeof (Char)
50
193
# Pointer to the string's data, converted to a pointer to the requested
# 32-bit element type (`Int32`, `UInt32` or `Char`).
function unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String)
    return convert(Ptr{T}, pointer(s))
end
52
195
53
196
function convert (T:: Type{UTF32String} , bytes:: AbstractArray{UInt8} )
54
- isempty (bytes) && return UTF32String (Char[ 0 ])
197
+ isempty (bytes) && return empty_utf32
55
198
length (bytes) & 3 != 0 && throw (UnicodeError (UTF_ERR_ODD_BYTES_32,0 ,0 ))
56
199
data = reinterpret (Char, bytes)
57
200
# check for byte-order mark (BOM):
@@ -79,6 +222,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79
222
end
80
223
# Convenience method: checking a bare `Vector{Char}` forwards to the
# `UTF32String` validity check.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
81
224
225
# NOTE: the generic `utf32(x) = convert(UTF32String, x)` method is
# intentionally not repeated here. It is already defined near the top of this
# file; a second identical definition only overwrites the first.
226
+
82
227
# Build a string from `len` characters at pointer `p`: the memory is wrapped
# with `pointer_to_array` and then passed on to `utf32`.
function utf32(p::Ptr{Char}, len::Integer)
    return utf32(pointer_to_array(p, len))
end
83
228
# Pointers to 32-bit integers are converted to `Ptr{Char}` and handled by the
# `Ptr{Char}` method.
function utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer)
    return utf32(convert(Ptr{Char}, p), len)
end
84
229
function utf32 (p:: Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}} )
0 commit comments