Skip to content

Commit 5bf09bb

Browse files
stevengj authored and KristofferC committed
fix isequal_normalized for combining-char reordering (#52447)
Fixes #52408. (Note that this function was added in Julia 1.8, in #42493.) In the future it would be good to further optimize this function by adding a fast path for the common case of strings that are mostly ASCII characters. Perhaps simply skip ahead to the first byte that doesn't match before we begin doing decomposition etcetera. (cherry picked from commit 3b250c7)
1 parent a7dbaad commit 5bf09bb

File tree

2 files changed

+137
-11
lines changed

2 files changed

+137
-11
lines changed

stdlib/Unicode/src/Unicode.jl

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,19 @@ end
208208

209209
using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
210210

211-
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
212-
ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
211+
# Decompose one codepoint via utf8proc's `utf8proc_decompose_char`, writing the
# result into `dest` beginning at index `1 + offset`.
#
# Arguments:
# - `codepoint`: the character or integer codepoint (passed to C as `UInt32`).
# - `dest`: output buffer; only the `length(dest) - offset` slots after `offset`
#   are offered to utf8proc.
# - `offset`: number of leading entries of `dest` to leave untouched.
# - `options`: utf8proc option flags (passed to C as `Cint`).
#
# Returns the non-negative count reported by utf8proc.  A negative result is
# handed to `utf8proc_error`.  Callers in this file treat a count larger than the
# remaining space as "buffer too small" and retry after `resize!`.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    room = length(dest) - offset
    # keep `dest` rooted while C holds a raw pointer into it
    nchars = GC.@preserve dest begin
        @ccall utf8proc_decompose_char(codepoint::UInt32, pointer(dest, 1+offset)::Ptr{UInt32}, room::Int, options::Cint, C_NULL::Ptr{Cint})::Int
    end
    if nchars < 0
        utf8proc_error(nchars)
    end
    return nchars
end
216216

217+
# would be good to have higher-level accessor functions in utf8proc. alternatively,
218+
# we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying
219+
# because of the bitfields.
220+
# Return the Unicode canonical combining class of a codepoint as a `UInt16`.
#
# The value is read as the second 16-bit field of the property record returned by
# `utf8proc_get_property`.  Codepoints outside `0x0300:0x10ffff` skip the C call
# and report class zero: U+0300 (COMBINING GRAVE ACCENT, class 230) is the first
# codepoint with a non-zero combining class, so the range must start there rather
# than at U+0301.
combining_class(uc::Integer) =
    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed characters have no codepoint to look up; treat them as non-combining.
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
223+
217224
"""
218225
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
219226
@@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
225232
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
226233
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
227234
235+
!!! compat "Julia 1.8"
236+
The `isequal_normalized` function was added in Julia 1.8.
237+
228238
# Examples
229239
230240
For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
251261
true
252262
```
253263
"""
254-
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
255-
function decompose_next_char!(c, state, d, options, s)
256-
n = _decompose_char!(c, d, options)
257-
if n > length(d) # may be possible in future Unicode versions?
258-
n = _decompose_char!(c, resize!(d, n), options)
264+
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
    # Each call gets its own pair of small scratch buffers for decomposed
    # codepoints; the worker resizes them if a decomposition needs more room.
    scratch1 = Vector{UInt32}(undef, 4)
    scratch2 = Vector{UInt32}(undef, 4)
    # `chartransform` is positional in the worker, the two flags stay keywords.
    return _isequal_normalized!(s1, s2, scratch1, scratch2, chartransform;
                                casefold=casefold, stripmark=stripmark)
end
266+
267+
# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
268+
function _isequal_normalized!(s1::AbstractString, s2::AbstractString,
269+
d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity;
270+
casefold::Bool=false, stripmark::Bool=false) where {F}
271+
# Fill `d` with the next run of decomposed codepoints from `s`, ordered by
# combining class.
#
# `state` is a non-`nothing` result of `iterate(s, ...)`.  Characters are read one
# at a time, mapped through the captured `chartransform`, and decomposed into `d`
# (growing it as needed) until the most recently written codepoint has combining
# class zero or `s` is exhausted.  The combining codepoints in `d[1:n]` are then
# re-ordered by ascending combining class, without moving class-zero codepoints.
#
# Returns `(1, n, next_state)`: the start index in `d`, the number of valid
# entries in `d`, and the iteration state for the following call (`nothing` once
# `s` is exhausted).
#
# NOTE(review): if `_decompose_char!` can return 0 while `offset == 0` (presumably
# possible when `options` strips marks), then `n == 0` and `d[n]` below indexes
# `d[0]` under `@inbounds`; the caller would also receive `n == 0`.  Confirm
# against utf8proc's behaviour and the caller's loop.
function decompose_next_chars!(state, d, options, s)
    local n
    offset = 0
    @inbounds while true
        # read a char and decompose it to d
        c = chartransform(UInt32(state[1]))
        state = iterate(s, state[2])
        if c < 0x80 # fast path for common ASCII case
            n = 1 + offset
            n > length(d) && resize!(d, 2n)
            # casefolding ASCII only needs 'A':'Z' (0x41:0x5A) shifted to lowercase
            d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c
            break # ASCII characters are all zero combining class
        else
            while true
                n = _decompose_char!(c, d, offset, options) + offset
                if n > length(d)
                    # result did not fit: enlarge the buffer and decompose again
                    resize!(d, 2n)
                    continue
                end
                break
            end
        end

        # decomposed chars must be sorted in ascending order of combining class,
        # which means we need to keep fetching chars until we get to non-combining
        (iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
        offset = n
    end

    # sort by combining class
    if n < 32 # almost always true
        for j1 = 2:n # insertion sort
            cc = combining_class(d[j1])
            iszero(cc) && continue # don't re-order non-combiners
            for j2 = j1:-1:2
                # stop at the first predecessor whose class is not greater, so
                # equal-class codepoints keep their relative order
                combining_class(d[j2-1]) ≤ cc && break
                d[j2-1], d[j2] = d[j2], d[j2-1]
            end
        end
    else # avoid n^2 complexity in crazy large-n case
        j = 1
        @views while j < n
            # j₀ = index of the next class-zero codepoint after j (or n+1 if none)
            j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j)
            sort!(d[j:j₀-1], by=combining_class)
            j = j₀
        end
    end

    # split return statement to help type inference:
    return state === nothing ? (1, n, nothing) : (1, n, state)
end
262322
options = UTF8PROC_DECOMPOSE
263323
casefold && (options |= UTF8PROC_CASEFOLD)
264324
stripmark && (options |= UTF8PROC_STRIPMARK)
265325
i1,i2 = iterate(s1),iterate(s2)
266-
d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
267326
n1 = n2 = 0 # lengths of codepoint buffers
268327
j1 = j2 = 1 # indices in d1, d2
269328
while true
270329
if j1 > n1
271330
i1 === nothing && return i2 === nothing && j2 > n2
272-
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
331+
j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
273332
end
274333
if j2 > n2
275334
i2 === nothing && return false
276-
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
335+
j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
277336
end
278337
d1[j1] == d2[j2] || return false
279338
j1 += 1; j2 += 1

stdlib/Unicode/test/runtests.jl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
using Test
44
using Unicode
55
using Unicode: normalize, isassigned, julia_chartransform
6+
import Random
7+
8+
Random.seed!(12345)
69

710
@testset "string normalization" begin
811
# normalize (Unicode normalization etc.):
@@ -455,6 +458,9 @@ end
455458
@test !Base.Unicode.isvalid(Char, overlong_char)
456459
end
457460

461+
# the obvious, but suboptimal, algorithm:
462+
# Reference implementation: normalize both strings in full with the same keyword
# options, then compare the results.
function isequal_normalized_naive(s1, s2; kws...)
    lhs = normalize(s1; kws...)
    rhs = normalize(s2; kws...)
    return lhs == rhs
end
463+
458464
@testset "Unicode equivalence" begin
459465
@test isequal_normalized("no\u00EBl", "noe\u0308l")
460466
@test !isequal_normalized("no\u00EBl", "noe\u0308l ")
@@ -466,4 +472,65 @@ end
466472
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
467473
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
468474
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
475+
476+
# issue #52408
477+
@testset "Sorting combining characters" begin
478+
for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples
479+
@test isequal_normalized(str, normalize(str))
480+
end
481+
482+
# first codepoint in every possible Unicode combining class
483+
let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345],
484+
vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)]
485+
# Build a random string of `n` groups, each consisting of 1–5 random entries of
# `Vowels` followed by 0–`n_cc` random codepoints drawn from `cc_chars`.
function randcc(n, n_cc)
    io = IOBuffer()
    for _ in 1:n
        # draw order (vowel count, vowels, mark count, marks) is kept fixed so the
        # seeded RNG yields the same strings
        letters = rand(Vowels, rand(1:5))
        foreach(ch -> print(io, ch), letters)
        marks = rand(cc_chars, rand(0:n_cc))
        foreach(cp -> print(io, Char(cp)), marks)
    end
    return String(take!(io))
end
493+
for _ = 1:100
494+
s = randcc(10,10)
495+
ns = normalize(s)
496+
cs = normalize(s, casefold=true)
497+
@test isequal_normalized(s, s)
498+
if !isequal_normalized(s, ns)
499+
@show s
500+
end
501+
@test isequal_normalized(s, ns)
502+
@test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns)
503+
@test isequal_normalized(cs, ns, casefold=true) ==
504+
isequal_normalized_naive(cs, ns, casefold=true)
505+
end
506+
for _ = 1:3
507+
s = randcc(5,1000) # exercise sort!-based fallback
508+
@test isequal_normalized(s, normalize(s))
509+
end
510+
# Build two random strings of `n` groups each.  Both strings get the same run of
# 1–5 random `Vowels` per group, followed by the same random subset of `cc_chars`
# in two independently shuffled orders.  Each entry of `cc_chars` is kept with
# probability `n_cc / length(cc_chars)`.
function randcc2(n, n_cc)
    out1, out2 = IOBuffer(), IOBuffer()
    keep_prob = n_cc / length(cc_chars)
    for _ in 1:n
        stem = join(rand(Vowels, rand(1:5)))
        print(out1, stem)
        print(out2, stem)

        # chars from distinct combining classes
        # are canonically equivalent when re-ordered
        marks = Random.randsubseq(cc_chars, keep_prob)
        for cp in Random.shuffle!(marks)
            print(out1, Char(cp))
        end
        for cp in Random.shuffle!(marks)
            print(out2, Char(cp))
        end
    end
    return String(take!(out1)), String(take!(out2))
end
527+
for _ = 1:100
528+
s1, s2 = randcc2(10,10)
529+
@test isequal_normalized(s1, s2)
530+
end
531+
end
532+
533+
# combining characters in the same class are inequivalent if re-ordered:
534+
@test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334")
535+
end
469536
end

0 commit comments

Comments
 (0)