fix isequal_normalized for combining-char reordering (#52447)

stevengj · KristofferC · commit 5bf09bb4797d · 2023-12-23T14:43:46.000+01:00
Fixes #52408. (Note that this function was added in Julia 1.8, in #42493.) In the future it would be good to further optimize this function by adding a fast path for the common case of strings that are mostly ASCII characters. Perhaps simply skip ahead to the first byte that doesn't match before we begin doing decomposition etcetera. (cherry picked from commit 3b250c7)
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -208,12 +208,19 @@ end
 
 using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
 
-function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
-    ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
+function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer) # decompose `codepoint` into `dest`, writing from index 1+offset; returns utf8proc's count (negative values are raised as errors)
+    ret = GC.@preserve dest @ccall utf8proc_decompose_char(codepoint::UInt32, pointer(dest, 1+offset)::Ptr{UInt32}, (length(dest)-offset)::Int, options::Cint, C_NULL::Ptr{Cint})::Int # keep `dest` rooted while C holds the raw pointer; only the space after `offset` is offered
     ret < 0 && utf8proc_error(ret)
     return ret
 end
 
+# Canonical combining class of a codepoint, read as the 2nd UInt16 of its utf8proc property
+# record (higher-level accessors in utf8proc would be nicer; mirroring utf8proc_property_t in
+# Julia is annoying because of the bitfields).  U+0300 is the first codepoint of nonzero class.
+combining_class(uc::Integer) =
+    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
+combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
+
 """
     isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
 
@@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
 function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
 to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
 
+!!! compat "Julia 1.8"
+    The `isequal_normalized` function was added in Julia 1.8.
+
 # Examples
 
 For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
 true
 ```
 """
-function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
-    function decompose_next_char!(c, state, d, options, s)
-        n = _decompose_char!(c, d, options)
-        if n > length(d) # may be possible in future Unicode versions?
-            n = _decompose_char!(c, resize!(d, n), options)
+isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity) =
+    _isequal_normalized!(s1, s2, Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4), chartransform; casefold, stripmark) # fresh 4-element codepoint buffers per call; the worker resizes them on demand
+
+# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
+function _isequal_normalized!(s1::AbstractString, s2::AbstractString,
+                              d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity;
+                              casefold::Bool=false, stripmark::Bool=false) where {F}
+    function decompose_next_chars!(state, d, options, s)
+        local n
+        offset = 0
+        @inbounds while true
+            # read a char and decompose it to d
+            c = chartransform(UInt32(state[1]))
+            state = iterate(s, state[2])
+            if c < 0x80 # fast path for common ASCII case
+                n = 1 + offset
+                n > length(d) && resize!(d, 2n)
+                d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c
+                break # ASCII characters are all zero combining class
+            else
+                while true
+                    n = _decompose_char!(c, d, offset, options) + offset
+                    if n > length(d)
+                        resize!(d, 2n)
+                        continue
+                    end
+                    break
+                end
+            end
+
+            # decomposed chars must be sorted in ascending order of combining class,
+            # which means we need to keep fetching chars until we get to non-combining
+            (iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
+            offset = n
         end
-        return 1, n, iterate(s, state)
+
+        # sort by combining class
+        if n < 32 # almost always true
+            for j1 = 2:n # insertion sort
+                cc = combining_class(d[j1])
+                iszero(cc) && continue # don't re-order non-combiners
+                for j2 = j1:-1:2
+                    combining_class(d[j2-1]) ≤ cc && break
+                    d[j2-1], d[j2] = d[j2], d[j2-1]
+                end
+            end
+        else # avoid n^2 complexity in crazy large-n case
+            j = 1
+            @views while j < n
+                j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j)
+                sort!(d[j:j₀-1], by=combining_class)
+                j = j₀
+            end
+        end
+
+        # split return statement to help type inference:
+        return state === nothing ? (1, n, nothing) : (1, n, state)
     end
     options = UTF8PROC_DECOMPOSE
     casefold && (options |= UTF8PROC_CASEFOLD)
     stripmark && (options |= UTF8PROC_STRIPMARK)
     i1,i2 = iterate(s1),iterate(s2)
-    d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
     n1 = n2 = 0 # lengths of codepoint buffers
     j1 = j2 = 1 # indices in d1, d2
     while true
         if j1 > n1
             i1 === nothing && return i2 === nothing && j2 > n2
-            j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
+            j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
         end
         if j2 > n2
             i2 === nothing && return false
-            j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
+            j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
         end
         d1[j1] == d2[j2] || return false
         j1 += 1; j2 += 1
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
@@ -3,6 +3,9 @@
 using Test
 using Unicode
 using Unicode: normalize, isassigned, julia_chartransform
+import Random
+
+Random.seed!(12345)
 
 @testset "string normalization" begin
     # normalize (Unicode normalization etc.):
@@ -455,6 +458,9 @@ end
     @test !Base.Unicode.isvalid(Char, overlong_char)
 end
 
+# reference implementation for the tests: the obvious, but suboptimal, algorithm (normalize both strings in full, then compare):
+isequal_normalized_naive(s1, s2; kws...) = normalize(s1; kws...) == normalize(s2; kws...)
+
 @testset "Unicode equivalence" begin
     @test isequal_normalized("no\u00EBl", "noe\u0308l")
     @test !isequal_normalized("no\u00EBl", "noe\u0308l ")
@@ -466,4 +472,65 @@ end
     @test isequal_normalized("no\u00EBl", "noel", stripmark=true)
     @test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
     @test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
+
+    # issue #52408
+    @testset "Sorting combining characters" begin
+        for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples
+            @test isequal_normalized(str, normalize(str))
+        end
+
+        # one representative codepoint from every possible Unicode combining class
+        let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345],
+            vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)]
+            function randcc(n, n_cc) # random string with lots of combining chars
+                buf = IOBuffer()
+                for _ = 1:n
+                    print.(buf, rand(Vowels, rand(1:5)))
+                    print.(buf, Char.(rand(cc_chars, rand(0:n_cc))))
+                end
+                return String(take!(buf))
+            end
+            for _ = 1:100
+                s = randcc(10,10)
+                ns = normalize(s)
+                cs = normalize(s, casefold=true)
+                @test isequal_normalized(s, s)
+                if !isequal_normalized(s, ns)
+                    @show s
+                end
+                @test isequal_normalized(s, ns)
+                @test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns)
+                @test isequal_normalized(cs, ns, casefold=true) ==
+                      isequal_normalized_naive(cs, ns, casefold=true)
+            end
+            for _ = 1:3
+                s = randcc(5,1000) # exercise sort!-based fallback
+                @test isequal_normalized(s, normalize(s))
+            end
+            function randcc2(n, n_cc) # 2 strings with equivalent reordered combiners
+                buf1 = IOBuffer()
+                buf2 = IOBuffer()
+                p = n_cc / length(cc_chars)
+                for _ = 1:n
+                    a = join(rand(Vowels, rand(1:5)))
+                    print(buf1, a)
+                    print(buf2, a)
+
+                    # chars from distinct combining classes
+                    # are canonically equivalent when re-ordered
+                    c = Random.randsubseq(cc_chars, p)
+                    print.(buf1, Char.(Random.shuffle!(c)))
+                    print.(buf2, Char.(Random.shuffle!(c)))
+                end
+                return String(take!(buf1)), String(take!(buf2))
+            end
+            for _ = 1:100
+                s1, s2 = randcc2(10,10)
+                @test isequal_normalized(s1, s2)
+            end
+        end
+
+        # combining characters in the same class are inequivalent if re-ordered:
+        @test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334")
+    end
 end