Skip to content

Enable string conversion in EUC-JP. #1296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions Sources/FoundationEssentials/String/String+IO.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ dynamic public func _cfMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>,
// Provide swift-corelibs-foundation with an entry point to convert some bytes into a String
return nil
}

dynamic package func _icuMakeStringFromBytes(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
// Concrete implementation is provided by FoundationInternationalization.
return nil
}
#endif

@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
Expand Down Expand Up @@ -202,8 +207,14 @@ extension String {
return nil
}
#else
if let string = (bytes.withContiguousStorageIfAvailable({ _cfMakeStringFromBytes($0, encoding: encoding.rawValue) }) ??
Array(bytes).withUnsafeBufferPointer({ _cfMakeStringFromBytes($0, encoding: encoding.rawValue) })) {
func makeString(from bytes: UnsafeBufferPointer<UInt8>) -> String? {
return (
_cfMakeStringFromBytes(bytes, encoding: encoding.rawValue) ??
_icuMakeStringFromBytes(bytes, encoding: encoding)
)
}
if let string = (bytes.withContiguousStorageIfAvailable({ makeString(from: $0) }) ??
Array(bytes).withUnsafeBufferPointer({ makeString(from: $0) })) {
self = string
} else {
return nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ dynamic public func _cfStringEncodingConvert(string: String, using encoding: UIn
// Dynamically replaced by swift-corelibs-foundation to implement encodings that we do not have Swift replacements for, yet
return nil
}

dynamic package func _icuStringEncodingConvert(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
// Concrete implementation is provided by FoundationInternationalization.
return nil
}
#endif

@available(FoundationPreview 0.4, *)
Expand Down Expand Up @@ -255,8 +260,12 @@ extension String {
// Other encodings, defer to the CoreFoundation implementation
return _ns.data(using: encoding.rawValue, allowLossyConversion: allowLossyConversion)
#else
// Attempt an up-call into swift-corelibs-foundation, which can defer to the CoreFoundation implementation
return _cfStringEncodingConvert(string: self, using: encoding.rawValue, allowLossyConversion: allowLossyConversion)
return (
// Attempt an up-call into swift-corelibs-foundation, which can defer to the CoreFoundation implementation
_cfStringEncodingConvert(string: self, using: encoding.rawValue, allowLossyConversion: allowLossyConversion) ??
// Or attempt an up-call into ICU via FoundationInternationalization
_icuStringEncodingConvert(string: self, using: encoding, allowLossyConversion: allowLossyConversion)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When do we fall back to the ICU one here? On Darwin this encoding is handled via CF, which itself calls into ICU.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess that'll happen if import Foundation has not been done (and CF is present), but import FoundationInternationalization has. I wonder how we can further unify these paths; on Darwin too...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a similar question - I believe _cfStringEncodingConvert will return nil if any of the following are the case:

  1. CF isn't loaded
  2. CF is loaded but it doesn't support the provided encoding
  3. CF is loaded and supports the encoding but the data is malformed

We want the up-call in 2, and I think Tony's question was about 1, but do we want the up call in case 3?

Copy link
Member Author

@YOCKOW YOCKOW May 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now agree that unexpected fallback should be avoided.
Although ICU can support other encodings, I should've made this change focus on EUC-JP.

)
#endif
}
}
Expand Down
202 changes: 202 additions & 0 deletions Sources/FoundationInternationalization/ICU/ICU+StringConverter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if canImport(FoundationEssentials)
import FoundationEssentials
#endif
internal import _FoundationICU

private extension String.Encoding {
var _icuConverterName: String? {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we make sure this returns nil for any encoding which we never want to use ICU for? UTF8, ASCII, UTF16/32, etc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Certainly.
I'll let them return nil here or in ICU.StringConverter's init.

// TODO: Replace this with forthcoming(?) public property such as https://github.com/swiftlang/swift-foundation/pull/1243
switch self {
case .utf8: "UTF-8"
case .ascii: "US-ASCII"
case .japaneseEUC: "EUC-JP"
case .isoLatin1: "ISO-8859-1"
case .shiftJIS: "Shift_JIS"
case .isoLatin2: "ISO-8859-2"
case .unicode: "UTF-16"
case .windowsCP1251: "windows-1251"
case .windowsCP1252: "windows-1252"
case .windowsCP1253: "windows-1253"
case .windowsCP1254: "windows-1254"
case .windowsCP1250: "windows-1250"
case .iso2022JP: "ISO-2022-JP"
case .macOSRoman: "macintosh"
case .utf16BigEndian: "UTF-16BE"
case .utf16LittleEndian: "UTF-16LE"
case .utf32: "UTF-32"
case .utf32BigEndian: "UTF-32BE"
case .utf32LittleEndian: "UTF-32LE"
default: nil
}
}
}

extension ICU {
final class StringConverter: @unchecked Sendable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: does this need the @unchecked? I would have assumed that since it's a final class with all immutable, Sendable properties that the compiler can validate this conformance

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you.
That's a vestige of my first implementation where _converter was a bare pointer...

private let _converter: LockedState<OpaquePointer> // UConverter*

let encoding: String.Encoding

init?(encoding: String.Encoding) {
guard let convName = encoding._icuConverterName else {
return nil
}
var status: UErrorCode = U_ZERO_ERROR
guard let converter = ucnv_open(convName, &status), status.isSuccess else {
return nil
}
self._converter = LockedState(initialState: converter)
self.encoding = encoding
}

deinit {
_converter.withLock { ucnv_close($0) }
}
}
}

extension ICU.StringConverter {
func decode(data: Data) -> String? {
return _converter.withLock { converter in
defer {
ucnv_resetToUnicode(converter)
}

let srcLength = CInt(data.count)
let initCapacity = srcLength * CInt(ucnv_getMinCharSize(converter)) + 1
return _withResizingUCharBuffer(initialSize: initCapacity) { (dest, capacity, status) in
return data.withUnsafeBytes { src in
ucnv_toUChars(
converter,
dest,
capacity,
src.baseAddress,
srcLength,
&status
)
}
}
}
}

func encode(string: String, allowLossyConversion lossy: Bool) -> Data? {
return _converter.withLock { (converter) -> Data? in
defer {
ucnv_resetFromUnicode(converter)
}

let utf16Rep = string.utf16
let uchars = UnsafeMutableBufferPointer<UChar>.allocate(capacity: utf16Rep.count)
_ = uchars.initialize(fromContentsOf: utf16Rep)
defer {
uchars.deallocate()
}

let srcLength = uchars.count
let capacity = srcLength * Int(ucnv_getMaxCharSize(converter)) + 1
let dest = UnsafeMutableRawPointer.allocate(
byteCount: capacity,
alignment: MemoryLayout<CChar>.alignment
)

var status: UErrorCode = U_ZERO_ERROR
if lossy {
var lossyChar: UChar = encoding == .ascii ? 0xFF : 0x3F
ucnv_setSubstString(
converter,
&lossyChar,
1,
&status
)
guard status.isSuccess else { return nil }

ucnv_setFromUCallBack(
converter,
UCNV_FROM_U_CALLBACK_SUBSTITUTE,
nil, // newContext
nil, // oldAction
nil, // oldContext
&status
)
guard status.isSuccess else { return nil }
} else {
ucnv_setFromUCallBack(
converter,
UCNV_FROM_U_CALLBACK_STOP,
nil, // newContext
nil, // oldAction
nil, // oldContext
&status
)
guard status.isSuccess else { return nil }
}

let actualLength = ucnv_fromUChars(
converter,
dest,
CInt(capacity),
uchars.baseAddress,
CInt(srcLength),
&status
)
guard status.isSuccess else { return nil }
return Data(
bytesNoCopy: dest,
count: Int(actualLength),
deallocator: .custom({ pointer, _ in pointer.deallocate() })
)
}
}
}

extension ICU.StringConverter {
nonisolated(unsafe) static private var _converters: LockedState<[String.Encoding: ICU.StringConverter]> = .init(initialState: [:])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should just be a static private let (the immutable-ness affects the LockedState itself but not the contents IIRC, and this isn't unsafe because it's guarded by a lock)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed it should be.
This is also a vestige of my first impl, I guess.


static func converter(for encoding: String.Encoding) -> ICU.StringConverter? {
return _converters.withLock {
if let converter = $0[encoding] {
return converter
}
if let converter = ICU.StringConverter(encoding: encoding) {
$0[encoding] = converter
return converter
}
return nil
}
}
}


@_dynamicReplacement(for: _icuMakeStringFromBytes(_:encoding:))
func _icuMakeStringFromBytes_impl(_ bytes: UnsafeBufferPointer<UInt8>, encoding: String.Encoding) -> String? {
guard let converter = ICU.StringConverter.converter(for: encoding),
let pointer = bytes.baseAddress else {
return nil
}
let data = Data(
bytesNoCopy: UnsafeMutableRawPointer(mutating: pointer),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding a comment here about this being "safe" because this data neither escapes this function nor is mutated? Since the pointer provided is not guaranteed to live longer than the function or be mutable, this could be a trap for a later addition that doesn't see those two constraints

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it's better to have some comments. I'm going to add ones.

count: bytes.count,
deallocator: .none
)
return converter.decode(data: data)
}

@_dynamicReplacement(for: _icuStringEncodingConvert(string:using:allowLossyConversion:))
func _icuStringEncodingConvert_impl(string: String, using encoding: String.Encoding, allowLossyConversion: Bool) -> Data? {
guard let converter = ICU.StringConverter.converter(for: encoding) else {
return nil
}
return converter.encode(string: string, allowLossyConversion: allowLossyConversion)
}
126 changes: 126 additions & 0 deletions Tests/FoundationInternationalizationTests/StringTests+Data.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#if FOUNDATION_FRAMEWORK
@testable import Foundation
#else
@testable import FoundationEssentials
@testable import FoundationInternationalization
#endif // FOUNDATION_FRAMEWORK

#if canImport(TestSupport)
import TestSupport
#endif

final class StringConverterTests: XCTestCase {
private func _test_roundTripConversion(
string: String,
data: Data,
encoding: String._Encoding,
file: StaticString = #filePath,
line: UInt = #line
) {
XCTAssertEqual(
string.data(using: encoding), data, "Failed to convert string to data.",
file: file, line: line
)
XCTAssertEqual(
string, String(data: data, encoding: encoding), "Failed to convert data to string.",
file: file, line: line
)
}

func test_japaneseEUC() {
// Confirm that https://github.com/swiftlang/swift-foundation/issues/1016 is fixed.

// ASCII
_test_roundTripConversion(
string: "ABC",
data: Data([0x41, 0x42, 0x43]),
encoding: .japaneseEUC
)

// Plane 1 Row 1
_test_roundTripConversion(
string: "、。◇",
data: Data([
0xA1, 0xA2,
0xA1, 0xA3,
0xA1, 0xFE,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 4 (Hiragana)
_test_roundTripConversion(
string: "ひらがな",
data: Data([
0xA4, 0xD2,
0xA4, 0xE9,
0xA4, 0xAC,
0xA4, 0xCA,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 5 (Katakana)
_test_roundTripConversion(
string: "ヴヵヶ",
data: Data([
0xA5, 0xF4,
0xA5, 0xF5,
0xA5, 0xF6,
]),
encoding: .japaneseEUC
)

// Plane 1 Row 6 (Greek Alphabets)
_test_roundTripConversion(
string: "Σπ",
data: Data([
0xA6, 0xB2,
0xA6, 0xD0,
]),
encoding: .japaneseEUC
)

// Basic Kanji
_test_roundTripConversion(
string: "日本",
data: Data([
0xC6, 0xFC,
0xCB, 0xDC,
]),
encoding: .japaneseEUC
)

// Amendment by JIS83/JIS90
_test_roundTripConversion(
string: "扉⇔穴",
data: Data([
0xC8, 0xE2,
0xA2, 0xCE,
0xB7, 0xEA,
]),
encoding: .japaneseEUC
)

// Unsupported characters
let sushi = "Sushi🍣"
XCTAssertNil(sushi.data(using: String._Encoding.japaneseEUC))
XCTAssertEqual(
sushi.data(using: String._Encoding.japaneseEUC, allowLossyConversion: true),
"Sushi?".data(using: .utf8)
)
}
}