From 84ea94b6ad35dcb060b71584ff346ef46ad3e135 Mon Sep 17 00:00:00 2001
From: YOCKOW
Date: Fri, 9 May 2025 15:49:34 +0900
Subject: [PATCH 1/2] Import implementation for String Encoding Names from other repo.

- source: https://github.com/YOCKOW/SF-StringEncodingNameImpl
---
 .../String/String+Encoding+Names.swift        | 588 ++++++++++++++++++
 1 file changed, 588 insertions(+)
 create mode 100644 Sources/FoundationEssentials/String/String+Encoding+Names.swift

diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift
new file mode 100644
index 000000000..07ca26c21
--- /dev/null
+++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift
@@ -0,0 +1,588 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2025 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+
+// MARK: - Private extensions for parsing encoding names
+
+private extension Unicode.Scalar {
+    /// A Boolean value indicating whether `self` is an ASCII digit (`0` through `9`).
+    var _isASCIINumeric: Bool {
+        return ("0"..."9").contains(self)
+    }
+
+    /// The value (0 through 9) that `self` denotes; `self` must be an ASCII digit.
+    var _asciiNumericValue: Int {
+        assert(_isASCIINumeric)
+        return Int(self.value - 0x30)
+    }
+
+    /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace".
+    ///
+    /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace
+    var _isASCIIWhitespace: Bool {
+        switch self.value {
+        case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true
+        default: false
+        }
+    }
+}
+
+private extension String {
+    /// The unicode scalars of `self` without leading and trailing "ASCII whitespace".
+    ///
+    /// The result is empty if `self` contains nothing but ASCII whitespace.
+    var _trimmed: Substring.UnicodeScalarView {
+        let scalars = self.unicodeScalars
+        let isNonWhitespace: (Unicode.Scalar) -> Bool = { !$0._isASCIIWhitespace }
+        guard let firstIndexOfNonWhitespace = scalars.firstIndex(where: isNonWhitespace),
+              let lastIndexOfNonWhitespace = scalars.lastIndex(where: isNonWhitespace) else {
+            return Substring.UnicodeScalarView()
+        }
+        return scalars[firstIndexOfNonWhitespace...lastIndexOfNonWhitespace]
+    }
+}
+
+/// A type that holds a `Unicode.Scalar` where its value is compared case-insensitively with others'
+/// _if the value is within ASCII range_.
+private struct ASCIICaseInsensitiveUnicodeScalar: Equatable,
+                                                  ExpressibleByUnicodeScalarLiteral {
+    typealias UnicodeScalarLiteralType = Unicode.Scalar.UnicodeScalarLiteralType
+
+    /// The wrapped scalar, which is expected to be ASCII.
+    let scalar: Unicode.Scalar
+
+    /// Wraps `scalar`, asserting that it is ASCII.
+    @inlinable
+    init(_ scalar: Unicode.Scalar) {
+        assert(scalar.isASCII)
+        self.scalar = scalar
+    }
+
+    /// Creates an instance from a unicode scalar literal.
+    init(unicodeScalarLiteral value: Unicode.Scalar.UnicodeScalarLiteralType) {
+        self.init(Unicode.Scalar(unicodeScalarLiteral: value))
+    }
+
+    /// Returns `true` if the two scalars are equal or differ only in ASCII letter case.
+    @inlinable
+    static func ==(
+        lhs: ASCIICaseInsensitiveUnicodeScalar,
+        rhs: ASCIICaseInsensitiveUnicodeScalar
+    ) -> Bool {
+        if lhs.scalar == rhs.scalar {
+            return true
+        } else if ("A"..."Z").contains(lhs.scalar) {
+            return lhs.scalar.value + 0x20 == rhs.scalar.value
+        } else if ("a"..."z").contains(lhs.scalar) {
+            return lhs.scalar.value - 0x20 == rhs.scalar.value
+        }
+        return false
+    }
+}
+
+/// A type to tokenize string for `String.Encoding` names.
+private protocol StringEncodingNameTokenizer: ~Copyable {
+    /// The unit of comparison produced by this tokenizer.
+    associatedtype Token: Equatable
+
+    /// Creates a tokenizer that reads `name` from its beginning.
+    init(name: String)
+
+    /// Returns the next token, or `nil` when the input is exhausted.
+    ///
+    /// - Throws: An error if the input cannot be tokenized.
+    mutating func nextToken() throws -> Token?
+}
+
+extension StringEncodingNameTokenizer where Self: ~Copyable {
+    /// Returns `true` if `self` and `other` yield equal tokens in the same order and
+    /// run out of tokens together.
+    ///
+    /// Both tokenizers are advanced (and `other` is consumed) by the comparison.
+    mutating func hasEqualTokens(with other: consuming Self) throws -> Bool {
+        while let myToken = try self.nextToken() {
+            guard let otherToken = try other.nextToken(),
+                  myToken == otherToken else {
+                return false
+            }
+        }
+        return try other.nextToken() == nil
+    }
+}
+
+/// ICU-independent parser that follows [Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching).
+private struct UTS22Tokenizer: StringEncodingNameTokenizer, ~Copyable {
+    enum Token: Equatable {
+        case numeric(Int)
+        case alphabet(ASCIICaseInsensitiveUnicodeScalar)
+    }
+
+    enum Error: Swift.Error {
+        case tooLargeNumericValue
+    }
+
+    let scalars: String.UnicodeScalarView
+
+    private var _currentIndex: String.UnicodeScalarView.Index
+
+    init(name: String) {
+        self.scalars = name.unicodeScalars
+        self._currentIndex = scalars.startIndex
+    }
+
+    /// Returns the next numeric or alphabetic token, skipping every other character.
+    ///
+    /// - Throws: `Error.tooLargeNumericValue` if a run of digits denotes a value
+    ///   greater than 999,999,999.
+    mutating func nextToken() throws -> Token? {
+        // Iterate (rather than recurse) over ignorable characters so that the stack
+        // depth does not grow with the number of skipped characters.
+        while _currentIndex < scalars.endIndex {
+            let scalar = scalars[_currentIndex]
+            scalars.formIndex(after: &_currentIndex)
+            switch scalar {
+            case "0"..."9":
+                // Parse a numeric value ignoring leading zeros.
+                //
+                // NOTE: To prevent the value from overflow, a threshold is set here.
+                // The max number of digits to be expected is 8 as of now: e.g. `csISO42JISC62261978`.
+                // It wouldn't matter to throw an error in practice when the value is too large.
+
+                let threshold: Int = 999_999_999
+                var value = scalar._asciiNumericValue
+                while _currentIndex < scalars.endIndex,
+                      scalars[_currentIndex]._isASCIINumeric {
+                    let digit = scalars[_currentIndex]._asciiNumericValue
+                    // Compare before multiplying: `value * 10 + digit` computed first
+                    // would trap on platforms where `Int` is 32 bits wide.
+                    guard value <= (threshold - digit) / 10 else {
+                        throw Error.tooLargeNumericValue
+                    }
+                    value = value * 10 + digit
+                    scalars.formIndex(after: &_currentIndex)
+                }
+                return .numeric(value)
+            case "A"..."Z", "a"..."z":
+                return .alphabet(ASCIICaseInsensitiveUnicodeScalar(scalar))
+            default:
+                // Neither a digit nor an ASCII letter: produces no token.
+                continue
+            }
+        }
+        return nil
+    }
+}
+
+
+/// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s.
+private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable {
+    typealias Token = ASCIICaseInsensitiveUnicodeScalar
+
+    enum Error: Swift.Error {
+        case nonASCII
+    }
+
+    let scalars: Substring.UnicodeScalarView
+
+    var _currentIndex: Substring.UnicodeScalarView.Index
+
+    init(name: String) {
+        self.scalars = name._trimmed
+        self._currentIndex = scalars.startIndex
+    }
+
+    /// Returns the next scalar of the trimmed name as a token.
+    ///
+    /// - Throws: `Error.nonASCII` if the scalar is not ASCII.
+    mutating func nextToken() throws -> Token? {
+        guard _currentIndex < scalars.endIndex else {
+            return nil
+        }
+        let scalar = scalars[_currentIndex]
+        guard scalar.isASCII else { throw Error.nonASCII }
+        defer {
+            scalars.formIndex(after: &_currentIndex)
+        }
+        return ASCIICaseInsensitiveUnicodeScalar(scalar)
+    }
+}
+
+
+private extension String {
+    /// Returns `true` if `self` and `other` produce the same sequence of tokens when
+    /// tokenized by `T`; returns `false` if tokenization throws for either of them.
+    func isEqual<T>(
+        to other: String,
+        tokenizedBy tokenizer: T.Type
+    ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable {
+        do {
+            var myTokenizer = T(name: self)
+            let otherTokenizer = T(name: other)
+            return try myTokenizer.hasEqualTokens(with: otherTokenizer)
+        } catch {
+            // Any errors imply that `self` or `other` contains invalid characters.
+            return false
+        }
+    }
+}
+
+
+// MARK: - IANA Charset Names
+
+/// Info about IANA Charset.
+private struct IANACharset {
+    /// Preferred MIME Name
+    let preferredMIMEName: String?
+
+    /// The name of this charset
+    let name: String
+
+    /// The aliases of this charset
+    let aliases: Array<String>
+
+    /// The preferred MIME name if there is one; otherwise `name`.
+    var representativeName: String {
+        return preferredMIMEName ?? name
+    }
+
+    init(preferredMIMEName: String?, name: String, aliases: Array<String>) {
+        self.preferredMIMEName = preferredMIMEName
+        self.name = name
+        self.aliases = aliases
+    }
+
+    /// Returns `true` if `string` equals the preferred MIME name, the name, or any of
+    /// the aliases of this charset when compared with the given tokenizer.
+    func matches<T>(
+        _ string: String,
+        tokenizedBy tokenizer: T.Type
+    ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable {
+        if let preferredMIMEName = self.preferredMIMEName,
+           preferredMIMEName.isEqual(to: string, tokenizedBy: tokenizer) {
+            return true
+        }
+        if name.isEqual(to: string, tokenizedBy: tokenizer) {
+            return true
+        }
+        for alias in aliases {
+            if alias.isEqual(to: string, tokenizedBy: tokenizer) {
+                return true
+            }
+        }
+        return false
+    }
+}
+
+// Extracted only necessary charsets from https://www.iana.org/assignments/character-sets/character-sets.xhtml
+extension IANACharset {
+    /// IANA Character Set `US-ASCII`
+    static let usASCII = IANACharset(
+        preferredMIMEName: "US-ASCII",
+        name: "US-ASCII",
+        aliases: [
+            "iso-ir-6",
+            "ANSI_X3.4-1968",
+            "ANSI_X3.4-1986",
+            "ISO_646.irv:1991",
+            "ISO646-US",
+            "US-ASCII",
+            "us",
+            "IBM367",
+            "cp367",
+            "csASCII",
+        ]
+    )
+
+    /// IANA Character Set `ISO-8859-1`
+    static let iso8859_1 = IANACharset(
+        preferredMIMEName: "ISO-8859-1",
+        name: "ISO_8859-1:1987",
+        aliases: [
+            "iso-ir-100",
+            "ISO_8859-1",
+            "ISO-8859-1",
+            "latin1",
+            "l1",
+            "IBM819",
+            "CP819",
+            "csISOLatin1",
+        ]
+    )
+
+    /// IANA Character Set `ISO-8859-2`
+    static let iso8859_2 = IANACharset(
+        preferredMIMEName: "ISO-8859-2",
+        name: "ISO_8859-2:1987",
+        aliases: [
+            "iso-ir-101",
+            "ISO_8859-2",
+            "ISO-8859-2",
+            "latin2",
+            "l2",
+            "csISOLatin2",
+        ]
+    )
+
+    /// IANA Character Set `Shift_JIS`
+    static let shiftJIS = IANACharset(
+        preferredMIMEName: "Shift_JIS",
+        name: "Shift_JIS",
+        aliases: [
+            "MS_Kanji",
+            "csShiftJIS",
+        ]
+    )
+
+    /// IANA Character Set `EUC-JP`
+    static let eucJP = IANACharset(
+        preferredMIMEName: "EUC-JP",
+        name: "Extended_UNIX_Code_Packed_Format_for_Japanese",
+        aliases: [
+            "csEUCPkdFmtJapanese",
+            "EUC-JP",
+        ]
+    )
+
+    /// IANA Character Set `ISO-2022-JP`
+    static let iso2022JP = IANACharset(
+        preferredMIMEName: "ISO-2022-JP",
+        name: "ISO-2022-JP",
+        aliases: [
+            "csISO2022JP",
+        ]
+    )
+
+    /// IANA Character Set `UTF-8`
+    static let utf8 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-8",
+        aliases: [
+            "csUTF8",
+        ]
+    )
+
+    /// IANA Character Set `UTF-16BE`
+    static let utf16BE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16BE",
+        aliases: [
+            "csUTF16BE",
+        ]
+    )
+
+    /// IANA Character Set `UTF-16LE`
+    static let utf16LE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16LE",
+        aliases: [
+            "csUTF16LE",
+        ]
+    )
+
+    /// IANA Character Set `UTF-16`
+    static let utf16 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-16",
+        aliases: [
+            "csUTF16",
+        ]
+    )
+
+    /// IANA Character Set `UTF-32`
+    static let utf32 = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32",
+        aliases: [
+            "csUTF32",
+        ]
+    )
+
+    /// IANA Character Set `UTF-32BE`
+    static let utf32BE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32BE",
+        aliases: [
+            "csUTF32BE",
+        ]
+    )
+
+    /// IANA Character Set `UTF-32LE`
+    static let utf32LE = IANACharset(
+        preferredMIMEName: nil,
+        name: "UTF-32LE",
+        aliases: [
+            "csUTF32LE",
+        ]
+    )
+
+    /// IANA Character Set `macintosh`
+    static let macintosh = IANACharset(
+        preferredMIMEName: nil,
+        name: "macintosh",
+        aliases: [
+            "mac",
+            "csMacintosh",
+        ]
+    )
+
+    /// IANA Character Set `windows-1250`
+    static let windows1250 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1250",
+        aliases: [
+            "cswindows1250",
+        ]
+    )
+
+    /// IANA Character Set `windows-1251`
+    static let windows1251 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1251",
+        aliases: [
+            "cswindows1251",
+        ]
+    )
+
+    /// IANA Character Set `windows-1252`
+    static let windows1252 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1252",
+        aliases: [
+            "cswindows1252",
+        ]
+    )
+
+    /// IANA Character Set `windows-1253`
+    static let windows1253 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1253",
+        aliases: [
+            "cswindows1253",
+        ]
+    )
+
+    /// IANA Character Set `windows-1254`
+    static let windows1254 = IANACharset(
+        preferredMIMEName: nil,
+        name: "windows-1254",
+        aliases: [
+            "cswindows1254",
+        ]
+    )
+}
+
+// MARK: - `String.Encoding` Names
+
+extension String.Encoding {
+    /// The IANA charset that corresponds to this encoding, or `nil` if there is none.
+    private var _ianaCharset: IANACharset? {
+        switch self {
+        case .utf8: .utf8
+        case .ascii: .usASCII
+        case .japaneseEUC: .eucJP
+        case .isoLatin1: .iso8859_1
+        case .shiftJIS: .shiftJIS
+        case .isoLatin2: .iso8859_2
+        case .unicode: .utf16
+        case .windowsCP1251: .windows1251
+        case .windowsCP1252: .windows1252
+        case .windowsCP1253: .windows1253
+        case .windowsCP1254: .windows1254
+        case .windowsCP1250: .windows1250
+        case .iso2022JP: .iso2022JP
+        case .macOSRoman: .macintosh
+        case .utf16BigEndian: .utf16BE
+        case .utf16LittleEndian: .utf16LE
+        case .utf32: .utf32
+        case .utf32BigEndian: .utf32BE
+        case .utf32LittleEndian: .utf32LE
+        default: nil
+        }
+    }
+
+    /// The name of this encoding that is compatible with the one of the IANA registry "charset".
+    ///
+    /// The preferred MIME name is used if the charset has one; otherwise its registered name is used.
+    /// The value is `nil` if no charset corresponds to this encoding.
+    @available(FoundationPreview 6.2, *)
+    public var ianaName: String? {
+        return _ianaCharset?.representativeName
+    }
+
+    /// Creates an instance from the name of the IANA registry "charset".
+    ///
+    /// `charsetName` is compared ASCII case-insensitively, ignoring leading and trailing
+    /// ASCII whitespace, with the preferred MIME name, the name, and the aliases of each
+    /// supported charset. The result is `nil` if nothing matches.
+    @available(FoundationPreview 6.2, *)
+    public init?(ianaName charsetName: String) {
+        func __determineEncoding() -> String.Encoding? {
+            func __matches(_ charsets: IANACharset...) -> Bool {
+                assert(!charsets.isEmpty)
+                return charsets.contains {
+                    $0.matches(
+                        charsetName,
+                        tokenizedBy: ASCIICaseInsensitiveTokenizer.self
+                    )
+                }
+            }
+
+            return if __matches(.utf8) {
+                .utf8
+            } else if __matches(.usASCII) {
+                .ascii
+            } else if __matches(.eucJP) {
+                .japaneseEUC
+            } else if __matches(.iso8859_1) {
+                .isoLatin1
+            } else if __matches(.shiftJIS) {
+                .shiftJIS
+            } else if __matches(.iso8859_2) {
+                .isoLatin2
+            } else if __matches(.utf16) {
+                .utf16
+            } else if __matches(.windows1251) {
+                .windowsCP1251
+            } else if __matches(.windows1252) {
+                .windowsCP1252
+            } else if __matches(.windows1253) {
+                .windowsCP1253
+            } else if __matches(.windows1254) {
+                .windowsCP1254
+            } else if __matches(.windows1250) {
+                .windowsCP1250
+            } else if __matches(.iso2022JP) {
+                .iso2022JP
+            } else if __matches(.macintosh) {
+                .macOSRoman
+            } else if __matches(.utf16BE) {
+                .utf16BigEndian
+            } else if __matches(.utf16LE) {
+                .utf16LittleEndian
+            } else if __matches(.utf32) {
+                .utf32
+            } else if __matches(.utf32BE) {
+                .utf32BigEndian
+            } else if __matches(.utf32LE) {
+                .utf32LittleEndian
+            } else {
+                nil
+            }
+        }
+
+        guard let encoding = __determineEncoding() else {
+            return nil
+        }
+        self = encoding
+    }
+}
+

From dd0b16c682cef79543da96c8216e44f91d86d669 Mon Sep 17 00:00:00 2001
From: YOCKOW
Date: Fri, 9 May 2025 16:27:55 +0900
Subject: [PATCH 2/2] Import tests for String Encoding Names from other repo.

- source: https://github.com/YOCKOW/SF-StringEncodingNameImpl/blob/0.4.0/Tests/StringEncodingNameImplTests/StringEncodingNameParserTests.swift
---
 .../StringTests.swift                         | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift
index 7b7cb041b..e6f294149 100644
--- a/Tests/FoundationEssentialsTests/StringTests.swift
+++ b/Tests/FoundationEssentialsTests/StringTests.swift
@@ -1360,6 +1360,87 @@ final class StringTests : XCTestCase {
             "abcd🎺efgh"
         ])
     }
+
+    func test_Encoding_names() {
+        // Checks `encoding.ianaName` against `name`; `line` keeps failures at the call site.
+        func expectName(
+            _ name: String?,
+            for encoding: String._Encoding,
+            line: UInt = #line
+        ) {
+            XCTAssertEqual(encoding.ianaName, name, line: line)
+        }
+
+        // Checks `String._Encoding(ianaName:)` against `encoding`; `line` keeps failures at the call site.
+        func expectEncoding(
+            _ encoding: String._Encoding?,
+            forName name: String,
+            line: UInt = #line
+        ) {
+            XCTAssertEqual(String._Encoding(ianaName: name), encoding, line: line)
+        }
+
+        // Encoding to Name
+        expectName("US-ASCII", for: .ascii)
+        expectName(nil, for: .nextstep)
+        expectName("EUC-JP", for: .japaneseEUC)
+        expectName("UTF-8", for: .utf8)
+        expectName("ISO-8859-1", for: .isoLatin1)
+        expectName(nil, for: .symbol)
+        expectName(nil, for: .nonLossyASCII)
+        expectName("Shift_JIS", for: .shiftJIS)
+        expectName("ISO-8859-2", for: .isoLatin2)
+        expectName("UTF-16", for: .unicode)
+        expectName("windows-1251", for: .windowsCP1251)
+        expectName("windows-1252", for: .windowsCP1252)
+        expectName("windows-1253", for: .windowsCP1253)
+        expectName("windows-1254", for: .windowsCP1254)
+        expectName("windows-1250", for: .windowsCP1250)
+        expectName("ISO-2022-JP", for: .iso2022JP)
+        expectName("macintosh", for: .macOSRoman)
+        expectName("UTF-16BE", for: .utf16BigEndian)
+        expectName("UTF-16LE", for: .utf16LittleEndian)
+        expectName("UTF-32", for: .utf32)
+        expectName("UTF-32BE", for: .utf32BigEndian)
+        expectName("UTF-32LE", for: .utf32LittleEndian)
+        expectName(nil, for: String._Encoding(rawValue: .max))
+
+        // Name to Encoding
+        expectEncoding(.ascii, forName: "us-ascii")
+        expectEncoding(nil, forName: "iso-ir-2")
+        expectEncoding(nil, forName: "x-nextstep")
+        expectEncoding(.japaneseEUC, forName: "euc-jp")
+        expectEncoding(nil, forName: "CP51932")
+        expectEncoding(.utf8, forName: "utf-8")
+        expectEncoding(.isoLatin1, forName: "iso_8859-1")
+        expectEncoding(nil, forName: "x-mac-symbol")
+        expectEncoding(nil, forName: "Adobe-symbol-encoding")
+        expectEncoding(nil, forName: "cp932")
+        expectEncoding(.shiftJIS, forName: "shift_jis")
+        expectEncoding(nil, forName: "windows-31j")
+        expectEncoding(.isoLatin2, forName: "iso_8859-2")
+        expectEncoding(.utf16, forName: "utf-16")
+        expectEncoding(nil, forName: "iso-10646-ucs-2")
+        expectEncoding(nil, forName: "unicode-1-1")
+        expectEncoding(.windowsCP1251, forName: "windows-1251")
+        expectEncoding(.windowsCP1252, forName: "windows-1252")
+        expectEncoding(nil, forName: "ISO-8859-1-Windows-3.0-Latin-1")
+        expectEncoding(nil, forName: "ISO-8859-1-Windows-3.1-Latin-1")
+        expectEncoding(.windowsCP1253, forName: "windows-1253")
+        expectEncoding(.windowsCP1254, forName: "windows-1254")
+        expectEncoding(nil, forName: "iso-8859-9-windows-Latin-5")
+        expectEncoding(.windowsCP1250, forName: "windows-1250")
+        expectEncoding(nil, forName: "iso-8859-2-windows-Latin-2")
+        expectEncoding(.iso2022JP, forName: "iso-2022-jp")
+        expectEncoding(.macOSRoman, forName: "macintosh")
+        expectEncoding(.utf16BigEndian, forName: "utf-16be")
+        expectEncoding(.utf16LittleEndian, forName: "utf-16le")
+        expectEncoding(.utf32, forName: "utf-32")
+        expectEncoding(nil, forName: "iso-10646-ucs-4")
+        expectEncoding(.utf32BigEndian, forName: "utf-32be")
+        expectEncoding(.utf32LittleEndian, forName: "utf-32le")
+        expectEncoding(nil, forName: "foo-bar-baz")
+    }
 }
 
 // MARK: - Helper functions