Skip to content

Commit 7d4817b

Browse files
authored
Fix ISO Latin 1 Encoding/Decoding issues (#1219) (#1221)
1 parent 35b58c4 commit 7d4817b

File tree

3 files changed

+12
-44
lines changed

3 files changed

+12
-44
lines changed

Sources/FoundationEssentials/String/String+IO.swift

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,6 @@ internal import _FoundationCShims
1818

1919
fileprivate let stringEncodingAttributeName = "com.apple.TextEncoding"
2020

21-
private struct ExtendingToUTF16Sequence<Base: Sequence<UInt8>> : Sequence {
22-
typealias Element = UInt16
23-
24-
struct Iterator : IteratorProtocol {
25-
private var base: Base.Iterator
26-
27-
init(_ base: Base.Iterator) {
28-
self.base = base
29-
}
30-
31-
mutating func next() -> Element? {
32-
guard let value = base.next() else { return nil }
33-
return UInt16(value)
34-
}
35-
}
36-
37-
private let base: Base
38-
39-
init(_ base: Base) {
40-
self.base = base
41-
}
42-
43-
func makeIterator() -> Iterator {
44-
Iterator(base.makeIterator())
45-
}
46-
}
47-
4821

4922
@available(macOS 10.10, iOS 8.0, watchOS 2.0, tvOS 9.0, *)
5023
extension String {
@@ -181,12 +154,9 @@ extension String {
181154
}
182155
#if !FOUNDATION_FRAMEWORK
183156
case .isoLatin1:
184-
guard bytes.allSatisfy(\.isValidISOLatin1) else {
185-
return nil
186-
}
187-
// isoLatin1 is an 8-bit encoding that represents a subset of UTF-16
188-
// Map to 16-bit values and decode as UTF-16
189-
self.init(_validating: ExtendingToUTF16Sequence(bytes), as: UTF16.self)
157+
// Every byte sequence is valid ISO Latin 1: it is an 8-bit encoding whose byte values 0x00 through 0xFF map directly to Unicode scalars U+0000 through U+00FF
158+
// Simply extend each byte to 16 bits and decode as UTF-16
159+
self.init(decoding: bytes.lazy.map { UInt16($0) }, as: UTF16.self)
190160
case .macOSRoman:
191161
func buildString(_ bytes: UnsafeBufferPointer<UInt8>) -> String {
192162
String(unsafeUninitializedCapacity: bytes.count * 3) { buffer in

Sources/FoundationEssentials/String/StringProtocol+Essentials.swift

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,6 @@ import Darwin
2121

2222
internal import _FoundationCShims
2323

24-
extension BinaryInteger {
25-
var isValidISOLatin1: Bool {
26-
(0x20 <= self && self <= 0x7E) || (0xA0 <= self && self <= 0xFF)
27-
}
28-
}
29-
3024
extension UInt8 {
3125
private typealias UTF8Representation = (UInt8, UInt8, UInt8)
3226
private static func withMacRomanMap<R>(_ body: (UnsafeBufferPointer<UTF8Representation>) -> R) -> R {
@@ -228,12 +222,14 @@ extension String {
228222
return data + swapped
229223
#if !FOUNDATION_FRAMEWORK
230224
case .isoLatin1:
231-
return try? Data(capacity: self.utf16.count) { buffer in
232-
for scalar in self.utf16 {
233-
guard scalar.isValidISOLatin1 else {
225+
// ISO Latin 1 encodes code points 0x0 through 0xFF (a maximum of 2 UTF-8 code units per ISO Latin 1 scalar)
226+
// The UTF-8 count is a cheap, reasonable starting capacity: it is exact for the all-ASCII case and overestimates by only 1 byte per non-ASCII character
227+
return try? Data(capacity: self.utf8.count) { buffer in
228+
for scalar in self.unicodeScalars {
229+
guard let valid = UInt8(exactly: scalar.value) else {
234230
throw CocoaError(.fileWriteInapplicableStringEncoding)
235231
}
236-
buffer.appendElement(UInt8(scalar & 0xFF))
232+
buffer.appendElement(valid)
237233
}
238234
}
239235
case .macOSRoman:

Tests/FoundationEssentialsTests/StringTests.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1336,7 +1336,9 @@ final class StringTests : XCTestCase {
13361336
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
13371337
"0123456789",
13381338
"!\"#$%&'()*+,-./",
1339-
"¡¶ÅÖæöÿ\u{00A0}~"
1339+
"¡¶ÅÖæöÿ\u{0080}\u{00A0}~",
1340+
"Hello\nworld",
1341+
"Hello\r\nworld"
13401342
], invalid: [
13411343
"🎺",
13421344
"מ",

0 commit comments

Comments
 (0)