Skip to content

Commit 2be9a83

Browse files
committed
Eliminate a bunch of copies of the error codepath from Utf8LossyChunksIter
Using a macro to stamp out 7 identical copies of the nontrivial slicing logic that exits this loop didn't seem like a necessary use of a macro. The early-return case can be handled by `break` with practically no changes to the logic inside the loop. All this code is from early 2014 (7.5 years old, pre-1.0), so it's possible there were compiler limitations that forced the macro approach at the time. Confirmed that `x.py bench library/alloc --stage 0 --test-args from_utf8_lossy` is unaffected on my machine.
1 parent dd549dc commit 2be9a83

File tree

1 file changed

+31
-38
lines changed

1 file changed

+31
-38
lines changed

library/core/src/str/lossy.rs

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
6161
}
6262

6363
let mut i = 0;
64+
let mut valid_up_to = 0;
6465
while i < self.source.len() {
65-
let i_ = i;
66-
67-
// SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
68-
// only increases, so `0 <= i < self.source.len()`.
66+
// SAFETY: `i < self.source.len()` per previous line.
67+
// For some reason the following are both significantly slower:
68+
// while let Some(&byte) = self.source.get(i) {
69+
// while let Some(byte) = self.source.get(i).copied() {
6970
let byte = unsafe { *self.source.get_unchecked(i) };
7071
i += 1;
7172

7273
if byte < 128 {
74+
// This could be a `1 => ...` case in the match below, but for
75+
// the common case of all-ASCII inputs, we bypass loading the
76+
// sizeable UTF8_CHAR_WIDTH table into cache.
7377
} else {
7478
let w = utf8_char_width(byte);
7579

76-
macro_rules! error {
77-
() => {{
78-
// SAFETY: We have checked up to `i` that source is valid UTF-8.
79-
unsafe {
80-
let r = Utf8LossyChunk {
81-
valid: from_utf8_unchecked(&self.source[0..i_]),
82-
broken: &self.source[i_..i],
83-
};
84-
self.source = &self.source[i..];
85-
return Some(r);
86-
}
87-
}};
88-
}
89-
9080
match w {
9181
2 => {
9282
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
93-
error!();
83+
break;
9484
}
9585
i += 1;
9686
}
@@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
10090
(0xE1..=0xEC, 0x80..=0xBF) => (),
10191
(0xED, 0x80..=0x9F) => (),
10292
(0xEE..=0xEF, 0x80..=0xBF) => (),
103-
_ => {
104-
error!();
105-
}
93+
_ => break,
10694
}
10795
i += 1;
10896
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
109-
error!();
97+
break;
11098
}
11199
i += 1;
112100
}
@@ -115,34 +103,39 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
115103
(0xF0, 0x90..=0xBF) => (),
116104
(0xF1..=0xF3, 0x80..=0xBF) => (),
117105
(0xF4, 0x80..=0x8F) => (),
118-
_ => {
119-
error!();
120-
}
106+
_ => break,
121107
}
122108
i += 1;
123109
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
124-
error!();
110+
break;
125111
}
126112
i += 1;
127113
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
128-
error!();
114+
break;
129115
}
130116
i += 1;
131117
}
132-
_ => {
133-
error!();
134-
}
118+
_ => break,
135119
}
136120
}
121+
122+
valid_up_to = i;
137123
}
138124

139-
let r = Utf8LossyChunk {
140-
// SAFETY: We have checked that the entire source is valid UTF-8.
141-
valid: unsafe { from_utf8_unchecked(self.source) },
142-
broken: &[],
143-
};
144-
self.source = &[];
145-
Some(r)
125+
// SAFETY: `i <= self.source.len()` because it only ever increments by 1
126+
// and the loop is terminated as soon as that goes beyond bounds.
127+
let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
128+
self.source = remaining;
129+
130+
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
131+
// `valid_up_to = i` and `i` only increases.
132+
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
133+
134+
Some(Utf8LossyChunk {
135+
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
136+
valid: unsafe { from_utf8_unchecked(valid) },
137+
broken,
138+
})
146139
}
147140
}
148141

0 commit comments

Comments
 (0)