Skip to content

Commit e45064f

Browse files
committed
Avoid allocating new buffers if possible
1 parent 1cdea62 commit e45064f

File tree

1 file changed

+75
-83
lines changed

1 file changed

+75
-83
lines changed

src/events/attributes.rs

Lines changed: 75 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -331,93 +331,95 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331331
}
332332
}
333333

334-
// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
335-
// 2) Begin with a normalized value consisting of the empty string.
336-
// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
337-
// * For a character reference, append the referenced character to the normalized value.
338-
// * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
339-
// * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
340-
// * For another character, append the character to the normalized value.
341-
//
342-
// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
343-
// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
344-
//
345-
// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
346-
// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
347-
// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
348-
// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
349-
fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
334+
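/// Normalizes the whitespace of an attribute value, following the attribute-value normalization rules of the XML specification quoted below (character and entity references are not handled yet).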
335+
///
336+
/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
337+
/// 2) Begin with a normalized value consisting of the empty string.
338+
/// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
339+
/// * For a character reference, append the referenced character to the normalized value.
340+
/// * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
341+
/// * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
342+
/// * For another character, append the character to the normalized value.
343+
///
344+
/// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
345+
/// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
346+
///
347+
/// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
348+
/// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
349+
/// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
350+
/// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
351+
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
350352
// TODO: character references, entity references, error handling associated with those
351-
// TODO: don't allocated unless needed?
352353

353354
#[derive(PartialEq)]
354355
enum ParseState {
355-
SpaceOrStart,
356+
Space,
356357
CDATA,
357358
}
358359

359-
let mut value: Vec<u8> = Vec::new();
360-
// Starting in the state where we think we've added a space means we implicitly skip leading spaces
361-
let mut current_state = ParseState::SpaceOrStart;
362-
// Used for trimming trailing spaces
363-
let mut last_cdata_idx = 0;
360+
let is_whitespace_like = |c| matches!(c, b'\n' | b'\r' | b'\t' | b' ');
361+
362+
let first_non_space_char = attr.iter().position(|c| !is_whitespace_like(*c));
363+
364+
if first_non_space_char.is_none() {
365+
// The value is empty or consists entirely of whitespace-like characters
366+
return Cow::Borrowed(b"");
367+
}
368+
369+
let last_non_space_char = attr.iter().rposition(|c| !is_whitespace_like(*c));
370+
371+
// Trim all whitespace-like characters away from the beginning and end of the attribute value.
372+
let begin = first_non_space_char.unwrap();
373+
let end = last_non_space_char.unwrap_or(attr.len());
374+
let trimmed_attr = &attr[begin..=end];
375+
376+
// A new buffer is only created when we encounter a situation that requires it.
377+
let mut normalized: Option<Vec<u8>> = None;
378+
// We start on character data because leading and trailing whitespace-like characters are already trimmed away.
379+
let mut current_state = ParseState::CDATA;
364380

365-
// In one pass, strip leading and trailing spaces and replace sequences of spaces with a single one
366-
for ch in attr.as_ref() {
381+
// Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
382+
// or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
383+
// buffer and continue using this buffer.
384+
for (idx, ch) in trimmed_attr.iter().enumerate() {
367385
match ch {
368386
b'\n' | b'\r' | b'\t' | b' ' => match current_state {
369-
ParseState::SpaceOrStart => continue,
387+
ParseState::Space => match normalized {
388+
Some(_) => continue,
389+
None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
390+
},
370391
ParseState::CDATA => {
371-
current_state = ParseState::SpaceOrStart;
372-
value.push(b' ');
392+
current_state = ParseState::Space;
393+
match normalized.as_mut() {
394+
Some(buf) => buf.push(b' '),
395+
None => {
396+
let mut buf = Vec::from(&trimmed_attr[..idx]);
397+
buf.push(b' ');
398+
normalized = Some(buf);
399+
}
400+
}
373401
}
374402
},
375403
c @ _ => match current_state {
376-
ParseState::SpaceOrStart => {
404+
ParseState::Space => {
377405
current_state = ParseState::CDATA;
378-
last_cdata_idx = value.len();
379-
value.push(*c);
406+
if let Some(normalized) = normalized.as_mut() {
407+
normalized.push(*c);
408+
}
380409
}
381410
ParseState::CDATA => {
382-
last_cdata_idx = value.len();
383-
value.push(*c);
411+
if let Some(normalized) = normalized.as_mut() {
412+
normalized.push(*c);
413+
}
384414
}
385415
},
386416
}
387417
}
388418

389-
// Trim any trailing spaces
390-
if current_state == ParseState::SpaceOrStart {
391-
value.truncate(last_cdata_idx + 1);
419+
match normalized {
420+
Some(normalized) => Cow::Owned(normalized),
421+
None => Cow::Borrowed(trimmed_attr),
392422
}
393-
394-
Cow::Owned(value)
395-
396-
// let mut value: Vec<u8> = Vec::new();
397-
398-
// // TODO: replace sequences of spaces
399-
// for i in 0..attr.len() {
400-
// let ch = attr[i];
401-
// match ch {
402-
// b'\n' => value.push(b' '),
403-
// b'\r' => value.push(b' '),
404-
// b'\t' => value.push(b' '),
405-
// c @ _ => value.push(c),
406-
// }
407-
// }
408-
409-
// // Position where value starts after whitespace.
410-
// let first_non_space_char = value
411-
// .iter()
412-
// .position(|c| !c.is_ascii_whitespace())
413-
// .unwrap_or(0);
414-
// // Position where the trailing whitespace starts.
415-
// let last_non_space_char = value
416-
// .iter()
417-
// .rposition(|c| !c.is_ascii_whitespace())
418-
// .and_then(|idx| Some(idx + 1))
419-
// .unwrap_or(0);
420-
// Cow::Owned(value[first_non_space_char..last_non_space_char].to_vec())
421423
}
422424

423425
impl<'a> Iterator for Attributes<'a> {
@@ -444,7 +446,7 @@ impl<'a> Iterator for Attributes<'a> {
444446
($key:expr, $val:expr) => {
445447
Some(Ok(Attribute {
446448
key: &self.bytes[$key],
447-
value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
449+
value: normalize_attribute_value(&self.bytes[$val]),
448450
}))
449451
};
450452
}
@@ -605,37 +607,27 @@ mod tests {
605607

606608
#[test]
607609
fn attribute_value_normalization() {
610+
// empty value
611+
assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
608612
// return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
609613
assert_eq!(
610-
normalize_attribute_value(Cow::Borrowed(b"\rfoo\rbar\tbaz\ndelta\n")).as_ref(),
614+
normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
611615
b"foo bar baz delta"
612616
);
613617
// leading and trailing spaces must be stripped
614-
assert_eq!(
615-
normalize_attribute_value(Cow::Borrowed(b" foo ")).as_ref(),
616-
b"foo"
617-
);
618+
assert_eq!(normalize_attribute_value(b" foo ").as_ref(), b"foo");
618619
// leading space
619-
assert_eq!(
620-
normalize_attribute_value(Cow::Borrowed(b" bar")).as_ref(),
621-
b"bar"
622-
);
620+
assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
623621
// trailing space
624-
assert_eq!(
625-
normalize_attribute_value(Cow::Borrowed(b"baz ")).as_ref(),
626-
b"baz"
627-
);
622+
assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
628623
// sequences of spaces must be replaced with a single space
629624
assert_eq!(
630-
normalize_attribute_value(Cow::Borrowed(b" foo bar baz ")).as_ref(),
625+
normalize_attribute_value(b" foo bar baz ").as_ref(),
631626
b"foo bar baz"
632627
);
633628
// sequence replacement mixed with characters treated as whitespace (\t \r \n)
634629
assert_eq!(
635-
normalize_attribute_value(Cow::Borrowed(
636-
b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"
637-
))
638-
.as_ref(),
630+
normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
639631
b"foo bar baz delta echo foxtrot"
640632
);
641633
}

0 commit comments

Comments
 (0)