Skip to content

Commit 9307786

Browse files
committed
Properly normalize attribute values
closes #371
1 parent 8a74258 commit 9307786

File tree

2 files changed

+110
-2
lines changed

2 files changed

+110
-2
lines changed

src/escapei.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
131131
}
132132

133133
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
134-
/// value, using a dictionnary of custom entities.
134+
/// value, using a dictionary of custom entities.
135135
///
136136
/// # Pre-condition
137137
///

src/events/attributes.rs

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,87 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331331
}
332332
}
333333

334+
/// Normalize the whitespace of an attribute value, loosely following section 3.3.3
/// of the XML specification (<https://www.w3.org/TR/xml/#AVNormalize>).
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every remaining run of whitespace-like bytes is replaced with a single space (`0x20`).
/// * Character and entity references are **not** substituted yet (see the TODO below).
///
/// The result borrows from `attr` whenever the trimmed value is already in normalized
/// form, i.e. its only interior whitespace consists of isolated single spaces. A new
/// buffer is allocated only once a byte actually has to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim leading whitespace; an empty or all-whitespace value normalizes to nothing.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(idx) => idx,
        None => return Cow::Borrowed(&attr[..0]),
    };
    // A non-whitespace byte exists, so the reverse search always succeeds; falling back
    // to `begin` keeps the inclusive range below in bounds regardless.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Created lazily, the first time the output has to differ from `trimmed`.
    let mut normalized: Option<Vec<u8>> = None;
    // True while the previous byte belonged to a whitespace run.
    let mut in_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_whitespace_like(byte) {
            in_run = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
            continue;
        }

        let starts_run = !in_run;
        in_run = true;

        match normalized.as_mut() {
            // Already copying: emit one space per run, drop the rest of the run.
            Some(buf) => {
                if starts_run {
                    buf.push(b' ');
                }
            }
            None => {
                // A literal space opening a run is already in normalized form.
                if starts_run && byte == b' ' {
                    continue;
                }
                // Everything before `idx` is verbatim output. If this byte continues a
                // run, that prefix already ends with the run's single space.
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                if starts_run {
                    buf.push(b' ');
                }
                normalized = Some(buf);
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
414+
334415
impl<'a> Iterator for Attributes<'a> {
335416
type Item = Result<Attribute<'a>>;
336417
fn next(&mut self) -> Option<Self::Item> {
@@ -355,7 +436,7 @@ impl<'a> Iterator for Attributes<'a> {
355436
($key:expr, $val:expr) => {
356437
Some(Ok(Attribute {
357438
key: &self.bytes[$key],
358-
value: Cow::Borrowed(&self.bytes[$val]),
439+
value: normalize_attribute_value(&self.bytes[$val]),
359440
}))
360441
};
361442
}
@@ -513,4 +594,31 @@ mod tests {
513594
assert_eq!(&*a.value, b"ee");
514595
assert!(attributes.next().is_none());
515596
}
597+
598+
#[test]
fn attribute_value_normalization() {
    // Each entry pairs a raw attribute value with the value expected after normalization.
    let cases: [(&[u8], &[u8]); 7] = [
        // an empty value stays empty
        (b"", b""),
        // \r, \t and \n (0xD, 0x9, 0xA) each become a space; those at the ends are trimmed
        (b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta"),
        // whitespace on both ends is stripped
        (b" foo ", b"foo"),
        // only leading whitespace
        (b" bar", b"bar"),
        // only trailing whitespace
        (b"baz ", b"baz"),
        // interior runs of spaces collapse to one space
        (b" foo bar baz ", b"foo bar baz"),
        // runs mixing spaces with \t, \r and \n collapse to one space as well
        (
            b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
            b"foo bar baz delta echo foxtrot",
        ),
    ];

    for (raw, expected) in cases.iter() {
        assert_eq!(normalize_attribute_value(raw).as_ref(), *expected);
    }
}
516624
}

0 commit comments

Comments
 (0)