@@ -331,6 +331,87 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331
331
}
332
332
}
333
333
334
/// Normalize the attribute value according to XML specification section 3.3.3.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value
/// * Every remaining run of whitespace-like bytes is replaced with a single space (`0x20`)
/// * Character and entity references are NOT substituted yet (see the TODO in the body)
///
/// Returns `Cow::Borrowed` whenever the normalized value is empty or is a plain sub-slice of
/// `attr`; a new buffer (`Cow::Owned`) is allocated only when at least one byte has to be
/// replaced or dropped.
///
/// NOTE(review): the spec appears to restrict trimming/collapsing to attributes whose declared
/// type is not CDATA; this function applies it unconditionally -- confirm that this is intended.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    /// The four bytes the XML specification treats as white space.
    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim whitespace-like bytes from the front; if nothing else is left the value is empty.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(idx) => idx,
        None => return Cow::Borrowed(b""),
    };
    // A non-whitespace byte exists (see above), so the reverse search always succeeds;
    // `begin` is merely an in-bounds fallback that keeps the slice below panic-free.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Allocated lazily, at the first byte whose normalized form differs from the input.
    let mut normalized: Option<Vec<u8>> = None;
    // `true` while the previous byte was whitespace-like. The trimmed slice starts with
    // character data, so we begin outside of a whitespace run.
    let mut in_whitespace_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        let is_space = is_whitespace_like(byte);
        // What this input byte contributes to the output:
        // * character data is copied verbatim,
        // * the first byte of a whitespace run becomes a single space,
        // * every further byte of the run is dropped.
        let emitted = if !is_space {
            Some(byte)
        } else if in_whitespace_run {
            None
        } else {
            Some(b' ')
        };
        in_whitespace_run = is_space;

        match normalized.as_mut() {
            Some(buf) => buf.extend(emitted),
            // First deviation from the input: copy everything processed so far and switch
            // over to the owned buffer.
            None if emitted != Some(byte) => {
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                buf.extend(emitted);
                normalized = Some(buf);
            }
            // Output still matches the input byte-for-byte; keep borrowing.
            None => {}
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
414
+
334
415
impl < ' a > Iterator for Attributes < ' a > {
335
416
type Item = Result < Attribute < ' a > > ;
336
417
fn next ( & mut self ) -> Option < Self :: Item > {
@@ -355,7 +436,7 @@ impl<'a> Iterator for Attributes<'a> {
355
436
( $key: expr, $val: expr) => {
356
437
Some ( Ok ( Attribute {
357
438
key: & self . bytes[ $key] ,
358
- value: Cow :: Borrowed ( & self . bytes[ $val] ) ,
439
+ value: normalize_attribute_value ( & self . bytes[ $val] ) ,
359
440
} ) )
360
441
} ;
361
442
}
@@ -513,4 +594,31 @@ mod tests {
513
594
assert_eq ! ( & * a. value, b"ee" ) ;
514
595
assert ! ( attributes. next( ) . is_none( ) ) ;
515
596
}
597
+
598
#[test]
fn attribute_value_normalization() {
    /// Asserts that normalizing `raw` yields exactly `expected`.
    fn assert_normalizes_to(raw: &[u8], expected: &[u8]) {
        assert_eq!(
            normalize_attribute_value(raw).as_ref(),
            expected,
            "unexpected normalization of {:?}",
            raw
        );
    }

    // empty value
    assert_normalizes_to(b"", b"");
    // a value made up solely of whitespace-like characters normalizes to the empty value
    assert_normalizes_to(b" \t\r\n ", b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_normalizes_to(
        b"\r foo\r bar\t baz\n delta\n ",
        b"foo bar baz delta",
    );
    // leading and trailing spaces must be stripped
    assert_normalizes_to(b" foo ", b"foo");
    // leading space
    assert_normalizes_to(b" bar", b"bar");
    // trailing space
    assert_normalizes_to(b"baz ", b"baz");
    // single spaces between words are kept as they are
    assert_normalizes_to(b" foo bar baz ", b"foo bar baz");
    // sequences of spaces must be replaced with a single space
    assert_normalizes_to(b"   foo   bar    baz  ", b"foo bar baz");
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_normalizes_to(
        b" \t foo\t bar \r baz \n \n delta\n \t \r echo foxtrot\r ",
        b"foo bar baz delta echo foxtrot",
    );
}
516
624
}
0 commit comments