Split off most encoding-related tests to a separate file

dralley · dralley · commit 75319710dd0e · 2022-08-15T00:23:25.000-04:00
Write a few new encoding tests

Fix up the doc comments on the `decode` and `decode_with_bom_removal` functions
diff --git a/README.md b/README.md
@@ -270,7 +270,6 @@ Note that despite not focusing on performance (there are several unnecessary cop
 Benchmarking is hard and the results depend on your input file and your machine.
 
 Here on my particular file, quick-xml is around **50 times faster** than [xml-rs](https://crates.io/crates/xml-rs) crate.
-_(measurements was done while this crate named quick-xml)_
 
 ```
 // quick-xml benches
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -105,10 +105,9 @@ impl Decoder {
     }
 }
 
-/// Decodes the provided bytes using the specified encoding, ignoring the BOM
-/// if it is present in the `bytes`.
+/// Decodes the provided bytes using the specified encoding.
 ///
-/// Returns an error in case of malformed sequences in the `bytes`.
+/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
 #[cfg(feature = "encoding")]
 pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
     encoding
@@ -119,7 +118,7 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
 /// Decodes a slice with an unknown encoding, removing the BOM if it is present
 /// in the bytes.
 ///
-/// Returns an error in case of malformed sequences in the `bytes`.
+/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
 #[cfg(feature = "encoding")]
 pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
     if let Some(encoding) = detect_encoding(bytes) {
@@ -185,5 +184,3 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
         _ => None,
     }
 }
-
-// TODO: add some tests for functions
diff --git a/tests/documents/utf8.xml b/tests/documents/utf8.xml
@@ -0,0 +1,3 @@
+﻿<?xml version="1.0"?>
+<project name="project-name">
+</project>
diff --git a/tests/encodings.rs b/tests/encodings.rs
@@ -0,0 +1,87 @@
+use quick_xml::events::Event;
+use quick_xml::Reader;
+
+#[cfg(feature = "encoding")]
+mod decode {
+    use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8};
+    use quick_xml::encoding::*;
+    use std::borrow::Cow;
+
+    static UTF16BE_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf16be.xml");
+    static UTF16LE_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf16le.xml");
+    static UTF8_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf8.xml");
+
+    static UTF8_TEXT: &str = r#"<?xml version="1.0"?>
+<project name="project-name">
+</project>
+"#;
+
+    #[test]
+    fn test_removes_bom() {
+        // No BOM
+        assert_eq!(
+            decode_with_bom_removal(UTF8_TEXT.as_bytes()).unwrap(),
+            Cow::Borrowed(UTF8_TEXT)
+        );
+        // BOM
+        assert_eq!(
+            decode_with_bom_removal(UTF8_TEXT_WITH_BOM).unwrap(),
+            Cow::Borrowed(UTF8_TEXT)
+        );
+        assert_eq!(
+            decode_with_bom_removal(UTF16BE_TEXT_WITH_BOM).unwrap(),
+            Cow::Borrowed(UTF8_TEXT).into_owned()
+        );
+        assert_eq!(
+            decode_with_bom_removal(UTF16LE_TEXT_WITH_BOM).unwrap(),
+            Cow::Borrowed(UTF8_TEXT).into_owned()
+        );
+    }
+
+    #[test]
+    fn test_detect_encoding() {
+        // No BOM
+        assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8));
+        // BOM
+        assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8));
+        assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE));
+        assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE));
+    }
+}
+
+#[test]
+#[cfg(feature = "encoding")]
+fn test_koi8_r_encoding() {
+    let src = include_bytes!("documents/opennews_all.rss").as_ref();
+    let mut buf = vec![];
+    let mut r = Reader::from_reader(src);
+    r.trim_text(true).expand_empty_elements(false);
+    loop {
+        match r.read_event_into(&mut buf) {
+            Ok(Event::Text(e)) => {
+                e.unescape().unwrap();
+            }
+            Ok(Event::Eof) => break,
+            _ => (),
+        }
+    }
+}
+
+#[test]
+#[cfg(feature = "encoding")]
+fn fuzz_53() {
+    use std::io::Cursor;
+
+    let data: &[u8] = b"\xe9\x00\x00\x00\x00\x00\x00\x00\x00\
+\x00\x00\x00\x00\n(\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\
+\x00<>\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<<\x00\x00\x00";
+    let cursor = Cursor::new(data);
+    let mut reader = Reader::from_reader(cursor);
+    let mut buf = vec![];
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Eof) | Err(..) => break,
+            _ => buf.clear(),
+        }
+    }
+}
diff --git a/tests/test.rs b/tests/test.rs
@@ -1,6 +1,7 @@
 use quick_xml::name::QName;
 use quick_xml::{events::attributes::Attribute, events::Event::*, Error, Reader};
-use std::{borrow::Cow, io::Cursor};
+
+use std::borrow::Cow;
 
 #[cfg(feature = "serialize")]
 use serde::{Deserialize, Serialize};
@@ -89,40 +90,6 @@ fn test_comment_starting_with_gt() {
     }
 }
 
-#[test]
-#[cfg(feature = "encoding")]
-fn test_koi8_r_encoding() {
-    let src = include_bytes!("documents/opennews_all.rss").as_ref();
-    let mut buf = vec![];
-    let mut r = Reader::from_reader(src);
-    r.trim_text(true).expand_empty_elements(false);
-    loop {
-        match r.read_event_into(&mut buf) {
-            Ok(Text(e)) => {
-                e.unescape().unwrap();
-            }
-            Ok(Eof) => break,
-            _ => (),
-        }
-    }
-}
-
-#[test]
-fn fuzz_53() {
-    let data: &[u8] = b"\xe9\x00\x00\x00\x00\x00\x00\x00\x00\
-\x00\x00\x00\x00\n(\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\
-\x00<>\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<<\x00\x00\x00";
-    let cursor = Cursor::new(data);
-    let mut reader = Reader::from_reader(cursor);
-    let mut buf = vec![];
-    loop {
-        match reader.read_event_into(&mut buf) {
-            Ok(Eof) | Err(..) => break,
-            _ => buf.clear(),
-        }
-    }
-}
-
 #[test]
 fn test_issue94() {
     let data = br#"<Run>
@@ -138,37 +105,6 @@ fn test_issue94() {
     }
 }
 
-#[test]
-fn fuzz_101() {
-    let data: &[u8] = b"\x00\x00<\x00\x00\x0a>&#44444444401?#\x0a413518\
-                       #\x0a\x0a\x0a;<:<)(<:\x0a\x0a\x0a\x0a;<:\x0a\x0a\
-                       <:\x0a\x0a\x0a\x0a\x0a<\x00*\x00\x00\x00\x00";
-    let cursor = Cursor::new(data);
-    let mut reader = Reader::from_reader(cursor);
-    let mut buf = vec![];
-    loop {
-        match reader.read_event_into(&mut buf) {
-            Ok(Start(e)) | Ok(Empty(e)) => {
-                for a in e.attributes() {
-                    if a.ok()
-                        .map_or(true, |a| a.decode_and_unescape_value(&reader).is_err())
-                    {
-                        break;
-                    }
-                }
-            }
-            Ok(Text(e)) => {
-                if e.unescape().is_err() {
-                    break;
-                }
-            }
-            Ok(Eof) | Err(..) => break,
-            _ => (),
-        }
-        buf.clear();
-    }
-}
-
 #[test]
 fn test_no_trim() {
     let mut reader = Reader::from_str(" <tag> text </tag> ");

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+<?xml version="1.0"?>`
	`2`	`+<project name="project-name">`
	`3`	`+</project>`