diff --git a/Changelog.md b/Changelog.md index c3053257..f38f18b8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -27,6 +27,8 @@ the XML declared encoding and always use UTF-8 - [#416]: Add `borrow()` methods in all event structs which allows to get a borrowed version of any event +- [#436]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()` + under the `quick-xml::encoding` namespace. ### Bug Fixes @@ -137,6 +139,13 @@ - [#423]: All escaping functions now accepts and returns strings instead of byte slices - [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array, but since now escaping works on strings. Use `BytesText::from_plain_str` instead +- [#425]: Split the internal implementation of `Reader` into multiple files to better separate the + buffered and unbuffered implementations. The unbuffered methods, e.g. `read_event()`, + will no longer be available when reading from a slice. +- [#436]: When using `Reader` with raw bytes, a buffered parsing implementation will always be used. + If using `Reader::from_str()`, the reader will borrow directly from the `&str`. If you have a byte + array known to be valid UTF-8, it is recommended to convert it to `&str` first, which will enable + the unbuffered (borrowing) implementation. 
### New Tests @@ -167,6 +176,8 @@ [#418]: https://github.com/tafia/quick-xml/pull/418 [#421]: https://github.com/tafia/quick-xml/pull/421 [#423]: https://github.com/tafia/quick-xml/pull/423 +[#425]: https://github.com/tafia/quick-xml/pull/425 +[#436]: https://github.com/tafia/quick-xml/pull/436 ## 0.23.0 -- 2022-05-08 diff --git a/README.md b/README.md index 6fa273ee..031c0175 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,11 @@ let xml = r#" Test Test 2 "#; -let mut reader = Reader::from_str(xml); +let mut reader = Reader::from_reader(xml.as_bytes()); +// If you want to read from a string or byte slice without buffering, use: +// let mut reader = Reader::from_str(xml); +// In that case, `Vec` is *not* needed for buffering below and you should use +// `read_event` instead of `read_event_into`. reader.trim_text(true); let mut count = 0; diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 3358f3a4..358c086a 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -3,23 +3,23 @@ use quick_xml::events::Event; use quick_xml::Reader; use quick_xml::Result as XmlResult; -static RPM_PRIMARY: &[u8] = include_bytes!("../tests/documents/rpm_primary.xml"); -static RPM_PRIMARY2: &[u8] = include_bytes!("../tests/documents/rpm_primary2.xml"); -static RPM_FILELISTS: &[u8] = include_bytes!("../tests/documents/rpm_filelists.xml"); -static RPM_OTHER: &[u8] = include_bytes!("../tests/documents/rpm_other.xml"); -static LIBREOFFICE_DOCUMENT: &[u8] = include_bytes!("../tests/documents/libreoffice_document.fodt"); -static DOCUMENT: &[u8] = include_bytes!("../tests/documents/document.xml"); -static TEST_WRITER_INDENT: &[u8] = include_bytes!("../tests/documents/test_writer_indent.xml"); -static SAMPLE_1: &[u8] = include_bytes!("../tests/documents/sample_1.xml"); -static LINESCORE: &[u8] = include_bytes!("../tests/documents/linescore.xml"); -static SAMPLE_RSS: &[u8] = include_bytes!("../tests/documents/sample_rss.xml"); -static SAMPLE_NS: &[u8] = 
include_bytes!("../tests/documents/sample_ns.xml"); -static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); +static RPM_PRIMARY: &str = include_str!("../tests/documents/rpm_primary.xml"); +static RPM_PRIMARY2: &str = include_str!("../tests/documents/rpm_primary2.xml"); +static RPM_FILELISTS: &str = include_str!("../tests/documents/rpm_filelists.xml"); +static RPM_OTHER: &str = include_str!("../tests/documents/rpm_other.xml"); +static LIBREOFFICE_DOCUMENT: &str = include_str!("../tests/documents/libreoffice_document.fodt"); +static DOCUMENT: &str = include_str!("../tests/documents/document.xml"); +static TEST_WRITER_INDENT: &str = include_str!("../tests/documents/test_writer_indent.xml"); +static SAMPLE_1: &str = include_str!("../tests/documents/sample_1.xml"); +static LINESCORE: &str = include_str!("../tests/documents/linescore.xml"); +static SAMPLE_RSS: &str = include_str!("../tests/documents/sample_rss.xml"); +static SAMPLE_NS: &str = include_str!("../tests/documents/sample_ns.xml"); +static PLAYERS: &str = include_str!("../tests/documents/players.xml"); // TODO: read the namespaces too // TODO: use fully normalized attribute values -fn parse_document(doc: &[u8]) -> XmlResult<()> { - let mut r = Reader::from_reader(doc); +fn parse_document(doc: &str) -> XmlResult<()> { + let mut r = Reader::from_str(doc); loop { match r.read_event()? 
{ Event::Start(e) | Event::Empty(e) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 8bbe1a67..75b08fa1 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -5,8 +5,8 @@ use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::Reader; -static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml"); -static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); +static SAMPLE: &str = include_str!("../tests/documents/sample_rss.xml"); +static PLAYERS: &str = include_str!("../tests/documents/players.xml"); static LOREM_IPSUM_TEXT: &str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt @@ -29,17 +29,15 @@ fn read_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_event"); group.bench_function("trim_text = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_str(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -50,19 +48,17 @@ fn read_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_str(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -79,18 +75,16 @@ fn read_namespaced_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_namespaced_event"); group.bench_function("trim_text = false", |b| { 
b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_str(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -101,20 +95,18 @@ fn read_namespaced_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_str(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -129,79 +121,67 @@ fn read_namespaced_event(c: &mut Criterion) { fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); group.bench_function("StartText", |b| { - let src = "Hello world!".repeat(512 / 12).into_bytes(); - let mut buf = Vec::with_capacity(1024); + let src = "Hello world!".repeat(512 / 12); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_str(&src); let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::StartText(e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 504); }) }); group.bench_function("Start", |b| { - let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf 
= Vec::with_capacity(1024); + let src = format!(r#""#, "world".repeat(512 / 5)); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_str(&src); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 525); }) }); group.bench_function("Comment", |b| { - let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); + let src = format!(r#""#, "world".repeat(512 / 5)); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_str(&src); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 520); }) }); group.bench_function("CData", |b| { - let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); + let src = format!(r#""#, "world".repeat(512 / 5)); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_str(&src); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::CData(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 518); }) }); @@ -213,12 +193,11 @@ fn attributes(c: &mut Criterion) { let mut group = c.benchmark_group("attributes"); group.bench_function("with_checks = true", |b| { b.iter(|| { - let mut r = 
Reader::from_reader(PLAYERS); + let mut r = Reader::from_str(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes() { let _attr = attr.unwrap(); @@ -228,7 +207,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -236,12 +214,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("with_checks = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_str(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes().with_checks(false) { let _attr = attr.unwrap(); @@ -251,7 +228,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -259,12 +235,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("try_get_attribute", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_str(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = e.try_get_attribute(name).unwrap() { @@ -279,7 +254,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 150); }) diff --git a/examples/read_buffered.rs b/examples/read_buffered.rs new file mode 100644 index 00000000..25b28ee2 --- /dev/null +++ b/examples/read_buffered.rs @@ -0,0 +1,34 @@ +// This example demonstrates how a reader 
(for example when reading from a file) +// can be buffered. In that case, data read from the file is written to a supplied +// buffer and returned XML events borrow from that buffer. +// That way, allocations can be kept to a minimum. + +fn main() -> Result<(), quick_xml::Error> { + use quick_xml::events::Event; + use quick_xml::Reader; + + let mut reader = Reader::from_file("tests/documents/document.xml")?; + reader.trim_text(true); + + let mut buf = Vec::new(); + + let mut count = 0; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + let name = e.name(); + let name = reader.decoder().decode(name.as_ref())?; + println!("read start event {:?}", name.as_ref()); + count += 1; + } + Ok(Event::Eof) => break, // exits the loop when reaching end of file + Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + _ => (), // There are several other `Event`s we do not consider here + } + } + + println!("read {} start events in total", count); + + Ok(()) +} diff --git a/examples/read_texts.rs b/examples/read_texts.rs index 40d71e63..70be0b5c 100644 --- a/examples/read_texts.rs +++ b/examples/read_texts.rs @@ -10,14 +10,13 @@ fn main() { reader.trim_text(true); let mut txt = Vec::new(); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => { txt.push( reader - .read_text_into(QName(b"tag2"), &mut Vec::new()) + .read_text(QName(b"tag2")) .expect("Cannot decode text value"), ); println!("{:?}", txt); @@ -26,6 +25,5 @@ fn main() { Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), _ => (), // There are several other `Event`s we do not consider here } - buf.clear(); } } diff --git a/src/de/escape.rs b/src/de/escape.rs index badc3299..e9eff985 100644 --- a/src/de/escape.rs +++ b/src/de/escape.rs @@ -1,9 +1,9 @@ //! 
Serde `Deserializer` module use crate::de::deserialize_bool; +use crate::encoding::Decoder; use crate::errors::serialize::DeError; use crate::escape::unescape; -use crate::reader::Decoder; use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor}; use serde::{self, forward_to_deserialize_any, serde_if_integer128}; use std::borrow::Cow; diff --git a/src/de/mod.rs b/src/de/mod.rs index e564e041..fd3caa46 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -215,10 +215,10 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ + encoding::Decoder, errors::Error, events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, name::QName, - reader::Decoder, Reader, }; use serde::de::{self, Deserialize, DeserializeOwned, Visitor}; @@ -306,8 +306,8 @@ where } /// Deserialize from a reader. This method will do internal copies of data -/// readed from `reader`. If you want have a `&[u8]` or `&str` input and want -/// to borrow as much as possible, use [`from_slice`] or [`from_str`] +/// read from `reader`. 
If you have a `&str` input and want +/// to borrow as much as possible, use [`from_str`] pub fn from_reader(reader: R) -> Result where R: BufRead, @@ -685,17 +685,7 @@ where impl<'de> Deserializer<'de, SliceReader<'de>> { /// Create new deserializer that will borrow data from the specified string pub fn from_str(s: &'de str) -> Self { - Self::from_borrowing_reader(Reader::from_str(s)) - } - - /// Create new deserializer that will borrow data from the specified byte array - pub fn from_slice(bytes: &'de [u8]) -> Self { - Self::from_borrowing_reader(Reader::from_bytes(bytes)) - } - - /// Create new deserializer that will borrow data from the specified borrowing reader - #[inline] - fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self { + let mut reader = Reader::from_str(s); reader .expand_empty_elements(true) .check_end_names(true) @@ -726,6 +716,13 @@ where } } +impl<'de> Deserializer<'de, IoReader<&'de [u8]>> { + /// Create new deserializer that reads the specified byte array through a buffered reader (data is copied, not borrowed) + pub fn from_slice(bytes: &'de [u8]) -> Self { + Self::from_reader(bytes) + } +} + impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R> where R: XmlRead<'de>, @@ -930,7 +927,7 @@ pub trait XmlRead<'i> { /// You cannot create it, it is created automatically when you call /// [`Deserializer::from_reader`] pub struct IoReader { - reader: Reader, + reader: Reader>, buf: Vec, } @@ -970,12 +967,12 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { } } -/// XML input source that reads from a slice of bytes and can borrow from it. +/// XML input source that reads from a `&str` and can borrow from it. 
/// /// You cannot create it, it is created automatically when you call -/// [`Deserializer::from_str`] or [`Deserializer::from_slice`] +/// [`Deserializer::from_str`] pub struct SliceReader<'de> { - reader: Reader<&'de [u8]>, + reader: Reader>, } impl<'de> XmlRead<'de> for SliceReader<'de> { @@ -1025,8 +1022,8 @@ mod tests { /// Checks that `peek()` and `read()` behaves correctly after `skip()` #[test] fn read_and_peek() { - let mut de = Deserializer::from_slice( - br#" + let mut de = Deserializer::from_str( + r#" text @@ -1166,8 +1163,8 @@ mod tests { /// Checks that `read_to_end()` behaves correctly after `skip()` #[test] fn read_to_end() { - let mut de = Deserializer::from_slice( - br#" + let mut de = Deserializer::from_str( + r#" text @@ -1270,8 +1267,8 @@ mod tests { item: Vec<()>, } - let mut de = Deserializer::from_slice( - br#" + let mut de = Deserializer::from_str( + r#" @@ -1296,8 +1293,8 @@ mod tests { fn read_to_end() { use crate::de::DeEvent::*; - let mut de = Deserializer::from_slice( - br#" + let mut de = Deserializer::from_str( + r#" textcontent @@ -1343,15 +1340,14 @@ mod tests { Some text - "## - .as_bytes(); + "##; let mut reader1 = IoReader { - reader: Reader::from_reader(s), + reader: Reader::from_reader(s.as_bytes()), buf: Vec::new(), }; let mut reader2 = SliceReader { - reader: Reader::from_bytes(s), + reader: Reader::from_str(s), }; loop { @@ -1373,11 +1369,10 @@ mod tests { - "## - .as_bytes(); + "##; let mut reader = SliceReader { - reader: Reader::from_bytes(s), + reader: Reader::from_str(s), }; reader diff --git a/src/de/seq.rs b/src/de/seq.rs index fe4559bd..d7595632 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,6 +1,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; +use crate::encoding::Decoder; use crate::events::BytesStart; -use crate::reader::Decoder; use serde::de::{DeserializeSeed, SeqAccess}; /// Check if tag `start` is included in the `fields` list. 
`decoder` is used to diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index dc0b157a..580c6312 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -4,9 +4,9 @@ //! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition use crate::de::{deserialize_bool, str2bool}; +use crate::encoding::Decoder; use crate::errors::serialize::DeError; use crate::escape::unescape; -use crate::reader::Decoder; use memchr::memchr; use serde::de::{DeserializeSeed, Deserializer, EnumAccess, SeqAccess, VariantAccess, Visitor}; use serde::{self, serde_if_integer128}; diff --git a/src/encoding.rs b/src/encoding.rs new file mode 100644 index 00000000..0b98a209 --- /dev/null +++ b/src/encoding.rs @@ -0,0 +1,200 @@ +//! A module for wrappers that encode / decode data. + +use std::borrow::Cow; + +#[cfg(feature = "encoding")] +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; + +use crate::{Error, Result}; + +/// Decoder of byte slices into strings. This is a lightweight object that can be copied. +/// +/// If feature `encoding` is enabled, the encoding is taken from the `"encoding"` +/// XML declaration; UTF-8 is assumed if the XML has no declaration, the encoding +/// key is not defined, or it contains an unknown encoding. +/// +/// The library supports any UTF-8 compatible encoding that the `encoding_rs` crate +/// supports. [*UTF-16 is not supported at present*][utf16]. +/// +/// If feature `encoding` is disabled, the decoder is always a UTF-8 decoder: +/// any XML declarations are ignored. +/// +/// [utf16]: https://github.com/tafia/quick-xml/issues/158 +#[derive(Clone, Copy, Debug)] +pub struct Decoder { + #[cfg(feature = "encoding")] + pub(crate) encoding: &'static Encoding, +} + +#[cfg(not(feature = "encoding"))] +impl Decoder { + /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if + /// it is present in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. 
+ /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + #[inline] + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + Ok(Cow::Borrowed(std::str::from_utf8(bytes)?)) + } + + /// Decodes a slice regardless of XML declaration with BOM removal if + /// it is present in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + &bytes[3..] + } else { + bytes + }; + self.decode(bytes) + } +} + +#[cfg(feature = "encoding")] +impl Decoder { + /// Returns the `Reader`s encoding. + /// + /// This encoding will be used by [`decode`]. + /// + /// [`decode`]: Self::decode + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + + /// Decodes specified bytes using encoding, declared in the XML, if it was + /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present + /// in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + decode(bytes, self.encoding) + } + + /// Decodes a slice with BOM removal if it is present in the `bytes` using + /// the reader encoding. + /// + /// If this method called after reading XML declaration with the `"encoding"` + /// key, then this encoding is used, otherwise UTF-8 is used. + /// + /// If XML declaration is absent in the XML, UTF-8 is used. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + self.decode(remove_bom(bytes, self.encoding)) + } +} + +/// Decodes the provided bytes using the specified encoding, ignoring the BOM +/// if it is present in the `bytes`. +/// +/// Returns an error in case of malformed sequences in the `bytes`. 
+#[cfg(feature = "encoding")] +pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result> { + encoding + .decode_without_bom_handling_and_without_replacement(bytes) + .ok_or(Error::NonDecodable(None)) +} + +/// Decodes a slice with an unknown encoding, removing the BOM if it is present +/// in the bytes. +/// +/// Returns an error in case of malformed sequences in the `bytes`. +#[cfg(feature = "encoding")] +pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result> { + if let Some(encoding) = detect_encoding(bytes) { + let bytes = remove_bom(bytes, encoding); + decode(bytes, encoding) + } else { + decode(bytes, UTF_8) + } +} + +#[cfg(feature = "encoding")] +fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) { + if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + bytes.split_at(3) + } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) { + bytes.split_at(2) + } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) { + bytes.split_at(2) + } else { + (&[], bytes) + } +} + +#[cfg(feature = "encoding")] +fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { + let (_, bytes) = split_at_bom(bytes, encoding); + bytes +} + +/// This implementation is required for tests of other parts of the library +#[cfg(test)] +#[cfg(feature = "serialize")] +impl Decoder { + pub(crate) fn utf8() -> Self { + Decoder { + #[cfg(feature = "encoding")] + encoding: UTF_8, + } + } + + #[cfg(feature = "encoding")] + pub(crate) fn utf16() -> Self { + Decoder { encoding: UTF_16LE } + } +} + +/// Automatic encoding detection of XML files based on the +/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). +/// +/// The algorithm suggests examining up to the first 4 bytes to determine encoding +/// according to the following table: +/// +/// | Bytes |Detected encoding +/// |-------------|------------------------------------------ +/// |`00 00 FE FF`|UCS-4, 
big-endian machine (1234 order) +/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order) +/// |`00 00 FF FE`|UCS-4, unusual octet order (2143) +/// |`FE FF 00 00`|UCS-4, unusual octet order (3412) +/// |`FE FF ## ##`|UTF-16, big-endian +/// |`FF FE ## ##`|UTF-16, little-endian +/// |`EF BB BF` |UTF-8 +/// |-------------|------------------------------------------ +/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234) +/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321) +/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143) +/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412) +/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) +/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) +/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably +/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use) +/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind +/// +/// Because [`encoding_rs`] crate supported only subset of those encodings, only +/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. 
+/// +/// If encoding is detected, `Some` is returned, otherwise `None` is returned. +#[cfg(feature = "encoding")] +pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { + match bytes { + // with BOM + _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), + _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), + _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), + + // without BOM + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible + + _ => None, + } +} + +// TODO: add tests from these functions diff --git a/src/events/mod.rs b/src/events/mod.rs index b2672edf..54bb8654 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -41,10 +41,11 @@ use std::fmt::{self, Debug, Formatter}; use std::ops::Deref; use std::str::from_utf8; +use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; -use crate::reader::{Decoder, Reader}; +use crate::reader::Reader; use crate::utils::write_cow_string; use attributes::{Attribute, Attributes}; @@ -983,8 +984,9 @@ pub enum Event<'a> { /// let xml = b"\xEF\xBB\xBF"; /// let mut reader = Reader::from_bytes(xml); /// let mut events_processed = 0; + /// let mut event_buffer = Vec::new(); /// loop { - /// match reader.read_event() { + /// match reader.read_event_into(&mut event_buffer) { /// Ok(Event::StartText(e)) => { /// assert_eq!(events_processed, 0); /// // Content contains BOM diff --git a/src/lib.rs b/src/lib.rs index f42ae359..84579845 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,7 @@ #[cfg(feature = "serialize")] pub mod de; +pub mod encoding; mod errors; mod escapei; pub mod escape { @@ 
-62,8 +63,9 @@ pub mod utils; mod writer; // reexports +pub use crate::encoding::Decoder; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::{Decoder, Reader}; +pub use crate::reader::{BufferedReader, Reader, SliceReader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs new file mode 100644 index 00000000..e9ced5f3 --- /dev/null +++ b/src/reader/buffered_reader.rs @@ -0,0 +1,725 @@ +//! This is an implementation of [`Reader`] for reading from a [`Read`] or [`BufRead`] as +//! underlying byte stream. + +use std::fs::File; +use std::io::{self, BufRead, BufReader, Read}; +use std::ops::{Deref, DerefMut}; +use std::path::Path; + +use crate::events::{BytesText, Event}; +use crate::name::{QName, ResolveResult}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use crate::encoding::detect_encoding; +#[cfg(feature = "encoding")] +use crate::reader::EncodingRef; +use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// Private functions for a [`Reader`] based on an [`BufferedReader`]. +impl Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_impl<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), + TagState::Opened => self.read_until_close(buf), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. 
+ /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'buf>( + &mut self, + buf: &'buf mut Vec, + first: bool, + ) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_impl(buf); + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. 
+ fn read_until_close<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Public reading methods for a [`Reader`] based on an [`BufferedReader`]. +impl Reader> { + /// Reads the next `Event`. + /// + /// This is the main entry point for reading XML `Event`s. + /// + /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` + /// internally). + /// + /// Having the possibility to control the internal buffers gives you some additional benefits + /// such as: + /// + /// - Reduce the number of allocations by reusing the same buffer. For constrained systems, + /// you can call `buf.clear()` once you are done with processing the event (typically at the + /// end of your loop). + /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). 
+ /// + /// # Examples + /// + /// ``` + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// // This explicitly uses `from_reader(xml.as_bytes())` to use a buffered reader instead of + /// // relying on the zero-copy optimizations for reading from byte slices. + /// let mut reader = Reader::from_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_event_into(&mut buf) { + /// Ok(Event::Start(ref e)) => count += 1, + /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok(Event::Eof) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + #[inline] + pub fn read_event_into<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.read_event_impl(buf) + } + + /// Reads the next event and resolves its namespace (if applicable). 
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event_into(&mut buf, &mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event_into<'b, 'ns>( + &mut self, + buf: &'b mut Vec, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event_into(buf); + self.resolve_namespaced_event_inner(event, namespace_buffer) + } + + /// Reads until end element is found using provided buffer as intermediate + /// storage for events content. 
This function is supposed to be called after + /// you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If the corresponding [`End`] event is not found, the [`Error::UnexpectedEof`] + /// will be returned. In particular, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// If your reader was created from a string slice or byte array slice, it is + /// better to use the [`read_to_end()`] method, because it will not copy bytes + /// into an intermediate buffer. + /// + /// The provided `buf` buffer will hold the content of only one event at a time. + /// Before reading each event, the buffer will be cleared. If you know an + /// appropriate size of each event, you can preallocate the buffer to reduce + /// the number of reallocations. + /// + /// The `end` parameter should contain the name of the end element _in the reader + /// encoding_. It is good practice to always get that parameter using the + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events is not checked if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// do not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name`, where both prefixes `a` and `b` resolve to the + /// same namespace, are semantically equivalent, `` cannot close + /// ``, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows how you can skip XML content after you read the + /// start event.
+ /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_reader(r#" + /// + /// + /// + /// + /// + /// + /// + /// + /// "#.as_bytes()); + /// reader.trim_text(true); + /// let mut buf = Vec::new(); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces. + /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`read_to_end()`]: Self::read_to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { + let mut depth = 0; + loop { + buf.clear(); + match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } + + /// Reads optional text between start and end tags. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. 
+ /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { + let s = match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end_into(end, buf)?; + Ok(s) + } +} + +/// Builder for reading from a file. +impl Reader>> { + /// Creates an XML reader from a file path. + pub fn from_file>(path: P) -> Result { + let file = File::open(path).map_err(Error::Io)?; + let reader = BufReader::new(file); + Ok(Self::from_reader_internal(BufferedReader(reader))) + } +} + +/// Builder for reading from a byte slice (`&[u8]`). +impl<'buf> Reader> { + /// Creates an XML reader from a slice of bytes. + pub fn from_bytes(s: &'buf [u8]) -> Self { + Self::from_reader_internal(BufferedReader(s)) + } +} + +/// Builder for reading from any [`BufRead`]. +impl Reader> { + /// Creates an XML reader from any type implementing [`BufRead`].
+ pub fn from_reader(reader: R) -> Self { + Self::from_reader_internal(BufferedReader(reader)) + } +} + +/// Builder for reading from any [`Read`]. +impl Reader>> { + /// Creates an XML reader from any type implementing [`Read`]. + pub fn from_unbuffered_reader(reader: R) -> Self { + Self::from_reader_internal(BufferedReader(BufReader::new(reader))) + } +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// A struct for handling reading functions based on reading from a [`BufRead`]. +#[derive(Debug, Clone)] +pub struct BufferedReader(R); + +impl Deref for BufferedReader { + type Target = R; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for BufferedReader { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl InnerReader for BufferedReader { + type Reader = R; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + +/// Private reading functions. +impl BufferedReader { + #[inline] + fn read_bytes_until<'buf>( + &mut self, + byte: u8, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + // search byte must be within the ascii range + debug_assert!(byte.is_ascii()); + + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.0.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.0.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + fn read_bang_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + // 
Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.0.consume(1); + + let bang_type = BangType::new(self.peek_one()?)?; + + loop { + match self.0.fill_buf() { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Ok(n) if n.is_empty() => return Err(bang_type.to_err()), + Ok(available) => { + if let Some((consumed, used)) = bang_type.parse(available, read) { + buf.extend_from_slice(consumed); + + self.0.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.0.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + } + } + + if read == 0 { + Ok(None) + } else { + Ok(Some((bang_type, &buf[start..]))) + } + } + + #[inline] + fn read_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + let mut state = ReadElementState::Elem; + let mut read = 0; + + let start = buf.len(); + loop { + match self.0.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(available) => { + if let Some((consumed, used)) = state.change(available) { + buf.extend_from_slice(consumed); + + self.0.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.0.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + } + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. 
+ fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.0.fill_buf() { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.0.consume(count); + *position += count; + continue; + } else { + Ok(()) + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + // search byte must be within the ascii range + debug_assert!(byte.is_ascii()); + + match self.peek_one()? { + Some(b) if b == byte => { + *position += 1; + self.0.consume(1); + Ok(true) + } + _ => Ok(false), + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. + fn peek_one(&mut self) -> Result> { + loop { + break match self.0.fill_buf() { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_str(s: &str) -> BufferedReader<&[u8]> { + BufferedReader(s.as_bytes()) + } + + fn reader_from_str(s: &str) -> Reader> { + Reader::from_reader_internal(BufferedReader(s.as_bytes())) + } + + #[allow(dead_code)] + fn reader_from_bytes(bytes: &[u8]) -> Reader> { + Reader::from_reader_internal(BufferedReader(bytes)) + } + + #[cfg(feature = "encoding")] + mod encoding { + use super::reader_from_bytes; + use crate::events::Event; + use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251}; + + mod bytes { + use super::reader_from_bytes; + use super::*; + use pretty_assertions::assert_eq; + + /// Checks that encoding is detected by BOM and changed after XML declaration + #[test] + fn bom_detected() { + let 
mut reader = reader_from_bytes(b"\xFF\xFE"); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_impl(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + reader.read_event_impl(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), WINDOWS_1251); + + assert_eq!(reader.read_event_impl(&mut buf).unwrap(), Event::Eof); + } + + /// Checks that encoding is changed by XML declaration, but only once + #[test] + fn xml_declaration() { + let mut reader = + reader_from_bytes(b""); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_impl(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + reader.read_event_impl(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + assert_eq!(reader.read_event_impl(&mut buf).unwrap(), Event::Eof); + } + } + } + + check!(let mut buf = Vec::new();); +} diff --git a/src/reader.rs b/src/reader/mod.rs similarity index 51% rename from src/reader.rs rename to src/reader/mod.rs index f3a868a7..0c5515a0 100644 --- a/src/reader.rs +++ b/src/reader/mod.rs @@ -1,18 +1,24 @@ //! A module to handle `Reader` -use std::borrow::Cow; -use std::io::{self, BufRead, BufReader}; -use std::{fs::File, path::Path, str::from_utf8}; +use std::ops::{Deref, DerefMut}; +use std::str::from_utf8; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; +use encoding_rs::{Encoding, UTF_8}; +use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +mod buffered_reader; +mod slice_reader; + +pub use self::buffered_reader::BufferedReader; +pub use self::slice_reader::SliceReader; + /// Possible reader states. 
The state transition diagram (`true` and `false` shows /// value of [`Reader::expand_empty_elements()`] option): /// @@ -103,6 +109,15 @@ impl EncodingRef { } } +/// A trait for the underlying abstraction handling the actual reading part for the [`Reader`]. +pub trait InnerReader: Deref + DerefMut { + /// The real type of the inner reader. + type Reader; + + /// Consumes this abstraction, returning the underlying reader. + fn into_inner(self) -> Self::Reader; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A low level encoding-agnostic XML event reader. @@ -119,7 +134,7 @@ impl EncodingRef { /// Test /// Test 2 /// "#; -/// let mut reader = Reader::from_str(xml); +/// let mut reader = Reader::from_reader(xml.as_bytes()); /// reader.trim_text(true); /// /// let mut count = 0; @@ -211,7 +226,7 @@ pub struct Reader { /// Builder methods impl Reader { /// Creates a `Reader` that reads from a given reader. - pub fn from_reader(reader: R) -> Self { + fn from_reader_internal(reader: R) -> Self { Self { reader, opened_buffer: Vec::new(), @@ -334,7 +349,7 @@ impl Reader { } /// Getters -impl Reader { +impl> Reader { /// Consumes `Reader` returning the underlying reader /// /// Can be used to compute line and column of a parsing error position @@ -344,7 +359,7 @@ impl Reader { /// ``` /// # use pretty_assertions::assert_eq; /// use std::{str, io::Cursor}; - /// use quick_xml::Reader; + /// use quick_xml::{BufferedReader, Reader}; /// use quick_xml::events::Event; /// /// let xml = r#" @@ -354,7 +369,7 @@ impl Reader { /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); /// let mut buf = Vec::new(); /// - /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { + /// fn into_line_and_column(reader: Reader>>) -> (usize, usize) { /// let end_pos = reader.buffer_position(); /// let mut cursor = reader.into_inner(); /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) @@
-389,7 +404,7 @@ impl Reader { /// } /// ``` pub fn into_inner(self) -> R { - self.reader + self.reader.into_inner() } /// Gets a reference to the underlying reader. @@ -401,7 +416,10 @@ impl Reader { pub fn get_mut(&mut self) -> &mut R { &mut self.reader } +} +/// Getters that are not specific to any inner reader implementation +impl Reader { /// Gets the current byte position in the input data. /// /// Useful when debugging errors. @@ -472,424 +490,8 @@ impl Reader { } } -/// Read methods -impl Reader { - /// Reads the next `Event`. - /// - /// This is the main entry point for reading XML `Event`s. - /// - /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` - /// internally). - /// - /// Having the possibility to control the internal buffers gives you some additional benefits - /// such as: - /// - /// - Reduce the number of allocations by reusing the same buffer. For constrained systems, - /// you can call `buf.clear()` once you are done with processing the event (typically at the - /// end of your loop). - /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). 
- /// - /// # Examples - /// - /// ``` - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_event_into(&mut buf) { - /// Ok(Event::Start(ref e)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok(Event::Eof) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - #[inline] - pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) - } - - /// Reads the next event and resolves its namespace (if applicable). - /// - /// # Examples - /// - /// ``` - /// use std::str::from_utf8; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// use quick_xml::name::ResolveResult::*; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut ns_buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { - /// Ok((Bound(ns), Event::Start(e))) => { - /// count += 1; - /// match (ns.as_ref(), e.local_name().as_ref()) { - /// (b"www.xxxx", b"tag1") => (), - /// (b"www.yyyy", b"tag2") => (), - /// (ns, n) => panic!("Namespace and local name mismatch"), - /// } - /// println!("Resolved namespace: {:?}", ns); - /// } - /// Ok((Unbound, Event::Start(_))) => { - /// panic!("Element not in any namespace") - /// }, - /// Ok((Unknown(p), Event::Start(_))) => { - /// panic!("Undeclared namespace prefix {:?}", 
String::from_utf8(p)) - /// } - /// Ok((_, Event::Text(e))) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) - /// }, - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok((_, Event::Eof)) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - pub fn read_namespaced_event<'b, 'ns>( - &mut self, - buf: &'b mut Vec, - namespace_buffer: &'ns mut Vec, - ) -> Result<(ResolveResult<'ns>, Event<'b>)> { - if self.pending_pop { - self.ns_resolver.pop(namespace_buffer); - } - self.pending_pop = false; - match self.read_event_into(buf) { - Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), - Ok(Event::Start(e)) => { - self.ns_resolver.push(&e, namespace_buffer); - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Start(e), - )) - } - Ok(Event::Empty(e)) => { - // For empty elements we need to 'artificially' keep the namespace scope on the - // stack until the next `next()` call occurs. - // Otherwise the caller has no chance to use `resolve` in the context of the - // namespace declarations that are 'in scope' for the empty element alone. - // Ex: - self.ns_resolver.push(&e, namespace_buffer); - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Empty(e), - )) - } - Ok(Event::End(e)) => { - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::End(e), - )) - } - Ok(e) => Ok((ResolveResult::Unbound, e)), - Err(e) => Err(e), - } - } - - /// Reads until end element is found using provided buffer as intermediate - /// storage for events content. 
This function is supposed to be called after - /// you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// If your reader created from a string slice or byte array slice, it is - /// better to use [`read_to_end()`] method, because it will not copy bytes - /// into intermediate buffer. - /// - /// The provided `buf` buffer will be filled only by one event content at time. - /// Before reading of each event the buffer will be cleared. If you know an - /// appropriate size of each event, you can preallocate the buffer to reduce - /// number of reallocations. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// let mut buf = Vec::new(); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. - /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`read_to_end()`]: Self::read_to_end - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { - let mut depth = 0; - loop { - buf.clear(); - match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } - - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. 
- /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let mut xml = Reader::from_reader(b" - /// <b> - /// - /// " as &[u8]); - /// xml.trim_text(true); - /// - /// let expected = ["", ""]; - /// for &content in expected.iter() { - /// match xml.read_event_into(&mut Vec::new()) { - /// Ok(Event::Start(ref e)) => { - /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); - /// }, - /// e => panic!("Expecting Start event, found {:?}", e), - /// } - /// } - /// ``` - /// - /// [`Text`]: Event::Text - /// [`End`]: Event::End - pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { - let s = match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), - Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), - Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), - _ => return Err(Error::TextNotFound), - }; - self.read_to_end_into(end, buf)?; - Ok(s) - } -} - -/// Private methods +/// Common parsing code for all reader implementations. impl Reader { - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. 
- fn read_event_impl<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - let event = match self.tag_state { - TagState::Init => self.read_until_open(buf, true), - TagState::Closed => self.read_until_open(buf, false), - TagState::Opened => self.read_until_close(buf), - TagState::Empty => self.close_expanded_empty(), - TagState::Exit => return Ok(Event::Eof), - }; - match event { - Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, - _ => {} - } - event - } - - /// Read until '<' is found and moves reader to an `Opened` state. - /// - /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Opened; - - if self.trim_text_start { - self.reader.skip_whitespace(&mut self.buf_position)?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_impl(buf); - } - - match self - .reader - .read_bytes_until(b'<', buf, &mut self.buf_position) - { - Ok(Some(bytes)) => { - #[cfg(feature = "encoding")] - if first && self.encoding.can_be_refined() { - if let Some(encoding) = detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - } - } - - let content = if self.trim_text_end { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - &bytes[..len] - } else { - bytes - }; - - Ok(if first { - Event::StartText(BytesText::from_escaped(content).into()) - } else { - Event::Text(BytesText::from_escaped(content)) - }) - } - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. 
- fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Closed; - - match self.reader.peek_one() { - // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_start(bytes), - Err(e) => Err(e), - }, - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or `DocType` event fn read_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { @@ -1026,519 +628,49 @@ impl Reader { Ok(Event::Start(BytesStart::borrowed(buf, name_end))) } } -} - -impl Reader> { - /// Creates an XML reader from a file path. - pub fn from_file>(path: P) -> Result { - let file = File::open(path).map_err(Error::Io)?; - let reader = BufReader::new(file); - Ok(Self::from_reader(reader)) - } -} - -impl<'a> Reader<&'a [u8]> { - /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Self { - // Rust strings are guaranteed to be UTF-8, so lock the encoding - #[cfg(feature = "encoding")] - { - let mut reader = Self::from_reader(s.as_bytes()); - reader.encoding = EncodingRef::Explicit(UTF_8); - reader - } - - #[cfg(not(feature = "encoding"))] - Self::from_reader(s.as_bytes()) - } - - /// Creates an XML reader from a slice of bytes. 
- pub fn from_bytes(s: &'a [u8]) -> Self { - Self::from_reader(s) - } - - /// Read an event that borrows from the input rather than a buffer. - #[inline] - pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) - } - - /// Reads until end element is found. This function is supposed to be called - /// after you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. - /// reader.read_to_end(end.name()).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event().unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end(&mut self, end: QName) -> Result<()> { - let mut depth = 0; - loop { - match self.read_event() { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } -} - -/// Represents an input for a reader that can return borrowed data. -/// -/// There are two implementors of this trait: generic one that read data from -/// `Self`, copies some part of it into a provided buffer of type `B` and then -/// returns data that borrow from that buffer. -/// -/// The other implementor is for `&[u8]` and instead of copying data returns -/// borrowed data from `Self` instead. 
This implementation allows zero-copy -/// deserialization. -/// -/// # Parameters -/// - `'r`: lifetime of a buffer from which events will borrow -/// - `B`: a type of a buffer that can be used to store data read from `Self` and -/// from which events can borrow -trait XmlSource<'r, B> { - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte`, which does not include into result. - /// If input (`Self`) is exhausted, returns `None`. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// Some(b"abc".as_ref()) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until comment, CDATA or processing instruction is finished. - /// - /// This method expect that `<` already was read. - /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. - /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bang_element( - &mut self, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until XML element is closed by approaching a `>` symbol. 
- /// Returns `Some(buffer)` that contains a data between `<` and `>` or - /// `None` if end-of-input was reached and nothing was read. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; - - fn peek_one(&mut self) -> Result>; -} - -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. 
-impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { - #[inline] - fn read_bytes_until( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut read = 0; - let mut done = false; - let start = buf.len(); - while !done { - let used = { - let available = match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - done = true; - i + 1 - } - None => { - buf.extend_from_slice(available); - available.len() - } - } - }; - self.consume(used); - read += used; - } - *position += read; - - if read == 0 { - Ok(None) - } else { - Ok(Some(&buf[start..])) - } - } - fn read_bang_element( + fn resolve_namespaced_event_inner<'b, 'ns>( &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- let start = buf.len(); - let mut read = 1; - buf.push(b'!'); - self.consume(1); - - let bang_type = BangType::new(self.peek_one()?)?; - - loop { - match self.fill_buf() { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Ok(n) if n.is_empty() => return Err(bang_type.to_err()), - Ok(available) => { - if let Some((consumed, used)) = bang_type.parse(available, read) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } + event: Result>, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + match event { + Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), + Ok(Event::Start(e)) => { + self.ns_resolver.push(&e, namespace_buffer); + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Start(e), + )) } - } - - if read == 0 { - Ok(None) - } else { - Ok(Some((bang_type, &buf[start..]))) - } - } - - #[inline] - fn read_element( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - } - - if read == 0 { - Ok(None) - } else { - 
Ok(Some(&buf[start..])) - } - } - - /// Consume and discard all the whitespace until the next non-whitespace - /// character or EOF. - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - loop { - break match self.fill_buf() { - Ok(n) => { - let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); - if count > 0 { - self.consume(count); - *position += count; - continue; - } else { - Ok(()) - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } - - /// Consume and discard one character if it matches the given byte. Return - /// true if it matched. - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - match self.peek_one()? { - Some(b) if b == byte => { - *position += 1; - self.consume(1); - Ok(true) + Ok(Event::Empty(e)) => { + // For empty elements we need to 'artificially' keep the namespace scope on the + // stack until the next `next()` call occurs. + // Otherwise the caller has no chance to use `resolve` in the context of the + // namespace declarations that are 'in scope' for the empty element alone. + // Ex: + self.ns_resolver.push(&e, namespace_buffer); + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Empty(e), + )) } - _ => Ok(false), - } - } - - /// Return one character without consuming it, so that future `read_*` calls - /// will still include it. On EOF, return None. - fn peek_one(&mut self) -> Result> { - loop { - break match self.fill_buf() { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } -} - -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. 
This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut usize, - ) -> Result> { - if self.is_empty() { - return Ok(None); - } - - Ok(Some(if let Some(i) = memchr::memchr(byte, self) { - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - bytes - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - bytes - })) - } - - fn read_bang_element( - &mut self, - _buf: (), - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. - debug_assert_eq!(self[0], b'!'); - - let bang_type = BangType::new(self[1..].first().copied())?; - - if let Some((bytes, i)) = bang_type.parse(self, 0) { - *position += i; - *self = &self[i..]; - return Ok(Some((bang_type, bytes))); - } - - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Err(bang_type.to_err()) - } - - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { - if self.is_empty() { - return Ok(None); - } - - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - *position += i; - *self = &self[i..]; - return Ok(Some(bytes)); - } - - // Note: Do not update position, so the error points to a sane place - // rather than at the EOF. 
- Err(Error::UnexpectedEof("Element".to_string())) - - // FIXME: Figure out why the other one works without UnexpectedEof - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - let whitespaces = self - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or(self.len()); - *position += whitespaces; - *self = &self[whitespaces..]; - Ok(()) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - if self.first() == Some(&byte) { - *self = &self[1..]; - *position += 1; - Ok(true) - } else { - Ok(false) + Ok(Event::End(e)) => { + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::End(e), + )) + } + Ok(e) => Ok((ResolveResult::Unbound, e)), + Err(e) => Err(e), } } - - fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) - } } /// Possible elements started with ` bool { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Decoder of byte slices to the strings. This is lightweight object that can be copied. -/// -/// If feature `encoding` is enabled, this encoding taken from the `"encoding"` -/// XML declaration or assumes UTF-8, if XML has no declaration, encoding -/// key is not defined or contains unknown encoding. -/// -/// The library supports any UTF-8 compatible encodings that crate `encoding_rs` -/// is supported. [*UTF-16 is not supported at the present*][utf16]. -/// -/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder: -/// any XML declarations are ignored. 
-/// -/// [utf16]: https://github.com/tafia/quick-xml/issues/158 -#[derive(Clone, Copy, Debug)] -pub struct Decoder { - #[cfg(feature = "encoding")] - encoding: &'static Encoding, -} - -#[cfg(not(feature = "encoding"))] -impl Decoder { - /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if - /// it is present in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { - Ok(Cow::Borrowed(from_utf8(bytes)?)) - } - - /// Decodes a slice regardless of XML declaration with BOM removal if - /// it is present in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { - let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { - &bytes[3..] - } else { - bytes - }; - self.decode(bytes) - } -} - -#[cfg(feature = "encoding")] -impl Decoder { - /// Returns the `Reader`s encoding. - /// - /// This encoding will be used by [`decode`]. - /// - /// [`decode`]: Self::decode - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Decodes specified bytes using encoding, declared in the XML, if it was - /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present - /// in the `bytes`. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { - match self - .encoding - .decode_without_bom_handling_and_without_replacement(bytes) - { - None => Err(Error::NonDecodable(None)), - Some(s) => Ok(s), - } - } - - /// Decodes a slice with BOM removal if it is present in the `bytes` using - /// the reader encoding. 
- /// - /// If this method called after reading XML declaration with the `"encoding"` - /// key, then this encoding is used, otherwise UTF-8 is used. - /// - /// If XML declaration is absent in the XML, UTF-8 is used. - /// - /// Returns an error in case of malformed sequences in the `bytes`. - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { - self.decode(self.remove_bom(bytes)) - } - /// Copied from [`Encoding::decode_with_bom_removal`] - #[inline] - fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { - if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { - return &bytes[3..]; - } - if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { - return &bytes[2..]; - } - if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { - return &bytes[2..]; - } - - bytes - } -} - -/// This implementation is required for tests of other parts of the library -#[cfg(test)] -#[cfg(feature = "serialize")] -impl Decoder { - pub(crate) fn utf8() -> Self { - Decoder { - #[cfg(feature = "encoding")] - encoding: UTF_8, - } - } - - #[cfg(feature = "encoding")] - pub(crate) fn utf16() -> Self { - Decoder { encoding: UTF_16LE } - } -} - -/// Automatic encoding detection of XML files based using the [recommended algorithm] -/// (https://www.w3.org/TR/xml11/#sec-guessing) -/// -/// The algorithm suggests examine up to the first 4 bytes to determine encoding -/// according to the following table: -/// -/// | Bytes |Detected encoding -/// |-------------|------------------------------------------ -/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order) -/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order) -/// |`00 00 FF FE`|UCS-4, unusual octet order (2143) -/// |`FE FF 00 00`|UCS-4, unusual octet order (3412) -/// |`FE FF ## ##`|UTF-16, big-endian -/// |`FF FE ## ##`|UTF-16, little-endian -/// |`EF BB BF` |UTF-8 -/// |-------------|------------------------------------------ -/// |`00 00 00 3C`|UCS-4 or similar (use declared 
encoding to find the exact one), in big-endian (1234) -/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321) -/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143) -/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412) -/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) -/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) -/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably -/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use) -/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind -/// -/// Because [`encoding_rs`] crate supported only subset of those encodings, only -/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. -/// -/// If encoding is detected, `Some` is returned, otherwise `None` is returned. 
-#[cfg(feature = "encoding")] -fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { - match bytes { - // with BOM - _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), - _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), - _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), - - // without BOM - _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible - - _ => None, - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - #[cfg(test)] mod test { macro_rules! check { - ($buf:expr) => { + ($(let mut $buf:ident = $init:expr;)?) => { mod read_bytes_until { - use crate::reader::XmlSource; + use super::input_from_str; // Use Bytes for printing bytes as strings for ASCII range use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1837,14 +800,14 @@ mod test { /// Checks that search in the empty buffer returns `None` #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_str("".as_ref()); // ^= 0 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), None @@ -1856,14 +819,14 @@ mod test { /// as a result and set `position` to `len()` #[test] fn non_existent() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abcdef".as_ref(); + let mut input = input_from_str("abcdef".as_ref()); // ^= 6 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? 
&mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1876,14 +839,14 @@ mod test { /// after match (`1`) #[test] fn at_the_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"*abcdef".as_ref(); + let mut input = input_from_str("*abcdef".as_ref()); // ^= 1 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"")) @@ -1896,14 +859,14 @@ mod test { /// symbol after match #[test] fn inside() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abc*def".as_ref(); + let mut input = input_from_str("abc*def".as_ref()); // ^= 4 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abc")) @@ -1916,14 +879,14 @@ mod test { /// symbol after match (`len()`) #[test] fn in_the_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abcdef*".as_ref(); + let mut input = input_from_str("abcdef*".as_ref()); // ^= 7 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1933,10 +896,12 @@ mod test { } mod read_bang_element { + use super::input_from_str; /// Checks that reading CDATA content works correctly mod cdata { + use super::input_from_str; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1945,12 +910,12 @@ mod test { #[test] #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"![]]>other content".as_ref(); + let mut input = input_from_str("![]]>other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1965,12 +930,12 @@ mod test { /// is not found, parsing ends with an error #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[other content".as_ref(); + let mut input = input_from_str("![CDATA[other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1984,14 +949,14 @@ mod test { /// Checks that CDATA element without content inside parsed successfully #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[]]>other content".as_ref(); + let mut input = input_from_str("![CDATA[]]>other content".as_ref()); // ^= 11 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA["))) @@ -2004,14 +969,14 @@ mod test { /// a CDATA end sequence do not interrupt CDATA parsing #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); + let mut input = input_from_str("![CDATA[cdata]] ]>content]]>other content]]>".as_ref()); // ^= 28 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? 
&mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) @@ -2037,20 +1002,21 @@ mod test { /// /// [specification]: https://www.w3.org/TR/xml11/#dt-comment mod comment { + use super::input_from_str; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; #[test] #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!- -->other content".as_ref(); + let mut input = input_from_str("!- -->other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2063,12 +1029,12 @@ mod test { #[test] fn not_properly_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!->other content".as_ref(); + let mut input = input_from_str("!->other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2081,12 +1047,12 @@ mod test { #[test] fn not_closed1() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--other content".as_ref(); + let mut input = input_from_str("!--other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2099,12 +1065,12 @@ mod test { #[test] fn not_closed2() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"!-->other content".as_ref(); + let mut input = input_from_str("!-->other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2117,12 +1083,12 @@ mod test { #[test] fn not_closed3() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->other content".as_ref(); + let mut input = input_from_str("!--->other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2135,14 +1101,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!---->other content".as_ref(); + let mut input = input_from_str("!---->other content".as_ref()); // ^= 6 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!----"))) @@ -2152,14 +1118,14 @@ mod test { #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->comment<--->other content".as_ref(); + let mut input = input_from_str("!--->comment<--->other content".as_ref()); // ^= 17 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? 
&mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!--->comment<---"))) @@ -2170,20 +1136,22 @@ mod test { /// Checks that reading DOCTYPE definition works correctly mod doctype { + use super::input_from_str; mod uppercase { + use super::input_from_str; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!D other content".as_ref(); + let mut input = input_from_str("!D other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2196,12 +1164,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPEother content".as_ref(); + let mut input = input_from_str("!DOCTYPEother content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2214,14 +1182,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPE>other content".as_ref(); + let mut input = input_from_str("!DOCTYPE>other content".as_ref()); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!DOCTYPE"))) @@ -2231,12 +1199,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"!DOCTYPE other content".as_ref(); + let mut input = input_from_str("!DOCTYPE other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2249,19 +1217,20 @@ mod test { } mod lowercase { + use super::input_from_str; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!d other content".as_ref(); + let mut input = input_from_str("!d other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2274,12 +1243,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctypeother content".as_ref(); + let mut input = input_from_str("!doctypeother content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2292,14 +1261,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype>other content".as_ref(); + let mut input = input_from_str("!doctype>other content".as_ref()); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? 
&mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!doctype"))) @@ -2309,12 +1278,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype other content".as_ref(); + let mut input = input_from_str("!doctype other content".as_ref()); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2329,36 +1298,36 @@ mod test { } mod read_element { - use crate::reader::XmlSource; + use super::input_from_str; use crate::utils::Bytes; use pretty_assertions::assert_eq; /// Checks that nothing was read from empty buffer #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_str("".as_ref()); // ^= 0 - assert_eq!(input.read_element(buf, &mut position).unwrap().map(Bytes), None); + assert_eq!(input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), None); assert_eq!(position, 0); } mod open { - use crate::reader::XmlSource; + use super::input_from_str; use crate::utils::Bytes; use pretty_assertions::assert_eq; #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b">".as_ref(); + let mut input = input_from_str(">".as_ref()); // ^= 1 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"")) ); assert_eq!(position, 1); @@ -2366,13 +1335,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag>".as_ref(); + let mut input = input_from_str("tag>".as_ref()); // ^= 4 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), Some(Bytes(b"tag")) ); assert_eq!(position, 4); @@ -2380,13 +1349,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":>".as_ref(); + let mut input = input_from_str(":>".as_ref()); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":")) ); assert_eq!(position, 2); @@ -2394,13 +1363,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag>".as_ref(); + let mut input = input_from_str(":tag>".as_ref()); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag")) ); assert_eq!(position, 5); @@ -2408,13 +1377,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); + let mut input = input_from_str(r#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref()); // ^= 38 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)) ); assert_eq!(position, 38); @@ -2422,19 +1391,19 @@ mod test { } mod self_closed { - use crate::reader::XmlSource; + use super::input_from_str; use crate::utils::Bytes; use pretty_assertions::assert_eq; #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"/>".as_ref(); + let mut input = input_from_str("/>".as_ref()); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), Some(Bytes(b"/")) ); assert_eq!(position, 2); @@ -2442,13 +1411,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag/>".as_ref(); + let mut input = input_from_str("tag/>".as_ref()); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"tag/")) ); assert_eq!(position, 5); @@ -2456,13 +1425,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":/>".as_ref(); + let mut input = input_from_str(":/>".as_ref()); // ^= 3 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":/")) ); assert_eq!(position, 3); @@ -2470,13 +1439,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag/>".as_ref(); + let mut input = input_from_str(":tag/>".as_ref()); // ^= 6 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag/")) ); assert_eq!(position, 6); @@ -2484,13 +1453,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); + let mut input = input_from_str(r#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref()); // ^= 41 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)) ); assert_eq!(position, 41); @@ -2499,14 +1468,16 @@ mod test { } mod issue_344 { + use super::reader_from_str; use crate::errors::Error; #[test] fn cdata() { let doc = "![]]>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -2519,9 +1490,10 @@ mod test { #[test] fn comment() { let doc = "!- -->"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2534,9 +1506,10 @@ mod test { #[test] fn doctype_uppercase() { let doc = "!D>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2549,9 +1522,10 @@ mod test { #[test] fn doctype_lowercase() { let doc = "!d>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) 
{ Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2564,79 +1538,86 @@ mod test { /// Ensures, that no empty `Text` events are generated mod read_event_impl { + use super::reader_from_str; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; - use crate::reader::Reader; use pretty_assertions::assert_eq; #[test] fn start_text() { - let mut reader = Reader::from_str("bom"); + let mut reader = reader_from_str("bom"); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::StartText(BytesText::from_escaped_str("bom").into()) ); } #[test] fn declaration() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Decl(BytesDecl::from_start(BytesStart::borrowed(b"xml ", 3))) ); } #[test] fn doctype() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::DocType(BytesText::from_escaped_str("x")) ); } #[test] fn processing_instruction() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::PI(BytesText::from_escaped_str("xml-stylesheet")) ); } #[test] fn start() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Start(BytesStart::borrowed_name(b"tag")) ); } #[test] fn end() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); // Because we expect invalid XML, do not check that // the end name paired with the start name reader.check_end_names(false); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::End(BytesEnd::borrowed(b"tag")) ); } #[test] fn empty() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); } @@ -2644,115 +1625,76 @@ mod test { /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = reader_from_str("text"); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Text(BytesText::from_escaped_str("text")) ); } #[test] fn cdata() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::CData(BytesCData::from_str("")) ); } #[test] fn comment() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Comment(BytesText::from_escaped_str("")) ); } #[test] fn eof() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof ); } } - - #[cfg(feature = "encoding")] - mod encoding { - use crate::events::Event; - use crate::reader::Reader; - use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251}; - use pretty_assertions::assert_eq; - - mod bytes { - use super::*; - use pretty_assertions::assert_eq; - - /// Checks that encoding is detected by BOM and changed after XML declaration - #[test] - fn bom_detected() { - let mut reader = Reader::from_bytes(b"\xFF\xFE"); - - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_16LE); - - reader.read_event_impl($buf).unwrap(); - assert_eq!(reader.decoder().encoding(), WINDOWS_1251); - - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); - } - - /// Checks that encoding is changed by XML declaration, but only once - #[test] - fn xml_declaration() { - let mut reader = Reader::from_bytes(b""); - - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_16LE); - - reader.read_event_impl($buf).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_16LE); - - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); - } - } - - /// Checks that XML declaration cannot change the encoding from UTF-8 if - /// a `Reader` was created using `from_str` method - #[test] - fn str_always_has_utf8() { - let mut reader = Reader::from_str(""); - - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_8); - - 
assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); - } - } }; } - /// Tests for reader that generates events that borrow from the provided buffer - mod buffered { - check!(&mut Vec::new()); - } + pub(super) use check; - /// Tests for reader that generates events that borrow from the input - mod borrowed { - check!(()); + #[cfg(feature = "encoding")] + mod encoding { + use crate::events::Event; + use crate::reader::UTF_8; + use pretty_assertions::assert_eq; + /// Checks that XML declaration cannot change the encoding from UTF-8 if + /// a `Reader` was created using `from_str` method. + /// This is outside the `check` macro as this is only relevant for the + /// `Reader::from_str` method. + #[test] + fn str_always_has_utf8() { + let mut reader = crate::Reader::from_str("<?xml encoding='UTF-16'?>"); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event().unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_8); + + assert_eq!(reader.read_event().unwrap(), Event::Eof); + } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs new file mode 100644 index 00000000..d39ca3d1 --- /dev/null +++ b/src/reader/slice_reader.rs @@ -0,0 +1,473 @@ +//! This is an implementation of [`Reader`] for reading from a `&[u8]` as +//! underlying byte stream. This implementation supports not using an +//! intermediate buffer as the byte slice itself can be used to borrow from. + +use std::ops::{Deref, DerefMut}; + +#[cfg(feature = "encoding")] +use encoding_rs::UTF_8; + +use crate::events::{BytesText, Event}; +use crate::name::{QName, ResolveResult}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use crate::reader::EncodingRef; +use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// Private functions for a [`Reader`] based on a [`SliceReader`].
+impl<'buf> Reader<SliceReader<'buf>> { + /// Read the next event, returning an event that borrows directly from the + /// input slice; no intermediate buffer is involved. On an error or EOF the + /// reader moves to the `Exit` state. + fn read_event_impl(&mut self) -> Result<Event<'buf>> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(true), + TagState::Closed => self.read_until_open(false), + TagState::Opened => self.read_until_close(), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Reads until `<` is found and moves the reader to the `Opened` state. + /// + /// Returns a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open(&mut self, first: bool) -> Result<Event<'buf>> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we are already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_impl(); + } + + match self.reader.read_bytes_until(b'<', &mut self.buf_position) { + Ok(Some(bytes)) => { + let content = if self.trim_text_end { + // Trim trailing whitespace before the ending '<' + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encountering a `<` symbol.
+ fn read_until_close(&mut self) -> Result<Event<'buf>> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // `<!` - comment, CDATA or DOCTYPE declaration + Ok(Some(b'!')) => match self.reader.read_bang_element(&mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // `</` - closing tag + Ok(Some(b'/')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // `<?` - processing instruction + Ok(Some(b'?')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(&mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Builder for reading from a slice of bytes. +impl<'buf> Reader<SliceReader<'buf>> { + /// Creates an XML reader from a string slice. + pub fn from_str(s: &'buf str) -> Self { + #[cfg_attr(not(feature = "encoding"), allow(unused_mut))] + let mut reader = Self::from_reader_internal(SliceReader(s.as_bytes())); + + // Rust strings are guaranteed to be UTF-8, so lock the encoding + #[cfg(feature = "encoding")] + { + reader.encoding = EncodingRef::Explicit(UTF_8); + } + + reader + } +} + +/// Public reading methods for a [`Reader`] based on a [`SliceReader`]. +impl<'buf> Reader<SliceReader<'buf>> { + /// Read an event that borrows from the input rather than a buffer. + #[inline] + pub fn read_event(&mut self) -> Result<Event<'buf>> { + self.read_event_impl() + } + + /// Reads until end element is found. This function is supposed to be called + /// after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If the corresponding [`End`] event is not found, [`Error::UnexpectedEof`] + /// will be returned.
In particular, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// The `end` parameter should contain the name of the end element _in the reader + /// encoding_. It is good practice to always get that parameter using the + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events is not checked if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// do not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name`, where both prefixes `a` and `b` resolve to the + /// same namespace, are semantically equivalent, `</b:name>` cannot close + /// `<a:name>`, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows how you can skip XML content after you read the + /// start event. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(r#" + /// <outer> + /// <inner> + /// <inner></inner> + /// <inner/> + /// <outer></outer> + /// <outer/> + /// </inner> + /// </outer> + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces.
+ /// reader.read_to_end(end.name()).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end(&mut self, end: QName) -> Result<()> { + let mut depth = 0; + loop { + match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("</{:?}>", name))); + } + _ => (), + } + } + } + + /// Reads optional text between start and end tags. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. + /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified).
+ /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_str(" + /// <a>&lt;b&gt;</a> + /// <a></a> + /// "); + /// xml.trim_text(true); + /// + /// let expected = ["<b>", ""]; + /// for &content in expected.iter() { + /// match xml.read_event() { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text(e.name()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text(&mut self, end: QName) -> Result<String> { + let s = match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end(end)?; + Ok(s) + } + + /// Reads the next event and resolves its namespace (if applicable).
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#"<x:tag1 xmlns:x="www.xxxx" xmlns:y="www.yyyy" att1 = "test"> + /// <y:tag2><!--Test comment-->Test</y:tag2> + /// <y:tag2>Test 2</y:tag2> + /// </x:tag1>"#; + /// let mut reader = Reader::from_str(xml); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event(&mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event<'ns>( + &mut self, + namespace_buffer: &'ns mut Vec<u8>, + ) -> Result<(ResolveResult<'ns>, Event<'buf>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event(); + self.resolve_namespaced_event_inner(event, namespace_buffer) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// A struct for handling reading functions based on reading from a byte slice.
+#[derive(Debug, Clone, Copy)] +pub struct SliceReader<'buf>(&'buf [u8]); + +impl<'buf> Deref for SliceReader<'buf> { + type Target = &'buf [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<'buf> DerefMut for SliceReader<'buf> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl<'buf> InnerReader for SliceReader<'buf> { + type Reader = &'buf [u8]; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + +/// Private reading functions for a [`SliceReader`]. +impl<'buf> SliceReader<'buf> { + fn read_bytes_until(&mut self, byte: u8, position: &mut usize) -> Result<Option<&'buf [u8]>> { + // search byte must be within the ascii range + debug_assert!(byte.is_ascii()); + + if self.0.is_empty() { + return Ok(None); + } + + Ok(Some(if let Some(i) = memchr::memchr(byte, self.0) { + *position += i + 1; + let bytes = &self.0[..i]; + self.0 = &self.0[i + 1..]; + bytes + } else { + *position += self.0.len(); + let bytes = &self.0[..]; + self.0 = &[]; + bytes + })) + } + + fn read_bang_element( + &mut self, + position: &mut usize, + ) -> Result<Option<(BangType, &'buf [u8])>> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + debug_assert_eq!(self.0[0], b'!'); + + let bang_type = BangType::new(self.0[1..].first().copied())?; + + if let Some((bytes, i)) = bang_type.parse(self.0, 0) { + *position += i; + self.0 = &self.0[i..]; + return Ok(Some((bang_type, bytes))); + } + + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Err(bang_type.to_err()) + } + + fn read_element(&mut self, position: &mut usize) -> Result<Option<&'buf [u8]>> { + if self.0.is_empty() { + return Ok(None); + } + + let mut state = ReadElementState::Elem; + + if let Some((bytes, i)) = state.change(self.0) { + *position += i; + self.0 = &self.0[i..]; + return Ok(Some(bytes)); + } + + // Note: Do not update position, so the error points to a sane place + // rather than at the EOF.
+ Err(Error::UnexpectedEof("Element".to_string())) + + // FIXME: Figure out why the other one works without UnexpectedEof + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + let whitespaces = self + .0 + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or(self.0.len()); + *position += whitespaces; + self.0 = &self.0[whitespaces..]; + Ok(()) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> { + // search byte must be within the ascii range + debug_assert!(byte.is_ascii()); + + if self.0.first() == Some(&byte) { + self.0 = &self.0[1..]; + *position += 1; + Ok(true) + } else { + Ok(false) + } + } + + fn peek_one(&mut self) -> Result<Option<u8>> { + Ok(self.0.first().copied()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_str<'buf>(s: &'buf str) -> SliceReader<'buf> { + SliceReader(s.as_bytes()) + } + + fn reader_from_str<'buf>(s: &'buf str) -> Reader<SliceReader<'buf>> { + Reader::from_str(s) + } + + check!(); +} diff --git a/tests/namespaces.rs b/tests/namespaces.rs index 4729f2c7..5f190195 100644 --- a/tests/namespaces.rs +++ b/tests/namespaces.rs @@ -11,11 +11,10 @@ fn namespace() { let mut r = Reader::from_str("in namespace!"); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -24,7 +23,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -32,13 +31,13 @@ fn namespace() { ), } // "in namespace!"
- match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { //TODO: Check in specification, it is true that namespace should be empty? Ok((ns, Text(_))) => assert_eq!(ns, Unbound), e => panic!("expecting text content with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -47,7 +46,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -58,11 +57,10 @@ fn default_namespace() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -71,7 +69,7 @@ fn default_namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -79,7 +77,7 @@ fn default_namespace() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -89,7 +87,7 @@ fn default_namespace() { // very important: a should not be in any namespace. The default namespace only applies to // the sub-document it is defined on. 
- match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -100,11 +98,10 @@ fn default_namespace_reset() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer start element with to resolve to 'www1', got {:?}", @@ -113,7 +110,7 @@ fn default_namespace_reset() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting inner start element with no namespace, got {:?}", @@ -121,13 +118,13 @@ fn default_namespace_reset() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting inner end element with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer end element with to resolve to 'www1', got {:?}", @@ -141,14 +138,13 @@ fn default_namespace_reset() { /// The code path for namespace handling is slightly different for `Empty` vs. `Start+End`. 
#[test] fn attributes_empty_ns() { - let src = b""; + let src = ""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_str(src); r.trim_text(true).expand_empty_elements(false); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, Empty(e))) => e, e => panic!("Expecting Empty event, got {:?}", e), }; @@ -182,14 +178,13 @@ fn attributes_empty_ns() { /// The code path for namespace handling is slightly different for `Empty` vs. `Start+End`. #[test] fn attributes_empty_ns_expanded() { - let src = b""; + let src = ""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_str(src); r.trim_text(true).expand_empty_elements(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); { - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, Start(e))) => e, e => panic!("Expecting Empty event, got {:?}", e), }; @@ -218,7 +213,7 @@ fn attributes_empty_ns_expanded() { assert_eq!(attrs.next(), None); } - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, End(e))) => assert_eq!(e.name(), QName(b"a")), e => panic!("Expecting End event, got {:?}", e), } @@ -226,16 +221,15 @@ fn attributes_empty_ns_expanded() { #[test] fn default_ns_shadowing_empty() { - let src = b""; + let src = ""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_str(src); r.trim_text(true).expand_empty_elements(false); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -246,7 +240,7 @@ fn default_ns_shadowing_empty() { // { - let e = 
match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Empty(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -274,7 +268,7 @@ fn default_ns_shadowing_empty() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -285,16 +279,15 @@ fn default_ns_shadowing_empty() { #[test] fn default_ns_shadowing_expanded() { - let src = b""; + let src = ""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_str(src); r.trim_text(true).expand_empty_elements(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -302,11 +295,10 @@ fn default_ns_shadowing_expanded() { e => panic!("Expected Start event (), got {:?}", e), } } - buf.clear(); // { - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -333,7 +325,7 @@ fn default_ns_shadowing_expanded() { } // virtual - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -341,7 +333,7 @@ fn default_ns_shadowing_expanded() { e => panic!("Expected End event (), got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), 
QName(b"e")); @@ -363,11 +355,10 @@ fn reserved_name() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Empty(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "Expected empty element bound to namespace 'www1', got {:?}", diff --git a/tests/test.rs b/tests/test.rs index 5ac9dae8..fa11209d 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -9,8 +9,8 @@ use pretty_assertions::assert_eq; #[test] fn test_sample() { - let src: &[u8] = include_bytes!("documents/sample_rss.xml"); - let mut r = Reader::from_bytes(src); + let src: &str = include_str!("documents/sample_rss.xml"); + let mut r = Reader::from_str(src); let mut count = 0; loop { match r.read_event().unwrap() { @@ -25,8 +25,8 @@ fn test_sample() { #[test] fn test_attributes_empty() { - let src = b""; - let mut r = Reader::from_bytes(src); + let src = ""; + let mut r = Reader::from_str(src); r.trim_text(true).expand_empty_elements(false); match r.read_event() { Ok(Empty(e)) => { @@ -56,7 +56,8 @@ fn test_attribute_equal() { let src = b""; let mut r = Reader::from_reader(src as &[u8]); r.trim_text(true).expand_empty_elements(false); - match r.read_event() { + let mut buf = Vec::new(); + match r.read_event_into(&mut buf) { Ok(Empty(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -77,8 +78,9 @@ fn test_comment_starting_with_gt() { let src = b"-->"; let mut r = Reader::from_reader(src as &[u8]); r.trim_text(true).expand_empty_elements(false); + let mut buf = Vec::new(); loop { - match r.read_event() { + match r.read_event_into(&mut buf) { Ok(Comment(e)) => { assert_eq!(e.as_ref(), b">"); break; @@ -94,9 +96,10 @@ fn test_comment_starting_with_gt() { fn test_koi8_r_encoding() { let src = include_bytes!("documents/opennews_all.rss"); let mut r = Reader::from_bytes(src); + let mut buf = Vec::new(); 
r.trim_text(true).expand_empty_elements(false); loop { - match r.read_event() { + match r.read_event_into(&mut buf) { Ok(Text(e)) => { e.decode_and_unescape(&r).unwrap(); } @@ -129,8 +132,9 @@ fn test_issue94() { "#; let mut reader = Reader::from_reader(&data[..]); reader.trim_text(true); + let mut buf = Vec::new(); loop { - match reader.read_event() { + match reader.read_event_into(&mut buf) { Ok(Eof) | Err(..) => break, _ => (), } diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index bb32a602..2cd3f75b 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -787,11 +787,11 @@ mod decode_with_bom_removal { fn removes_utf16be_bom() { let mut reader = Reader::from_bytes(include_bytes!("./documents/utf16be.xml")); reader.trim_text(true); - + let mut event_buffer = Vec::new(); let mut txt = Vec::new(); loop { - match reader.read_event() { + match reader.read_event_into(&mut event_buffer) { Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), Ok(Eof) => break, _ => (), @@ -805,11 +805,11 @@ mod decode_with_bom_removal { fn removes_utf16le_bom() { let mut reader = Reader::from_bytes(include_bytes!("./documents/utf16le.xml")); reader.trim_text(true); - + let mut event_buffer = Vec::new(); let mut txt = Vec::new(); loop { - match reader.read_event() { + match reader.read_event_into(&mut event_buffer) { Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), Ok(Eof) => break, _ => (), diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 28401b77..a7d26db8 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -362,19 +362,18 @@ fn test(input: &str, output: &str, trim: bool) { #[track_caller] fn test_bytes(input: &[u8], output: &[u8], trim: bool) { - let mut reader = Reader::from_reader(input); + let mut reader = Reader::from_bytes(input); reader .trim_text(trim) .check_comments(true) .expand_empty_elements(false); let mut spec_lines = 
SpecIter(output).enumerate(); - let mut buf = Vec::new(); + let mut event_buffer = Vec::new(); let mut ns_buffer = Vec::new(); loop { - buf.clear(); - let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer); + let event = reader.read_namespaced_event_into(&mut event_buffer, &mut ns_buffer); let line = xmlrs_display(event, reader.decoder()); if let Some((n, spec)) = spec_lines.next() { if spec.trim() == "EndDocument" {