From ded1b772311e684ae44733275392936058acbb44 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 19 Jul 2022 20:45:36 +0500 Subject: [PATCH 1/8] Move buffered and borrowing parts of reader to separate files This commit only moves code without significant changes (the only changes is: - corrected imports - add imports to the doc comments which have become inaccessible ) --- src/reader/buffered_reader.rs | 518 ++++++++++++++++++++++ src/{reader.rs => reader/mod.rs} | 718 +------------------------------ src/reader/slice_reader.rs | 232 ++++++++++ 3 files changed, 754 insertions(+), 714 deletions(-) create mode 100644 src/reader/buffered_reader.rs rename src/{reader.rs => reader/mod.rs} (76%) create mode 100644 src/reader/slice_reader.rs diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs new file mode 100644 index 00000000..bde99d90 --- /dev/null +++ b/src/reader/buffered_reader.rs @@ -0,0 +1,518 @@ +//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as +//! underlying byte stream. + +use std::fs::File; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +use crate::errors::{Error, Result}; +use crate::events::Event; +use crate::name::{QName, ResolveResult}; +use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; + +use memchr; + +/// This is an implementation of [`Reader`] for reading from a [`BufRead`] as +/// underlying byte stream. +impl Reader { + /// Reads the next `Event`. + /// + /// This is the main entry point for reading XML `Event`s. + /// + /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` + /// internally). + /// + /// Having the possibility to control the internal buffers gives you some additional benefits + /// such as: + /// + /// - Reduce the number of allocations by reusing the same buffer. 
For constrained systems, + /// you can call `buf.clear()` once you are done with processing the event (typically at the + /// end of your loop). + /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). + /// + /// # Examples + /// + /// ``` + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_str(xml); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_event_into(&mut buf) { + /// Ok(Event::Start(ref e)) => count += 1, + /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok(Event::Eof) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + #[inline] + pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { + self.read_event_impl(buf) + } + + /// Reads the next event and resolves its namespace (if applicable). 
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_str(xml); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event<'b, 'ns>( + &mut self, + buf: &'b mut Vec, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + match self.read_event_into(buf) { + Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), + Ok(Event::Start(e)) => { + self.ns_resolver.push(&e, namespace_buffer); + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Start(e), + )) + } + Ok(Event::Empty(e)) => { + // For empty elements 
we need to 'artificially' keep the namespace scope on the + // stack until the next `next()` call occurs. + // Otherwise the caller has no chance to use `resolve` in the context of the + // namespace declarations that are 'in scope' for the empty element alone. + // Ex: + self.ns_resolver.push(&e, namespace_buffer); + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Empty(e), + )) + } + Ok(Event::End(e)) => { + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::End(e), + )) + } + Ok(e) => Ok((ResolveResult::Unbound, e)), + Err(e) => Err(e), + } + } + + /// Reads until end element is found using provided buffer as intermediate + /// storage for events content. This function is supposed to be called after + /// you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] + /// will be returned. In particularly, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// If your reader created from a string slice or byte array slice, it is + /// better to use [`read_to_end()`] method, because it will not copy bytes + /// into intermediate buffer. + /// + /// The provided `buf` buffer will be filled only by one event content at time. + /// Before reading of each event the buffer will be cleared. If you know an + /// appropriate size of each event, you can preallocate the buffer to reduce + /// number of reallocations. + /// + /// The `end` parameter should contain name of the end element _in the reader + /// encoding_. 
It is good practice to always get that parameter using + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events does not checked, if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// does not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the + /// same namespace, are semantically equivalent, `` cannot close + /// ``, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows, how you can skip XML content after you read the + /// start event. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(r#" + /// + /// + /// + /// + /// + /// + /// + /// + /// "#); + /// reader.trim_text(true); + /// let mut buf = Vec::new(); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces. 
+ /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`read_to_end()`]: Self::read_to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { + let mut depth = 0; + loop { + buf.clear(); + match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } + + /// Reads optional text between start and end tags. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. + /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). 
+ /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { + let s = match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end_into(end, buf)?; + Ok(s) + } +} + +impl Reader> { + /// Creates an XML reader from a file path. + pub fn from_file>(path: P) -> Result { + let file = File::open(path).map_err(Error::Io)?; + let reader = BufReader::new(file); + Ok(Self::from_reader(reader)) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Implementation of `XmlSource` for any `BufRead` reader using a user-given +/// `Vec` as buffer that will be borrowed by events. 
+impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { + #[inline] + fn read_bytes_until( + &mut self, + byte: u8, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + fn read_bang_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. 
+ let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.consume(1); + + let bang_type = BangType::new(self.peek_one()?)?; + + loop { + match self.fill_buf() { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Ok(n) if n.is_empty() => return Err(bang_type.to_err()), + Ok(available) => { + if let Some((consumed, used)) = bang_type.parse(available, read) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + } + } + + if read == 0 { + Ok(None) + } else { + Ok(Some((bang_type, &buf[start..]))) + } + } + + #[inline] + fn read_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + let mut state = ReadElementState::Elem; + let mut read = 0; + + let start = buf.len(); + loop { + match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(available) => { + if let Some((consumed, used)) = state.change(available) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + } + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. 
+ fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.fill_buf() { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.consume(count); + *position += count; + continue; + } else { + Ok(()) + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + match self.peek_one()? { + Some(b) if b == byte => { + *position += 1; + self.consume(1); + Ok(true) + } + _ => Ok(false), + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. + fn peek_one(&mut self) -> Result> { + loop { + break match self.fill_buf() { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } +} diff --git a/src/reader.rs b/src/reader/mod.rs similarity index 76% rename from src/reader.rs rename to src/reader/mod.rs index f3a868a7..781d8419 100644 --- a/src/reader.rs +++ b/src/reader/mod.rs @@ -1,8 +1,7 @@ //! A module to handle `Reader` use std::borrow::Cow; -use std::io::{self, BufRead, BufReader}; -use std::{fs::File, path::Path, str::from_utf8}; +use std::str::from_utf8; #[cfg(feature = "encoding")] use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; @@ -13,6 +12,9 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +mod buffered_reader; +mod slice_reader; + /// Possible reader states. The state transition diagram (`true` and `false` shows /// value of [`Reader::expand_empty_elements()`] option): /// @@ -472,305 +474,6 @@ impl Reader { } } -/// Read methods -impl Reader { - /// Reads the next `Event`. 
- /// - /// This is the main entry point for reading XML `Event`s. - /// - /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` - /// internally). - /// - /// Having the possibility to control the internal buffers gives you some additional benefits - /// such as: - /// - /// - Reduce the number of allocations by reusing the same buffer. For constrained systems, - /// you can call `buf.clear()` once you are done with processing the event (typically at the - /// end of your loop). - /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). - /// - /// # Examples - /// - /// ``` - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_event_into(&mut buf) { - /// Ok(Event::Start(ref e)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok(Event::Eof) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - #[inline] - pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) - } - - /// Reads the next event and resolves its namespace (if applicable). 
- /// - /// # Examples - /// - /// ``` - /// use std::str::from_utf8; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// use quick_xml::name::ResolveResult::*; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut ns_buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { - /// Ok((Bound(ns), Event::Start(e))) => { - /// count += 1; - /// match (ns.as_ref(), e.local_name().as_ref()) { - /// (b"www.xxxx", b"tag1") => (), - /// (b"www.yyyy", b"tag2") => (), - /// (ns, n) => panic!("Namespace and local name mismatch"), - /// } - /// println!("Resolved namespace: {:?}", ns); - /// } - /// Ok((Unbound, Event::Start(_))) => { - /// panic!("Element not in any namespace") - /// }, - /// Ok((Unknown(p), Event::Start(_))) => { - /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) - /// } - /// Ok((_, Event::Text(e))) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) - /// }, - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok((_, Event::Eof)) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - pub fn read_namespaced_event<'b, 'ns>( - &mut self, - buf: &'b mut Vec, - namespace_buffer: &'ns mut Vec, - ) -> Result<(ResolveResult<'ns>, Event<'b>)> { - if self.pending_pop { - self.ns_resolver.pop(namespace_buffer); - } - self.pending_pop = false; - match self.read_event_into(buf) { - Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), - Ok(Event::Start(e)) => { - self.ns_resolver.push(&e, namespace_buffer); - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Start(e), - )) - } - Ok(Event::Empty(e)) => { - // For empty elements 
we need to 'artificially' keep the namespace scope on the - // stack until the next `next()` call occurs. - // Otherwise the caller has no chance to use `resolve` in the context of the - // namespace declarations that are 'in scope' for the empty element alone. - // Ex: - self.ns_resolver.push(&e, namespace_buffer); - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Empty(e), - )) - } - Ok(Event::End(e)) => { - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::End(e), - )) - } - Ok(e) => Ok((ResolveResult::Unbound, e)), - Err(e) => Err(e), - } - } - - /// Reads until end element is found using provided buffer as intermediate - /// storage for events content. This function is supposed to be called after - /// you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// If your reader created from a string slice or byte array slice, it is - /// better to use [`read_to_end()`] method, because it will not copy bytes - /// into intermediate buffer. - /// - /// The provided `buf` buffer will be filled only by one event content at time. - /// Before reading of each event the buffer will be cleared. If you know an - /// appropriate size of each event, you can preallocate the buffer to reduce - /// number of reallocations. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. 
It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// let mut buf = Vec::new(); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. 
- /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`read_to_end()`]: Self::read_to_end - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { - let mut depth = 0; - loop { - buf.clear(); - match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } - - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. - /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). 
- /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let mut xml = Reader::from_reader(b" - /// <b> - /// - /// " as &[u8]); - /// xml.trim_text(true); - /// - /// let expected = ["", ""]; - /// for &content in expected.iter() { - /// match xml.read_event_into(&mut Vec::new()) { - /// Ok(Event::Start(ref e)) => { - /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); - /// }, - /// e => panic!("Expecting Start event, found {:?}", e), - /// } - /// } - /// ``` - /// - /// [`Text`]: Event::Text - /// [`End`]: Event::End - pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { - let s = match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), - Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), - Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), - _ => return Err(Error::TextNotFound), - }; - self.read_to_end_into(end, buf)?; - Ok(s) - } -} - /// Private methods impl Reader { /// Read text into the given buffer, and return an event that borrows from @@ -1028,133 +731,6 @@ impl Reader { } } -impl Reader> { - /// Creates an XML reader from a file path. - pub fn from_file>(path: P) -> Result { - let file = File::open(path).map_err(Error::Io)?; - let reader = BufReader::new(file); - Ok(Self::from_reader(reader)) - } -} - -impl<'a> Reader<&'a [u8]> { - /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Self { - // Rust strings are guaranteed to be UTF-8, so lock the encoding - #[cfg(feature = "encoding")] - { - let mut reader = Self::from_reader(s.as_bytes()); - reader.encoding = EncodingRef::Explicit(UTF_8); - reader - } - - #[cfg(not(feature = "encoding"))] - Self::from_reader(s.as_bytes()) - } - - /// Creates an XML reader from a slice of bytes. 
- pub fn from_bytes(s: &'a [u8]) -> Self { - Self::from_reader(s) - } - - /// Read an event that borrows from the input rather than a buffer. - #[inline] - pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) - } - - /// Reads until end element is found. This function is supposed to be called - /// after you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. - /// reader.read_to_end(end.name()).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event().unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end(&mut self, end: QName) -> Result<()> { - let mut depth = 0; - loop { - match self.read_event() { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } -} - /// Represents an input for a reader that can return borrowed data. /// /// There are two implementors of this trait: generic one that read data from @@ -1255,292 +831,6 @@ trait XmlSource<'r, B> { fn peek_one(&mut self) -> Result>; } -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. 
-impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { - #[inline] - fn read_bytes_until( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut read = 0; - let mut done = false; - let start = buf.len(); - while !done { - let used = { - let available = match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - done = true; - i + 1 - } - None => { - buf.extend_from_slice(available); - available.len() - } - } - }; - self.consume(used); - read += used; - } - *position += read; - - if read == 0 { - Ok(None) - } else { - Ok(Some(&buf[start..])) - } - } - - fn read_bang_element( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- let start = buf.len(); - let mut read = 1; - buf.push(b'!'); - self.consume(1); - - let bang_type = BangType::new(self.peek_one()?)?; - - loop { - match self.fill_buf() { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Ok(n) if n.is_empty() => return Err(bang_type.to_err()), - Ok(available) => { - if let Some((consumed, used)) = bang_type.parse(available, read) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - } - } - - if read == 0 { - Ok(None) - } else { - Ok(Some((bang_type, &buf[start..]))) - } - } - - #[inline] - fn read_element( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - } - - if read == 0 { - Ok(None) - } else { - Ok(Some(&buf[start..])) - } - } - - /// Consume and discard all the whitespace until the next non-whitespace - /// character or EOF. 
- fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - loop { - break match self.fill_buf() { - Ok(n) => { - let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); - if count > 0 { - self.consume(count); - *position += count; - continue; - } else { - Ok(()) - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } - - /// Consume and discard one character if it matches the given byte. Return - /// true if it matched. - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - match self.peek_one()? { - Some(b) if b == byte => { - *position += 1; - self.consume(1); - Ok(true) - } - _ => Ok(false), - } - } - - /// Return one character without consuming it, so that future `read_*` calls - /// will still include it. On EOF, return None. - fn peek_one(&mut self) -> Result> { - loop { - break match self.fill_buf() { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } -} - -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut usize, - ) -> Result> { - if self.is_empty() { - return Ok(None); - } - - Ok(Some(if let Some(i) = memchr::memchr(byte, self) { - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - bytes - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - bytes - })) - } - - fn read_bang_element( - &mut self, - _buf: (), - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- debug_assert_eq!(self[0], b'!'); - - let bang_type = BangType::new(self[1..].first().copied())?; - - if let Some((bytes, i)) = bang_type.parse(self, 0) { - *position += i; - *self = &self[i..]; - return Ok(Some((bang_type, bytes))); - } - - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Err(bang_type.to_err()) - } - - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { - if self.is_empty() { - return Ok(None); - } - - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - *position += i; - *self = &self[i..]; - return Ok(Some(bytes)); - } - - // Note: Do not update position, so the error points to a sane place - // rather than at the EOF. - Err(Error::UnexpectedEof("Element".to_string())) - - // FIXME: Figure out why the other one works without UnexpectedEof - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - let whitespaces = self - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or(self.len()); - *position += whitespaces; - *self = &self[whitespaces..]; - Ok(()) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - if self.first() == Some(&byte) { - *self = &self[1..]; - *position += 1; - Ok(true) - } else { - Ok(false) - } - } - - fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) - } -} - /// Possible elements started with ` Reader<&'a [u8]> { + /// Creates an XML reader from a string slice. + pub fn from_str(s: &'a str) -> Self { + // Rust strings are guaranteed to be UTF-8, so lock the encoding + #[cfg(feature = "encoding")] + { + let mut reader = Self::from_reader(s.as_bytes()); + reader.encoding = EncodingRef::Explicit(UTF_8); + reader + } + + #[cfg(not(feature = "encoding"))] + Self::from_reader(s.as_bytes()) + } + + /// Creates an XML reader from a slice of bytes. 
+ pub fn from_bytes(s: &'a [u8]) -> Self { + Self::from_reader(s) + } + + /// Read an event that borrows from the input rather than a buffer. + #[inline] + pub fn read_event(&mut self) -> Result> { + self.read_event_impl(()) + } + + /// Reads until end element is found. This function is supposed to be called + /// after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] + /// will be returned. In particularly, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// The `end` parameter should contain name of the end element _in the reader + /// encoding_. It is good practice to always get that parameter using + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events does not checked, if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// does not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the + /// same namespace, are semantically equivalent, `` cannot close + /// ``, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows, how you can skip XML content after you read the + /// start event. 
+ /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(r#" + /// + /// + /// + /// + /// + /// + /// + /// + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces. + /// reader.read_to_end(end.name()).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end(&mut self, end: QName) -> Result<()> { + let mut depth = 0; + loop { + match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer +/// that will be borrowed by events. 
This implementation provides a zero-copy deserialization +impl<'a> XmlSource<'a, ()> for &'a [u8] { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: (), + position: &mut usize, + ) -> Result> { + if self.is_empty() { + return Ok(None); + } + + Ok(Some(if let Some(i) = memchr::memchr(byte, self) { + *position += i + 1; + let bytes = &self[..i]; + *self = &self[i + 1..]; + bytes + } else { + *position += self.len(); + let bytes = &self[..]; + *self = &[]; + bytes + })) + } + + fn read_bang_element( + &mut self, + _buf: (), + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + debug_assert_eq!(self[0], b'!'); + + let bang_type = BangType::new(self[1..].first().copied())?; + + if let Some((bytes, i)) = bang_type.parse(self, 0) { + *position += i; + *self = &self[i..]; + return Ok(Some((bang_type, bytes))); + } + + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Err(bang_type.to_err()) + } + + fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { + if self.is_empty() { + return Ok(None); + } + + let mut state = ReadElementState::Elem; + + if let Some((bytes, i)) = state.change(self) { + *position += i; + *self = &self[i..]; + return Ok(Some(bytes)); + } + + // Note: Do not update position, so the error points to a sane place + // rather than at the EOF. 
+ Err(Error::UnexpectedEof("Element".to_string())) + + // FIXME: Figure out why the other one works without UnexpectedEof + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + let whitespaces = self + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or(self.len()); + *position += whitespaces; + *self = &self[whitespaces..]; + Ok(()) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + if self.first() == Some(&byte) { + *self = &self[1..]; + *position += 1; + Ok(true) + } else { + Ok(false) + } + } + + fn peek_one(&mut self) -> Result> { + Ok(self.first().copied()) + } +} From 7aba3ddfab42cef0a5a37f5ef3d107e59809db60 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 19 Jul 2022 22:03:58 +0500 Subject: [PATCH 2/8] Implement reading namespaced events for borrowing reader Main code moved from `read_namespaced_event_into` to `resolve_namespaced_event_inner` --- benches/microbenches.rs | 4 +-- src/reader/buffered_reader.rs | 42 +++------------------- src/reader/mod.rs | 43 +++++++++++++++++++++++ src/reader/slice_reader.rs | 61 +++++++++++++++++++++++++++++++- tests/namespaces.rs | 65 +++++++++++++++-------------------- tests/xmlrs_reader_tests.rs | 6 ++-- 6 files changed, 139 insertions(+), 82 deletions(-) diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 8bbe1a67..3bdfea4f 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -85,7 +85,7 @@ fn read_namespaced_event(c: &mut Criterion) { let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), @@ -109,7 +109,7 @@ fn read_namespaced_event(c: &mut Criterion) { let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { 
Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index bde99d90..bec2a1e4 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -84,7 +84,7 @@ impl Reader { /// let mut ns_buf = Vec::new(); /// let mut txt = Vec::new(); /// loop { - /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { + /// match reader.read_namespaced_event_into(&mut buf, &mut ns_buf) { /// Ok((Bound(ns), Event::Start(e))) => { /// count += 1; /// match (ns.as_ref(), e.local_name().as_ref()) { @@ -112,7 +112,7 @@ impl Reader { /// println!("Found {} start events", count); /// println!("Text events: {:?}", txt); /// ``` - pub fn read_namespaced_event<'b, 'ns>( + pub fn read_namespaced_event_into<'b, 'ns>( &mut self, buf: &'b mut Vec, namespace_buffer: &'ns mut Vec, @@ -121,42 +121,8 @@ impl Reader { self.ns_resolver.pop(namespace_buffer); } self.pending_pop = false; - match self.read_event_into(buf) { - Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), - Ok(Event::Start(e)) => { - self.ns_resolver.push(&e, namespace_buffer); - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Start(e), - )) - } - Ok(Event::Empty(e)) => { - // For empty elements we need to 'artificially' keep the namespace scope on the - // stack until the next `next()` call occurs. - // Otherwise the caller has no chance to use `resolve` in the context of the - // namespace declarations that are 'in scope' for the empty element alone. 
- // Ex: - self.ns_resolver.push(&e, namespace_buffer); - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Empty(e), - )) - } - Ok(Event::End(e)) => { - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::End(e), - )) - } - Ok(e) => Ok((ResolveResult::Unbound, e)), - Err(e) => Err(e), - } + let event = self.read_event_into(buf); + self.resolve_namespaced_event_inner(event, namespace_buffer) } /// Reads until end element is found using provided buffer as intermediate diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 781d8419..3bdae3c6 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -729,6 +729,49 @@ impl Reader { Ok(Event::Start(BytesStart::borrowed(buf, name_end))) } } + + fn resolve_namespaced_event_inner<'b, 'ns>( + &mut self, + event: Result>, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + match event { + Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), + Ok(Event::Start(e)) => { + self.ns_resolver.push(&e, namespace_buffer); + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Start(e), + )) + } + Ok(Event::Empty(e)) => { + // For empty elements we need to 'artificially' keep the namespace scope on the + // stack until the next `next()` call occurs. + // Otherwise the caller has no chance to use `resolve` in the context of the + // namespace declarations that are 'in scope' for the empty element alone. 
+ // Ex: + self.ns_resolver.push(&e, namespace_buffer); + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Empty(e), + )) + } + Ok(Event::End(e)) => { + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::End(e), + )) + } + Ok(e) => Ok((ResolveResult::Unbound, e)), + Err(e) => Err(e), + } + } } /// Represents an input for a reader that can return borrowed data. diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index abbb04f6..25b52d12 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -9,7 +9,7 @@ use encoding_rs::UTF_8; use crate::errors::{Error, Result}; use crate::events::Event; -use crate::name::QName; +use crate::name::{QName, ResolveResult}; use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; use memchr; @@ -134,6 +134,65 @@ impl<'a> Reader<&'a [u8]> { } } } + + /// Reads the next event and resolves its namespace (if applicable). 
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_str(xml); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event(&mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event<'ns>( + &mut self, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'a>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event(); + self.resolve_namespaced_event_inner(event, namespace_buffer) + } } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/namespaces.rs b/tests/namespaces.rs index 4729f2c7..3878bd4c 100644 --- a/tests/namespaces.rs +++ b/tests/namespaces.rs @@ -11,11 +11,10 @@ fn namespace() { let mut r = 
Reader::from_str("in namespace!"); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -24,7 +23,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -32,13 +31,13 @@ fn namespace() { ), } // "in namespace!" - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { //TODO: Check in specification, it is true that namespace should be empty? Ok((ns, Text(_))) => assert_eq!(ns, Unbound), e => panic!("expecting text content with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -47,7 +46,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -58,11 +57,10 @@ fn default_namespace() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -71,7 +69,7 @@ fn default_namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + 
match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -79,7 +77,7 @@ fn default_namespace() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -89,7 +87,7 @@ fn default_namespace() { // very important: a should not be in any namespace. The default namespace only applies to // the sub-document it is defined on. - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -100,11 +98,10 @@ fn default_namespace_reset() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer start element with to resolve to 'www1', got {:?}", @@ -113,7 +110,7 @@ fn default_namespace_reset() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting inner start element with no namespace, got {:?}", @@ -121,13 +118,13 @@ fn default_namespace_reset() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting inner end element with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) 
=> assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer end element with to resolve to 'www1', got {:?}", @@ -143,12 +140,11 @@ fn default_namespace_reset() { fn attributes_empty_ns() { let src = b""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_bytes(src); r.trim_text(true).expand_empty_elements(false); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, Empty(e))) => e, e => panic!("Expecting Empty event, got {:?}", e), }; @@ -184,12 +180,11 @@ fn attributes_empty_ns() { fn attributes_empty_ns_expanded() { let src = b""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_bytes(src); r.trim_text(true).expand_empty_elements(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); { - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, Start(e))) => e, e => panic!("Expecting Empty event, got {:?}", e), }; @@ -218,7 +213,7 @@ fn attributes_empty_ns_expanded() { assert_eq!(attrs.next(), None); } - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((Unbound, End(e))) => assert_eq!(e.name(), QName(b"a")), e => panic!("Expecting End event, got {:?}", e), } @@ -228,14 +223,13 @@ fn attributes_empty_ns_expanded() { fn default_ns_shadowing_empty() { let src = b""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_bytes(src); r.trim_text(true).expand_empty_elements(false); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -246,7 +240,7 @@ fn default_ns_shadowing_empty() { // { 
- let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Empty(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -274,7 +268,7 @@ fn default_ns_shadowing_empty() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -287,14 +281,13 @@ fn default_ns_shadowing_empty() { fn default_ns_shadowing_expanded() { let src = b""; - let mut r = Reader::from_reader(src as &[u8]); + let mut r = Reader::from_bytes(src); r.trim_text(true).expand_empty_elements(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); @@ -302,11 +295,10 @@ fn default_ns_shadowing_expanded() { e => panic!("Expected Start event (), got {:?}", e), } } - buf.clear(); // { - let e = match r.read_namespaced_event(&mut buf, &mut ns_buf) { + let e = match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -333,7 +325,7 @@ fn default_ns_shadowing_expanded() { } // virtual - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:i"))); assert_eq!(e.name(), QName(b"e")); @@ -341,7 +333,7 @@ fn default_ns_shadowing_expanded() { e => panic!("Expected End event (), got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(e))) => { assert_eq!(ns, Bound(Namespace(b"urn:example:o"))); assert_eq!(e.name(), QName(b"e")); 
@@ -363,11 +355,10 @@ fn reserved_name() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Empty(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "Expected empty element bound to namespace 'www1', got {:?}", diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 28401b77..5cd4974f 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -362,19 +362,17 @@ fn test(input: &str, output: &str, trim: bool) { #[track_caller] fn test_bytes(input: &[u8], output: &[u8], trim: bool) { - let mut reader = Reader::from_reader(input); + let mut reader = Reader::from_bytes(input); reader .trim_text(trim) .check_comments(true) .expand_empty_elements(false); let mut spec_lines = SpecIter(output).enumerate(); - let mut buf = Vec::new(); let mut ns_buffer = Vec::new(); loop { - buf.clear(); - let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer); + let event = reader.read_namespaced_event(&mut ns_buffer); let line = xmlrs_display(event, reader.decoder()); if let Some((n, spec)) = spec_lines.next() { if spec.trim() == "EndDocument" { From 10c736e74a0c6a69e1fa4947cfd079dc22178b4a Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 09:30:34 +0200 Subject: [PATCH 3/8] Change the check! macro to more flexibly define buffers --- src/reader/mod.rs | 251 ++++++++++++++++++++++++++++------------------ 1 file changed, 153 insertions(+), 98 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 3bdae3c6..919e8c15 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -874,6 +874,43 @@ trait XmlSource<'r, B> { fn peek_one(&mut self) -> Result>; } +/// This is just a helper implementation for using `&mut ()` as buffer while reading from an +/// `&[u8]` to unify how the `check!` macro below works. 
+impl<'a, 'b> XmlSource<'a, &'b mut ()> for &'a [u8] { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + self.read_bytes_until(byte, (), position) + } + + fn read_bang_element( + &mut self, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + self.read_bang_element((), position) + } + + fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { + self.read_element((), position) + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + >::skip_whitespace(self, position) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + >::skip_one(self, byte, position) + } + + fn peek_one(&mut self) -> Result> { + >::peek_one(self) + } +} + /// Possible elements started with ` Option<&'static Encoding> { #[cfg(test)] mod test { macro_rules! check { - ($buf:expr) => { + ($(let mut $buf:ident = $init:expr;)?) => { mod read_bytes_until { use crate::reader::XmlSource; // Use Bytes for printing bytes as strings for ASCII range @@ -1170,14 +1207,14 @@ mod test { /// Checks that search in the empty buffer returns `None` #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"".as_ref(); // ^= 0 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), None @@ -1189,14 +1226,14 @@ mod test { /// as a result and set `position` to `len()` #[test] fn non_existent() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abcdef".as_ref(); // ^= 6 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1209,14 +1246,14 @@ mod test { /// after match (`1`) #[test] fn at_the_start() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"*abcdef".as_ref(); // ^= 1 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"")) @@ -1229,14 +1266,14 @@ mod test { /// symbol after match #[test] fn inside() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abc*def".as_ref(); // ^= 4 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abc")) @@ -1249,14 +1286,14 @@ mod test { /// symbol after match (`len()`) #[test] fn in_the_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abcdef*".as_ref(); // ^= 7 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1278,12 +1315,12 @@ mod test { #[test] #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![]]>other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1298,12 +1335,12 @@ mod test { /// is not found, parsing ends with an error #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? 
&mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1317,14 +1354,14 @@ mod test { /// Checks that CDATA element without content inside parsed successfully #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[]]>other content".as_ref(); // ^= 11 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA["))) @@ -1337,14 +1374,14 @@ mod test { /// a CDATA end sequence do not interrupt CDATA parsing #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); // ^= 28 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) @@ -1378,12 +1415,12 @@ mod test { #[test] #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!- -->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1396,12 +1433,12 @@ mod test { #[test] fn not_properly_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? 
&mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1414,12 +1451,12 @@ mod test { #[test] fn not_closed1() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1432,12 +1469,12 @@ mod test { #[test] fn not_closed2() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!-->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1450,12 +1487,12 @@ mod test { #[test] fn not_closed3() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1468,14 +1505,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!---->other content".as_ref(); // ^= 6 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!----"))) @@ -1485,14 +1522,14 @@ mod test { #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--->comment<--->other content".as_ref(); // ^= 17 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? 
&mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!--->comment<---"))) @@ -1511,12 +1548,12 @@ mod test { #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!D other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1529,12 +1566,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!DOCTYPEother content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1547,14 +1584,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!DOCTYPE>other content".as_ref(); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!DOCTYPE"))) @@ -1564,12 +1601,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!DOCTYPE other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1589,12 +1626,12 @@ mod test { #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!d other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? 
&mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1607,12 +1644,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctypeother content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1625,14 +1662,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctype>other content".as_ref(); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!doctype"))) @@ -1642,12 +1679,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctype other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1669,12 +1706,12 @@ mod test { /// Checks that nothing was read from empty buffer #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"".as_ref(); // ^= 0 - assert_eq!(input.read_element(buf, &mut position).unwrap().map(Bytes), None); + assert_eq!(input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), None); assert_eq!(position, 0); } @@ -1685,13 +1722,13 @@ mod test { #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b">".as_ref(); // ^= 1 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), Some(Bytes(b"")) ); assert_eq!(position, 1); @@ -1699,13 +1736,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"tag>".as_ref(); // ^= 4 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"tag")) ); assert_eq!(position, 4); @@ -1713,13 +1750,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":>".as_ref(); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":")) ); assert_eq!(position, 2); @@ -1727,13 +1764,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":tag>".as_ref(); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag")) ); assert_eq!(position, 5); @@ -1741,13 +1778,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); // ^= 38 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)) ); assert_eq!(position, 38); @@ -1761,13 +1798,13 @@ mod test { #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"/>".as_ref(); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), Some(Bytes(b"/")) ); assert_eq!(position, 2); @@ -1775,13 +1812,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"tag/>".as_ref(); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"tag/")) ); assert_eq!(position, 5); @@ -1789,13 +1826,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":/>".as_ref(); // ^= 3 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":/")) ); assert_eq!(position, 3); @@ -1803,13 +1840,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":tag/>".as_ref(); // ^= 6 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag/")) ); assert_eq!(position, 6); @@ -1817,13 +1854,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); // ^= 41 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)) ); assert_eq!(position, 41); @@ -1838,8 +1875,9 @@ mod test { fn cdata() { let doc = "![]]>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) 
{ Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1853,8 +1891,9 @@ mod test { fn comment() { let doc = "!- -->"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1868,8 +1907,9 @@ mod test { fn doctype_uppercase() { let doc = "!D>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1883,8 +1923,9 @@ mod test { fn doctype_lowercase() { let doc = "!d>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1904,9 +1945,10 @@ mod test { #[test] fn start_text() { let mut reader = Reader::from_str("bom"); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::StartText(BytesText::from_escaped_str("bom").into()) ); } @@ -1914,9 +1956,10 @@ mod test { #[test] fn declaration() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Decl(BytesDecl::from_start(BytesStart::borrowed(b"xml ", 3))) ); } @@ -1924,9 +1967,10 @@ mod test { #[test] fn doctype() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::DocType(BytesText::from_escaped_str("x")) ); } @@ -1934,9 +1978,10 @@ mod test { #[test] fn processing_instruction() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::PI(BytesText::from_escaped_str("xml-stylesheet")) ); } @@ -1944,9 +1989,10 @@ mod test { #[test] fn start() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Start(BytesStart::borrowed_name(b"tag")) ); } @@ -1957,9 +2003,10 @@ mod test { // Because we expect invalid XML, do not check that // the end name paired with the start name reader.check_end_names(false); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::End(BytesEnd::borrowed(b"tag")) ); } @@ -1967,9 +2014,10 @@ mod test { #[test] fn empty() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); } @@ -1978,24 +2026,26 @@ mod test { #[test] fn text() { let mut reader = Reader::from_str("text"); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Text(BytesText::from_escaped_str("text")) ); } #[test] fn cdata() { - let mut reader = Reader::from_str(""); + let mut reader =Reader::from_str(""); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::CData(BytesCData::from_str("")) ); } @@ -2003,9 +2053,10 @@ mod test { #[test] fn comment() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Comment(BytesText::from_escaped_str("")) ); } @@ -2013,9 +2064,10 @@ mod test { #[test] fn eof() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof ); } @@ -2036,30 +2088,32 @@ mod test { #[test] fn bom_detected() { let mut reader = Reader::from_bytes(b"\xFF\xFE"); + $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), WINDOWS_1251); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } /// Checks that encoding is changed by XML declaration, but only once #[test] fn xml_declaration() { let mut reader = Reader::from_bytes(b""); + $(let mut $buf = $init;)? 
assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } @@ -2068,12 +2122,13 @@ mod test { #[test] fn str_always_has_utf8() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_8); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } }; @@ -2081,11 +2136,11 @@ mod test { /// Tests for reader that generates events that borrow from the provided buffer mod buffered { - check!(&mut Vec::new()); + check!(let mut buf = Vec::new();); } /// Tests for reader that generates events that borrow from the input mod borrowed { - check!(()); + check!(let mut buf = ();); } } From a018ada5b689d0d6ffe65fe18c28e4a31858e1b2 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 19 Jul 2022 21:15:01 +0500 Subject: [PATCH 4/8] Introduce SliceReader and BufferedReader --- src/reader/buffered_reader.rs | 55 ++++++++++++++++++---------------- src/reader/slice_reader.rs | 56 +++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index bec2a1e4..e3b248b9 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -8,7 +8,7 @@ use std::path::Path; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::{QName, ResolveResult}; -use 
crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; +use crate::reader::{is_whitespace, BangType, ReadElementState, Reader}; use memchr; @@ -290,22 +290,25 @@ impl Reader> { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. -impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { +/// A struct for handling reading functions based on reading from a [`BufRead`]. +#[derive(Debug, Clone)] +pub struct BufferedReader(R); + +/// Private reading functions. +impl BufferedReader { #[inline] - fn read_bytes_until( + fn read_bytes_until<'buf>( &mut self, byte: u8, - buf: &'b mut Vec, + buf: &'buf mut Vec, position: &mut usize, - ) -> Result> { + ) -> Result> { let mut read = 0; let mut done = false; let start = buf.len(); while !done { let used = { - let available = match self.fill_buf() { + let available = match self.0.fill_buf() { Ok(n) if n.is_empty() => break, Ok(n) => n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, @@ -327,7 +330,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { } } }; - self.consume(used); + self.0.consume(used); read += used; } *position += read; @@ -339,22 +342,22 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { } } - fn read_bang_element( + fn read_bang_element<'buf>( &mut self, - buf: &'b mut Vec, + buf: &'buf mut Vec, position: &mut usize, - ) -> Result> { + ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to // start with it. 
let start = buf.len(); let mut read = 1; buf.push(b'!'); - self.consume(1); + self.0.consume(1); let bang_type = BangType::new(self.peek_one()?)?; loop { - match self.fill_buf() { + match self.0.fill_buf() { // Note: Do not update position, so the error points to // somewhere sane rather than at the EOF Ok(n) if n.is_empty() => return Err(bang_type.to_err()), @@ -362,7 +365,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { if let Some((consumed, used)) = bang_type.parse(available, read) { buf.extend_from_slice(consumed); - self.consume(used); + self.0.consume(used); read += used; *position += read; @@ -371,7 +374,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { buf.extend_from_slice(available); let used = available.len(); - self.consume(used); + self.0.consume(used); read += used; } } @@ -391,23 +394,23 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { } #[inline] - fn read_element( + fn read_element<'buf>( &mut self, - buf: &'b mut Vec, + buf: &'buf mut Vec, position: &mut usize, - ) -> Result> { + ) -> Result> { let mut state = ReadElementState::Elem; let mut read = 0; let start = buf.len(); loop { - match self.fill_buf() { + match self.0.fill_buf() { Ok(n) if n.is_empty() => break, Ok(available) => { if let Some((consumed, used)) = state.change(available) { buf.extend_from_slice(consumed); - self.consume(used); + self.0.consume(used); read += used; *position += read; @@ -416,7 +419,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { buf.extend_from_slice(available); let used = available.len(); - self.consume(used); + self.0.consume(used); read += used; } } @@ -439,11 +442,11 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { /// character or EOF. 
fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { loop { - break match self.fill_buf() { + break match self.0.fill_buf() { Ok(n) => { let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); if count > 0 { - self.consume(count); + self.0.consume(count); *position += count; continue; } else { @@ -462,7 +465,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { match self.peek_one()? { Some(b) if b == byte => { *position += 1; - self.consume(1); + self.0.consume(1); Ok(true) } _ => Ok(false), @@ -473,7 +476,7 @@ impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { /// will still include it. On EOF, return None. fn peek_one(&mut self) -> Result> { loop { - break match self.fill_buf() { + break match self.0.fill_buf() { Ok(n) if n.is_empty() => Ok(None), Ok(n) => Ok(Some(n[0])), Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 25b52d12..d131d810 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -10,7 +10,7 @@ use encoding_rs::UTF_8; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::{QName, ResolveResult}; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, XmlSource}; +use crate::reader::{is_whitespace, BangType, ReadElementState, Reader}; use memchr; @@ -197,28 +197,31 @@ impl<'a> Reader<&'a [u8]> { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { +/// A struct for handling reading functions based on reading from a byte slice. +#[derive(Debug, Clone, Copy)] +pub struct SliceReader<'buf>(&'buf [u8]); + +/// Private reading functions for a [`SliceReader`]. 
+impl<'buf> SliceReader<'buf> { fn read_bytes_until( &mut self, byte: u8, _buf: (), position: &mut usize, - ) -> Result> { - if self.is_empty() { + ) -> Result> { + if self.0.is_empty() { return Ok(None); } - Ok(Some(if let Some(i) = memchr::memchr(byte, self) { + Ok(Some(if let Some(i) = memchr::memchr(byte, self.0) { *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; + let bytes = &self.0[..i]; + self.0 = &self.0[i + 1..]; bytes } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; + *position += self.0.len(); + let bytes = &self.0[..]; + self.0 = &[]; bytes })) } @@ -227,16 +230,16 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { &mut self, _buf: (), position: &mut usize, - ) -> Result> { + ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to // start with it. - debug_assert_eq!(self[0], b'!'); + debug_assert_eq!(self.0[0], b'!'); - let bang_type = BangType::new(self[1..].first().copied())?; + let bang_type = BangType::new(self.0[1..].first().copied())?; - if let Some((bytes, i)) = bang_type.parse(self, 0) { + if let Some((bytes, i)) = bang_type.parse(self.0, 0) { *position += i; - *self = &self[i..]; + self.0 = &self.0[i..]; return Ok(Some((bang_type, bytes))); } @@ -245,16 +248,16 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { - if self.is_empty() { + fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { + if self.0.is_empty() { return Ok(None); } let mut state = ReadElementState::Elem; - if let Some((bytes, i)) = state.change(self) { + if let Some((bytes, i)) = state.change(self.0) { *position += i; - *self = &self[i..]; + self.0 = &self.0[i..]; return Ok(Some(bytes)); } @@ -267,17 +270,18 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { let whitespaces = self + .0 .iter() .position(|b| !is_whitespace(*b)) - 
.unwrap_or(self.len()); + .unwrap_or(self.0.len()); *position += whitespaces; - *self = &self[whitespaces..]; + self.0 = &self.0[whitespaces..]; Ok(()) } fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - if self.first() == Some(&byte) { - *self = &self[1..]; + if self.0.first() == Some(&byte) { + self.0 = &self.0[1..]; *position += 1; Ok(true) } else { @@ -286,6 +290,6 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) + Ok(self.0.first().copied()) } } From 9eb0d9b0b2667897a08f05ab60a218c790170b61 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 10:37:50 +0200 Subject: [PATCH 5/8] Split reader into BufferedReader and SliceReader This also changes the test cases in the `reader::test::check` macro to allow for reader-specific tests. --- benches/macrobenches.rs | 3 +- benches/microbenches.rs | 4 +- src/de/mod.rs | 6 +- src/lib.rs | 2 +- src/reader/buffered_reader.rs | 210 ++++++++++++++-- src/reader/mod.rs | 457 +++++++++------------------------- src/reader/slice_reader.rs | 276 ++++++++++++++++++-- tests/test.rs | 9 +- 8 files changed, 573 insertions(+), 394 deletions(-) diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 3358f3a4..4cb02ffe 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -20,8 +20,9 @@ static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); // TODO: use fully normalized attribute values fn parse_document(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); + let mut buf = Vec::new(); loop { - match r.read_event()? { + match r.read_event_into(&mut buf)? 
{ Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { criterion::black_box(attr?.decode_and_unescape_value(&r)?); diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 3bdfea4f..c52eceb6 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -85,7 +85,7 @@ fn read_namespaced_event(c: &mut Criterion) { let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut ns_buf) { + match r.read_namespaced_event_into(&mut buf, &mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), @@ -109,7 +109,7 @@ fn read_namespaced_event(c: &mut Criterion) { let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut ns_buf) { + match r.read_namespaced_event_into(&mut buf, &mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), diff --git a/src/de/mod.rs b/src/de/mod.rs index e564e041..47ea99e6 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -695,7 +695,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> { /// Create new deserializer that will borrow data from the specified borrowing reader #[inline] - fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self { + fn from_borrowing_reader(mut reader: Reader>) -> Self { reader .expand_empty_elements(true) .check_end_names(true) @@ -930,7 +930,7 @@ pub trait XmlRead<'i> { /// You cannot create it, it is created automatically when you call /// [`Deserializer::from_reader`] pub struct IoReader { - reader: Reader, + reader: Reader>, buf: Vec, } @@ -975,7 +975,7 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { /// You cannot create it, it is created automatically when you call /// [`Deserializer::from_str`] or [`Deserializer::from_slice`] pub struct SliceReader<'de> { - reader: Reader<&'de [u8]>, + reader: Reader>, } impl<'de> XmlRead<'de> for SliceReader<'de> { diff --git a/src/lib.rs 
b/src/lib.rs index f42ae359..70c18b19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,5 +65,5 @@ mod writer; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::{Decoder, Reader}; +pub use crate::reader::{BufferedReader, Decoder, Reader, SliceReader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index e3b248b9..0a0acc23 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -1,20 +1,136 @@ -//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as +//! This is an implementation of [`Reader`] for reading from a [`Read`] or [`BufRead`] as //! underlying byte stream. use std::fs::File; -use std::io::{self, BufRead, BufReader}; +use std::io::{self, BufRead, BufReader, Read}; +use std::ops::{Deref, DerefMut}; use std::path::Path; -use crate::errors::{Error, Result}; -use crate::events::Event; +use crate::events::{BytesText, Event}; use crate::name::{QName, ResolveResult}; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader}; +use crate::{Error, Result}; -use memchr; +#[cfg(feature = "encoding")] +use crate::reader::{detect_encoding, EncodingRef}; +use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; -/// This is an implementation of [`Reader`] for reading from a [`BufRead`] as -/// underlying byte stream. -impl Reader { +/// Private functions for a [`Reader`] based on an [`BufferedReader`]. +impl Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. 
+ fn read_event_impl<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), + TagState::Opened => self.read_until_close(buf), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. + /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'buf>( + &mut self, + buf: &'buf mut Vec, + first: bool, + ) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_impl(buf); + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. 
+ fn read_until_close<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Public reading methods for a [`Reader`] based on an [`BufferedReader`]. +impl Reader> { /// Reads the next `Event`. /// /// This is the main entry point for reading XML `Event`s. @@ -40,7 +156,9 @@ impl Reader { /// Test /// Test 2 /// "#; - /// let mut reader = Reader::from_str(xml); + /// // This explicitly uses `from_reader(xml.as_bytes())` to use a buffered reader instead of + /// // relying on the zero-copy optimizations for reading from byte slices. 
+ /// let mut reader = Reader::from_reader(xml.as_bytes()); /// reader.trim_text(true); /// let mut count = 0; /// let mut buf = Vec::new(); @@ -59,7 +177,7 @@ impl Reader { /// println!("Text events: {:?}", txt); /// ``` #[inline] - pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { + pub fn read_event_into<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { self.read_event_impl(buf) } @@ -77,7 +195,7 @@ impl Reader { /// Test /// Test 2 /// "#; - /// let mut reader = Reader::from_str(xml); + /// let mut reader = Reader::from_reader(xml.as_bytes()); /// reader.trim_text(true); /// let mut count = 0; /// let mut buf = Vec::new(); @@ -173,7 +291,7 @@ impl Reader { /// use quick_xml::events::{BytesStart, Event}; /// use quick_xml::Reader; /// - /// let mut reader = Reader::from_str(r#" + /// let mut reader = Reader::from_reader(r#" /// /// /// @@ -182,7 +300,7 @@ impl Reader { /// /// /// - /// "#); + /// "#.as_bytes()); /// reader.trim_text(true); /// let mut buf = Vec::new(); /// @@ -203,7 +321,6 @@ impl Reader { /// /// [`Start`]: Event::Start /// [`End`]: Event::End - /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end /// [`read_to_end()`]: Self::read_to_end /// [`check_end_names`]: Self::check_end_names /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag @@ -279,21 +396,59 @@ impl Reader { } } -impl Reader> { +/// Builder for reading from a file. +impl Reader>> { /// Creates an XML reader from a file path. pub fn from_file>(path: P) -> Result { let file = File::open(path).map_err(Error::Io)?; let reader = BufReader::new(file); - Ok(Self::from_reader(reader)) + Ok(Self::from_reader_internal(BufferedReader(reader))) } } +/// Builder for reading from any [`BufRead`]. +impl Reader> { + /// Creates an XML reader from any type implementing [`BufRead`]. + pub fn from_reader(reader: R) -> Self { + Self::from_reader_internal(BufferedReader(reader)) + } +} + +/// Builder for reading from any [`Read`]. 
+impl Reader>> { + /// Creates an XML reader from any type implementing [`Read`]. + pub fn from_unbuffered_reader(reader: R) -> Self { + Self::from_reader_internal(BufferedReader(BufReader::new(reader))) + } +} //////////////////////////////////////////////////////////////////////////////////////////////////// /// A struct for handling reading functions based on reading from a [`BufRead`]. #[derive(Debug, Clone)] pub struct BufferedReader(R); +impl Deref for BufferedReader { + type Target = R; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for BufferedReader { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl InnerReader for BufferedReader { + type Reader = R; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + /// Private reading functions. impl BufferedReader { #[inline] @@ -485,3 +640,24 @@ impl BufferedReader { } } } + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_bytes(bytes: &[u8]) -> BufferedReader<&[u8]> { + BufferedReader(bytes) + } + + fn reader_from_str(s: &str) -> Reader> { + Reader::from_reader_internal(BufferedReader(s.as_bytes())) + } + + #[allow(dead_code)] + fn reader_from_bytes(s: &[u8]) -> Reader> { + Reader::from_reader_internal(BufferedReader(s)) + } + + check!(let mut buf = Vec::new();); +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 919e8c15..f5a661b3 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1,6 +1,7 @@ //! A module to handle `Reader` use std::borrow::Cow; +use std::ops::{Deref, DerefMut}; use std::str::from_utf8; #[cfg(feature = "encoding")] @@ -15,6 +16,9 @@ use memchr; mod buffered_reader; mod slice_reader; +pub use self::buffered_reader::BufferedReader; +pub use self::slice_reader::SliceReader; + /// Possible reader states. 
The state transition diagram (`true` and `false` shows /// value of [`Reader::expand_empty_elements()`] option): /// @@ -105,6 +109,15 @@ impl EncodingRef { } } +/// A trait for the underlying abstraction handling the actual reading part for the [`Reader`]. +pub trait InnerReader: Deref + DerefMut { + /// The real type of the inner reader. + type Reader; + + /// Consumes this abstraction returning the underlying reader. + fn into_inner(self) -> Self::Reader; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A low level encoding-agnostic XML event reader. @@ -213,7 +226,7 @@ pub struct Reader { /// Builder methods impl Reader { /// Creates a `Reader` that reads from a given reader. - pub fn from_reader(reader: R) -> Self { + fn from_reader_internal(reader: R) -> Self { Self { reader, opened_buffer: Vec::new(), @@ -336,7 +349,7 @@ impl Reader { } /// Getters -impl Reader { +impl> Reader { /// Consumes `Reader` returning the underlying reader /// /// Can be used to compute line and column of a parsing error position @@ -346,7 +359,7 @@ impl Reader { /// ``` /// # use pretty_assertions::assert_eq; /// use std::{str, io::Cursor}; - /// use quick_xml::Reader; + /// use quick_xml::{BufferedReader, Reader}; /// use quick_xml::events::Event; /// /// let xml = r#" @@ -356,7 +369,7 @@ impl Reader { /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); /// let mut buf = Vec::new(); /// - /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { + /// fn into_line_and_column(reader: Reader>>) -> (usize, usize) { /// let end_pos = reader.buffer_position(); /// let mut cursor = reader.into_inner(); /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) @@ -391,7 +404,7 @@ impl Reader { /// } /// ``` pub fn into_inner(self) -> R { - self.reader + self.reader.into_inner() } /// Gets a reference to the underlying reader.
@@ -403,7 +416,10 @@ impl Reader { pub fn get_mut(&mut self) -> &mut R { &mut self.reader } +} +/// Getters that are not specific to any inner reader implementation +impl Reader { /// Gets the current byte position in the input data. /// /// Useful when debugging errors. @@ -474,125 +490,8 @@ impl Reader { } } -/// Private methods +/// Common parsing code for all reader implementations. impl Reader { - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_impl<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - let event = match self.tag_state { - TagState::Init => self.read_until_open(buf, true), - TagState::Closed => self.read_until_open(buf, false), - TagState::Opened => self.read_until_close(buf), - TagState::Empty => self.close_expanded_empty(), - TagState::Exit => return Ok(Event::Eof), - }; - match event { - Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, - _ => {} - } - event - } - - /// Read until '<' is found and moves reader to an `Opened` state. - /// - /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Opened; - - if self.trim_text_start { - self.reader.skip_whitespace(&mut self.buf_position)?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if self.reader.skip_one(b'<', &mut self.buf_position)? 
{ - return self.read_event_impl(buf); - } - - match self - .reader - .read_bytes_until(b'<', buf, &mut self.buf_position) - { - Ok(Some(bytes)) => { - #[cfg(feature = "encoding")] - if first && self.encoding.can_be_refined() { - if let Some(encoding) = detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - } - } - - let content = if self.trim_text_end { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - &bytes[..len] - } else { - bytes - }; - - Ok(if first { - Event::StartText(BytesText::from_escaped(content).into()) - } else { - Event::Text(BytesText::from_escaped(content)) - }) - } - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Closed; - - match self.reader.peek_one() { - // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_start(bytes), - Err(e) => Err(e), - }, - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or 
`DocType` event fn read_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { @@ -774,143 +673,6 @@ impl Reader { } } -/// Represents an input for a reader that can return borrowed data. -/// -/// There are two implementors of this trait: generic one that read data from -/// `Self`, copies some part of it into a provided buffer of type `B` and then -/// returns data that borrow from that buffer. -/// -/// The other implementor is for `&[u8]` and instead of copying data returns -/// borrowed data from `Self` instead. This implementation allows zero-copy -/// deserialization. -/// -/// # Parameters -/// - `'r`: lifetime of a buffer from which events will borrow -/// - `B`: a type of a buffer that can be used to store data read from `Self` and -/// from which events can borrow -trait XmlSource<'r, B> { - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte`, which does not include into result. - /// If input (`Self`) is exhausted, returns `None`. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// Some(b"abc".as_ref()) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until comment, CDATA or processing instruction is finished. - /// - /// This method expect that `<` already was read. 
- /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. - /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bang_element( - &mut self, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until XML element is closed by approaching a `>` symbol. - /// Returns `Some(buffer)` that contains a data between `<` and `>` or - /// `None` if end-of-input was reached and nothing was read. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; - - fn peek_one(&mut self) -> Result>; -} - -/// This is just a helper implementation for using `&mut ()` as buffer while reading from an -/// `&[u8]` to unify how the `check!` macro below works. 
-impl<'a, 'b> XmlSource<'a, &'b mut ()> for &'a [u8] { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: &mut (), - position: &mut usize, - ) -> Result> { - self.read_bytes_until(byte, (), position) - } - - fn read_bang_element( - &mut self, - _buf: &mut (), - position: &mut usize, - ) -> Result> { - self.read_bang_element((), position) - } - - fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { - self.read_element((), position) - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - >::skip_whitespace(self, position) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - >::skip_one(self, byte, position) - } - - fn peek_one(&mut self) -> Result> { - >::peek_one(self) - } -} - /// Possible elements started with ` { mod read_bytes_until { - use crate::reader::XmlSource; + use super::input_from_bytes; // Use Bytes for printing bytes as strings for ASCII range use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1209,7 +971,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_bytes(b"".as_ref()); // ^= 0 assert_eq!( @@ -1228,7 +990,7 @@ mod test { fn non_existent() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abcdef".as_ref(); + let mut input = input_from_bytes(b"abcdef".as_ref()); // ^= 6 assert_eq!( @@ -1248,7 +1010,7 @@ mod test { fn at_the_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"*abcdef".as_ref(); + let mut input = input_from_bytes(b"*abcdef".as_ref()); // ^= 1 assert_eq!( @@ -1268,7 +1030,7 @@ mod test { fn inside() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abc*def".as_ref(); + let mut input = input_from_bytes(b"abc*def".as_ref()); // ^= 4 assert_eq!( @@ -1288,7 +1050,7 @@ mod test { fn in_the_end() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"abcdef*".as_ref(); + let mut input = input_from_bytes(b"abcdef*".as_ref()); // ^= 7 assert_eq!( @@ -1303,10 +1065,12 @@ mod test { } mod read_bang_element { + use super::input_from_bytes; /// Checks that reading CDATA content works correctly mod cdata { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1317,7 +1081,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![]]>other content".as_ref(); + let mut input = input_from_bytes(b"![]]>other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1337,7 +1101,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[other content".as_ref(); + let mut input = input_from_bytes(b"![CDATA[other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1356,7 +1120,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[]]>other content".as_ref(); + let mut input = input_from_bytes(b"![CDATA[]]>other content".as_ref()); // ^= 11 assert_eq!( @@ -1376,7 +1140,7 @@ mod test { fn with_content() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); + let mut input = input_from_bytes(b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref()); // ^= 28 assert_eq!( @@ -1407,8 +1171,9 @@ mod test { /// /// [specification]: https://www.w3.org/TR/xml11/#dt-comment mod comment { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1417,7 +1182,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!- -->other content".as_ref(); + let mut input = input_from_bytes(b"!- -->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1435,7 +1200,7 @@ mod test { fn not_properly_end() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!->other content".as_ref(); + let mut input = input_from_bytes(b"!->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1453,7 +1218,7 @@ mod test { fn not_closed1() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--other content".as_ref(); + let mut input = input_from_bytes(b"!--other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1471,7 +1236,7 @@ mod test { fn not_closed2() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!-->other content".as_ref(); + let mut input = input_from_bytes(b"!-->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1489,7 +1254,7 @@ mod test { fn not_closed3() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->other content".as_ref(); + let mut input = input_from_bytes(b"!--->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? 
&mut position) { @@ -1507,7 +1272,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!---->other content".as_ref(); + let mut input = input_from_bytes(b"!---->other content".as_ref()); // ^= 6 assert_eq!( @@ -1524,7 +1289,7 @@ mod test { fn with_content() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->comment<--->other content".as_ref(); + let mut input = input_from_bytes(b"!--->comment<--->other content".as_ref()); // ^= 17 assert_eq!( @@ -1540,9 +1305,11 @@ mod test { /// Checks that reading DOCTYPE definition works correctly mod doctype { + use super::input_from_bytes; mod uppercase { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1550,7 +1317,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!D other content".as_ref(); + let mut input = input_from_bytes(b"!D other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1568,7 +1335,7 @@ mod test { fn without_space() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPEother content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPEother content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1586,7 +1353,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPE>other content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPE>other content".as_ref()); // ^= 9 assert_eq!( @@ -1603,7 +1370,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPE other content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPE other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? 
&mut position) { @@ -1619,8 +1386,9 @@ mod test { } mod lowercase { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1628,7 +1396,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!d other content".as_ref(); + let mut input = input_from_bytes(b"!d other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1646,7 +1414,7 @@ mod test { fn without_space() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctypeother content".as_ref(); + let mut input = input_from_bytes(b"!doctypeother content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1664,7 +1432,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype>other content".as_ref(); + let mut input = input_from_bytes(b"!doctype>other content".as_ref()); // ^= 9 assert_eq!( @@ -1681,7 +1449,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype other content".as_ref(); + let mut input = input_from_bytes(b"!doctype other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1699,7 +1467,7 @@ mod test { } mod read_element { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1708,7 +1476,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_bytes(b"".as_ref()); // ^= 0 assert_eq!(input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), None); @@ -1716,7 +1484,7 @@ mod test { } mod open { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1724,7 +1492,7 @@ mod test { fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b">".as_ref(); + let mut input = input_from_bytes(b">".as_ref()); // ^= 1 assert_eq!( @@ -1738,7 +1506,7 @@ mod test { fn normal() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag>".as_ref(); + let mut input = input_from_bytes(b"tag>".as_ref()); // ^= 4 assert_eq!( @@ -1752,7 +1520,7 @@ mod test { fn empty_ns_empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":>".as_ref(); + let mut input = input_from_bytes(b":>".as_ref()); // ^= 2 assert_eq!( @@ -1766,7 +1534,7 @@ mod test { fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag>".as_ref(); + let mut input = input_from_bytes(b":tag>".as_ref()); // ^= 5 assert_eq!( @@ -1780,7 +1548,7 @@ mod test { fn with_attributes() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); + let mut input = input_from_bytes(br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref()); // ^= 38 assert_eq!( @@ -1792,7 +1560,7 @@ mod test { } mod self_closed { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1800,7 +1568,7 @@ mod test { fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"/>".as_ref(); + let mut input = input_from_bytes(b"/>".as_ref()); // ^= 2 assert_eq!( @@ -1814,7 +1582,7 @@ mod test { fn normal() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag/>".as_ref(); + let mut input = input_from_bytes(b"tag/>".as_ref()); // ^= 5 assert_eq!( @@ -1828,7 +1596,7 @@ mod test { fn empty_ns_empty_tag() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b":/>".as_ref(); + let mut input = input_from_bytes(b":/>".as_ref()); // ^= 3 assert_eq!( @@ -1842,7 +1610,7 @@ mod test { fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag/>".as_ref(); + let mut input = input_from_bytes(b":tag/>".as_ref()); // ^= 6 assert_eq!( @@ -1856,7 +1624,7 @@ mod test { fn with_attributes() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); + let mut input = input_from_bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref()); // ^= 41 assert_eq!( @@ -1869,12 +1637,13 @@ mod test { } mod issue_344 { + use super::reader_from_str; use crate::errors::Error; #[test] fn cdata() { let doc = "![]]>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -1890,7 +1659,7 @@ mod test { #[test] fn comment() { let doc = "!- -->"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -1906,7 +1675,7 @@ mod test { #[test] fn doctype_uppercase() { let doc = "!D>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -1922,7 +1691,7 @@ mod test { #[test] fn doctype_lowercase() { let doc = "!d>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) 
{ @@ -1938,13 +1707,13 @@ mod test { /// Ensures, that no empty `Text` events are generated mod read_event_impl { + use super::reader_from_str; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; - use crate::reader::Reader; use pretty_assertions::assert_eq; #[test] fn start_text() { - let mut reader = Reader::from_str("bom"); + let mut reader = reader_from_str("bom"); $(let mut $buf = $init;)? assert_eq!( @@ -1955,7 +1724,7 @@ mod test { #[test] fn declaration() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -1966,7 +1735,7 @@ mod test { #[test] fn doctype() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -1977,7 +1746,7 @@ mod test { #[test] fn processing_instruction() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -1988,7 +1757,7 @@ mod test { #[test] fn start() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -1999,7 +1768,7 @@ mod test { #[test] fn end() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); // Because we expect invalid XML, do not check that // the end name paired with the start name reader.check_end_names(false); @@ -2013,7 +1782,7 @@ mod test { #[test] fn empty() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2025,7 +1794,7 @@ mod test { /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = reader_from_str("text"); $(let mut $buf = $init;)? 
assert_eq!( @@ -2041,7 +1810,7 @@ mod test { #[test] fn cdata() { - let mut reader =Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2052,7 +1821,7 @@ mod test { #[test] fn comment() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2063,7 +1832,7 @@ mod test { #[test] fn eof() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2075,19 +1844,19 @@ mod test { #[cfg(feature = "encoding")] mod encoding { + use super::reader_from_bytes; use crate::events::Event; - use crate::reader::Reader; use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251}; - use pretty_assertions::assert_eq; mod bytes { + use super::reader_from_bytes; use super::*; use pretty_assertions::assert_eq; /// Checks that encoding is detected by BOM and changed after XML declaration #[test] fn bom_detected() { - let mut reader = Reader::from_bytes(b"\xFF\xFE"); + let mut reader = reader_from_bytes(b"\xFF\xFE"); $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); @@ -2103,7 +1872,7 @@ mod test { /// Checks that encoding is changed by XML declaration, but only once #[test] fn xml_declaration() { - let mut reader = Reader::from_bytes(b""); + let mut reader = reader_from_bytes(b""); $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); @@ -2116,31 +1885,31 @@ mod test { assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } - - /// Checks that XML declaration cannot change the encoding from UTF-8 if - /// a `Reader` was created using `from_str` method - #[test] - fn str_always_has_utf8() { - let mut reader = Reader::from_str(""); - $(let mut $buf = $init;)? 
- - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($(&mut $buf)?).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_8); - - assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); - } } }; } - /// Tests for reader that generates events that borrow from the provided buffer - mod buffered { - check!(let mut buf = Vec::new();); - } + pub(super) use check; - /// Tests for reader that generates events that borrow from the input - mod borrowed { - check!(let mut buf = ();); + #[cfg(feature = "encoding")] + mod encoding { + use crate::events::Event; + use crate::reader::UTF_8; + use pretty_assertions::assert_eq; + /// Checks that XML declaration cannot change the encoding from UTF-8 if + /// a `Reader` was created using `from_str` method. + /// This is outside the `check` macro as this is only relevant for the + /// `Reader::from_str` method. + #[test] + fn str_always_has_utf8() { + let mut reader = crate::Reader::from_str(""); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_into(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_8); + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index d131d810..da42ac4f 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -2,45 +2,172 @@ //! underlying byte stream. This implementation supports not using an //! intermediate buffer as the byte slice itself can be used to borrow from. 
-#[cfg(feature = "encoding")] -use crate::reader::EncodingRef; +use std::ops::{Deref, DerefMut}; + #[cfg(feature = "encoding")] use encoding_rs::UTF_8; -use crate::errors::{Error, Result}; -use crate::events::Event; +use crate::events::{BytesText, Event}; use crate::name::{QName, ResolveResult}; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use crate::reader::{detect_encoding, EncodingRef}; +use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// Private functions for a [`Reader`] based on a [`SliceReader`]. +impl<'buf> Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_impl(&mut self, _buf: &mut ()) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(&mut (), true), + TagState::Closed => self.read_until_open(&mut (), false), + TagState::Opened => self.read_until_close(&mut ()), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. + /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open(&mut self, _buf: &mut (), first: bool) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? 
{ + return self.read_event_impl(&mut ()); + } + + match self + .reader + .read_bytes_until(b'<', &mut (), &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. + fn read_until_close(&mut self, _buf: &mut ()) -> Result> { + self.tag_state = TagState::Closed; -use memchr; + match self.reader.peek_one() { + // ` match self + .reader + .read_bang_element(&mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` { + match self + .reader + .read_bytes_until(b'>', &mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + } + } + // ` { + match self + .reader + .read_bytes_until(b'>', &mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + } + } + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(&mut (), &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} -/// This is an implementation of [`Reader`] for reading 
from a `&[u8]` as -/// underlying byte stream. This implementation supports not using an -/// intermediate buffer as the byte slice itself can be used to borrow from. -impl<'a> Reader<&'a [u8]> { +/// Builder for reading from a slice of bytes. +impl<'buf> Reader> { /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Self { + pub fn from_str(s: &'buf str) -> Self { + #[cfg_attr(not(feature = "encoding"), allow(unused_mut))] + let mut reader = Self::from_reader_internal(SliceReader(s.as_bytes())); + // Rust strings are guaranteed to be UTF-8, so lock the encoding #[cfg(feature = "encoding")] { - let mut reader = Self::from_reader(s.as_bytes()); reader.encoding = EncodingRef::Explicit(UTF_8); - reader } - #[cfg(not(feature = "encoding"))] - Self::from_reader(s.as_bytes()) + reader } /// Creates an XML reader from a slice of bytes. - pub fn from_bytes(s: &'a [u8]) -> Self { - Self::from_reader(s) + pub fn from_bytes(s: &'buf [u8]) -> Self { + Self::from_reader_internal(SliceReader(s)) } +} +/// Public reading methods for a [`Reader`] based on an [`SliceReader`]. +impl<'buf> Reader> { /// Read an event that borrows from the input rather than a buffer. #[inline] - pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) + pub fn read_event(&mut self) -> Result> { + self.read_event_impl(&mut ()) + } + + /// Temporary helper to keep both `read_event` and `read_event_into` available for reading + /// from `&[u8]`. + #[inline] + pub fn read_event_into(&mut self, _buf: &mut Vec) -> Result> { + self.read_event() } /// Reads until end element is found. This function is supposed to be called @@ -135,6 +262,66 @@ impl<'a> Reader<&'a [u8]> { } } + /// Temporary helper to keep both `read_to_end` and `read_to_end_into` available for reading + /// from `&[u8]`. + pub fn read_to_end_into(&mut self, end: QName, _buf: &mut Vec) -> Result<()> { + self.read_to_end(end) + } + + /// Reads optional text between start and end tags. 
+ /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. + /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text(&mut self, end: QName) -> Result { + let s = match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end(end)?; + Ok(s) + } + + /// Temporary helper to keep both `read_text` and `read_text_into` available for reading + /// from `&[u8]`. + pub fn read_text_into(&mut self, end: QName, _buf: &mut Vec) -> Result { + self.read_text(end) + } + /// Reads the next event and resolves its namespace (if applicable). 
/// /// # Examples @@ -185,7 +372,7 @@ impl<'a> Reader<&'a [u8]> { pub fn read_namespaced_event<'ns>( &mut self, namespace_buffer: &'ns mut Vec, - ) -> Result<(ResolveResult<'ns>, Event<'a>)> { + ) -> Result<(ResolveResult<'ns>, Event<'buf>)> { if self.pending_pop { self.ns_resolver.pop(namespace_buffer); } @@ -201,12 +388,34 @@ impl<'a> Reader<&'a [u8]> { #[derive(Debug, Clone, Copy)] pub struct SliceReader<'buf>(&'buf [u8]); +impl<'buf> Deref for SliceReader<'buf> { + type Target = &'buf [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<'buf> DerefMut for SliceReader<'buf> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl<'buf> InnerReader for SliceReader<'buf> { + type Reader = &'buf [u8]; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + /// Private reading functions for a [`SliceReader`]. impl<'buf> SliceReader<'buf> { fn read_bytes_until( &mut self, byte: u8, - _buf: (), + _buf: &mut (), position: &mut usize, ) -> Result> { if self.0.is_empty() { @@ -228,7 +437,7 @@ impl<'buf> SliceReader<'buf> { fn read_bang_element( &mut self, - _buf: (), + _buf: &mut (), position: &mut usize, ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to @@ -248,7 +457,7 @@ impl<'buf> SliceReader<'buf> { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { + fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { if self.0.is_empty() { return Ok(None); } @@ -293,3 +502,24 @@ impl<'buf> SliceReader<'buf> { Ok(self.0.first().copied()) } } + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_bytes<'buf>(bytes: &'buf [u8]) -> SliceReader<'buf> { + SliceReader(bytes) + } + + fn reader_from_str<'buf>(s: &'buf str) -> Reader> { + Reader::from_str(s) + } + + #[allow(dead_code)] + fn reader_from_bytes<'buf>(s: &'buf [u8]) -> Reader> { + Reader::from_bytes(s) + } + + check!(let mut buf = ();); 
+} diff --git a/tests/test.rs b/tests/test.rs index 5ac9dae8..fb51d8b1 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -56,7 +56,8 @@ fn test_attribute_equal() { let src = b""; let mut r = Reader::from_reader(src as &[u8]); r.trim_text(true).expand_empty_elements(false); - match r.read_event() { + let mut buf = Vec::new(); + match r.read_event_into(&mut buf) { Ok(Empty(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -77,8 +78,9 @@ fn test_comment_starting_with_gt() { let src = b"-->"; let mut r = Reader::from_reader(src as &[u8]); r.trim_text(true).expand_empty_elements(false); + let mut buf = Vec::new(); loop { - match r.read_event() { + match r.read_event_into(&mut buf) { Ok(Comment(e)) => { assert_eq!(e.as_ref(), b">"); break; @@ -129,8 +131,9 @@ fn test_issue94() { "#; let mut reader = Reader::from_reader(&data[..]); reader.trim_text(true); + let mut buf = Vec::new(); loop { - match reader.read_event() { + match reader.read_event_into(&mut buf) { Ok(Eof) | Err(..) => break, _ => (), } From b6a2af107752c5af97134083d396da4c9b9708c4 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 11:26:25 +0200 Subject: [PATCH 6/8] Remove buffered access for SliceReader as events always borrow from the input slice --- README.md | 6 ++- benches/macrobenches.rs | 5 +-- benches/microbenches.rs | 70 ++++++++++-------------------- examples/read_texts.rs | 6 +-- src/reader/mod.rs | 7 ++- src/reader/slice_reader.rs | 89 ++++++++++---------------------------- 6 files changed, 58 insertions(+), 125 deletions(-) diff --git a/README.md b/README.md index 6fa273ee..031c0175 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,11 @@ let xml = r#" Test Test 2 "#; -let mut reader = Reader::from_str(xml); +let mut reader = Reader::from_reader(xml.as_bytes()); +// If you want to read from a string or byte slice without buffering, use: +// let mut reader = Reader::from_str(xml); +// In that case, `Vec` is *not* needed for buffering below and you should use +// 
`read_event` instead of `read_event_into`. reader.trim_text(true); let mut count = 0; diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 4cb02ffe..a4e2719e 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -19,10 +19,9 @@ static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); // TODO: read the namespaces too // TODO: use fully normalized attribute values fn parse_document(doc: &[u8]) -> XmlResult<()> { - let mut r = Reader::from_reader(doc); - let mut buf = Vec::new(); + let mut r = Reader::from_bytes(doc); loop { - match r.read_event_into(&mut buf)? { + match r.read_event()? { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { criterion::black_box(attr?.decode_and_unescape_value(&r)?); diff --git a/benches/microbenches.rs b/benches/microbenches.rs index c52eceb6..ee52b27b 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -29,17 +29,15 @@ fn read_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_event"); group.bench_function("trim_text = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -50,19 +48,17 @@ fn read_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - 
buf.clear(); } assert_eq!( count, 1550, @@ -79,18 +75,16 @@ fn read_namespaced_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_namespaced_event"); group.bench_function("trim_text = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event_into(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -101,20 +95,18 @@ fn read_namespaced_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event_into(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -130,78 +122,66 @@ fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); group.bench_function("StartText", |b| { let src = "Hello world!".repeat(512 / 12).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::StartText(e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - 
assert_eq!(nbtxt, 504); }) }); group.bench_function("Start", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 525); }) }); group.bench_function("Comment", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 520); }) }); group.bench_function("CData", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::CData(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 518); }) }); @@ -213,12 +193,11 @@ fn attributes(c: &mut Criterion) { let mut group = c.benchmark_group("attributes"); group.bench_function("with_checks = true", |b| { b.iter(|| { - let mut r = 
Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes() { let _attr = attr.unwrap(); @@ -228,7 +207,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -236,12 +214,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("with_checks = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes().with_checks(false) { let _attr = attr.unwrap(); @@ -251,7 +228,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -259,12 +235,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("try_get_attribute", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = e.try_get_attribute(name).unwrap() { @@ -279,7 +254,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 150); }) diff --git a/examples/read_texts.rs b/examples/read_texts.rs index 40d71e63..70be0b5c 100644 --- a/examples/read_texts.rs +++ b/examples/read_texts.rs @@ -10,14 +10,13 @@ fn main() { reader.trim_text(true); let 
mut txt = Vec::new(); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => { txt.push( reader - .read_text_into(QName(b"tag2"), &mut Vec::new()) + .read_text(QName(b"tag2")) .expect("Cannot decode text value"), ); println!("{:?}", txt); @@ -26,6 +25,5 @@ fn main() { Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), _ => (), // There are several other `Event`s we do not consider here } - buf.clear(); } } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index f5a661b3..87c42c27 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -134,7 +134,7 @@ pub trait InnerReader: Deref + DerefMut { /// Test /// Test 2 /// "#; -/// let mut reader = Reader::from_str(xml); +/// let mut reader = Reader::from_reader(xml.as_bytes()); /// reader.trim_text(true); /// /// let mut count = 0; @@ -1903,13 +1903,12 @@ mod test { #[test] fn str_always_has_utf8() { let mut reader = crate::Reader::from_str(""); - let mut buf = Vec::new(); assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_into(&mut buf).unwrap(); + reader.read_event().unwrap(); assert_eq!(reader.decoder().encoding(), UTF_8); - assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event().unwrap(), Event::Eof); } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index da42ac4f..8c603f45 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -20,11 +20,11 @@ impl<'buf> Reader> { /// Read text into the given buffer, and return an event that borrows from /// either that buffer or from the input itself, based on the type of the /// reader. 
- fn read_event_impl(&mut self, _buf: &mut ()) -> Result> { + fn read_event_impl(&mut self) -> Result> { let event = match self.tag_state { - TagState::Init => self.read_until_open(&mut (), true), - TagState::Closed => self.read_until_open(&mut (), false), - TagState::Opened => self.read_until_close(&mut ()), + TagState::Init => self.read_until_open(true), + TagState::Closed => self.read_until_open(false), + TagState::Opened => self.read_until_close(), TagState::Empty => self.close_expanded_empty(), TagState::Exit => return Ok(Event::Eof), }; @@ -38,7 +38,7 @@ impl<'buf> Reader> { /// Read until '<' is found and moves reader to an `Opened` state. /// /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - fn read_until_open(&mut self, _buf: &mut (), first: bool) -> Result> { + fn read_until_open(&mut self, first: bool) -> Result> { self.tag_state = TagState::Opened; if self.trim_text_start { @@ -47,13 +47,10 @@ impl<'buf> Reader> { // If we already at the `<` symbol, do not try to return an empty Text event if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_impl(&mut ()); + return self.read_event_impl(); } - match self - .reader - .read_bytes_until(b'<', &mut (), &mut self.buf_position) - { + match self.reader.read_bytes_until(b'<', &mut self.buf_position) { Ok(Some(bytes)) => { #[cfg(feature = "encoding")] if first && self.encoding.can_be_refined() { @@ -86,43 +83,30 @@ impl<'buf> Reader> { /// Private function to read until `>` is found. This function expects that /// it was called just after encounter a `<` symbol. 
- fn read_until_close(&mut self, _buf: &mut ()) -> Result> { + fn read_until_close(&mut self) -> Result> { self.tag_state = TagState::Closed; match self.reader.peek_one() { // ` match self - .reader - .read_bang_element(&mut (), &mut self.buf_position) - { + Ok(Some(b'!')) => match self.reader.read_bang_element(&mut self.buf_position) { Ok(None) => Ok(Event::Eof), Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), Err(e) => Err(e), }, // ` { - match self - .reader - .read_bytes_until(b'>', &mut (), &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - } - } + Ok(Some(b'/')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, // ` { - match self - .reader - .read_bytes_until(b'>', &mut (), &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - } - } + Ok(Some(b'?')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(&mut (), &mut self.buf_position) { + Ok(Some(_)) => match self.reader.read_element(&mut self.buf_position) { Ok(None) => Ok(Event::Eof), Ok(Some(bytes)) => self.read_start(bytes), Err(e) => Err(e), @@ -160,14 +144,7 @@ impl<'buf> Reader> { /// Read an event that borrows from the input rather than a buffer. #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(&mut ()) - } - - /// Temporary helper to keep both `read_event` and `read_event_into` available for reading - /// from `&[u8]`. - #[inline] - pub fn read_event_into(&mut self, _buf: &mut Vec) -> Result> { - self.read_event() + self.read_event_impl() } /// Reads until end element is found. 
This function is supposed to be called @@ -262,12 +239,6 @@ impl<'buf> Reader> { } } - /// Temporary helper to keep both `read_to_end` and `read_to_end_into` available for reading - /// from `&[u8]`. - pub fn read_to_end_into(&mut self, end: QName, _buf: &mut Vec) -> Result<()> { - self.read_to_end(end) - } - /// Reads optional text between start and end tags. /// /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a @@ -316,12 +287,6 @@ impl<'buf> Reader> { Ok(s) } - /// Temporary helper to keep both `read_text` and `read_text_into` available for reading - /// from `&[u8]`. - pub fn read_text_into(&mut self, end: QName, _buf: &mut Vec) -> Result { - self.read_text(end) - } - /// Reads the next event and resolves its namespace (if applicable). /// /// # Examples @@ -412,12 +377,7 @@ impl<'buf> InnerReader for SliceReader<'buf> { /// Private reading functions for a [`SliceReader`]. impl<'buf> SliceReader<'buf> { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: &mut (), - position: &mut usize, - ) -> Result> { + fn read_bytes_until(&mut self, byte: u8, position: &mut usize) -> Result> { if self.0.is_empty() { return Ok(None); } @@ -437,7 +397,6 @@ impl<'buf> SliceReader<'buf> { fn read_bang_element( &mut self, - _buf: &mut (), position: &mut usize, ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to @@ -457,7 +416,7 @@ impl<'buf> SliceReader<'buf> { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { + fn read_element(&mut self, position: &mut usize) -> Result> { if self.0.is_empty() { return Ok(None); } @@ -521,5 +480,5 @@ mod test { Reader::from_bytes(s) } - check!(let mut buf = ();); + check!(); } From c972101c513e4d10ffd17a67e19d74b545994036 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 11:26:55 +0200 Subject: [PATCH 7/8] Add example for buffered access when reading from a file --- examples/read_buffered.rs | 34 
++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/read_buffered.rs diff --git a/examples/read_buffered.rs b/examples/read_buffered.rs new file mode 100644 index 00000000..25b28ee2 --- /dev/null +++ b/examples/read_buffered.rs @@ -0,0 +1,34 @@ +// This example demonstrates how a reader (for example when reading from a file) +// can be buffered. In that case, data read from the file is written to a supplied +// buffer and returned XML events borrow from that buffer. +// That way, allocations can be kept to a minimum. + +fn main() -> Result<(), quick_xml::Error> { + use quick_xml::events::Event; + use quick_xml::Reader; + + let mut reader = Reader::from_file("tests/documents/document.xml")?; + reader.trim_text(true); + + let mut buf = Vec::new(); + + let mut count = 0; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + let name = e.name(); + let name = reader.decoder().decode(name.as_ref())?; + println!("read start event {:?}", name.as_ref()); + count += 1; + } + Ok(Event::Eof) => break, // exits the loop when reaching end of file + Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + _ => (), // There are several other `Event`s we do not consider here + } + } + + println!("read {} start events in total", count); + + Ok(()) +} From c3a07b6fb633f938f9828521495fbadde0164875 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Fri, 22 Jul 2022 21:13:27 +0200 Subject: [PATCH 8/8] Add changelog entry --- Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog.md b/Changelog.md index c3053257..e1cb2973 100644 --- a/Changelog.md +++ b/Changelog.md @@ -137,6 +137,9 @@ - [#423]: All escaping functions now accepts and returns strings instead of byte slices - [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array, but since now escaping works on strings. 
Use `BytesText::from_plain_str` instead +- [#425]: Split the internal implementation of `Reader` into multiple files to better separate the + buffered and unbuffered implementations. The buffered methods, e.g. `read_event_into(&mut buf)`, + are no longer available when reading from a slice; use the unbuffered counterparts, e.g. `read_event()`, instead. ### New Tests @@ -167,6 +170,7 @@ [#418]: https://github.com/tafia/quick-xml/pull/418 [#421]: https://github.com/tafia/quick-xml/pull/421 [#423]: https://github.com/tafia/quick-xml/pull/423 +[#425]: https://github.com/tafia/quick-xml/pull/425 ## 0.23.0 -- 2022-05-08