diff --git a/Changelog.md b/Changelog.md
index c3053257..f38f18b8 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -27,6 +27,8 @@
the XML declared encoding and always use UTF-8
- [#416]: Add `borrow()` methods in all event structs which allows to get
a borrowed version of any event
+- [#436]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
+ under the `quick_xml::encoding` namespace.
### Bug Fixes
@@ -137,6 +139,13 @@
- [#423]: All escaping functions now accepts and returns strings instead of byte slices
- [#423]: Removed `BytesText::from_plain` because it internally did escaping of a byte array,
but since now escaping works on strings. Use `BytesText::from_plain_str` instead
+- [#425]: Split the internal implementation of `Reader` into multiple files to better separate the
+ buffered and unbuffered implementations. The unbuffered methods, e.g. `read_event()`,
+ will no longer be available when reading from a slice.
+- [#436]: When using `Reader` with raw bytes, a buffered parsing implementation will always be used.
+ If using `Reader::from_str()`, the reader will borrow directly from the `&str`. If you have a byte
+ array known to be valid UTF-8, it is recommended to convert it to `&str` first, which will enable
+ the unbuffered (borrowing) implementation.
### New Tests
@@ -167,6 +176,8 @@
[#418]: https://github.com/tafia/quick-xml/pull/418
[#421]: https://github.com/tafia/quick-xml/pull/421
[#423]: https://github.com/tafia/quick-xml/pull/423
+[#425]: https://github.com/tafia/quick-xml/pull/425
+[#436]: https://github.com/tafia/quick-xml/pull/436
## 0.23.0 -- 2022-05-08
diff --git a/README.md b/README.md
index 6fa273ee..031c0175 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,11 @@ let xml = r#"TestTest 2"#;
-let mut reader = Reader::from_str(xml);
+let mut reader = Reader::from_reader(xml.as_bytes());
+// If you want to read from a string or byte slice without buffering, use:
+// let mut reader = Reader::from_str(xml);
+// In that case, `Vec` is *not* needed for buffering below and you should use
+// `read_event` instead of `read_event_into`.
reader.trim_text(true);
let mut count = 0;
diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
index 3358f3a4..358c086a 100644
--- a/benches/macrobenches.rs
+++ b/benches/macrobenches.rs
@@ -3,23 +3,23 @@ use quick_xml::events::Event;
use quick_xml::Reader;
use quick_xml::Result as XmlResult;
-static RPM_PRIMARY: &[u8] = include_bytes!("../tests/documents/rpm_primary.xml");
-static RPM_PRIMARY2: &[u8] = include_bytes!("../tests/documents/rpm_primary2.xml");
-static RPM_FILELISTS: &[u8] = include_bytes!("../tests/documents/rpm_filelists.xml");
-static RPM_OTHER: &[u8] = include_bytes!("../tests/documents/rpm_other.xml");
-static LIBREOFFICE_DOCUMENT: &[u8] = include_bytes!("../tests/documents/libreoffice_document.fodt");
-static DOCUMENT: &[u8] = include_bytes!("../tests/documents/document.xml");
-static TEST_WRITER_INDENT: &[u8] = include_bytes!("../tests/documents/test_writer_indent.xml");
-static SAMPLE_1: &[u8] = include_bytes!("../tests/documents/sample_1.xml");
-static LINESCORE: &[u8] = include_bytes!("../tests/documents/linescore.xml");
-static SAMPLE_RSS: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
-static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
-static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
+static RPM_PRIMARY: &str = include_str!("../tests/documents/rpm_primary.xml");
+static RPM_PRIMARY2: &str = include_str!("../tests/documents/rpm_primary2.xml");
+static RPM_FILELISTS: &str = include_str!("../tests/documents/rpm_filelists.xml");
+static RPM_OTHER: &str = include_str!("../tests/documents/rpm_other.xml");
+static LIBREOFFICE_DOCUMENT: &str = include_str!("../tests/documents/libreoffice_document.fodt");
+static DOCUMENT: &str = include_str!("../tests/documents/document.xml");
+static TEST_WRITER_INDENT: &str = include_str!("../tests/documents/test_writer_indent.xml");
+static SAMPLE_1: &str = include_str!("../tests/documents/sample_1.xml");
+static LINESCORE: &str = include_str!("../tests/documents/linescore.xml");
+static SAMPLE_RSS: &str = include_str!("../tests/documents/sample_rss.xml");
+static SAMPLE_NS: &str = include_str!("../tests/documents/sample_ns.xml");
+static PLAYERS: &str = include_str!("../tests/documents/players.xml");
// TODO: read the namespaces too
// TODO: use fully normalized attribute values
-fn parse_document(doc: &[u8]) -> XmlResult<()> {
- let mut r = Reader::from_reader(doc);
+fn parse_document(doc: &str) -> XmlResult<()> {
+ let mut r = Reader::from_str(doc);
loop {
match r.read_event()? {
Event::Start(e) | Event::Empty(e) => {
diff --git a/benches/microbenches.rs b/benches/microbenches.rs
index 8bbe1a67..75b08fa1 100644
--- a/benches/microbenches.rs
+++ b/benches/microbenches.rs
@@ -5,8 +5,8 @@ use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader;
-static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
-static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
+static SAMPLE: &str = include_str!("../tests/documents/sample_rss.xml");
+static PLAYERS: &str = include_str!("../tests/documents/players.xml");
static LOREM_IPSUM_TEXT: &str =
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
@@ -29,17 +29,15 @@ fn read_event(c: &mut Criterion) {
let mut group = c.benchmark_group("read_event");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(SAMPLE);
+ let mut r = Reader::from_str(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
loop {
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Eof) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(
count, 1550,
@@ -50,19 +48,17 @@ fn read_event(c: &mut Criterion) {
group.bench_function("trim_text = true", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(SAMPLE);
+ let mut r = Reader::from_str(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
loop {
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Eof) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(
count, 1550,
@@ -79,18 +75,16 @@ fn read_namespaced_event(c: &mut Criterion) {
let mut group = c.benchmark_group("read_namespaced_event");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(SAMPLE);
+ let mut r = Reader::from_str(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
let mut ns_buf = Vec::new();
loop {
- match r.read_namespaced_event(&mut buf, &mut ns_buf) {
+ match r.read_namespaced_event(&mut ns_buf) {
Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1,
Ok((_, Event::Eof)) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(
count, 1550,
@@ -101,20 +95,18 @@ fn read_namespaced_event(c: &mut Criterion) {
group.bench_function("trim_text = true", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(SAMPLE);
+ let mut r = Reader::from_str(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
let mut ns_buf = Vec::new();
loop {
- match r.read_namespaced_event(&mut buf, &mut ns_buf) {
+ match r.read_namespaced_event(&mut ns_buf) {
Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1,
Ok((_, Event::Eof)) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(
count, 1550,
@@ -129,79 +121,67 @@ fn read_namespaced_event(c: &mut Criterion) {
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
group.bench_function("StartText", |b| {
- let src = "Hello world!".repeat(512 / 12).into_bytes();
- let mut buf = Vec::with_capacity(1024);
+ let src = "Hello world!".repeat(512 / 12);
b.iter(|| {
- let mut r = Reader::from_reader(src.as_ref());
+ let mut r = Reader::from_str(&src);
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false).check_comments(false);
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::StartText(e)) => nbtxt += e.len(),
something_else => panic!("Did not expect {:?}", something_else),
};
- buf.clear();
-
assert_eq!(nbtxt, 504);
})
});
group.bench_function("Start", |b| {
- let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes();
- let mut buf = Vec::with_capacity(1024);
+ let src = format!(r#""#, "world".repeat(512 / 5));
b.iter(|| {
- let mut r = Reader::from_reader(src.as_ref());
+ let mut r = Reader::from_str(&src);
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Start(ref e)) => nbtxt += e.len(),
something_else => panic!("Did not expect {:?}", something_else),
};
- buf.clear();
-
assert_eq!(nbtxt, 525);
})
});
group.bench_function("Comment", |b| {
- let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes();
- let mut buf = Vec::with_capacity(1024);
+ let src = format!(r#""#, "world".repeat(512 / 5));
b.iter(|| {
- let mut r = Reader::from_reader(src.as_ref());
+ let mut r = Reader::from_str(&src);
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};
- buf.clear();
-
assert_eq!(nbtxt, 520);
})
});
group.bench_function("CData", |b| {
- let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes();
- let mut buf = Vec::with_capacity(1024);
+ let src = format!(r#""#, "world".repeat(512 / 5));
b.iter(|| {
- let mut r = Reader::from_reader(src.as_ref());
+ let mut r = Reader::from_str(&src);
let mut nbtxt = criterion::black_box(0);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::CData(ref e)) => nbtxt += e.len(),
something_else => panic!("Did not expect {:?}", something_else),
};
- buf.clear();
-
assert_eq!(nbtxt, 518);
})
});
@@ -213,12 +193,11 @@ fn attributes(c: &mut Criterion) {
let mut group = c.benchmark_group("attributes");
group.bench_function("with_checks = true", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(PLAYERS);
+ let mut r = Reader::from_str(PLAYERS);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
loop {
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Empty(e)) => {
for attr in e.attributes() {
let _attr = attr.unwrap();
@@ -228,7 +207,6 @@ fn attributes(c: &mut Criterion) {
Ok(Event::Eof) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(count, 1041);
})
@@ -236,12 +214,11 @@ fn attributes(c: &mut Criterion) {
group.bench_function("with_checks = false", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(PLAYERS);
+ let mut r = Reader::from_str(PLAYERS);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
loop {
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Empty(e)) => {
for attr in e.attributes().with_checks(false) {
let _attr = attr.unwrap();
@@ -251,7 +228,6 @@ fn attributes(c: &mut Criterion) {
Ok(Event::Eof) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(count, 1041);
})
@@ -259,12 +235,11 @@ fn attributes(c: &mut Criterion) {
group.bench_function("try_get_attribute", |b| {
b.iter(|| {
- let mut r = Reader::from_reader(PLAYERS);
+ let mut r = Reader::from_str(PLAYERS);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
- let mut buf = Vec::new();
loop {
- match r.read_event_into(&mut buf) {
+ match r.read_event() {
Ok(Event::Empty(e)) if e.name() == QName(b"player") => {
for name in ["num", "status", "avg"] {
if let Some(_attr) = e.try_get_attribute(name).unwrap() {
@@ -279,7 +254,6 @@ fn attributes(c: &mut Criterion) {
Ok(Event::Eof) => break,
_ => (),
}
- buf.clear();
}
assert_eq!(count, 150);
})
diff --git a/examples/read_buffered.rs b/examples/read_buffered.rs
new file mode 100644
index 00000000..25b28ee2
--- /dev/null
+++ b/examples/read_buffered.rs
@@ -0,0 +1,34 @@
+// This example demonstrates how a reader (for example when reading from a file)
+// can be buffered. In that case, data read from the file is written to a supplied
+// buffer and returned XML events borrow from that buffer.
+// That way, allocations can be kept to a minimum.
+
+fn main() -> Result<(), quick_xml::Error> {
+    use quick_xml::events::Event;
+    use quick_xml::Reader;
+
+    let mut reader = Reader::from_file("tests/documents/document.xml")?;
+    reader.trim_text(true);
+
+    let mut buf = Vec::new();
+    let mut count = 0;
+
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Start(ref e)) => {
+                let name = e.name();
+                let name = reader.decoder().decode(name.as_ref())?;
+                println!("read start event {:?}", name.as_ref());
+                count += 1;
+            }
+            Ok(Event::Eof) => break, // exits the loop when reaching end of file
+            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
+            _ => (), // There are several other `Event`s we do not consider here
+        }
+        // The event is no longer borrowed here; clear `buf` so its allocation is reused.
+        buf.clear();
+    }
+
+    println!("read {} start events in total", count);
+    Ok(())
+}
diff --git a/examples/read_texts.rs b/examples/read_texts.rs
index 40d71e63..70be0b5c 100644
--- a/examples/read_texts.rs
+++ b/examples/read_texts.rs
@@ -10,14 +10,13 @@ fn main() {
reader.trim_text(true);
let mut txt = Vec::new();
- let mut buf = Vec::new();
loop {
- match reader.read_event_into(&mut buf) {
+ match reader.read_event() {
Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => {
txt.push(
reader
- .read_text_into(QName(b"tag2"), &mut Vec::new())
+ .read_text(QName(b"tag2"))
.expect("Cannot decode text value"),
);
println!("{:?}", txt);
@@ -26,6 +25,5 @@ fn main() {
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
_ => (), // There are several other `Event`s we do not consider here
}
- buf.clear();
}
}
diff --git a/src/de/escape.rs b/src/de/escape.rs
index badc3299..e9eff985 100644
--- a/src/de/escape.rs
+++ b/src/de/escape.rs
@@ -1,9 +1,9 @@
//! Serde `Deserializer` module
use crate::de::deserialize_bool;
+use crate::encoding::Decoder;
use crate::errors::serialize::DeError;
use crate::escape::unescape;
-use crate::reader::Decoder;
use serde::de::{DeserializeSeed, EnumAccess, VariantAccess, Visitor};
use serde::{self, forward_to_deserialize_any, serde_if_integer128};
use std::borrow::Cow;
diff --git a/src/de/mod.rs b/src/de/mod.rs
index e564e041..fd3caa46 100644
--- a/src/de/mod.rs
+++ b/src/de/mod.rs
@@ -215,10 +215,10 @@ mod var;
pub use crate::errors::serialize::DeError;
use crate::{
+ encoding::Decoder,
errors::Error,
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
name::QName,
- reader::Decoder,
Reader,
};
use serde::de::{self, Deserialize, DeserializeOwned, Visitor};
@@ -306,8 +306,8 @@ where
}
/// Deserialize from a reader. This method will do internal copies of data
-/// readed from `reader`. If you want have a `&[u8]` or `&str` input and want
-/// to borrow as much as possible, use [`from_slice`] or [`from_str`]
+/// read from `reader`. If you have a `&str` input and want
+/// to borrow as much as possible, use [`from_str`]
pub fn from_reader(reader: R) -> Result
where
R: BufRead,
@@ -685,17 +685,7 @@ where
impl<'de> Deserializer<'de, SliceReader<'de>> {
/// Create new deserializer that will borrow data from the specified string
pub fn from_str(s: &'de str) -> Self {
- Self::from_borrowing_reader(Reader::from_str(s))
- }
-
- /// Create new deserializer that will borrow data from the specified byte array
- pub fn from_slice(bytes: &'de [u8]) -> Self {
- Self::from_borrowing_reader(Reader::from_bytes(bytes))
- }
-
- /// Create new deserializer that will borrow data from the specified borrowing reader
- #[inline]
- fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self {
+ let mut reader = Reader::from_str(s);
reader
.expand_empty_elements(true)
.check_end_names(true)
@@ -726,6 +716,13 @@ where
}
}
+impl<'de> Deserializer<'de, IoReader<&'de [u8]>> {
+ /// Create new deserializer that will borrow data from the specified byte array
+ pub fn from_slice(bytes: &'de [u8]) -> Self {
+ Self::from_reader(bytes)
+ }
+}
+
impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R>
where
R: XmlRead<'de>,
@@ -930,7 +927,7 @@ pub trait XmlRead<'i> {
/// You cannot create it, it is created automatically when you call
/// [`Deserializer::from_reader`]
pub struct IoReader {
- reader: Reader,
+ reader: Reader>,
buf: Vec,
}
@@ -970,12 +967,12 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader {
}
}
-/// XML input source that reads from a slice of bytes and can borrow from it.
+/// XML input source that reads from a `&str` and can borrow from it.
///
/// You cannot create it, it is created automatically when you call
-/// [`Deserializer::from_str`] or [`Deserializer::from_slice`]
+/// [`Deserializer::from_str`]
pub struct SliceReader<'de> {
- reader: Reader<&'de [u8]>,
+ reader: Reader>,
}
impl<'de> XmlRead<'de> for SliceReader<'de> {
@@ -1025,8 +1022,8 @@ mod tests {
/// Checks that `peek()` and `read()` behaves correctly after `skip()`
#[test]
fn read_and_peek() {
- let mut de = Deserializer::from_slice(
- br#"
+ let mut de = Deserializer::from_str(
+ r#"
text
@@ -1166,8 +1163,8 @@ mod tests {
/// Checks that `read_to_end()` behaves correctly after `skip()`
#[test]
fn read_to_end() {
- let mut de = Deserializer::from_slice(
- br#"
+ let mut de = Deserializer::from_str(
+ r#"
text
@@ -1270,8 +1267,8 @@ mod tests {
item: Vec<()>,
}
- let mut de = Deserializer::from_slice(
- br#"
+ let mut de = Deserializer::from_str(
+ r#"
@@ -1296,8 +1293,8 @@ mod tests {
fn read_to_end() {
use crate::de::DeEvent::*;
- let mut de = Deserializer::from_slice(
- br#"
+ let mut de = Deserializer::from_str(
+ r#"
textcontent
@@ -1343,15 +1340,14 @@ mod tests {
Some text
- "##
- .as_bytes();
+ "##;
let mut reader1 = IoReader {
- reader: Reader::from_reader(s),
+ reader: Reader::from_reader(s.as_bytes()),
buf: Vec::new(),
};
let mut reader2 = SliceReader {
- reader: Reader::from_bytes(s),
+ reader: Reader::from_str(s),
};
loop {
@@ -1373,11 +1369,10 @@ mod tests {
- "##
- .as_bytes();
+ "##;
let mut reader = SliceReader {
- reader: Reader::from_bytes(s),
+ reader: Reader::from_str(s),
};
reader
diff --git a/src/de/seq.rs b/src/de/seq.rs
index fe4559bd..d7595632 100644
--- a/src/de/seq.rs
+++ b/src/de/seq.rs
@@ -1,6 +1,6 @@
use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
+use crate::encoding::Decoder;
use crate::events::BytesStart;
-use crate::reader::Decoder;
use serde::de::{DeserializeSeed, SeqAccess};
/// Check if tag `start` is included in the `fields` list. `decoder` is used to
diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs
index dc0b157a..580c6312 100644
--- a/src/de/simple_type.rs
+++ b/src/de/simple_type.rs
@@ -4,9 +4,9 @@
//! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition
use crate::de::{deserialize_bool, str2bool};
+use crate::encoding::Decoder;
use crate::errors::serialize::DeError;
use crate::escape::unescape;
-use crate::reader::Decoder;
use memchr::memchr;
use serde::de::{DeserializeSeed, Deserializer, EnumAccess, SeqAccess, VariantAccess, Visitor};
use serde::{self, serde_if_integer128};
diff --git a/src/encoding.rs b/src/encoding.rs
new file mode 100644
index 00000000..0b98a209
--- /dev/null
+++ b/src/encoding.rs
@@ -0,0 +1,200 @@
+//! A module for wrappers that encode / decode data.
+
+use std::borrow::Cow;
+
+#[cfg(feature = "encoding")]
+use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
+
+use crate::{Error, Result};
+
+/// Decoder of byte slices into strings. This is a lightweight object that can be copied.
+///
+/// If feature `encoding` is enabled, the encoding is taken from the `"encoding"` key
+/// of the XML declaration; UTF-8 is assumed if the XML has no declaration, the
+/// `"encoding"` key is not defined, or it contains an unknown encoding.
+///
+/// The library supports any UTF-8 compatible encoding that the `encoding_rs` crate
+/// supports. [*UTF-16 is not supported at present*][utf16].
+///
+/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder:
+/// any XML declarations are ignored.
+///
+/// [utf16]: https://github.com/tafia/quick-xml/issues/158
+#[derive(Clone, Copy, Debug)]
+pub struct Decoder {
+ #[cfg(feature = "encoding")]
+ pub(crate) encoding: &'static Encoding,
+}
+
+#[cfg(not(feature = "encoding"))]
+impl Decoder {
+ /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if
+ /// it is present in the `bytes`.
+ ///
+ /// Returns an error in case of malformed sequences in the `bytes`.
+ ///
+ /// If you instead want to use XML declared encoding, use the `encoding` feature
+ #[inline]
+ pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> {
+ Ok(Cow::Borrowed(std::str::from_utf8(bytes)?))
+ }
+
+ /// Decodes a slice regardless of XML declaration with BOM removal if
+ /// it is present in the `bytes`.
+ ///
+ /// Returns an error in case of malformed sequences in the `bytes`.
+ ///
+ /// If you instead want to use XML declared encoding, use the `encoding` feature
+ pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> {
+ let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
+ &bytes[3..]
+ } else {
+ bytes
+ };
+ self.decode(bytes)
+ }
+}
+
+#[cfg(feature = "encoding")]
+impl Decoder {
+ /// Returns the `Reader`s encoding.
+ ///
+ /// This encoding will be used by [`decode`].
+ ///
+ /// [`decode`]: Self::decode
+ pub fn encoding(&self) -> &'static Encoding {
+ self.encoding
+ }
+
+ /// Decodes specified bytes using encoding, declared in the XML, if it was
+ /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
+ /// in the `bytes`.
+ ///
+ /// Returns an error in case of malformed sequences in the `bytes`.
+ pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> {
+ decode(bytes, self.encoding)
+ }
+
+ /// Decodes a slice with BOM removal if it is present in the `bytes` using
+ /// the reader encoding.
+ ///
+ /// If this method is called after reading an XML declaration with the `"encoding"`
+ /// key, then that encoding is used, otherwise UTF-8 is used.
+ ///
+ /// If XML declaration is absent in the XML, UTF-8 is used.
+ ///
+ /// Returns an error in case of malformed sequences in the `bytes`.
+ pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> {
+ self.decode(remove_bom(bytes, self.encoding))
+ }
+}
+
+/// Decodes the provided bytes using the specified encoding, ignoring the BOM
+/// if it is present in the `bytes`.
+///
+/// Returns an error in case of malformed sequences in the `bytes`.
+#[cfg(feature = "encoding")]
+pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result> {
+ encoding
+ .decode_without_bom_handling_and_without_replacement(bytes)
+ .ok_or(Error::NonDecodable(None))
+}
+
+/// Decodes a slice with an unknown encoding, removing the BOM if it is present
+/// in the bytes.
+///
+/// Returns an error in case of malformed sequences in the `bytes`.
+#[cfg(feature = "encoding")]
+pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result> {
+ if let Some(encoding) = detect_encoding(bytes) {
+ let bytes = remove_bom(bytes, encoding);
+ decode(bytes, encoding)
+ } else {
+ decode(bytes, UTF_8)
+ }
+}
+
+#[cfg(feature = "encoding")]
+fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
+ if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
+ bytes.split_at(3)
+ } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
+ bytes.split_at(2)
+ } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
+ bytes.split_at(2)
+ } else {
+ (&[], bytes)
+ }
+}
+
+#[cfg(feature = "encoding")]
+fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
+ let (_, bytes) = split_at_bom(bytes, encoding);
+ bytes
+}
+
+/// This implementation is required for tests of other parts of the library
+#[cfg(test)]
+#[cfg(feature = "serialize")]
+impl Decoder {
+ pub(crate) fn utf8() -> Self {
+ Decoder {
+ #[cfg(feature = "encoding")]
+ encoding: UTF_8,
+ }
+ }
+
+ #[cfg(feature = "encoding")]
+ pub(crate) fn utf16() -> Self {
+ Decoder { encoding: UTF_16LE }
+ }
+}
+
+/// Automatic encoding detection of XML files using the
+/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
+///
+/// The algorithm suggests examining up to the first 4 bytes to determine encoding
+/// according to the following table:
+///
+/// | Bytes |Detected encoding
+/// |-------------|------------------------------------------
+/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order)
+/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order)
+/// |`00 00 FF FE`|UCS-4, unusual octet order (2143)
+/// |`FE FF 00 00`|UCS-4, unusual octet order (3412)
+/// |`FE FF ## ##`|UTF-16, big-endian
+/// |`FF FE ## ##`|UTF-16, little-endian
+/// |`EF BB BF` |UTF-8
+/// |-------------|------------------------------------------
+/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234)
+/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321)
+/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143)
+/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412)
+/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
+/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
+/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
+/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use)
+/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind
+///
+/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only
+/// that supported subset is detected: UTF-8, UTF-16 BE and UTF-16 LE.
+///
+/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
+#[cfg(feature = "encoding")]
+pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
+ match bytes {
+ // with BOM
+ _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
+ _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
+ _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
+
+ // without BOM
+ _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
+ _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
+ _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
+
+ _ => None,
+ }
+}
+
+// TODO: add tests for these functions
diff --git a/src/events/mod.rs b/src/events/mod.rs
index b2672edf..54bb8654 100644
--- a/src/events/mod.rs
+++ b/src/events/mod.rs
@@ -41,10 +41,11 @@ use std::fmt::{self, Debug, Formatter};
use std::ops::Deref;
use std::str::from_utf8;
+use crate::encoding::Decoder;
use crate::errors::{Error, Result};
use crate::escape::{escape, partial_escape, unescape_with};
use crate::name::{LocalName, QName};
-use crate::reader::{Decoder, Reader};
+use crate::reader::Reader;
use crate::utils::write_cow_string;
use attributes::{Attribute, Attributes};
@@ -983,8 +984,9 @@ pub enum Event<'a> {
/// let xml = b"\xEF\xBB\xBF";
/// let mut reader = Reader::from_bytes(xml);
/// let mut events_processed = 0;
+ /// let mut event_buffer = Vec::new();
/// loop {
- /// match reader.read_event() {
+ /// match reader.read_event_into(&mut event_buffer) {
/// Ok(Event::StartText(e)) => {
/// assert_eq!(events_processed, 0);
/// // Content contains BOM
diff --git a/src/lib.rs b/src/lib.rs
index f42ae359..84579845 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,6 +44,7 @@
#[cfg(feature = "serialize")]
pub mod de;
+pub mod encoding;
mod errors;
mod escapei;
pub mod escape {
@@ -62,8 +63,9 @@ pub mod utils;
mod writer;
// reexports
+pub use crate::encoding::Decoder;
#[cfg(feature = "serialize")]
pub use crate::errors::serialize::DeError;
pub use crate::errors::{Error, Result};
-pub use crate::reader::{Decoder, Reader};
+pub use crate::reader::{BufferedReader, Reader, SliceReader};
pub use crate::writer::{ElementWriter, Writer};
diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs
new file mode 100644
index 00000000..e9ced5f3
--- /dev/null
+++ b/src/reader/buffered_reader.rs
@@ -0,0 +1,725 @@
+//! This is an implementation of [`Reader`] for reading from a [`Read`] or [`BufRead`] as
+//! underlying byte stream.
+
+use std::fs::File;
+use std::io::{self, BufRead, BufReader, Read};
+use std::ops::{Deref, DerefMut};
+use std::path::Path;
+
+use crate::events::{BytesText, Event};
+use crate::name::{QName, ResolveResult};
+use crate::{Error, Result};
+
+#[cfg(feature = "encoding")]
+use crate::encoding::detect_encoding;
+#[cfg(feature = "encoding")]
+use crate::reader::EncodingRef;
+use crate::reader::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState};
+
+/// Private functions for a [`Reader`] based on a [`BufferedReader`].
+impl Reader> {
+ /// Read text into the given buffer, and return an event that borrows from
+ /// either that buffer or from the input itself, based on the type of the
+ /// reader.
+ fn read_event_impl<'buf>(&mut self, buf: &'buf mut Vec) -> Result> {
+ let event = match self.tag_state {
+ TagState::Init => self.read_until_open(buf, true),
+ TagState::Closed => self.read_until_open(buf, false),
+ TagState::Opened => self.read_until_close(buf),
+ TagState::Empty => self.close_expanded_empty(),
+ TagState::Exit => return Ok(Event::Eof),
+ };
+ match event {
+ Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit,
+ _ => {}
+ }
+ event
+ }
+
+ /// Read until '<' is found and moves reader to an `Opened` state.
+ ///
+ /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
+ fn read_until_open<'buf>(
+ &mut self,
+ buf: &'buf mut Vec,
+ first: bool,
+ ) -> Result> {
+ self.tag_state = TagState::Opened;
+
+ if self.trim_text_start {
+ self.reader.skip_whitespace(&mut self.buf_position)?;
+ }
+
+ // If we are already at the `<` symbol, do not try to return an empty Text event
+ if self.reader.skip_one(b'<', &mut self.buf_position)? {
+ return self.read_event_impl(buf);
+ }
+
+ match self
+ .reader
+ .read_bytes_until(b'<', buf, &mut self.buf_position)
+ {
+ Ok(Some(bytes)) => {
+ #[cfg(feature = "encoding")]
+ if first && self.encoding.can_be_refined() {
+ if let Some(encoding) = detect_encoding(bytes) {
+ self.encoding = EncodingRef::BomDetected(encoding);
+ }
+ }
+
+ let content = if self.trim_text_end {
+ // Drop trailing whitespace before the ending '<'
+ let len = bytes
+ .iter()
+ .rposition(|&b| !is_whitespace(b))
+ .map_or_else(|| bytes.len(), |p| p + 1);
+ &bytes[..len]
+ } else {
+ bytes
+ };
+
+ Ok(if first {
+ Event::StartText(BytesText::from_escaped(content).into())
+ } else {
+ Event::Text(BytesText::from_escaped(content))
+ })
+ }
+ Ok(None) => Ok(Event::Eof),
+ Err(e) => Err(e),
+ }
+ }
+
+ /// Private function to read until `>` is found. This function expects that
+ /// it was called just after encountering a `<` symbol.
+ fn read_until_close<'buf>(&mut self, buf: &'buf mut Vec) -> Result> {
+ self.tag_state = TagState::Closed;
+
+ match self.reader.peek_one() {
+ // ` match self.reader.read_bang_element(buf, &mut self.buf_position) {
+ Ok(None) => Ok(Event::Eof),
+ Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes),
+ Err(e) => Err(e),
+ },
+ // `</` - closing tag
+ Ok(Some(b'/')) => match self
+ .reader
+ .read_bytes_until(b'>', buf, &mut self.buf_position)
+ {
+ Ok(None) => Ok(Event::Eof),
+ Ok(Some(bytes)) => self.read_end(bytes),
+ Err(e) => Err(e),
+ },
+ // `<?` - processing instruction
+ Ok(Some(b'?')) => match self
+ .reader
+ .read_bytes_until(b'>', buf, &mut self.buf_position)
+ {
+ Ok(None) => Ok(Event::Eof),
+ Ok(Some(bytes)) => self.read_question_mark(bytes),
+ Err(e) => Err(e),
+ },
+ // `<...` - opening or self-closed tag
+ Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) {
+ Ok(None) => Ok(Event::Eof),
+ Ok(Some(bytes)) => self.read_start(bytes),
+ Err(e) => Err(e),
+ },
+ Ok(None) => Ok(Event::Eof),
+ Err(e) => Err(e),
+ }
+ }
+}
+
+/// Public reading methods for a [`Reader`] based on a [`BufferedReader`].
+impl Reader> {
+ /// Reads the next `Event`.
+ ///
+ /// This is the main entry point for reading XML `Event`s.
+ ///
+ /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
+ /// internally).
+ ///
+ /// Having the possibility to control the internal buffers gives you some additional benefits
+ /// such as:
+ ///
+ /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
+ /// you can call `buf.clear()` once you are done with processing the event (typically at the
+ /// end of your loop).
+ /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use quick_xml::Reader;
+ /// use quick_xml::events::Event;
+ ///
+ /// let xml = r#"
+ /// Test
+ /// Test 2
+ /// "#;
+ /// // This explicitly uses `from_reader(xml.as_bytes())` to use a buffered reader instead of
+ /// // relying on the zero-copy optimizations for reading from byte slices.
+ /// let mut reader = Reader::from_reader(xml.as_bytes());
+ /// reader.trim_text(true);
+ /// let mut count = 0;
+ /// let mut buf = Vec::new();
+ /// let mut txt = Vec::new();
+ /// loop {
+ /// match reader.read_event_into(&mut buf) {
+ /// Ok(Event::Start(ref e)) => count += 1,
+ /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()),
+ /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
+ /// Ok(Event::Eof) => break,
+ /// _ => (),
+ /// }
+ /// buf.clear();
+ /// }
+ /// println!("Found {} start events", count);
+ /// println!("Text events: {:?}", txt);
+ /// ```
+ #[inline]
+ pub fn read_event_into<'buf>(&mut self, buf: &'buf mut Vec) -> Result> {
+ self.read_event_impl(buf)
+ }
+
+ /// Reads the next event and resolves its namespace (if applicable).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::str::from_utf8;
+ /// use quick_xml::Reader;
+ /// use quick_xml::events::Event;
+ /// use quick_xml::name::ResolveResult::*;
+ ///
+ /// let xml = r#"
+ /// Test
+ /// Test 2
+ /// "#;
+ /// let mut reader = Reader::from_reader(xml.as_bytes());
+ /// reader.trim_text(true);
+ /// let mut count = 0;
+ /// let mut buf = Vec::new();
+ /// let mut ns_buf = Vec::new();
+ /// let mut txt = Vec::new();
+ /// loop {
+ /// match reader.read_namespaced_event_into(&mut buf, &mut ns_buf) {
+ /// Ok((Bound(ns), Event::Start(e))) => {
+ /// count += 1;
+ /// match (ns.as_ref(), e.local_name().as_ref()) {
+ /// (b"www.xxxx", b"tag1") => (),
+ /// (b"www.yyyy", b"tag2") => (),
+ /// (ns, n) => panic!("Namespace and local name mismatch"),
+ /// }
+ /// println!("Resolved namespace: {:?}", ns);
+ /// }
+ /// Ok((Unbound, Event::Start(_))) => {
+ /// panic!("Element not in any namespace")
+ /// },
+ /// Ok((Unknown(p), Event::Start(_))) => {
+ /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p))
+ /// }
+ /// Ok((_, Event::Text(e))) => {
+ /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned())
+ /// },
+ /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
+ /// Ok((_, Event::Eof)) => break,
+ /// _ => (),
+ /// }
+ /// buf.clear();
+ /// }
+ /// println!("Found {} start events", count);
+ /// println!("Text events: {:?}", txt);
+ /// ```
+ pub fn read_namespaced_event_into<'b, 'ns>(
+ &mut self,
+ buf: &'b mut Vec,
+ namespace_buffer: &'ns mut Vec,
+ ) -> Result<(ResolveResult<'ns>, Event<'b>)> {
+ if self.pending_pop {
+ self.ns_resolver.pop(namespace_buffer);
+ }
+ self.pending_pop = false;
+ let event = self.read_event_into(buf);
+ self.resolve_namespaced_event_inner(event, namespace_buffer)
+ }
+
+ /// Reads until end element is found using provided buffer as intermediate
+ /// storage for events content. This function is supposed to be called after
+ /// you already read a [`Start`] event.
+ ///
+ /// Manages nested cases where parent and child elements have the same name.
+ ///
+ /// If the corresponding [`End`] event is not found, the [`Error::UnexpectedEof`]
+ /// will be returned. In particular, that error will be returned if you call
+ /// this method without consuming the corresponding [`Start`] event first.
+ ///
+ /// If your reader is created from a string slice or byte array slice, it is
+ /// better to use [`read_to_end()`] method, because it will not copy bytes
+ /// into intermediate buffer.
+ ///
+ /// The provided `buf` buffer will be filled only by one event content at time.
+ /// Before reading of each event the buffer will be cleared. If you know an
+ /// appropriate size of each event, you can preallocate the buffer to reduce
+ /// number of reallocations.
+ ///
+ /// The `end` parameter should contain name of the end element _in the reader
+ /// encoding_. It is good practice to always get that parameter using
+ /// [`BytesStart::to_end()`] method.
+ ///
+ /// The correctness of the skipped events is not checked if you disabled
+ /// the [`check_end_names`] option.
+ ///
+ /// # Namespaces
+ ///
+ /// While the [`Reader`] does not support namespace resolution, namespaces
+ /// do not change the algorithm for comparing names. Although the names
+ /// `a:name` and `b:name`, where both prefixes `a` and `b` resolve to the
+ /// same namespace, are semantically equivalent, `</b:name>` cannot close
+ /// `<a:name>`, because according to [the specification]
+ ///
+ /// > The end of every element that begins with a **start-tag** MUST be marked
+ /// > by an **end-tag** containing a name that echoes the element's type as
+ /// > given in the **start-tag**
+ ///
+ /// # Examples
+ ///
+ /// This example shows, how you can skip XML content after you read the
+ /// start event.
+ ///
+ /// ```
+ /// # use pretty_assertions::assert_eq;
+ /// use quick_xml::events::{BytesStart, Event};
+ /// use quick_xml::Reader;
+ ///
+ /// let mut reader = Reader::from_reader(r#"
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ /// "#.as_bytes());
+ /// reader.trim_text(true);
+ /// let mut buf = Vec::new();
+ ///
+ /// let start = BytesStart::borrowed_name(b"outer");
+ /// let end = start.to_end().into_owned();
+ ///
+ /// // First, we read a start event...
+ /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
+ ///
+ /// //...then, we could skip all events to the corresponding end event.
+ /// // This call will correctly handle nested elements.
+ /// // Note, however, that this method does not handle namespaces.
+ /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
+ ///
+ /// // At the end we should get an Eof event, because we ate the whole XML
+ /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
+ /// ```
+ ///
+ /// [`Start`]: Event::Start
+ /// [`End`]: Event::End
+ /// [`read_to_end()`]: Self::read_to_end
+ /// [`check_end_names`]: Self::check_end_names
+ /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
+ pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> {
+ let mut depth = 0;
+ loop {
+ buf.clear();
+ match self.read_event_into(buf) {
+ Err(e) => return Err(e),
+
+ Ok(Event::Start(e)) if e.name() == end => depth += 1,
+ Ok(Event::End(e)) if e.name() == end => {
+ if depth == 0 {
+ return Ok(());
+ }
+ depth -= 1;
+ }
+ Ok(Event::Eof) => {
+ let name = self.decoder().decode(end.as_ref());
+ return Err(Error::UnexpectedEof(format!("{:?}>", name)));
+ }
+ _ => (),
+ }
+ }
+ }
+
+ /// Reads optional text between start and end tags.
+ ///
+ /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a
+ /// `String`. If the next event is an [`End`] event, returns the empty string. In all other
+ /// cases, returns an error.
+ ///
+ /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8
+ /// if none is specified).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use pretty_assertions::assert_eq;
+ /// use quick_xml::Reader;
+ /// use quick_xml::events::Event;
+ ///
+ /// let mut xml = Reader::from_reader(b"
+ /// <b>
+ ///
+ /// " as &[u8]);
+ /// xml.trim_text(true);
+ ///
+ /// let expected = ["", ""];
+ /// for &content in expected.iter() {
+ /// match xml.read_event_into(&mut Vec::new()) {
+ /// Ok(Event::Start(ref e)) => {
+ /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content);
+ /// },
+ /// e => panic!("Expecting Start event, found {:?}", e),
+ /// }
+ /// }
+ /// ```
+ ///
+ /// [`Text`]: Event::Text
+ /// [`End`]: Event::End
+ pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result {
+ let s = match self.read_event_into(buf) {
+ Err(e) => return Err(e),
+
+ Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(),
+ Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
+ Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
+ _ => return Err(Error::TextNotFound),
+ };
+ self.read_to_end_into(end, buf)?;
+ Ok(s)
+ }
+}
+
+/// Builder for reading from a file.
+impl Reader>> {
+ /// Creates an XML reader from a file path.
+ pub fn from_file>(path: P) -> Result {
+ let file = File::open(path).map_err(Error::Io)?;
+ let reader = BufReader::new(file);
+ Ok(Self::from_reader_internal(BufferedReader(reader)))
+ }
+}
+
+/// Builder for reading from a byte slice (`&[u8]`).
+impl<'buf> Reader> {
+ /// Creates an XML reader from a byte slice.
+ pub fn from_bytes(s: &'buf [u8]) -> Self {
+ Self::from_reader_internal(BufferedReader(s))
+ }
+}
+
+/// Builder for reading from any [`BufRead`].
+impl Reader> {
+ /// Creates an XML reader from any type implementing [`BufRead`].
+ pub fn from_reader(reader: R) -> Self {
+ Self::from_reader_internal(BufferedReader(reader))
+ }
+}
+
+/// Builder for reading from any [`Read`].
+impl Reader>> {
+ /// Creates an XML reader from any type implementing [`Read`].
+ pub fn from_unbuffered_reader(reader: R) -> Self {
+ Self::from_reader_internal(BufferedReader(BufReader::new(reader)))
+ }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// A struct for handling reading functions based on reading from a [`BufRead`].
+#[derive(Debug, Clone)]
+pub struct BufferedReader(R);
+
+impl Deref for BufferedReader {
+ type Target = R;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl DerefMut for BufferedReader {
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.0
+ }
+}
+
+impl InnerReader for BufferedReader {
+ type Reader = R;
+
+ fn into_inner(self) -> Self::Reader {
+ self.0
+ }
+}
+
+/// Private reading functions.
+impl BufferedReader {
+ #[inline]
+ fn read_bytes_until<'buf>(
+ &mut self,
+ byte: u8,
+ buf: &'buf mut Vec,
+ position: &mut usize,
+ ) -> Result
- match r.read_namespaced_event(&mut buf, &mut ns_buf) {
+ match r.read_namespaced_event(&mut ns_buf) {
Ok((ns, End(e))) => {
assert_eq!(ns, Bound(Namespace(b"urn:example:i")));
assert_eq!(e.name(), QName(b"e"));
@@ -341,7 +333,7 @@ fn default_ns_shadowing_expanded() {
e => panic!("Expected End event (), got {:?}", e),
}
//
- match r.read_namespaced_event(&mut buf, &mut ns_buf) {
+ match r.read_namespaced_event(&mut ns_buf) {
Ok((ns, End(e))) => {
assert_eq!(ns, Bound(Namespace(b"urn:example:o")));
assert_eq!(e.name(), QName(b"e"));
@@ -363,11 +355,10 @@ fn reserved_name() {
let mut r = Reader::from_str(r#""#);
r.trim_text(true);
- let mut buf = Vec::new();
let mut ns_buf = Vec::new();
//
- match r.read_namespaced_event(&mut buf, &mut ns_buf) {
+ match r.read_namespaced_event(&mut ns_buf) {
Ok((ns, Empty(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))),
e => panic!(
"Expected empty element bound to namespace 'www1', got {:?}",
diff --git a/tests/test.rs b/tests/test.rs
index 5ac9dae8..fa11209d 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -9,8 +9,8 @@ use pretty_assertions::assert_eq;
#[test]
fn test_sample() {
- let src: &[u8] = include_bytes!("documents/sample_rss.xml");
- let mut r = Reader::from_bytes(src);
+ let src: &str = include_str!("documents/sample_rss.xml");
+ let mut r = Reader::from_str(src);
let mut count = 0;
loop {
match r.read_event().unwrap() {
@@ -25,8 +25,8 @@ fn test_sample() {
#[test]
fn test_attributes_empty() {
- let src = b"";
- let mut r = Reader::from_bytes(src);
+ let src = "";
+ let mut r = Reader::from_str(src);
r.trim_text(true).expand_empty_elements(false);
match r.read_event() {
Ok(Empty(e)) => {
@@ -56,7 +56,8 @@ fn test_attribute_equal() {
let src = b"";
let mut r = Reader::from_reader(src as &[u8]);
r.trim_text(true).expand_empty_elements(false);
- match r.read_event() {
+ let mut buf = Vec::new();
+ match r.read_event_into(&mut buf) {
Ok(Empty(e)) => {
let mut attrs = e.attributes();
assert_eq!(
@@ -77,8 +78,9 @@ fn test_comment_starting_with_gt() {
let src = b"-->";
let mut r = Reader::from_reader(src as &[u8]);
r.trim_text(true).expand_empty_elements(false);
+ let mut buf = Vec::new();
loop {
- match r.read_event() {
+ match r.read_event_into(&mut buf) {
Ok(Comment(e)) => {
assert_eq!(e.as_ref(), b">");
break;
@@ -94,9 +96,10 @@ fn test_comment_starting_with_gt() {
fn test_koi8_r_encoding() {
let src = include_bytes!("documents/opennews_all.rss");
let mut r = Reader::from_bytes(src);
+ let mut buf = Vec::new();
r.trim_text(true).expand_empty_elements(false);
loop {
- match r.read_event() {
+ match r.read_event_into(&mut buf) {
Ok(Text(e)) => {
e.decode_and_unescape(&r).unwrap();
}
@@ -129,8 +132,9 @@ fn test_issue94() {
"#;
let mut reader = Reader::from_reader(&data[..]);
reader.trim_text(true);
+ let mut buf = Vec::new();
loop {
- match reader.read_event() {
+ match reader.read_event_into(&mut buf) {
Ok(Eof) | Err(..) => break,
_ => (),
}
diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs
index bb32a602..2cd3f75b 100644
--- a/tests/unit_tests.rs
+++ b/tests/unit_tests.rs
@@ -787,11 +787,11 @@ mod decode_with_bom_removal {
fn removes_utf16be_bom() {
let mut reader = Reader::from_bytes(include_bytes!("./documents/utf16be.xml"));
reader.trim_text(true);
-
+ let mut event_buffer = Vec::new();
let mut txt = Vec::new();
loop {
- match reader.read_event() {
+ match reader.read_event_into(&mut event_buffer) {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(Eof) => break,
_ => (),
@@ -805,11 +805,11 @@ mod decode_with_bom_removal {
fn removes_utf16le_bom() {
let mut reader = Reader::from_bytes(include_bytes!("./documents/utf16le.xml"));
reader.trim_text(true);
-
+ let mut event_buffer = Vec::new();
let mut txt = Vec::new();
loop {
- match reader.read_event() {
+ match reader.read_event_into(&mut event_buffer) {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(Eof) => break,
_ => (),
diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs
index 28401b77..a7d26db8 100644
--- a/tests/xmlrs_reader_tests.rs
+++ b/tests/xmlrs_reader_tests.rs
@@ -362,19 +362,18 @@ fn test(input: &str, output: &str, trim: bool) {
#[track_caller]
fn test_bytes(input: &[u8], output: &[u8], trim: bool) {
- let mut reader = Reader::from_reader(input);
+ let mut reader = Reader::from_bytes(input);
reader
.trim_text(trim)
.check_comments(true)
.expand_empty_elements(false);
let mut spec_lines = SpecIter(output).enumerate();
- let mut buf = Vec::new();
+ let mut event_buffer = Vec::new();
let mut ns_buffer = Vec::new();
loop {
- buf.clear();
- let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
+ let event = reader.read_namespaced_event_into(&mut event_buffer, &mut ns_buffer);
let line = xmlrs_display(event, reader.decoder());
if let Some((n, spec)) = spec_lines.next() {
if spec.trim() == "EndDocument" {