Skip to content

Commit 0960333

Browse files
authored
Merge pull request #781 from Mingun/only-upper-cdata
Start CDATA section only after uppercase `<![CDATA[`
2 parents 22b3e45 + b71cf7c commit 0960333

File tree

6 files changed

+48
-28
lines changed

6 files changed

+48
-28
lines changed

Changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,13 @@
1717

1818
### Bug Fixes
1919

20+
- [#781]: Fix conditions to start CDATA section. Only uppercase `<![CDATA[` can start it.
21+
Previously the marker was matched case-insensitively, so e.g. `<![cdata[` was also accepted.
22+
2023
### Misc Changes
2124

25+
[#781]: https://github.com/tafia/quick-xml/pull/781
26+
2227

2328
## 0.35.0 -- 2024-06-29
2429

src/reader/mod.rs

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -974,32 +974,35 @@ impl BangType {
974974
/// - `chunk`: data read on current iteration and not yet consumed from reader
975975
#[inline(always)]
976976
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
977-
for i in memchr::memchr_iter(b'>', chunk) {
978-
match self {
979-
// Need to read at least 6 symbols (`!---->`) for properly finished comment
980-
// <!----> - XML comment
981-
// 012345 - i
982-
Self::Comment if buf.len() + i > 4 => {
983-
if chunk[..i].ends_with(b"--") {
984-
// We cannot strip last `--` from the buffer because we need it in case of
985-
// check_comments enabled option. XML standard requires that comment
986-
// will not end with `--->` sequence because this is a special case of
987-
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
988-
return Some((&chunk[..i], i + 1)); // +1 for `>`
989-
}
990-
// End sequence `-|->` was splitted at |
991-
// buf --/ \-- chunk
992-
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
993-
return Some((&chunk[..i], i + 1)); // +1 for `>`
994-
}
995-
// End sequence `--|>` was splitted at |
996-
// buf --/ \-- chunk
997-
if i == 0 && buf.ends_with(b"--") {
998-
return Some((&[], i + 1)); // +1 for `>`
977+
match self {
978+
Self::Comment => {
979+
for i in memchr::memchr_iter(b'>', chunk) {
980+
// Need to read at least 6 symbols (`!---->`) for properly finished comment
981+
// <!----> - XML comment
982+
// 012345 - i
983+
if buf.len() + i > 4 {
984+
if chunk[..i].ends_with(b"--") {
985+
// We cannot strip last `--` from the buffer because we need it in case of
986+
// check_comments enabled option. XML standard requires that comment
987+
// will not end with `--->` sequence because this is a special case of
988+
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
989+
return Some((&chunk[..i], i + 1)); // +1 for `>`
990+
}
991+
// End sequence `-|->` was split at |
992+
// buf --/ \-- chunk
993+
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
994+
return Some((&chunk[..i], i + 1)); // +1 for `>`
995+
}
996+
// End sequence `--|>` was split at |
997+
// buf --/ \-- chunk
998+
if i == 0 && buf.ends_with(b"--") {
999+
return Some((&[], i + 1)); // +1 for `>`
1000+
}
9991001
}
10001002
}
1001-
Self::Comment => {}
1002-
Self::CData => {
1003+
}
1004+
Self::CData => {
1005+
for i in memchr::memchr_iter(b'>', chunk) {
10031006
if chunk[..i].ends_with(b"]]") {
10041007
return Some((&chunk[..i], i + 1)); // +1 for `>`
10051008
}
@@ -1014,7 +1017,9 @@ impl BangType {
10141017
return Some((&[], i + 1)); // +1 for `>`
10151018
}
10161019
}
1017-
Self::DocType => {
1020+
}
1021+
Self::DocType => {
1022+
for i in memchr::memchr_iter(b'>', chunk) {
10181023
let content = &chunk[..i];
10191024
let balance = memchr::memchr2_iter(b'<', b'>', content)
10201025
.map(|p| if content[p] == b'<' { 1i32 } else { -1 })

src/reader/state.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,14 +128,22 @@ impl ReaderState {
128128
self.decoder(),
129129
)))
130130
}
131-
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
131+
// XML requires uppercase only:
132+
// https://www.w3.org/TR/xml11/#sec-cdata-sect
133+
// Even HTML5 requires uppercase only:
134+
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
135+
BangType::CData if buf.starts_with(b"![CDATA[") => {
132136
debug_assert!(buf.ends_with(b"]]"));
133137
Ok(Event::CData(BytesCData::wrap(
134138
// Cut off `![CDATA[` and `]]` from start and end
135139
&buf[8..len - 2],
136140
self.decoder(),
137141
)))
138142
}
143+
// XML requires uppercase only, but we will check that at the validation stage:
144+
// https://www.w3.org/TR/xml11/#sec-prolog-dtd
145+
// HTML5 allows mixed case for doctype declarations:
146+
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
139147
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
140148
match buf[8..].iter().position(|&b| !is_whitespace(b)) {
141149
Some(start) => Ok(Event::DocType(BytesText::wrap(

tests/fuzzing.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ fn fuzz_101() {
5151

5252
#[test]
5353
fn fuzz_empty_doctype() {
54-
let data: &[u8] = b"<!doctype \n >";
54+
let data: &[u8] = b"<!DOCTYPE \n >";
5555
let mut reader = Reader::from_reader(data);
5656
let mut buf = Vec::new();
5757
assert!(matches!(

tests/reader-config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ mod trim_markup_names_in_closing_tags {
471471
}
472472

473473
const XML: &str = " \t\r\n\
474-
<!doctype root \t\r\n> \t\r\n\
474+
<!DOCTYPE root \t\r\n> \t\r\n\
475475
<root \t\r\n> \t\r\n\
476476
<empty \t\r\n/> \t\r\n\
477477
text \t\r\n\

tests/reader-errors.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,8 @@ mod syntax {
343343
err!(unclosed24("<![CDATA[]h") => SyntaxError::UnclosedCData);
344344
err!(unclosed25("<![CDATA[]>") => SyntaxError::UnclosedCData);
345345

346+
err!(lowercase("<![cdata[]]>") => SyntaxError::UnclosedCData);
347+
346348
ok!(normal1("<![CDATA[]]>") => 12: Event::CData(BytesCData::new("")));
347349
ok!(normal2("<![CDATA[]]>rest") => 12: Event::CData(BytesCData::new("")));
348350
}

0 commit comments

Comments
 (0)