Skip to content

Commit 7eec14c

Browse files
committed
Extended configuration
1 parent bb693a2 commit 7eec14c

File tree

10 files changed

+167
-25
lines changed

10 files changed

+167
-25
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "xml-rs"
3-
version = "0.8.8"
3+
version = "0.8.9"
44
authors = ["Vladimir Matveev <[email protected]>"]
55
license = "MIT"
66
description = "An XML library in pure Rust"

src/macros.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
macro_rules! gen_setter {
66
($target:ty, $field:ident : into $t:ty) => {
77
impl $target {
8-
/// Sets the field to the provided value and returns updated config object.
8+
/// See [`ParserConfig`][crate::ParserConfig] fields docs for details
99
#[inline]
1010
pub fn $field<T: Into<$t>>(mut self, value: T) -> $target {
1111
self.$field = value.into();
@@ -15,14 +15,36 @@ macro_rules! gen_setter {
1515
};
1616
($target:ty, $field:ident : val $t:ty) => {
1717
impl $target {
18-
/// Sets the field to the provided value and returns updated config object.
18+
/// See [`ParserConfig`][crate::ParserConfig] fields docs for details
1919
#[inline]
2020
pub fn $field(mut self, value: $t) -> $target {
2121
self.$field = value;
2222
self
2323
}
2424
}
2525
};
26+
($target:ty, $field:ident : delegate $t:ty) => {
27+
impl $target {
28+
/// See [`ParserConfig`][crate::ParserConfig] fields docs for details
29+
#[inline]
30+
pub fn $field(mut self, value: $t) -> $target {
31+
self.c.$field = value;
32+
self
33+
}
34+
}
35+
};
36+
($target:ty, $field:ident : c2 $t:ty) => {
37+
impl $target {
38+
/// See [`ParserConfig2`][crate::reader::ParserConfig2] fields docs for details
39+
#[inline]
40+
#[must_use] pub fn $field(self, value: $t) -> ParserConfig2 {
41+
ParserConfig2 {
42+
c: self,
43+
..Default::default()
44+
}.$field(value)
45+
}
46+
}
47+
};
2648
}
2749

2850
macro_rules! gen_setters {

src/reader/config.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::collections::HashMap;
33
use std::io::Read;
44

55
use crate::reader::EventReader;
6+
use crate::util::Encoding;
67

78
/// Parser configuration structure.
89
///
@@ -181,3 +182,108 @@ gen_setters! { ParserConfig,
181182
replace_unknown_entity_references: val bool,
182183
ignore_root_level_whitespace: val bool
183184
}
185+
186+
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
187+
#[derive(Clone, PartialEq, Eq, Debug)]
188+
#[non_exhaustive]
189+
#[derive(Default)]
190+
pub struct ParserConfig2 {
191+
pub(crate) c: ParserConfig,
192+
193+
/// Use this encoding as the default. Necessary for UTF-16 files without BOM.
194+
pub override_encoding: Option<Encoding>,
195+
/// Allow `<?xml encoding="…">` to contain unsupported encoding names,
196+
/// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
197+
pub ignore_invalid_encoding_declarations: bool,
198+
}
199+
200+
impl ParserConfig2 {
201+
#[inline]
202+
#[must_use] pub fn new() -> Self {
203+
Self::default()
204+
}
205+
206+
/// Read character encoding from `Content-Type` header.
207+
/// Set this when parsing XML documents fetched over HTTP.
208+
///
209+
/// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
210+
#[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
211+
let charset = mime_type.split_once(';')
212+
.and_then(|(_, args)| args.split_once("charset"))
213+
.and_then(|(_, args)| args.split_once('='));
214+
if let Some((_, charset)) = charset {
215+
let name = charset.trim().trim_matches('"');
216+
match name.parse() {
217+
Ok(enc) => {
218+
self.override_encoding = Some(enc);
219+
},
220+
Err(_) => {},
221+
}
222+
}
223+
self
224+
}
225+
226+
/// Creates an XML reader with this configuration.
227+
///
228+
/// This is a convenience method for configuring and creating a reader at the same time:
229+
///
230+
/// ```rust
231+
/// use xml::reader::ParserConfig2;
232+
///
233+
/// let mut source: &[u8] = b"...";
234+
///
235+
/// let reader = ParserConfig2::new()
236+
/// .trim_whitespace(true)
237+
/// .ignore_comments(true)
238+
/// .coalesce_characters(false)
239+
/// .create_reader(&mut source);
240+
/// ```
241+
///
242+
/// This method is exactly equivalent to calling `EventReader::new_with_config()` with
243+
/// this configuration object.
244+
#[inline]
245+
pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
246+
EventReader::new_with_config(source, self)
247+
}
248+
}
249+
250+
impl From<ParserConfig> for ParserConfig2 {
251+
#[inline]
252+
fn from(c: ParserConfig) -> Self {
253+
Self {
254+
c,
255+
..Default::default()
256+
}
257+
}
258+
}
259+
260+
gen_setters! { ParserConfig2,
261+
override_encoding: val Option<Encoding>,
262+
ignore_invalid_encoding_declarations: val bool
263+
}
264+
265+
gen_setters! { ParserConfig,
266+
override_encoding: c2 Option<Encoding>,
267+
ignore_invalid_encoding_declarations: c2 bool,
268+
content_type: c2 &str
269+
}
270+
271+
gen_setters! { ParserConfig2,
272+
trim_whitespace: delegate bool,
273+
whitespace_to_characters: delegate bool,
274+
cdata_to_characters: delegate bool,
275+
ignore_comments: delegate bool,
276+
coalesce_characters: delegate bool,
277+
ignore_end_of_stream: delegate bool,
278+
replace_unknown_entity_references: delegate bool,
279+
ignore_root_level_whitespace: delegate bool
280+
}
281+
282+
#[test]
283+
fn mime_parse() {
284+
let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii");
285+
assert_eq!(c.override_encoding, Some(Encoding::Ascii));
286+
287+
let c = ParserConfig2::new().content_type("text/xml;charset = \"UTF-16\"");
288+
assert_eq!(c.override_encoding, Some(Encoding::Utf16));
289+
}

src/reader/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@ use std::result;
1010
use crate::common::{Position, TextPosition};
1111

1212
pub use self::config::ParserConfig;
13-
pub use self::events::XmlEvent;
13+
pub use self::config::ParserConfig2;
1414

15+
pub use self::events::XmlEvent;
1516
use self::parser::PullParser;
1617

1718
mod config;
@@ -35,12 +36,12 @@ impl<R: Read> EventReader<R> {
3536
/// Creates a new reader, consuming the given stream.
3637
#[inline]
3738
pub fn new(source: R) -> EventReader<R> {
38-
EventReader::new_with_config(source, ParserConfig::new())
39+
EventReader::new_with_config(source, ParserConfig2::new())
3940
}
4041

4142
/// Creates a new reader with the provided configuration, consuming the given stream.
4243
#[inline]
43-
pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> {
44+
pub fn new_with_config(source: R, config: impl Into<ParserConfig2>) -> EventReader<R> {
4445
EventReader { source, parser: PullParser::new(config) }
4546
}
4647

src/reader/parser.rs

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use crate::common::{self, is_name_char, is_name_start_char, Position, TextPositi
99
use crate::name::OwnedName;
1010
use crate::namespace::NamespaceStack;
1111

12-
use crate::reader::config::ParserConfig;
12+
use crate::reader::config::ParserConfig2;
1313
use crate::reader::events::XmlEvent;
1414
use crate::reader::lexer::{Lexer, Token};
1515

@@ -65,7 +65,7 @@ pub type Result = super::Result<XmlEvent>;
6565

6666
/// Pull-based XML parser.
6767
pub(crate) struct PullParser {
68-
config: ParserConfig,
68+
config: ParserConfig2,
6969
lexer: Lexer,
7070
st: State,
7171
state_after_reference: State,
@@ -91,10 +91,21 @@ pub(crate) struct PullParser {
9191

9292
impl PullParser {
9393
/// Returns a new parser using the given config.
94-
pub fn new(config: ParserConfig) -> PullParser {
94+
#[inline]
95+
pub fn new(config: impl Into<ParserConfig2>) -> PullParser {
96+
let config = config.into();
97+
Self::new_with_config2(config)
98+
}
99+
100+
#[inline]
101+
fn new_with_config2(config: ParserConfig2) -> PullParser {
102+
let mut lexer = Lexer::new();
103+
if let Some(enc) = config.override_encoding {
104+
lexer.set_encoding(enc);
105+
}
95106
PullParser {
96107
config,
97-
lexer: Lexer::new(),
108+
lexer,
98109
st: State::OutsideTag,
99110
state_after_reference: State::OutsideTag,
100111
buf: String::new(),
@@ -126,7 +137,7 @@ impl PullParser {
126137
}
127138

128139
/// Checks if this parser ignores the end of stream errors.
129-
pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
140+
pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream }
130141
}
131142

132143
impl Position for PullParser {
@@ -316,7 +327,7 @@ impl PullParser {
316327
} else { // self.st != State::OutsideTag
317328
self_error!(self; "Unexpected end of stream") // TODO: add expected hint?
318329
}
319-
} else if self.config.ignore_end_of_stream {
330+
} else if self.config.c.ignore_end_of_stream {
320331
self.final_result = None;
321332
self.lexer.reset_eof_handled();
322333
return self_error!(self; "Unexpected end of stream: still inside the root element");
@@ -635,7 +646,7 @@ mod tests {
635646
expect_event!(r, p, Err(_)); // ---> is forbidden in comments
636647

637648
let (mut r, mut p) = test_data!(r#"<x><!--<text&x;> <!--></x>"#);
638-
p.config.ignore_comments = false;
649+
p.config.c.ignore_comments = false;
639650
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
640651
expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
641652
expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");

src/reader/parser/inside_cdata.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ impl PullParser {
77
pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
88
match t {
99
Token::CDataEnd => {
10-
let event = if self.config.cdata_to_characters {
10+
let event = if self.config.c.cdata_to_characters {
1111
None
1212
} else {
1313
let data = self.take_buf();

src/reader/parser/inside_comment.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use super::{PullParser, Result, State};
66
impl PullParser {
77
pub fn inside_comment(&mut self, t: Token) -> Option<Result> {
88
match t {
9-
Token::CommentEnd if self.config.ignore_comments => {
9+
Token::CommentEnd if self.config.c.ignore_comments => {
1010
self.into_state_continue(State::OutsideTag)
1111
}
1212

@@ -15,7 +15,7 @@ impl PullParser {
1515
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data)))
1616
}
1717

18-
_ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment
18+
_ if self.config.c.ignore_comments => None, // Do not modify buffer if ignoring the comment
1919

2020
_ => {
2121
t.push_to_string(&mut self.buf);

src/reader/parser/inside_declaration.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ impl PullParser {
2020
if let Some(new_encoding) = encoding.as_deref() {
2121
let new_encoding = match new_encoding.parse() {
2222
Ok(e) => e,
23+
Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1,
2324
Err(_) => return Some(self_error!(self; "Unknown encoding: {}", new_encoding)),
2425
};
2526
let current_encoding = self.lexer.encoding();
2627
if current_encoding != new_encoding {
2728
let set = match (current_encoding, new_encoding) {
2829
(Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new,
2930
(Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding,
31+
_ if self.config.ignore_invalid_encoding_declarations => current_encoding,
3032
_ => return Some(self_error!(self; "Conflicting encoding declared {}, used {}", new_encoding, current_encoding)),
3133
};
3234
self.lexer.set_encoding(set);

src/reader/parser/inside_reference.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ impl PullParser {
3535
};
3636
if let Some(c) = c {
3737
self.buf.push(c);
38-
} else if let Some(v) = self.config.extra_entities.get(&name) {
38+
} else if let Some(v) = self.config.c.extra_entities.get(&name) {
3939
self.buf.push_str(v);
4040
} else if let Some(v) = self.entities.get(&name) {
4141
if self.state_after_reference == State::OutsideTag {
@@ -71,7 +71,7 @@ impl PullParser {
7171
match char::from_u32(val) {
7272
Some('\0') => Err("NUL character entity is not allowed".into()),
7373
Some(c) => Ok(c),
74-
None if self.config.replace_unknown_entity_references => {
74+
None if self.config.c.replace_unknown_entity_references => {
7575
Ok('\u{fffd}')
7676
},
7777
None => Err(format!("Invalid character U+{val:X}")),

src/reader/parser/outside_tag.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ impl PullParser {
1818

1919
Token::Character(c) if is_whitespace_char(c) => {
2020
// skip whitespace outside of the root element
21-
if self.depth() == 0 && self.config.ignore_root_level_whitespace { None }
22-
else if self.config.trim_whitespace && !self.buf_has_data() { None }
21+
if self.depth() == 0 && self.config.c.ignore_root_level_whitespace { None }
22+
else if self.config.c.trim_whitespace && !self.buf_has_data() { None }
2323
else {
2424
if !self.buf_has_data() {
2525
self.push_pos();
@@ -46,12 +46,12 @@ impl PullParser {
4646
None
4747
}
4848

49-
Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => {
49+
Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => {
5050
// We need to switch the lexer into a comment mode inside comments
5151
self.into_state_continue(State::InsideComment)
5252
}
5353

54-
Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => {
54+
Token::CDataStart if self.config.c.coalesce_characters && self.config.c.cdata_to_characters => {
5555
if !self.buf_has_data() {
5656
self.push_pos();
5757
}
@@ -64,11 +64,11 @@ impl PullParser {
6464
// or a whitespace
6565
let mut next_event = if self.buf_has_data() {
6666
let buf = self.take_buf();
67-
if self.inside_whitespace && self.config.trim_whitespace {
67+
if self.inside_whitespace && self.config.c.trim_whitespace {
6868
None
69-
} else if self.inside_whitespace && !self.config.whitespace_to_characters {
69+
} else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
7070
Some(Ok(XmlEvent::Whitespace(buf)))
71-
} else if self.config.trim_whitespace {
71+
} else if self.config.c.trim_whitespace {
7272
Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
7373
} else {
7474
Some(Ok(XmlEvent::Characters(buf)))

0 commit comments

Comments
 (0)