
Commit 3c84f18

Committed Apr 11, 2025
Implement a decoding tokenizer
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
1 parent 2e33830 commit 3c84f18

File tree

18 files changed, +478 -36 lines

 

‎html5ever/Cargo.toml

Lines changed: 3 additions & 0 deletions

@@ -13,13 +13,16 @@ readme = "../README.md"
 rust-version.workspace = true
 
 [features]
+default = ["encoding"]
 trace_tokenizer = []
+encoding = ["dep:encoding_rs", "markup5ever/encoding"]
 
 [dependencies]
 log = "0.4"
 mac = "0.1"
 markup5ever = { version = "0.16", path = "../markup5ever" }
 match_token = { workspace = true }
+encoding_rs = { version = "0.8", optional = true }
 
 [dev-dependencies]
 criterion = "0.5"

‎html5ever/examples/noop-tokenize.rs

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@ use std::cell::RefCell;
 use std::io;
 
 use html5ever::tendril::*;
-use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
+use markup5ever::buffer_queue::BufferQueue;
 
 /// In our case, our sink only contains a tokens vector
 struct Sink(RefCell<Vec<Token>>);

‎html5ever/examples/tokenize.rs

Lines changed: 1 addition & 1 deletion

@@ -13,11 +13,11 @@ use std::cell::Cell;
 use std::io;
 
 use html5ever::tendril::*;
-use html5ever::tokenizer::BufferQueue;
 use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
 use html5ever::tokenizer::{
     ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
 };
+use markup5ever::buffer_queue::BufferQueue;
 
 #[derive(Clone)]
 struct TokenPrinter {

‎html5ever/src/tokenizer/char_ref/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -8,12 +8,12 @@
 // except according to those terms.
 
 use super::{TokenSink, Tokenizer};
-use crate::buffer_queue::BufferQueue;
 use crate::data;
 use crate::tendril::StrTendril;
 
 use log::debug;
 use mac::format_if;
+use markup5ever::buffer_queue::BufferQueue;
 use std::borrow::Cow::Borrowed;
 use std::char::from_u32;
 
‎html5ever/src/tokenizer/interface.rs

Lines changed: 2 additions & 0 deletions

@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
     Script(Handle),
     Plaintext,
     RawData(states::RawKind),
+    #[cfg(feature = "encoding")]
+    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
 }
 
 /// Types which can receive tokens from the tokenizer.

‎html5ever/src/tokenizer/mod.rs

Lines changed: 35 additions & 8 deletions

@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
 use self::char_ref::{CharRef, CharRefTokenizer};
 
 use crate::util::str::lower_ascii_letter;
-
 use log::{debug, trace};
 use mac::format_if;
-use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult};
+use markup5ever::{
+    buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
+    TokenizerResult,
+};
 use std::borrow::Cow::{self, Borrowed};
 use std::cell::{Cell, RefCell, RefMut};
 use std::collections::BTreeMap;
-use std::mem;
+use std::{iter, mem};
 
-pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
 use crate::tendril::StrTendril;
 use crate::{Attribute, LocalName, QualName, SmallCharSet};
 
@@ -43,6 +45,8 @@ pub enum ProcessResult<Handle> {
     Continue,
     Suspend,
     Script(Handle),
+    #[cfg(feature = "encoding")]
+    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
 }
 
 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    #[cfg(feature = "encoding")]
+                    ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+                        return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
+                    },
                 }
             }
         } else {
@@ -365,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    #[cfg(feature = "encoding")]
+                    ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+                        return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
+                    },
                 }
             }
         }
@@ -445,6 +457,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 self.state.set(states::RawData(kind));
                 ProcessResult::Continue
            },
+            #[cfg(feature = "encoding")]
+            TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
+                ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
+            },
         }
     }
 
@@ -1448,6 +1464,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 ProcessResult::Continue => (),
                 ProcessResult::Suspend => break,
                 ProcessResult::Script(_) => unreachable!(),
+                #[cfg(feature = "encoding")]
+                ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
             }
         }
 
@@ -1575,13 +1593,24 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 }
 
+impl<Sink> InputSink for Tokenizer<Sink>
+where
+    Sink: TokenSink,
+{
+    type Handle = Sink::Handle;
+
+    fn feed<'a>(&'a self, input: &'a BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
+        iter::from_fn(|| self.feed(input).into())
+    }
+}
+
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod test {
     use super::option_push; // private items
-    use crate::tendril::{SliceExt, StrTendril};
-
     use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+    use crate::tendril::{SliceExt, StrTendril};
+    use crate::LocalName;
 
     use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
     use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1590,8 +1619,6 @@ mod test {
     use markup5ever::buffer_queue::BufferQueue;
     use std::cell::RefCell;
 
-    use crate::LocalName;
-
     // LinesMatch implements the TokenSink trait. It is used for testing to see
     // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
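
The new `InputSink` implementation is what lets a `Tokenizer` be driven by the `DecodingParser` added to markup5ever in this commit. As a rough, hedged sketch (not part of the commit), the trait can also be called directly; the fully qualified `InputSink::feed` call avoids the inherent `Tokenizer::feed` method, and the generic `Sink` stands for any `TokenSink` implementation:

    use html5ever::tendril::StrTendril;
    use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts};
    use markup5ever::buffer_queue::BufferQueue;
    use markup5ever::{InputSink, InputSinkResult};

    fn drive<Sink: TokenSink>(sink: Sink, chunk: &str) {
        let tokenizer = Tokenizer::new(sink, TokenizerOpts::default());
        let input: BufferQueue = BufferQueue::default();
        input.push_back(StrTendril::from(chunk));

        // The iterator keeps yielding items until the tokenizer reports TokenizerResult::Done.
        for event in InputSink::feed(&tokenizer, &input) {
            match event {
                // A parser-blocking script was reached; run it before iterating further.
                InputSinkResult::HandleScript(_node) => {},
                // A <meta charset> asked for a different encoding; re-parse from the start.
                InputSinkResult::MaybeStartOverWithEncoding(_encoding) => {},
            }
        }
    }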

‎html5ever/src/tree_builder/mod.rs

Lines changed: 4 additions & 0 deletions

@@ -394,6 +394,10 @@ where
                     assert!(more_tokens.is_empty());
                     return tokenizer::TokenSinkResult::RawData(k);
                 },
+                #[cfg(feature = "encoding")]
+                ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+                    return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
+                },
             }
         }
     }

‎html5ever/src/tree_builder/rules.rs

Lines changed: 25 additions & 9 deletions

@@ -10,21 +10,24 @@
 // The tree builder rules, as a single, enormous nested match expression.
 
 use crate::interface::Quirks;
-use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
+use crate::tokenizer::states::{Rawtext, Rcdata};
 use crate::tokenizer::TagKind::{EndTag, StartTag};
 use crate::tree_builder::tag_sets::*;
 use crate::tree_builder::types::*;
-use crate::tree_builder::{
-    create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
-    TreeSink,
-};
-use crate::QualName;
-use markup5ever::{expanded_name, local_name, namespace_url, ns};
+use crate::tree_builder::RawKind::ScriptData;
+use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
+
+use markup5ever::interface::create_element;
+use markup5ever::interface::NodeOrText::AppendNode;
+use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
 use std::borrow::Cow::Borrowed;
 
 use crate::tendril::SliceExt;
 use match_token::match_token;
 
+#[cfg(feature = "encoding")]
+use encoding_rs::Encoding;
+
 fn any_not_whitespace(x: &StrTendril) -> bool {
     // FIXME: this might be much faster as a byte scan
     x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where
 
                 <html> => self.step(InsertionMode::InBody, token),
 
-                tag @ <base> <basefont> <bgsound> <link> <meta> => {
-                    // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
+                tag @ <meta> => {
+                    // FIXME: handle <meta http-equiv="Content-Type">
+                    #[cfg(feature = "encoding")]
+                    if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
+                        if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
+                            self.insert_and_pop_element_for(tag);
+                            return ProcessResult::MaybeChangeEncodingAndStartOver(encoding);
+                        }
+                    }
+
+                    self.insert_and_pop_element_for(tag);
+                    ProcessResult::DoneAckSelfClosing
+                },
+
+                tag @ <base> <basefont> <bgsound> <link> => {
                     self.insert_and_pop_element_for(tag);
                     ProcessResult::DoneAckSelfClosing
                 }
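
The `<meta charset=...>` arm defers to encoding_rs for label resolution: `Encoding::for_label` maps any label from the Encoding Standard to its canonical encoding, and an unknown label falls through to the ordinary `<meta>` handling. A small illustration (not from this commit):

    use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};

    fn main() {
        // Labels are matched ASCII case-insensitively.
        assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
        // "latin1" is one of the labels of windows-1252 in the Encoding Standard.
        assert_eq!(Encoding::for_label(b"latin1"), Some(WINDOWS_1252));
        // Unknown labels yield None, so no encoding change is requested.
        assert_eq!(Encoding::for_label(b"definitely-not-an-encoding"), None);
    }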

‎html5ever/src/tree_builder/types.rs

Lines changed: 2 additions & 0 deletions

@@ -70,6 +70,8 @@ pub(crate) enum ProcessResult<Handle> {
     Script(Handle),
     ToPlaintext,
     ToRawData(RawKind),
+    #[cfg(feature = "encoding")]
+    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
 }
 
 pub(crate) enum FormatEntry<Handle> {

‎markup5ever/Cargo.toml

Lines changed: 4 additions & 0 deletions

@@ -14,11 +14,15 @@ rust-version.workspace = true
 [lib]
 path = "lib.rs"
 
+[features]
+encoding = ["dep:encoding_rs"]
+
 [dependencies]
 string_cache = "0.8"
 phf = "0.11"
 tendril = "0.4"
 log = "0.4"
+encoding_rs = { version = "0.8", optional = true }
 
 [build-dependencies]
 string_cache_codegen = "0.5.4"

‎markup5ever/encoding.rs

Lines changed: 133 additions & 0 deletions

@@ -0,0 +1,133 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
+use tendril::{fmt::Bytes, Tendril};
+
+use crate::buffer_queue::BufferQueue;
+
+/// <https://html.spec.whatwg.org/#concept-encoding-confidence>
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Confidence {
+    Tentative,
+    Certain,
+    Irrelevant,
+}
+
+pub struct Decoder {
+    inner: encoding_rs::Decoder,
+    confidence: Confidence,
+}
+
+impl Decoder {
+    pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self {
+        Self {
+            inner: encoding.new_decoder(),
+            confidence,
+        }
+    }
+
+    pub fn confidence(&self) -> Confidence {
+        self.confidence
+    }
+
+    /// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current
+    /// encoding should be changed to `encoding`.
+    pub fn change_the_encoding_to(
+        &mut self,
+        mut new_encoding: &'static Encoding,
+    ) -> Option<&'static Encoding> {
+        let current_encoding = self.inner.encoding();
+        // Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE,
+        // then set the confidence to certain and return. The new encoding is ignored; if it was anything
+        // but the same encoding, then it would be clearly incorrect.
+        if current_encoding == UTF_16BE || current_encoding == UTF_16LE {
+            self.confidence = Confidence::Certain;
+            return None;
+        }
+
+        // Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8.
+        if new_encoding == UTF_16BE || new_encoding == UTF_16LE {
+            new_encoding = UTF_8;
+        }
+
+        // Step 3. If the new encoding is x-user-defined, then change it to windows-1252.
+        if new_encoding == X_USER_DEFINED {
+            new_encoding = WINDOWS_1252;
+        }
+
+        // Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret
+        // the input stream, then set the confidence to certain and return. This happens when the encoding information found
+        // in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass
+        // through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section
+        // failed to find the right encoding.
+        if current_encoding == new_encoding {
+            self.confidence = Confidence::Certain;
+            return None;
+        }
+
+        // Step 5. If all the bytes up to the last byte converted by the current decoder have the same
+        // Unicode interpretations in both the current encoding and the new encoding, and if the user agent
+        // supports changing the converter on the fly, then the user agent may change to the new converter
+        // for the encoding on the fly. Set the document's character encoding and the encoding used to convert
+        // the input stream to the new encoding, set the confidence to certain, and return.
+        // NOTE: We don't support changing the converter on the fly
+
+        // Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and
+        // other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just
+        // set the encoding to the new encoding and the confidence to certain. Whenever possible, this should
+        // be done without actually contacting the network layer (the bytes should be re-parsed from memory),
+        // even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting
+        // the network layer would involve repeating a request that uses a method other than `GET`, then instead
+        // set the confidence to certain and ignore the new encoding. The resource will be misinterpreted.
+        // User agents may notify the user of the situation, to aid in application development.
+        Some(new_encoding)
+    }
+
+    /// Decode the given chunk with the current encoding. The result will be pushed to the end
+    /// of the input stream.
+    pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) {
+        let mut remaining = chunk;
+        loop {
+            let mut out: Tendril<Bytes> = Tendril::new();
+            let max_len = self
+                .inner
+                .max_utf8_buffer_length_without_replacement(remaining.len())
+                .unwrap_or(8192)
+                .min(8192);
+
+            // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize
+            // part of the buffer. We are only going to access the initialized segment.
+            unsafe {
+                out.push_uninitialized(max_len as u32);
+            }
+
+            let (result, bytes_read, bytes_written) = self
+                .inner
+                .decode_to_utf8_without_replacement(remaining, &mut out, last);
+
+            if bytes_written > 0 {
+                let bytes_chunk = out.subtendril(0, bytes_written as u32);
+
+                // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8
+                let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() };
+                output.push_back(utf8_chunk);
+            }
+
+            if matches!(result, DecoderResult::Malformed(_, _)) {
+                output.push_back("\u{FFFD}".into());
+            }
+
+            remaining = &remaining[bytes_read..];
+            if remaining.is_empty() {
+                return;
+            }
+        }
+    }
+}
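
For orientation, a minimal usage sketch of the new `Decoder` (not part of the commit; it assumes markup5ever is built with the `encoding` feature and that encoding_rs is available to the caller): raw bytes go in, decoded UTF-8 tendrils come out on a `BufferQueue`.

    use markup5ever::buffer_queue::BufferQueue;
    use markup5ever::encoding::{Confidence, Decoder};

    fn main() {
        // "héllo" encoded as windows-1252 (0xE9 is 'é').
        let bytes = [0x68, 0xE9, 0x6C, 0x6C, 0x6F];

        let output = BufferQueue::default();
        let mut decoder = Decoder::new(encoding_rs::WINDOWS_1252, Confidence::Tentative);
        decoder.decode(&bytes, true, &output);

        let mut decoded = String::new();
        while let Some(chunk) = output.pop_front() {
            decoded.push_str(&chunk);
        }
        assert_eq!(decoded, "héllo");
    }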

‎markup5ever/input_stream.rs

Lines changed: 162 additions & 0 deletions

@@ -0,0 +1,162 @@
+use std::cell::RefCell;
+
+use encoding_rs::Encoding;
+use tendril::StrTendril;
+
+use crate::buffer_queue::BufferQueue;
+use crate::encoding::{Confidence, Decoder};
+
+/// <https://html.spec.whatwg.org/#input-stream>
+pub struct InputStream {
+    input: BufferQueue,
+    decoder: RefCell<Decoder>,
+}
+
+impl InputStream {
+    fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            input: Default::default(),
+            decoder: RefCell::new(Decoder::new(encoding, Confidence::Tentative)),
+        }
+    }
+
+    pub fn append(&self, data: StrTendril) {
+        self.input.push_back(data);
+    }
+
+    pub fn append_bytes(&self, data: &[u8]) {
+        self.decoder
+            .borrow_mut()
+            .decode(data, false, &self.input);
+    }
+
+    pub fn code_points(&self) -> &BufferQueue {
+        &self.input
+    }
+
+    /// Attempt to switch to another encoding.
+    ///
+    /// If the encoding was switched then the new encoding is returned. Note that the new encoding may be
+    /// different from the one that this function was called with.
+    pub fn maybe_switch_encoding(&self, encoding: &'static Encoding) -> Option<&'static Encoding> {
+        if self.decoder.borrow().confidence() == Confidence::Tentative {
+            if let Some(new_encoding) = self.decoder.borrow_mut().change_the_encoding_to(encoding) {
+                return Some(new_encoding);
+            }
+        }
+        None
+    }
+
+    /// Move any input that is left in the decoding stage to the end of the input stream
+    pub fn finish_decoding_input(&self) {
+        self.decoder
+            .borrow_mut()
+            .decode(&[], true, &self.input);
+    }
+
+    /// Remove all input from the stream
+    pub fn clear(&self) {
+        self.input.clear();
+    }
+}
+
+pub struct DecodingParser<Sink> {
+    /// Data received from `document.write`
+    script_input: BufferQueue,
+    input_stream: InputStream,
+    input_sink: Sink,
+}
+
+impl<Sink> DecodingParser<Sink>
+where
+    Sink: InputSink,
+{
+    pub fn new(sink: Sink, document_encoding: &'static Encoding) -> Self {
+        Self {
+            script_input: Default::default(),
+            input_stream: InputStream::new(document_encoding),
+            input_sink: sink,
+        }
+    }
+
+    pub fn sink(&self) -> &Sink {
+        &self.input_sink
+    }
+
+    pub fn input_stream(&self) -> &InputStream {
+        &self.input_stream
+    }
+
+    /// Return an iterator that can be used to drive the parser
+    pub fn parse(&self) -> impl Iterator<Item = ParserAction<Sink::Handle>> + '_ {
+        self.input_sink
+            .feed(self.input_stream.code_points())
+            .filter_map(|sink_result| match sink_result {
+                InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
+                InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
+                    .input_stream
+                    .maybe_switch_encoding(encoding)
+                    .map(ParserAction::StartOverWithEncoding),
+            })
+    }
+
+    /// Returns an iterator that can be used to drive the parser
+    pub fn document_write<'a>(&'a self, input: &'a BufferQueue) -> impl Iterator<Item = ParserAction<Sink::Handle>> + use<'a, Sink> {
+        debug_assert!(self.script_input.is_empty(), "Should not parse input from document.write while the parser is suspended");
+
+        self.input_sink
+            .feed(&input)
+            .filter_map(move |sink_result| match sink_result {
+                InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
+                InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
+                    .input_stream
+                    .maybe_switch_encoding(encoding)
+                    .map(ParserAction::StartOverWithEncoding),
+            })
+    }
+
+    /// End a `document.write` transaction, appending any input that was not yet parsed to the
+    /// current insertion point, behind any input that was received reentrantly during this transaction.
+    pub fn push_script_input(&self, input: &BufferQueue) {
+        while let Some(chunk) = input.pop_front() {
+            self.script_input.push_back(chunk);
+        }
+    }
+
+    /// Notifies the parser that it has been unblocked and parsing can resume
+    pub fn notify_parser_blocking_script_loaded(&self) {
+        // Move pending script input to the front of the input stream
+        self.script_input.swap_with(&self.input_stream.input);
+        while let Some(chunk) = self.script_input.pop_front() {
+            self.input_stream.input.push_back(chunk);
+        }
+    }
+}
+
+pub enum ParserAction<Handle> {
+    HandleScript(Handle),
+    StartOverWithEncoding(&'static Encoding),
+}
+
+pub enum InputSinkResult<Handle> {
+    HandleScript(Handle),
+    MaybeStartOverWithEncoding(&'static Encoding),
+}
+
+pub trait InputSink {
+    type Handle;
+
+    fn feed<'a>(&'a self, input: &'a BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a;
+}
+
+impl<T> ParserAction<T> {
+    pub fn map_script<U, F>(self, f: F) -> ParserAction<U>
+    where
+        F: FnOnce(T) -> U,
+    {
+        match self {
+            Self::HandleScript(script) => ParserAction::HandleScript(f(script)),
+            Self::StartOverWithEncoding(encoding) => ParserAction::StartOverWithEncoding(encoding),
+        }
+    }
+}
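
A rough sketch of how these pieces fit together (not from the commit; `parse_document_bytes` is a hypothetical helper): bytes are decoded onto the `InputStream`, the `parse` iterator is drained, and a `StartOverWithEncoding` action tells the caller to discard the document and re-parse the same bytes with the returned encoding. The sink can be any `InputSink`, for example an html5ever `Tokenizer` wrapping a `TreeBuilder` after this commit.

    use encoding_rs::{Encoding, UTF_8};
    use markup5ever::{DecodingParser, InputSink, ParserAction};

    fn parse_document_bytes<Sink: InputSink>(sink: Sink, bytes: &[u8]) -> Option<&'static Encoding> {
        let parser = DecodingParser::new(sink, UTF_8);

        // Decode the raw bytes onto the input stream, then flush whatever the decoder still holds.
        parser.input_stream().append_bytes(bytes);
        parser.input_stream().finish_decoding_input();

        for action in parser.parse() {
            match action {
                // A parser-blocking script: run it, then keep draining the iterator.
                ParserAction::HandleScript(_script) => {},
                // A <meta charset> won: the caller should restart with this encoding.
                ParserAction::StartOverWithEncoding(encoding) => return Some(encoding),
            }
        }
        None
    }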

‎markup5ever/interface/mod.rs

Lines changed: 15 additions & 0 deletions

@@ -12,6 +12,8 @@ use std::cell::Ref;
 use std::fmt;
 use tendril::StrTendril;
 
+use crate::InputSinkResult;
+
 pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText};
 pub use self::tree_builder::{ElemName, Tracer, TreeSink};
 pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
@@ -65,6 +67,19 @@ impl fmt::Debug for ExpandedName<'_> {
 pub enum TokenizerResult<Handle> {
     Done,
     Script(Handle),
+    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
+}
+
+impl<Handle> From<TokenizerResult<Handle>> for Option<InputSinkResult<Handle>> {
+    fn from(value: TokenizerResult<Handle>) -> Self {
+        match value {
+            TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)),
+            TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => {
+                Some(InputSinkResult::MaybeStartOverWithEncoding(encoding))
+            },
+            TokenizerResult::Done => None,
+        }
+    }
 }
 
 /// Helper to quickly create an expanded name.
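
This conversion is what lets both tokenizers build their `InputSink::feed` iterators as `iter::from_fn(|| self.feed(input).into())`: `Done` ends the iteration, everything else becomes an `InputSinkResult`. A tiny illustration (not from the commit, with `u32` standing in for a real handle type):

    use markup5ever::{InputSinkResult, TokenizerResult};

    fn main() {
        // Done maps to None, so iter::from_fn stops.
        let done: Option<InputSinkResult<u32>> = TokenizerResult::<u32>::Done.into();
        assert!(done.is_none());

        // Script handles are forwarded as HandleScript.
        let script: Option<InputSinkResult<u32>> = TokenizerResult::Script(7u32).into();
        assert!(matches!(script, Some(InputSinkResult::HandleScript(7))));
    }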

‎markup5ever/lib.rs

Lines changed: 7 additions & 0 deletions

@@ -48,3 +48,10 @@ mod util {
 pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult};
 pub use util::smallcharset::SmallCharSet;
 pub use util::*;
+
+#[cfg(feature = "encoding")]
+pub mod encoding;
+
+mod input_stream;
+
+pub use input_stream::{DecodingParser, InputSink, InputSinkResult, InputStream, ParserAction};

‎markup5ever/util/buffer_queue.rs

Lines changed: 65 additions & 12 deletions

@@ -18,9 +18,12 @@
 //!
 //! [`BufferQueue`]: struct.BufferQueue.html
 
-use std::{cell::RefCell, collections::VecDeque, mem};
+use std::{cell::RefCell, collections::VecDeque, fmt, mem};
 
-use tendril::StrTendril;
+use tendril::{
+    fmt::{Bytes, SliceFormat, UTF8},
+    Atomicity, NonAtomic, StrTendril, Tendril,
+};
 
 pub use self::SetResult::{FromSet, NotFromSet};
 use crate::util::smallcharset::SmallCharSet;
@@ -38,18 +41,30 @@ pub enum SetResult {
     NotFromSet(StrTendril),
 }
 
-/// A queue of owned string buffers, which supports incrementally consuming characters.
+/// A queue of tendrils, which supports incrementally consuming characters.
 ///
 /// Internally it uses [`VecDeque`] and has the same complexity properties.
 ///
 /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
 #[derive(Debug)]
-pub struct BufferQueue {
+pub struct BufferQueue<F = UTF8, A = NonAtomic>
+where
+    F: SliceFormat + Default,
+    <F as SliceFormat>::Slice: fmt::Debug,
+    A: Atomicity,
+{
     /// Buffers to process.
-    buffers: RefCell<VecDeque<StrTendril>>,
+    buffers: RefCell<VecDeque<Tendril<F, A>>>,
 }
 
-impl Default for BufferQueue {
+pub type ByteBufferQueue = BufferQueue<Bytes>;
+
+impl<F, A> Default for BufferQueue<F, A>
+where
+    F: SliceFormat + Default,
+    <F as SliceFormat>::Slice: fmt::Debug,
+    A: Atomicity,
+{
     /// Create an empty BufferQueue.
     #[inline]
     fn default() -> Self {
@@ -59,7 +74,17 @@ impl Default for BufferQueue {
     }
 }
 
-impl BufferQueue {
+impl<F, A> BufferQueue<F, A>
+where
+    F: SliceFormat + Default,
+    <F as SliceFormat>::Slice: fmt::Debug,
+    A: Atomicity,
+{
+    /// Swap the contents of the two buffers
+    pub fn swap(&self, other: &Self) {
+        mem::swap(&mut *self.buffers.borrow_mut(), &mut *other.buffers.borrow_mut());
+    }
+
     /// Returns whether the queue is empty.
     #[inline]
     pub fn is_empty(&self) -> bool {
@@ -68,14 +93,14 @@ impl BufferQueue {
 
     /// Get the buffer at the beginning of the queue.
     #[inline]
-    pub fn pop_front(&self) -> Option<StrTendril> {
+    pub fn pop_front(&self) -> Option<Tendril<F, A>> {
        self.buffers.borrow_mut().pop_front()
     }
 
     /// Add a buffer to the beginning of the queue.
     ///
     /// If the buffer is empty, it will be skipped.
-    pub fn push_front(&self, buf: StrTendril) {
+    pub fn push_front(&self, buf: Tendril<F, A>) {
         if buf.len32() == 0 {
             return;
         }
@@ -85,13 +110,27 @@ impl BufferQueue {
     /// Add a buffer to the end of the queue.
     ///
     /// If the buffer is empty, it will be skipped.
-    pub fn push_back(&self, buf: StrTendril) {
+    pub fn push_back(&self, buf: Tendril<F, A>) {
         if buf.len32() == 0 {
             return;
         }
         self.buffers.borrow_mut().push_back(buf);
     }
 
+    pub fn insert(&self, index: usize, buffer: Tendril<F, A>) {
+        if buffer.len32() == 0 {
+            return;
+        }
+
+        self.buffers.borrow_mut().insert(index, buffer);
+    }
+
+    pub fn clear(&self) {
+        self.buffers.borrow_mut().clear();
+    }
+}
+
+impl BufferQueue {
     /// Look at the next available character without removing it, if the queue is not empty.
     pub fn peek(&self) -> Option<char> {
         debug_assert!(
@@ -236,18 +275,32 @@ impl BufferQueue {
         result
     }
 
-    pub fn replace_with(&self, other: BufferQueue) {
+    pub fn replace_with(&self, other: Self) {
         let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
     }
 
-    pub fn swap_with(&self, other: &BufferQueue) {
+    pub fn swap_with(&self, other: &Self) {
         mem::swap(
             &mut *self.buffers.borrow_mut(),
             &mut *other.buffers.borrow_mut(),
         );
     }
 }
 
+impl<F, A> IntoIterator for BufferQueue<F, A>
+where
+    F: SliceFormat + Default,
+    <F as SliceFormat>::Slice: fmt::Debug,
+    A: Atomicity,
+{
+    type Item = Tendril<F, A>;
+    type IntoIter = <VecDeque<Tendril<F, A>> as IntoIterator>::IntoIter;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.buffers.into_inner().into_iter()
+    }
+}
+
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod test {
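
Because the queue is now generic over the tendril format, byte buffers and character buffers share one implementation. A minimal sketch of the two instantiations (not from the commit):

    use markup5ever::buffer_queue::{BufferQueue, ByteBufferQueue};
    use tendril::{ByteTendril, StrTendril};

    fn main() {
        // The defaults (UTF8, NonAtomic) behave like the old StrTendril-only queue.
        let chars: BufferQueue = BufferQueue::default();
        chars.push_back(StrTendril::from("<p>hi</p>"));
        assert_eq!(chars.peek(), Some('<'));

        // The new ByteBufferQueue alias holds undecoded byte tendrils.
        let bytes: ByteBufferQueue = ByteBufferQueue::default();
        bytes.push_back(ByteTendril::from(&b"<p>hi</p>"[..]));
        assert!(!bytes.is_empty());

        // clear() is available for any element format.
        chars.clear();
        assert!(chars.is_empty());
    }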

‎rcdom/tests/html-serializer.rs

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ impl Serialize for Tokens
 
 fn tokenize_and_serialize(input: StrTendril) -> StrTendril {
     let input = {
-        let q = ::html5ever::tokenizer::BufferQueue::default();
+        let q = markup5ever::buffer_queue::BufferQueue::default();
         q.push_front(input);
         q
     };

‎rcdom/tests/html-tokenizer.rs

Lines changed: 1 addition & 1 deletion

@@ -14,12 +14,12 @@ use html5ever::tendril::*;
 use html5ever::tokenizer::states::{
     CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData,
 };
-use html5ever::tokenizer::BufferQueue;
 use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
 use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
 use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
 use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
 use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
+use markup5ever::buffer_queue::BufferQueue;
 use serde_json::{Map, Value};
 use std::cell::RefCell;
 use std::ffi::OsStr;

‎xml5ever/src/tokenizer/mod.rs

Lines changed: 15 additions & 2 deletions

@@ -24,14 +24,16 @@ use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
 use log::debug;
 use mac::{format_if, unwrap_or_return};
 use markup5ever::{
-    local_name, namespace_prefix, namespace_url, ns, small_char_set, TokenizerResult,
+    buffer_queue::BufferQueue, local_name, namespace_prefix, namespace_url, ns, small_char_set,
+    InputSink, InputSinkResult, TokenizerResult,
 };
 use std::borrow::Cow::{self, Borrowed};
 use std::cell::{Cell, RefCell, RefMut};
 use std::collections::BTreeMap;
+use std::iter;
 use std::mem::replace;
 
-use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+use self::buffer_queue::{FromSet, NotFromSet, SetResult};
 use self::char_ref::{CharRef, CharRefTokenizer};
 use self::qname::QualNameTokenizer;
 use self::states::XmlState;
@@ -1299,6 +1301,17 @@ impl<Sink: TokenSink> XmlTokenizer<Sink> {
     }
 }
 
+impl<Sink> InputSink for XmlTokenizer<Sink>
+where
+    Sink: TokenSink,
+{
+    type Handle = Sink::Handle;
+
+    fn feed<'a>(&'a self, input: &'a BufferQueue) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
+        iter::from_fn(|| self.feed(input).into())
+    }
+}
+
 #[cfg(test)]
 mod test {
