Skip to content

Handle decoding of input in html5ever #590

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions html5ever/Cargo.toml
Original file line number Diff line number Diff line change
@@ -13,13 +13,16 @@ readme = "../README.md"
rust-version.workspace = true

[features]
default = ["encoding"]
trace_tokenizer = []
encoding = ["dep:encoding_rs", "markup5ever/encoding"]

[dependencies]
log = "0.4"
mac = "0.1"
markup5ever = { version = "0.16", path = "../markup5ever" }
match_token = { workspace = true }
encoding_rs = { version = "0.8", optional = true }

[dev-dependencies]
criterion = "0.6"
3 changes: 2 additions & 1 deletion html5ever/examples/noop-tokenize.rs
Original file line number Diff line number Diff line change
@@ -15,7 +15,8 @@ use std::cell::RefCell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
use markup5ever::buffer_queue::BufferQueue;

/// In our case, our sink only contains a tokens vector
struct Sink(RefCell<Vec<Token>>);
2 changes: 1 addition & 1 deletion html5ever/examples/tokenize.rs
Original file line number Diff line number Diff line change
@@ -13,11 +13,11 @@ use std::cell::Cell;
use std::io;

use html5ever::tendril::*;
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
use html5ever::tokenizer::{
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
use markup5ever::buffer_queue::BufferQueue;

#[derive(Clone)]
struct TokenPrinter {
2 changes: 1 addition & 1 deletion html5ever/src/tokenizer/char_ref/mod.rs
Original file line number Diff line number Diff line change
@@ -8,12 +8,12 @@
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use mac::format_if;
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

2 changes: 2 additions & 0 deletions html5ever/src/tokenizer/interface.rs
Original file line number Diff line number Diff line change
@@ -77,6 +77,8 @@ pub enum TokenSinkResult<Handle> {
Script(Handle),
Plaintext,
RawData(states::RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

/// Types which can receive tokens from the tokenizer.
46 changes: 38 additions & 8 deletions html5ever/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};

use crate::util::str::lower_ascii_letter;

use log::{debug, trace};
use mac::format_if;
use markup5ever::{ns, small_char_set, TokenizerResult};
use markup5ever::{
buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
TokenizerResult,
};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
use std::mem;
use std::{iter, mem};

pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
use crate::tendril::StrTendril;
use crate::{Attribute, LocalName, QualName, SmallCharSet};

@@ -43,6 +45,8 @@ pub enum ProcessResult<Handle> {
Continue,
Suspend,
Script(Handle),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +361,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
} else {
@@ -365,6 +373,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}
}
@@ -456,6 +468,10 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
#[cfg(feature = "encoding")]
TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
},
}
}

@@ -1725,6 +1741,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(_) => unreachable!(),
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
}
}

@@ -2001,13 +2019,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
}
}

impl<Sink> InputSink for Tokenizer<Sink>
where
    Sink: TokenSink,
{
    type Handle = Sink::Handle;

    /// Drive the tokenizer from `input`, yielding one item per script or
    /// proposed encoding change surfaced by tokenization.
    fn feed<'a>(
        &'a self,
        input: &'a BufferQueue,
    ) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
        // `Tokenizer::feed` returns a `TokenizerResult`; its `From` conversion maps
        // `Done` to `None`, which ends this iterator until more input is available.
        iter::from_fn(|| self.feed(input).into())
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
use crate::tendril::{SliceExt, StrTendril};

use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use crate::tendril::{SliceExt, StrTendril};
use crate::LocalName;

use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -2016,8 +2048,6 @@ mod test {
use markup5ever::buffer_queue::BufferQueue;
use std::cell::RefCell;

use crate::LocalName;

// LinesMatch implements the TokenSink trait. It is used for testing to see
// if current_line is being updated when process_token is called. The lines
// vector is a collection of the line numbers that each token is on.
4 changes: 4 additions & 0 deletions html5ever/src/tree_builder/mod.rs
Original file line number Diff line number Diff line change
@@ -396,6 +396,10 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
#[cfg(feature = "encoding")]
ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
},
}
}
}
34 changes: 25 additions & 9 deletions html5ever/src/tree_builder/rules.rs
Original file line number Diff line number Diff line change
@@ -10,21 +10,24 @@
// The tree builder rules, as a single, enormous nested match expression.

use crate::interface::Quirks;
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
use crate::tokenizer::states::{Rawtext, Rcdata};
use crate::tokenizer::TagKind::{EndTag, StartTag};
use crate::tree_builder::tag_sets::*;
use crate::tree_builder::types::*;
use crate::tree_builder::{
create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
TreeSink,
};
use crate::QualName;
use markup5ever::{expanded_name, local_name, ns};
use crate::tree_builder::RawKind::ScriptData;
use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};

use markup5ever::interface::create_element;
use markup5ever::interface::NodeOrText::AppendNode;
use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
use std::borrow::Cow::Borrowed;

use crate::tendril::SliceExt;
use match_token::match_token;

#[cfg(feature = "encoding")]
use encoding_rs::Encoding;

fn any_not_whitespace(x: &StrTendril) -> bool {
// FIXME: this might be much faster as a byte scan
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where

<html> => self.step(InsertionMode::InBody, token),

tag @ <base> <basefont> <bgsound> <link> <meta> => {
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
tag @ <meta> => {
// FIXME: handle <meta http-equiv="Content-Type">
#[cfg(feature = "encoding")]
if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
self.insert_and_pop_element_for(tag);
return ProcessResult::MaybeChangeEncodingAndStartOver(encoding);
}
}

self.insert_and_pop_element_for(tag);
ProcessResult::DoneAckSelfClosing
},

tag @ <base> <basefont> <bgsound> <link> => {
self.insert_and_pop_element_for(tag);
ProcessResult::DoneAckSelfClosing
}
2 changes: 2 additions & 0 deletions html5ever/src/tree_builder/types.rs
Original file line number Diff line number Diff line change
@@ -70,6 +70,8 @@ pub(crate) enum ProcessResult<Handle> {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
#[cfg(feature = "encoding")]
MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

pub(crate) enum FormatEntry<Handle> {
10 changes: 9 additions & 1 deletion markup5ever/Cargo.toml
Original file line number Diff line number Diff line change
@@ -13,7 +13,15 @@ rust-version.workspace = true
[lib]
path = "lib.rs"

[features]
encoding = ["dep:encoding_rs"]

[dependencies]
web_atoms = { version = "0.1", path = "../web_atoms" }
tendril = "0.4"
log = "0.4"
log = "0.4"
encoding_rs = { version = "0.8", optional = true }

[build-dependencies]
string_cache_codegen = "0.5.4"
phf_codegen = "0.11"
133 changes: 133 additions & 0 deletions markup5ever/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
use tendril::{fmt::Bytes, Tendril};

use crate::buffer_queue::BufferQueue;

/// <https://html.spec.whatwg.org/#concept-encoding-confidence>
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Confidence {
    /// The encoding was guessed and may later be revised (e.g. by a `<meta charset>`).
    Tentative,
    /// The encoding is known to be correct and will not be changed.
    Certain,
    /// Per the spec, the confidence is irrelevant (the input is already decoded).
    Irrelevant,
}

/// A streaming decoder converting an incoming byte stream to UTF-8, while
/// tracking how confident we are in the chosen character encoding.
pub struct Decoder {
    // The underlying encoding_rs decoder performing the byte -> UTF-8 conversion.
    inner: encoding_rs::Decoder,
    // How certain we are that `inner`'s encoding is the right one
    // (<https://html.spec.whatwg.org/#concept-encoding-confidence>).
    confidence: Confidence,
}

impl Decoder {
pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self {
Self {
inner: encoding.new_decoder(),
confidence,
}
}

pub fn confidence(&self) -> Confidence {
self.confidence
}

/// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding
/// should be changed to `encoding`
pub fn change_the_encoding_to(
&mut self,
mut new_encoding: &'static Encoding,
) -> Option<&'static Encoding> {
let current_encoding = self.inner.encoding();
// Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE,
// then set the confidence to certain and return. The new encoding is ignored; if it was anything
// but the same encoding, then it would be clearly incorrect.
if current_encoding == UTF_16BE || current_encoding == UTF_16BE {
self.confidence = Confidence::Certain;
return None;
}

// Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8.
if new_encoding == UTF_16BE || new_encoding == UTF_16BE {
new_encoding = UTF_8;
}

// Step 3. If the new encoding is x-user-defined, then change it to windows-1252.
if new_encoding == X_USER_DEFINED {
new_encoding = WINDOWS_1252;
}

// Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret
// the input stream, then set the confidence to certain and return. This happens when the encoding information found
// in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass
// through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section
// failed to find the right encoding.
if current_encoding == new_encoding {
self.confidence = Confidence::Certain;
return None;
}

// Step 5. If all the bytes up to the last byte converted by the current decoder have the same
// Unicode interpretations in both the current encoding and the new encoding, and if the user agent
// supports changing the converter on the fly, then the user agent may change to the new converter
// for the encoding on the fly. Set the document's character encoding and the encoding used to convert
// the input stream to the new encoding, set the confidence to certain, and return.
// NOTE: We don't support changing the converter on the fly

// Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and
// other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just
// set the encoding to the new encoding and the confidence to certain. Whenever possible, this should
// be done without actually contacting the network layer (the bytes should be re-parsed from memory),
// even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting
// the network layer would involve repeating a request that uses a method other than `GET`, then instead
// set the confidence to certain and ignore the new encoding. The resource will be misinterpreted.
// User agents may notify the user of the situation, to aid in application development.
Some(new_encoding)
}

/// Decode the given chunk with the current encoding. The result will be pushed to the end
/// of the input stream.
pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) {
let mut remaining = chunk;
loop {
let mut out: Tendril<Bytes> = Tendril::new();
let max_len = self
.inner
.max_utf8_buffer_length_without_replacement(remaining.len())
.unwrap_or(8192)
.min(8192);

// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize
// part of the buffer. We are only going to access the initialized segment.
unsafe {
out.push_uninitialized(max_len as u32);
}

let (result, bytes_read, bytes_written) = self
.inner
.decode_to_utf8_without_replacement(&remaining, &mut out, last);

if bytes_written > 0 {
let bytes_chunk = out.subtendril(0, bytes_written as u32);

// SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8
let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() };
output.push_back(utf8_chunk);
}

if matches!(result, DecoderResult::Malformed(_, _)) {
output.push_back("\u{FFFD}".into());
}

remaining = &remaining[bytes_read..];
if remaining.is_empty() {
return;
}
}
}
}
167 changes: 167 additions & 0 deletions markup5ever/input_stream.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use std::cell::RefCell;

use encoding_rs::Encoding;
use tendril::StrTendril;

use crate::buffer_queue::BufferQueue;
use crate::encoding::{Confidence, Decoder};

/// <https://html.spec.whatwg.org/#input-stream>
pub struct InputStream {
    // Decoded code points waiting to be consumed by the parser.
    input: BufferQueue,
    // Decoder that converts appended bytes into UTF-8 for `input`.
    // RefCell: the stream is shared immutably, but decoding mutates decoder state.
    decoder: RefCell<Decoder>,
}

impl InputStream {
fn new(encoding: &'static Encoding) -> Self {
Self {
input: Default::default(),
decoder: RefCell::new(Decoder::new(encoding, Confidence::Tentative)),
}
}

pub fn append(&self, data: StrTendril) {
self.input.push_back(data);
}

pub fn append_bytes(&self, data: &[u8]) {
self.decoder.borrow_mut().decode(data, false, &self.input);
}

pub fn code_points(&self) -> &BufferQueue {
&self.input
}

/// Attempt to switch to another encoding.
///
/// If the encoding was switched then the new encoding is returned. Note that the new encoding may be
/// different from the one that this function was called with.
pub fn maybe_switch_encoding(&self, encoding: &'static Encoding) -> Option<&'static Encoding> {
if self.decoder.borrow().confidence() == Confidence::Tentative {
if let Some(new_encoding) = self.decoder.borrow_mut().change_the_encoding_to(encoding) {
return Some(new_encoding);
}
}
None
}

/// Move any input that is left in the decoding stage to the end of the input stream
pub fn finish_decoding_input(&self) {
self.decoder.borrow_mut().decode(&[], true, &self.input);
}

/// Remove all input from the stream
pub fn clear(&self) {
self.input.clear();
}
}

/// Couples an input stream (byte decoding) with an [`InputSink`] (e.g. a tokenizer),
/// driving the sink from the decoded input.
pub struct DecodingParser<Sink> {
    /// Data received from `document.write`
    script_input: BufferQueue,
    // Decoded document input waiting to be parsed.
    input_stream: InputStream,
    // The consumer that decoded code points are fed into.
    input_sink: Sink,
}

impl<Sink> DecodingParser<Sink>
where
Sink: InputSink,
{
pub fn new(sink: Sink, document_encoding: &'static Encoding) -> Self {
Self {
script_input: Default::default(),
input_stream: InputStream::new(document_encoding),
input_sink: sink,
}
}

pub fn sink(&self) -> &Sink {
&self.input_sink
}

pub fn input_stream(&self) -> &InputStream {
&self.input_stream
}

/// Return an iterator that can be used to drive the parser
pub fn parse(&self) -> impl Iterator<Item = ParserAction<Sink::Handle>> + '_ {
self.input_sink
.feed(self.input_stream.code_points())
.filter_map(|sink_result| match sink_result {
InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
.input_stream
.maybe_switch_encoding(encoding)
.map(ParserAction::StartOverWithEncoding),
})
}

/// Returns an iterator that can be used to drive the parser
pub fn document_write<'a>(
&'a self,
input: &'a BufferQueue,
) -> impl Iterator<Item = ParserAction<Sink::Handle>> + use<'a, Sink> {
debug_assert!(
self.script_input.is_empty(),
"Should not parse input from document.write while the parser is suspended"
);

self.input_sink
.feed(&input)
.filter_map(move |sink_result| match sink_result {
InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
.input_stream
.maybe_switch_encoding(encoding)
.map(ParserAction::StartOverWithEncoding),
})
}

/// End a `document.write` transaction, appending any input that was not yet parsed to the
/// current insertion point, behind any input that was received reentrantly during this transaction.
pub fn push_script_input(&self, input: &BufferQueue) {
while let Some(chunk) = input.pop_front() {
self.script_input.push_back(chunk);
}
}

/// Notifies the parser that it has been unblocked and parsing can resume
pub fn notify_parser_blocking_script_loaded(&self) {
// Move pending script input to the front of the input stream
self.script_input.swap_with(&self.input_stream.input);
while let Some(chunk) = self.script_input.pop_front() {
self.input_stream.input.push_back(chunk);
}
}
}

/// An action the driver of the parser must take before parsing can continue.
pub enum ParserAction<Handle> {
    /// A script handle surfaced by the sink that the caller must handle
    /// (presumably execute — see the embedder for the exact semantics).
    HandleScript(Handle),
    /// The encoding switch was accepted: re-parse from the start with this encoding.
    StartOverWithEncoding(&'static Encoding),
}

/// The outcome of feeding input to an [`InputSink`].
pub enum InputSinkResult<Handle> {
    /// The sink produced a script handle that needs external handling.
    HandleScript(Handle),
    /// The sink saw an encoding declaration; the stream may need to switch
    /// encodings (subject to the current decoding confidence).
    MaybeStartOverWithEncoding(&'static Encoding),
}

/// A consumer of decoded input code points, such as a tokenizer.
pub trait InputSink {
    /// The handle type carried by [`InputSinkResult::HandleScript`].
    type Handle;

    /// Lazily feed code points from `input` to the sink, yielding any results
    /// that require action from the caller. The iterator borrows `input`.
    fn feed<'a>(
        &'a self,
        input: &'a BufferQueue,
    ) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a;
}

impl<T> ParserAction<T> {
    /// Apply `f` to the contained script handle, if any; encoding actions pass
    /// through unchanged.
    pub fn map_script<U, F>(self, f: F) -> ParserAction<U>
    where
        F: FnOnce(T) -> U,
    {
        match self {
            ParserAction::StartOverWithEncoding(encoding) => {
                ParserAction::StartOverWithEncoding(encoding)
            },
            ParserAction::HandleScript(script) => ParserAction::HandleScript(f(script)),
        }
    }
}
15 changes: 15 additions & 0 deletions markup5ever/interface/mod.rs
Original file line number Diff line number Diff line change
@@ -13,6 +13,8 @@ use std::fmt;
use tendril::StrTendril;
use web_atoms::{LocalName, Namespace, Prefix};

use crate::InputSinkResult;

pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText};
pub use self::tree_builder::{ElemName, Tracer, TreeSink};
pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
@@ -65,6 +67,19 @@ impl fmt::Debug for ExpandedName<'_> {
pub enum TokenizerResult<Handle> {
    Done,
    Script(Handle),
    // NOTE(review): this variant references `encoding_rs` unconditionally, while
    // `encoding_rs` is an optional dependency of this crate (behind the `encoding`
    // feature). Confirm this compiles with the feature disabled, or gate the
    // variant with #[cfg(feature = "encoding")] and update the `From` impl below.
    MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}

// Map a tokenizer result onto the input-sink vocabulary: `Done` becomes `None`
// (nothing for the caller to act on), the other variants become the
// corresponding `InputSinkResult`.
impl<Handle> From<TokenizerResult<Handle>> for Option<InputSinkResult<Handle>> {
    fn from(value: TokenizerResult<Handle>) -> Self {
        match value {
            TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)),
            TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => {
                Some(InputSinkResult::MaybeStartOverWithEncoding(encoding))
            },
            TokenizerResult::Done => None,
        }
    }
}

/// Helper to quickly create an expanded name.
7 changes: 7 additions & 0 deletions markup5ever/lib.rs
Original file line number Diff line number Diff line change
@@ -57,3 +57,10 @@ mod util {
pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult};
pub use util::smallcharset::SmallCharSet;
pub use util::*;

#[cfg(feature = "encoding")]
pub mod encoding;

mod input_stream;

pub use input_stream::{DecodingParser, InputSink, InputSinkResult, InputStream, ParserAction};
80 changes: 68 additions & 12 deletions markup5ever/util/buffer_queue.rs
Original file line number Diff line number Diff line change
@@ -21,10 +21,13 @@
use std::{
cell::{RefCell, RefMut},
collections::VecDeque,
mem,
fmt, mem,
};

use tendril::StrTendril;
use tendril::{
fmt::{Bytes, SliceFormat, UTF8},
Atomicity, NonAtomic, StrTendril, Tendril,
};

pub use self::SetResult::{FromSet, NotFromSet};
use crate::util::smallcharset::SmallCharSet;
@@ -42,18 +45,30 @@ pub enum SetResult {
NotFromSet(StrTendril),
}

/// A queue of owned string buffers, which supports incrementally consuming characters.
/// A queue of tendrils, which supports incrementally consuming characters.
///
/// Internally it uses [`VecDeque`] and has the same complexity properties.
///
/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
#[derive(Debug)]
pub struct BufferQueue {
pub struct BufferQueue<F = UTF8, A = NonAtomic>
where
F: SliceFormat + Default,
<F as SliceFormat>::Slice: fmt::Debug,
A: Atomicity,
{
/// Buffers to process.
buffers: RefCell<VecDeque<StrTendril>>,
buffers: RefCell<VecDeque<Tendril<F, A>>>,
}

impl Default for BufferQueue {
pub type ByteBufferQueue = BufferQueue<Bytes>;

impl<F, A> Default for BufferQueue<F, A>
where
F: SliceFormat + Default,
<F as SliceFormat>::Slice: fmt::Debug,
A: Atomicity,
{
/// Create an empty BufferQueue.
#[inline]
fn default() -> Self {
@@ -63,7 +78,20 @@ impl Default for BufferQueue {
}
}

impl BufferQueue {
impl<F, A> BufferQueue<F, A>
where
F: SliceFormat + Default,
<F as SliceFormat>::Slice: fmt::Debug,
A: Atomicity,
{
/// Swap the contents of the two buffers
pub fn swap(&self, other: &Self) {
mem::swap(
&mut self.buffers.borrow_mut(),
&mut other.buffers.borrow_mut(),
);
}

/// Returns whether the queue is empty.
#[inline]
pub fn is_empty(&self) -> bool {
@@ -72,14 +100,14 @@ impl BufferQueue {

/// Get the buffer at the beginning of the queue.
#[inline]
pub fn pop_front(&self) -> Option<StrTendril> {
pub fn pop_front(&self) -> Option<Tendril<F, A>> {
self.buffers.borrow_mut().pop_front()
}

/// Add a buffer to the beginning of the queue.
///
/// If the buffer is empty, it will be skipped.
pub fn push_front(&self, buf: StrTendril) {
pub fn push_front(&self, buf: Tendril<F, A>) {
if buf.len32() == 0 {
return;
}
@@ -89,13 +117,27 @@ impl BufferQueue {
/// Add a buffer to the end of the queue.
///
/// If the buffer is empty, it will be skipped.
pub fn push_back(&self, buf: StrTendril) {
pub fn push_back(&self, buf: Tendril<F, A>) {
if buf.len32() == 0 {
return;
}
self.buffers.borrow_mut().push_back(buf);
}

pub fn insert(&self, index: usize, buffer: Tendril<F, A>) {
if buffer.len32() == 0 {
return;
}

self.buffers.borrow_mut().insert(index, buffer);
}

pub fn clear(&self) {
self.buffers.borrow_mut().clear();
}
}

impl BufferQueue {
/// Look at the next available character without removing it, if the queue is not empty.
pub fn peek(&self) -> Option<char> {
debug_assert!(
@@ -240,11 +282,11 @@ impl BufferQueue {
result
}

pub fn replace_with(&self, other: BufferQueue) {
pub fn replace_with(&self, other: Self) {
let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
}

pub fn swap_with(&self, other: &BufferQueue) {
pub fn swap_with(&self, other: &Self) {
mem::swap(
&mut *self.buffers.borrow_mut(),
&mut *other.buffers.borrow_mut(),
@@ -265,6 +307,20 @@ impl BufferQueue {
}
}

// Consume the queue, yielding its buffers front-to-back.
impl<F, A> IntoIterator for BufferQueue<F, A>
where
    F: SliceFormat + Default,
    <F as SliceFormat>::Slice: fmt::Debug,
    A: Atomicity,
{
    type Item = Tendril<F, A>;
    type IntoIter = <VecDeque<Tendril<F, A>> as IntoIterator>::IntoIter;

    fn into_iter(self) -> Self::IntoIter {
        // `into_inner` unwraps the RefCell; safe because `self` is owned here.
        self.buffers.into_inner().into_iter()
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
2 changes: 1 addition & 1 deletion rcdom/tests/html-serializer.rs
Original file line number Diff line number Diff line change
@@ -68,7 +68,7 @@ impl Serialize for Tokens {

fn tokenize_and_serialize(input: StrTendril) -> StrTendril {
let input = {
let q = ::html5ever::tokenizer::BufferQueue::default();
let q = markup5ever::buffer_queue::BufferQueue::default();
q.push_front(input);
q
};
4 changes: 2 additions & 2 deletions rcdom/tests/html-tokenizer.rs
Original file line number Diff line number Diff line change
@@ -14,12 +14,12 @@ use html5ever::tendril::*;
use html5ever::tokenizer::states::{
CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData,
};
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{ns, Attribute, LocalName, QualName};
use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
use markup5ever::buffer_queue::BufferQueue;
use serde_json::{Map, Value};
use std::cell::RefCell;
use std::ffi::OsStr;
22 changes: 20 additions & 2 deletions xml5ever/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -23,13 +23,17 @@ use crate::tendril::StrTendril;
use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
use markup5ever::{
buffer_queue::BufferQueue, local_name, namespace_prefix, namespace_url, ns, small_char_set,
InputSink, InputSinkResult, TokenizerResult,
};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
use std::iter;
use std::mem::replace;

use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
use self::buffer_queue::{FromSet, NotFromSet, SetResult};
use self::char_ref::{CharRef, CharRefTokenizer};
use self::qname::QualNameTokenizer;
use self::states::XmlState;
@@ -1297,6 +1301,20 @@ impl<Sink: TokenSink> XmlTokenizer<Sink> {
}
}

impl<Sink> InputSink for XmlTokenizer<Sink>
where
    Sink: TokenSink,
{
    type Handle = Sink::Handle;

    /// Drive the XML tokenizer from `input`, yielding one item per result that
    /// needs caller action (mirrors the html5ever `Tokenizer` impl).
    fn feed<'a>(
        &'a self,
        input: &'a BufferQueue,
    ) -> impl Iterator<Item = InputSinkResult<Self::Handle>> + 'a {
        // The `From` conversion maps `TokenizerResult::Done` to `None`, ending
        // the iterator until more input is available.
        iter::from_fn(|| self.feed(input).into())
    }
}

#[cfg(test)]
mod test {