@@ -704,7 +704,52 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
704
704
match self . state . get ( ) {
705
705
//§ data-state
706
706
states:: Data => loop {
707
- match pop_except_from ! ( self , input, small_char_set!( '\r' '\0' '&' '<' '\n' ) ) {
707
+ let set = small_char_set ! ( '\r' '\0' '&' '<' '\n' ) ;
708
+
709
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
710
+ let set_result = if !( self . opts . exact_errors
711
+ || self . reconsume . get ( )
712
+ || self . ignore_lf . get ( ) )
713
+ && is_x86_feature_detected ! ( "sse2" )
714
+ {
715
+ let front_buffer = input. peek_front_chunk_mut ( ) ;
716
+ let Some ( mut front_buffer) = front_buffer else {
717
+ return ProcessResult :: Suspend ;
718
+ } ;
719
+
720
+ // Special case: The fast path is not worth taking if the first character is already in the set,
721
+ // which is fairly common
722
+ let first_char = front_buffer
723
+ . chars ( )
724
+ . next ( )
725
+ . expect ( "Input buffers are never empty" ) ;
726
+
727
+ if matches ! ( first_char, '\r' | '\0' | '&' | '<' | '\n' ) {
728
+ drop ( front_buffer) ;
729
+ self . pop_except_from ( input, set)
730
+ } else {
731
+ // SAFETY:
732
+ // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
733
+ let result = unsafe { self . data_state_sse2_fast_path ( & mut front_buffer) } ;
734
+
735
+ if front_buffer. is_empty ( ) {
736
+ drop ( front_buffer) ;
737
+ input. pop_front ( ) ;
738
+ }
739
+
740
+ result
741
+ }
742
+ } else {
743
+ self . pop_except_from ( input, set)
744
+ } ;
745
+
746
+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
747
+ let set_result = self . pop_except_from ( input, set) ;
748
+
749
+ let Some ( set_result) = set_result else {
750
+ return ProcessResult :: Suspend ;
751
+ } ;
752
+ match set_result {
708
753
FromSet ( '\0' ) => {
709
754
self . bad_char_error ( ) ;
710
755
self . emit_char ( '\0' ) ;
@@ -1839,6 +1884,121 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
1839
1884
states:: CdataSectionEnd => go ! ( self : push_temp ']' ; push_temp ']' ; to CdataSection ) ,
1840
1885
}
1841
1886
}
1887
+
1888
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1889
+ #[ target_feature( enable = "sse2" ) ]
1890
+ /// Implements the [data state] with SIMD instructions.
1891
+ ///
1892
+ /// The algorithm implemented is the naive SIMD approach described [here].
1893
+ ///
1894
+ /// ### SAFETY:
1895
+ /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
1896
+ ///
1897
+ /// [data state]: https://html.spec.whatwg.org/#data-state
1898
+ /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1899
+ unsafe fn data_state_sse2_fast_path ( & self , input : & mut StrTendril ) -> Option < SetResult > {
1900
+ #[ cfg( target_arch = "x86" ) ]
1901
+ use std:: arch:: x86:: {
1902
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1903
+ _mm_set1_epi8,
1904
+ } ;
1905
+ #[ cfg( target_arch = "x86_64" ) ]
1906
+ use std:: arch:: x86_64:: {
1907
+ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
1908
+ _mm_set1_epi8,
1909
+ } ;
1910
+
1911
+ debug_assert ! ( !input. is_empty( ) ) ;
1912
+
1913
+ let quote_mask = _mm_set1_epi8 ( '<' as i8 ) ;
1914
+ let escape_mask = _mm_set1_epi8 ( '&' as i8 ) ;
1915
+ let carriage_return_mask = _mm_set1_epi8 ( '\r' as i8 ) ;
1916
+ let zero_mask = _mm_set1_epi8 ( '\0' as i8 ) ;
1917
+ let newline_mask = _mm_set1_epi8 ( '\n' as i8 ) ;
1918
+
1919
+ let raw_bytes: & [ u8 ] = input. as_bytes ( ) ;
1920
+ let start = raw_bytes. as_ptr ( ) ;
1921
+
1922
+ const STRIDE : usize = 16 ;
1923
+ let mut i = 0 ;
1924
+ let mut n_newlines = 0 ;
1925
+ while i + STRIDE <= raw_bytes. len ( ) {
1926
+ // Load a 16 byte chunk from the input
1927
+ let data = _mm_loadu_si128 ( start. add ( i) as * const __m128i ) ;
1928
+
1929
+ // Compare the chunk against each mask
1930
+ let quotes = _mm_cmpeq_epi8 ( data, quote_mask) ;
1931
+ let escapes = _mm_cmpeq_epi8 ( data, escape_mask) ;
1932
+ let carriage_returns = _mm_cmpeq_epi8 ( data, carriage_return_mask) ;
1933
+ let zeros = _mm_cmpeq_epi8 ( data, zero_mask) ;
1934
+ let newlines = _mm_cmpeq_epi8 ( data, newline_mask) ;
1935
+
1936
+ // Combine all test results and create a bitmask from them.
1937
+ // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
1938
+ let test_result = _mm_or_si128 (
1939
+ _mm_or_si128 ( quotes, zeros) ,
1940
+ _mm_or_si128 ( escapes, carriage_returns) ,
1941
+ ) ;
1942
+ let bitmask = _mm_movemask_epi8 ( test_result) ;
1943
+ let newline_mask = _mm_movemask_epi8 ( newlines) ;
1944
+
1945
+ if ( bitmask != 0 ) {
1946
+ // We have reached one of the characters that cause the state machine to transition
1947
+ let position = if cfg ! ( target_endian = "little" ) {
1948
+ bitmask. trailing_zeros ( ) as usize
1949
+ } else {
1950
+ bitmask. leading_zeros ( ) as usize
1951
+ } ;
1952
+
1953
+ n_newlines += ( newline_mask & ( ( 1 << position) - 1 ) ) . count_ones ( ) as u64 ;
1954
+ i += position;
1955
+ break ;
1956
+ } else {
1957
+ n_newlines += newline_mask. count_ones ( ) as u64 ;
1958
+ }
1959
+
1960
+ i += STRIDE ;
1961
+ }
1962
+
1963
+ // Process any remaining bytes (less than STRIDE)
1964
+ while let Some ( c) = raw_bytes. get ( i) {
1965
+ if matches ! ( * c, b'<' | b'&' | b'\r' | b'\0' ) {
1966
+ break ;
1967
+ }
1968
+ if * c == b'\n' {
1969
+ n_newlines += 1 ;
1970
+ }
1971
+
1972
+ i += 1 ;
1973
+ }
1974
+
1975
+ let set_result = if i == 0 {
1976
+ let first_char = input. pop_front_char ( ) . unwrap ( ) ;
1977
+ debug_assert ! ( matches!( first_char, '<' | '&' | '\r' | '\0' ) ) ;
1978
+
1979
+ // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1980
+ // Still, it would be nice to not have to do that.
1981
+ // The same is true for the unwrap call.
1982
+ let preprocessed_char = self
1983
+ . get_preprocessed_char ( first_char, & BufferQueue :: default ( ) )
1984
+ . unwrap ( ) ;
1985
+ SetResult :: FromSet ( preprocessed_char)
1986
+ } else {
1987
+ debug_assert ! (
1988
+ input. len( ) >= i,
1989
+ "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" ,
1990
+ i,
1991
+ input. len( )
1992
+ ) ;
1993
+ let consumed_chunk = input. unsafe_subtendril ( 0 , i as u32 ) ;
1994
+ input. unsafe_pop_front ( i as u32 ) ;
1995
+ SetResult :: NotFromSet ( consumed_chunk)
1996
+ } ;
1997
+
1998
+ self . current_line . set ( self . current_line . get ( ) + n_newlines) ;
1999
+
2000
+ Some ( set_result)
2001
+ }
1842
2002
}
1843
2003
1844
2004
#[ cfg( test) ]
0 commit comments