@@ -706,11 +706,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
706706            states:: Data  => loop  { 
707707                let  set = small_char_set ! ( '\r'  '\0'  '&'  '<'  '\n' ) ; 
708708
709-                 #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ,  target_arch =  "aarch64" ) ) ]  
709+                 #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ]  
710710                let  set_result = if  !( self . opts . exact_errors 
711711                    || self . reconsume . get ( ) 
712712                    || self . ignore_lf . get ( ) ) 
713-                     && Self :: is_supported_simd_feature_detected ( ) 
713+                     && is_x86_feature_detected ! ( "sse2" ) 
714714                { 
715715                    let  front_buffer = input. peek_front_chunk_mut ( ) ; 
716716                    let  Some ( mut  front_buffer)  = front_buffer else  { 
@@ -729,8 +729,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
729729                        self . pop_except_from ( input,  set) 
730730                    }  else  { 
731731                        // SAFETY: 
732-                         // This CPU is guaranteed to support SIMD  due to the is_supported_simd_feature_detected  check above 
733-                         let  result = unsafe  {  self . data_state_simd_fast_path ( & mut  front_buffer)  } ; 
732+                         // This CPU is guaranteed to support SSE2  due to the is_x86_feature_detected  check above 
733+                         let  result = unsafe  {  self . data_state_sse2_fast_path ( & mut  front_buffer)  } ; 
734734
735735                        if  front_buffer. is_empty ( )  { 
736736                            drop ( front_buffer) ; 
@@ -743,11 +743,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
743743                    self . pop_except_from ( input,  set) 
744744                } ; 
745745
746-                 #[ cfg( not( any(  
747-                     target_arch = "x86" ,  
748-                     target_arch = "x86_64" ,  
749-                     target_arch = "aarch64"  
750-                 ) ) ) ]  
746+                 #[ cfg( not( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ) ]  
751747                let  set_result = self . pop_except_from ( input,  set) ; 
752748
753749                let  Some ( set_result)  = set_result else  { 
@@ -1889,90 +1885,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18891885        } 
18901886    } 
18911887
1892-     /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64. 
1893- fn  is_supported_simd_feature_detected ( )  -> bool  { 
1894-         #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ]  
1895-         { 
1896-             is_x86_feature_detected ! ( "sse2" ) 
1897-         } 
1898- 
1899-         #[ cfg( target_arch = "aarch64" ) ]  
1900-         { 
1901-             std:: arch:: is_aarch64_feature_detected!( "neon" ) 
1902-         } 
1903- 
1904-         #[ cfg( not( any( target_arch = "x86" ,  target_arch = "x86_64" ,  target_arch = "aarch64" ) ) ) ]  
1905-         false 
1906-     } 
1907- 
1908-     #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ,  target_arch = "aarch64" ) ) ]  
1888+     #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ]  
1889+     #[ target_feature( enable = "sse2" ) ]  
19091890    /// Implements the [data state] with SIMD instructions. 
1910- /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes. 
19111891/// 
19121892/// The algorithm implemented is the naive SIMD approach described [here]. 
19131893/// 
19141894/// ### SAFETY: 
1915- /// Calling this function on a CPU that supports neither SSE2 nor NEON  causes undefined behaviour. 
1895+ /// Calling this function on a CPU that does not support SSE2  causes undefined behaviour. 
19161896/// 
19171897/// [data state]: https://html.spec.whatwg.org/#data-state 
19181898/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ 
1919- unsafe  fn  data_state_simd_fast_path ( & self ,  input :  & mut  StrTendril )  -> Option < SetResult >  { 
1920-         #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ]  
1921-         let  ( mut  i,  mut  n_newlines)  = self . data_state_sse2_fast_path ( input) ; 
1922- 
1923-         #[ cfg( target_arch = "aarch64" ) ]  
1924-         let  ( mut  i,  mut  n_newlines)  = self . data_state_neon_fast_path ( input) ; 
1925- 
1926-         // Process any remaining bytes (less than STRIDE) 
1927-         while  let  Some ( c)  = input. as_bytes ( ) . get ( i)  { 
1928-             if  matches ! ( * c,  b'<'  | b'&'  | b'\r'  | b'\0' )  { 
1929-                 break ; 
1930-             } 
1931-             if  * c == b'\n'  { 
1932-                 n_newlines += 1 ; 
1933-             } 
1934- 
1935-             i += 1 ; 
1936-         } 
1937- 
1938-         let  set_result = if  i == 0  { 
1939-             let  first_char = input. pop_front_char ( ) . unwrap ( ) ; 
1940-             debug_assert ! ( matches!( first_char,  '<'  | '&'  | '\r'  | '\0' ) ) ; 
1941- 
1942-             // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case. 
1943-             // Still, it would be nice to not have to do that. 
1944-             // The same is true for the unwrap call. 
1945-             let  preprocessed_char = self 
1946-                 . get_preprocessed_char ( first_char,  & BufferQueue :: default ( ) ) 
1947-                 . unwrap ( ) ; 
1948-             SetResult :: FromSet ( preprocessed_char) 
1949-         }  else  { 
1950-             debug_assert ! ( 
1951-                 input. len( )  >= i, 
1952-                 "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" , 
1953-                 i, 
1954-                 input. len( ) 
1955-             ) ; 
1956-             let  consumed_chunk = input. unsafe_subtendril ( 0 ,  i as  u32 ) ; 
1957-             input. unsafe_pop_front ( i as  u32 ) ; 
1958-             SetResult :: NotFromSet ( consumed_chunk) 
1959-         } ; 
1960- 
1961-         self . current_line . set ( self . current_line . get ( )  + n_newlines) ; 
1962- 
1963-         Some ( set_result) 
1964-     } 
1965- 
1966-     #[ cfg( any( target_arch = "x86" ,  target_arch = "x86_64" ) ) ]  
1967-     #[ target_feature( enable = "sse2" ) ]  
1968-     /// Implements the [data state] with SSE2 instructions for x86/x86_64. 
1969- /// Returns a pair of the number of bytes processed and the number of newlines found. 
1970- /// 
1971- /// ### SAFETY: 
1972- /// Calling this function on a CPU that does not support NEON causes undefined behaviour. 
1973- /// 
1974- /// [data state]: https://html.spec.whatwg.org/#data-state 
1975- unsafe  fn  data_state_sse2_fast_path ( & self ,  input :  & mut  StrTendril )  -> ( usize ,  u64 )  { 
1899+ unsafe  fn  data_state_sse2_fast_path ( & self ,  input :  & mut  StrTendril )  -> Option < SetResult >  { 
19761900        #[ cfg( target_arch = "x86" ) ]  
19771901        use  std:: arch:: x86:: { 
19781902            __m128i,  _mm_cmpeq_epi8,  _mm_loadu_si128,  _mm_movemask_epi8,  _mm_or_si128, 
@@ -2036,78 +1960,44 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
20361960            i += STRIDE ; 
20371961        } 
20381962
2039-         ( i,  n_newlines) 
2040-     } 
2041- 
2042-     #[ cfg( target_arch = "aarch64" ) ]  
2043-     #[ target_feature( enable = "neon" ) ]  
2044-     /// Implements the [data state] with NEON SIMD instructions for AArch64. 
2045- /// Returns a pair of the number of bytes processed and the number of newlines found. 
2046- /// 
2047- /// ### SAFETY: 
2048- /// Calling this function on a CPU that does not support NEON causes undefined behaviour. 
2049- /// 
2050- /// [data state]: https://html.spec.whatwg.org/#data-state 
2051- unsafe  fn  data_state_neon_fast_path ( & self ,  input :  & mut  StrTendril )  -> ( usize ,  u64 )  { 
2052-         use  std:: arch:: aarch64:: { vceqq_u8,  vdupq_n_u8,  vld1q_u8,  vmaxvq_u8,  vorrq_u8} ; 
2053- 
2054-         debug_assert ! ( !input. is_empty( ) ) ; 
2055- 
2056-         let  quote_mask = vdupq_n_u8 ( b'<' ) ; 
2057-         let  escape_mask = vdupq_n_u8 ( b'&' ) ; 
2058-         let  carriage_return_mask = vdupq_n_u8 ( b'\r' ) ; 
2059-         let  zero_mask = vdupq_n_u8 ( b'\0' ) ; 
2060-         let  newline_mask = vdupq_n_u8 ( b'\n' ) ; 
2061- 
2062-         let  raw_bytes:  & [ u8 ]  = input. as_bytes ( ) ; 
2063-         let  start = raw_bytes. as_ptr ( ) ; 
2064- 
2065-         const  STRIDE :  usize  = 16 ; 
2066-         let  mut  i = 0 ; 
2067-         let  mut  n_newlines = 0 ; 
2068-         while  i + STRIDE  <= raw_bytes. len ( )  { 
2069-             // Load a 16 byte chunk from the input 
2070-             let  data = vld1q_u8 ( start. add ( i) ) ; 
2071- 
2072-             // Compare the chunk against each mask 
2073-             let  quotes = vceqq_u8 ( data,  quote_mask) ; 
2074-             let  escapes = vceqq_u8 ( data,  escape_mask) ; 
2075-             let  carriage_returns = vceqq_u8 ( data,  carriage_return_mask) ; 
2076-             let  zeros = vceqq_u8 ( data,  zero_mask) ; 
2077-             let  newlines = vceqq_u8 ( data,  newline_mask) ; 
2078- 
2079-             // Combine all test results and create a bitmask from them. 
2080-             // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise. 
2081-             let  test_result =
2082-                 vorrq_u8 ( vorrq_u8 ( quotes,  zeros) ,  vorrq_u8 ( escapes,  carriage_returns) ) ; 
2083-             let  bitmask = vmaxvq_u8 ( test_result) ; 
2084-             let  newline_mask = vmaxvq_u8 ( newlines) ; 
2085-             if  bitmask != 0  { 
2086-                 // We have reached one of the characters that cause the state machine to transition 
2087-                 let  chunk_bytes = std:: slice:: from_raw_parts ( start. add ( i) ,  STRIDE ) ; 
2088-                 let  position = chunk_bytes
2089-                     . iter ( ) 
2090-                     . position ( |& b| matches ! ( b,  b'<'  | b'&'  | b'\r'  | b'\0' ) ) 
2091-                     . unwrap ( ) ; 
2092- 
2093-                 n_newlines += chunk_bytes[ ..position] 
2094-                     . iter ( ) 
2095-                     . filter ( |& & b| b == b'\n' ) 
2096-                     . count ( )  as  u64 ; 
2097- 
2098-                 i += position; 
1963+         // Process any remaining bytes (less than STRIDE) 
1964+         while  let  Some ( c)  = raw_bytes. get ( i)  { 
1965+             if  matches ! ( * c,  b'<'  | b'&'  | b'\r'  | b'\0' )  { 
20991966                break ; 
2100-             }  else  { 
2101-                 if  newline_mask != 0  { 
2102-                     let  chunk_bytes = std:: slice:: from_raw_parts ( start. add ( i) ,  STRIDE ) ; 
2103-                     n_newlines += chunk_bytes. iter ( ) . filter ( |& & b| b == b'\n' ) . count ( )  as  u64 ; 
2104-                 } 
1967+             } 
1968+             if  * c == b'\n'  { 
1969+                 n_newlines += 1 ; 
21051970            } 
21061971
2107-             i += STRIDE ; 
1972+             i += 1 ; 
21081973        } 
21091974
2110-         ( i,  n_newlines) 
1975+         let  set_result = if  i == 0  { 
1976+             let  first_char = input. pop_front_char ( ) . unwrap ( ) ; 
1977+             debug_assert ! ( matches!( first_char,  '<'  | '&'  | '\r'  | '\0' ) ) ; 
1978+ 
1979+             // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case. 
1980+             // Still, it would be nice to not have to do that. 
1981+             // The same is true for the unwrap call. 
1982+             let  preprocessed_char = self 
1983+                 . get_preprocessed_char ( first_char,  & BufferQueue :: default ( ) ) 
1984+                 . unwrap ( ) ; 
1985+             SetResult :: FromSet ( preprocessed_char) 
1986+         }  else  { 
1987+             debug_assert ! ( 
1988+                 input. len( )  >= i, 
1989+                 "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long" , 
1990+                 i, 
1991+                 input. len( ) 
1992+             ) ; 
1993+             let  consumed_chunk = input. unsafe_subtendril ( 0 ,  i as  u32 ) ; 
1994+             input. unsafe_pop_front ( i as  u32 ) ; 
1995+             SetResult :: NotFromSet ( consumed_chunk) 
1996+         } ; 
1997+ 
1998+         self . current_line . set ( self . current_line . get ( )  + n_newlines) ; 
1999+ 
2000+         Some ( set_result) 
21112001    } 
21122002} 
21132003
0 commit comments