Skip to content

Commit 9f7046e

Browse files
committed
Revert "Add AARCH64-NEON fastpath for data state (servo#618)"
This reverts commit f1cb6e7.
1 parent 2246b5e commit 9f7046e

File tree

1 file changed

+42
-152
lines changed

1 file changed

+42
-152
lines changed

html5ever/src/tokenizer/mod.rs

Lines changed: 42 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -706,11 +706,11 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
706706
states::Data => loop {
707707
let set = small_char_set!('\r' '\0' '&' '<' '\n');
708708

709-
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
709+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
710710
let set_result = if !(self.opts.exact_errors
711711
|| self.reconsume.get()
712712
|| self.ignore_lf.get())
713-
&& Self::is_supported_simd_feature_detected()
713+
&& is_x86_feature_detected!("sse2")
714714
{
715715
let front_buffer = input.peek_front_chunk_mut();
716716
let Some(mut front_buffer) = front_buffer else {
@@ -729,8 +729,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
729729
self.pop_except_from(input, set)
730730
} else {
731731
// SAFETY:
732-
// This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
733-
let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
732+
// This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above
733+
let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) };
734734

735735
if front_buffer.is_empty() {
736736
drop(front_buffer);
@@ -743,11 +743,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
743743
self.pop_except_from(input, set)
744744
};
745745

746-
#[cfg(not(any(
747-
target_arch = "x86",
748-
target_arch = "x86_64",
749-
target_arch = "aarch64"
750-
)))]
746+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
751747
let set_result = self.pop_except_from(input, set);
752748

753749
let Some(set_result) = set_result else {
@@ -1889,90 +1885,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
18891885
}
18901886
}
18911887

1892-
/// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64.
1893-
fn is_supported_simd_feature_detected() -> bool {
1894-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1895-
{
1896-
is_x86_feature_detected!("sse2")
1897-
}
1898-
1899-
#[cfg(target_arch = "aarch64")]
1900-
{
1901-
std::arch::is_aarch64_feature_detected!("neon")
1902-
}
1903-
1904-
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
1905-
false
1906-
}
1907-
1908-
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
1888+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1889+
#[target_feature(enable = "sse2")]
19091890
/// Implements the [data state] with SIMD instructions.
1910-
/// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes.
19111891
///
19121892
/// The algorithm implemented is the naive SIMD approach described [here].
19131893
///
19141894
/// ### SAFETY:
1915-
/// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour.
1895+
/// Calling this function on a CPU that does not support SSE2 causes undefined behaviour.
19161896
///
19171897
/// [data state]: https://html.spec.whatwg.org/#data-state
19181898
/// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
1919-
unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
1920-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1921-
let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input);
1922-
1923-
#[cfg(target_arch = "aarch64")]
1924-
let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input);
1925-
1926-
// Process any remaining bytes (less than STRIDE)
1927-
while let Some(c) = input.as_bytes().get(i) {
1928-
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
1929-
break;
1930-
}
1931-
if *c == b'\n' {
1932-
n_newlines += 1;
1933-
}
1934-
1935-
i += 1;
1936-
}
1937-
1938-
let set_result = if i == 0 {
1939-
let first_char = input.pop_front_char().unwrap();
1940-
debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1941-
1942-
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1943-
// Still, it would be nice to not have to do that.
1944-
// The same is true for the unwrap call.
1945-
let preprocessed_char = self
1946-
.get_preprocessed_char(first_char, &BufferQueue::default())
1947-
.unwrap();
1948-
SetResult::FromSet(preprocessed_char)
1949-
} else {
1950-
debug_assert!(
1951-
input.len() >= i,
1952-
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1953-
i,
1954-
input.len()
1955-
);
1956-
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1957-
input.unsafe_pop_front(i as u32);
1958-
SetResult::NotFromSet(consumed_chunk)
1959-
};
1960-
1961-
self.current_line.set(self.current_line.get() + n_newlines);
1962-
1963-
Some(set_result)
1964-
}
1965-
1966-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1967-
#[target_feature(enable = "sse2")]
1968-
/// Implements the [data state] with SSE2 instructions for x86/x86_64.
1969-
/// Returns a pair of the number of bytes processed and the number of newlines found.
1970-
///
1971-
/// ### SAFETY:
1972-
/// Calling this function on a CPU that does not support NEON causes undefined behaviour.
1973-
///
1974-
/// [data state]: https://html.spec.whatwg.org/#data-state
1975-
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
1899+
unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option<SetResult> {
19761900
#[cfg(target_arch = "x86")]
19771901
use std::arch::x86::{
19781902
__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
@@ -2036,78 +1960,44 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
20361960
i += STRIDE;
20371961
}
20381962

2039-
(i, n_newlines)
2040-
}
2041-
2042-
#[cfg(target_arch = "aarch64")]
2043-
#[target_feature(enable = "neon")]
2044-
/// Implements the [data state] with NEON SIMD instructions for AArch64.
2045-
/// Returns a pair of the number of bytes processed and the number of newlines found.
2046-
///
2047-
/// ### SAFETY:
2048-
/// Calling this function on a CPU that does not support NEON causes undefined behaviour.
2049-
///
2050-
/// [data state]: https://html.spec.whatwg.org/#data-state
2051-
unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) {
2052-
use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8};
2053-
2054-
debug_assert!(!input.is_empty());
2055-
2056-
let quote_mask = vdupq_n_u8(b'<');
2057-
let escape_mask = vdupq_n_u8(b'&');
2058-
let carriage_return_mask = vdupq_n_u8(b'\r');
2059-
let zero_mask = vdupq_n_u8(b'\0');
2060-
let newline_mask = vdupq_n_u8(b'\n');
2061-
2062-
let raw_bytes: &[u8] = input.as_bytes();
2063-
let start = raw_bytes.as_ptr();
2064-
2065-
const STRIDE: usize = 16;
2066-
let mut i = 0;
2067-
let mut n_newlines = 0;
2068-
while i + STRIDE <= raw_bytes.len() {
2069-
// Load a 16 byte chunk from the input
2070-
let data = vld1q_u8(start.add(i));
2071-
2072-
// Compare the chunk against each mask
2073-
let quotes = vceqq_u8(data, quote_mask);
2074-
let escapes = vceqq_u8(data, escape_mask);
2075-
let carriage_returns = vceqq_u8(data, carriage_return_mask);
2076-
let zeros = vceqq_u8(data, zero_mask);
2077-
let newlines = vceqq_u8(data, newline_mask);
2078-
2079-
// Combine all test results and create a bitmask from them.
2080-
// Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise.
2081-
let test_result =
2082-
vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns));
2083-
let bitmask = vmaxvq_u8(test_result);
2084-
let newline_mask = vmaxvq_u8(newlines);
2085-
if bitmask != 0 {
2086-
// We have reached one of the characters that cause the state machine to transition
2087-
let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2088-
let position = chunk_bytes
2089-
.iter()
2090-
.position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0'))
2091-
.unwrap();
2092-
2093-
n_newlines += chunk_bytes[..position]
2094-
.iter()
2095-
.filter(|&&b| b == b'\n')
2096-
.count() as u64;
2097-
2098-
i += position;
1963+
// Process any remaining bytes (less than STRIDE)
1964+
while let Some(c) = raw_bytes.get(i) {
1965+
if matches!(*c, b'<' | b'&' | b'\r' | b'\0') {
20991966
break;
2100-
} else {
2101-
if newline_mask != 0 {
2102-
let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE);
2103-
n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64;
2104-
}
1967+
}
1968+
if *c == b'\n' {
1969+
n_newlines += 1;
21051970
}
21061971

2107-
i += STRIDE;
1972+
i += 1;
21081973
}
21091974

2110-
(i, n_newlines)
1975+
let set_result = if i == 0 {
1976+
let first_char = input.pop_front_char().unwrap();
1977+
debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0'));
1978+
1979+
// FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case.
1980+
// Still, it would be nice to not have to do that.
1981+
// The same is true for the unwrap call.
1982+
let preprocessed_char = self
1983+
.get_preprocessed_char(first_char, &BufferQueue::default())
1984+
.unwrap();
1985+
SetResult::FromSet(preprocessed_char)
1986+
} else {
1987+
debug_assert!(
1988+
input.len() >= i,
1989+
"Trying to remove {:?} bytes from a tendril that is only {:?} bytes long",
1990+
i,
1991+
input.len()
1992+
);
1993+
let consumed_chunk = input.unsafe_subtendril(0, i as u32);
1994+
input.unsafe_pop_front(i as u32);
1995+
SetResult::NotFromSet(consumed_chunk)
1996+
};
1997+
1998+
self.current_line.set(self.current_line.get() + n_newlines);
1999+
2000+
Some(set_result)
21112001
}
21122002
}
21132003

0 commit comments

Comments
 (0)