use core::mem::size_of;

use crate::memmem::{
    prefilter::{PrefilterFnTy, PrefilterState},
    vector::Vector,
    NeedleInfo,
};

/// The implementation of the forward vector-accelerated candidate finder.
///
/// This is inspired by the "generic SIMD" algorithm described here:
/// http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd
///
/// The main difference is that this is just a prefilter. That is, it reports
/// candidates once they are seen and doesn't attempt to confirm them. Also,
/// the bytes this routine uses to check for candidates are selected based on
/// an a priori background frequency distribution. This means that on most
/// haystacks, this will on average spend more time in vectorized code than
/// you would if you just selected the first and last bytes of the needle.
///
/// Note that a non-prefilter variant of this algorithm can be found in the
/// parent module, but it only works on smaller needles.
///
/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter
/// function parameters. `fallback` is a prefilter that is used if the
/// haystack is too small to be handled with the given vector size.
///
/// This routine is not safe because it is intended for callers to specialize
/// it with a particular vector (e.g., __m256i) and then call it with the
/// relevant target feature (e.g., avx2) enabled.
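///
/// For example, an AVX2 specialization might look roughly like the sketch
/// below. (The wrapper name and `fallback_prefilter` are illustrative; the
/// real wrappers live elsewhere in this crate.)
///
/// ```ignore
/// #[target_feature(enable = "avx2")]
/// unsafe fn find_avx2(
///     prestate: &mut PrefilterState,
///     ninfo: &NeedleInfo,
///     haystack: &[u8],
///     needle: &[u8],
/// ) -> Option<usize> {
///     // The Vector impl for __m256i uses AVX2 intrinsics, which are sound
///     // to call here because of the target_feature attribute above.
///     find::<core::arch::x86_64::__m256i>(
///         prestate, ninfo, haystack, needle, fallback_prefilter,
///     )
/// }
/// ```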
///
/// # Panics
///
/// If `needle.len() <= 1`, then this panics.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn find<V: Vector>(
    prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
    fallback: PrefilterFnTy,
) -> Option<usize> {
    assert!(needle.len() >= 2, "needle must be at least 2 bytes");
    let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize();
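    // The vector loop reads size(V) bytes starting at ptr + rare2i (the
    // ordered offsets guarantee rare1i <= rare2i), so the haystack must be
    // at least this long for every unaligned load below to stay in bounds.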
    let min_haystack_len = rare2i + size_of::<V>();
    if haystack.len() < min_haystack_len {
        return fallback(prestate, ninfo, haystack, needle);
    }

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

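    // Broadcast each rare needle byte across every 8-bit lane of a vector,
    // so that a single cmpeq below compares size(V) haystack bytes at once.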
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

    // N.B. I did experiment with unrolling the loop to deal with 2*size(V)
    // bytes at a time and 4*size(V) bytes at a time. The double unroll
    // was marginally faster while the quadruple unroll was unambiguously
    // slower. In the end, I decided the complexity from unrolling wasn't
    // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
    // compare.
    while ptr <= max_ptr {
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        // This routine immediately quits if a candidate match is found.
        // That means that if we're here, no candidate matches have been
        // found at or before 'ptr'. Thus, we don't need to mask anything
        // out even though we might technically search part of the haystack
        // that we've already searched (because we know it can't match).
        ptr = max_ptr;
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
    }
    prestate.update(haystack.len());
    None
}

// Below are two different techniques for checking whether a candidate
// match exists in a given chunk or not. find_in_chunk2 checks two bytes
// whereas find_in_chunk3 checks three bytes. The idea behind checking
// three bytes is that while we do a bit more work per iteration, we
// decrease the chances of a false positive match being reported and thus
// make the search faster overall. This actually works out for the
// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where
// using find_in_chunk3 is about 25% faster than find_in_chunk2. However,
// it turns out that find_in_chunk2 is faster for all other benchmarks, so
// perhaps the extra check isn't worth it in practice.
//
// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around
// to make it easy to switch to and benchmark when possible.
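//
// Switching would look roughly like this (a sketch): splat needle[0] into a
// firstchunk vector next to rare1chunk and rare2chunk in 'find' above, and
// replace the find_in_chunk2 calls with
// find_in_chunk3(ptr, rare1i, rare2i, firstchunk, rare1chunk, rare2chunk).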

/// Search for an occurrence of two rare bytes from the needle in the current
/// chunk pointed to by ptr.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
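///
/// Bit i of the combined movemask below is set precisely when the haystack
/// byte at (ptr + rare1i + i) matches rare1 and the byte at
/// (ptr + rare2i + i) matches rare2, i.e., when ptr + i is a candidate
/// match start. As a hypothetical illustration with a 16-byte vector: if
/// rare1 matches at chunk offsets {2, 9} and rare2 matches at {2, 12},
/// then the combined mask has only bit 2 set and trailing_zeros reports
/// candidate offset 2.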
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i).
#[inline(always)]
unsafe fn find_in_chunk2<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(rare1i));
    let chunk1 = V::load_unaligned(ptr.add(rare2i));

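    // Compare each lane against the corresponding rare byte. Because chunk0
    // was loaded at offset rare1i and chunk1 at offset rare2i, bit i of each
    // comparison's movemask refers to the same candidate start: ptr + i.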
    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    let match_offsets = eq0.and(eq1).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}

/// Search for an occurrence of the needle's two rare bytes and its first
/// byte (even if one of the rare bytes is equal to the first byte) in the
/// current chunk pointed to by ptr.
///
/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the
/// first, rare1 and rare2 bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at ptr,
/// (ptr + rare1i) and (ptr + rare2i).
#[allow(dead_code)]
#[inline(always)]
unsafe fn find_in_chunk3<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    firstchunk: V,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr);
    let chunk1 = V::load_unaligned(ptr.add(rare1i));
    let chunk2 = V::load_unaligned(ptr.add(rare2i));

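    // As in find_in_chunk2, the offset loads line all three comparison masks
    // up on the same candidate start ptr + i; the extra check of the first
    // needle byte weeds out more false positives per chunk.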
    let eq0 = chunk0.cmpeq(firstchunk);
    let eq1 = chunk1.cmpeq(rare1chunk);
    let eq2 = chunk2.cmpeq(rare2chunk);

    let match_offsets = eq0.and(eq1).and(eq2).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}

/// Accepts a chunk-relative offset and returns a haystack-relative offset
/// after updating the prefilter state.
///
/// Why do we use this uninlineable function when a search completes? Well,
/// I don't know. Really. Obviously this function was not here initially.
/// When doing profiling, the codegen for the inner loop here looked bad and
/// I didn't know why. There were a couple of extra 'add' instructions and an
/// extra 'lea' instruction that I couldn't explain. I hypothesized that the
/// optimizer was having trouble untangling the hot code in the loop from the
/// code that deals with a candidate match. By putting the latter into an
/// uninlineable function, it kind of forces the issue and it had the
/// intended effect: codegen improved measurably. It's good for a ~10%
/// improvement across the board on the memmem/krate/prebuilt/huge-en/
/// benchmarks.
#[cold]
#[inline(never)]
fn matched(
    prestate: &mut PrefilterState,
    start_ptr: *const u8,
    ptr: *const u8,
    chunki: usize,
) -> usize {
    let found = diff(ptr, start_ptr) + chunki;
    prestate.update(found);
    found
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}