DO NOT SUBMIT
PiperOrigin-RevId: 774593500
diff --git a/c/enc/backward_references.c b/c/enc/backward_references.c
index 2531ceb..5084a48 100644
--- a/c/enc/backward_references.c
+++ b/c/enc/backward_references.c
@@ -133,6 +133,10 @@
#define PREFIX() D
#define ENABLE_COMPOUND_DICTIONARY 1
+#define HASHER() H4
+/* NOLINTNEXTLINE(build/include) */
+#include "backward_references_inc.h"
+#undef HASHER
#define HASHER() H5
/* NOLINTNEXTLINE(build/include) */
#include "backward_references_inc.h"
@@ -194,6 +198,7 @@
literal_context_lut, params, hasher, dist_cache, \
last_insert_len, commands, num_commands, num_literals); \
return;
+ CASE_(4)
CASE_(5)
CASE_(6)
#if defined(BROTLI_MAX_SIMD_QUALITY)
diff --git a/c/enc/hash.h b/c/enc/hash.h
index f210246..696ba44 100644
--- a/c/enc/hash.h
+++ b/c/enc/hash.h
@@ -12,6 +12,9 @@
#include <stdlib.h> /* exit */
#include <string.h> /* memcmp, memset */
+#include <stdio.h> /* printf */
+
+#include <immintrin.h>
#include <brotli/types.h>
@@ -567,17 +570,67 @@
BROTLI_DCHECK(cur_ix_masked + max_length <= ring_buffer_mask);
- for (i = 0; i < 4; ++i) {
+ // load the boundary, distance_offset, and last distances into sse registers
+ const __m128i boundary_mask = _mm_set1_epi32((int)boundary);
+ const __m128i distance_offset_mask = _mm_set1_epi32((int)distance_offset);
+ const __m128i last_distances =
+ _mm_loadu_si128((const __m128i*)(const void*)(distance_cache));
+ // the existing logic is:
+ // if (distance <= boundary || distance > distance_offset) continue
+ // we want to invert that to find the last distances that are in the
+ // range.
+ // to complicate matters, SSE only has a greater than comparison
+ // these instructions compute the equivalent of:
+ // if (!(distance > distance_offset) && distance > boundary)
+ const __m128i greater_than_distance_offset = _mm_cmpgt_epi32(last_distances, distance_offset_mask);
+ const __m128i greater_than_boundary = _mm_cmpgt_epi32(last_distances, boundary_mask);
+ const __m128i not_greater_than_distance_offset_and_greater_than_boundary = _mm_andnot_si128(greater_than_distance_offset, greater_than_boundary);
+ // having found the last distances that are in the range, we need to convert
+ // them into a bitmask.
+ // first, sizzle the bytes so that the bottom 4 bytes correspond to the 4 lanes.
+ // note that we reverse the order so that the bottom byte corresponds to the first lane (first last distance).
+ // second, create a 16 bit bitmask from the 16 bytes (the top 12 bits/bytes are always 0, since there are only 4 last distances to consider)
+ // we can use the bit mask to quickly skip past the out of range distances.
+ const __m128i swizzle_mask = _mm_set_epi8(
+ // top 12 bytes set to 0
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ // 12th byte
+ 0x0C,
+ // 8th byte
+ 0x08,
+ // 4th byte
+ 0x04,
+ // 0th byte
+ 0x00);
+ const __m128i swizzled = _mm_shuffle_epi8(not_greater_than_distance_offset_and_greater_than_boundary, swizzle_mask);
+ uint64_t acceptable_distances = (uint64_t)_mm_movemask_epi8(swizzled);
+
+ // printf("acceptable_distances: %lx\n", acceptable_distances);
+
+ for (; acceptable_distances != 0; acceptable_distances &= (acceptable_distances - 1)) {
+ i = (size_t)BROTLI_TZCNT64(acceptable_distances);
const size_t distance = (size_t)distance_cache[i];
size_t offset;
size_t limit;
size_t len;
- if (distance <= boundary || distance > distance_offset) continue;
+ // if (distance <= boundary || distance > distance_offset) continue;
+ // if (!(distance > distance_offset) && distance > boundary) {
offset = distance_offset - distance;
limit = source_size - offset;
limit = limit > max_length ? max_length : limit;
len = FindMatchLengthWithLimit(&source[offset], &data[cur_ix_masked],
- limit);
+ limit);
if (len >= 2) {
score_t score = BackwardReferenceScoreUsingLastDistance(len);
if (best_score < score) {