Introduce Abseil Prefetch API

PiperOrigin-RevId: 504941246
Change-Id: I94c1e85afd254e84948477b511d41eeb8285fdae
diff --git a/absl/base/prefetch.h b/absl/base/prefetch.h
new file mode 100644
index 0000000..4d42846
--- /dev/null
+++ b/absl/base/prefetch.h
@@ -0,0 +1,196 @@
+// Copyright 2023 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: prefetch.h
+// -----------------------------------------------------------------------------
+//
+// This header file defines prefetch functions to prefetch memory contents
+// into the first level cache (L1) for the current CPU. The prefetch logic
+// offered in this header is limited to prefetching first level cachelines
+// only, and is aimed at relatively 'simple' prefetching logic.
+//
+#ifndef ABSL_BASE_PREFETCH_H_
+#define ABSL_BASE_PREFETCH_H_
+
+#include "absl/base/config.h"
+
+#if defined(ABSL_INTERNAL_HAVE_SSE)
+#include <xmmintrin.h>
+#endif
+
+#if defined(_MSC_VER) && defined(ABSL_INTERNAL_HAVE_SSE)
+#include <intrin.h>
+#pragma intrinsic(_mm_prefetch)
+#endif
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+
+// Moves data into the L1 cache before it is read, or "prefetches" it.
+//
+// The value of `addr` is the address of the memory to prefetch. If
+// the target and compiler support it, data prefetch instructions are
+// generated. If the prefetch is done some time before the memory is
+// read, it may be in the cache by the time the read occurs.
+//
+// This method prefetches data with the highest degree of temporal locality;
+// data is prefetched where possible into all levels of the cache.
+//
+// Incorrect or gratuitous use of this function can degrade performance.
+// Use this function only when representative benchmarks show an improvement.
+//
+// Example:
+//
+//  // Computes incremental checksum for `data`.
+//  int ComputeChecksum(int sum, absl::string_view data);
+//
+//  // Computes cumulative checksum for all values in `data`
+//  int ComputeChecksum(absl::Span<const std::string> data) {
+//    int sum = 0;
+//    auto it = data.begin();
+//    auto pit = data.begin();
+//    auto end = data.end();
+//    for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) {
+//      absl::PrefetchToLocalCache(pit->data());
+//    }
+//    for (; pit != end; ++pit, ++it) {
+//      sum = ComputeChecksum(sum, *it);
+//      absl::PrefetchToLocalCache(pit->data());
+//    }
+//    for (; it != end; ++it) {
+//      sum = ComputeChecksum(sum, *it);
+//    }
+//    return sum;
+//  }
+//
+void PrefetchToLocalCache(const void* addr);
+
+// Moves data into the L1 cache before it is read, or "prefetches" it.
+//
+// This function is identical to `PrefetchToLocalCache()` except that it has
+// non-temporal locality: the fetched data should not be left in any of the
+// cache tiers. This is useful for cases where the data is used only once /
+// short term, for example, invoking a destructor on an object.
+//
+// Incorrect or gratuitous use of this function can degrade performance.
+// Use this function only when representative benchmarks show an improvement.
+//
+// Example:
+//
+//  template <typename Iterator>
+//  void DestroyPointers(Iterator begin, Iterator end) {
+//    size_t distance = std::min(8U, bars.size());
+//
+//    int dist = 8;
+//    auto prefetch_it = begin;
+//    while (prefetch_it != end && --dist;) {
+//      absl::PrefetchToLocalCacheNta(*prefetch_it++);
+//    }
+//    while (prefetch_it != end) {
+//      delete *begin++;
+//      absl::PrefetchToLocalCacheNta(*prefetch_it++);
+//    }
+//    while (begin != end) {
+//      delete *begin++;
+//    }
+//  }
+//
+void PrefetchToLocalCacheNta(const void* addr);
+
+// Moves data into the L1 cache with the intent to modify it.
+//
+// This function is similar to `PrefetchToLocalCache()` except that it
+// prefetches cachelines with an 'intent to modify' This typically includes
+// invalidating cache entries for this address in all other cache tiers, and an
+// exclusive access intent.
+//
+// Incorrect or gratuitous use of this function can degrade performance. As this
+// function can invalidate cached cachelines on other caches and computer cores,
+// incorrect usage of this function can have an even greater negative impact
+// than incorrect regular prefetches.
+// Use this function only when representative benchmarks show an improvement.
+//
+// Example:
+//
+//  void* Arena::Allocate(size_t size) {
+//    void* ptr = AllocateBlock(size);
+//    absl::PrefetchToLocalCacheForWrite(p);
+//    return ptr;
+//  }
+//
+void PrefetchToLocalCacheforWrite(const void* addr);
+
+#if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
+
+#define ABSL_HAVE_PREFETCH 1
+
+// See __builtin_prefetch:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
+//
+inline void PrefetchToLocalCache(const void* addr) {
+  __builtin_prefetch(addr, 0, 3);
+}
+
+inline void PrefetchToLocalCacheNta(const void* addr) {
+  __builtin_prefetch(addr, 0, 0);
+}
+
+inline void PrefetchToLocalCacheForWrite(const void* addr) {
+  // [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1)
+  // unless -march=broadwell or newer; this is not generally the default, so we
+  // manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel
+  // processors and has been present on AMD processors since the K6-2.
+#if defined(__x86_64__)
+  asm("prefetchw (%0)" : : "r"(addr));
+#else
+  __builtin_prefetch(addr, 1, 0);
+#endif
+}
+
+#elif defined(ABSL_INTERNAL_HAVE_SSE)
+
+#define ABSL_HAVE_PREFETCH 1
+
+inline void PrefetchToLocalCache(const void* addr) {
+  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
+}
+
+inline void PrefetchToLocalCacheNta(const void* addr) {
+  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_NTA);
+}
+
+inline void PrefetchToLocalCacheForWrite(const void* addr) {
+#if defined(_MM_HINT_ET0)
+  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_ET0);
+#elif defined(__x86_64__)
+  // _MM_HINT_ET0 is not universally supported. As we commented further
+  // up, PREFETCHW is recognized as a no-op on older Intel processors
+  // and has been present on AMD processors since the K6-2
+  asm("prefetchw (%0)" : : "r"(addr));
+#endif
+}
+
+#else
+
+inline void PrefetchToLocalCache(const void* addr) {}
+inline void PrefetchToLocalCacheNta(const void* addr) {}
+inline void PrefetchToLocalCacheForWrite(const void* addr) {}
+
+#endif
+
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_BASE_PREFETCH_H_