Avoid using the non-portable type __m128i_u.

According to https://stackoverflow.com/a/68939636 it is safe to use
__m128i instead.

https://learn.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list?view=msvc-170 also uses __m128i instead.

__m128i_u is just __m128i with a looser alignment requirement, but
simply calling _mm_loadu_si128() instead of _mm_load_si128() is enough to
tell the compiler when a pointer is unaligned.

Fixes #1552

PiperOrigin-RevId: 576931936
Change-Id: I7c3530001149b360c12a1786c7e1832754d0e35c
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index d3eedd5..59995ae 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -58,10 +58,10 @@
 
 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
 using V128 = uint64x2_t;
-using V128u = uint64x2_t;
 #else
+// Note: Do not use __m128i_u, it is not portable.
+// Use V128_LoadU() to perform an unaligned load from __m128i*.
 using V128 = __m128i;
-using V128u = __m128i_u;
 #endif
 
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
@@ -78,7 +78,7 @@
 V128 V128_Load(const V128* src);
 
 // Load 128 bits of integer data. |src| does not need to be aligned.
-V128 V128_LoadU(const V128u* src);
+V128 V128_LoadU(const V128* src);
 
 // Store 128 bits of integer data. |src| must be 16-byte aligned.
 void V128_Store(V128* dst, V128 data);
@@ -146,7 +146,7 @@
 
 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
 
-inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
 
 inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
 
@@ -215,7 +215,7 @@
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
-inline V128 V128_LoadU(const V128u* src) {
+inline V128 V128_LoadU(const V128* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
diff --git a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
index a06485a..968e9ae 100644
--- a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
@@ -98,7 +98,7 @@
     for (size_t i = 0; i < vec_regions; i++) {
       size_t region = i;
 
-      auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
+      auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
       auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
 
       // Load the blocks, unaligned
@@ -262,7 +262,7 @@
         size_t region = (j + i) % kRegions;
 
         auto* vsrc =
-            reinterpret_cast<const V128u*>(src_bytes + region_size * region);
+            reinterpret_cast<const V128*>(src_bytes + region_size * region);
         auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
 
         // Load and CRC data.