[K/N] Don't read out of bounds during SSE hashCode calculation

Use _mm_loadl_epi64 so the SSE path loads only the 64 bits (u16x4) it consumes instead of dereferencing a full 128-bit vector.
diff --git a/kotlin-native/runtime/src/main/cpp/polyhash/arm.cpp b/kotlin-native/runtime/src/main/cpp/polyhash/arm.cpp
index 31de4c0..34402a6 100644
--- a/kotlin-native/runtime/src/main/cpp/polyhash/arm.cpp
+++ b/kotlin-native/runtime/src/main/cpp/polyhash/arm.cpp
@@ -34,7 +34,7 @@
     ALWAYS_INLINE static VecType initVec() { return vdupq_n_u32(0); }
     ALWAYS_INLINE static Vec128Type initVec128() { return vdupq_n_u32(0); }
     ALWAYS_INLINE static int vec128toInt(Vec128Type x) { return vgetq_lane_u32(x, 0); }
-    ALWAYS_INLINE static VecType u16Load(U16VecType x) { return vmovl_u16(x); }
+    ALWAYS_INLINE static VecType u16Load(U16VecType const* x) { return vmovl_u16(*x); } // reads *x and widens its u16 lanes to u32 lanes
     ALWAYS_INLINE static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return vmulq_u32(x, y); }
     ALWAYS_INLINE static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return vaddq_u32(x, y); }
     ALWAYS_INLINE static VecType vecMul(VecType x, VecType y) { return vmulq_u32(x, y); }
diff --git a/kotlin-native/runtime/src/main/cpp/polyhash/attributeSensitiveFunctions.inc b/kotlin-native/runtime/src/main/cpp/polyhash/attributeSensitiveFunctions.inc
index 7524cae..9eb6329 100644
--- a/kotlin-native/runtime/src/main/cpp/polyhash/attributeSensitiveFunctions.inc
+++ b/kotlin-native/runtime/src/main/cpp/polyhash/attributeSensitiveFunctions.inc
@@ -9,7 +9,7 @@
     const int vecLength = sizeof(VecType) / 4;
     if (n < vecLength / 4) return;
 
-    VecType x = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
+    VecType x = Traits::u16Load(reinterpret_cast<U16VecType const*>(str));
     res = Traits::vec128Mul(res, *reinterpret_cast<Vec128Type const*>(b));
     VecType z = Traits::vecMul(x, *reinterpret_cast<VecType const*>(p));
     res = Traits::vec128Add(res, Traits::squash1(z));
@@ -33,8 +33,8 @@
     VecType res1 = Traits::initVec();
 
     do {
-        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
-        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
+        VecType x0 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str));
+        VecType x1 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength));
         res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
         res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
         VecType z0 = Traits::vecMul(x0, *reinterpret_cast<VecType const*>(p));
@@ -66,10 +66,10 @@
     VecType res3 = Traits::initVec();
 
     do {
-        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
-        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
-        VecType x2 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 2));
-        VecType x3 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 3));
+        VecType x0 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str));
+        VecType x1 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength));
+        VecType x2 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 2));
+        VecType x3 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 3));
         res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
         res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
         res2 = Traits::vecMul(res2, *reinterpret_cast<VecType const*>(b));
@@ -109,14 +109,14 @@
     VecType res7 = Traits::initVec();
 
     do {
-        VecType x0 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str));
-        VecType x1 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength));
-        VecType x2 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 2));
-        VecType x3 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 3));
-        VecType x4 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 4));
-        VecType x5 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 5));
-        VecType x6 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 6));
-        VecType x7 = Traits::u16Load(*reinterpret_cast<U16VecType const*>(str + vecLength * 7));
+        VecType x0 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str));
+        VecType x1 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength));
+        VecType x2 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 2));
+        VecType x3 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 3));
+        VecType x4 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 4));
+        VecType x5 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 5));
+        VecType x6 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 6));
+        VecType x7 = Traits::u16Load(reinterpret_cast<U16VecType const*>(str + vecLength * 7));
         res0 = Traits::vecMul(res0, *reinterpret_cast<VecType const*>(b));
         res1 = Traits::vecMul(res1, *reinterpret_cast<VecType const*>(b));
         res2 = Traits::vecMul(res2, *reinterpret_cast<VecType const*>(b));
@@ -149,4 +149,4 @@
     Vec128Type sum1 = Traits::vec128Add(Traits::squash2(res0, res1), Traits::squash2(res2, res3));
     Vec128Type sum2 = Traits::vec128Add(Traits::squash2(res4, res5), Traits::squash2(res6, res7));
     res = Traits::vec128Add(res, Traits::vec128Add(sum1, sum2));
-}
\ No newline at end of file
+}
diff --git a/kotlin-native/runtime/src/main/cpp/polyhash/x86.cpp b/kotlin-native/runtime/src/main/cpp/polyhash/x86.cpp
index d53ef38..9838580 100644
--- a/kotlin-native/runtime/src/main/cpp/polyhash/x86.cpp
+++ b/kotlin-native/runtime/src/main/cpp/polyhash/x86.cpp
@@ -32,7 +32,7 @@
     static VecType initVec() { return _mm_setzero_si128(); }
     static Vec128Type initVec128() { return _mm_setzero_si128(); }
     static int vec128toInt(Vec128Type x) { return _mm_cvtsi128_si32(x); }
-    static VecType u16Load(U16VecType x) { return _mm_cvtepu16_epi32(x); }
+    static VecType u16Load(U16VecType const* x) { return _mm_cvtepu16_epi32(_mm_loadl_epi64(x)); } // load only the low 64 bits (4 x u16), then zero-extend each lane to 32 bits
     static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return _mm_mullo_epi32(x, y); }
     static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return _mm_add_epi32(x, y); }
     static VecType vecMul(VecType x, VecType y) { return _mm_mullo_epi32(x, y); }
@@ -80,7 +80,7 @@
     static VecType initVec() { return _mm256_setzero_si256(); }
     static Vec128Type initVec128() { return _mm_setzero_si128(); }
     static int vec128toInt(Vec128Type x) { return _mm_cvtsi128_si32(x); }
-    static VecType u16Load(U16VecType x) { return _mm256_cvtepu16_epi32(x); }
+    static VecType u16Load(U16VecType const* x) { return _mm256_cvtepu16_epi32(_mm_loadu_si128(x)); } // explicit unaligned 128-bit load (8 x u16): x is cast from str, with no visible 16-byte alignment guarantee
     static Vec128Type vec128Mul(Vec128Type x, Vec128Type y) { return _mm_mullo_epi32(x, y); }
     static Vec128Type vec128Add(Vec128Type x, Vec128Type y) { return _mm_add_epi32(x, y); }
     static VecType vecMul(VecType x, VecType y) { return _mm256_mullo_epi32(x, y); }