Don't overread in poly_Rq_mul

The polynomials have 701 16-bit values (1402 bytes). But poly_Rq_mul was
reading 32 bytes at offset 1384 in order to get the final 18 bytes (the
last nine values). This silently
worked for a long time, but when commit 7153013019 switched to keeping
variables on the stack it was noticed by Valgrind.

This change fixes the overread. Setting watchpoints at the ends of the
two inputs (and one output) now shows no overreads nor overwrites.

BUG=424

Change-Id: Id86c1407ffce66593541c10feee47213f4b95c5d
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/48645
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/hrss/asm/poly_rq_mul.S b/crypto/hrss/asm/poly_rq_mul.S
index 53ce47c..c37d7d0 100644
--- a/crypto/hrss/asm/poly_rq_mul.S
+++ b/crypto/hrss/asm/poly_rq_mul.S
@@ -26,23 +26,6 @@
 # This file was generated by poly_rq_mul.py
 .text
 .align 32
-mask_low9words:
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0xffff
-.word 0x0
-.word 0x0
-.word 0x0
-.word 0x0
-.word 0x0
-.word 0x0
-.word 0x0
 const3:
 .word 3
 .word 3
@@ -746,8 +729,20 @@
 vmovdqu 1120(%rsi), %ymm4
 vmovdqu 1208(%rsi), %ymm5
 vmovdqu 1296(%rsi), %ymm6
-vmovdqu 1384(%rsi), %ymm7
-vpand mask_low9words(%rip), %ymm7, %ymm7
+
+# Only 18 bytes more can be read, but vmovdqu reads 32.
+# Copy 18 bytes to the red zone and zero pad to 32 bytes.
+xor %r9, %r9
+movq %r9, -16(%rsp)
+movq %r9, -8(%rsp)
+movq 1384(%rsi), %r9
+movq %r9, -32(%rsp)
+movq 1384+8(%rsi), %r9
+movq %r9, -24(%rsp)
+movw 1384+16(%rsi), %r9w
+movw %r9w, -16(%rsp)
+vmovdqu -32(%rsp), %ymm7
+
 vmovdqu 416(%rsi), %ymm8
 vmovdqu 504(%rsi), %ymm9
 vmovdqu 592(%rsi), %ymm10
@@ -1341,8 +1336,20 @@
 vmovdqu 1120(%rdx), %ymm4
 vmovdqu 1208(%rdx), %ymm5
 vmovdqu 1296(%rdx), %ymm6
-vmovdqu 1384(%rdx), %ymm7
-vpand mask_low9words(%rip), %ymm7, %ymm7
+
+# Only 18 bytes more can be read, but vmovdqu reads 32.
+# Copy 18 bytes to the red zone and zero pad to 32 bytes.
+xor %r9, %r9
+movq %r9, -16(%rsp)
+movq %r9, -8(%rsp)
+movq 1384(%rdx), %r9
+movq %r9, -32(%rsp)
+movq 1384+8(%rdx), %r9
+movq %r9, -24(%rsp)
+movw 1384+16(%rdx), %r9w
+movw %r9w, -16(%rsp)
+vmovdqu -32(%rsp), %ymm7
+
 vmovdqu 416(%rdx), %ymm8
 vmovdqu 504(%rdx), %ymm9
 vmovdqu 592(%rdx), %ymm10
@@ -8295,7 +8302,20 @@
 vmovdqa %ymm8, 2880(%r8)
 vmovdqu 680(%rdi), %ymm8
 vmovdqu 1032(%rdi), %ymm10
-vmovdqu 1384(%rdi), %ymm2
+
+# Only 18 bytes can be read at 1384, but vmovdqu reads 32.
+# Copy 18 bytes to the red zone and zero pad to 32 bytes.
+xor %r9, %r9
+movq %r9, -16(%rsp)
+movq %r9, -8(%rsp)
+movq 1384(%rdi), %r9
+movq %r9, -32(%rsp)
+movq 1384+8(%rdi), %r9
+movq %r9, -24(%rsp)
+movw 1384+16(%rdi), %r9w
+movw %r9w, -16(%rsp)
+vmovdqu -32(%rsp), %ymm2
+
 vpaddw %ymm5, %ymm8, %ymm5
 vpaddw %ymm6, %ymm10, %ymm6
 vpaddw %ymm4, %ymm2, %ymm4