bn/asm/x86_64-mont5.pl: fix carry propagating bug (CVE-2015-3193).

(Imported from upstream's d73cc256c8e256c32ed959456101b73ba9842f72.)

Change-Id: I673301fee57f0ab5bef24553caf8b2aac67fb3a9
Reviewed-on: https://boringssl-review.googlesource.com/6616
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 38def07..99385a8 100644
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -322,16 +322,17 @@
 
 	sbb	\$0,%rax		# handle upmost overflow bit
 	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
 	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	(%rsp,$i,8),$ap
-	mov	($rp,$i,8),$np
-	xor	$np,$ap			# conditional select:
-	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
-	xor	$np,$ap			# ap = borrow?tp:rp
+	mov	($ap,$i,8),%rax
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
@@ -1770,6 +1771,15 @@
 .align	32
 .L8x_tail_done:
 	add	(%rdx),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	xor	%rax,%rax
 
 	neg	$carry
@@ -3116,6 +3126,15 @@
 .align	32
 .Lsqrx8x_tail_done:
 	add	24+8(%rsp),%r8		# can this overflow?
+	adc	\$0,%r9
+	adc	\$0,%r10
+	adc	\$0,%r11
+	adc	\$0,%r12
+	adc	\$0,%r13
+	adc	\$0,%r14
+	adc	\$0,%r15		# can't overflow, because we
+					# started with "overhung" part
+					# of multiplication
 	mov	$carry,%rax		# xor	%rax,%rax
 
 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
@@ -3159,13 +3178,11 @@
 my @ri=map("%r$_",(10..13));
 my @ni=map("%r$_",(14..15));
 $code.=<<___;
-	xor	%rbx,%rbx
+	xor	%ebx,%ebx
 	sub	%r15,%rsi		# compare top-most words
 	adc	%rbx,%rbx
 	mov	%rcx,%r10		# -$num
-	.byte	0x67
 	or	%rbx,%rax
-	.byte	0x67
 	mov	%rcx,%r9		# -$num
 	xor	\$1,%rax
 	sar	\$3+2,%rcx		# cf=0
diff --git a/crypto/bn/bn_test.cc b/crypto/bn/bn_test.cc
index 7241277..7636f30 100644
--- a/crypto/bn/bn_test.cc
+++ b/crypto/bn/bn_test.cc
@@ -330,6 +330,13 @@
   return 0;
 }
 
+static int HexToBIGNUM(ScopedBIGNUM *out, const char *in) {
+  BIGNUM *raw = NULL;
+  int ret = BN_hex2bn(&raw, in);
+  out->reset(raw);
+  return ret;
+}
+
 static bool test_add(FILE *fp) {
   ScopedBIGNUM a(BN_new());
   ScopedBIGNUM b(BN_new());
@@ -1107,6 +1114,27 @@
       return false;
     }
   }
+
+   // Regression test for carry propagation bug in sqr8x_reduction.
+  if (!HexToBIGNUM(&a, "050505050505") ||
+      !HexToBIGNUM(&b, "02") ||
+      !HexToBIGNUM(
+          &c,
+          "4141414141414141414141274141414141414141414141414141414141414141"
+          "4141414141414141414141414141414141414141414141414141414141414141"
+          "4141414141414141414141800000000000000000000000000000000000000000"
+          "0000000000000000000000000000000000000000000000000000000000000000"
+          "0000000000000000000000000000000000000000000000000000000000000000"
+          "0000000000000000000000000000000000000000000000000000000001") ||
+      !BN_mod_exp(d.get(), a.get(), b.get(), c.get(), ctx) ||
+      !BN_mul(e.get(), a.get(), a.get(), ctx)) {
+    return false;
+  }
+  if (BN_cmp(d.get(), e.get()) != 0) {
+    fprintf(stderr, "BN_mod_exp and BN_mul produce different results!\n");
+    return false;
+  }
+
   return true;
 }
 
@@ -1545,13 +1573,6 @@
   return true;
 }
 
-static int HexToBIGNUM(ScopedBIGNUM *out, const char *in) {
-  BIGNUM *raw = NULL;
-  int ret = BN_hex2bn(&raw, in);
-  out->reset(raw);
-  return ret;
-}
-
 static bool test_hex2bn(BN_CTX *ctx) {
   ScopedBIGNUM bn;
   int ret = HexToBIGNUM(&bn, "0");