Improve ARM asm (particularly for Thumb devices): add the two middle partial products separately, and double a product by accumulating it twice
diff --git a/asm_arm.inc b/asm_arm.inc
index 5f07264..7b4459f 100644
--- a/asm_arm.inc
+++ b/asm_arm.inc
@@ -996,17 +996,16 @@
         "muls %[r5], %[r0], %[r5] \n\t" /* r5 = a0 * b1 */
         "muls %[r0], %[r4], %[r0] \n\t" /* r0 = a0 * b0 */
         
-        "movs %[r4], #0 \n\t"    /* r4 = 0 */
-        "adds %[r3], %[r5] \n\t" /* r3 = b0 * a1 + a0 * b1 */
-        "adcs %[r4], %[r4] \n\t" /* r4 = carry */
-        "lsls %[r4], #16 \n\t"   /* r4 = carry << 16 */
-        "adds %[r6], %[r4] \n\t" /* r6 = a1 * b1 + carry */
+        /* Add middle terms: first b0 * a1 (r3), split at bit 16, into r6:r0 */
+        "lsls %[r4], %[r3], #16 \n\t" /* r4 = (b0 * a1) << 16 */
+        "lsrs %[r3], %[r3], #16 \n\t" /* r3 = (b0 * a1) >> 16 */
+        "adds %[r0], %[r4] \n\t" /* r0 = a0 * b0 + ((b0 * a1) << 16), sets carry */
+        "adcs %[r6], %[r3] \n\t" /* r6 = a1 * b1 + ((b0 * a1) >> 16) + carry */
         
-        "lsls %[r4], %[r3], #16 \n\t" /* r4 = (b0 * a1 + a0 * b1) << 16 */
-        "lsrs %[r3], #16 \n\t"        /* r3 = (b0 * a1 + a0 * b1) >> 16 */
-        "adds %[r0], %[r4] \n\t"      /* r0 = low word = a0 * b0 + ((b0 * a1 + a0 * b1) << 16) */
-        "adcs %[r6], %[r3] \n\t"      /* r6 = high word =
-                                              a1 * b1 + carry + ((b0 * a1 + a0 * b1) >> 16) */
+        "lsls %[r4], %[r5], #16 \n\t" /* r4 = (a0 * b1) << 16 */
+        "lsrs %[r5], %[r5], #16 \n\t" /* r5 = (a0 * b1) >> 16 */
+        "adds %[r0], %[r4] \n\t" /* r0 += (a0 * b1) << 16 (low word), sets carry */
+        "adcs %[r6], %[r5] \n\t" /* r6 += ((a0 * b1) >> 16) + carry (high word) */
         
         "pop {%[r3]} \n\t" /* r3 = c0 */
         "pop {%[r4]} \n\t" /* r4 = c1 */
@@ -1087,13 +1086,11 @@
         
         "cmp %[i], %[tt] \n\t"      /* (i < k - i) ? */
         "bge 4f \n\t"               /*   if i >= k - i, skip */
-        "lsls %[t1], #1 \n\t"       /* high word << 1 */
-        "adc %[c2], %[c2], #0 \n\t" /* add carry bit to c2 */
-        "lsls %[t0], #1 \n\t"       /* low word << 1 */
-        "adc %[t1], %[t1], #0 \n\t" /* add carry bit to high word */
+        "adds %[c0], %[c0], %[t0] \n\t" /* i < k - i: extra add of low word to c0 (product is added again at 4:) */
+        "adcs %[c1], %[c1], %[t1] \n\t" /* extra add of high word to c1, including carry */
+        "adcs %[c2], %[c2], #0 \n\t"    /* propagate carry of the extra add into c2 */
         
         "4: \n\t"
-
         "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
         "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
         "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */
@@ -1137,6 +1134,7 @@
         "lsls %[r2], #1 \n\t" /* r2 = (num_words - 1) * 8 */
         "mov r9, %[r2] \n\t"  /* r9 = (num_words - 1) * 8 */
         "movs %[r2], #0 \n\t" /* c0 = 0 */
+        "mov r10, %[r2] \n\t" /* r10 = 0 (copy of r2, which was just zeroed) */
         "movs %[r3], #0 \n\t" /* c1 = 0 */
         "movs %[r4], #0 \n\t" /* c2 = 0 */
         "movs %[r5], #0 \n\t" /* k = 0 */
@@ -1157,7 +1155,7 @@
         "push {%[r4]} \n\t"
         "push {%[r3]} \n\t"
         "push {%[r2]} \n\t" /* push things, r2 (c0) is at the top of stack. */
-        "subs %[r7], %[r5], %[r6] \n\t"          /* r7 = k - i */
+        "subs %[r7], %[r5], %[r6] \n\t"  /* r7 = k - i */
         
         "ldr %[r3], [%[r1], %[r7]] \n\t" /* r3 = left[k - i] */
         "ldr %[r0], [%[r1], %[r6]] \n\t" /* r0 = left[i] */
@@ -1174,36 +1172,32 @@
         "muls %[r4], %[r0], %[r4] \n\t" /* r4 = a0 * b1 */
         "muls %[r0], %[r3], %[r0] \n\t" /* r0 = a0 * b0 */
         
-        "movs %[r3], #0 \n\t"    /* r3 = 0 */
-        "adds %[r2], %[r4] \n\t" /* r2 = b0 * a1 + a0 * b1 */
-        "adcs %[r3], %[r3] \n\t" /* r3 = carry */
-        "lsls %[r3], #16 \n\t"   /* r3 = carry << 16 */
-        "adds %[r5], %[r3] \n\t" /* r5 = a1 * b1 + carry */
+        /* Add middle terms: first b0 * a1 (r2), split at bit 16, into r5:r0 */
+        "lsls %[r3], %[r2], #16 \n\t" /* r3 = (b0 * a1) << 16 */
+        "lsrs %[r2], %[r2], #16 \n\t" /* r2 = (b0 * a1) >> 16 */
+        "adds %[r0], %[r3] \n\t" /* r0 = a0 * b0 + ((b0 * a1) << 16), sets carry */
+        "adcs %[r5], %[r2] \n\t" /* r5 = a1 * b1 + ((b0 * a1) >> 16) + carry */
         
-        "lsls %[r3], %[r2], #16 \n\t" /* r3 = (b0 * a1 + a0 * b1) << 16 */
-        "lsrs %[r2], #16 \n\t"        /* r2 = (b0 * a1 + a0 * b1) >> 16 */
-        "adds %[r0], %[r3] \n\t"      /* r0 = low word = a0 * b0 + ((b0 * a1 + a0 * b1) << 16) */
-        "adcs %[r5], %[r2] \n\t"      /* r5 = high word = 
-                                              a1 * b1 + carry + ((b0 * a1 + a0 * b1) >> 16) */
-    
-        "movs %[r3], #0 \n\t"    /* r3 = 0 */
-        "cmp %[r6], %[r7] \n\t"  /* (i < k - i) ? */
-        "mov %[r7], %[r3] \n\t"  /* r7 = 0 (does not affect condition) */
-        "bge 4f \n\t"            /*   if i >= k - i, skip */
-        "lsls %[r5], #1 \n\t"    /* high word << 1 */
-        "adcs %[r7], %[r3] \n\t" /* r7 = carry bit for c2 */
-        "lsls %[r0], #1 \n\t"    /* low word << 1 */
-        "adcs %[r5], %[r3] \n\t" /* add carry from shift to high word */
+        "lsls %[r3], %[r4], #16 \n\t" /* r3 = (a0 * b1) << 16 */
+        "lsrs %[r4], %[r4], #16 \n\t" /* r4 = (a0 * b1) >> 16 */
+        "adds %[r0], %[r3] \n\t" /* r0 += (a0 * b1) << 16 (low word), sets carry */
+        "adcs %[r5], %[r4] \n\t" /* r5 += ((a0 * b1) >> 16) + carry (high word) */
         
-        "4: \n\t"
+        /* Add to acc, doubling if necessary */
         "pop {%[r2]} \n\t" /* r2 = c0 */
         "pop {%[r3]} \n\t" /* r3 = c1 */
         "pop {%[r4]} \n\t" /* r4 = c2 */
-        "adds %[r2], %[r0] \n\t"         /* add low word to c0 */
-        "adcs %[r3], %[r5] \n\t"         /* add high word to c1, including carry */
-        "movs %[r0], #0 \n\t"            /* r0 = 0 (does not affect carry bit) */
-        "adcs %[r4], %[r0] \n\t"         /* add carry to c2 */
-        "adds %[r4], %[r7] \n\t"         /* add carry from doubling (if any) */
+        
+        "cmp %[r6], %[r7] \n\t"    /* (i < k - i) ? */
+        "mov %[r7], r10 \n\t"    /* r7 = 0, copied from r10 (does not affect flags) */
+        "bge 4f \n\t"            /*   if i >= k - i, add the product only once */
+        "adds %[r2], %[r0] \n\t" /* i < k - i: extra add of low word to c0 */
+        "adcs %[r3], %[r5] \n\t" /* extra add of high word to c1, including carry */
+        "adcs %[r4], %[r7] \n\t" /* add carry to c2 (r7 = 0) */
+        "4: \n\t" /* both paths: add the product r5:r0 to c2:c1:c0 */
+        "adds %[r2], %[r0] \n\t" /* add low word to c0 */
+        "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
+        "adcs %[r4], %[r7] \n\t" /* add carry to c2 (r7 = 0) */
         
         "pop {%[r5]} \n\t" /* r5 = k */
         
@@ -1236,7 +1230,7 @@
         : [r2] "+l" (num_words), [r3] "=&l" (r3), [r4] "=&l" (r4),
           [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
         : [r0] "l" (result), [r1] "l" (left)
-        : "r8", "r9", "cc", "memory"
+        : "r8", "r9", "r10", "cc", "memory"
     );
 #endif
 }