Several fixes singled out in the pull request discussion thread.
diff --git a/curve-specific.inc b/curve-specific.inc
index 81f725f..0453b21 100644
--- a/curve-specific.inc
+++ b/curve-specific.inc
@@ -1215,7 +1215,7 @@
     
     for (k = 0; k < num_words_secp256k1; ++k) {
         uint64_t p = (uint64_t)0x3D1 * right[k] + carry;
-        result[k] = p;
+        result[k] = (uint32_t) p;
         carry = p >> 32;
     }
     result[num_words_secp256k1] = carry;
diff --git a/uECC.c b/uECC.c
index 2fe4a30..67ad18c 100644
--- a/uECC.c
+++ b/uECC.c
@@ -157,6 +157,16 @@
 #endif
 };
 
+static void bcopy(uint8_t *dst,        /* Byte-wise copy of num_bytes bytes from src into dst. */
+                  const uint8_t *src,  /* source bytes; only read, never written */
+                  unsigned num_bytes)  /* number of bytes to copy; 0 makes this a no-op */
+{ /* NOTE(review): same name as BSD bcopy(src, dst, n), whose pointer order is reversed - confirm no included header declares it. */
+  while (0 != num_bytes) {             /* walks from the last byte down to index 0 */
+    num_bytes--;                       /* decrement first so the index stays below the count */
+    dst[num_bytes] = src[num_bytes];
+  }
+}
+
 static cmpresult_t uECC_vli_cmp_unsafe(const uECC_word_t *left,
                                        const uECC_word_t *right,
                                        wordcount_t num_words);
@@ -1016,9 +1026,8 @@
                        const uint8_t *private_key,
                        uint8_t *secret,
                        uECC_Curve curve) {
-     
-    uECC_word_t private[uECC_MAX_WORDS];
     uECC_word_t public[uECC_MAX_WORDS * 2];
+    uECC_word_t private[uECC_MAX_WORDS];
 
     uECC_word_t tmp[uECC_MAX_WORDS];
     uECC_word_t *p2[2] = {private, tmp};
@@ -1028,8 +1037,8 @@
     wordcount_t num_bytes = curve->num_bytes;
 
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(private, private_key, num_bytes);
-    memcpy(public, public_key, num_bytes*2);
+    bcopy((uint8_t *) private, private_key, num_bytes);
+    bcopy((uint8_t *) public, public_key, num_bytes*2);
 #else
     uECC_vli_bytesToNative(private, private_key, BITS_TO_BYTES(curve->num_n_bits));
     uECC_vli_bytesToNative(public, public_key, num_bytes);
@@ -1051,7 +1060,7 @@
 
     EccPoint_mult(public, public, p2[!carry], initial_Z, curve->num_n_bits + 1, curve);
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(secret, public, num_bytes);
+    bcopy((uint8_t *) secret, (uint8_t *) public, num_bytes);
 #else
     uECC_vli_nativeToBytes(secret, num_bytes, public);
 #endif
@@ -1079,7 +1088,7 @@
 #endif
     uECC_word_t *y = point + curve->num_words;
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(public_key, compressed+1, curve->num_bytes);
+    bcopy(public_key, compressed+1, curve->num_bytes);
 #else
     uECC_vli_bytesToNative(point, compressed + 1, curve->num_bytes);
 #endif
@@ -1087,7 +1096,7 @@
     curve->mod_sqrt(y, curve);
 
     if ((y[0] & 0x01) != (compressed[0] & 0x01)) {
-      uECC_vli_sub(y, curve->p, y, curve->num_words);
+        uECC_vli_sub(y, curve->p, y, curve->num_words);
     }
 
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN == 0
@@ -1189,7 +1198,7 @@
 
     uECC_vli_clear(native, num_n_words);
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(native, bits, bits_size);
+    bcopy((uint8_t *) native, bits, bits_size);
 #else
     uECC_vli_bytesToNative(native, bits, bits_size);
 #endif    
@@ -1262,7 +1271,7 @@
 #endif
 
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(tmp, private_key, BITS_TO_BYTES(curve->num_n_bits));
+    bcopy((uint8_t *) tmp, private_key, BITS_TO_BYTES(curve->num_n_bits));
 #else
     uECC_vli_bytesToNative(tmp, private_key, BITS_TO_BYTES(curve->num_n_bits)); /* tmp = d */
 #endif
@@ -1278,7 +1287,7 @@
         return 0;
     }
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(signature + curve->num_bytes, s, curve->num_bytes);
+    bcopy((uint8_t *) signature + curve->num_bytes, (uint8_t *) s, curve->num_bytes);
 #else
     uECC_vli_nativeToBytes(signature + curve->num_bytes, curve->num_bytes, s);
 #endif    
@@ -1464,8 +1473,8 @@
     s[num_n_words - 1] = 0;
 
 #if uECC_VLI_NATIVE_LITTLE_ENDIAN
-    memcpy(r, signature, curve->num_bytes);
-    memcpy(s, signature + curve->num_bytes, curve->num_bytes);
+    bcopy((uint8_t *) r, signature, curve->num_bytes);
+    bcopy((uint8_t *) s, signature + curve->num_bytes, curve->num_bytes);
 #else
     uECC_vli_bytesToNative(public, public_key, curve->num_bytes);
     uECC_vli_bytesToNative(
diff --git a/uECC.h b/uECC.h
index c8c8e18..8269916 100644
--- a/uECC.h
+++ b/uECC.h
@@ -35,9 +35,11 @@
     #define uECC_SQUARE_FUNC 0
 #endif
 
-/* uECC_VLI_NATIVE_LITTLE_ENDIAN - If enabled (defined as nonzero), this will enable native
-little-endian format for all keys passed in and out of the public API. This will *only* work
-on native little-endian processors. */
+/* uECC_VLI_NATIVE_LITTLE_ENDIAN - If enabled (defined as nonzero), this will switch to native
+little-endian format for *all* arrays passed in and out of the public API. This includes public
+and private keys, shared secrets, signatures and message hashes.
+Using this switch reduces the amount of call stack memory used by uECC, since fewer intermediate
+translations are required. Note that this will *only* work on native little-endian processors. */
 #ifndef uECC_VLI_NATIVE_LITTLE_ENDIAN
     #define uECC_VLI_NATIVE_LITTLE_ENDIAN 0
 #endif