Improve optimization for little-endian platforms.

Previously there was a fast path for little-endian platforms in
the fixed-width decoders (such as pb_decode_fixed64()) but not in
pb_encode_fixed32() / pb_encode_fixed64(). Also, the macros used
for the check did not trigger on GCC.

Macro checks were expanded to cover all common compilers and
now it is possible to specify PB_LITTLE_ENDIAN_8BIT manually
if it is not automatically detected.
diff --git a/pb.h b/pb.h
index 3bd1908..f4a9bc2 100644
--- a/pb.h
+++ b/pb.h
@@ -14,7 +14,8 @@
 /* #define PB_ENABLE_MALLOC 1 */
 
 /* Define this if your CPU / compiler combination does not support
- * unaligned memory access to packed structures. */
+ * unaligned memory access to packed structures. Note that packed
+ * structures are only used when requested in .proto options. */
 /* #define PB_NO_PACKED_STRUCTS 1 */
 
 /* Increase the number of required fields that are tracked.
@@ -47,6 +48,10 @@
  * the string processing slightly and slightly increases code size. */
 /* #define PB_VALIDATE_UTF8 1 */
 
+/* This can be defined if the platform is little-endian and has 8-bit bytes.
+ * Normally it is detected automatically from compiler-defined macros. */
+/* #define PB_LITTLE_ENDIAN_8BIT 1 */
+
 /******************************************************************
  * You usually don't need to change anything below this line.     *
  * Feel free to look around and use the defined macros, though.   *
@@ -116,6 +121,18 @@
 #   define pb_packed
 #endif
 
+/* Detect endianness */
+#ifndef PB_LITTLE_ENDIAN_8BIT
+#if ((defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
+     (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
+      defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || \
+      defined(__THUMBEL__) || defined(__AARCH64EL__) || defined(_MIPSEL) || \
+      defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM)) \
+     && CHAR_BIT == 8
+#define PB_LITTLE_ENDIAN_8BIT 1
+#endif
+#endif
+
 /* Handly macro for suppressing unreferenced-parameter compiler warnings. */
 #ifndef PB_UNUSED
 #define PB_UNUSED(x) (void)(x)
diff --git a/pb_decode.c b/pb_decode.c
index d9ecf25..f388932 100644
--- a/pb_decode.c
+++ b/pb_decode.c
@@ -1362,7 +1362,7 @@
     if (!pb_read(stream, u.bytes, 4))
         return false;
 
-#if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN && CHAR_BIT == 8
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
     /* fast path - if we know that we're on little endian, assign directly */
     *(uint32_t*)dest = u.fixed32;
 #else
@@ -1385,7 +1385,7 @@
     if (!pb_read(stream, u.bytes, 8))
         return false;
 
-#if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN && CHAR_BIT == 8
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
     /* fast path - if we know that we're on little endian, assign directly */
     *(uint64_t*)dest = u.fixed64;
 #else
diff --git a/pb_encode.c b/pb_encode.c
index de716f7..f5f1676 100644
--- a/pb_encode.c
+++ b/pb_encode.c
@@ -632,6 +632,10 @@
 
 bool checkreturn pb_encode_fixed32(pb_ostream_t *stream, const void *value)
 {
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
+    /* Fast path if we know that we're on little endian */
+    return pb_write(stream, (const pb_byte_t*)value, 4);
+#else
     uint32_t val = *(const uint32_t*)value;
     pb_byte_t bytes[4];
     bytes[0] = (pb_byte_t)(val & 0xFF);
@@ -639,11 +643,16 @@
     bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
     bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
     return pb_write(stream, bytes, 4);
+#endif
 }
 
 #ifndef PB_WITHOUT_64BIT
 bool checkreturn pb_encode_fixed64(pb_ostream_t *stream, const void *value)
 {
+#if defined(PB_LITTLE_ENDIAN_8BIT) && PB_LITTLE_ENDIAN_8BIT == 1
+    /* Fast path if we know that we're on little endian */
+    return pb_write(stream, (const pb_byte_t*)value, 8);
+#else
     uint64_t val = *(const uint64_t*)value;
     pb_byte_t bytes[8];
     bytes[0] = (pb_byte_t)(val & 0xFF);
@@ -655,6 +664,7 @@
     bytes[6] = (pb_byte_t)((val >> 48) & 0xFF);
     bytes[7] = (pb_byte_t)((val >> 56) & 0xFF);
     return pb_write(stream, bytes, 8);
+#endif
 }
 #endif