lib: crc32_sw: 4 bits at a time implementation

Calculate crc32 4 bits at a time. The return value of the calculation is
identical to the previous 1 bit at a time implementation.

This yields a speedup of about a factor of 3 at the cost of 64 bytes of
flash for the CRC lookup table.

Calculating crc32 of 128kB of flash on a 120MHz Kinetis MKE16F512
Cortex-M4 takes 99ms using the 1 bit at a time implementation, and 30ms
using the 4 bits at a time implementation.

The crc32 routine is used by subsys/canbus/canopen/canopen_program.c to
calculate crc of flash images.

Signed-off-by: Klaus H. Sorensen <khso@vestas.com>
diff --git a/lib/os/crc32_sw.c b/lib/os/crc32_sw.c
index a72f28c..5d74cd8 100644
--- a/lib/os/crc32_sw.c
+++ b/lib/os/crc32_sw.c
@@ -13,13 +13,21 @@
 
 uint32_t crc32_ieee_update(uint32_t crc, const uint8_t *data, size_t len)
 {
-	crc = ~crc;
-	for (size_t i = 0; i < len; i++) {
-		crc = crc ^ data[i];
+	/* crc table generated from polynomial 0xedb88320 */
+	static const uint32_t table[16] = {
+		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
+		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
+		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
+		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c,
+	};
 
-		for (uint8_t j = 0; j < 8; j++) {
-			crc = (crc >> 1) ^ (0xEDB88320 & -(crc & 1));
-		}
+	crc = ~crc;
+
+	for (size_t i = 0; i < len; i++) {
+		uint8_t byte = data[i];
+
+		crc = (crc >> 4) ^ table[(crc ^ byte) & 0x0f];
+		crc = (crc >> 4) ^ table[(crc ^ (byte >> 4)) & 0x0f];
 	}
 
 	return (~crc);