// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Implementation of CRCs (aka Rabin Fingerprints).
// Treats the input as a polynomial with coefficients in Z(2),
// and finds the remainder when divided by an irreducible polynomial
// of the appropriate length.
// It handles all CRC sizes from 8 to 128 bits.
// It's somewhat complicated by having separate implementations optimized for
// CRCs <= 32 bits, <= 64 bits, and <= 128 bits.
// The input string is prefixed with a "1" bit, and has "degree" "0" bits
// appended to it before the remainder is found. This ensures that
// short strings are scrambled somewhat and that strings consisting
// of all nulls have a non-zero CRC.
//
// Uses the "interleaved word-by-word" method from
// "Everything we know about CRC but afraid to forget" by Andrew Kadatch
// and Bob Jenkins,
// http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
//
// The idea is to compute kStride CRCs simultaneously, allowing the
// processor to more effectively use multiple execution units. Each of
// the CRCs is calculated on one word of data followed by kStride - 1
// words of zeroes; the CRC starting points are staggered by one word.
// Assuming a stride of 4 with data words "ABCDABCDABCD", the first
// CRC is over A000A000A, the second over 0B000B000B, and so on.
// The CRC of the whole data is then calculated by zero-extending each
// partial CRC until the data lengths agree and then XORing the CRCs.
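//
// Schematically, using the linearity property noted in FillWordTable()
// below (CRC(a ^ b) == CRC(a) ^ CRC(b)), zero-extending each partial CRC
// to the full 12-word length gives
//   CRC(ABCDABCDABCD) == CRC(A000A000A000) ^ CRC(0B000B000B00) ^
//                        CRC(00C000C000C0) ^ CRC(000D000D000D).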

#include "absl/crc/internal/crc.h"

#include <cstdint>

#include "absl/base/internal/endian.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/crc_internal.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

// Constants
#if defined(__i386__) || defined(__x86_64__)
constexpr bool kNeedAlignedLoads = false;
#else
constexpr bool kNeedAlignedLoads = true;
#endif

// We express the number of zeroes as a number in base ZEROES_BASE. By
// pre-computing the zero extensions for all possible components of such an
// expression (numbers of the form a * ZEROES_BASE**b), we can calculate the
// resulting extension with at most log_{ZEROES_BASE}(num_zeroes) polynomial
// multiplications of the extensions for the individual components. The tables
// of zero extensions contain (ZEROES_BASE - 1) * log_{ZEROES_BASE}(2^64)
// entries.
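//
// For example, to extend a CRC by 300 zero bytes, write
// 300 = 1 * 16^2 + 2 * 16 + 12. The pre-computed extensions for 12, 2 * 16,
// and 1 * 16^2 zero bytes are looked up and each is multiplied into the
// CRC, for three polynomial multiplications in all.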
constexpr int ZEROES_BASE_LG = 4;                   // log_2(ZEROES_BASE)
constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG);  // must be a power of 2

constexpr uint32_t kCrc32cPoly = 0x82f63b78;

uint32_t ReverseBits(uint32_t bits) {
  bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
  bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
  bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
  return absl::gbswap_32(bits);
}
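
// For example, ReverseBits(0x00000001u) == 0x80000000u: the three swaps
// reverse the bits within each byte, and gbswap_32() then reverses the
// byte order.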

// Polynomial long multiplication mod the degree-32 CRC polynomial: computes
// *val = (*val * m) mod poly, in the bit-reflected representation used
// throughout this file.
void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
  uint32_t l = *val;
  uint32_t result = 0;
  auto onebit = uint32_t{0x80000000u};
  for (uint32_t one = onebit; one != 0; one >>= 1) {
    if ((l & one) != 0) {
      result ^= m;
    }
    if (m & 1) {
      m = (m >> 1) ^ poly;
    } else {
      m >>= 1;
    }
  }
  *val = result;
}
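
// In this reflected representation, 0x80000000u plays the role of the
// polynomial "1" (the multiplicative identity), and shifting right by one
// bit multiplies by x. Squaring an extension operator with
// PolyMultiply(&inc, inc, poly) therefore doubles the number of zero bits
// it appends, which is how FillZeroesTable() below bootstraps from one
// zero bit to one zero byte.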
}  // namespace

void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
                            Uint32By256* t) {
  for (int j = 0; j != word_size; j++) {  // for each byte of extension...
    t[j][0] = 0;  // a zero has no effect
    for (int i = 128; i != 0; i >>= 1) {  // fill in entries for powers of 2
      if (j == 0 && i == 128) {
        t[j][i] = last;  // top bit in last byte is given
      } else {
        // Each successive power of two is derived from the previous
        // one, either in this table, or the last table.
        uint32_t pred;
        if (i == 128) {
          pred = t[j - 1][1];
        } else {
          pred = t[j][i << 1];
        }
        // Advance the CRC by one bit (multiply by X, and take remainder
        // through one step of polynomial long division).
        if (pred & 1) {
          t[j][i] = (pred >> 1) ^ poly;
        } else {
          t[j][i] = pred >> 1;
        }
      }
    }
    // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b),
    // so we can make all the tables for non-powers of two by
    // XORing previously created entries.
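    // (For example, t[j][5] == t[j][4] ^ t[j][1].)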
    for (int i = 2; i != 256; i <<= 1) {
      for (int k = i + 1; k != (i << 1); k++) {
        t[j][k] = t[j][i] ^ t[j][k - i];
      }
    }
  }
}

int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
  uint32_t inc = 1;
  inc <<= 31;

  // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
  inc >>= 1;

  // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
  for (int i = 0; i < 3; ++i) {
    PolyMultiply(&inc, inc, poly);
  }

  int j = 0;
  for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
    // Every entry in the table adds an additional inc_len zero bytes.
    uint32_t v = inc;
    for (int a = 1; a != ZEROES_BASE; a++) {
      t[0][j] = v;
      PolyMultiply(&v, inc, poly);
      j++;
    }
    inc = v;
  }
  ABSL_RAW_CHECK(j <= 256, "");
  return j;
}
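
// The table built above has (ZEROES_BASE - 1) entries per base-16 digit
// position: entries 0..14 extend by 1..15 zero bytes, entries 15..29 by
// 1..15 * 16 zero bytes, and in general entries 15 * b .. 15 * b + 14
// extend by a * 16^b zero bytes for a in 1..15. Sixteen digit positions
// cover any 64-bit length, for 240 entries in all.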

// Internal version of the "constructor".
CRCImpl* CRCImpl::NewInternal() {
  // Find an accelerated implementation first.
  CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();

  // Fall back to the generic implementation if no acceleration is available.
  if (result == nullptr) {
    result = new CRC32();
  }

  result->InitTables();

  return result;
}

// The 32-bit implementation

void CRC32::InitTables() {
  // Compute the table for extending a CRC by one byte.
  Uint32By256* t = new Uint32By256[4];
  FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
  for (int i = 0; i != 256; i++) {
    this->table0_[i] = t[0][i];
  }

  // Construct a table for updating the CRC by 4 bytes of data followed by
  // 12 bytes of zeroes.
  //
  // Note: the data word size could be larger than the CRC size; it might
  // be slightly faster to use a 64-bit data word, but doing so doubles the
  // table size.
  uint32_t last = kCrc32cPoly;
  const size_t size = 12;
  for (size_t i = 0; i < size; ++i) {
    last = (last >> 8) ^ this->table0_[last & 0xff];
  }
  FillWordTable(kCrc32cPoly, last, 4, t);
  for (size_t b = 0; b < 4; ++b) {
    for (int i = 0; i < 256; ++i) {
      this->table_[b][i] = t[b][i];
    }
  }

  int j = FillZeroesTable(kCrc32cPoly, t);
  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), "");
  for (int i = 0; i < j; i++) {
    this->zeroes_[i] = t[0][i];
  }

  delete[] t;

  // Build up tables for _reversing_ the operation of doing CRC operations on
  // zero bytes.

  // In C++, extending `crc` by a single zero bit is done by the following:
  // (A) bool low_bit_set = (crc & 1);
  //     crc >>= 1;
  //     if (low_bit_set) crc ^= kCrc32cPoly;
  //
  // In particular note that the high bit of `crc` after this operation will be
  // set if and only if the low bit of `crc` was set before it. This means that
  // no information is lost, and the operation can be reversed, as follows:
  // (B) bool high_bit_set = (crc & 0x80000000u);
  //     if (high_bit_set) crc ^= kCrc32cPoly;
  //     crc <<= 1;
  //     if (high_bit_set) crc ^= 1;
  //
  // Or, equivalently:
  // (C) bool high_bit_set = (crc & 0x80000000u);
  //     crc <<= 1;
  //     if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
  //
  // The last observation is that, if we store our checksums in a variable
  // `rcrc` with the order of the bits reversed, the inverse operation becomes:
  // (D) bool low_bit_set = (rcrc & 1);
  //     rcrc >>= 1;
  //     if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1);
  //
  // This is the same algorithm (A) that we started with, only with a different
  // polynomial bit pattern. This means that by building up our tables with
  // this alternate polynomial, we can apply the CRC algorithms to a
  // bit-reversed CRC checksum to perform inverse zero-extension.
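  //
  // Consequently, UnextendByZeroes(&crc, n) exactly undoes
  // ExtendByZeroes(&crc, n).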

  const uint32_t kCrc32cUnextendPoly =
      ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
  FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);

  j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)),
                 "");
}

void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
  const uint8_t* p = static_cast<const uint8_t*>(bytes);
  const uint8_t* e = p + length;
  uint32_t l = *crc;

  auto step_one_byte = [this, &p, &l]() {
    int c = (l & 0xff) ^ *p++;
    l = this->table0_[c] ^ (l >> 8);
  };

  if (kNeedAlignedLoads) {
    // Point x at the first 4-byte-aligned byte in the string. This might be
    // past the end of the string.
    const uint8_t* x = RoundUp<4>(p);
    if (x <= e) {
      // Process bytes one at a time until p is 4-byte aligned.
      while (p != x) {
        step_one_byte();
      }
    }
  }

  const size_t kSwathSize = 16;
  if (static_cast<size_t>(e - p) >= kSwathSize) {
    // Load one swath of data into the operating buffers.
    uint32_t buf0 = absl::little_endian::Load32(p) ^ l;
    uint32_t buf1 = absl::little_endian::Load32(p + 4);
    uint32_t buf2 = absl::little_endian::Load32(p + 8);
    uint32_t buf3 = absl::little_endian::Load32(p + 12);
    p += kSwathSize;

    // Increment a CRC value by a "swath"; this combines the four bytes
    // starting at `ptr` and twelve zero bytes, so that four CRCs can be
    // built incrementally and combined at the end.
    const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
      return absl::little_endian::Load32(ptr) ^
             this->table_[3][crc_in & 0xff] ^
             this->table_[2][(crc_in >> 8) & 0xff] ^
             this->table_[1][(crc_in >> 16) & 0xff] ^
             this->table_[0][crc_in >> 24];
    };

    // Run one CRC calculation step over all swaths in one 16-byte stride.
    const auto step_stride = [&]() {
      buf0 = step_swath(buf0, p);
      buf1 = step_swath(buf1, p + 4);
      buf2 = step_swath(buf2, p + 8);
      buf3 = step_swath(buf3, p + 12);
      p += 16;
    };

    // Process kStride interleaved swaths through the data in parallel.
    while ((e - p) > kPrefetchHorizon) {
      PrefetchToLocalCacheNta(
          reinterpret_cast<const void*>(p + kPrefetchHorizon));
      // Process 64 bytes at a time.
      step_stride();
      step_stride();
      step_stride();
      step_stride();
    }
    while (static_cast<size_t>(e - p) >= kSwathSize) {
      step_stride();
    }

    // Now advance one word at a time as far as possible, rotating the swath
    // buffers so that each CRC continues to be applied to the correct lane.
    while (static_cast<size_t>(e - p) >= 4) {
      buf0 = step_swath(buf0, p);
      uint32_t tmp = buf0;
      buf0 = buf1;
      buf1 = buf2;
      buf2 = buf3;
      buf3 = tmp;
      p += 4;
    }

    // Combine the results from the different swaths. This is just a CRC
    // on the data values in the bufX words.
    auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
      w ^= crc_in;
      for (size_t i = 0; i < 4; ++i) {
        w = (w >> 8) ^ this->table0_[w & 0xff];
      }
      return w;
    };
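
    // Note that each combine_one_word() application is just the
    // byte-at-a-time CRC step applied to the four bytes of `w`, so the chain
    // below computes a plain CRC over the 16 bytes held in buf0..buf3. This
    // realizes the zero-extend-and-XOR combination described at the top of
    // this file: each earlier lane gets extended by the remaining words.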

    l = combine_one_word(0, buf0);
    l = combine_one_word(l, buf1);
    l = combine_one_word(l, buf2);
    l = combine_one_word(l, buf3);
  }

  // Process the last few bytes.
  while (p != e) {
    step_one_byte();
  }

  *crc = l;
}

void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
                               const uint32_t zeroes_table[256],
                               const uint32_t poly_table[256]) {
  if (length != 0) {
    uint32_t l = *crc;
    // For each ZEROES_BASE_LG bits in length
    // (after the low-order bits have been removed)
    // we look up the appropriate polynomial in the zeroes_ array
    // and do a polynomial long multiplication (mod the CRC polynomial)
    // to extend the CRC by the appropriate number of zero bytes.
    for (int i = 0; length != 0;
         i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
      int c = length & (ZEROES_BASE - 1);  // pick next ZEROES_BASE_LG bits
      if (c != 0) {  // if they are not zero,
                     // multiply by entry in table
        // Build a table to aid in multiplying 2 bits at a time.
        // It takes too long to build tables for more bits.
        uint64_t m = zeroes_table[c + i - 1];
        m <<= 1;
        uint64_t m2 = m << 1;
        uint64_t mtab[4] = {0, m, m2, m2 ^ m};
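
        // mtab[x] is the carry-less product x * m for x in 0..3, with m
        // pre-shifted left one bit: the product of two 32-bit reflected
        // polynomials occupies only 63 bits, and the shift keeps it aligned
        // for the byte-at-a-time reduction below.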

        // Do the multiply one byte at a time.
        uint64_t result = 0;
        for (int x = 0; x < 32; x += 8) {
          // The carry-less multiply.
          result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
                    (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
          l >>= 8;

          // Reduce modulo the polynomial.
          result = (result >> 8) ^ poly_table[result & 0xff];
        }
        l = static_cast<uint32_t>(result);
      }
    }
    *crc = l;
  }
}

void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
  return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
}

void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
  // See the comment in CRC32::InitTables() for an explanation of the
  // algorithm below.
  *crc = ReverseBits(*crc);
  ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
  *crc = ReverseBits(*crc);
}

void CRC32::Scramble(uint32_t* crc) const {
  // Rotate by roughly half the word size plus 1. See the scramble comment in
  // crc_internal.h for an explanation.
  constexpr int scramble_rotate = (32 / 2) + 1;
  *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
                               32, scramble_rotate) &
         MaskOfLength<uint32_t>(32);
}

void CRC32::Unscramble(uint32_t* crc) const {
  constexpr int scramble_rotate = (32 / 2) + 1;
  uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
                                           32 - scramble_rotate);
  *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
}
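
// Note: Unscramble() exactly inverts Scramble(): rotating right by
// 32 - scramble_rotate undoes the rotation by scramble_rotate, and the
// subtraction of kScrambleLo undoes the addition.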

// Constructor and destructor for base class CRC.
CRC::~CRC() {}
CRC::CRC() {}

// The "constructor" for a CRC32C with a standard polynomial.
CRC* CRC::Crc32c() {
  static CRC* singleton = CRCImpl::NewInternal();
  return singleton;
}
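
// A usage sketch (illustrative only; `data`, `size`, and `n` are
// hypothetical caller values -- see crc.h for the full interface):
//
//   uint32_t c = 0;
//   CRC* crc32c = CRC::Crc32c();
//   crc32c->Extend(&c, data, size);    // fold in `size` bytes at `data`
//   crc32c->ExtendByZeroes(&c, n);     // as if n zero bytes followed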

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl