| // Copyright 2020 The Pigweed Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| // use this file except in compliance with the License. You may obtain a copy of |
| // the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| // License for the specific language governing permissions and limitations under |
| // the License. |
| #pragma once |
| |
| #include <array> |
| #include <cstddef> |
| #include <cstdint> |
| #include <iterator> |
| |
| namespace pw::tokenizer { |
| |
| // Reads entries from a binary token string database. This class does not copy |
| // or modify the contents of the database. |
| // |
| // A binary token database is comprised of a 16-byte header followed by an array |
| // of 8-byte entries and a table of null-terminated strings. The header |
| // specifies the number of entries. Each entry contains information about a |
| // tokenized string: the token and removal date, if any. All fields are |
| // little-endian. |
| // |
| // Header |
| // ====== |
| // Offset Size Field |
| // ----------------------------------- |
| // 0 6 Magic number (TOKENS) |
| // 6 2 Version (00 00) |
| // 8 4 Entry count |
| // 12 4 Reserved |
| // |
| // Entry |
| // ===== |
| // Offset Size Field |
| // ----------------------------------- |
| // 0 4 Token |
| // 4 4 Removal date (d m yy) |
| // |
| // Entries are sorted by token. A string table with a null-terminated string for |
| // each entry in order follows the entries. |
| // |
| // Entries are accessed by iterating over the database. A O(n) Find function is |
| // also provided. In typical use, a TokenDatabase is preprocessed by a |
| // Detokenizer into a std::unordered_map. |
| class TokenDatabase { |
| public: |
| // Internal struct that describes how the underlying binary token database |
| // stores entries. RawEntries generally should not be used directly. Instead, |
| // use an Entry, which contains a pointer to the entry's string. |
| struct RawEntry { |
| uint32_t token; |
| uint32_t date_removed; |
| }; |
| |
| static_assert(sizeof(RawEntry) == 8u); |
| |
| // An entry in the token database. This struct adds the string to a RawEntry. |
| struct Entry { |
| // The token calculated for this string. |
| uint32_t token; |
| |
| // The date the token and string was removed from the database, or |
| // 0xFFFFFFFF if it was never removed. Dates are encoded such that natural |
| // integer sorting sorts from oldest to newest dates. The day is stored an |
| // an 8-bit day, 8-bit month, and 16-bit year, packed into a little-endian |
| // uint32_t. |
| uint32_t date_removed; |
| |
| // The null-terminated string represented by this token. |
| const char* string; |
| }; |
| |
| // Iterator for TokenDatabase values. Note that this is not a normal STL-style |
| // iterator, since * returns a value instead of a reference. |
| class Iterator { |
| public: |
| constexpr Iterator(const RawEntry* raw_entry, const char* string) |
| : raw_(raw_entry), string_(string) {} |
| |
| // Constructs a TokenDatabase::Entry for the entry this iterator refers to. |
| constexpr Entry entry() const { |
| return {raw_->token, raw_->date_removed, string_}; |
| } |
| |
| constexpr Iterator& operator++() { |
| raw_ += 1; |
| // Move string_ to the character beyond the next null terminator. |
| while (*string_++ != '\0') { |
| } |
| return *this; |
| } |
| constexpr Iterator operator++(int) { return operator++(); } |
| constexpr bool operator==(const Iterator& rhs) const { |
| return raw_ == rhs.raw_; |
| } |
| constexpr bool operator!=(const Iterator& rhs) const { |
| return raw_ != rhs.raw_; |
| } |
| |
| // Derefencing a TokenDatabase::Iterator returns an Entry, not an Entry&. |
| constexpr Entry operator*() const { return entry(); } |
| |
| // Point directly into the underlying RawEntry. Strings are not accessible |
| // via this operator. |
| constexpr const RawEntry* operator->() const { return raw_; } |
| |
| constexpr ptrdiff_t operator-(const Iterator& rhs) const { |
| return raw_ - rhs.raw_; |
| } |
| |
| private: |
| const RawEntry* raw_; |
| const char* string_; |
| }; |
| |
| // A list of token entries returned from a Find operation. This object can be |
| // iterated over or indexed as an array. |
| class Entries { |
| public: |
| constexpr Entries(const Iterator& begin, const Iterator& end) |
| : begin_(begin), end_(end) {} |
| |
| // The number of entries in this list. |
| constexpr size_t size() const { return end_ - begin_; } |
| |
| // True of the list is empty. |
| constexpr bool empty() const { return begin_ == end_; } |
| |
| // Accesses the specified entry in this set. Returns an Entry object, which |
| // is constructed from the underlying raw entry. The index must be less than |
| // size(). This operation is O(n) in size(). |
| Entry operator[](size_t index) const; |
| |
| constexpr const Iterator& begin() const { return begin_; } |
| constexpr const Iterator& end() const { return end_; } |
| |
| private: |
| Iterator begin_; |
| Iterator end_; |
| }; |
| |
| // Returns true if the provided data is a valid token database. This checks |
| // the magic number ("TOKENS"), version (which must be 0), and that there is |
| // is one string for each entry in the database. A database with extra strings |
| // or other trailing data is considered valid. |
| template <typename ByteArray> |
| static constexpr bool IsValid(const ByteArray& bytes) { |
| return HasValidHeader(bytes) && EachEntryHasAString(bytes); |
| } |
| |
| // Creates a TokenDatabase and checks if the provided data is valid at compile |
| // time. Accepts references to constexpr containers (array, span, string_view, |
| // etc.) with static storage duration. For example: |
| // |
| // constexpr char kMyData[] = ...; |
| // constexpr TokenDatabase db = TokenDatabase::Create<kMyData>(); |
| // |
| template <const auto& kDatabaseBytes> |
| static constexpr TokenDatabase Create() { |
| static_assert( |
| HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes), |
| "Databases must start with a 16-byte header that begins with TOKENS."); |
| |
| static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes), |
| "The database must have at least one string for each entry."); |
| |
| return TokenDatabase(std::data(kDatabaseBytes)); |
| } |
| |
| // Creates a TokenDatabase from the provided byte array. The array may be a |
| // span, array, or other container type. If the data is not valid, returns a |
| // default-constructed database for which ok() is false. |
| // |
| // Prefer the Create overload that takes the data as a template parameter |
| // whenever possible, since that function checks the integrity of the data at |
| // compile time. |
| template <typename ByteArray> |
| static constexpr TokenDatabase Create(const ByteArray& database_bytes) { |
| return IsValid<ByteArray>(database_bytes) |
| ? TokenDatabase(std::data(database_bytes)) |
| : TokenDatabase(); // Invalid database. |
| } |
| // Creates a database with no data. ok() returns false. |
| constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {} |
| |
| // Returns all entries associated with this token. This is a O(n) operation. |
| Entries Find(uint32_t token) const; |
| |
| // Returns the total number of entries (unique token-string pairs). |
| constexpr size_t size() const { |
| return (end_.data - begin_.data) / sizeof(RawEntry); |
| } |
| |
| // True if this database was constructed with valid data. |
| constexpr bool ok() const { return begin_.data != nullptr; } |
| |
| Iterator begin() const { return Iterator(begin_.entry, end_.data); } |
| Iterator end() const { return Iterator(end_.entry, nullptr); } |
| |
| private: |
| struct Header { |
| std::array<char, 6> magic; |
| uint16_t version; |
| uint32_t entry_count; |
| uint32_t reserved; |
| }; |
| |
| static_assert(sizeof(Header) == 2 * sizeof(RawEntry)); |
| |
| template <typename ByteArray> |
| static constexpr bool HasValidHeader(const ByteArray& bytes) { |
| static_assert(sizeof(*std::data(bytes)) == 1u); |
| |
| if (std::size(bytes) < sizeof(Header)) { |
| return false; |
| } |
| |
| // Check the magic number and version. |
| for (size_t i = 0; i < kMagicAndVersion.size(); ++i) { |
| if (bytes[i] != kMagicAndVersion[i]) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| template <typename ByteArray> |
| static constexpr bool EachEntryHasAString(const ByteArray& bytes) { |
| const size_t entries = ReadEntryCount(std::data(bytes)); |
| |
| // Check that the data is large enough to have a string table. |
| if (std::size(bytes) < StringTable(entries)) { |
| return false; |
| } |
| |
| // Count the strings in the string table. |
| size_t string_count = 0; |
| for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes); |
| ++i) { |
| string_count += (*i == '\0') ? 1 : 0; |
| } |
| |
| // Check that there is at least one string for each entry. |
| return string_count >= entries; |
| } |
| |
| // Reads the number of entries from a database header. Cast to the bytes to |
| // uint8_t to avoid sign extension if T is signed. |
| template <typename T> |
| static constexpr uint32_t ReadEntryCount(const T* header_bytes) { |
| const T* bytes = header_bytes + offsetof(Header, entry_count); |
| return static_cast<uint8_t>(bytes[0]) | |
| static_cast<uint8_t>(bytes[1]) << 8 | |
| static_cast<uint8_t>(bytes[2]) << 16 | |
| static_cast<uint8_t>(bytes[3]) << 24; |
| } |
| |
| // Calculates the offset of the string table. |
| static constexpr size_t StringTable(size_t entries) { |
| return sizeof(Header) + entries * sizeof(RawEntry); |
| } |
| |
| // The magic number that starts the table is "TOKENS". The version is encoded |
| // next as two bytes. |
| static constexpr std::array<char, 8> kMagicAndVersion = { |
| 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'}; |
| |
| template <typename Byte> |
| constexpr TokenDatabase(const Byte bytes[]) |
| : TokenDatabase(bytes + sizeof(Header), |
| bytes + StringTable(ReadEntryCount(bytes))) { |
| static_assert(sizeof(Byte) == 1u); |
| } |
| |
| // It is illegal to reinterpret_cast in constexpr functions, but acceptable to |
| // use unions. Instead of using a reinterpret_cast to change the byte pointer |
| // to a RawEntry pointer, have a separate overload for each byte pointer type |
| // and store them in a union. |
| constexpr TokenDatabase(const char* begin, const char* end) |
| : begin_{.data = begin}, end_{.data = end} {} |
| |
| constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end) |
| : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {} |
| |
| constexpr TokenDatabase(const signed char* begin, const signed char* end) |
| : begin_{.signed_data = begin}, end_{.signed_data = end} {} |
| |
| // Store the beginning and end pointers as a union to avoid breaking constexpr |
| // rules for reinterpret_cast. |
| union { |
| const RawEntry* entry; |
| const char* data; |
| const unsigned char* unsigned_data; |
| const signed char* signed_data; |
| } begin_, end_; |
| }; |
| |
| } // namespace pw::tokenizer |