blob: 3938d8986e4de483aaa02ee1f4a058286107c14f [file] [log] [blame]
// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
// This file provides the Detokenizer class, which is used to decode tokenized
// strings. To use a Detokenizer, load a binary format token database into
// memory, construct a TokenDatabase, and pass it to a Detokenizer:
//
// std::vector data = ReadFile("my_tokenized_strings.db");
// Detokenizer detok(TokenDatabase::Create(data));
//
// DetokenizedString result = detok.Detokenize(my_data);
// std::cout << result.BestString() << '\n';
//
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

#include "pw_result/result.h"
#include "pw_span/span.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/token_database.h"
namespace pw::tokenizer {
/// @defgroup pw_tokenizer_detokenize
/// @{
/// Token database entry: a format string paired with a 32-bit "date removed"
/// value for that entry.
using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
/// A string that has been detokenized. This class tracks all possible results
/// if there are token collisions.
class DetokenizedString {
 public:
  /// Constructs a result for `token` from the given database `entries` and the
  /// `arguments` bytes. Defined out of line.
  DetokenizedString(uint32_t token,
                    const span<const TokenizedStringEntry>& entries,
                    const span<const std::byte>& arguments);

  /// Constructs an empty result with no matches. The token is zero-initialized
  /// so that `token()` never exposes an indeterminate value.
  DetokenizedString() : token_(0), has_token_(false) {}

  /// True if there was only one valid match and it decoded successfully.
  bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }

  /// Returns the strings that matched the token, with the best matches first.
  const std::vector<DecodedFormatString>& matches() const { return matches_; }

  /// Returns the token stored in this object. This is 0 for a
  /// default-constructed `DetokenizedString`.
  const uint32_t& token() const { return token_; }

  /// Returns the detokenized string or an empty string if there were no
  /// matches. If there are multiple possible results, the `DetokenizedString`
  /// returns the first match.
  std::string BestString() const;

  /// Returns the best match, with error messages inserted for arguments that
  /// failed to parse.
  std::string BestStringWithErrors() const;

 private:
  // Token value for this result; 0 when default-constructed.
  uint32_t token_;
  // Set to false by the default constructor.
  bool has_token_;
  // Decoded candidates for the token; empty when default-constructed.
  std::vector<DecodedFormatString> matches_;
};
/// Decodes and detokenizes messages using a token database. The entries are
/// kept in a hash table keyed by token, which gives `O(1)` token lookups.
class Detokenizer {
 public:
  /// Builds a detokenizer from a `TokenDatabase`. Once construction finishes,
  /// the `Detokenizer` no longer refers to the `TokenDatabase`, so the memory
  /// backing it may be freed.
  explicit Detokenizer(const TokenDatabase& database);

  /// Builds a detokenizer from an already-parsed database, which is moved into
  /// the `Detokenizer`.
  explicit Detokenizer(
      std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&&
          database)
      : database_(std::move(database)) {}

  /// Builds a detokenizer from the `.pw_tokenizer.entries` section of an ELF
  /// binary.
  static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);

  /// `FromElfSection` overload that accepts the section as a `uint8_t` span.
  static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
    const span<const std::byte> section_bytes = as_bytes(elf_section);
    return FromElfSection(section_bytes);
  }

  /// Decodes and detokenizes a binary encoded message. The returned
  /// `DetokenizedString` stores every possible detokenized string result.
  DetokenizedString Detokenize(const span<const std::byte>& encoded) const;

  /// `Detokenize` overload for `span<const uint8_t>`.
  DetokenizedString Detokenize(const span<const uint8_t>& encoded) const {
    const span<const std::byte> message_bytes = as_bytes(encoded);
    return Detokenize(message_bytes);
  }

  /// `Detokenize` overload for `std::string_view`.
  DetokenizedString Detokenize(std::string_view encoded) const {
    const void* const message = encoded.data();
    return Detokenize(message, encoded.size());
  }

  /// `Detokenize` overload for a pointer and a length in bytes.
  DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
    const auto* const first_byte = static_cast<const std::byte*>(encoded);
    return Detokenize(span<const std::byte>(first_byte, size_bytes));
  }

  /// Decodes and detokenizes a Base64-encoded message. The returned
  /// `DetokenizedString` stores every possible detokenized string result.
  DetokenizedString DetokenizeBase64Message(std::string_view text) const;

  /// Decodes and detokenizes nested tokenized messages found within a string.
  ///
  /// Only Base64 nested tokenized messages are supported at present. Support
  /// for hexadecimal-encoded string literals will be added.
  ///
  /// @param[in] text Text that may contain tokenized messages.
  ///
  /// @param[in] max_passes Upper bound on the number of detokenization passes.
  /// `DetokenizeText` detokenizes recursively, since tokens can expand to
  /// other tokens. A value of 0 is equivalent to 1.
  ///
  /// @returns The original string with nested tokenized messages decoded in
  /// context. Messages that fail to decode are left as-is.
  std::string DetokenizeText(std::string_view text,
                             unsigned max_passes = 3) const;

  /// Deprecated form of `DetokenizeText` that performs no recursive
  /// detokenization; it forwards to `DetokenizeText` with a single pass.
  /// @deprecated Call `DetokenizeText` instead.
  [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64(
      std::string_view text) const {
    constexpr unsigned kSinglePass = 1;
    return DetokenizeText(text, kSinglePass);
  }

  /// Decodes data that may or may not be tokenized, such as proto fields
  /// marked as optionally tokenized.
  ///
  /// Only Base64 nested tokenized messages are supported at present. Support
  /// for hexadecimal-encoded string literals will be added.
  ///
  /// Data that is not tokenized is currently assumed to be printable ASCII.
  /// Otherwise, the returned string will be base64-encoded.
  ///
  /// Note that, unlike the other decoding functions in this class, this
  /// function is not declared `const`.
  ///
  /// @param[in] optionally_tokenized_data Data optionally tokenized.
  ///
  /// @returns The decoded text if successfully detokenized or if the data is
  /// printable, otherwise returns the data base64-encoded.
  std::string DecodeOptionallyTokenizedData(
      const span<const std::byte>& optionally_tokenized_data);

 private:
  // Maps each token to the database entries that share it.
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
};
/// @}
} // namespace pw::tokenizer