// pw_tokenizer/public/pw_tokenizer/detokenize.h - pigweed/pigweed - Git at Google

 // Copyright 2020 The Pigweed Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy of
 // the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 // License for the specific language governing permissions and limitations under
 // the License.

 // This file provides the Detokenizer class, which is used to decode tokenized
 // strings.  To use a Detokenizer, load a binary format token database into
 // memory, construct a TokenDatabase, and pass it to a Detokenizer:
 //
 //   std::vector data = ReadFile("my_tokenized_strings.db");
 //   Detokenizer detok(TokenDatabase::Create(data));
 //
 //   DetokenizedString result = detok.Detokenize(my_data);
 //   std::cout << result.BestString() << '\n';
 //
 #pragma once

 #include <cstddef>
 #include <cstdint>
 #include <string>
 #include <string_view>
 #include <unordered_map>
 #include <utility>
 #include <vector>

 #include "pw_result/result.h"
 #include "pw_span/span.h"
 #include "pw_tokenizer/internal/decode.h"
 #include "pw_tokenizer/token_database.h"

 namespace pw::tokenizer {

 /// @defgroup pw_tokenizer_detokenize
 /// @{

 /// Token database entry: a format string paired with a 32-bit value that the
 /// inline comment labels "date removed" (presumably the date the entry was
 /// removed from the database; its encoding is not visible here -- confirm
 /// against the token database format).
 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;

 /// The outcome of detokenizing one message. Distinct strings can share a
 /// token, so this class keeps every candidate result rather than just one.
 class DetokenizedString {
  public:
   /// Builds the result for `token` from its candidate database `entries` and
   /// the encoded `arguments`. Defined out of line.
   DetokenizedString(uint32_t token,
                     const span<const TokenizedStringEntry>& entries,
                     const span<const std::byte>& arguments);

   /// Creates an empty result with no matches. `token_` is zeroed here so that
   /// `token()` never reads an indeterminate value on a default-constructed
   /// object.
   DetokenizedString() : token_(0), has_token_(false) {}

   /// True if there was only one valid match and it decoded successfully.
   bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }

   /// Returns the strings that matched the token, with the best matches first.
   const std::vector<DecodedFormatString>& matches() const { return matches_; }

   /// Returns the stored token. This is 0 for a default-constructed object.
   const uint32_t& token() const { return token_; }

   /// Returns the detokenized string or an empty string if there were no
   /// matches. If there are multiple possible results, the `DetokenizedString`
   /// returns the first match.
   std::string BestString() const;

   /// Returns the best match, with error messages inserted for arguments that
   /// failed to parse.
   std::string BestStringWithErrors() const;

  private:
   // Stored token; 0 when default-constructed.
   uint32_t token_;
   // False when default-constructed. The other constructor is defined out of
   // line, so its value there is not visible from this header.
   bool has_token_;
   // Candidate decodings exposed through matches().
   std::vector<DecodedFormatString> matches_;
 };

 /// Looks up tokens in a token database and expands them into strings. The
 /// database is held in a hash table of tokens, giving `O(1)` token lookups.
 class Detokenizer {
  public:
   /// Builds a detokenizer from the contents of `database`. The
   /// `TokenDatabase` is not referenced once construction finishes, so its
   /// memory can be freed afterwards.
   explicit Detokenizer(const TokenDatabase& database);

   /// Builds a detokenizer from an already-parsed database, which is moved
   /// into this object.
   explicit Detokenizer(
       std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&&
           database)
       : database_(std::move(database)) {}

   /// Creates a detokenizer from the `.pw_tokenizer.entries` section of an ELF
   /// binary.
   static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);

   /// `uint8_t` convenience overload: views the span as bytes and forwards to
   /// the `std::byte` version of `FromElfSection`.
   static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
     const auto section_bytes = as_bytes(elf_section);
     return FromElfSection(section_bytes);
   }

   /// Decodes and detokenizes a binary encoded message. The returned
   /// `DetokenizedString` stores all possible detokenized string results.
   DetokenizedString Detokenize(const span<const std::byte>& encoded) const;

   /// `span<const uint8_t>` convenience overload: views the span as bytes and
   /// forwards to the `std::byte` version of `Detokenize`.
   DetokenizedString Detokenize(const span<const uint8_t>& encoded) const {
     const auto encoded_bytes = as_bytes(encoded);
     return Detokenize(encoded_bytes);
   }

   /// `std::string_view` convenience overload: forwards the view's data
   /// pointer and size to the pointer-and-length version of `Detokenize`.
   DetokenizedString Detokenize(std::string_view encoded) const {
     const char* const data = encoded.data();
     return Detokenize(data, encoded.size());
   }

   /// Pointer-and-length overload: treats `encoded` as `size_bytes` bytes and
   /// forwards to the `std::byte` span version of `Detokenize`.
   DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
     const auto* const first_byte = static_cast<const std::byte*>(encoded);
     return Detokenize(span(first_byte, size_bytes));
   }

   /// Decodes and detokenizes a Base64-encoded message. The returned
   /// `DetokenizedString` stores all possible detokenized string results.
   DetokenizedString DetokenizeBase64Message(std::string_view text) const;

   /// Decodes and detokenizes nested tokenized messages in a string.
   ///
   /// Only Base64 nested tokenized messages are supported at present; support
   /// for hexadecimal-encoded string literals will be added.
   ///
   /// @param[in] text Text potentially containing tokenized messages.
   ///
   /// @param[in] max_passes Maximum number of detokenization passes.
   /// `DetokenizeText` detokenizes recursively, because tokens can expand to
   /// other tokens (0 is equivalent to 1).
   ///
   /// @returns The original string with nested tokenized messages decoded in
   ///     context. Messages that fail to decode are left as-is.
   std::string DetokenizeText(std::string_view text,
                              unsigned max_passes = 3) const;

   /// Single-pass (non-recursive) form of `DetokenizeText`.
   /// @deprecated Call `DetokenizeText` instead.
   [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64(
       std::string_view text) const {
     // One pass only, so nested tokens are not expanded recursively.
     constexpr unsigned kSinglePass = 1;
     return DetokenizeText(text, kSinglePass);
   }

   /// Decodes data that may or may not be tokenized, such as proto fields
   /// marked as optionally tokenized.
   ///
   /// Only Base64 nested tokenized messages are supported at present; support
   /// for hexadecimal-encoded string literals will be added.
   ///
   /// Data that is not tokenized is currently assumed to be printable ASCII;
   /// otherwise, the returned string will be base64-encoded.
   ///
   /// Note: unlike the other decoding methods of this class, this one is not
   /// declared `const`.
   ///
   /// @param[in] optionally_tokenized_data Data optionally tokenized.
   ///
   /// @returns The decoded text if successfully detokenized or if the data is
   /// printable, otherwise returns the data base64-encoded.
   std::string DecodeOptionallyTokenizedData(
       const span<const std::byte>& optionally_tokenized_data);

  private:
   // Hash table from token to the database entries stored for that token.
   std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
 };

 /// @}

 }  // namespace pw::tokenizer
	// Copyright 2020 The Pigweed Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
	// use this file except in compliance with the License. You may obtain a copy of
	// the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
	// License for the specific language governing permissions and limitations under
	// the License.

	// This file provides the Detokenizer class, which is used to decode tokenized
	// strings. To use a Detokenizer, load a binary format token database into
	// memory, construct a TokenDatabase, and pass it to a Detokenizer:
	//
	// std::vector data = ReadFile("my_tokenized_strings.db");
	// Detokenizer detok(TokenDatabase::Create(data));
	//
	// DetokenizedString result = detok.Detokenize(my_data);
	// std::cout << result.BestString() << '\n';
	//
	#pragma once

	#include <cstddef>
	#include <cstdint>
	#include <string>
	#include <unordered_map>
	#include <utility>
	#include <vector>

	#include "pw_result/result.h"
	#include "pw_span/span.h"
	#include "pw_tokenizer/internal/decode.h"
	#include "pw_tokenizer/token_database.h"

	namespace pw::tokenizer {

	/// @defgroup pw_tokenizer_detokenize
	/// @{

	/// Token database entry: a format string paired with a 32-bit value that the
	/// inline comment labels "date removed" (presumably the date the entry was
	/// removed from the database; its encoding is not visible here -- confirm
	/// against the token database format).
	using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;

	/// The outcome of detokenizing one message. Distinct strings can share a
	/// token, so this class keeps every candidate result rather than just one.
	class DetokenizedString {
	 public:
	  /// Builds the result for `token` from its candidate database `entries` and
	  /// the encoded `arguments`. Defined out of line.
	  DetokenizedString(uint32_t token,
	                    const span<const TokenizedStringEntry>& entries,
	                    const span<const std::byte>& arguments);

	  /// Creates an empty result with no matches. `token_` is zeroed here so that
	  /// `token()` never reads an indeterminate value on a default-constructed
	  /// object.
	  DetokenizedString() : token_(0), has_token_(false) {}

	  /// True if there was only one valid match and it decoded successfully.
	  bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }

	  /// Returns the strings that matched the token, with the best matches first.
	  const std::vector<DecodedFormatString>& matches() const { return matches_; }

	  /// Returns the stored token. This is 0 for a default-constructed object.
	  const uint32_t& token() const { return token_; }

	  /// Returns the detokenized string or an empty string if there were no
	  /// matches. If there are multiple possible results, the `DetokenizedString`
	  /// returns the first match.
	  std::string BestString() const;

	  /// Returns the best match, with error messages inserted for arguments that
	  /// failed to parse.
	  std::string BestStringWithErrors() const;

	 private:
	  // Stored token; 0 when default-constructed.
	  uint32_t token_;
	  // False when default-constructed. The other constructor is defined out of
	  // line, so its value there is not visible from this header.
	  bool has_token_;
	  // Candidate decodings exposed through matches().
	  std::vector<DecodedFormatString> matches_;
	};

	/// Looks up tokens in a token database and expands them into strings. The
	/// database is held in a hash table of tokens, giving `O(1)` token lookups.
	class Detokenizer {
	 public:
	  /// Builds a detokenizer from the contents of `database`. The
	  /// `TokenDatabase` is not referenced once construction finishes, so its
	  /// memory can be freed afterwards.
	  explicit Detokenizer(const TokenDatabase& database);

	  /// Builds a detokenizer from an already-parsed database, which is moved
	  /// into this object.
	  explicit Detokenizer(
	      std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&&
	          database)
	      : database_(std::move(database)) {}

	  /// Creates a detokenizer from the `.pw_tokenizer.entries` section of an ELF
	  /// binary.
	  static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section);

	  /// `uint8_t` convenience overload: views the span as bytes and forwards to
	  /// the `std::byte` version of `FromElfSection`.
	  static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) {
	    const auto section_bytes = as_bytes(elf_section);
	    return FromElfSection(section_bytes);
	  }

	  /// Decodes and detokenizes a binary encoded message. The returned
	  /// `DetokenizedString` stores all possible detokenized string results.
	  DetokenizedString Detokenize(const span<const std::byte>& encoded) const;

	  /// `span<const uint8_t>` convenience overload: views the span as bytes and
	  /// forwards to the `std::byte` version of `Detokenize`.
	  DetokenizedString Detokenize(const span<const uint8_t>& encoded) const {
	    const auto encoded_bytes = as_bytes(encoded);
	    return Detokenize(encoded_bytes);
	  }

	  /// `std::string_view` convenience overload: forwards the view's data
	  /// pointer and size to the pointer-and-length version of `Detokenize`.
	  DetokenizedString Detokenize(std::string_view encoded) const {
	    const char* const data = encoded.data();
	    return Detokenize(data, encoded.size());
	  }

	  /// Pointer-and-length overload: treats `encoded` as `size_bytes` bytes and
	  /// forwards to the `std::byte` span version of `Detokenize`.
	  DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
	    const auto* const first_byte = static_cast<const std::byte*>(encoded);
	    return Detokenize(span(first_byte, size_bytes));
	  }

	  /// Decodes and detokenizes a Base64-encoded message. The returned
	  /// `DetokenizedString` stores all possible detokenized string results.
	  DetokenizedString DetokenizeBase64Message(std::string_view text) const;

	  /// Decodes and detokenizes nested tokenized messages in a string.
	  ///
	  /// Only Base64 nested tokenized messages are supported at present; support
	  /// for hexadecimal-encoded string literals will be added.
	  ///
	  /// @param[in] text Text potentially containing tokenized messages.
	  ///
	  /// @param[in] max_passes Maximum number of detokenization passes.
	  /// `DetokenizeText` detokenizes recursively, because tokens can expand to
	  /// other tokens (0 is equivalent to 1).
	  ///
	  /// @returns The original string with nested tokenized messages decoded in
	  ///     context. Messages that fail to decode are left as-is.
	  std::string DetokenizeText(std::string_view text,
	                             unsigned max_passes = 3) const;

	  /// Single-pass (non-recursive) form of `DetokenizeText`.
	  /// @deprecated Call `DetokenizeText` instead.
	  [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64(
	      std::string_view text) const {
	    // One pass only, so nested tokens are not expanded recursively.
	    constexpr unsigned kSinglePass = 1;
	    return DetokenizeText(text, kSinglePass);
	  }

	  /// Decodes data that may or may not be tokenized, such as proto fields
	  /// marked as optionally tokenized.
	  ///
	  /// Only Base64 nested tokenized messages are supported at present; support
	  /// for hexadecimal-encoded string literals will be added.
	  ///
	  /// Data that is not tokenized is currently assumed to be printable ASCII;
	  /// otherwise, the returned string will be base64-encoded.
	  ///
	  /// Note: unlike the other decoding methods of this class, this one is not
	  /// declared `const`.
	  ///
	  /// @param[in] optionally_tokenized_data Data optionally tokenized.
	  ///
	  /// @returns The decoded text if successfully detokenized or if the data is
	  /// printable, otherwise returns the data base64-encoded.
	  std::string DecodeOptionallyTokenizedData(
	      const span<const std::byte>& optionally_tokenized_data);

	 private:
	  // Hash table from token to the database entries stored for that token.
	  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
	};

	/// @}

	} // namespace pw::tokenizer