pw_tokenizer: Add DecodeOptionallyTokenizedData
Add new API to decode data that may or may not be tokenized.
Change-Id: I00289a0b8041dcbd2ad86489366b741f3cd4091d
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/206070
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Carlos Chinchilla <cachinchilla@google.com>
Pigweed-Auto-Submit: Carlos Chinchilla <cachinchilla@google.com>
diff --git a/pw_tokenizer/detokenize.cc b/pw_tokenizer/detokenize.cc
index ecec29a..8f816d3 100644
--- a/pw_tokenizer/detokenize.cc
+++ b/pw_tokenizer/detokenize.cc
@@ -15,7 +15,10 @@
#include "pw_tokenizer/detokenize.h"
#include <algorithm>
+#include <cctype>
#include <cstring>
+#include <string_view>
+#include <vector>
#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
@@ -144,6 +147,22 @@
return lhs.second > rhs.second;
}
// Returns true if every character in data is printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //       return text
  //
  for (char letter : data) {
    // std::isprint / std::isspace have undefined behavior if the argument is
    // not representable as unsigned char (e.g. a negative char from a byte
    // >= 0x80 on platforms where char is signed). Convert through unsigned
    // char first to stay well-defined for arbitrary binary data.
    const int c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
+
} // namespace
DetokenizedString::DetokenizedString(
@@ -261,4 +280,52 @@
return result;
}
+std::string Detokenizer::DecodeOptionallyTokenizedData(
+ const ConstByteSpan& optionally_tokenized_data) {
+ // Try detokenizing as binary using the best result if available, else use
+ // the input data as a string.
+ const auto result = Detokenize(optionally_tokenized_data);
+ const bool found_matches = !result.matches().empty();
+ // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
+ // process does not encode and decode UTF8 format, it is sufficient to check
+ // if the data is printable ASCII.
+ const std::string data =
+ found_matches
+ ? result.BestString()
+ : std::string(
+ reinterpret_cast<const char*>(optionally_tokenized_data.data()),
+ optionally_tokenized_data.size());
+
+ const bool is_data_printable = IsPrintableAscii(data);
+ if (!found_matches && !is_data_printable) {
+ // Assume the token is unknown or the data is corrupt.
+ std::vector<char> base64_encoding_buffer(
+ Base64EncodedBufferSize(optionally_tokenized_data.size()));
+ const size_t encoded_length = PrefixedBase64Encode(
+ optionally_tokenized_data, span(base64_encoding_buffer));
+ return std::string{base64_encoding_buffer.data(), encoded_length};
+ }
+
+ // Successfully detokenized, check if the field has more prefixed
+ // base64-encoded tokens.
+ const std::string field = DetokenizeText(data);
+ // If anything detokenized successfully, use that.
+ if (field != data) {
+ return field;
+ }
+
+ // Attempt to determine whether this is an unknown token or plain text.
+ // Any string with only printable or whitespace characters is plain text.
+ if (found_matches || is_data_printable) {
+ return data;
+ }
+
+ // Assume this field is tokenized data that could not be decoded.
+ std::vector<char> base64_encoding_buffer(
+ Base64EncodedBufferSize(optionally_tokenized_data.size()));
+ const size_t encoded_length = PrefixedBase64Encode(
+ optionally_tokenized_data, span(base64_encoding_buffer));
+ return std::string{base64_encoding_buffer.data(), encoded_length};
+}
+
} // namespace pw::tokenizer
diff --git a/pw_tokenizer/detokenize_test.cc b/pw_tokenizer/detokenize_test.cc
index db116a8..02ac2e2 100644
--- a/pw_tokenizer/detokenize_test.cc
+++ b/pw_tokenizer/detokenize_test.cc
@@ -34,7 +34,7 @@
return std::array<Case, sizeof...(Args)>{args...};
}
-// Database with the following entries:
+// Database with the following entries and arbitrary token values:
// {
// 0x00000001: "One",
// 0x00000005: "TWO",
@@ -44,18 +44,20 @@
// }
constexpr char kTestDatabase[] =
"TOKENS\0\0"
- "\x05\x00\x00\x00"
+ "\x06\x00\x00\x00" // Number of tokens in this database.
"\0\0\0\0"
"\x01\x00\x00\x00----"
"\x05\x00\x00\x00----"
"\xFF\x00\x00\x00----"
"\xFF\xEE\xEE\xDD----"
"\xEE\xEE\xEE\xEE----"
+ "\x9D\xA7\x97\xF8----"
"One\0"
"TWO\0"
"333\0"
"FOUR\0"
- "$AQAAAA==";
+ "$AQAAAA==\0"
+ "■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
class Detokenize : public ::testing::Test {
protected:
@@ -166,6 +168,33 @@
}
}
+TEST_F(Detokenize, OptionallyTokenizedData) {
+ for (auto [data, expected] : TestCases(
+ Case{ONE, "One"},
+ Case{"\1\0\0\0", "One"},
+ Case{TWO, "TWO"},
+ Case{THREE, "333"},
+ Case{FOUR, "FOUR"},
+ Case{FOUR ONE ONE, "FOUROneOne"},
+ Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
+ Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
+ "One\r\nTWO\r\n333\r\nFOUR\r\n"},
+ Case{"123" FOUR, "123FOUR"},
+ Case{"123" FOUR ", 56", "123FOUR, 56"},
+ Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
+ Case{"$0" ONE, "$0One"},
+ Case{"$/+7u3Q=", "$/+7u3Q="}, // incomplete message (missing "=")
+ Case{"$123456==" FOUR, "$123456==FOUR"},
+ Case{NEST_ONE, "One"},
+ Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
+ Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
+ Case{"$naeX+A==",
+ "■msg♦This is One message■module♦■file♦file.txt"})) {
+ EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
+ std::string(expected));
+ }
+}
+
constexpr char kDataWithArguments[] =
"TOKENS\0\0"
"\x09\x00\x00\x00"
diff --git a/pw_tokenizer/public/pw_tokenizer/detokenize.h b/pw_tokenizer/public/pw_tokenizer/detokenize.h
index ac936fa..3938d89 100644
--- a/pw_tokenizer/public/pw_tokenizer/detokenize.h
+++ b/pw_tokenizer/public/pw_tokenizer/detokenize.h
@@ -147,6 +147,22 @@
return DetokenizeText(text, 1);
}
  /// Decodes data that may or may not be tokenized, such as proto fields
  /// marked as optionally tokenized.
  ///
  /// This function currently only supports Base64 nested tokenized messages.
  /// Support for hexadecimal-encoded string literals will be added.
  ///
  /// This function currently assumes that when data is not tokenized it is
  /// printable ASCII. Otherwise, the returned string will be the input
  /// prefixed-Base64 encoded.
  ///
  /// @param[in] optionally_tokenized_data Binary data that may contain a
  ///     tokenized message, plain text, or nested prefixed-Base64 tokens.
  ///
  /// @returns The decoded text if successfully detokenized or if the data is
  /// printable; otherwise returns the data base64-encoded.
  std::string DecodeOptionallyTokenizedData(
      const span<const std::byte>& optionally_tokenized_data);
+
private:
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
};