pw_tokenizer: Add DecodeOptionallyTokenizedData
Add new API to decode data that may or may not be tokenized.
Change-Id: I00289a0b8041dcbd2ad86489366b741f3cd4091d
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/206070
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Carlos Chinchilla <cachinchilla@google.com>
Pigweed-Auto-Submit: Carlos Chinchilla <cachinchilla@google.com>
diff --git a/pw_tokenizer/detokenize.cc b/pw_tokenizer/detokenize.cc
index ecec29a..8f816d3 100644
--- a/pw_tokenizer/detokenize.cc
+++ b/pw_tokenizer/detokenize.cc
@@ -15,7 +15,10 @@
#include "pw_tokenizer/detokenize.h"
#include <algorithm>
+#include <cctype>
#include <cstring>
+#include <string_view>
+#include <vector>
#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
@@ -144,6 +147,22 @@
return lhs.second > rhs.second;
}
// Returns true if every character in data is printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //       return text
  //
  for (char letter : data) {
    // std::isprint / std::isspace have undefined behavior if the argument is
    // not representable as unsigned char (e.g. a negative char from a byte
    // >= 0x80 on platforms where char is signed). Convert through unsigned
    // char first to stay well-defined for arbitrary binary data.
    const int c = static_cast<unsigned char>(letter);
    if (std::isprint(c) == 0 && std::isspace(c) == 0) {
      return false;
    }
  }
  return true;
}
+
} // namespace
DetokenizedString::DetokenizedString(
@@ -261,4 +280,52 @@
return result;
}
+std::string Detokenizer::DecodeOptionallyTokenizedData(
+ const ConstByteSpan& optionally_tokenized_data) {
+ // Try detokenizing as binary using the best result if available, else use
+ // the input data as a string.
+ const auto result = Detokenize(optionally_tokenized_data);
+ const bool found_matches = !result.matches().empty();
+ // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
+ // process does not encode and decode UTF8 format, it is sufficient to check
+ // if the data is printable ASCII.
+ const std::string data =
+ found_matches
+ ? result.BestString()
+ : std::string(
+ reinterpret_cast<const char*>(optionally_tokenized_data.data()),
+ optionally_tokenized_data.size());
+
+ const bool is_data_printable = IsPrintableAscii(data);
+ if (!found_matches && !is_data_printable) {
+ // Assume the token is unknown or the data is corrupt.
+ std::vector<char> base64_encoding_buffer(
+ Base64EncodedBufferSize(optionally_tokenized_data.size()));
+ const size_t encoded_length = PrefixedBase64Encode(
+ optionally_tokenized_data, span(base64_encoding_buffer));
+ return std::string{base64_encoding_buffer.data(), encoded_length};
+ }
+
+ // Successfully detokenized, check if the field has more prefixed
+ // base64-encoded tokens.
+ const std::string field = DetokenizeText(data);
+ // If anything detokenized successfully, use that.
+ if (field != data) {
+ return field;
+ }
+
+ // Attempt to determine whether this is an unknown token or plain text.
+ // Any string with only printable or whitespace characters is plain text.
+ if (found_matches || is_data_printable) {
+ return data;
+ }
+
+ // Assume this field is tokenized data that could not be decoded.
+ std::vector<char> base64_encoding_buffer(
+ Base64EncodedBufferSize(optionally_tokenized_data.size()));
+ const size_t encoded_length = PrefixedBase64Encode(
+ optionally_tokenized_data, span(base64_encoding_buffer));
+ return std::string{base64_encoding_buffer.data(), encoded_length};
+}
+
} // namespace pw::tokenizer
diff --git a/pw_tokenizer/detokenize_test.cc b/pw_tokenizer/detokenize_test.cc
index db116a8..02ac2e2 100644
--- a/pw_tokenizer/detokenize_test.cc
+++ b/pw_tokenizer/detokenize_test.cc
@@ -34,7 +34,7 @@
return std::array<Case, sizeof...(Args)>{args...};
}
-// Database with the following entries:
+// Database with the following entries and arbitrary token values:
// {
// 0x00000001: "One",
// 0x00000005: "TWO",
@@ -44,18 +44,20 @@
// }
constexpr char kTestDatabase[] =
"TOKENS\0\0"
- "\x05\x00\x00\x00"
+ "\x06\x00\x00\x00" // Number of tokens in this database.
"\0\0\0\0"
"\x01\x00\x00\x00----"
"\x05\x00\x00\x00----"
"\xFF\x00\x00\x00----"
"\xFF\xEE\xEE\xDD----"
"\xEE\xEE\xEE\xEE----"
+ "\x9D\xA7\x97\xF8----"
"One\0"
"TWO\0"
"333\0"
"FOUR\0"
- "$AQAAAA==";
+ "$AQAAAA==\0"
+ "■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
class Detokenize : public ::testing::Test {
protected:
@@ -166,6 +168,33 @@
}
}
+TEST_F(Detokenize, OptionallyTokenizedData) {
+ for (auto [data, expected] : TestCases(
+ Case{ONE, "One"},
+ Case{"\1\0\0\0", "One"},
+ Case{TWO, "TWO"},
+ Case{THREE, "333"},
+ Case{FOUR, "FOUR"},
+ Case{FOUR ONE ONE, "FOUROneOne"},
+ Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
+ Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
+ "One\r\nTWO\r\n333\r\nFOUR\r\n"},
+ Case{"123" FOUR, "123FOUR"},
+ Case{"123" FOUR ", 56", "123FOUR, 56"},
+ Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
+ Case{"$0" ONE, "$0One"},
+ Case{"$/+7u3Q=", "$/+7u3Q="}, // incomplete message (missing "=")
+ Case{"$123456==" FOUR, "$123456==FOUR"},
+ Case{NEST_ONE, "One"},
+ Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
+ Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
+ Case{"$naeX+A==",
+ "■msg♦This is One message■module♦■file♦file.txt"})) {
+ EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
+ std::string(expected));
+ }
+}
+
constexpr char kDataWithArguments[] =
"TOKENS\0\0"
"\x09\x00\x00\x00"
diff --git a/pw_tokenizer/public/pw_tokenizer/detokenize.h b/pw_tokenizer/public/pw_tokenizer/detokenize.h
index ac936fa..3938d89 100644
--- a/pw_tokenizer/public/pw_tokenizer/detokenize.h
+++ b/pw_tokenizer/public/pw_tokenizer/detokenize.h
@@ -147,6 +147,22 @@
return DetokenizeText(text, 1);
}
  /// Decodes data that may or may not be tokenized, such as proto fields
  /// marked as optionally tokenized.
  ///
  /// This function currently only supports Base64 nested tokenized messages.
  /// Support for hexadecimal-encoded string literals will be added.
  ///
  /// This function currently assumes that when data is not tokenized it is
  /// printable ASCII. Otherwise, the returned string will be the input
  /// prefixed-Base64 encoded.
  ///
  /// @param[in] optionally_tokenized_data Binary data that may contain a
  ///     tokenized message, plain text, or nested prefixed-Base64 tokens.
  ///
  /// @returns The decoded text if successfully detokenized or if the data is
  /// printable; otherwise returns the data base64-encoded.
  std::string DecodeOptionallyTokenizedData(
      const span<const std::byte>& optionally_tokenized_data);
+
private:
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
};