blob: 518a7281c5458abc39531185960550901badc9f8 [file] [edit]
// Copyright 2022 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
/** Decodes and detokenizes strings from binary or Base64 input. */
import { Buffer } from 'buffer';
import { Frame } from 'pigweedjs/pw_hdlc';
import { TokenDatabase } from './token_database';
import { PrintfDecoder } from './printf_decoder';
/** Default depth limit used by the detokenize entry points in this file. */
const MAX_RECURSIONS = 9;

/** Character class for one Base64 character (standard and URL-safe alphabets). */
const BASE64CHARS = '[A-Za-z0-9+\\-\\/_]';

/**
 * Global pattern for a prefixed Base64 token:
 *   1. the `$` prefix,
 *   2. zero or more complete blocks of four Base64 characters,
 *   3. optionally a final block padded with one or two `=`.
 */
const PATTERN = new RegExp(
  [
    '\\$',
    `(?:${BASE64CHARS}{4})*`,
    `(?:${BASE64CHARS}{3}=|${BASE64CHARS}{2}==)?`,
  ].join(''),
  'g',
);

/** Named group for a token written as exactly ten decimal digits. */
const BASE10_TOKEN_REGEX = '(?<base10>[0-9]{10})';

/** Named group for a token written as exactly eight hexadecimal digits. */
const BASE16_TOKEN_REGEX = '(?<base16>[A-Fa-f0-9]{8})';

/** Named group for a Base64 payload (same body as `PATTERN`, without the prefix). */
const BASE64_TOKEN_REGEX =
  '(?<base64>' +
  '(?:' + BASE64CHARS + '{4})*' +
  '(?:' + BASE64CHARS + '{3}=|' + BASE64CHARS + '{2}==)?' +
  ')';

/** Marker that terminates an optional base specifier, e.g. `16#`. */
const NESTED_TOKEN_BASE_PREFIX = '#';

/** Opening delimiter of an optional domain specifier. */
const NESTED_DOMAIN_START_PREFIX = '{';

/** Closing delimiter of an optional domain specifier. */
const NESTED_DOMAIN_END_PREFIX = '}';

/** Character that introduces a nested token in text. */
const NESTED_TOKEN_PREFIX = '$';

/** Token payload alternatives, in the order the regex tries them. */
const NESTED_TOKEN_FORMATS = [
  BASE10_TOKEN_REGEX,
  BASE16_TOKEN_REGEX,
  BASE64_TOKEN_REGEX,
];
/**
 * Builds the global regular expression that locates nested tokens in text.
 *
 * A match consists of, in order:
 *   - the literal `prefix`;
 *   - an optional domain specifier in braces (named groups `domainspec` and
 *     `domain`);
 *   - an optional base specifier: optional digits followed by the base marker
 *     (named groups `basespec` and `base`);
 *   - one of the payload formats from `NESTED_TOKEN_FORMATS` (named groups
 *     `base10`, `base16` or `base64`).
 *
 * @param prefix Literal text that introduces a token; regex metacharacters in
 *     it are escaped.
 * @returns A new `RegExp` with the `g` flag.
 */
function tokenRegex(prefix: string): RegExp {
  // Every delimiter is matched literally. Escaping all of them (not only the
  // prefix) avoids relying on lenient parsing of bare `{` / `}`.
  const escapeLiteral = (text: string): string =>
    text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Optional; no domain specifier means the empty domain.
  const domainSpec =
    '(?<domainspec>(' +
    escapeLiteral(NESTED_DOMAIN_START_PREFIX) +
    '(?<domain>\\s*|\\s*[a-zA-Z_:][a-zA-Z0-9_:\\s]*)' +
    escapeLiteral(NESTED_DOMAIN_END_PREFIX) +
    '))?';

  // Optional; the digits before the base marker may themselves be omitted.
  const baseSpec =
    '(?<basespec>(?<base>[0-9]*)?' +
    escapeLiteral(NESTED_TOKEN_BASE_PREFIX) +
    ')?';

  // Exactly one of the supported payload formats.
  const payload = '(' + NESTED_TOKEN_FORMATS.join('|') + ')';

  return new RegExp(escapeLiteral(prefix) + domainSpec + baseSpec + payload, 'g');
}
/** Result of splitting an encoded message into its token and argument bytes. */
interface TokenAndArgs {
// Token value; the decoders in this file read it as a little-endian uint32
// from the first four bytes of the message.
token: number;
// Bytes following the token; handed to PrintfDecoder as the encoded arguments.
args: Uint8Array;
}
/**
 * Turns tokenized messages back into text by looking tokens up in a
 * `TokenDatabase` and expanding printf-style arguments with `PrintfDecoder`.
 *
 * Three input forms are handled: binary frames, text containing prefixed
 * Base64 tokens, and text containing nested tokens with optional domain and
 * base specifiers.
 */
export class Detokenizer {
  private database: TokenDatabase;
  private _token_regex: RegExp;
  private prefix: string;

  /**
   * @param csvDatabase CSV text handed to the `TokenDatabase` constructor.
   */
  constructor(csvDatabase: string) {
    // The prefix must be set before the token regex is built from it.
    this.prefix = NESTED_TOKEN_PREFIX;
    this.database = new TokenDatabase(csvDatabase);
    this._token_regex = tokenRegex(this.prefix);
  }

  /**
   * Detokenize frame data into actual string messages using the provided
   * token database.
   *
   * If the frame doesn't match any token from database, the frame will be
   * returned as string as is.
   *
   * @param tokenizedFrame Frame whose `data` holds the encoded message.
   * @param domain Token domain used for the database lookup.
   * @param maxRecursion Depth limit for resolving nested tokens.
   */
  detokenize(
    tokenizedFrame: Frame,
    domain = '',
    maxRecursion: number = MAX_RECURSIONS,
  ): string {
    return this.detokenizeUint8Array(tokenizedFrame.data, domain, maxRecursion);
  }

  /**
   * Detokenize uint8 into actual string messages using the provided
   * token database.
   *
   * If the data doesn't match any token from database, the data will be
   * returned as string as is.
   *
   * @param data Encoded message: either text containing prefixed Base64
   *     tokens, or a little-endian uint32 token followed by argument bytes.
   * @param domain Token domain used for the database lookup.
   * @param recursion Depth limit for resolving nested tokens.
   */
  detokenizeUint8Array(
    data: Uint8Array,
    domain = '',
    recursion: number = MAX_RECURSIONS,
  ): string {
    const decodedString = new TextDecoder().decode(data);
    // `search` always scans from index 0 and leaves `lastIndex` untouched, so
    // the shared global regex carries no state between calls.
    if (decodedString.search(PATTERN) !== -1) {
      return this.detokenizeBase64String(decodedString, domain, recursion);
    }
    // An empty frame carries no token at all.
    if (data.length === 0) {
      return decodedString;
    }
    const { token, args } = this.decodeUint8Array(data);
    const formatString = this.database.get(token, domain);
    if (!formatString) {
      return decodedString;
    }
    // Expand the printf arguments first, then resolve any nested tokens that
    // appear in the expanded text.
    const decoded = new PrintfDecoder().decode(String(formatString), args);
    return recursion > 0 ? this.detokenizeText(decoded, recursion) : decoded;
  }

  /**
   * Replaces every prefixed Base64 token in `base64String` with its decoded
   * message; tokens not found in the database are left as they are.
   *
   * @param base64String Text that may contain prefixed Base64 tokens.
   * @param domain Token domain used for the database lookup.
   * @param recursions How many further levels of nested tokens to resolve.
   */
  public detokenizeBase64String(
    base64String: string,
    domain: string,
    recursions: number,
  ): string {
    return base64String.replace(PATTERN, (base64Substring) => {
      const { token, args } = this.decodeBase64TokenFrame(base64Substring);
      const format = this.database.get(token, domain);
      if (!format) {
        return base64Substring;
      }
      // Parse arguments if this is printf-style text.
      const decodedOriginal = new PrintfDecoder().decode(String(format), args);
      if (recursions <= 0) {
        return decodedOriginal;
      }
      // Detokenize nested Base64 tokens and their arguments, using the domain
      // named by this token (if any) for the nested lookups.
      return this.detokenizeBase64String(
        decodedOriginal,
        this.getDomain(base64Substring),
        recursions - 1,
      );
    });
  }

  /**
   * Splits an encoded message into its little-endian uint32 token and the
   * argument bytes that follow it.
   *
   * Messages shorter than four bytes are zero-padded before the token is
   * read, matching `decodeBase64TokenFrame`.
   */
  private decodeUint8Array(data: Uint8Array): TokenAndArgs {
    let bytes = data;
    if (bytes.length < 4) {
      const padded = new Uint8Array(4);
      padded.set(bytes);
      bytes = padded;
    }
    const token = new DataView(bytes.buffer, bytes.byteOffset, 4).getUint32(
      0,
      true,
    );
    // `slice` copies only the bytes inside this view, so data that happens to
    // follow the view in a shared backing buffer is never treated as args.
    const args = bytes.slice(4);
    return { token, args };
  }

  /**
   * Decodes a prefixed Base64 token (for example `$AQAAAA==`) into its token
   * and argument bytes.
   */
  private decodeBase64TokenFrame(base64Data: string): TokenAndArgs {
    // Remove the prefix and convert from Base64.
    const payload = base64Data.slice(this.prefix.length);
    // Copy into a plain Uint8Array so later slicing has typed-array (copying)
    // semantics rather than Buffer's view semantics.
    const bytes = new Uint8Array(Buffer.from(payload, 'base64'));
    return this.decodeUint8Array(bytes);
  }

  /**
   * Replacer for `_token_regex`: resolves one matched nested token.
   *
   * @param match The full matched text.
   * @param args Remaining replacer arguments; the last one is the
   *     named-groups object.
   * @returns The detokenized text, or `match` unchanged when the token is
   *     unknown.
   */
  private detokenizeScan(match: string, ...args: unknown[]): string {
    const groups = (args.pop() ?? {}) as Record<string, string | undefined>;
    const { basespec, base, base10, base16, base64 } = groups;
    // Whitespace inside the braces is not part of the domain name.
    const domain = (groups.domain ?? '').replace(/\s/g, '');

    // No base specifier defaults to Base64; an explicit `64#` selects it too,
    // even when the payload also happens to look like hex or decimal digits.
    if (base64 || basespec == null || base === '64') {
      // Decode only the captured payload. The raw match may also contain the
      // domain and base specifiers, whose characters would corrupt the bytes.
      const payload = base64 ?? base16 ?? base10 ?? '';
      const { token, args: tokenArgs } = this.decodeBase64TokenFrame(
        this.prefix + payload,
      );
      const format = this.database.get(token, domain);
      return format
        ? new PrintfDecoder().decode(String(format), tokenArgs)
        : match;
    }

    // Hash (#) with no number specified defaults to Base-16.
    const tokenString = base10 || base16 || '';
    return this.detokenizeOnce(tokenString, base || '16', domain, match);
  }

  /**
   * Looks up a token written as digits in the given base.
   *
   * @param tokenString Digits of the token.
   * @param base Radix of `tokenString`, as decimal text.
   * @param domain Token domain used for the database lookup.
   * @param match Original matched text, returned when the lookup fails.
   */
  private detokenizeOnce(
    tokenString: string,
    base: string,
    domain: string,
    match: string,
  ): string {
    if (!this.database) {
      // Without a database, re-emit the token in its textual form.
      const domainSpec =
        domain.length > 0
          ? NESTED_DOMAIN_START_PREFIX + domain + NESTED_DOMAIN_END_PREFIX
          : '';
      const baseSpec = base.length > 0 ? base + NESTED_TOKEN_BASE_PREFIX : '';
      return `${this.prefix}${domainSpec}${baseSpec}${tokenString}`;
    }
    const token = parseInt(tokenString, parseInt(base, 10));
    // An unsupported radix or unparsable digits cannot name a token.
    if (Number.isNaN(token)) {
      return match;
    }
    const entries = this.database.get(token, domain);
    return entries ? String(entries) : match;
  }

  /**
   * Resolves nested tokens in `message`, repeating until the text stops
   * changing or the recursion budget is spent.
   *
   * @param message Text that may contain nested tokens.
   * @param recursion Depth limit; at most `recursion - 1` replacement passes
   *     are performed.
   */
  public detokenizeText(
    message: string,
    recursion: number = MAX_RECURSIONS,
  ): string {
    if (!this.database) {
      return message;
    }
    let result = message;
    for (let pass = 0; pass < recursion - 1; pass++) {
      const next = result.replace(this._token_regex, (match, ...args) =>
        this.detokenizeScan(match, ...args),
      );
      // Nothing left to expand.
      if (next === result) {
        return result;
      }
      result = next;
    }
    return result;
  }

  /**
   * Returns the domain named by the first token in `base64String`, with
   * whitespace removed, or `''` when there is no token or no domain
   * specifier.
   */
  private getDomain(base64String: string): string {
    // A non-global copy always scans from the start and keeps the shared
    // regex free of `lastIndex` state.
    const found = new RegExp(this._token_regex.source).exec(base64String);
    return (found?.groups?.domain ?? '').replace(/\s/g, '');
  }
}