// blob: 5698eab09aa3686a12dc7ab0cf941721b6380a34 [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "google/protobuf/json/internal/lexer.h"
#include <sys/types.h>
#include <atomic>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <ostream>
#include <string>
#include <utility>
#include "google/protobuf/stubs/logging.h"
#include "google/protobuf/stubs/common.h"
#include "absl/algorithm/container.h"
#include "absl/numeric/bits.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/ascii.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "google/protobuf/stubs/status_macros.h"
// Must be included last.
#include "google/protobuf/port_def.inc"
namespace google {
namespace protobuf {
namespace json_internal {
namespace {
// Appends `to_obfuscate` to `out`, randomly following some of its space
// characters with one or two additional spaces.
//
// This utility is intended to make error messages hostile to machine
// interpretation as a Hyrum's Law countermeasure, without potentially confusing
// human readers.
//
// Only ' ' characters are ever padded; every other character is copied
// through unchanged, and nothing already in `out` is modified.
void HardenAgainstHyrumsLaw(absl::string_view to_obfuscate, std::string& out) {
  // Get some simple randomness from ASLR, which is enabled in most
  // environments. Our goal is to be annoying, not secure.
  static const void* const kAslrSeed = &kAslrSeed;
  // Per-call randomness from a relaxed atomic.
  static std::atomic<uintptr_t> kCounterSeed{0};

  // Multiplier and increment of the linear congruential step used by `rng`.
  constexpr uint64_t kA = 0x5851f42d4c957f2dull;
  constexpr uint64_t kB = 0x14057b7ef767814full;

  uint64_t state = absl::bit_cast<uintptr_t>(kAslrSeed) + kB +
                   kCounterSeed.fetch_add(1, std::memory_order_relaxed);
  // Advances `state` and returns 32 pseudo-random bits derived from it
  // (xorshift of the high bits, then a rotation chosen by the top 5 bits).
  auto rng = [&state, &kA, &kB] {
    state = state * kA + kB;
    return absl::rotr(static_cast<uint32_t>(((state >> 18) ^ state) >> 27),
                      static_cast<int>(state >> 59));
  };
  (void)rng();  // Advance state once.

  // reserve() takes the *total* capacity, so account for whatever `out`
  // already contains in addition to the text and an estimate of one extra
  // space per existing space.
  out.reserve(out.size() + to_obfuscate.size() +
              absl::c_count(to_obfuscate, ' '));

  for (char c : to_obfuscate) {
    out.push_back(c);
    // Only spaces are candidates, and only about one in three gets padded.
    // rng() is deliberately not consulted for non-space characters.
    if (c != ' ' || rng() % 3 != 0) {
      continue;
    }
    // Add either one or two extra spaces.
    const size_t extra = rng() % 2 + 1;
    out.append(extra, ' ');
  }
}
} // namespace
// Out-of-line definition of the static constexpr data member. Before C++17
// such members are not implicitly inline, so odr-using one requires a
// namespace-scope definition like this.
constexpr size_t ParseOptions::kDefaultDepth;
// Builds an InvalidArgument status describing a JSON error at this location.
//
// The message always starts with the literal text "invalid JSON"; everything
// after that prefix (path, line/column/offset and `message`) is passed through
// HardenAgainstHyrumsLaw() before being appended. `sl` is not used here.
absl::Status JsonLocation::Invalid(absl::string_view message,
                                   SourceLocation sl) const {
  // Everything that goes into `details` is subject to whitespace hardening.
  // Building it separately costs an extra copy of the error text, which is
  // acceptable because this is the cold, "unhappy" path.
  std::string details;
  if (path != nullptr) {
    details += " in ";
    path->Describe(details);
    details += ',';
  }
  absl::StrAppendFormat(&details, " near %zu:%zu (offset %zu): %s", line + 1,
                        col + 1, offset, message);

  // NOTE: the "invalid JSON" prefix is intentionally left un-hardened so that
  // people have a hope of grepping for it in logs. That part is easy to
  // commit to, as stability goes.
  std::string full_message = "invalid JSON";
  HardenAgainstHyrumsLaw(details, full_message);
  return absl::InvalidArgumentError(std::move(full_message));
}
// Skips to the next token and classifies it by its first character, without
// consuming it. Returns an error if skipping fails or if the character cannot
// start any JSON value.
absl::StatusOr<JsonLexer::Kind> JsonLexer::PeekKind() {
  RETURN_IF_ERROR(SkipToToken());

  const char next = stream_.PeekChar();

  // Numbers start with a minus sign or a decimal digit.
  if (next == '-' || absl::ascii_isdigit(next)) {
    return JsonLexer::kNum;
  }
  // Strings may be opened by either quote style; whether single quotes are
  // actually acceptable is not decided here.
  if (next == '"' || next == '\'') {
    return JsonLexer::kStr;
  }

  switch (next) {
    case '{':
      return JsonLexer::kObj;
    case '[':
      return JsonLexer::kArr;
    case 't':
      return JsonLexer::kTrue;
    case 'f':
      return JsonLexer::kFalse;
    case 'n':
      return JsonLexer::kNull;
    default:
      break;
  }
  return Invalid(absl::StrFormat("unexpected character: '%c'", next));
}
// Consumes and discards the next JSON value, recursing into objects and
// arrays. Returns the first error encountered, if any.
absl::Status JsonLexer::SkipValue() {
  absl::StatusOr<Kind> peeked = PeekKind();
  RETURN_IF_ERROR(peeked.status());

  const Kind kind = *peeked;
  if (kind == JsonLexer::kObj) {
    // Keys are ignored; only the member values need to be skipped.
    return VisitObject(
        [this](LocationWith<MaybeOwnedString>&) { return SkipValue(); });
  }
  if (kind == JsonLexer::kArr) {
    return VisitArray([this] { return SkipValue(); });
  }
  if (kind == JsonLexer::kStr) {
    return ParseUtf8().status();
  }
  if (kind == JsonLexer::kNum) {
    return ParseNumber().status();
  }
  if (kind == JsonLexer::kTrue) {
    return Expect("true");
  }
  if (kind == JsonLexer::kFalse) {
    return Expect("false");
  }
  if (kind == JsonLexer::kNull) {
    return Expect("null");
  }

  // Every kind handled above returns, so reaching this point means `kind`
  // held some other value. The trailing return keeps compilers that do not
  // treat the check as a terminator from complaining about a missing return.
  GOOGLE_CHECK(false) << "unreachable";
  return absl::OkStatus();
}
// Takes four characters from the input and interprets them as a big-endian
// hexadecimal number (either letter case is accepted). Returns an error if
// taking the characters fails or any of them is not a hex digit.
absl::StatusOr<uint16_t> JsonLexer::ParseU16HexCodepoint() {
  absl::StatusOr<LocationWith<MaybeOwnedString>> digits = Take(4);
  RETURN_IF_ERROR(digits.status());

  uint16_t codepoint = 0;
  for (const char ch : digits->value.AsView()) {
    int nibble = 0;
    if ('0' <= ch && ch <= '9') {
      nibble = ch - '0';
    } else if ('a' <= ch && ch <= 'f') {
      nibble = ch - 'a' + 10;
    } else if ('A' <= ch && ch <= 'F') {
      nibble = ch - 'A' + 10;
    } else {
      return Invalid("invalid Unicode escape");
    }
    // Shift the accumulated value up by one hex digit and add the new one.
    codepoint = static_cast<uint16_t>((codepoint << 4) | nibble);
  }
  return codepoint;
}
// Advances past any run of newline, carriage return, tab and space
// characters, stopping at the first other character. Each newline bumps the
// tracked line number and resets the column. Fails if the stream cannot
// buffer another character or cannot advance.
absl::Status JsonLexer::SkipToToken() {
  for (;;) {
    RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());

    const char next = stream_.PeekChar();
    if (next == '\n') {
      // Advance first, then fix up the position so the following character
      // is reported at the start of the new line.
      RETURN_IF_ERROR(Advance(1));
      ++json_loc_.line;
      json_loc_.col = 0;
    } else if (next == '\r' || next == '\t' || next == ' ') {
      RETURN_IF_ERROR(Advance(1));
    } else {
      return absl::OkStatus();
    }
  }
}
// Lexes a number and returns its source text (not its value) along with its
// location.
//
// The text is rejected if it is empty or a lone "-", has an extraneous
// leading zero, ends in a period, does not parse as a finite double, or is
// followed (after whitespace) by anything other than `,`, `]`, `}` or EOF.
absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseRawNumber() {
  RETURN_IF_ERROR(SkipToToken());

  enum { kInt, kFraction, kExponent } phase = kInt;
  char last_seen = 0;
  auto number = TakeWhile([phase, last_seen](size_t, char ch) mutable -> bool {
    const char before = last_seen;
    last_seen = ch;

    // Digits are always part of the number, whatever part we are in.
    if (absl::ascii_isdigit(ch)) {
      return true;
    }

    // The remaining checks handle transitions between the integer,
    // fractional, and exponent parts of a number. Scanning stops at the first
    // syntax error; because every number must be followed by `,`, `]`, or
    // `}`, the trailing check below catches what is left behind.
    const bool after_digit = absl::ascii_isdigit(before);
    if (phase == kInt && ch == '-') {
      return !after_digit;
    }
    if (phase == kInt && after_digit && ch == '.') {
      phase = kFraction;
      return true;
    }
    if (phase != kExponent && after_digit && (ch == 'e' || ch == 'E')) {
      phase = kExponent;
      return true;
    }
    // A sign is accepted directly after the exponent marker.
    const bool after_exponent_marker = before == 'e' || before == 'E';
    return after_exponent_marker && (ch == '-' || ch == '+');
  });
  RETURN_IF_ERROR(number.status());

  const absl::string_view text = number->value.AsView();
  if (text.empty() || text == "-") {
    return number->loc.Invalid("expected a number");
  }

  // Look at the magnitude only, ignoring a leading minus sign.
  absl::string_view magnitude = text;
  if (magnitude[0] == '-') {
    magnitude.remove_prefix(1);
  }
  if (magnitude.size() > 1 && magnitude[0] == '0' &&
      absl::ascii_isdigit(magnitude[1])) {
    return number->loc.Invalid("number cannot have extraneous leading zero");
  }

  if (text.back() == '.') {
    return number->loc.Invalid("number cannot have trailing period");
  }

  // The parsed value itself is discarded; this only validates the text.
  double parsed;
  if (!absl::SimpleAtod(text, &parsed) || !std::isfinite(parsed)) {
    return number->loc.Invalid(absl::StrFormat("invalid number: '%s'", text));
  }

  // Find the next token, to make sure we didn't leave something behind we
  // shouldn't have.
  if (!stream_.AtEof()) {
    RETURN_IF_ERROR(SkipToToken());
    const char next = stream_.PeekChar();
    const bool is_terminator = next == ',' || next == ']' || next == '}';
    if (!is_terminator) {
      return Invalid(absl::StrFormat("unexpected character: '%c'", next));
    }
  }
  return number;
}
// Lexes a number via ParseRawNumber() and converts its text to a double.
// Returns the value with the number's location, or an error if lexing fails
// or the text does not convert to a finite double.
absl::StatusOr<LocationWith<double>> JsonLexer::ParseNumber() {
  absl::StatusOr<LocationWith<MaybeOwnedString>> raw = ParseRawNumber();
  RETURN_IF_ERROR(raw.status());

  const absl::string_view text = raw->value.AsView();
  double value;
  const bool converted = absl::SimpleAtod(text, &value);
  // `value` is only inspected when the conversion reported success.
  if (!converted || !std::isfinite(value)) {
    return raw->loc.Invalid(absl::StrFormat("invalid number: '%s'", text));
  }
  return LocationWith<double>{value, raw->loc};
}
// Parses the hex digits of a Unicode escape (the caller has already consumed
// the backslash and the `u`) and writes the decoded code point to `out_utf8`
// as UTF-8.
//
// If the first 16-bit unit is a high surrogate, a second `\uXXXX` escape
// holding a low surrogate must follow immediately; the pair is combined into
// a single code point.
//
// `out_utf8` must have room for four bytes. Returns the number of bytes
// written (1 through 4). Returns an error if the hex digits are malformed, a
// high surrogate is not followed by a `\u` low surrogate, or a low surrogate
// appears on its own.
absl::StatusOr<size_t> JsonLexer::ParseUnicodeEscape(char out_utf8[4]) {
  auto hex = ParseU16HexCodepoint();
  RETURN_IF_ERROR(hex.status());

  uint32_t rune = *hex;
  if (rune >= 0xd800 && rune <= 0xdbff) {
    // Surrogate pair: two 16-bit codepoints become a 32-bit codepoint.
    const uint32_t high = rune;
    RETURN_IF_ERROR(Expect("\\u"));
    auto low_hex = ParseU16HexCodepoint();
    RETURN_IF_ERROR(low_hex.status());
    const uint32_t low = *low_hex;
    if (low < 0xdc00 || low > 0xdfff) {
      return Invalid("invalid low surrogate");
    }

    // Each surrogate contributes 10 bits; the result is offset by 0x10000.
    rune = (((high & 0x3ff) << 10) | (low & 0x3ff)) + 0x10000;
  } else if (rune >= 0xdc00 && rune <= 0xdfff) {
    return Invalid("unpaired low surrogate");
  }

  // Write as UTF-8.
  if (rune <= 0x7f) {
    out_utf8[0] = static_cast<char>(rune);
    return 1;
  } else if (rune <= 0x07ff) {
    out_utf8[0] = static_cast<char>(((rune >> 6) & 0x1f) | 0xc0);
    out_utf8[1] = static_cast<char>(((rune >> 0) & 0x3f) | 0x80);
    return 2;
  } else if (rune <= 0xffff) {
    out_utf8[0] = static_cast<char>(((rune >> 12) & 0x0f) | 0xe0);
    out_utf8[1] = static_cast<char>(((rune >> 6) & 0x3f) | 0x80);
    out_utf8[2] = static_cast<char>(((rune >> 0) & 0x3f) | 0x80);
    return 3;
  } else if (rune <= 0x10ffff) {
    // U+10FFFF is the largest code point and is what the surrogate pair
    // \udbff\udfff decodes to, so the bound must be inclusive.
    out_utf8[0] = static_cast<char>(((rune >> 18) & 0x07) | 0xF0);
    out_utf8[1] = static_cast<char>(((rune >> 12) & 0x3f) | 0x80);
    out_utf8[2] = static_cast<char>(((rune >> 6) & 0x3f) | 0x80);
    out_utf8[3] = static_cast<char>(((rune >> 0) & 0x3f) | 0x80);
    return 4;
  } else {
    return Invalid("invalid codepoint");
  }
}
// Maps the character following a backslash to the character it denotes.
//
// Returns 0 when `c` is not a recognized single-character escape. The `\'`
// escape is recognized only when `allow_legacy_syntax` is true.
static char ParseSimpleEscape(char c, bool allow_legacy_syntax) {
  // Escapes that simply stand for themselves.
  if (c == '"' || c == '\\' || c == '/') {
    return c;
  }
  // Legacy-only escape for the single quote.
  if (c == '\'') {
    return allow_legacy_syntax ? '\'' : '\0';
  }

  // Escapes that name a control character.
  switch (c) {
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    default:
      return '\0';
  }
}
// Parses a quoted string at the next token and returns its contents (without
// the surrounding quotes) paired with the location of the opening quote.
//
// Backslash escapes are decoded, and unescaped bytes are checked to follow the
// UTF-8 lead-byte/continuation-byte structure. Until the first escape is seen
// the result is taken from the mark placed just after the opening quote; once
// an escape is seen the contents are accumulated in `on_heap` instead.
//
// Returns an error for a single-quoted string when legacy syntax is disabled,
// an unknown escape, a control character (outside legacy mode), structurally
// invalid UTF-8, or when the stream cannot supply more characters.
absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
RETURN_IF_ERROR(SkipToToken());
// This is a non-standard extension accepted by the ESF parser that we will
// need to accept for backwards-compat.
bool is_single_quote = stream_.PeekChar() == '\'';
if (!options_.allow_legacy_syntax && is_single_quote) {
return Invalid("expected '\"'");
}
// Remember where the string starts (at its opening quote) for the result.
JsonLocation loc = json_loc_;
RETURN_IF_ERROR(Expect(is_single_quote ? "'" : "\""));
// on_heap is empty if we do not need to heap-allocate the string.
std::string on_heap;
LocationWith<Mark> mark = BeginMark();
while (true) {
RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
char c = stream_.PeekChar();
RETURN_IF_ERROR(Advance(1));
switch (c) {
case '"':
case '\'': {
// Only the quote character that opened the string closes it; the other
// kind of quote is treated as ordinary string content.
if (c != (is_single_quote ? '\'' : '"')) {
goto normal_character;
}
if (!on_heap.empty()) {
return LocationWith<MaybeOwnedString>{
MaybeOwnedString(std::move(on_heap)), loc};
}
// NOTE: the 1 below clips off the " from the end of the string.
return LocationWith<MaybeOwnedString>{mark.value.UpToUnread(1), loc};
}
case '\\': {
if (on_heap.empty()) {
// The 1 skips over the `\`.
on_heap = std::string(mark.value.UpToUnread(1).AsView());
// Clang-tidy incorrectly notes this as being moved-from multiple
// times, but it can only occur in one loop iteration. The mark is
// destroyed only if we need to handle an escape when on_heap is
// empty. Because this branch unconditionally pushes to on_heap, this
// condition can never be reached in any iteration that follows it.
// Thus, at most one move ever actually occurs.
std::move(mark).value.Discard();
}
// Read the character that selects the kind of escape.
RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
char c = stream_.PeekChar();
RETURN_IF_ERROR(Advance(1));
if (c == 'u' || (c == 'U' && options_.allow_legacy_syntax)) {
// Ensure there is actual space to scribble the UTF-8 onto.
on_heap.resize(on_heap.size() + 4);
auto written = ParseUnicodeEscape(&on_heap[on_heap.size() - 4]);
RETURN_IF_ERROR(written.status());
// Trim the scratch space down to the bytes that were actually written.
on_heap.resize(on_heap.size() - 4 + *written);
} else {
char escape = ParseSimpleEscape(c, options_.allow_legacy_syntax);
if (escape == 0) {
return Invalid(absl::StrFormat("invalid escape char: '%c'", c));
}
on_heap.push_back(escape);
}
break;
}
normal_character:
default: {
uint8_t uc = static_cast<uint8_t>(c);
// If people have newlines in their strings, that's their problem; it
// is too difficult to support correctly in our location tracking, and
// is out of spec, so users will get slightly wrong locations in errors.
if ((uc < 0x20 || uc == 0xff) && !options_.allow_legacy_syntax) {
return Invalid(absl::StrFormat(
"invalid control character 0x%02x in string", uc));
}
// Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
// one of the following (big-endian) patterns:
//
// 0b0xxxxxxx
// 0b110xxxxx'10xxxxxx
// 0b1110xxxx'10xxxxxx'10xxxxxx
// 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
//
// We don't need to decode it; just validate it.
//
// Only the byte structure is checked here: the number of leading one bits
// in the first byte selects how many continuation bytes must follow. The
// encoded value itself is not inspected.
size_t lookahead = 0;
switch (absl::countl_one(uc)) {
case 0:
break;
case 2:
lookahead = 1;
break;
case 3:
lookahead = 2;
break;
case 4:
lookahead = 3;
break;
default:
// A lone continuation byte (one leading one bit) or a lead byte with
// five or more leading one bits.
return Invalid("invalid UTF-8 in string");
}
// Bytes are copied only once the string has moved to on_heap; otherwise
// the mark still covers them.
if (!on_heap.empty()) {
on_heap.push_back(c);
}
for (int i = 0; i < lookahead; ++i) {
RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
uint8_t uc = static_cast<uint8_t>(stream_.PeekChar());
// Continuation bytes must have the form 0b10xxxxxx.
if ((uc >> 6) != 2) {
return Invalid("invalid UTF-8 in string");
}
if (!on_heap.empty()) {
on_heap.push_back(stream_.PeekChar());
}
RETURN_IF_ERROR(Advance(1));
}
break;
}
}
}
// Not reached: the loop above has no exit other than its return statements.
return Invalid("EOF inside string");
}
// Lexes an unquoted identifier made of ASCII letters, digits and underscores.
//
// Fails if nothing matches, if the word begins with a digit, or if the word
// is exactly one of the JSON literals `null`, `true` or `false`.
absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseBareWord() {
  RETURN_IF_ERROR(SkipToToken());

  auto word = TakeWhile([](size_t, char ch) -> bool {
    return absl::ascii_isalnum(ch) || ch == '_';
  });
  RETURN_IF_ERROR(word.status());

  const absl::string_view name = word->value.AsView();
  const bool starts_with_digit = !name.empty() && absl::ascii_isdigit(name[0]);
  const bool is_json_literal =
      name == "null" || name == "true" || name == "false";
  if (name.empty() || starts_with_digit || is_json_literal) {
    return word->loc.Invalid("expected bare word");
  }
  return word;
}
} // namespace json_internal
} // namespace protobuf
} // namespace google