| // Protocol Buffers - Google's data interchange format |
| // Copyright 2008 Google Inc. All rights reserved. |
| // https://developers.google.com/protocol-buffers/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // Internal JSON tokenization utilities; not public API. |
| #ifndef GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__ |
| #define GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__ |
| |
| #include <array> |
| #include <cfloat> |
| #include <cmath> |
| #include <cstdint> |
| #include <iostream> |
| #include <limits> |
| #include <ostream> |
| #include <string> |
| #include <utility> |
| |
| #include "google/protobuf/stubs/logging.h" |
| #include "google/protobuf/stubs/common.h" |
| #include "google/protobuf/descriptor.h" |
| #include "absl/status/status.h" |
| #include "absl/status/statusor.h" |
| #include "absl/strings/match.h" |
| #include "absl/strings/str_format.h" |
| #include "absl/strings/string_view.h" |
| #include "google/protobuf/io/zero_copy_stream.h" |
| #include "google/protobuf/json/internal/message_path.h" |
| #include "google/protobuf/json/internal/zero_copy_buffered_stream.h" |
| #include "google/protobuf/stubs/status_macros.h" |
| |
| |
| // Must be included last. |
| #include "google/protobuf/port_def.inc" |
| |
| namespace google { |
| namespace protobuf { |
| namespace json_internal { |
| // This is a duplicate of JsonParseOptions from json_util.h; it must be |
| // re-defined here so that :json_lexer does not need to depend on :json_util. |
| struct ParseOptions { |
| bool ignore_unknown_fields = false; |
| bool case_insensitive_enum_parsing = false; |
| |
| static constexpr size_t kDefaultDepth = 100; |
| |
| // The number of times we may recurse before bailing out on the grounds of |
| // avoiding pathological input. |
| int recursion_depth = kDefaultDepth; |
| |
| // The original parser used by json_util2 accepted a number of non-standard |
| // options. Setting this flag enables them. |
| // |
| // What those extensions were is explicitly not documented, beyond what exists |
| // in the unit tests; we intend to remove this setting eventually. See |
| // b/234868512. |
| bool allow_legacy_syntax = false; |
| }; |
| |
| // A position in JSON input, for error context. |
| struct JsonLocation { |
| // This type exists to work around an absl type that has not yet been |
| // released. |
| struct SourceLocation { |
| static SourceLocation current() { return {}; } |
| }; |
| |
| // Line and column are both zero-indexed in-memory. |
| size_t offset = 0; |
| size_t line = 0; |
| size_t col = 0; |
| const MessagePath* path = nullptr; |
| |
| // Creates an absl::InvalidArgumentError with line/column information. |
| absl::Status Invalid(absl::string_view message, |
| SourceLocation sl = SourceLocation::current()) const; |
| }; |
| |
| template <typename T> |
| struct LocationWith { |
| T value; |
| JsonLocation loc; |
| }; |
| |
| class JsonLexer { |
| public: |
| // A kind of token that PeekKind() can detect. |
| enum Kind { |
| kObj, |
| kArr, |
| kStr, |
| kNum, |
| kTrue, |
| kFalse, |
| kNull, |
| }; |
| |
| using SourceLocation = JsonLocation::SourceLocation; |
| |
| JsonLexer(io::ZeroCopyInputStream* stream, const ParseOptions& options, |
| MessagePath* path = nullptr, JsonLocation start = {}) |
| : stream_(stream), options_(options), json_loc_(start), path_(path) { |
| json_loc_.path = path_; |
| } |
| |
| const ParseOptions& options() const { return options_; } |
| |
| const MessagePath& path() const { return *path_; } |
| MessagePath& path() { return *path_; } |
| |
| // Creates an absl::InvalidArgumentError with line/column information. |
| absl::Status Invalid(absl::string_view message, |
| SourceLocation sl = SourceLocation::current()) { |
| return json_loc_.Invalid(message, sl); |
| } |
| |
| // Expects the next bytes to be parsed (after consuming whitespace) to be |
| // exactly `literal`. If they are, consumes them; otherwise returns an error. |
| absl::Status Expect(absl::string_view literal, |
| SourceLocation sl = SourceLocation::current()) { |
| RETURN_IF_ERROR(SkipToToken()); |
| auto buffering = stream_.BufferAtLeast(literal.size()); |
| RETURN_IF_ERROR(buffering.status()); |
| |
| if (!absl::StartsWith(stream_.Unread(), literal)) { |
| return Invalid( |
| absl::StrFormat("unexpected character: '%c'; expected '%s'", |
| stream_.PeekChar(), literal), |
| sl); |
| } |
| |
| return Advance(literal.size()); |
| } |
| |
| // Like Expect(), but returns a boolean. This makes it clear that the |
| // lookahead is failible. |
| bool Peek(absl::string_view literal) { |
| // Suppress the error; this can only fail on EOF in which case we would |
| // return false regardless. |
| (void)SkipToToken(); |
| auto ignored = stream_.BufferAtLeast(literal.size()); |
| if (!absl::StartsWith(stream_.Unread(), literal)) { |
| return false; |
| } |
| |
| // We just ensured we had enough buffered so we can suppress this error. |
| (void)Advance(literal.size()); |
| return true; |
| } |
| |
| // Like Peek(string), but returns true if and only if a token of the given |
| // kind can be lexed next. Returns false on EOF, just like Peek(string). |
| bool Peek(Kind needle) { |
| auto kind = PeekKind(); |
| return kind.ok() && *kind == needle; |
| } |
| |
| // Consumes all whitespace and other ignored characters until the next |
| // token. |
| // |
| // This function returns an error on EOF, so PeekChar() can be safely |
| // called if it returns ok. |
| absl::Status SkipToToken(); |
| |
| // Returns which kind of value token (i.e., something that can occur after |
| // a `:`) is next up to be parsed. |
| absl::StatusOr<Kind> PeekKind(); |
| |
| // Parses a JSON number. |
| absl::StatusOr<LocationWith<double>> ParseNumber(); |
| |
| // Parses a number as a string, without turning it into an integer. |
| absl::StatusOr<LocationWith<MaybeOwnedString>> ParseRawNumber(); |
| |
| // Parses a UTF-8 string. If the contents of the string happen to actually be |
| // UTF-8, it will return a zero-copy view; otherwise it will allocate. |
| absl::StatusOr<LocationWith<MaybeOwnedString>> ParseUtf8(); |
| |
| // Walks over an array, calling `f` each time an element is reached. |
| // |
| // `f` should have type `() -> absl::Status`. |
| template <typename F> |
| absl::Status VisitArray(F f); |
| |
| // Walks over an object, calling `f` just after parsing each `:`. |
| // |
| // `f` should have type `(absl::string_view) -> absl::Status`. |
| template <typename F> |
| absl::Status VisitObject(F f); |
| |
| // Parses a single value and discards it. |
| absl::Status SkipValue(); |
| |
| // Forwards of functions from ZeroCopyBufferedStream. |
| |
| bool AtEof() { |
| // Ignore whitespace for the purposes of finding the EOF. This will return |
| // an error if we hit EOF, so we discard it. |
| (void)SkipToToken(); |
| return stream_.AtEof(); |
| } |
| |
| absl::StatusOr<LocationWith<MaybeOwnedString>> Take(size_t len) { |
| JsonLocation loc = json_loc_; |
| auto taken = stream_.Take(len); |
| RETURN_IF_ERROR(taken.status()); |
| return LocationWith<MaybeOwnedString>{*std::move(taken), loc}; |
| } |
| |
| template <typename Pred> |
| absl::StatusOr<LocationWith<MaybeOwnedString>> TakeWhile(Pred p) { |
| JsonLocation loc = json_loc_; |
| auto taken = stream_.TakeWhile(std::move(p)); |
| RETURN_IF_ERROR(taken.status()); |
| return LocationWith<MaybeOwnedString>{*std::move(taken), loc}; |
| } |
| |
| LocationWith<Mark> BeginMark() { return {stream_.BeginMark(), json_loc_}; } |
| |
| private: |
| friend BufferingGuard; |
| friend Mark; |
| friend MaybeOwnedString; |
| |
| absl::Status Push() { |
| if (options_.recursion_depth == 0) { |
| return Invalid("JSON content was too deeply nested"); |
| } |
| --options_.recursion_depth; |
| return absl::OkStatus(); |
| } |
| |
| void Pop() { ++options_.recursion_depth; } |
| |
| // Parses the next four bytes as a 16-bit hex numeral. |
| absl::StatusOr<uint16_t> ParseU16HexCodepoint(); |
| |
| // Parses a Unicode escape (\uXXXX); this may be a surrogate pair, so it may |
| // consume the character that follows. Both are encoded as utf8 into |
| // `out_utf8`; returns the number of bytes written. |
| absl::StatusOr<size_t> ParseUnicodeEscape(char out_utf8[4]); |
| |
| // Parses an alphanumeric "identifier", for use with the non-standard |
| // "unquoted keys" extension. |
| absl::StatusOr<LocationWith<MaybeOwnedString>> ParseBareWord(); |
| |
| absl::Status Advance(size_t bytes) { |
| RETURN_IF_ERROR(stream_.Advance(bytes)); |
| json_loc_.offset += static_cast<int>(bytes); |
| json_loc_.col += static_cast<int>(bytes); |
| return absl::OkStatus(); |
| } |
| |
| ZeroCopyBufferedStream stream_; |
| |
| ParseOptions options_; |
| JsonLocation json_loc_; |
| MessagePath* path_; |
| }; |
| |
| template <typename F> |
| absl::Status JsonLexer::VisitArray(F f) { |
| RETURN_IF_ERROR(Expect("[")); |
| RETURN_IF_ERROR(Push()); |
| |
| if (Peek("]")) { |
| Pop(); |
| return absl::OkStatus(); |
| } |
| |
| bool has_comma = true; |
| do { |
| if (!has_comma) { |
| return Invalid("expected ','"); |
| } |
| RETURN_IF_ERROR(f()); |
| has_comma = Peek(","); |
| } while (!Peek("]")); |
| |
| if (!options_.allow_legacy_syntax && has_comma) { |
| return Invalid("expected ']'"); |
| } |
| |
| Pop(); |
| return absl::OkStatus(); |
| } |
| |
| // Walks over an object, calling `f` just after parsing each `:`. |
| // |
| // `f` should have type `(MaybeOwnedString&) -> absl::Status`. |
| template <typename F> |
| absl::Status JsonLexer::VisitObject(F f) { |
| RETURN_IF_ERROR(Expect("{")); |
| RETURN_IF_ERROR(Push()); |
| |
| if (Peek("}")) { |
| Pop(); |
| return absl::OkStatus(); |
| } |
| |
| bool has_comma = true; |
| do { |
| if (!has_comma) { |
| return Invalid("expected ','"); |
| } |
| RETURN_IF_ERROR(SkipToToken()); |
| |
| absl::StatusOr<LocationWith<MaybeOwnedString>> key; |
| if (stream_.PeekChar() == '"' || stream_.PeekChar() == '\'') { |
| key = ParseUtf8(); |
| } else if (options_.allow_legacy_syntax) { |
| key = ParseBareWord(); |
| } else { |
| return Invalid("expected '\"'"); |
| } |
| |
| RETURN_IF_ERROR(key.status()); |
| RETURN_IF_ERROR(Expect(":")); |
| RETURN_IF_ERROR(f(*key)); |
| has_comma = Peek(","); |
| } while (!Peek("}")); |
| Pop(); |
| |
| if (!options_.allow_legacy_syntax && has_comma) { |
| return Invalid("expected '}'"); |
| } |
| |
| return absl::OkStatus(); |
| } |
| } // namespace json_internal |
| } // namespace protobuf |
| } // namespace google |
| |
| #include "google/protobuf/port_undef.inc" |
| #endif // GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__ |