| // Protocol Buffers - Google's data interchange format |
| // Copyright 2008 Google Inc. All rights reserved. |
| // https://developers.google.com/protocol-buffers/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
#include "google/protobuf/json/internal/writer.h"

#include <cmath>
#include <cstdint>
#include <initializer_list>
#include <limits>
#include <utility>

#include "google/protobuf/stubs/logging.h"
#include "google/protobuf/stubs/common.h"
#include "absl/algorithm/container.h"

// Must be included last.
#include "google/protobuf/port_def.inc"
| |
| namespace google { |
| namespace protobuf { |
| namespace json_internal { |
| |
| // Tries to write a non-finite double if necessary; returns false if |
| // nothing was written. |
| bool JsonWriter::MaybeWriteSpecialFp(double val) { |
| if (val == std::numeric_limits<double>::infinity()) { |
| Write("\"Infinity\""); |
| } else if (val == -std::numeric_limits<double>::infinity()) { |
| Write("\"-Infinity\""); |
| } else if (std::isnan(val)) { |
| Write("\"NaN\""); |
| } else { |
| return false; |
| } |
| return true; |
| } |
| |
| void JsonWriter::WriteBase64(absl::string_view str) { |
| // This is the regular base64, not the "web-safe" version. |
| constexpr absl::string_view kBase64 = |
| "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; |
| const char* ptr = str.data(); |
| const char* end = ptr + str.size(); |
| |
| // Reads the `n`th character off of `ptr` while gracefully avoiding |
| // sign extension due to implicit conversions |
| auto read = [&](size_t n) { |
| return static_cast<size_t>(static_cast<uint8_t>(ptr[n])); |
| }; |
| |
| char buf[4]; |
| absl::string_view view(buf, sizeof(buf)); |
| Write("\""); |
| |
| while (end - ptr >= 3) { |
| buf[0] = kBase64[read(0) >> 2]; |
| buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)]; |
| buf[2] = kBase64[((read(1) & 0xf) << 2) | (read(2) >> 6)]; |
| buf[3] = kBase64[read(2) & 0x3f]; |
| Write(view); |
| ptr += 3; |
| } |
| |
| switch (end - ptr) { |
| case 2: |
| buf[0] = kBase64[read(0) >> 2]; |
| buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)]; |
| buf[2] = kBase64[(read(1) & 0xf) << 2]; |
| buf[3] = '='; |
| Write(view); |
| break; |
| case 1: |
| buf[0] = kBase64[read(0) >> 2]; |
| buf[1] = kBase64[((read(0) & 0x3) << 4)]; |
| buf[2] = '='; |
| buf[3] = '='; |
| Write(view); |
| break; |
| } |
| |
| Write("\""); |
| } |
| |
// Smallest UTF-16 high-surrogate (leading-surrogate) code unit.
// See https://www.unicode.org/glossary/#high_surrogate_code_unit
static constexpr uint16_t kMinHighSurrogate = 0xd800;

// Smallest UTF-16 low-surrogate (trailing-surrogate) code unit.
// See https://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMinLowSurrogate = 0xdc00;

// Largest UTF-16 low-surrogate (trailing-surrogate) code unit.
// See https://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMaxLowSurrogate = 0xdfff;

// Smallest Unicode supplementary code point, i.e. the first one that needs a
// surrogate pair in UTF-16.
// See https://www.unicode.org/glossary/#supplementary_code_point
static constexpr uint32_t kMinSupplementaryCodePoint = 0x10000;

// Largest Unicode code point.
// See https://www.unicode.org/glossary/#code_point
static constexpr uint32_t kMaxCodePoint = 0x10ffff;

// Marker stored in place of a scalar value when decoding fails. It is larger
// than kMaxCodePoint, so it can never collide with a real Unicode scalar.
static constexpr uint32_t kErrorSentinel = 0xaaaaaaaa;
| |
| // A Unicode Scalar encoded two ways. |
| struct Utf8Scalar { |
| // The Unicode scalar value as a 32-bit integer. If decoding failed, this |
| // is equal to kErrorSentinel. |
| uint32_t u32; |
| // The Unicode scalar value encoded as UTF-8 bytes. May not reflect the |
| // contents of `u32` if it is kErrorSentinel. |
| absl::string_view utf8; |
| }; |
| |
| // Parses a single UTF-8-encoded Unicode scalar from `str`. Returns a pair of |
| // the scalar and the UTF-8-encoded content corresponding to it from `str`. |
| // |
| // Returns U+FFFD on failure, and consumes an unspecified number of bytes in |
| // doing so. |
| static Utf8Scalar ConsumeUtf8Scalar(absl::string_view& str) { |
| GOOGLE_DCHECK(!str.empty()); |
| uint32_t scalar = static_cast<uint8_t>(str[0]); |
| const char* start = str.data(); |
| size_t len = 1; |
| |
| str = str.substr(1); |
| |
| // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying |
| // one of the following (big-endian) patterns: |
| // |
| // 0b0xxxxxxx |
| // 0b110xxxxx'10xxxxxx |
| // 0b1110xxxx'10xxxxxx'10xxxxxx |
| // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx |
| // |
| // We don't need to decode it; just validate it. |
| int lookahead = 0; |
| switch (absl::countl_one(static_cast<uint8_t>(scalar))) { |
| case 0: |
| break; |
| case 2: |
| lookahead = 1; |
| scalar &= (1 << 5) - 1; |
| break; |
| case 3: |
| lookahead = 2; |
| scalar &= (1 << 4) - 1; |
| break; |
| case 4: |
| lookahead = 3; |
| scalar &= (1 << 3) - 1; |
| break; |
| default: |
| scalar = kErrorSentinel; |
| break; |
| } |
| |
| for (int i = 0; i < lookahead; ++i) { |
| if (str.empty()) { |
| scalar = kErrorSentinel; |
| break; |
| } |
| |
| uint8_t next = str[0]; |
| str = str.substr(1); |
| ++len; |
| |
| // Looking for top 2 bits are 0b10. |
| if (next >> 6 != 2) { |
| scalar = kErrorSentinel; |
| break; |
| } |
| next &= (1 << 6) - 1; |
| scalar <<= 6; |
| scalar |= next; |
| } |
| |
| if (scalar > kMaxCodePoint) { |
| scalar = kErrorSentinel; |
| } |
| |
| return {scalar, absl::string_view(start, len)}; |
| } |
| |
| // Decides whether we must escape `scalar`. |
| // |
| // If the given Unicode scalar would not use a \u escape, `custom_escape` will |
| // be set to a non-empty string. |
| static bool MustEscape(uint32_t scalar, absl::string_view& custom_escape) { |
| switch (scalar) { |
| // These escapes are defined by the JSON spec. We do not escape /. |
| case '\n': |
| custom_escape = R"(\n)"; |
| return true; |
| case '\r': |
| custom_escape = R"(\r)"; |
| return true; |
| case '\t': |
| custom_escape = R"(\t)"; |
| return true; |
| case '\"': |
| custom_escape = R"(\")"; |
| return true; |
| case '\f': |
| custom_escape = R"(\f)"; |
| return true; |
| case '\b': |
| custom_escape = R"(\b)"; |
| return true; |
| case '\\': |
| custom_escape = R"(\\)"; |
| return true; |
| |
| case kErrorSentinel: |
| // Decoding failure turns into spaces, *not* replacement characters. We |
| // handle this separately from "normal" spaces so that it follows the |
| // escaping code-path. |
| // |
| // Note that literal replacement characters in the input string DO NOT |
| // get turned into spaces; this is only for decoding failures! |
| custom_escape = " "; |
| return true; |
| |
| // These are not required by the JSON spec, but help |
| // to prevent security bugs in JavaScript. |
| // |
| // These were originally present in the ESF parser, so they are kept for |
| // legacy compatibility (and because escaping most of these is in good |
| // taste, regardless). |
| case '<': |
| case '>': |
| case 0xfeff: // Zero width no-break space. |
| case 0xfff9: // Interlinear annotation anchor. |
| case 0xfffa: // Interlinear annotation separator. |
| case 0xfffb: // Interlinear annotation terminator. |
| case 0x00ad: // Soft-hyphen. |
| case 0x06dd: // Arabic end of ayah. |
| case 0x070f: // Syriac abbreviation mark. |
| case 0x17b4: // Khmer vowel inherent Aq. |
| case 0x17b5: // Khmer vowel inherent Aa. |
| case 0x000e0001: // Language tag. |
| return true; |
| default: |
| static constexpr std::pair<uint32_t, uint32_t> kEscapedRanges[] = { |
| {0x0000, 0x001f}, // ASCII control. |
| {0x007f, 0x009f}, // High ASCII bytes. |
| {0x0600, 0x0603}, // Arabic signs. |
| {0x200b, 0x200f}, // Zero width etc. |
| {0x2028, 0x202e}, // Separators etc. |
| {0x2060, 0x2064}, // Invisible etc. |
| {0x206a, 0x206f}, // Shaping etc. |
| {0x0001d173, 0x0001d17a}, // Music formatting. |
| {0x000e0020, 0x000e007f}, // TAG symbols. |
| }; |
| |
| return absl::c_any_of(kEscapedRanges, [scalar](auto range) { |
| return range.first <= scalar && scalar <= range.second; |
| }); |
| } |
| } |
| |
| void JsonWriter::WriteEscapedUtf8(absl::string_view str) { |
| while (!str.empty()) { |
| auto scalar = ConsumeUtf8Scalar(str); |
| absl::string_view custom_escape; |
| |
| if (!MustEscape(scalar.u32, custom_escape)) { |
| Write(scalar.utf8); |
| continue; |
| } |
| |
| if (!custom_escape.empty()) { |
| Write(custom_escape); |
| continue; |
| } |
| |
| if (scalar.u32 < 0x10000) { |
| WriteUEscape(scalar.u32); |
| continue; |
| } |
| |
| uint16_t lo = |
| (scalar.u32 & (kMaxLowSurrogate - kMinLowSurrogate)) + kMinLowSurrogate; |
| uint16_t hi = (scalar.u32 >> 10) + |
| (kMinHighSurrogate - (kMinSupplementaryCodePoint >> 10)); |
| WriteUEscape(hi); |
| WriteUEscape(lo); |
| } |
| } |
| |
| void JsonWriter::WriteUEscape(uint16_t val) { |
| char hex[7]; |
| int len = absl::SNPrintF(hex, sizeof(hex), R"(\u%04x)", val); |
| Write(absl::string_view(hex, static_cast<size_t>(len))); |
| } |
| } // namespace json_internal |
| } // namespace protobuf |
| } // namespace google |