blob: 5bb27fd0eba5f456a59302a7c2719744d830cfa1 [file] [log] [blame]
#region Copyright notice and license
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://github.com/jskeet/dotnet-protobufs/
// Original C++/Java/Python code:
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endregion
using System;
using System.Globalization;
using System.Text.RegularExpressions;
namespace Google.ProtocolBuffers
{
/// <summary>
/// Represents a stream of tokens parsed from a string. The tokenizer always
/// holds one "current" token; the <c>Consume*</c> methods validate and convert
/// that token and then advance to the next one. An empty current token means
/// the end of the input has been reached.
/// </summary>
internal sealed class TextTokenizer
{
    /// <summary>
    /// The complete input being tokenized.
    /// </summary>
    private readonly string text;

    /// <summary>
    /// The token most recently read by <see cref="NextToken"/>; the empty
    /// string once the end of the input has been reached.
    /// </summary>
    private string currentToken;

    /// <summary>
    /// The character index within the text to perform the next regex match at.
    /// </summary>
    private int matchPos = 0;

    /// <summary>
    /// The character index within the text at which the current token begins.
    /// </summary>
    private int pos = 0;

    /// <summary>
    /// The (zero-based) line number of the current token.
    /// </summary>
    private int line = 0;

    /// <summary>
    /// The (zero-based) column number of the current token.
    /// </summary>
    private int column = 0;

    /// <summary>
    /// The (zero-based) line number of the previous token.
    /// </summary>
    private int previousLine = 0;

    /// <summary>
    /// The (zero-based) column number of the previous token.
    /// </summary>
    private int previousColumn = 0;

    // Note: atomic groups are used in both of the following regexes to mimic
    // the possessive quantifiers of the Java implementation. Every alternative
    // is anchored with \G so a match can only start exactly at matchPos.

    /// <summary>
    /// Matches a run of whitespace and/or '#' comments (each comment extends
    /// to the end of its line).
    /// </summary>
    internal static readonly Regex WhitespaceAndCommentPattern = new Regex(
        "\\G(?>(\\s|(#.*$))+)",
        FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);

    /// <summary>
    /// Matches a single identifier, number or quoted string. The two string
    /// alternatives also match a string whose closing quote is missing (the
    /// match then stops at the end of the line), so that
    /// <see cref="ConsumeByteString"/> can report the missing quote.
    /// </summary>
    private static readonly Regex TokenPattern = new Regex(
        "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" + // an identifier
        "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" + // a number
        "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" + // a double-quoted string
        // The body of a single-quoted string must stop at an unescaped single
        // quote. Excluding '"' here instead would let the atomic group run
        // past the closing quote to the end of the line.
        "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)", // a single-quoted string
        FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.Multiline);

    /// <summary>
    /// Matches the spellings of infinity accepted for doubles: optional '-',
    /// then "inf" or "infinity", in any letter case.
    /// </summary>
    private static readonly Regex DoubleInfinity = new Regex(
        "^-?inf(inity)?$",
        FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

    /// <summary>
    /// As <see cref="DoubleInfinity"/>, but additionally allows one trailing 'f'.
    /// </summary>
    private static readonly Regex FloatInfinity = new Regex(
        "^-?inf(inity)?f?$",
        FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches "nan" with one optional trailing 'f', in any letter case.
    /// </summary>
    private static readonly Regex FloatNan = new Regex(
        "^nanf?$",
        FrameworkPortability.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text and
    /// positions it on the first token.
    /// </summary>
    /// <param name="text">The text to tokenize.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public TextTokenizer(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }
        this.text = text;
        SkipWhitespace();
        NextToken();
    }

    /// <summary>
    /// Are we at the end of the input?
    /// </summary>
    public bool AtEnd
    {
        get { return currentToken.Length == 0; }
    }

    /// <summary>
    /// Advances to the next token, first remembering the position of the
    /// token being left so it can be reported by
    /// <see cref="CreateFormatExceptionPreviousToken"/>.
    /// </summary>
    public void NextToken()
    {
        previousLine = line;
        previousColumn = column;

        // Advance the line/column counters up to the start of the new token.
        while (pos < matchPos)
        {
            if (text[pos] == '\n')
            {
                ++line;
                column = 0;
            }
            else
            {
                ++column;
            }
            ++pos;
        }

        // Match the next token.
        if (matchPos == text.Length)
        {
            // EOF
            currentToken = "";
            return;
        }

        Match match = TokenPattern.Match(text, matchPos);
        if (match.Success)
        {
            currentToken = match.Value;
            matchPos += match.Length;
        }
        else
        {
            // Not an identifier, number or string: take one character.
            currentToken = text[matchPos].ToString();
            matchPos++;
        }
        SkipWhitespace();
    }

    /// <summary>
    /// Skip over any whitespace and comments so that matchPos starts at the
    /// next token.
    /// </summary>
    private void SkipWhitespace()
    {
        Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
        if (match.Success)
        {
            matchPos += match.Length;
        }
    }

    /// <summary>
    /// If the next token exactly matches the given token, consume it and return
    /// true. Otherwise, return false without doing anything.
    /// </summary>
    /// <param name="token">The exact token text to look for.</param>
    /// <returns>true if the token was present and has been consumed.</returns>
    public bool TryConsume(string token)
    {
        if (currentToken == token)
        {
            NextToken();
            return true;
        }
        return false;
    }

    /// <summary>
    /// If the next token exactly matches the specified one, consume it.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is required.</param>
    /// <exception cref="FormatException">The next token is not <paramref name="token"/>.</exception>
    public void Consume(string token)
    {
        if (!TryConsume(token))
        {
            throw CreateFormatException("Expected \"" + token + "\".");
        }
    }

    /// <summary>
    /// Returns true if the next token is an integer, but does not consume it.
    /// Only the first character of the token is inspected: a digit, '-' or '+'.
    /// </summary>
    public bool LookingAtInteger()
    {
        if (currentToken.Length == 0)
        {
            return false;
        }

        char c = currentToken[0];
        return ('0' <= c && c <= '9') || c == '-' || c == '+';
    }

    /// <summary>
    /// If the next token is an identifier, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// A token is accepted when every character is an ASCII letter, an ASCII
    /// digit, '_' or '.'. An empty token (end of input) contains no offending
    /// character and is therefore returned as an empty string.
    /// </remarks>
    /// <exception cref="FormatException">The token contains any other character.</exception>
    public string ConsumeIdentifier()
    {
        foreach (char c in currentToken)
        {
            bool allowed = ('a' <= c && c <= 'z') ||
                           ('A' <= c && c <= 'Z') ||
                           ('0' <= c && c <= '9') ||
                           (c == '_') || (c == '.');
            if (!allowed)
            {
                throw CreateFormatException("Expected identifier.");
            }
        }

        string result = currentToken;
        NextToken();
        return result;
    }

    /// <summary>
    /// If the next token is a 32-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    public int ConsumeInt32()
    {
        try
        {
            int result = TextFormat.ParseInt32(currentToken);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateIntegerParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a 32-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    public uint ConsumeUInt32()
    {
        try
        {
            uint result = TextFormat.ParseUInt32(currentToken);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateIntegerParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a 64-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    public long ConsumeInt64()
    {
        try
        {
            long result = TextFormat.ParseInt64(currentToken);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateIntegerParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a 64-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    public ulong ConsumeUInt64()
    {
        try
        {
            ulong result = TextFormat.ParseUInt64(currentToken);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateIntegerParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a double, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    public double ConsumeDouble()
    {
        // We need to parse infinity and nan separately because
        // double.Parse() does not accept "inf", "infinity", or "nan".
        if (DoubleInfinity.IsMatch(currentToken))
        {
            // Ordinal comparison: the token is machine-readable text, so the
            // current culture must not influence how the sign is recognised.
            bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
            NextToken();
            return negative ? double.NegativeInfinity : double.PositiveInfinity;
        }
        if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase))
        {
            NextToken();
            return Double.NaN;
        }

        try
        {
            double result = double.Parse(currentToken, FrameworkPortability.InvariantCulture);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateFloatParseException(e);
        }
        catch (OverflowException e)
        {
            throw CreateFloatParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a float, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// One trailing lower-case 'f' suffix is permitted and ignored. The current
    /// token is left untouched when parsing fails.
    /// </remarks>
    public float ConsumeFloat()
    {
        // We need to parse infinity and nan separately because
        // float.Parse() does not accept "inf", "infinity", or "nan".
        if (FloatInfinity.IsMatch(currentToken))
        {
            bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
            NextToken();
            return negative ? float.NegativeInfinity : float.PositiveInfinity;
        }
        if (FloatNan.IsMatch(currentToken))
        {
            NextToken();
            return float.NaN;
        }

        // Strip the suffix from a local copy only: overwriting currentToken
        // would leave the tokenizer holding a different token from the one in
        // the input if the parse below fails. Only a single 'f' is removed, in
        // line with the "f?" accepted by FloatInfinity and FloatNan.
        string numberText = currentToken;
        if (numberText.EndsWith("f", StringComparison.Ordinal))
        {
            numberText = numberText.Substring(0, numberText.Length - 1);
        }

        try
        {
            float result = float.Parse(numberText, FrameworkPortability.InvariantCulture);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            throw CreateFloatParseException(e);
        }
        catch (OverflowException e)
        {
            throw CreateFloatParseException(e);
        }
    }

    /// <summary>
    /// If the next token is a Boolean, consume it and return its value.
    /// Otherwise, throw a FormatException. Only the exact tokens "true" and
    /// "false" are recognised.
    /// </summary>
    public bool ConsumeBoolean()
    {
        if (currentToken == "true")
        {
            NextToken();
            return true;
        }
        if (currentToken == "false")
        {
            NextToken();
            return false;
        }
        throw CreateFormatException("Expected \"true\" or \"false\".");
    }

    /// <summary>
    /// If the next token is a string, consume it and return its (unescaped) value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    public string ConsumeString()
    {
        return ConsumeByteString().ToStringUtf8();
    }

    /// <summary>
    /// If the next token is a string, consume it, unescape it as a
    /// ByteString and return it. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// The token does not start with a quote, does not end with the same
    /// quote it started with, or its contents could not be unescaped.
    /// </exception>
    public ByteString ConsumeByteString()
    {
        char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
        if (quote != '\"' && quote != '\'')
        {
            throw CreateFormatException("Expected string.");
        }

        // A lone quote character, or a token cut short at the end of a line,
        // has no closing quote.
        if (currentToken.Length < 2 ||
            currentToken[currentToken.Length - 1] != quote)
        {
            throw CreateFormatException("String missing ending quote.");
        }

        try
        {
            string escaped = currentToken.Substring(1, currentToken.Length - 2);
            ByteString result = TextFormat.UnescapeBytes(escaped);
            NextToken();
            return result;
        }
        catch (FormatException e)
        {
            // Re-create the exception so the message carries the token position.
            throw CreateFormatException(e.Message);
        }
    }

    /// <summary>
    /// Returns a format exception with the current line and column numbers
    /// in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position.</param>
    public FormatException CreateFormatException(string description)
    {
        // Note: People generally prefer one-based line and column numbers.
        return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
    }

    /// <summary>
    /// Returns a format exception with the line and column numbers of the
    /// previous token in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position.</param>
    public FormatException CreateFormatExceptionPreviousToken(string description)
    {
        // Note: People generally prefer one-based line and column numbers.
        return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse an integer.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e)
    {
        return CreateFormatException("Couldn't parse integer: " + e.Message);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse a float or double.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e)
    {
        return CreateFormatException("Couldn't parse number: " + e.Message);
    }
}
}