# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 15 | """Tokenization for the Emboss definition language. |
| 16 | |
| 17 | This module exports the tokenize function and various errors. |
| 18 | |
| 19 | In addition, a couple of lists are exported for the use of |
| 20 | generate_grammar_md.py: |
| 21 | |
| 22 | LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against |
| 23 | input. |
| 24 | REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization. |
| 25 | REGEX_TOKEN_PATTERNS[n].regex is an re.RegexObject |
| 26 | (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and |
| 27 | REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens |
| 28 | which match the pattern. |
| 29 | """ |

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
    # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
    # matched delimiters ([], (), and {}).
    """Tokenizes its argument.

    Arguments:
      text: The raw text of a .emb file.
      file_name: The name of the file, for use in error messages.

    Returns:
      A tuple of:
        a list of parser_types.Token objects, or None if there were errors
        a possibly-empty list of errors
    """
    tokens = []
    indent_stack = [""]
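    # indent_stack holds the leading whitespace of each currently-open
    # indentation level; the outermost level is always "".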
    line_number = 0
    for line in text.splitlines():
        line_number += 1

        # _tokenize_line splits the actual text into tokens.
        line_tokens, errors = _tokenize_line(line, line_number, file_name)
        if errors:
            return None, errors

Dmitri Prime | 495d3f2 | 2024-09-06 16:56:59 -0700 | [diff] [blame] | 63 | # Lines with only whitespace and comments are not used for Indent/Dedent |
| 64 | # calculation, and do not produce end-of-line tokens. |
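        # (The `else` clause below belongs to the `for` loop: it runs only if
        # no non-Comment token is found, i.e. if `break` never fires.)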
        for token in line_tokens:
            if token.symbol != "Comment":
                break
        else:
            tokens.extend(line_tokens)
            tokens.append(
                parser_types.Token(
                    '"\\n"',
                    "\n",
                    parser_types.make_location(
                        (line_number, len(line) + 1), (line_number, len(line) + 1)
                    ),
                )
            )
            continue

Dmitri Prime | 495d3f2 | 2024-09-06 16:56:59 -0700 | [diff] [blame] | 81 | # Leading whitespace is whatever .lstrip() removes. |
| 82 | leading_whitespace = line[0 : len(line) - len(line.lstrip())] |
| 83 | if leading_whitespace == indent_stack[-1]: |
| 84 | # If the current leading whitespace is equal to the last leading |
| 85 | # whitespace, do not emit an Indent or Dedent token. |
| 86 | pass |
| 87 | elif leading_whitespace.startswith(indent_stack[-1]): |
| 88 | # If the current leading whitespace is longer than the last leading |
| 89 | # whitespace, emit an Indent token. For the token text, take the new |
| 90 | # part of the whitespace. |
| 91 | tokens.append( |
| 92 | parser_types.Token( |
| 93 | "Indent", |
| 94 | leading_whitespace[len(indent_stack[-1]) :], |
| 95 | parser_types.make_location( |
| 96 | (line_number, len(indent_stack[-1]) + 1), |
| 97 | (line_number, len(leading_whitespace) + 1), |
| 98 | ), |
| 99 | ) |
| 100 | ) |
| 101 | indent_stack.append(leading_whitespace) |
| 102 | else: |
| 103 | # Otherwise, search for the unclosed indentation level that matches |
| 104 | # the current indentation level. Emit a Dedent token for each |
| 105 | # newly-closed indentation level. |
| 106 | for i in range(len(indent_stack) - 1, -1, -1): |
| 107 | if leading_whitespace == indent_stack[i]: |
| 108 | break |
| 109 | tokens.append( |
| 110 | parser_types.Token( |
| 111 | "Dedent", |
| 112 | "", |
| 113 | parser_types.make_location( |
| 114 | (line_number, len(leading_whitespace) + 1), |
| 115 | (line_number, len(leading_whitespace) + 1), |
| 116 | ), |
| 117 | ) |
| 118 | ) |
| 119 | del indent_stack[i] |
| 120 | else: |
| 121 | return None, [ |
| 122 | [ |
| 123 | error.error( |
| 124 | file_name, |
| 125 | parser_types.make_location( |
| 126 | (line_number, 1), |
| 127 | (line_number, len(leading_whitespace) + 1), |
| 128 | ), |
| 129 | "Bad indentation", |
| 130 | ) |
| 131 | ] |
| 132 | ] |
| 133 | |
| 134 | tokens.extend(line_tokens) |
| 135 | |
        # Append an end-of-line token (for non-whitespace lines).
        tokens.append(
            parser_types.Token(
                '"\\n"',
                "\n",
                parser_types.make_location(
                    (line_number, len(line) + 1), (line_number, len(line) + 1)
                ),
            )
        )
    for i in range(len(indent_stack) - 1):
        tokens.append(
            parser_types.Token(
                "Dedent",
                "",
                parser_types.make_location((line_number + 1, 1), (line_number + 1, 1)),
            )
        )
    return tokens, []
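

# As a sketch of the Indent/Dedent handling above:
#
#     tokenize("struct Foo:\n  [a]\n", "example.emb")
#
# yields tokens whose symbols are, in order: '"struct"', CamelWord, '":"',
# '"\\n"', Indent, '"["', SnakeWord, '"]"', '"\\n"', and a final Dedent for
# the indentation level still open at end-of-input.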


# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let"
).split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords.  A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no '_'.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T
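

# A few illustrative matches against the patterns above (a sketch; the
# longest-match rule in _tokenize_line decides among them, with ties going to
# the earlier pattern):
#
#     "0x1234_5678"  =>  Number      (the grouped-hex pattern matches in full)
#     "0x1234_567"   =>  BadNumber   (underscore groups must be 4 or 8 digits,
#                                     so only BadNumber matches all 10 chars)
#     "abcDef"       =>  BadWord     (SnakeWord stops at "abc")
#     "CONSTANT_1"   =>  ShoutyWord  (BadWord also matches in full, but
#                                     ShoutyWord comes first)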


def _tokenize_line(line, line_number, file_name):
    """Tokenizes a single line of input.

    Arguments:
      line: The line of text to tokenize.
      line_number: The line number (used when constructing token objects).
      file_name: The name of the file, for use in error messages.

    Returns:
      A tuple of:
        a list of token objects, or None if there were errors
        a possibly-empty list of errors
    """
    tokens = []
    offset = 0
    while offset < len(line):
        best_candidate = ""
        best_candidate_symbol = None
        # Find the longest match.  Ties go to the first match.  This way,
        # keywords ("struct") are matched as themselves, but words that only
        # happen to start with keywords ("structure") are matched as words.
        #
        # There is never a reason to try to match a literal after a regex that
        # could also match that literal, so check literals first.
        for literal in LITERAL_TOKEN_PATTERNS:
            if line[offset:].startswith(literal) and len(literal) > len(best_candidate):
                best_candidate = literal
                # For Emboss, the name of a literal token is just the literal
                # in quotes, so that the grammar can read a little more
                # naturally, e.g.:
                #
                #     expression -> expression "+" expression
                #
                # instead of
                #
                #     expression -> expression Plus expression
                best_candidate_symbol = '"' + literal + '"'
        for pattern in REGEX_TOKEN_PATTERNS:
            match_result = pattern.regex.match(line[offset:])
            if match_result and len(match_result.group(0)) > len(best_candidate):
                best_candidate = match_result.group(0)
                best_candidate_symbol = pattern.symbol
        if not best_candidate:
            return None, [
                [
                    error.error(
                        file_name,
                        parser_types.make_location(
                            (line_number, offset + 1), (line_number, offset + 2)
                        ),
                        "Unrecognized token",
                    )
                ]
            ]
        if best_candidate_symbol:
            tokens.append(
                parser_types.Token(
                    best_candidate_symbol,
                    best_candidate,
                    parser_types.make_location(
                        (line_number, offset + 1),
                        (line_number, offset + len(best_candidate) + 1),
                    ),
                )
            )
        offset += len(best_candidate)
    return tokens, []
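

# As a sketch of the longest-match rule above:
#
#     _tokenize_line("structure", 1, "example.emb")
#
# produces a single SnakeWord token: the "struct" literal matches only six
# characters, while the SnakeWord regex matches all nine, and the longer
# match wins.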