blob: 910162163905a15aedf64c7e517351c975c63ecd [file] [log] [blame]
// Copyright 2023 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package python
import (
sitter ""
// Tree-sitter node type names that the parser compares against the value of
// sitter.Node.Type while walking a Python syntax tree.
const (
	sitterNodeTypeString              = "string"
	sitterNodeTypeComment             = "comment"
	sitterNodeTypeIdentifier          = "identifier"
	sitterNodeTypeDottedName          = "dotted_name"
	sitterNodeTypeIfStatement         = "if_statement"
	sitterNodeTypeAliasedImport       = "aliased_import"
	sitterNodeTypeWildcardImport      = "wildcard_import"
	sitterNodeTypeImportStatement     = "import_statement"
	sitterNodeTypeComparisonOperator  = "comparison_operator"
	sitterNodeTypeImportFromStatement = "import_from_statement"
)
type ParserOutput struct {
FileName string
Modules []module
Comments []comment
HasMain bool
type FileParser struct {
code []byte
relFilepath string
output ParserOutput
func NewFileParser() *FileParser {
return &FileParser{}
// ParseCode instantiates a new tree-sitter Parser and parses the python code, returning
// the tree-sitter RootNode.
// It prints a warning if parsing fails.
func ParseCode(code []byte, path string) (*sitter.Node, error) {
parser := sitter.NewParser()
tree, err := parser.ParseCtx(context.Background(), nil, code)
if err != nil {
return nil, err
root := tree.RootNode()
if root.HasError() {
log.Printf("WARNING: failed to parse %q. The resulting BUILD target may be incorrect.", path)
verbose, envExists := os.LookupEnv("GAZELLE_VERBOSE")
if envExists && verbose == "1" {
for i := 0; i < int(root.ChildCount()); i++ {
child := root.Child(i)
if child.IsError() {
log.Printf("Parse error at %+v:\n%+v", child.StartPoint(), child.Content(code))
log.Printf("The above was parsed as: %v", child.String())
return root, nil
// parseMain returns true if the python file has an `if __name__ == "__main__":` block,
// which is a common idiom for python scripts/binaries.
func (p *FileParser) parseMain(ctx context.Context, node *sitter.Node) bool {
for i := 0; i < int(node.ChildCount()); i++ {
if err := ctx.Err(); err != nil {
return false
child := node.Child(i)
if child.Type() == sitterNodeTypeIfStatement &&
child.Child(1).Type() == sitterNodeTypeComparisonOperator && child.Child(1).Child(1).Type() == "==" {
statement := child.Child(1)
a, b := statement.Child(0), statement.Child(2)
// convert "'__main__' == __name__" to "__name__ == '__main__'"
if b.Type() == sitterNodeTypeIdentifier {
a, b = b, a
if a.Type() == sitterNodeTypeIdentifier && a.Content(p.code) == "__name__" &&
// at (after v0.0.0-20240422154435-0628b34cbf9c we used)
// "__main__" is the second child of b. But now, it isn't.
// we cannot use the latest go-tree-sitter because of the top level reference in scanner.c.
b.Type() == sitterNodeTypeString && string(p.code[b.StartByte()+1:b.EndByte()-1]) == "__main__" {
return true
return false
// parseImportStatement parses a node for an import statement, returning a `module` and a boolean
// representing if the parse was OK or not.
func parseImportStatement(node *sitter.Node, code []byte) (module, bool) {
switch node.Type() {
case sitterNodeTypeDottedName:
return module{
Name: node.Content(code),
LineNumber: node.StartPoint().Row + 1,
}, true
case sitterNodeTypeAliasedImport:
return parseImportStatement(node.Child(0), code)
case sitterNodeTypeWildcardImport:
return module{
Name: "*",
LineNumber: node.StartPoint().Row + 1,
}, true
return module{}, false
// parseImportStatements parses a node for import statements, returning true if the node is
// an import statement. It updates FileParser.output.Modules with the `module` that the
// import represents.
func (p *FileParser) parseImportStatements(node *sitter.Node) bool {
if node.Type() == sitterNodeTypeImportStatement {
for j := 1; j < int(node.ChildCount()); j++ {
m, ok := parseImportStatement(node.Child(j), p.code)
if !ok {
m.Filepath = p.relFilepath
if strings.HasPrefix(m.Name, ".") {
p.output.Modules = append(p.output.Modules, m)
} else if node.Type() == sitterNodeTypeImportFromStatement {
from := node.Child(1).Content(p.code)
if strings.HasPrefix(from, ".") {
return true
for j := 3; j < int(node.ChildCount()); j++ {
m, ok := parseImportStatement(node.Child(j), p.code)
if !ok {
m.Filepath = p.relFilepath
m.From = from
m.Name = fmt.Sprintf("%s.%s", from, m.Name)
p.output.Modules = append(p.output.Modules, m)
} else {
return false
return true
// parseComments parses a node for comments, returning true if the node is a comment.
// It updates FileParser.output.Comments with the parsed comment.
func (p *FileParser) parseComments(node *sitter.Node) bool {
if node.Type() == sitterNodeTypeComment {
p.output.Comments = append(p.output.Comments, comment(node.Content(p.code)))
return true
return false
func (p *FileParser) SetCodeAndFile(code []byte, relPackagePath, filename string) {
p.code = code
p.relFilepath = filepath.Join(relPackagePath, filename)
p.output.FileName = filename
func (p *FileParser) parse(ctx context.Context, node *sitter.Node) {
if node == nil {
for i := 0; i < int(node.ChildCount()); i++ {
if err := ctx.Err(); err != nil {
child := node.Child(i)
if p.parseImportStatements(child) {
if p.parseComments(child) {
p.parse(ctx, child)
func (p *FileParser) Parse(ctx context.Context) (*ParserOutput, error) {
rootNode, err := ParseCode(p.code, p.relFilepath)
if err != nil {
return nil, err
p.output.HasMain = p.parseMain(ctx, rootNode)
p.parse(ctx, rootNode)
return &p.output, nil
func (p *FileParser) ParseFile(ctx context.Context, repoRoot, relPackagePath, filename string) (*ParserOutput, error) {
code, err := os.ReadFile(filepath.Join(repoRoot, relPackagePath, filename))
if err != nil {
return nil, err
p.SetCodeAndFile(code, relPackagePath, filename)
return p.Parse(ctx)