refactor: make modules_mapping a regular rule (#578)
* refactor: make modules_mapping a regular rule
* fix: remove unnecessary comment
Signed-off-by: Thulio Ferraz Assis <3149049+f0rmiga@users.noreply.github.com>
Co-authored-by: Alex Eagle <eagle@post.harvard.edu>
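The `modules_mapping` repository rule, which downloaded and built wheels itself, is replaced by an ordinary build rule that runs a `py_binary` generator over the wheel files already resolved by the pip repository and writes `modules_mapping.json` into the output tree. That file maps importable module names to the wheel distributions that provide them; a minimal illustration of its shape (the entries below are hypothetical, not taken from this change):

```python
import json

# Hypothetical entries, for illustration only: each key is an importable
# module name, each value is the wheel distribution that provides it. The two
# often differ, which is exactly what the mapping exists to resolve.
modules_mapping = {
    "yaml": "PyYAML",
    "google.protobuf": "protobuf",
}
print(json.dumps(modules_mapping, indent=2))
```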
diff --git a/examples/build_file_generation/BUILD b/examples/build_file_generation/BUILD
index ec31255..3e6d44a 100644
--- a/examples/build_file_generation/BUILD
+++ b/examples/build_file_generation/BUILD
@@ -1,8 +1,17 @@
load("@bazel_gazelle//:def.bzl", "gazelle")
+load("@pip//:requirements.bzl", "all_whl_requirements")
load("@rules_python//gazelle:def.bzl", "GAZELLE_PYTHON_RUNTIME_DEPS")
load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest")
+load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping")
load("@rules_python//python:defs.bzl", "py_library")
+# This rule generates the metadata for Python packages we depend on. That data
+# is required for the gazelle_python_manifest rule to update our manifest file.
+modules_mapping(
+ name = "modules_map",
+ wheels = all_whl_requirements,
+)
+
# Gazelle python extension needs a manifest file mapping from
# an import to the installed package that provides it.
# This macro produces two targets:
@@ -12,7 +21,7 @@
# the manifest doesn't need to be updated
gazelle_python_manifest(
name = "gazelle_python_manifest",
- modules_mapping = "@modules_map//:modules_mapping.json",
+ modules_mapping = ":modules_map",
pip_deps_repository_name = "pip",
requirements = "//:requirements_lock.txt",
)
diff --git a/examples/build_file_generation/WORKSPACE b/examples/build_file_generation/WORKSPACE
index 4255932..c58b50f 100644
--- a/examples/build_file_generation/WORKSPACE
+++ b/examples/build_file_generation/WORKSPACE
@@ -60,14 +60,3 @@
load("@rules_python//gazelle:deps.bzl", _py_gazelle_deps = "gazelle_deps")
_py_gazelle_deps()
-
-load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping")
-
-# This repository rule fetches the metadata for python packages we
-# depend on. That data is required for the gazelle_python_manifest
-# rule to update our manifest file.
-# To see what this rule does, try `bazel run @modules_map//:print`
-modules_mapping(
- name = "modules_map",
- requirements = "//:requirements_lock.txt",
-)
diff --git a/gazelle/README.md b/gazelle/README.md
index 9edf773..2058458 100644
--- a/gazelle/README.md
+++ b/gazelle/README.md
@@ -9,11 +9,8 @@
First, you'll need to add Gazelle to your `WORKSPACE` file.
Follow the instructions at https://github.com/bazelbuild/bazel-gazelle#running-gazelle-with-bazel
-Next, we need to add two more things to the `WORKSPACE`:
-
-1. fetch the third-party Go libraries that the python extension depends on
-1. fetch metadata about your Python dependencies, so that gazelle can
- determine which package a given import statement comes from.
+Next, we need to fetch the third-party Go libraries that the python extension
+depends on.
Add this to your `WORKSPACE`:
@@ -23,22 +20,12 @@
load("@rules_python//gazelle:deps.bzl", _py_gazelle_deps = "gazelle_deps")
_py_gazelle_deps()
-
-load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping")
-
-# This repository rule fetches the metadata for python packages we
-# depend on. That data is required for the gazelle_python_manifest
-# rule to update our manifest file.
-# To see what this rule does, try `bazel run @modules_map//:print`
-modules_mapping(
- name = "modules_map",
- # This should point to wherever we declare our python dependencies
- requirements = "//:requirements_lock.txt",
-)
```
-Next, we'll make a pair of targets for consuming that `modules_mapping` we
-fetched, and writing it as a manifest file for Gazelle to read.
+Next, we'll generate metadata about your Python dependencies, so that Gazelle
+can determine which package a given import statement comes from. This is
+provided by the `modules_mapping` rule. We'll make a target that consumes this
+`modules_mapping` and writes it as a manifest file for Gazelle to read.
This is checked into the repo for speed, as it takes some time to calculate
in a large monorepo.
@@ -48,7 +35,16 @@
Then put this in your `BUILD.bazel` file next to the `requirements.txt`:
```starlark
+load("@pip//:requirements.bzl", "all_whl_requirements")
load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest")
+load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping")
+
+# This rule generates the metadata for Python packages we depend on. That data
+# is required for the gazelle_python_manifest rule to update our manifest file.
+modules_mapping(
+ name = "modules_map",
+ wheels = all_whl_requirements,
+)
# Gazelle python extension needs a manifest file mapping from
# an import to the installed package that provides it.
@@ -59,9 +55,7 @@
# the manifest doesn't need to be updated
gazelle_python_manifest(
name = "gazelle_python_manifest",
- # The @modules_map refers to the name we gave in the modules_mapping
- # rule in the WORKSPACE
- modules_mapping = "@modules_map//:modules_mapping.json",
+ modules_mapping = ":modules_map",
# This is what we called our `pip_install` rule, where third-party
# python libraries are loaded in BUILD files.
pip_deps_repository_name = "pip",
diff --git a/gazelle/modules_mapping/BUILD.bazel b/gazelle/modules_mapping/BUILD.bazel
index 4ce6a00..d1cd42e 100644
--- a/gazelle/modules_mapping/BUILD.bazel
+++ b/gazelle/modules_mapping/BUILD.bazel
@@ -1,4 +1,7 @@
-exports_files([
- "builder.py",
- "generator.py",
-])
+load("@rules_python//python:defs.bzl", "py_binary")
+
+py_binary(
+ name = "generator",
+ srcs = ["generator.py"],
+ visibility = ["//visibility:public"],
+)
diff --git a/gazelle/modules_mapping/builder.py b/gazelle/modules_mapping/builder.py
deleted file mode 100644
index 3b471c0..0000000
--- a/gazelle/modules_mapping/builder.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import argparse
-import multiprocessing
-import subprocess
-import sys
-from datetime import datetime
-
-mutex = multiprocessing.Lock()
-
-
-def build(wheel):
- print("{}: building {}".format(datetime.now(), wheel), file=sys.stderr)
- process = subprocess.run(
- [sys.executable, "-m", "build", "--wheel", "--no-isolation"], cwd=wheel
- )
- if process.returncode != 0:
- # If the build without isolation fails, try to build it again with
- # isolation. We need to protect this following logic in two ways:
- # 1. Only build one at a time in this process.
- # 2. Retry a few times to get around flakiness.
- success = False
- for _ in range(0, 3):
- with mutex:
- process = subprocess.run(
- [sys.executable, "-m", "build", "--wheel"],
- encoding="utf-8",
- cwd=wheel,
- capture_output=True,
- )
- if process.returncode != 0:
- continue
- success = True
- break
- if not success:
- print("STDOUT:", file=sys.stderr)
- print(process.stdout, file=sys.stderr)
- print("STDERR:", file=sys.stderr)
- print(process.stderr, file=sys.stderr)
- raise RuntimeError(
- "{}: ERROR: failed to build {}".format(datetime.now(), wheel)
- )
-
-
-def main(jobs, wheels):
- with multiprocessing.Pool(jobs) as pool:
- results = []
- for wheel in wheels:
- result = pool.apply_async(build, args=(wheel,))
- results.append(result)
- pool.close()
- for result in results:
- result.get()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Builds Python wheels.")
- parser.add_argument(
- "wheels",
- metavar="wheel",
- type=str,
- nargs="+",
- help="A path to the extracted wheel directory.",
- )
- parser.add_argument(
- "--jobs",
- type=int,
- default=8,
- help="The number of concurrent build jobs to be executed.",
- )
- args = parser.parse_args()
- exit(main(args.jobs, args.wheels))
diff --git a/gazelle/modules_mapping/def.bzl b/gazelle/modules_mapping/def.bzl
index e01ebd3..e90d454 100644
--- a/gazelle/modules_mapping/def.bzl
+++ b/gazelle/modules_mapping/def.bzl
@@ -7,324 +7,39 @@
This mapping is necessary when reading Python import statements and determining
if they are provided by third-party dependencies. Most importantly, when the
module name doesn't match the wheel distribution name.
-
-Currently, this module only works with requirements.txt files locked using
-pip-tools (https://github.com/jazzband/pip-tools) with hashes. This is necessary
-in order to keep downloaded wheels in the Bazel cache. Also, the
-modules_mapping rule does not consider extras as specified by PEP 508.
"""
-# _modules_mapping_impl is the root entry for the modules_mapping rule
-# implementation.
-def _modules_mapping_impl(rctx):
- requirements_data = rctx.read(rctx.attr.requirements)
- python_interpreter = _get_python_interpreter(rctx)
- pythonpath = "{}/__pythonpath".format(rctx.path(""))
- res = rctx.execute(
- [
- python_interpreter,
- "-m",
- "pip",
- "--verbose",
- "--isolated",
- "install",
- "--target={}".format(pythonpath),
- "--upgrade",
- "--no-build-isolation",
- "--no-cache-dir",
- "--disable-pip-version-check",
- "--index-url={}".format(rctx.attr.pip_index_url),
- "build=={}".format(rctx.attr.build_wheel_version),
- "setuptools=={}".format(rctx.attr.setuptools_wheel_version),
- ],
- quiet = rctx.attr.quiet,
- timeout = rctx.attr.install_build_timeout,
+def _modules_mapping_impl(ctx):
+ modules_mapping = ctx.actions.declare_file(ctx.attr.modules_mapping_name)
+ args = ctx.actions.args()
+ args.add(modules_mapping.path)
+ args.add_all([whl.path for whl in ctx.files.wheels])
+ ctx.actions.run(
+ inputs = ctx.files.wheels,
+ outputs = [modules_mapping],
+ executable = ctx.executable._generator,
+ arguments = [args],
+ use_default_shell_env = False,
)
- if res.return_code != 0:
- fail(res.stderr)
- parsed_requirements = _parse_requirements_txt(requirements_data)
- wheels = _get_wheels(rctx, python_interpreter, pythonpath, parsed_requirements)
- res = rctx.execute(
- [
- python_interpreter,
- rctx.path(rctx.attr._generator),
- ] + wheels,
- quiet = rctx.attr.quiet,
- timeout = rctx.attr.generate_timeout,
- )
- if res.return_code != 0:
- fail(res.stderr)
- rctx.file("modules_mapping.json", content = res.stdout)
- rctx.file("print.sh", content = "#!/usr/bin/env bash\ncat $1", executable = True)
- rctx.file("BUILD", """\
-exports_files(["modules_mapping.json"])
+ return [DefaultInfo(files = depset([modules_mapping]))]
-sh_binary(
- name = "print",
- srcs = ["print.sh"],
- data = [":modules_mapping.json"],
- args = ["$(rootpath :modules_mapping.json)"],
-)
-""")
-
-# _get_python_interpreter determines whether the system or the user-provided
-# Python interpreter should be used and returns the path to be called.
-def _get_python_interpreter(rctx):
- if rctx.attr.python_interpreter == None:
- return "python"
- return rctx.path(rctx.attr.python_interpreter)
-
-# _parse_requirements_txt parses the requirements.txt data into structs with the
-# information needed to download them using Bazel.
-def _parse_requirements_txt(data):
- result = []
- lines = data.split("\n")
- current_requirement = ""
- continue_previous_line = False
- for line in lines:
- # Ignore empty lines and comments.
- if len(line) == 0 or line.startswith("#"):
- continue
-
- line = line.strip()
-
- stripped_backslash = False
- if line.endswith("\\"):
- line = line[:-1]
- stripped_backslash = True
-
- # If this line is a continuation of the previous one, append the current
- # line to the current requirement being processed, otherwise, start a
- # new requirement.
- if continue_previous_line:
- current_requirement += line
- else:
- current_requirement = line
-
- # Control whether the next line in the requirements.txt should be a
- # continuation of the current requirement being processed or not.
- continue_previous_line = stripped_backslash
- if not continue_previous_line:
- result.append(_parse_requirement(current_requirement))
- return result
-
-# _parse_requirement parses a single requirement line.
-def _parse_requirement(requirement_line):
- split = requirement_line.split("==")
- requirement = {}
-
- # Removing the extras (https://www.python.org/dev/peps/pep-0508/#extras)
- # from the requirement name is fine since it's expected that the
- # requirements.txt was compiled with pip-tools, which includes the extras as
- # direct dependencies.
- name = _remove_extras_from_name(split[0])
- requirement["name"] = name
- if len(split) == 1:
- return struct(**requirement)
- split = split[1].split(" ")
- requirement["version"] = split[0]
- if len(split) == 1:
- return struct(**requirement)
- args = split[1:]
- hashes = []
- for arg in args:
- arg = arg.strip()
-
- # Skip empty arguments.
- if len(arg) == 0:
- continue
-
- # Halt processing if it hits a comment.
- if arg.startswith("#"):
- break
- if arg.startswith("--hash="):
- hashes.append(arg[len("--hash="):])
- requirement["hashes"] = hashes
- return struct(**requirement)
-
-# _remove_extras_from_name removes the [extras] from a requirement.
-# https://www.python.org/dev/peps/pep-0508/#extras
-def _remove_extras_from_name(name):
- bracket_index = name.find("[")
- if bracket_index == -1:
- return name
- return name[:bracket_index]
-
-# _get_wheels returns the wheel distributions for the given requirements. It
-# uses a few different strategies depending on whether compiled wheel
-# distributions exist on the remote index or not. The order in which it
-# operates:
-#
-# 1. Try to use the platform-independent compiled wheel (*-none-any.whl).
-# 2. Try to use the first match of the linux-dependent compiled wheel from the
-# sorted releases list. This is valid as it's deterministic and the Python
-# extension for Gazelle doesn't support other platform-specific wheels
-# (one must use manual means to accomplish platform-specific dependency
-# resolution).
-# 3. Use the published source for the wheel.
-def _get_wheels(rctx, python_interpreter, pythonpath, requirements):
- wheels = []
- to_build = []
- for requirement in requirements:
- if not hasattr(requirement, "hashes"):
- if hasattr(requirement, "name") and requirement.name.startswith("#"):
- # This is a comment in the requirements file.
- continue
- else:
- fail("missing requirement hash for {}-{}: use pip-tools to produce a locked file".format(
- requirement.name,
- requirement.version,
- ))
-
- wheel = {}
- wheel["name"] = requirement.name
-
- requirement_info_url = "{index_base}/{name}/{version}/json".format(
- index_base = rctx.attr.index_base,
- name = requirement.name,
- version = requirement.version,
- )
- requirement_info_path = "{}_info.json".format(requirement.name)
-
- # TODO(f0rmiga): if the logs are too spammy, use rctx.execute with
- # Python to perform the downloads since it's impossible to get the
- # checksums of these JSON files and there's no option to mute Bazel
- # here.
- rctx.download(requirement_info_url, output = requirement_info_path)
- requirement_info = json.decode(rctx.read(requirement_info_path))
- if requirement.version in requirement_info["releases"]:
- wheel["version"] = requirement.version
- elif requirement.version.endswith(".0") and requirement.version[:-len(".0")] in requirement_info["releases"]:
- wheel["version"] = requirement.version[:-len(".0")]
- else:
- fail("missing requirement version \"{}\" for wheel \"{}\" in fetched releases: available {}".format(
- requirement.version,
- requirement.name,
- [version for version in requirement_info["releases"]],
- ))
- releases = sorted(requirement_info["releases"][wheel["version"]], key = _sort_release_by_url)
- (wheel_url, sha256) = _search_url(releases, "-none-any.whl")
-
- # TODO(f0rmiga): handle PEP 600.
- # https://www.python.org/dev/peps/pep-0600/
- if not wheel_url:
- # Search for the Linux tag as defined in PEP 599.
- (wheel_url, sha256) = _search_url(releases, "manylinux2014_x86_64")
- if not wheel_url:
- # Search for the Linux tag as defined in PEP 571.
- (wheel_url, sha256) = _search_url(releases, "manylinux2010_x86_64")
- if not wheel_url:
- # Search for the Linux tag as defined in PEP 513.
- (wheel_url, sha256) = _search_url(releases, "manylinux1_x86_64")
- if not wheel_url:
- # Search for the MacOS tag
- (wheel_url, sha256) = _search_url(releases, "macosx_10_9_x86_64")
-
- if wheel_url:
- wheel_path = wheel_url.split("/")[-1]
- rctx.download(wheel_url, output = wheel_path, sha256 = sha256)
- wheel["path"] = wheel_path
- else:
- extension = ".tar.gz"
- (src_url, sha256) = _search_url(releases, extension)
- if not src_url:
- extension = ".zip"
- (src_url, sha256) = _search_url(releases, extension)
- if not src_url:
- fail("requirement URL for {}-{} not found".format(requirement.name, wheel["version"]))
- rctx.download_and_extract(src_url, sha256 = sha256)
- sanitized_name = requirement.name.lower().replace("-", "_")
- requirement_path = src_url.split("/")[-1]
- requirement_path = requirement_path[:-len(extension)]
-
- # The resulting filename for the .whl file is not feasible to
- # predict as it has too many variations, so we defer it to the
- # Python globing to find the right file name since only one .whl
- # file should be generated by the compilation.
- wheel_path = "{}/**/*.whl".format(requirement_path)
- wheel["path"] = wheel_path
- to_build.append(requirement_path)
-
- wheels.append(json.encode(wheel))
-
- if len(to_build) > 0:
- res = rctx.execute(
- [python_interpreter, rctx.path(rctx.attr._builder)] + to_build,
- quiet = rctx.attr.quiet,
- environment = {
- # To avoid use local "pip.conf"
- "HOME": str(rctx.path("").realpath),
- # Make uses of pip to use the requested index
- "PIP_INDEX_URL": rctx.attr.pip_index_url,
- "PYTHONPATH": pythonpath,
- },
- )
- if res.return_code != 0:
- fail(res.stderr)
-
- return wheels
-
-# _sort_release_by_url is the custom function for the key property of the sorted
-# releases.
-def _sort_release_by_url(release):
- return release["url"]
-
-# _search_url searches for a release in the list of releases that has a url
-# matching the provided extension.
-def _search_url(releases, extension):
- for release in releases:
- url = release["url"]
- if url.find(extension) >= 0:
- return (url, release["digests"]["sha256"])
- return (None, None)
-
-modules_mapping = repository_rule(
+modules_mapping = rule(
_modules_mapping_impl,
attrs = {
- "build_wheel_version": attr.string(
- default = "0.5.1",
- doc = "The build wheel version.",
+ "modules_mapping_name": attr.string(
+ default = "modules_mapping.json",
+ doc = "The name for the output JSON file.",
+ mandatory = False,
),
- "generate_timeout": attr.int(
- default = 30,
- doc = "The timeout for the generator.py command.",
- ),
- "index_base": attr.string(
- default = "https://pypi.org/pypi",
- doc = "The base URL used for querying releases data as JSON.",
- ),
- "install_build_timeout": attr.int(
- default = 30,
- doc = "The timeout for the `pip install build` command.",
- ),
- "pip_index_url": attr.string(
- default = "https://pypi.python.org/simple",
- doc = "The index URL used for any pip install actions",
- ),
- "python_interpreter": attr.label(
- allow_single_file = True,
- doc = "If set, uses the custom-built Python interpreter, otherwise, uses the system one.",
- ),
- "quiet": attr.bool(
- default = True,
- doc = "Toggle this attribute to get verbose output from this rule.",
- ),
- "requirements": attr.label(
- allow_single_file = True,
- doc = "The requirements.txt file with hashes locked using pip-tools.",
+ "wheels": attr.label_list(
+ allow_files = True,
+ doc = "The list of wheels, usually the 'all_whl_requirements' from @<pip_repository>//:requirements.bzl",
mandatory = True,
),
- "setuptools_wheel_version": attr.string(
- default = "v57.5.0",
- doc = "The setuptools wheel version.",
- ),
- "_builder": attr.label(
- allow_single_file = True,
- default = "//gazelle/modules_mapping:builder.py",
- ),
"_generator": attr.label(
- allow_single_file = True,
- default = "//gazelle/modules_mapping:generator.py",
+ cfg = "host",
+ default = "//gazelle/modules_mapping:generator",
+ executable = True,
),
},
doc = "Creates a modules_mapping.json file for mapping module names to wheel distribution names.",
diff --git a/gazelle/modules_mapping/generator.py b/gazelle/modules_mapping/generator.py
index 6ee654c..b93f968 100644
--- a/gazelle/modules_mapping/generator.py
+++ b/gazelle/modules_mapping/generator.py
@@ -1,4 +1,3 @@
-import glob
import json
import pathlib
import sys
@@ -7,29 +6,19 @@
# Generator is the modules_mapping.json file generator.
class Generator:
- stdout = None
stderr = None
+ output_file = None
- def __init__(self, stdout, stderr):
- self.stdout = stdout
+ def __init__(self, stderr, output_file):
self.stderr = stderr
+ self.output_file = output_file
# dig_wheel analyses the wheel .whl file determining the modules it provides
# by looking at the directory structure.
- def dig_wheel(self, wheel):
+ def dig_wheel(self, whl):
mapping = {}
- wheel_paths = glob.glob(wheel["path"])
- assert len(wheel_paths) != 0, "wheel not found for {}: searched for {}".format(
- wheel["name"],
- wheel["path"],
- )
- wheel_path = wheel_paths[0]
- assert (
- "UNKNOWN" not in wheel_path
- ), "unknown-named wheel found for {}: possibly bad compilation".format(
- wheel["name"],
- )
- with zipfile.ZipFile(wheel_path, "r") as zip_file:
+ wheel_name = get_wheel_name(whl)
+ with zipfile.ZipFile(whl, "r") as zip_file:
for path in zip_file.namelist():
if is_metadata(path):
continue
@@ -40,32 +29,43 @@
# where this file is as an importable package.
if path.endswith("/__init__.py"):
module = path[: -len("/__init__.py")].replace("/", ".")
- mapping[module] = wheel["name"]
+ mapping[module] = wheel_name
# Always index the module file.
if ext == ".so":
# Also remove extra metadata that is embedded as part of
# the file name as an extra extension.
ext = "".join(pathlib.Path(path).suffixes)
module = path[: -len(ext)].replace("/", ".")
- mapping[module] = wheel["name"]
+ mapping[module] = wheel_name
return mapping
# run is the entrypoint for the generator.
def run(self, wheels):
mapping = {}
- for wheel_json in wheels:
- wheel = json.loads(wheel_json)
+ for whl in wheels:
try:
- mapping.update(self.dig_wheel(wheel))
+ mapping.update(self.dig_wheel(whl))
except AssertionError as error:
print(error, file=self.stderr)
return 1
mapping_json = json.dumps(mapping)
- print(mapping_json, file=self.stdout)
- self.stdout.flush()
+ with open(self.output_file, "w") as f:
+ f.write(mapping_json)
return 0
+def get_wheel_name(path):
+ pp = pathlib.PurePath(path)
+ if pp.suffix != ".whl":
+ raise RuntimeError(
+ "{} is not a valid wheel file name: the wheel doesn't follow ".format(
+ pp.name
+ )
+ + "https://www.python.org/dev/peps/pep-0427/#file-name-convention"
+ )
+ return pp.name[: pp.name.find("-")]
+
+
# is_metadata checks if the path is in a metadata directory.
# Ref: https://www.python.org/dev/peps/pep-0427/#file-contents.
def is_metadata(path):
@@ -74,6 +74,7 @@
if __name__ == "__main__":
- wheels = sys.argv[1:]
- generator = Generator(sys.stdout, sys.stderr)
+ output_file = sys.argv[1]
+ wheels = sys.argv[2:]
+ generator = Generator(sys.stderr, output_file)
exit(generator.run(wheels))
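For reference, a self-contained sketch of the extraction approach the updated `dig_wheel` relies on: a wheel is a zip archive, entries ending in `/__init__.py` mark importable packages (the real generator also indexes compiled `.so` modules), and the distribution name is the first dash-separated component of the wheel file name per PEP 427. The archive layout and names below are made up for illustration:

```python
import io
import zipfile

# Build a tiny in-memory archive mimicking a wheel's layout. The distribution
# "foo" and its contents are hypothetical.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("foo/__init__.py", "")
    zf.writestr("foo/bar/__init__.py", "")
    zf.writestr("foo-1.0.dist-info/METADATA", "")

wheel_file_name = "foo-1.0-py3-none-any.whl"
# PEP 427: the distribution name is the first dash-separated component.
distribution = wheel_file_name[: wheel_file_name.find("-")]

mapping = {}
buf.seek(0)
with zipfile.ZipFile(buf) as zf:
    for path in zf.namelist():
        if ".dist-info/" in path:
            # Skip metadata directories; the real generator uses is_metadata().
            continue
        if path.endswith("/__init__.py"):
            module = path[: -len("/__init__.py")].replace("/", ".")
            mapping[module] = distribution

print(mapping)  # {'foo': 'foo', 'foo.bar': 'foo'}
```

Because the wheels fetched by the pip repository already contain everything needed to derive this mapping, the rule no longer has to download sources or build anything itself, which is what allows builder.py and the pip/build/setuptools bootstrap to be removed.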