fix(pipstar): correctly handle platlib and purelib in .data (#3501)
Some packages (e.g. `libclang`) use `platlib` in the `.data` directory to
hold their main files. This PR implements correct handling of such packages
by recursively merging the two trees. If there are any collisions, we print
an error and stop. That is unlikely, but it is better to be safe. Users can
patch the failure to be a warning if necessary.
In order to make this more testable, move the functions to a separate
file.
Fixes #3500
Fixes #2949
To be cherry-picked as part of #3466
diff --git a/examples/pip_parse/BUILD.bazel b/examples/pip_parse/BUILD.bazel
index 6ed8d26..37a25fe 100644
--- a/examples/pip_parse/BUILD.bazel
+++ b/examples/pip_parse/BUILD.bazel
@@ -79,5 +79,8 @@
"WHEEL_DIST_INFO_CONTENTS": "$(rootpaths @pypi//requests:dist_info)",
"YAMLLINT_ENTRY_POINT": "$(rlocationpath :yamllint)",
},
- deps = ["@rules_python//python/runfiles"],
+ deps = [
+ "@pypi//libclang",
+ "@rules_python//python/runfiles",
+ ],
)
diff --git a/examples/pip_parse/requirements.in b/examples/pip_parse/requirements.in
index 9d9e766..e4af3b1 100644
--- a/examples/pip_parse/requirements.in
+++ b/examples/pip_parse/requirements.in
@@ -3,3 +3,4 @@
yamllint~=1.28.0
sphinx
sphinxcontrib-serializinghtml
+libclang
diff --git a/examples/pip_parse/requirements_lock.txt b/examples/pip_parse/requirements_lock.txt
index dc34b45..13a2bba 100644
--- a/examples/pip_parse/requirements_lock.txt
+++ b/examples/pip_parse/requirements_lock.txt
@@ -42,6 +42,18 @@
--hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
--hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
# via sphinx
+libclang==18.1.1 \
+ --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \
+ --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \
+ --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \
+ --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \
+ --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \
+ --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \
+ --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \
+ --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \
+ --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \
+ --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe
+ # via -r requirements.in
markupsafe==2.1.3 \
--hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \
--hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \
diff --git a/examples/pip_parse/requirements_windows.txt b/examples/pip_parse/requirements_windows.txt
index 78c1a45..7a1329d 100644
--- a/examples/pip_parse/requirements_windows.txt
+++ b/examples/pip_parse/requirements_windows.txt
@@ -46,6 +46,18 @@
--hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
--hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
# via sphinx
+libclang==18.1.1 \
+ --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \
+ --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \
+ --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \
+ --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \
+ --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \
+ --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \
+ --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \
+ --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \
+ --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \
+ --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe
+ # via -r requirements.in
markupsafe==2.1.3 \
--hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \
--hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \
diff --git a/python/private/pypi/BUILD.bazel b/python/private/pypi/BUILD.bazel
index 96ae42f..0a97e5f 100644
--- a/python/private/pypi/BUILD.bazel
+++ b/python/private/pypi/BUILD.bazel
@@ -421,6 +421,16 @@
)
bzl_library(
+ name = "whl_extract_bzl",
+ srcs = ["whl_extract.bzl"],
+ deps = [
+ ":whl_metadata_bzl",
+ "//python/private:repo_utils_bzl",
+ "@rules_python_internal//:rules_python_config_bzl",
+ ],
+)
+
+bzl_library(
name = "whl_library_alias_bzl",
srcs = ["whl_library_alias.bzl"],
deps = [
@@ -440,6 +450,7 @@
":patch_whl_bzl",
":pep508_requirement_bzl",
":pypi_repo_utils_bzl",
+ ":whl_extract_bzl",
":whl_metadata_bzl",
":whl_target_platforms_bzl",
"//python/private:auth_bzl",
diff --git a/python/private/pypi/whl_extract.bzl b/python/private/pypi/whl_extract.bzl
new file mode 100644
index 0000000..6b2e050
--- /dev/null
+++ b/python/private/pypi/whl_extract.bzl
@@ -0,0 +1,109 @@
+"""A simple whl extractor."""
+
+load("@rules_python_internal//:rules_python_config.bzl", rp_config = "config")
+load("//python/private:repo_utils.bzl", "repo_utils")
+load(":whl_metadata.bzl", "find_whl_metadata")
+
+def whl_extract(rctx, *, whl_path, logger):
+ """Extract a whl in Starlark and move its `.data` subdirectories into place.
+
+ Args:
+ rctx: the repository ctx used to extract the archive and to move files.
+ whl_path: the whl path to extract; output goes to the sibling `site-packages` dir.
+ logger: The logger to use; passed to find_whl_metadata and used for debug logs.
+ """
+ install_dir_path = whl_path.dirname.get_child("site-packages")
+ repo_utils.extract(
+ rctx,
+ archive = whl_path,
+ output = install_dir_path,
+ supports_whl_extraction = rp_config.supports_whl_extraction,
+ )
+ metadata_file = find_whl_metadata(
+ install_dir = install_dir_path,
+ logger = logger,
+ )
+
+ # Get the <prefix>.dist-info dir and record which installer produced it
+ dist_info_dir = metadata_file.dirname
+ rctx.file(
+ dist_info_dir.get_child("INSTALLER"),
+ "https://github.com/bazel-contrib/rules_python#pipstar",
+ )
+ repo_root_dir = whl_path.dirname
+
+ # Get the sibling <prefix>.data dir by swapping the `.dist-info` suffix
+ data_dir = dist_info_dir.dirname.get_child(dist_info_dir.basename[:-len(".dist-info")] + ".data")
+ if data_dir.exists:
+ for prefix, dest_prefix in {
+ # https://docs.python.org/3/library/sysconfig.html#posix-prefix
+ # We are taking this from the legacy whl installer config
+ "data": "data",
+ "headers": "include",
+ # Both `platlib` and `purelib` map to `site-packages`, which already holds
+ # the extracted wheel contents, so directory collisions are possible and the
+ # trees are merged instead of renamed wholesale. The merge has to be
+ # reasonably efficient because some packages put everything explicitly under
+ # `platlib` or `purelib` instead of the top level (e.g. libclang wheel).
+ "platlib": "site-packages",
+ "purelib": "site-packages",
+ "scripts": "bin",
+ }.items():
+ src = data_dir.get_child(prefix)
+ if not src.exists:
+ # The prefix does not exist in the wheel, we can continue
+ continue
+
+ for (src, dest) in merge_trees(src, repo_root_dir.get_child(dest_prefix)):
+ logger.debug(lambda: "Renaming: {} -> {}".format(src, dest))
+ rctx.rename(src, dest)
+
+ # TODO @aignas 2025-12-16: when moving scripts to `bin`, rewrite the #!python
+ # shebang to be something else, for inspiration look at the hermetic
+ # toolchain wrappers
+
+ # Ensure that there is no data dir left
+ rctx.delete(data_dir)
+
+def merge_trees(src, dest):
+    """Plan the renames needed to merge the src tree into the dest tree.
+
+    Walks both trees one level per pass; an entry whose destination does not exist is
+    planned as a single rename. Fails on collisions or if the depth exceeds 10000.
+
+    Args:
+        src: {type}`path` a src path to rename.
+        dest: {type}`path` a dest path to rename to.
+
+    Returns:
+        A list of tuples for src and destination paths.
+    """
+    ret = []
+    remaining = [(src, dest)]
+    collisions = []
+    for _ in range(10000):
+        if collisions or not remaining:
+            break
+
+        tmp = []
+        for (s, d) in remaining:
+            if not d.exists:
+                ret.append((s, d))
+                continue
+            # The destination exists: only two directories can be merged.
+            if not s.is_dir or not d.is_dir:
+                collisions.append(s)
+                continue
+            # Descend one level; the children are handled in the next pass.
+            for file_or_dir in s.readdir():
+                tmp.append((file_or_dir, d.get_child(file_or_dir.basename)))
+
+        remaining = tmp
+    # A collision stops the walk early and may leave entries queued, so check it first.
+    if collisions:
+        fail("Detected collisions between {} and {}: {}".format(src, dest, collisions))
+
+    if remaining:
+        fail("Exceeded maximum directory depth of 10000 during tree merge.")
+
+    return ret
diff --git a/python/private/pypi/whl_library.bzl b/python/private/pypi/whl_library.bzl
index c368dea..3c4b6be 100644
--- a/python/private/pypi/whl_library.bzl
+++ b/python/private/pypi/whl_library.bzl
@@ -26,7 +26,8 @@
load(":patch_whl.bzl", "patch_whl")
load(":pep508_requirement.bzl", "requirement")
load(":pypi_repo_utils.bzl", "pypi_repo_utils")
-load(":whl_metadata.bzl", "find_whl_metadata", "whl_metadata")
+load(":whl_extract.bzl", "whl_extract")
+load(":whl_metadata.bzl", "whl_metadata")
load(":whl_target_platforms.bzl", "whl_target_platforms")
_CPPFLAGS = "CPPFLAGS"
@@ -265,48 +266,6 @@
env[_CPPFLAGS] = " ".join(cppflags)
return env
-def _extract_whl_star(rctx, *, whl_path, logger):
- install_dir_path = whl_path.dirname.get_child("site-packages")
- repo_utils.extract(
- rctx,
- archive = whl_path,
- output = install_dir_path,
- supports_whl_extraction = rp_config.supports_whl_extraction,
- )
- metadata_file = find_whl_metadata(
- install_dir = install_dir_path,
- logger = logger,
- )
-
- # Get the <prefix>.dist_info dir name
- dist_info_dir = metadata_file.dirname
- rctx.file(
- dist_info_dir.get_child("INSTALLER"),
- "https://github.com/bazel-contrib/rules_python#pipstar",
- )
- repo_root_dir = whl_path.dirname
-
- # Get the <prefix>.dist_info dir name
- data_dir = dist_info_dir.dirname.get_child(dist_info_dir.basename[:-len(".dist-info")] + ".data")
- if data_dir.exists:
- for prefix, dest in {
- # https://docs.python.org/3/library/sysconfig.html#posix-prefix
- # We are taking this from the legacy whl installer config
- "data": "data",
- "headers": "include",
- "platlib": "site-packages",
- "purelib": "site-packages",
- "scripts": "bin",
- }.items():
- src = data_dir.get_child(prefix)
- dest = repo_root_dir.get_child(dest)
- if src.exists:
- rctx.rename(src, dest)
-
- # TODO @aignas 2025-12-16: when moving scripts to `bin`, rewrite the #!python
- # shebang to be something else, for inspiration look at the hermetic
- # toolchain wrappers
-
def _extract_whl_py(rctx, *, python_interpreter, args, whl_path, environment, logger):
target_platforms = rctx.attr.experimental_target_platforms or []
if target_platforms:
@@ -448,7 +407,7 @@
)
if enable_pipstar_extract:
- _extract_whl_star(rctx, whl_path = whl_path, logger = logger)
+ whl_extract(rctx, whl_path = whl_path, logger = logger)
else:
_extract_whl_py(
rctx,