feat(pypi): generate filegroup with all extracted wheel files (#3011)

Adds a filegroup with all the files that came from the extracted wheel.

This has two benefits over using `whl_filegroup`: it avoids copying the wheel
and makes the set of files directly visible to the analysis phase.

Some wheels are multiple gigabytes in size (e.g. torch, cuda, tensorflow), so
avoiding the copy and archive processing saves a decent amount of time.

Knowing the specific files at analysis time is generally beneficial. The
particular case I ran into was that the CC rules were unhappy with a
TreeArtifact of header files because they couldn't enforce some check about
who was properly providing headers that were included (layering check?).

Another example is using the unused_inputs_list optimization, which allows
an action to ignore inputs that aren't actually used. E.g. an action could
take all the wheel's files as inputs, only care about the headers, and then
tell bazel all the non-header files aren't relevant, and thus changes to
other files don't re-run the thing that only cares about headers.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b57af6..c1d3a43 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -90,6 +90,10 @@
 * (pypi) To configure the environment for `requirements.txt` evaluation, use the newly added
   developer preview of the `pip.default` tag class. Only `rules_python` and root modules can use
   this feature. You can also configure custom `config_settings` using `pip.default`.
+* (pypi) PyPI dependencies now expose an `:extracted_whl_files` filegroup target
+  of all the files extracted from the wheel. This can be used in lieu of
+  {obj}`whl_filegroup` to avoid copying/extracting the wheel multiple times to
+  get a subset of its files.
 * (gazelle) New directive `gazelle:python_generate_pyi_deps`; when `true`,
   dependencies added to satisfy type-only imports (`if TYPE_CHECKING`) and type
   stub packages are added to `pyi_deps` instead of `deps`.
diff --git a/docs/pypi/use.md b/docs/pypi/use.md
index 6212097..a668167 100644
--- a/docs/pypi/use.md
+++ b/docs/pypi/use.md
@@ -40,9 +40,16 @@
 * `@pypi//numpy:data` - the {obj}`filegroup` for all of the extra files that are included
   as data in the `pkg` target.
 * `@pypi//numpy:dist_info` - the {obj}`filegroup` for all of the files in the `<pkg prefix with version>.distinfo` directory.
+* `@pypi//numpy:extracted_whl_files` - a {obj}`filegroup` of all the files
+  extracted from the whl file.
 * `@pypi//numpy:whl` - the {obj}`filegroup` that is the `.whl` file itself, which includes all
   transitive dependencies via the {attr}`filegroup.data` attribute.
 
+:::{versionadded} VERSION_NEXT_FEATURE
+
+The `:extracted_whl_files` target was added.
+:::
+
 ## Entry points
 
 If you would like to access [entry points][whl_ep], see the `py_console_script_binary` rule documentation,
diff --git a/python/private/pypi/labels.bzl b/python/private/pypi/labels.bzl
index 73df07b..22161b1 100644
--- a/python/private/pypi/labels.bzl
+++ b/python/private/pypi/labels.bzl
@@ -14,6 +14,7 @@
 
 """Constants used by parts of pip_repository for naming libraries and wheels."""
 
+EXTRACTED_WHEEL_FILES = "extracted_whl_files"
 WHEEL_FILE_PUBLIC_LABEL = "whl"
 WHEEL_FILE_IMPL_LABEL = "_whl"
 PY_LIBRARY_PUBLIC_LABEL = "pkg"
diff --git a/python/private/pypi/pkg_aliases.bzl b/python/private/pypi/pkg_aliases.bzl
index d71c37c..4d3cc61 100644
--- a/python/private/pypi/pkg_aliases.bzl
+++ b/python/private/pypi/pkg_aliases.bzl
@@ -79,6 +79,7 @@
     ":labels.bzl",
     "DATA_LABEL",
     "DIST_INFO_LABEL",
+    "EXTRACTED_WHEEL_FILES",
     "PY_LIBRARY_IMPL_LABEL",
     "PY_LIBRARY_PUBLIC_LABEL",
     "WHEEL_FILE_IMPL_LABEL",
@@ -151,6 +152,7 @@
         WHEEL_FILE_PUBLIC_LABEL: WHEEL_FILE_IMPL_LABEL if group_name else WHEEL_FILE_PUBLIC_LABEL,
         DATA_LABEL: DATA_LABEL,
         DIST_INFO_LABEL: DIST_INFO_LABEL,
+        EXTRACTED_WHEEL_FILES: EXTRACTED_WHEEL_FILES,
     } | {
         x: x
         for x in extra_aliases or []
diff --git a/python/private/pypi/whl_library.bzl b/python/private/pypi/whl_library.bzl
index de5fcb9..15bb680 100644
--- a/python/private/pypi/whl_library.bzl
+++ b/python/private/pypi/whl_library.bzl
@@ -248,6 +248,7 @@
     environment = _create_repository_execution_environment(rctx, python_interpreter, logger = logger)
 
     whl_path = None
+    sdist_filename = None
     if rctx.attr.whl_file:
         rctx.watch(rctx.attr.whl_file)
         whl_path = rctx.path(rctx.attr.whl_file)
@@ -277,6 +278,8 @@
         if filename.endswith(".whl"):
             whl_path = rctx.path(filename)
         else:
+            sdist_filename = filename
+
             # It is an sdist and we need to tell PyPI to use a file in this directory
             # and, allow getting build dependencies from PYTHONPATH, which we
             # setup in this repository rule, but still download any necessary
@@ -382,6 +385,7 @@
 
         build_file_contents = generate_whl_library_build_bazel(
             name = whl_path.basename,
+            sdist_filename = sdist_filename,
             dep_template = rctx.attr.dep_template or "@{}{{name}}//:{{target}}".format(rctx.attr.repo_prefix),
             entry_points = entry_points,
             metadata_name = metadata.name,
@@ -455,6 +459,7 @@
 
         build_file_contents = generate_whl_library_build_bazel(
             name = whl_path.basename,
+            sdist_filename = sdist_filename,
             dep_template = rctx.attr.dep_template or "@{}{{name}}//:{{target}}".format(rctx.attr.repo_prefix),
             entry_points = entry_points,
             # TODO @aignas 2025-05-17: maybe have a build flag for this instead
diff --git a/python/private/pypi/whl_library_targets.bzl b/python/private/pypi/whl_library_targets.bzl
index 95c1f5e..aed5bc7 100644
--- a/python/private/pypi/whl_library_targets.bzl
+++ b/python/private/pypi/whl_library_targets.bzl
@@ -24,6 +24,7 @@
     ":labels.bzl",
     "DATA_LABEL",
     "DIST_INFO_LABEL",
+    "EXTRACTED_WHEEL_FILES",
     "PY_LIBRARY_IMPL_LABEL",
     "PY_LIBRARY_PUBLIC_LABEL",
     "WHEEL_ENTRY_POINT_PREFIX",
@@ -33,6 +34,16 @@
 load(":namespace_pkgs.bzl", _create_inits = "create_inits")
 load(":pep508_deps.bzl", "deps")
 
+# Bazel build/repo marker files; these are not part of the extracted wheel.
+_BAZEL_REPO_FILE_GLOBS = [
+    "BUILD",
+    "BUILD.bazel",
+    "MODULE.bazel",
+    "REPO.bazel",
+    "WORKSPACE",
+    "WORKSPACE.bazel",
+]
+
 def whl_library_targets_from_requires(
         *,
         name,
@@ -97,14 +108,12 @@
         *,
         name,
         dep_template,
+        sdist_filename = None,
         data_exclude = [],
         srcs_exclude = [],
         tags = [],
-        filegroups = {
-            DIST_INFO_LABEL: ["site-packages/*.dist-info/**"],
-            DATA_LABEL: ["data/**"],
-        },
         dependencies = [],
+        filegroups = None,
         dependencies_by_platform = {},
         dependencies_with_markers = {},
         group_deps = [],
@@ -129,14 +138,16 @@
             filegroup. This may be also parsed to generate extra metadata.
         dep_template: {type}`str` The dep_template to use for dependency
             interpolation.
+        sdist_filename: {type}`str | None` If the wheel was built from an sdist,
+            the filename of the sdist.
         tags: {type}`list[str]` The tags set on the `py_library`.
         dependencies: {type}`list[str]` A list of dependencies.
         dependencies_by_platform: {type}`dict[str, list[str]]` A list of
             dependencies by platform key.
         dependencies_with_markers: {type}`dict[str, str]` A marker to evaluate
             in order for the dep to be included.
-        filegroups: {type}`dict[str, list[str]]` A dictionary of the target
-            names and the glob matches.
+        filegroups: {type}`dict[str, list[str]] | None` A dictionary of the target
+            names and the glob matches. If `None`, defaults will be used.
         group_name: {type}`str` name of the dependency group (if any) which
             contains this library. If set, this library will behave as a shim
             to group implementation rules which will provide simultaneously
@@ -169,10 +180,28 @@
     tags = sorted(tags)
     data = [] + data
 
-    for filegroup_name, glob in filegroups.items():
+    if filegroups == None:
+        filegroups = {
+            EXTRACTED_WHEEL_FILES: dict(
+                include = ["**"],
+                # Parenthesize the conditional: `x if c else y` binds looser than `+`.
+                exclude = _BAZEL_REPO_FILE_GLOBS + (
+                    [sdist_filename] if sdist_filename else []
+                ),
+            ),
+            DIST_INFO_LABEL: dict(
+                include = ["site-packages/*.dist-info/**"],
+            ),
+            DATA_LABEL: dict(
+                include = ["data/**"],
+            ),
+        }
+
+    for filegroup_name, glob_kwargs in filegroups.items():
+        glob_kwargs = {"allow_empty": True} | glob_kwargs
         native.filegroup(
             name = filegroup_name,
-            srcs = native.glob(glob, allow_empty = True),
+            srcs = native.glob(**glob_kwargs),
             visibility = ["//visibility:public"],
         )
 
diff --git a/python/private/whl_filegroup/whl_filegroup.bzl b/python/private/whl_filegroup/whl_filegroup.bzl
index d2e6e43..c52211b 100644
--- a/python/private/whl_filegroup/whl_filegroup.bzl
+++ b/python/private/whl_filegroup/whl_filegroup.bzl
@@ -42,7 +42,14 @@
     includes = ["numpy_includes/numpy/core/include"],
     deps = ["@rules_python//python/cc:current_py_cc_headers"],
 )
+
 ```
+
+:::{seealso}
+
+The `:extracted_whl_files` target, which is a filegroup of all the files
+from the already extracted whl file.
+:::
 """,
     attrs = {
         "pattern": attr.string(default = "", doc = "Only file paths matching this regex pattern will be extracted."),
diff --git a/tests/pypi/pkg_aliases/pkg_aliases_test.bzl b/tests/pypi/pkg_aliases/pkg_aliases_test.bzl
index 123ee72..3fd08c3 100644
--- a/tests/pypi/pkg_aliases/pkg_aliases_test.bzl
+++ b/tests/pypi/pkg_aliases/pkg_aliases_test.bzl
@@ -43,6 +43,7 @@
         "whl": "@repo//:whl",
         "data": "@repo//:data",
         "dist_info": "@repo//:dist_info",
+        "extracted_whl_files": "@repo//:extracted_whl_files",
         "my_special": "@repo//:my_special",
     }
 
@@ -243,6 +244,10 @@
             "actual": "@repo//:dist_info",
         },
         {
+            "name": "extracted_whl_files",
+            "actual": "@repo//:extracted_whl_files",
+        },
+        {
             "name": "pkg",
             "actual": "//_groups:my_group_pkg",
         },
diff --git a/tests/pypi/whl_library_targets/whl_library_targets_tests.bzl b/tests/pypi/whl_library_targets/whl_library_targets_tests.bzl
index bc58be9..ec7ca63 100644
--- a/tests/pypi/whl_library_targets/whl_library_targets_tests.bzl
+++ b/tests/pypi/whl_library_targets/whl_library_targets_tests.bzl
@@ -27,9 +27,10 @@
 def _test_filegroups(env):
     calls = []
 
-    def glob(match, *, allow_empty):
+    def glob(include, *, exclude = [], allow_empty):
+        _ = exclude  # @unused
         env.expect.that_bool(allow_empty).equals(True)
-        return match
+        return include
 
     whl_library_targets(
         name = "",
@@ -41,7 +42,7 @@
         rules = struct(),
     )
 
-    env.expect.that_collection(calls).contains_exactly([
+    env.expect.that_collection(calls, expr = "filegroup calls").contains_exactly([
         {
             "name": "dist_info",
             "srcs": ["site-packages/*.dist-info/**"],
@@ -53,6 +54,11 @@
             "visibility": ["//visibility:public"],
         },
         {
+            "name": "extracted_whl_files",
+            "srcs": ["**"],
+            "visibility": ["//visibility:public"],
+        },
+        {
             "name": "whl",
             "srcs": [""],
             "data": [],