refactor: optimize venv building for namespace packages (#3454)

When implicit namespace packages are used, it's common for multiple
distributions to install into the same directory, triggering the expensive
conflict merging logic. This can be observed with our doc builds, where
`sphinxcontrib` is a namespace package that 7 distributions install into.

To fix, treat top-level directories that have an importable name and don't
have an `__init__`-looking file as implicit namespace packages and mark them
as disallowed from being directly linked. The importable name check is to
exclude dist-info directories.

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
diff --git a/python/private/venv_runfiles.bzl b/python/private/venv_runfiles.bzl
index 0c79ea8..7ff5c85 100644
--- a/python/private/venv_runfiles.bzl
+++ b/python/private/venv_runfiles.bzl
@@ -13,6 +13,7 @@
     "VenvSymlinkEntry",
     "VenvSymlinkKind",
 )
+load(":py_internal.bzl", "py_internal")
 
 def create_venv_app_files(ctx, deps, venv_dir_map):
     """Creates the tree of app-specific files for a venv for a binary.
@@ -231,6 +232,21 @@
                 if venv_path not in keep_map:
                     keep_map[venv_path] = file
 
+def _is_importable_name(name):
+    # regex_match requires Bazel 8+
+    if hasattr(py_internal, "regex_match"):
+        # (?U) activates unicode matching (Python allows most unicode
+        # in module names / identifiers).
+        # \w matches alphanumeric and underscore.
+        # NOTE: regex_match has an implicit ^ and $
+        return py_internal.regex_match(name, "(?U)\\w+")
+    else:
+        # Otherwise, use a rough heuristic that should catch most cases.
+        return (
+            "." not in name and
+            "-" not in name
+        )
+
 def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
     """Compute the VenvSymlinkEntry objects for a library.
 
@@ -270,6 +286,9 @@
     # List of (File, str venv_path) tuples
     files_left_to_link = []
 
+    # dict[str dirname, bool is_namespace_package]
+    namespace_package_dirs = {}
+
     # We want to minimize the number of files symlinked. Ideally, only the
     # top-level directories are symlinked. Unfortunately, shared libraries
     # complicate matters: if a shared library's directory is linked, then the
@@ -310,6 +329,29 @@
         else:
             files_left_to_link.append((src, venv_path))
 
+        top_level_dirname, _, tail = venv_path.partition("/")
+        if (
+            # If it's already not directly linkable, nothing to do
+            not cannot_be_linked_directly.get(top_level_dirname, False) and
+            # If it's already known to not be an implicit namespace package, skip
+            namespace_package_dirs.get(top_level_dirname, True) and
+            # It must be an importable name to be an implicit namespace package
+            _is_importable_name(top_level_dirname)
+        ):
+            namespace_package_dirs.setdefault(top_level_dirname, True)
+
+            # Looking for `__init__.` isn't 100% correct, as it'll match e.g.
+            # `__init__.pyi`, but it's close enough.
+            if "/" not in tail and tail.startswith("__init__."):
+                namespace_package_dirs[top_level_dirname] = False
+
+    # We treat namespace packages as a hint that other distributions may
+    # install into the same directory. As such, we avoid linking them directly
+    # to avoid conflict merging later.
+    for dirname, is_namespace_package in namespace_package_dirs.items():
+        if is_namespace_package:
+            cannot_be_linked_directly[dirname] = True
+
     # At this point, venv_symlinks has entries for the shared libraries
     # and cannot_be_linked_directly has the directories that cannot be
     # directly linked. Next, we loop over the remaining files and group
diff --git a/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl b/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl
index 486293b..e92c0aa 100644
--- a/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl
+++ b/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl
@@ -219,6 +219,7 @@
     empty_files(
         name = name + "_files",
         paths = [
+            "site-packages/pkg2/__init__.py",
             "site-packages/pkg2/a.txt",
             "site-packages/pkg2/b_mod.so",
         ],
@@ -248,6 +249,7 @@
             "pkg2",
             link_to_path = rr + "pkg2",
             files = [
+                "tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/__init__.py",
                 "tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/a.txt",
                 "tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/b_mod.so",
             ],
@@ -264,6 +266,70 @@
     # The point of the optimization is to avoid having to merge conflicts.
     env.expect.that_collection(conflicts).contains_exactly([])
 
+def _test_optimized_grouping_implicit_namespace_packages(name):
+    empty_files(
+        name = name + "_files",
+        paths = [
+            # NOTE: The name mixes letters, an underscore, and a digit to
+            # verify that importable-name matching accepts it.
+            "site-packages/name_space9/part1/foo.py",
+            "site-packages/name_space9/part2/bar.py",
+            "site-packages/name_space9-1.0.dist-info/METADATA",
+        ],
+    )
+    analysis_test(
+        name = name,
+        impl = _test_optimized_grouping_implicit_namespace_packages_impl,
+        target = name + "_files",
+    )
+
+_tests.append(_test_optimized_grouping_implicit_namespace_packages)
+
+def _test_optimized_grouping_implicit_namespace_packages_impl(env, target):
+    test_ctx = _ctx(workspace_name = env.ctx.workspace_name)
+    entries = get_venv_symlinks(
+        test_ctx,
+        target.files.to_list(),
+        package = "pkg3",
+        version_str = "1.0",
+        site_packages_root = env.ctx.label.package + "/site-packages",
+    )
+    actual = _venv_symlinks_from_entries(entries)
+
+    rr = "{}/{}/site-packages/".format(test_ctx.workspace_name, env.ctx.label.package)
+    expected = [
+        _venv_symlink(
+            "name_space9/part1",
+            link_to_path = rr + "name_space9/part1",
+            files = [
+                "tests/venv_site_packages_libs/app_files_building/site-packages/name_space9/part1/foo.py",
+            ],
+        ),
+        _venv_symlink(
+            "name_space9/part2",
+            link_to_path = rr + "name_space9/part2",
+            files = [
+                "tests/venv_site_packages_libs/app_files_building/site-packages/name_space9/part2/bar.py",
+            ],
+        ),
+        _venv_symlink(
+            "name_space9-1.0.dist-info",
+            link_to_path = rr + "name_space9-1.0.dist-info",
+            files = [
+                "tests/venv_site_packages_libs/app_files_building/site-packages/name_space9-1.0.dist-info/METADATA",
+            ],
+        ),
+    ]
+    expected = sorted(expected, key = lambda e: (e.link_to_path, e.venv_path))
+    env.expect.that_collection(
+        actual,
+    ).contains_exactly(expected)
+
+    _, conflicts = build_link_map(test_ctx, entries, return_conflicts = True)
+
+    # The optimization exists to avoid conflict merging, so expect no conflicts.
+    env.expect.that_collection(conflicts).contains_exactly([])
+
 def _test_package_version_filtering(name):
     analysis_test(
         name = name,