fix(pypi): change the parallelisation scheme for querying SimpleAPI (#2531)

Instead of querying everything in parallel and yielding a lot of 404
warnings, let's query the main index first and then query the other
indexes only for the packages that were not yet found.

What is more, we can print the value of
`experimental_index_url_overrides`
for the users to use.

Whilst at it, add a unit test to check the new logic.

Fixes #2100, since this is the best `rules_python` can do for now.

---------

Co-authored-by: Douglas Thor <dougthor42@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ae28a9..ad3f0a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,6 +56,10 @@
 * Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported
   version, per our Bazel support matrix. Earlier versions are not
   tested by CI, so functionality cannot be guaranteed.
+* ({bzl:obj}`pip.parse`) From now on, we will make fewer calls to indexes when
+  fetching the metadata from SimpleAPI. The calls will be done in parallel to
+  each index separately, so the extension evaluation time might slow down if
+  not using {bzl:obj}`pip.parse.experimental_index_url_overrides`.
 * ({bzl:obj}`pip.parse`) Only query SimpleAPI for packages that have
   sha values in the `requirements.txt` file.
 
diff --git a/python/private/pypi/extension.bzl b/python/private/pypi/extension.bzl
index d16a7cc..6409bcc 100644
--- a/python/private/pypi/extension.bzl
+++ b/python/private/pypi/extension.bzl
@@ -657,6 +657,11 @@
 https://packaging.python.org/en/latest/specifications/simple-repository-api/
 
 This is equivalent to `--extra-index-urls` `pip` option.
+
+:::{versionchanged} 1.1.0
+Starting with this version, we will iterate over each index specified until
+we find metadata for all referenced distributions.
+:::
 """,
             default = [],
         ),
diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl
index c730c20..6401a06 100644
--- a/python/private/pypi/simpleapi_download.bzl
+++ b/python/private/pypi/simpleapi_download.bzl
@@ -20,9 +20,17 @@
 load("//python/private:auth.bzl", "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
+load("//python/private:text_util.bzl", "render")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
 
-def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
+def simpleapi_download(
+        ctx,
+        *,
+        attr,
+        cache,
+        parallel_download = True,
+        read_simpleapi = None,
+        _fail = fail):
     """Download Simple API HTML.
 
     Args:
@@ -49,6 +57,9 @@
             reflected when re-evaluating the extension unless we do
             `bazel clean --expunge`.
         parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
+        read_simpleapi: a function for reading and parsing the SimpleAPI contents.
+            Used in tests.
+        _fail: a function to print a failure. Used in tests.
 
     Returns:
         dict of pkg name to the parsed HTML contents - a list of structs.
@@ -64,15 +75,22 @@
 
     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
-    async_downloads = {}
     contents = {}
     index_urls = [attr.index_url] + attr.extra_index_urls
-    for pkg in attr.sources:
-        pkg_normalized = normalize_name(pkg)
+    read_simpleapi = read_simpleapi or _read_simpleapi
 
-        success = False
-        for index_url in index_urls:
-            result = _read_simpleapi(
+    found_on_index = {}
+    warn_overrides = False
+    for i, index_url in enumerate(index_urls):
+        if i != 0:
+            # Warn the user about a potential fix for the overrides
+            warn_overrides = True
+
+        async_downloads = {}
+        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+        for pkg in sources:
+            pkg_normalized = normalize_name(pkg)
+            result = read_simpleapi(
                 ctx = ctx,
                 url = "{}/{}/".format(
                     index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
@@ -84,42 +102,45 @@
             )
             if hasattr(result, "wait"):
                 # We will process it in a separate loop:
-                async_downloads.setdefault(pkg_normalized, []).append(
-                    struct(
-                        pkg_normalized = pkg_normalized,
-                        wait = result.wait,
-                    ),
+                async_downloads[pkg] = struct(
+                    pkg_normalized = pkg_normalized,
+                    wait = result.wait,
                 )
-                continue
-
-            if result.success:
+            elif result.success:
                 contents[pkg_normalized] = result.output
-                success = True
-                break
+                found_on_index[pkg] = index_url
 
-        if not async_downloads and not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
+        if not async_downloads:
+            continue
 
-    if not async_downloads:
-        return contents
-
-    # If we use `block` == False, then we need to have a second loop that is
-    # collecting all of the results as they were being downloaded in parallel.
-    for pkg, downloads in async_downloads.items():
-        success = False
-        for download in downloads:
+        # If we use `block` == False, then we need to have a second loop that is
+        # collecting all of the results as they were being downloaded in parallel.
+        for pkg, download in async_downloads.items():
             result = download.wait()
 
-            if result.success and download.pkg_normalized not in contents:
+            if result.success:
                 contents[download.pkg_normalized] = result.output
-                success = True
+                found_on_index[pkg] = index_url
 
-        if not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
+    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+    if failed_sources:
+        _fail("Failed to download metadata for {} for from urls: {}".format(
+            failed_sources,
+            index_urls,
+        ))
+        return None
+
+    if warn_overrides:
+        index_url_overrides = {
+            pkg: found_on_index[pkg]
+            for pkg in attr.sources
+            if found_on_index[pkg] != attr.index_url
+        }
+
+        # buildifier: disable=print
+        print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
+            render.dict(index_url_overrides),
+        ))
 
     return contents
 
diff --git a/tests/pypi/simpleapi_download/BUILD.bazel b/tests/pypi/simpleapi_download/BUILD.bazel
new file mode 100644
index 0000000..04747b6
--- /dev/null
+++ b/tests/pypi/simpleapi_download/BUILD.bazel
@@ -0,0 +1,5 @@
+load("simpleapi_download_tests.bzl", "simpleapi_download_test_suite")
+
+simpleapi_download_test_suite(
+    name = "simpleapi_download_tests",
+)
diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
new file mode 100644
index 0000000..9b2967b
--- /dev/null
+++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -0,0 +1,128 @@
+# Copyright 2024 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""
+
+load("@rules_testing//lib:test_suite.bzl", "test_suite")
+load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download")  # buildifier: disable=bzl-visibility
+
+_tests = []
+
+def _test_simple(env):
+    calls = []
+
+    def read_simpleapi(ctx, url, attr, cache, block):
+        _ = ctx  # buildifier: disable=unused-variable
+        _ = attr
+        _ = cache
+        env.expect.that_bool(block).equals(False)
+        calls.append(url)
+        if "foo" in url and "main" in url:
+            return struct(
+                output = "",
+                success = False,
+            )
+        else:
+            return struct(
+                output = "data from {}".format(url),
+                success = True,
+            )
+
+    contents = simpleapi_download(
+        ctx = struct(
+            os = struct(environ = {}),
+        ),
+        attr = struct(
+            index_url_overrides = {},
+            index_url = "main",
+            extra_index_urls = ["extra"],
+            sources = ["foo", "bar", "baz"],
+            envsubst = [],
+        ),
+        cache = {},
+        parallel_download = True,
+        read_simpleapi = read_simpleapi,
+    )
+
+    env.expect.that_collection(calls).contains_exactly([
+        "extra/foo/",
+        "main/bar/",
+        "main/baz/",
+        "main/foo/",
+    ])
+    env.expect.that_dict(contents).contains_exactly({
+        "bar": "data from main/bar/",
+        "baz": "data from main/baz/",
+        "foo": "data from extra/foo/",
+    })
+
+_tests.append(_test_simple)
+
+def _test_fail(env):
+    calls = []
+    fails = []
+
+    def read_simpleapi(ctx, url, attr, cache, block):
+        _ = ctx  # buildifier: disable=unused-variable
+        _ = attr
+        _ = cache
+        env.expect.that_bool(block).equals(False)
+        calls.append(url)
+        if "foo" in url:
+            return struct(
+                output = "",
+                success = False,
+            )
+        else:
+            return struct(
+                output = "data from {}".format(url),
+                success = True,
+            )
+
+    simpleapi_download(
+        ctx = struct(
+            os = struct(environ = {}),
+        ),
+        attr = struct(
+            index_url_overrides = {},
+            index_url = "main",
+            extra_index_urls = ["extra"],
+            sources = ["foo", "bar", "baz"],
+            envsubst = [],
+        ),
+        cache = {},
+        parallel_download = True,
+        read_simpleapi = read_simpleapi,
+        _fail = fails.append,
+    )
+
+    env.expect.that_collection(fails).contains_exactly([
+        """Failed to download metadata for ["foo"] for from urls: ["main", "extra"]""",
+    ])
+    env.expect.that_collection(calls).contains_exactly([
+        "extra/foo/",
+        "main/bar/",
+        "main/baz/",
+        "main/foo/",
+    ])
+
+_tests.append(_test_fail)
+
+def simpleapi_download_test_suite(name):
+    """Create the test suite.
+
+    Args:
+        name: the name of the test suite
+    """
+    test_suite(name = name, basic_tests = _tests)