| # Copyright 2024 The Bazel Authors. All rights reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| A file that houses private functions used in the `bzlmod` extension with the same name. |
| """ |
| |
| load("@bazel_features//:features.bzl", "bazel_features") |
| load("//python/private:auth.bzl", _get_auth = "get_auth") |
| load("//python/private:envsubst.bzl", "envsubst") |
| load("//python/private:normalize_name.bzl", "normalize_name") |
| load("//python/private:text_util.bzl", "render") |
| load(":parse_simpleapi_html.bzl", "parse_simpleapi_html") |
| load(":urllib.bzl", "urllib") |
| |
| def simpleapi_download( |
| ctx, |
| *, |
| attr, |
| cache, |
| parallel_download = True, |
| read_simpleapi = None, |
| get_auth = None, |
| _fail = fail): |
| """Download Simple API HTML. |
| |
| Args: |
| ctx: The module_ctx or repository_ctx. |
| attr: Contains the parameters for the download. They are grouped into a |
| struct for better clarity. It must have attributes: |
| * index_url: str, the index. |
| * index_url_overrides: dict[str, str], the index overrides for |
| separate packages. |
| * extra_index_urls: Extra index URLs that will be looked up after |
| the main is looked up. |
| * sources: list[str], the sources to download things for. Each value is |
| the contents of requirements files. |
| * envsubst: list[str], the envsubst vars for performing substitution in index url. |
| * netrc: The netrc parameter for ctx.download, see http_file for docs. |
| * auth_patterns: The auth_patterns parameter for ctx.download, see |
| http_file for docs. |
| cache: An opaque object used to cache call results. For implementation |
| see ./pypi_cache.bzl file. We use the canonical_id parameter for the key |
| value to ensure that distribution fetches from different indexes do not cause |
| cache collisions, because the index may return different locations from where |
| the files should be downloaded. We are not using the built-in cache in the |
| `download` function because the index may get updated at any time and we need |
| to be able to refresh the data. |
| parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads. |
| read_simpleapi: a function for reading and parsing of the SimpleAPI contents. |
| Used in tests. |
| get_auth: A function to get auth information passed to read_simpleapi. Used in tests. |
| _fail: a function to print a failure. Used in tests. |
| |
| Returns: |
| dict of pkg name to the parsed HTML contents - a list of structs. |
| """ |
| index_url_overrides = { |
| normalize_name(p): i |
| for p, i in (attr.index_url_overrides or {}).items() |
| } |
| |
| # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes |
| # to replicate how `pip` would handle this case. |
| contents = {} |
| index_urls = [attr.index_url] + attr.extra_index_urls |
| read_simpleapi = read_simpleapi or _read_simpleapi |
| |
| download_kwargs = {} |
| if bazel_features.external_deps.download_has_block_param: |
| download_kwargs["block"] = not parallel_download |
| |
| if len(index_urls) == 1 or index_url_overrides: |
| download_kwargs["allow_fail"] = False |
| else: |
| download_kwargs["allow_fail"] = True |
| |
| input_sources = attr.sources |
| |
| found_on_index = {} |
| warn_overrides = False |
| ctx.report_progress("Fetch package lists from PyPI index") |
| for i, index_url in enumerate(index_urls): |
| if i != 0: |
| # Warn the user about a potential fix for the overrides |
| warn_overrides = True |
| |
| async_downloads = {} |
| sources = {pkg: versions for pkg, versions in input_sources.items() if pkg not in found_on_index} |
| for pkg, versions in sources.items(): |
| pkg_normalized = normalize_name(pkg) |
| url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format( |
| index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"), |
| distribution = pkg, |
| )) |
| result = read_simpleapi( |
| ctx = ctx, |
| attr = attr, |
| versions = versions, |
| url = url, |
| cache = cache, |
| get_auth = get_auth, |
| **download_kwargs |
| ) |
| if hasattr(result, "wait"): |
| # We will process it in a separate loop: |
| async_downloads[pkg] = struct( |
| pkg_normalized = pkg_normalized, |
| wait = result.wait, |
| url = url, |
| ) |
| elif result.success: |
| contents[pkg_normalized] = _with_index_url(url, result.output) |
| found_on_index[pkg] = index_url |
| |
| if not async_downloads: |
| continue |
| |
| # If we use `block` == False, then we need to have a second loop that is |
| # collecting all of the results as they were being downloaded in parallel. |
| for pkg, download in async_downloads.items(): |
| result = download.wait() |
| |
| if result.success: |
| contents[download.pkg_normalized] = _with_index_url(download.url, result.output) |
| found_on_index[pkg] = index_url |
| |
| failed_sources = [pkg for pkg in input_sources if pkg not in found_on_index] |
| if failed_sources: |
| pkg_index_urls = { |
| pkg: index_url_overrides.get( |
| normalize_name(pkg), |
| index_urls, |
| ) |
| for pkg in failed_sources |
| } |
| |
| _fail( |
| """ |
| Failed to download metadata of the following packages from urls: |
| {pkg_index_urls} |
| |
| If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call. |
| """.format( |
| pkg_index_urls = render.dict(pkg_index_urls), |
| failed_sources = render.list(failed_sources), |
| ), |
| ) |
| return None |
| |
| if warn_overrides: |
| index_url_overrides = { |
| pkg: found_on_index[pkg] |
| for pkg in attr.sources |
| if found_on_index[pkg] != attr.index_url |
| } |
| |
| if index_url_overrides: |
| # buildifier: disable=print |
| print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format( |
| render.dict(index_url_overrides), |
| )) |
| |
| return contents |
| |
| def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs): |
| """Read SimpleAPI. |
| |
| Args: |
| ctx: The module_ctx or repository_ctx. |
| url: {type}`str`, the url parameter that can be passed to ctx.download. |
| attr: The attribute that contains necessary info for downloading. The |
| following attributes must be present: |
| * envsubst: {type}`dict[str, str]` for performing substitutions in the URL. |
| * netrc: The netrc parameter for ctx.download, see {obj}`http_file` for docs. |
| * auth_patterns: The auth_patterns parameter for ctx.download, see |
| {obj}`http_file` for docs. |
| cache: {type}`struct` the `pypi_cache` instance. |
| versions: {type}`list[str] The versions that have been requested. |
| get_auth: A function to get auth information. Used in tests. |
| **download_kwargs: Any extra params to ctx.download. |
| Note that output and auth will be passed for you. |
| |
| Returns: |
| A similar object to what `download` would return except that in result.out |
| will be the parsed simple api contents. |
| """ |
| # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for |
| # the whl location and we cannot handle multiple URLs at once by passing |
| # them to ctx.download if we want to correctly handle the relative URLs. |
| # TODO: Add a test that env subbed index urls do not leak into the lock file. |
| |
| real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv)) |
| |
| cache_key = (url, real_url, versions) |
| cached_result = cache.get(cache_key) |
| if cached_result: |
| return struct(success = True, output = cached_result) |
| |
| output_str = envsubst( |
| url, |
| attr.envsubst, |
| # Use env names in the subst values - this will be unique over |
| # the lifetime of the execution of this function and we also use |
| # `~` as the separator to ensure that we don't get clashes. |
| {e: "~{}~".format(e) for e in attr.envsubst}.get, |
| ) |
| |
| # Transform the URL into a valid filename |
| for char in [".", ":", "/", "\\", "-"]: |
| output_str = output_str.replace(char, "_") |
| |
| output = ctx.path(output_str.strip("_").lower() + ".html") |
| |
| get_auth = get_auth or _get_auth |
| |
| # NOTE: this may have block = True or block = False in the download_kwargs |
| download = ctx.download( |
| url = [real_url], |
| output = output, |
| auth = get_auth(ctx, [real_url], ctx_attr = attr), |
| **download_kwargs |
| ) |
| |
| if download_kwargs.get("block") == False: |
| # Simulate the same API as ctx.download has |
| return struct( |
| wait = lambda: _read_index_result( |
| ctx, |
| result = download.wait(), |
| output = output, |
| cache = cache, |
| cache_key = cache_key, |
| ), |
| ) |
| |
| return _read_index_result( |
| ctx, |
| result = download, |
| output = output, |
| cache = cache, |
| cache_key = cache_key, |
| ) |
| |
| def _read_index_result(ctx, *, result, output, cache, cache_key): |
| if not result.success: |
| return struct(success = False) |
| |
| content = ctx.read(output) |
| |
| output = parse_simpleapi_html(content = content) |
| if output: |
| cache.setdefault(cache_key, output) |
| return struct(success = True, output = output) |
| else: |
| return struct(success = False) |
| |
| def _with_index_url(index_url, values): |
| if not values: |
| return values |
| |
| return struct( |
| sdists = values.sdists, |
| whls = values.whls, |
| sha256s_by_version = values.sha256s_by_version, |
| index_url = index_url, |
| ) |