| # Copyright 2024 The Bazel Authors. All rights reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| A file that houses private functions used in the `bzlmod` extension with the same name. |
| """ |
| |
| load("@bazel_features//:features.bzl", "bazel_features") |
| load("@bazel_skylib//lib:sets.bzl", "sets") |
| load("//python/pip_install:requirements_parser.bzl", parse_requirements = "parse") |
| load(":auth.bzl", "get_auth") |
| load(":envsubst.bzl", "envsubst") |
| load(":normalize_name.bzl", "normalize_name") |
| |
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
             * index_url: str, the index.
             * index_url_overrides: dict[str, str], the index overrides for
               separate packages.
             * extra_index_urls: Extra index URLs that will be looked up after
               the main is looked up.
             * sources: list[str], the sources to download things for. Each value is
               the contents of requirements files.
             * envsubst: list[str], the envsubst vars for performing substitution in index url.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of normalized pkg name to the parsed Simple API contents, i.e. the
        `output` of the first successful `read_simple_api` call for that package.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    async_downloads = {}
    contents = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    for pkg in get_packages_from_requirements(attr.sources):
        pkg_normalized = normalize_name(pkg)

        # An override pins the package to a single index. Looping over every
        # configured index would request the very same URL several times and,
        # with non-blocking downloads, write the same output file concurrently.
        if pkg_normalized in index_url_overrides:
            pkg_index_urls = [index_url_overrides[pkg_normalized]]
        else:
            pkg_index_urls = index_urls

        success = False
        for index_url in pkg_index_urls:
            result = read_simple_api(
                ctx = ctx,
                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # We will process it in a separate loop:
                async_downloads.setdefault(pkg_normalized, []).append(
                    struct(
                        index_url = index_url,
                        wait = result.wait,
                    ),
                )
                continue

            if result.success:
                contents[pkg_normalized] = result.output
                success = True
                break

        # Fail early only if nothing is still in flight for this package.
        if not success and pkg_normalized not in async_downloads:
            fail("Failed to download metadata for {} from urls: {}".format(
                pkg,
                ", ".join(pkg_index_urls),
            ))

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg_normalized, downloads in async_downloads.items():
        for download in downloads:
            # Every started download is waited on, even if the package is
            # already resolved.
            result = download.wait()

            if result.success and pkg_normalized not in contents:
                contents[pkg_normalized] = result.output

        # The package may have been resolved from the cache in the first loop
        # while other downloads for it were in flight, so check `contents`
        # instead of tracking success only within this loop.
        if pkg_normalized not in contents:
            fail("Failed to download metadata for {} from urls: {}".format(
                pkg_normalized,
                ", ".join([download.index_url for download in downloads]),
            ))

    return contents
| |
def read_simple_api(ctx, url, attr, cache, **download_kwargs):
    """Fetch and parse a single Simple API page.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url parameter that can be passed to ctx.download. It may
            still contain envsubst placeholders.
        attr: The attribute that contains necessary info for downloading. The
            following attributes must be present:
             * envsubst: The envsubst values for performing substitutions in the URL.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results, keyed by the substituted URL.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        A struct shaped like the result of `ctx.download`: either a struct with
        `success` (and, on success, `output` holding the parsed simple api
        contents), or - when `block = False` was passed - a struct with a
        `wait` function that returns such a struct.
    """
    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
    real_url = envsubst(url, attr.envsubst, getenv)

    cache_key = real_url
    if cache_key in cache:
        return struct(success = True, output = cache[cache_key])

    # Substitute the env var *names* rather than their values when deriving
    # the output file name - this will be unique over the lifetime of the
    # execution of this function and `~` is used as the separator to ensure
    # that we don't get clashes.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    output_str = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for char in [".", ":", "/", "\\", "-"]:
        output_str = output_str.replace(char, "_")

    output = ctx.path(output_str.strip("_").lower() + ".html")

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = get_auth(ctx, [real_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        return _read_index_result(ctx, download, output, url, cache, cache_key)

    # Non-blocking download: simulate the same API as ctx.download has and
    # defer the parsing until the caller waits for the result.
    return struct(
        wait = lambda: _read_index_result(ctx, download.wait(), output, url, cache, cache_key),
    )
| |
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Turn a finished download into a parsed Simple API result.

    Args:
        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
        result: The finished download result; only `success` is inspected.
        output: The path the HTML was downloaded to.
        url: str, the URL passed on to `parse_simple_api_html`.
        cache: A dict that the parsed contents are stored into.
        cache_key: The key to store the parsed contents under.

    Returns:
        `struct(success = False)` if the download failed or the parsed value is
        falsy, otherwise a struct with `success = True`, the parsed `output`
        and the `cache_key`.
    """
    if not result.success:
        return struct(success = False)

    parsed = parse_simple_api_html(url = url, content = ctx.read(output))
    if not parsed:
        return struct(success = False)

    # `setdefault` keeps an entry that is already cached under this key.
    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed, cache_key = cache_key)
| |
def get_packages_from_requirements(requirements_files):
    """Collect the unique distribution names found in requirements files.

    Args:
        requirements_files(list[str]): A list of requirements files contents.

    Returns:
        A list of the distribution names, without duplicates, in the order they
        were first seen.
    """

    # NOTE: names are not normalized here, so we'll be querying the PyPI
    # servers multiple times if the requirements contain non-normalized
    # names, but this is what the user is specifying to us.
    distributions = [
        distribution
        for contents in requirements_files
        for distribution, _ in parse_requirements(contents).requirements
    ]

    return sets.to_list(sets.make(distributions))
| |
def get_simpleapi_sources(line):
    """Get PyPI sources from a requirements.txt line.

    We interpret the spec described in
    https://pip.pypa.io/en/stable/reference/requirement-specifiers/#requirement-specifiers

    Args:
        line(str): The requirements.txt entry.

    Returns:
        A struct with attributes:
         * requirement: str, the requirement with the `--hash` options removed
           if any sha256 hashes were found, otherwise the unmodified line.
         * version: str, the text following `==` up to the first space, or ""
           if the line has no `==` before the first `;`.
         * shas: list[str], the sorted sha256 values to download from the
           index. Empty for direct references (lines with `@` before the `;`).
    """
    head, _, maybe_hashes = line.partition(";")
    _, _, version = head.partition("==")

    # Whitespace is allowed after the operator (e.g. `foo == 1.0`), so strip
    # before cutting at the first space - otherwise the version would be "".
    version = version.strip().partition(" ")[0]

    if "@" in head:
        # Direct references are not looked up by hash.
        shas = []
    else:
        # Without an environment marker the hashes follow the requirement
        # directly, so search the whole line.
        maybe_hashes = maybe_hashes or line
        shas = [
            sha.strip()
            for sha in maybe_hashes.split("--hash=sha256:")[1:]
        ]

    if not shas:
        return struct(requirement = line, version = version, shas = [])

    if head == line:
        requirement = line.partition("--hash=")[0].strip()
    else:
        # Re-attach the environment marker, minus the hash options.
        requirement = head + ";" + maybe_hashes.partition("--hash=")[0].strip()

    return struct(
        requirement = requirement,
        version = version,
        shas = sorted(shas),
    )
| |
def parse_simple_api_html(*, url, content):
    """Get the package artifacts by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from. It is
            used to resolve relative artifact URLs.
        content(str): The Simple API HTML content.

    Returns:
        A struct with attributes:
         * sdists: dict[str, struct], the non-wheel artifacts keyed by sha256.
         * whls: dict[str, struct], the wheel artifacts keyed by sha256.

        Each artifact is a struct with:
         * filename: The filename of the artifact.
         * url: The URL to download the artifact.
         * sha256: The sha256 of the artifact.
         * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
           present, then the 'metadata_url' is also present. Defaults to "".
         * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
         * yanked: bool, whether the artifact is marked with `data-yanked`.
    """
    sdists = {}
    whls = {}
    lines = content.split("<a href=\"")

    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
    api_version, _, _ = api_version.partition("\"")

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = api_version or "1.0"
    api_version = tuple([int(i) for i in api_version.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    for line in lines[1:]:
        dist_url, _, tail = line.partition("#sha256=")
        sha256, _, tail = tail.partition("\"")
        if not sha256:
            # Artifacts are keyed by sha256; a link without one cannot be
            # stored and would otherwise end up as garbage under the "" key.
            continue

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in line

        # The link text is whatever follows the *last* `>` before `</a>`.
        # Searching for the first `>` breaks on attribute values that contain
        # an unescaped `>`, e.g. `data-requires-python=">=3.7"`.
        anchor, _, _ = tail.partition("</a>")
        maybe_metadata, found, filename = anchor.rpartition(">")
        filename = filename.strip()
        if not found or not filename:
            # Malformed anchor, nothing usable to record.
            continue

        metadata_sha256 = ""
        metadata_url = ""
        for marker in ["data-core-metadata", "data-dist-info-metadata"]:
            _, has_marker, rest = maybe_metadata.partition(marker + "=\"sha256=")
            if has_marker:
                # Implement https://peps.python.org/pep-0714/
                metadata_sha256, _, _ = rest.partition("\"")
                metadata_url = dist_url + ".metadata"
                break

        # The METADATA information is only recorded for wheels.
        is_whl = filename.endswith(".whl")
        artifact = struct(
            filename = filename,
            url = _absolute_url(url, dist_url),
            sha256 = sha256,
            metadata_sha256 = metadata_sha256 if is_whl else "",
            metadata_url = _absolute_url(url, metadata_url) if is_whl else "",
            yanked = yanked,
        )
        if is_whl:
            whls[sha256] = artifact
        else:
            sdists[sha256] = artifact

    return struct(
        sdists = sdists,
        whls = whls,
    )
| |
def _absolute_url(index_url, candidate):
    """Resolve an artifact href found on an index page to an absolute URL.

    Args:
        index_url: str, the URL of the index page the href was found on. It is
            treated as a directory, i.e. relative hrefs are resolved against
            it with any trailing `/` removed.
        candidate: str, the href value, which may be empty, absolute,
            root-relative (`/...`), scheme-relative (`//...`) or relative
            (`../...`, `./...`, `file.whl`).

    Returns:
        The absolute URL. An empty or already absolute candidate is returned
        unchanged, and so is a root-relative one if `index_url` has no scheme
        to take the host from.
    """
    if not candidate:
        return candidate

    scheme, has_scheme, _ = candidate.partition("://")
    if has_scheme and "/" not in scheme:
        # Already absolute, e.g. https://files.pythonhosted.org/...
        return candidate

    if candidate.startswith("/"):
        index_scheme, index_has_scheme, index_rest = index_url.partition("://")
        if not index_has_scheme:
            # The host cannot be determined, e.g. the index URL starts with
            # an env variable placeholder. Leave the candidate as is.
            return candidate
        if candidate.startswith("//"):
            return "{}:{}".format(index_scheme, candidate)
        host, _, _ = index_rest.partition("/")
        return "{}://{}{}".format(index_scheme, host, candidate)

    # Walk the leading `..` and `.` path segments. Working on whole segments
    # ensures that a `..` inside of a file name is left alone.
    segments = candidate.split("/")
    base = index_url.rstrip("/")
    consumed = 0
    for segment in segments:
        if segment == "..":
            base, _, _ = base.rstrip("/").rpartition("/")
        elif segment != ".":
            break
        consumed += 1

    return "{}/{}".format(base, "/".join(segments[consumed:]).strip("/"))