blob: 20d79ba9b40e80ea2fa504d2c6be84b5abc6f9ca [file] [log] [blame]
# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A file that houses private functions used in the `bzlmod` extension with the same name.
"""
load("@bazel_features//:features.bzl", "bazel_features")
load("//python/private:auth.bzl", _get_auth = "get_auth")
load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
load(":urllib.bzl", "urllib")
def simpleapi_download(
        ctx,
        *,
        attr,
        cache,
        parallel_download = True,
        read_simpleapi = None,
        get_auth = None,
        _fail = fail):
    """Download Simple API HTML.

    The indexes are queried in order (`index_url` first, then each of the
    `extra_index_urls`). A package is taken from the first index that returns
    parseable contents for it; results from multiple indexes are never merged.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
            * index_url: str, the index.
            * index_url_overrides: dict[str, str], the index overrides for
              separate packages. The keys are normalized before use. May be
              None or empty.
            * extra_index_urls: Extra index URLs that will be looked up after
              the main is looked up. May be None or empty.
            * sources: dict, keyed by the package name to download things for.
              Each value is passed to `read_simpleapi` as `versions`.
            * envsubst: list[str], the envsubst vars for performing substitution in index url.
            * netrc: The netrc parameter for ctx.download, see http_file for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
              http_file for docs.
        cache: An opaque object used to cache call results. For implementation
            see ./pypi_cache.bzl file. The default `read_simpleapi` keys the
            entries by the URL (both before and after env substitution) and the
            requested versions to ensure that distribution fetches from
            different indexes do not cause cache collisions, because the index
            may return different locations from where the files should be
            downloaded. We are not using the built-in cache in the `download`
            function because the index may get updated at any time and we need
            to be able to refresh the data.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
        read_simpleapi: a function for reading and parsing of the SimpleAPI contents.
            Used in tests.
        get_auth: A function to get auth information passed to read_simpleapi. Used in tests.
        _fail: a function to print a failure. Used in tests.

    Returns:
        dict of normalized pkg name to the parsed HTML contents - a struct with
        the `sdists`, `whls`, `sha256s_by_version` and `index_url` fields.
        `None` is returned if metadata for some packages could not be
        downloaded and `_fail` did not abort the evaluation.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    contents = {}

    # Tolerate a missing list of extra indexes the same way as missing overrides.
    index_urls = [attr.index_url] + (attr.extra_index_urls or [])
    read_simpleapi = read_simpleapi or _read_simpleapi

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # A failed download may only be tolerated if there is another index to fall
    # back to and the user has not pinned packages to particular indexes.
    download_kwargs["allow_fail"] = len(index_urls) != 1 and not index_url_overrides

    input_sources = attr.sources

    found_on_index = {}
    warn_overrides = False
    ctx.report_progress("Fetch package lists from PyPI index")
    for i, index_url in enumerate(index_urls):
        if len(found_on_index) == len(input_sources):
            # Everything has been resolved already, there is nothing left to
            # look up on the remaining indexes.
            break

        if i != 0:
            # Warn the user about a potential fix for the overrides
            warn_overrides = True

        async_downloads = {}
        pending_sources = {
            pkg: versions
            for pkg, versions in input_sources.items()
            if pkg not in found_on_index
        }
        for pkg, versions in pending_sources.items():
            pkg_normalized = normalize_name(pkg)
            url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
                index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
                distribution = pkg,
            ))
            result = read_simpleapi(
                ctx = ctx,
                attr = attr,
                versions = versions,
                url = url,
                cache = cache,
                get_auth = get_auth,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # We will process it in a separate loop:
                async_downloads[pkg] = struct(
                    pkg_normalized = pkg_normalized,
                    wait = result.wait,
                    url = url,
                )
            elif result.success:
                contents[pkg_normalized] = _with_index_url(url, result.output)
                found_on_index[pkg] = index_url

        # If we use `block` == False, then we need to have a second loop that is
        # collecting all of the results as they were being downloaded in parallel.
        # This is a no-op when all of the downloads were blocking.
        for pkg, download in async_downloads.items():
            result = download.wait()
            if result.success:
                contents[download.pkg_normalized] = _with_index_url(download.url, result.output)
                found_on_index[pkg] = index_url

    failed_sources = [pkg for pkg in input_sources if pkg not in found_on_index]
    if failed_sources:
        # For each failed package show either its override or all of the
        # indexes that have been tried.
        pkg_index_urls = {
            pkg: index_url_overrides.get(
                normalize_name(pkg),
                index_urls,
            )
            for pkg in failed_sources
        }
        _fail(
            """
Failed to download metadata of the following packages from urls:
{pkg_index_urls}
If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call.
""".format(
                pkg_index_urls = render.dict(pkg_index_urls),
                failed_sources = render.list(failed_sources),
            ),
        )

        # Only reachable if `_fail` does not abort, e.g. in tests.
        return None

    if warn_overrides:
        # Packages that were only found on one of the extra indexes would
        # benefit from an explicit override.
        suggested_overrides = {
            pkg: found_on_index[pkg]
            for pkg in input_sources
            if found_on_index[pkg] != attr.index_url
        }
        if suggested_overrides:
            # buildifier: disable=print
            print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
                render.dict(suggested_overrides),
            ))

    return contents
def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs):
    """Fetch a single SimpleAPI page and parse it, consulting the cache first.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: {type}`str`, the url parameter that can be passed to ctx.download.
            It may still contain env var references.
        attr: The attribute that contains necessary info for downloading. The
            following attributes must be present:
            * envsubst: the names of the env vars that are substituted in the URL.
            * netrc: The netrc parameter for ctx.download, see {obj}`http_file` for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
              {obj}`http_file` for docs.
        cache: {type}`struct` the `pypi_cache` instance.
        versions: {type}`list[str]` The versions that have been requested.
        get_auth: A function to get auth information. Used in tests.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        A similar object to what `download` would return except that
        `result.output` holds the parsed simple api contents. If `block` is
        `False` in the `download_kwargs`, then a struct with a `wait` function
        returning such an object is returned instead.
    """

    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.
    substituted_url = envsubst(url, attr.envsubst, ctx.getenv)
    real_url = urllib.strip_empty_path_segments(substituted_url)

    cache_key = (url, real_url, versions)
    cache_hit = cache.get(cache_key)
    if cache_hit:
        return struct(success = True, output = cache_hit)

    # Use env names in the subst values - this will be unique over
    # the lifetime of the execution of this function and we also use
    # `~` as the separator to ensure that we don't get clashes.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    file_stem = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for separator in (".", ":", "/", "\\", "-"):
        file_stem = file_stem.replace(separator, "_")
    output = ctx.path(file_stem.strip("_").lower() + ".html")

    auth_fn = get_auth or _get_auth
    auth = auth_fn(ctx, [real_url], ctx_attr = attr)

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = auth,
        **download_kwargs
    )

    # The arguments shared by the blocking and the non-blocking code paths.
    parse_kwargs = dict(
        output = output,
        cache = cache,
        cache_key = cache_key,
    )

    if download_kwargs.get("block") != False:
        return _read_index_result(ctx, result = download, **parse_kwargs)

    # Simulate the same API as ctx.download has
    return struct(
        wait = lambda: _read_index_result(
            ctx,
            result = download.wait(),
            **parse_kwargs
        ),
    )
def _read_index_result(ctx, *, result, output, cache, cache_key):
    """Parse a finished index download and remember the parsed contents.

    Args:
        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
        result: The finished download result, only `success` is inspected.
        output: The path that the index page has been downloaded to.
        cache: The cache that the parsed contents are stored in.
        cache_key: The key to store the parsed contents under.

    Returns:
        `struct(success = True, output = <parsed contents>)` if the download
        succeeded and the parsed contents are not empty, otherwise
        `struct(success = False)`.
    """
    if not result.success:
        return struct(success = False)

    parsed = parse_simpleapi_html(content = ctx.read(output))
    if not parsed:
        # Treat an index page without any usable contents as a failure.
        return struct(success = False)

    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed)
def _with_index_url(index_url, values):
    """Attach the URL that the contents were fetched from to the parsed contents.

    Args:
        index_url: The URL to record in the `index_url` field.
        values: The parsed contents with the `sdists`, `whls` and
            `sha256s_by_version` fields, or a falsy value.

    Returns:
        A new struct with the fields of `values` plus `index_url`. A falsy
        `values` is returned as is.
    """
    if values:
        return struct(
            index_url = index_url,
            sdists = values.sdists,
            sha256s_by_version = values.sha256s_by_version,
            whls = values.whls,
        )

    return values