python/private/pypi/simpleapi_download.bzl - third_party/github/bazelbuild/rules_python - Git at Google

 # Copyright 2024 The Bazel Authors. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 A file that houses private functions used in the `bzlmod` extension with the same name.
 """

 load("@bazel_features//:features.bzl", "bazel_features")
 load("//python/private:auth.bzl", "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

 def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
     """Download Simple API HTML.

     Args:
         ctx: The module_ctx or repository_ctx.
         attr: Contains the parameters for the download. They are grouped into a
           struct for better clarity. It must have attributes:
            * index_url: str, the index.
            * index_url_overrides: dict[str, str], the index overrides for
              separate packages. A package with an override is only looked
              up in the overriding index.
            * extra_index_urls: Extra index URLs that will be looked up after
              the main is looked up. May be None or empty.
            * sources: list[str], the package names to download the index
              pages for. Each name is normalized before it is used as a key
              in the returned dict.
            * envsubst: list[str], the envsubst vars for performing substitution in index url.
            * netrc: The netrc parameter for ctx.download, see http_file for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
                http_file for docs.
         cache: A dictionary that can be used as a cache between calls during a
             single evaluation of the extension. We use a dictionary as a cache
             so that we can reuse calls to the simple API when evaluating the
             extension. Using the canonical_id parameter of the module_ctx would
             deposit the simple API responses to the bazel cache and that is
             undesirable because additions to the PyPI index would not be
             reflected when re-evaluating the extension unless we do
             `bazel clean --expunge`.
         parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

     Returns:
         dict of normalized pkg name to the parsed HTML contents of its index
         page.
     """
     index_url_overrides = {
         normalize_name(p): i
         for p, i in (attr.index_url_overrides or {}).items()
     }

     download_kwargs = {}
     if bazel_features.external_deps.download_has_block_param:
         download_kwargs["block"] = not parallel_download

     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
     async_downloads = {}
     contents = {}
     index_urls = [attr.index_url] + list(attr.extra_index_urls or [])
     for pkg in attr.sources:
         pkg_normalized = normalize_name(pkg)

         # An override replaces every index for this package, so query it
         # exactly once. Iterating over all of the index URLs here would
         # request the very same URL (and write the very same output file)
         # once per configured index.
         if pkg_normalized in index_url_overrides:
             pkg_index_urls = [index_url_overrides[pkg_normalized]]
         else:
             pkg_index_urls = index_urls

         success = False
         for index_url in pkg_index_urls:
             result = _read_simpleapi(
                 ctx = ctx,
                 url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                 attr = attr,
                 cache = cache,
                 **download_kwargs
             )
             if hasattr(result, "wait"):
                 # We will process it in a separate loop:
                 async_downloads.setdefault(pkg_normalized, []).append(result)
                 continue

             if result.success:
                 contents[pkg_normalized] = result.output
                 success = True
                 break

         if not async_downloads and not success:
             fail("Failed to download metadata from urls: {}".format(
                 ", ".join(index_urls),
             ))

     if not async_downloads:
         return contents

     # If we use `block` == False, then we need to have a second loop that is
     # collecting all of the results as they were being downloaded in parallel.
     for pkg_normalized, downloads in async_downloads.items():
         for download in downloads:
             # Wait for every download, even if we already have a result,
             # so that each successful response ends up in the cache.
             result = download.wait()

             # The first successful result wins; later ones are not merged.
             if result.success and pkg_normalized not in contents:
                 contents[pkg_normalized] = result.output

         # The package may already have been resolved in the first loop by a
         # cache hit, so decide on failure by looking at `contents` and not
         # only at the downloads awaited here.
         if pkg_normalized not in contents:
             fail("Failed to download metadata from urls: {}".format(
                 ", ".join(index_urls),
             ))

     return contents

 def _read_simpleapi(ctx, url, attr, cache, **download_kwargs):
     """Fetch a single Simple API index page and parse it.

     Args:
         ctx: The module_ctx or repository_ctx.
         url: str, the url of the index page; env var references in it are
             substituted before it is passed to ctx.download.
         attr: The attribute that contains necessary info for downloading. The
           following attributes must be present:
            * envsubst: The envsubst values for performing substitutions in the URL.
            * netrc: The netrc parameter for ctx.download, see http_file for docs.
            * auth_patterns: The auth_patterns parameter for ctx.download, see
                http_file for docs.
         cache: A dict for storing the results, keyed by the substituted URL.
         **download_kwargs: Any extra params to ctx.download.
             Note that output and auth will be passed for you.

     Returns:
         A similar object to what `download` would return except that
         `result.output` holds the parsed simple api contents. When
         `block = False` is passed, the returned struct only has a `wait`
         function, which returns that object once called.
     """
     # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
     # the whl location and we cannot handle multiple URLs at once by passing
     # them to ctx.download if we want to correctly handle the relative URLs.
     # TODO: Add a test that env subbed index urls do not leak into the lock file.

     getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
     real_url = envsubst(url, attr.envsubst, getenv)

     # The substituted URL doubles as the cache key.
     if real_url in cache:
         return struct(success = True, output = cache[real_url])

     # Use env names in the subst values - this will be unique over
     # the lifetime of the execution of this function and we also use
     # `~` as the separator to ensure that we don't get clashes.
     placeholders = {e: "~{}~".format(e) for e in attr.envsubst}
     filename = envsubst(url, attr.envsubst, placeholders.get)

     # Transform the URL into a valid filename
     for separator in [".", ":", "/", "\\", "-"]:
         filename = filename.replace(separator, "_")
     filename = filename.strip("_").lower() + ".html"

     output = ctx.path(filename)

     # NOTE: this may have block = True or block = False in the download_kwargs
     download = ctx.download(
         url = [real_url],
         output = output,
         auth = get_auth(ctx, [real_url], ctx_attr = attr),
         allow_fail = True,
         **download_kwargs
     )

     if download_kwargs.get("block") == False:
         # Simulate the same API as ctx.download has
         return struct(
             wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, real_url),
         )

     return _read_index_result(ctx, download, output, real_url, cache, real_url)

 def _read_index_result(ctx, result, output, url, cache, cache_key):
     """Turn a finished download into a parsed Simple API result.

     Args:
         ctx: The module_ctx or repository_ctx, used for reading the file.
         result: The finished download result; only `success` is inspected.
         output: The path that the index page was downloaded to.
         url: The URL the page was downloaded from, passed to the parser.
         cache: A dict for storing the parsed results.
         cache_key: The key under which the parsed result is stored in `cache`.

     Returns:
         `struct(success = False)` if the download failed or parsing yielded
         nothing, otherwise a struct with `success = True`, the parsed
         `output` and the `cache_key`.
     """
     if not result.success:
         return struct(success = False)

     parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
     if not parsed:
         return struct(success = False)

     # Keep whatever is already cached under this key, if anything.
     cache.setdefault(cache_key, parsed)
     return struct(success = True, output = parsed, cache_key = cache_key)
	# Copyright 2024 The Bazel Authors. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	A file that houses private functions used in the `bzlmod` extension with the same name.
	"""

	load("@bazel_features//:features.bzl", "bazel_features")
	load("//python/private:auth.bzl", "get_auth")
	load("//python/private:envsubst.bzl", "envsubst")
	load("//python/private:normalize_name.bzl", "normalize_name")
	load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

	def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
	    """Download Simple API HTML.

	    Args:
	        ctx: The module_ctx or repository_ctx.
	        attr: Contains the parameters for the download. They are grouped into a
	          struct for better clarity. It must have attributes:
	           * index_url: str, the index.
	           * index_url_overrides: dict[str, str], the index overrides for
	             separate packages. A package with an override is only looked
	             up in the overriding index.
	           * extra_index_urls: Extra index URLs that will be looked up after
	             the main is looked up. May be None or empty.
	           * sources: list[str], the package names to download the index
	             pages for. Each name is normalized before it is used as a key
	             in the returned dict.
	           * envsubst: list[str], the envsubst vars for performing substitution in index url.
	           * netrc: The netrc parameter for ctx.download, see http_file for docs.
	           * auth_patterns: The auth_patterns parameter for ctx.download, see
	               http_file for docs.
	        cache: A dictionary that can be used as a cache between calls during a
	            single evaluation of the extension. We use a dictionary as a cache
	            so that we can reuse calls to the simple API when evaluating the
	            extension. Using the canonical_id parameter of the module_ctx would
	            deposit the simple API responses to the bazel cache and that is
	            undesirable because additions to the PyPI index would not be
	            reflected when re-evaluating the extension unless we do
	            `bazel clean --expunge`.
	        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

	    Returns:
	        dict of normalized pkg name to the parsed HTML contents of its index
	        page.
	    """
	    index_url_overrides = {
	        normalize_name(p): i
	        for p, i in (attr.index_url_overrides or {}).items()
	    }

	    download_kwargs = {}
	    if bazel_features.external_deps.download_has_block_param:
	        download_kwargs["block"] = not parallel_download

	    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
	    # to replicate how `pip` would handle this case.
	    async_downloads = {}
	    contents = {}
	    index_urls = [attr.index_url] + list(attr.extra_index_urls or [])
	    for pkg in attr.sources:
	        pkg_normalized = normalize_name(pkg)

	        # An override replaces every index for this package, so query it
	        # exactly once. Iterating over all of the index URLs here would
	        # request the very same URL (and write the very same output file)
	        # once per configured index.
	        if pkg_normalized in index_url_overrides:
	            pkg_index_urls = [index_url_overrides[pkg_normalized]]
	        else:
	            pkg_index_urls = index_urls

	        success = False
	        for index_url in pkg_index_urls:
	            result = _read_simpleapi(
	                ctx = ctx,
	                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
	                attr = attr,
	                cache = cache,
	                **download_kwargs
	            )
	            if hasattr(result, "wait"):
	                # We will process it in a separate loop:
	                async_downloads.setdefault(pkg_normalized, []).append(result)
	                continue

	            if result.success:
	                contents[pkg_normalized] = result.output
	                success = True
	                break

	        if not async_downloads and not success:
	            fail("Failed to download metadata from urls: {}".format(
	                ", ".join(index_urls),
	            ))

	    if not async_downloads:
	        return contents

	    # If we use `block` == False, then we need to have a second loop that is
	    # collecting all of the results as they were being downloaded in parallel.
	    for pkg_normalized, downloads in async_downloads.items():
	        for download in downloads:
	            # Wait for every download, even if we already have a result,
	            # so that each successful response ends up in the cache.
	            result = download.wait()

	            # The first successful result wins; later ones are not merged.
	            if result.success and pkg_normalized not in contents:
	                contents[pkg_normalized] = result.output

	        # The package may already have been resolved in the first loop by a
	        # cache hit, so decide on failure by looking at `contents` and not
	        # only at the downloads awaited here.
	        if pkg_normalized not in contents:
	            fail("Failed to download metadata from urls: {}".format(
	                ", ".join(index_urls),
	            ))

	    return contents

	def _read_simpleapi(ctx, url, attr, cache, **download_kwargs):
	    """Fetch a single Simple API index page and parse it.

	    Args:
	        ctx: The module_ctx or repository_ctx.
	        url: str, the url of the index page; env var references in it are
	            substituted before it is passed to ctx.download.
	        attr: The attribute that contains necessary info for downloading. The
	          following attributes must be present:
	           * envsubst: The envsubst values for performing substitutions in the URL.
	           * netrc: The netrc parameter for ctx.download, see http_file for docs.
	           * auth_patterns: The auth_patterns parameter for ctx.download, see
	               http_file for docs.
	        cache: A dict for storing the results, keyed by the substituted URL.
	        **download_kwargs: Any extra params to ctx.download.
	            Note that output and auth will be passed for you.

	    Returns:
	        A similar object to what `download` would return except that
	        `result.output` holds the parsed simple api contents. When
	        `block = False` is passed, the returned struct only has a `wait`
	        function, which returns that object once called.
	    """
	    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
	    # the whl location and we cannot handle multiple URLs at once by passing
	    # them to ctx.download if we want to correctly handle the relative URLs.
	    # TODO: Add a test that env subbed index urls do not leak into the lock file.

	    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
	    real_url = envsubst(url, attr.envsubst, getenv)

	    # The substituted URL doubles as the cache key.
	    if real_url in cache:
	        return struct(success = True, output = cache[real_url])

	    # Use env names in the subst values - this will be unique over
	    # the lifetime of the execution of this function and we also use
	    # `~` as the separator to ensure that we don't get clashes.
	    placeholders = {e: "~{}~".format(e) for e in attr.envsubst}
	    filename = envsubst(url, attr.envsubst, placeholders.get)

	    # Transform the URL into a valid filename
	    for separator in [".", ":", "/", "\\", "-"]:
	        filename = filename.replace(separator, "_")
	    filename = filename.strip("_").lower() + ".html"

	    output = ctx.path(filename)

	    # NOTE: this may have block = True or block = False in the download_kwargs
	    download = ctx.download(
	        url = [real_url],
	        output = output,
	        auth = get_auth(ctx, [real_url], ctx_attr = attr),
	        allow_fail = True,
	        **download_kwargs
	    )

	    if download_kwargs.get("block") == False:
	        # Simulate the same API as ctx.download has
	        return struct(
	            wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, real_url),
	        )

	    return _read_index_result(ctx, download, output, real_url, cache, real_url)

	def _read_index_result(ctx, result, output, url, cache, cache_key):
	    """Turn a finished download into a parsed Simple API result.

	    Args:
	        ctx: The module_ctx or repository_ctx, used for reading the file.
	        result: The finished download result; only `success` is inspected.
	        output: The path that the index page was downloaded to.
	        url: The URL the page was downloaded from, passed to the parser.
	        cache: A dict for storing the parsed results.
	        cache_key: The key under which the parsed result is stored in `cache`.

	    Returns:
	        `struct(success = False)` if the download failed or parsing yielded
	        nothing, otherwise a struct with `success = True`, the parsed
	        `output` and the `cache_key`.
	    """
	    if not result.success:
	        return struct(success = False)

	    parsed = parse_simpleapi_html(url = url, content = ctx.read(output))
	    if not parsed:
	        return struct(success = False)

	    # Keep whatever is already cached under this key, if anything.
	    cache.setdefault(cache_key, parsed)
	    return struct(success = True, output = parsed, cache_key = cache_key)