blob: 28f1007b48ffb3724ecd37cfbe99f01e3359a013 [file] [log] [blame]
# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A file that houses private functions used in the `bzlmod` extension with the same name.
"""
load("@bazel_features//:features.bzl", "bazel_features")
load("@bazel_skylib//lib:sets.bzl", "sets")
load("//python/pip_install:requirements_parser.bzl", parse_requirements = "parse")
load(":auth.bzl", "get_auth")
load(":envsubst.bzl", "envsubst")
load(":normalize_name.bzl", "normalize_name")
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
             * index_url: str, the index.
             * index_url_overrides: dict[str, str], the index overrides for
               separate packages.
             * extra_index_urls: Extra index URLs that will be looked up after
               the main is looked up.
             * sources: list[str], the sources to download things for. Each value is
               the contents of requirements files.
             * envsubst: list[str], the envsubst vars for performing substitution in index url.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of pkg name to the parsed HTML contents - a list of structs.
    """

    # Normalize the override keys so that lookups succeed regardless of how
    # the user spelled the package name in the override dict.
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        # With `block = False` (bazel >= 7.1) ctx.download returns a handle
        # with a `wait()` method instead of blocking until completion.
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    async_downloads = {}
    contents = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    for pkg in get_packages_from_requirements(attr.sources):
        pkg_normalized = normalize_name(pkg)
        success = False
        for index_url in index_urls:
            result = read_simple_api(
                ctx = ctx,
                url = "{}/{}/".format(
                    # A per-package override, when present, replaces every
                    # configured index URL for that package.
                    index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
                    pkg,
                ),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # We will process it in a separate loop:
                async_downloads.setdefault(pkg_normalized, []).append(
                    struct(
                        pkg_normalized = pkg_normalized,
                        wait = result.wait,
                    ),
                )
                continue

            # Synchronous path: take the first index that responds
            # successfully and skip the remaining ones.
            if result.success:
                contents[pkg_normalized] = result.output
                success = True
                break

        if not async_downloads and not success:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(index_urls),
            ))

    if not async_downloads:
        return contents

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg, downloads in async_downloads.items():
        success = False
        for download in downloads:
            result = download.wait()

            # Keep only the first successful result per package; entries for
            # indexes earlier in `index_urls` were appended first and so win.
            if result.success and download.pkg_normalized not in contents:
                contents[download.pkg_normalized] = result.output
                success = True

        if not success:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(index_urls),
            ))

    return contents
def read_simple_api(ctx, url, attr, cache, **download_kwargs):
    """Read SimpleAPI.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url parameter that can be passed to ctx.download.
        attr: The attribute that contains necessary info for downloading. The
            following attributes must be present:
             * envsubst: The envsubst values for performing substitutions in the URL.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        A similar object to what `download` would return except that in result.out
        will be the parsed simple api contents.
    """

    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    # The URL actually fetched: env vars (e.g. credentials) substituted in.
    real_url = envsubst(
        url,
        attr.envsubst,
        ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get,
    )

    cache_key = real_url
    if cache_key in cache:
        # Already fetched and parsed during this extension evaluation.
        return struct(success = True, output = cache[cache_key])

    # Build a second substituted string using the env var *names* (not their
    # values) so that secrets never end up in the output filename below.
    output_str = envsubst(
        url,
        attr.envsubst,
        # Use env names in the subst values - this will be unique over
        # the lifetime of the execution of this function and we also use
        # `~` as the separator to ensure that we don't get clashes.
        {e: "~{}~".format(e) for e in attr.envsubst}.get,
    )

    # Transform the URL into a valid filename
    for char in [".", ":", "/", "\\", "-"]:
        output_str = output_str.replace(char, "_")

    output = ctx.path(output_str.strip("_").lower() + ".html")

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = get_auth(ctx, [real_url], ctx_attr = attr),
        allow_fail = True,
    **download_kwargs
    )

    if download_kwargs.get("block") == False:
        # Simulate the same API as ctx.download has
        return struct(
            wait = lambda: _read_index_result(ctx, download.wait(), output, url, cache, cache_key),
        )

    return _read_index_result(ctx, download, output, url, cache, cache_key)
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Convert a finished `ctx.download` result into a parsed SimpleAPI response.

    Reads the downloaded HTML from `output`, parses it, and memoizes the
    parsed value in `cache` under `cache_key`. Returns a struct with
    `success = False` when either the download failed or the page yielded
    no parsed content.
    """
    if not result.success:
        return struct(success = False)

    html = ctx.read(output)
    parsed = parse_simple_api_html(url = url, content = html)
    if not parsed:
        return struct(success = False)

    # Only the first successful parse for this key is stored.
    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed, cache_key = cache_key)
def get_packages_from_requirements(requirements_files):
    """Get Simple API sources from a list of requirements files and merge them.

    Args:
        requirements_files(list[str]): A list of requirements files contents.

    Returns:
        A list.
    """
    packages = sets.make()
    for requirements_txt in requirements_files:
        # NOTE: we'll be querying the PyPI servers multiple times if the
        # requirements contains non-normalized names, but this is what user
        # is specifying to us.
        for distribution, _ in parse_requirements(requirements_txt).requirements:
            sets.insert(packages, distribution)

    return sets.to_list(packages)
def get_simpleapi_sources(line):
    """Get PyPI sources from a requirements.txt line.

    We interpret the spec described in
    https://pip.pypa.io/en/stable/reference/requirement-specifiers/#requirement-specifiers

    Args:
        line(str): The requirements.txt entry.

    Returns:
        A struct with shas attribute containing a list of shas to download from pypi_index.
    """
    spec, _, tail = line.partition(";")

    # The version is whatever follows `==` up to the first space.
    version = spec.partition("==")[2].partition(" ")[0].strip()

    if "@" in spec:
        # `@` marks a direct URL reference; no sha256 values are collected.
        shas = []
    else:
        # Without a `;` separator the hash options (if any) live on the
        # whole line rather than in the tail.
        tail = tail or line
        shas = [
            h.strip()
            for h in tail.split("--hash=sha256:")[1:]
        ]

    if spec == line:
        spec = line.partition("--hash=")[0].strip()
    else:
        spec = spec + ";" + tail.partition("--hash=")[0].strip()

    return struct(
        requirement = line if not shas else spec,
        version = version,
        shas = sorted(shas),
    )
def parse_simple_api_html(*, url, content):
    """Get the package URLs for given shas by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from.
        content(str): The Simple API HTML content.

    Returns:
        A struct with `sdists` and `whls` attributes, each a dict keyed by
        sha256. The values are structs with:
          * filename: The filename of the artifact.
          * url: The URL to download the artifact.
          * sha256: The sha256 of the artifact.
          * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
            present, then the 'metadata_url' is also present. Defaults to "".
          * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
          * yanked: bool, whether the anchor tag carried `data-yanked`.
    """
    sdists = {}
    whls = {}

    # Each anchor tag corresponds to one distribution file; split the page on
    # the opening of the href attribute.
    lines = content.split("<a href=\"")

    # The repository version is advertised in a meta tag located before the
    # first anchor (i.e. in lines[0]).
    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
    api_version, _, _ = api_version.partition("\"")

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = api_version or "1.0"
    api_version = tuple([int(i) for i in api_version.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    for line in lines[1:]:
        # The href looks like `<url>#sha256=<digest>"`; everything before the
        # fragment is the (possibly index-relative) download URL.
        dist_url, _, tail = line.partition("#sha256=")
        sha256, _, tail = tail.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in line

        # Remaining anchor attributes sit between the closing quote and `>`;
        # the link text (the filename) follows up to the next `<`.
        maybe_metadata, _, tail = tail.partition(">")
        filename, _, tail = tail.partition("<")

        metadata_sha256 = ""
        metadata_url = ""
        for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
            metadata_marker = metadata_marker + "=\"sha256="
            if metadata_marker in maybe_metadata:
                # Implement https://peps.python.org/pep-0714/
                _, _, tail = maybe_metadata.partition(metadata_marker)
                metadata_sha256, _, _ = tail.partition("\"")
                metadata_url = dist_url + ".metadata"
                break

        if filename.endswith(".whl"):
            whls[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = metadata_sha256,
                metadata_url = _absolute_url(url, metadata_url),
                yanked = yanked,
            )
        else:
            # Sdists never get metadata fields; PEP 658/714 metadata only
            # applies to wheels in this parser.
            sdists[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = "",
                metadata_url = "",
                yanked = yanked,
            )

    return struct(
        sdists = sdists,
        whls = whls,
    )
def _absolute_url(index_url, candidate):
    """Resolve a `..`-relative distribution URL against the index page URL.

    Anything that does not start with `..` is returned untouched. Otherwise,
    for every `..` present in `candidate`, one trailing path segment is
    dropped from `index_url` and the remainder of `candidate` is appended.
    """
    if not candidate.startswith(".."):
        return candidate

    parts = candidate.split("..")
    remainder = parts[-1]

    base = index_url
    for _ in range(len(parts) - 1):
        # Drop one trailing path segment (ignoring any trailing slash).
        base = base.rstrip("/").rpartition("/")[0]

    return "{}/{}".format(base, remainder.strip("/"))