Blame - python/private/pypi_index.bzl - third_party/github/bazelbuild/rules_python

blob: 64d908e32b533f353493a86644b6cd60f518d2cf [file] [log] [blame]

Ignas Anikevicius	4a615be	2024-04-05 17:13:19 +0900	[diff] [blame]	1	# Copyright 2024 The Bazel Authors. All rights reserved.
				2	#
				3	# Licensed under the Apache License, Version 2.0 (the "License");
				4	# you may not use this file except in compliance with the License.
				5	# You may obtain a copy of the License at
				6	#
				7	# http://www.apache.org/licenses/LICENSE-2.0
				8	#
				9	# Unless required by applicable law or agreed to in writing, software
				10	# distributed under the License is distributed on an "AS IS" BASIS,
				11	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	# See the License for the specific language governing permissions and
				13	# limitations under the License.
				14
				15	"""
				16	A file that houses private functions used in the `bzlmod` extension with the same name.
				17	"""
				18
				19	load("@bazel_features//:features.bzl", "bazel_features")
Ignas Anikevicius	4a615be	2024-04-05 17:13:19 +0900	[diff] [blame]	20	load(":auth.bzl", "get_auth")
				21	load(":envsubst.bzl", "envsubst")
				22	load(":normalize_name.bzl", "normalize_name")
				23
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
             * index_url: str, the index.
             * index_url_overrides: dict[str, str], the index overrides for
               separate packages. Keys are normalized before lookup. A
               package with an override is only looked up in that index.
             * extra_index_urls: Extra index URLs that will be looked up after
               the main is looked up.
             * sources: list[str], the packages to download the index pages
               for. Each value is used as the last path segment of the
               index URL.
             * envsubst: list[str], the envsubst vars for performing substitution in index url.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of normalized pkg name to the parsed Simple API contents as
        produced by `read_simple_api`.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    async_downloads = {}
    contents = {}

    # The index URLs that were actually queried for each package; only used
    # for error reporting.
    queried_index_urls = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    for pkg in attr.sources:
        pkg_normalized = normalize_name(pkg)

        # An override replaces all of the configured indexes for the package,
        # so query it exactly once. Looping over every configured index would
        # request the same URL repeatedly and, with non-blocking downloads,
        # start concurrent downloads into the same output file.
        if pkg_normalized in index_url_overrides:
            pkg_index_urls = [index_url_overrides[pkg_normalized]]
        else:
            pkg_index_urls = index_urls
        queried_index_urls[pkg_normalized] = pkg_index_urls

        success = False
        for index_url in pkg_index_urls:
            result = read_simple_api(
                ctx = ctx,
                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # We will process it in a separate loop:
                async_downloads.setdefault(pkg_normalized, []).append(result.wait)
                continue

            if result.success:
                contents[pkg_normalized] = result.output
                success = True
                break

        if not async_downloads and not success:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(pkg_index_urls),
            ))

    if not async_downloads:
        return contents

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg_normalized, waits in async_downloads.items():
        for wait in waits:
            # Always wait, even if we already have a result for the package,
            # so that every download that was started is also finished.
            result = wait()

            # The first successful index wins.
            if result.success and pkg_normalized not in contents:
                contents[pkg_normalized] = result.output

        # The package may have been resolved in the first loop already (a
        # cache hit returns a plain result instead of a pending download), so
        # decide based on `contents` rather than on what happened in this loop.
        if pkg_normalized not in contents:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(queried_index_urls[pkg_normalized]),
            ))

    return contents
				124
				125	def read_simple_api(ctx, url, attr, cache, **download_kwargs):
				126	"""Read SimpleAPI.
				127
				128	Args:
				129	ctx: The module_ctx or repository_ctx.
				130	url: str, the url parameter that can be passed to ctx.download.
				131	attr: The attribute that contains necessary info for downloading. The
				132	following attributes must be present:
				133	* envsubst: The envsubst values for performing substitutions in the URL.
				134	* netrc: The netrc parameter for ctx.download, see http_file for docs.
				135	* auth_patterns: The auth_patterns parameter for ctx.download, see
				136	http_file for docs.
				137	cache: A dict for storing the results.
				138	**download_kwargs: Any extra params to ctx.download.
				139	Note that output and auth will be passed for you.
				140
				141	Returns:
				142	A similar object to what `download` would return except that in result.out
				143	will be the parsed simple api contents.
				144	"""
				145	# NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
				146	# the whl location and we cannot handle multiple URLs at once by passing
				147	# them to ctx.download if we want to correctly handle the relative URLs.
				148	# TODO: Add a test that env subbed index urls do not leak into the lock file.
				149
				150	real_url = envsubst(
				151	url,
				152	attr.envsubst,
				153	ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get,
				154	)
				155
				156	cache_key = real_url
				157	if cache_key in cache:
				158	return struct(success = True, output = cache[cache_key])
				159
				160	output_str = envsubst(
				161	url,
				162	attr.envsubst,
				163	# Use env names in the subst values - this will be unique over
				164	# the lifetime of the execution of this function and we also use
				165	# `~` as the separator to ensure that we don't get clashes.
				166	{e: "~{}~".format(e) for e in attr.envsubst}.get,
				167	)
				168
				169	# Transform the URL into a valid filename
				170	for char in [".", ":", "/", "\\", "-"]:
				171	output_str = output_str.replace(char, "_")
				172
				173	output = ctx.path(output_str.strip("_").lower() + ".html")
				174
				175	# NOTE: this may have block = True or block = False in the download_kwargs
				176	download = ctx.download(
				177	url = [real_url],
				178	output = output,
				179	auth = get_auth(ctx, [real_url], ctx_attr = attr),
				180	allow_fail = True,
				181	**download_kwargs
				182	)
				183
				184	if download_kwargs.get("block") == False:
				185	# Simulate the same API as ctx.download has
				186	return struct(
				187	wait = lambda: _read_index_result(ctx, download.wait(), output, url, cache, cache_key),
				188	)
				189
				190	return _read_index_result(ctx, download, output, url, cache, cache_key)
				191
				192	def _read_index_result(ctx, result, output, url, cache, cache_key):
				193	if not result.success:
				194	return struct(success = False)
				195
				196	content = ctx.read(output)
				197
				198	output = parse_simple_api_html(url = url, content = content)
				199	if output:
				200	cache.setdefault(cache_key, output)
				201	return struct(success = True, output = output, cache_key = cache_key)
				202	else:
				203	return struct(success = False)
				204
def parse_simple_api_html(*, url, content):
    """Get the package URLs for given shas by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from.
        content(str): The Simple API HTML content.

    Returns:
        A struct with `sdists` and `whls` attributes. Each of them is a dict
        from the sha256 of the artifact to a struct with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
          present, then the 'metadata_url' is also present. Defaults to "".
        * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
        * yanked: Whether the `data-yanked` attribute is present on the link.

        Links that do not carry a `#sha256=` fragment are skipped.
    """
    sdists = {}
    whls = {}
    lines = content.split("<a href=\"")

    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
    api_version, _, _ = api_version.partition("\"")

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = api_version or "1.0"
    api_version = tuple([int(i) for i in api_version.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    for line in lines[1:]:
        dist_url, has_sha256, tail = line.partition("#sha256=")
        if not has_sha256:
            # Without the hash fragment we cannot key the artifact and the
            # `dist_url` above would contain the rest of the HTML chunk.
            continue

        sha256, _, tail = tail.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in line

        # Attribute values may contain an unescaped `>` (e.g.
        # `data-requires-python=">=3.7"`), so find the link text by searching
        # backwards from the closing tag instead of forwards from the first `>`.
        head, closing_tag, _ = tail.rpartition("</a>")
        if closing_tag:
            maybe_metadata, _, filename = head.rpartition(">")
        else:
            # No closing tag found, fall back to a forward scan.
            maybe_metadata, _, rest = tail.partition(">")
            filename, _, _ = rest.partition("<")

        metadata_sha256 = ""
        metadata_url = ""
        for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
            metadata_marker = metadata_marker + "=\"sha256="
            if metadata_marker in maybe_metadata:
                # Implement https://peps.python.org/pep-0714/
                _, _, metadata_tail = maybe_metadata.partition(metadata_marker)
                metadata_sha256, _, _ = metadata_tail.partition("\"")
                metadata_url = dist_url + ".metadata"
                break

        if filename.endswith(".whl"):
            whls[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = metadata_sha256,
                metadata_url = _absolute_url(url, metadata_url),
                yanked = yanked,
            )
        else:
            # The metadata attributes are only recorded for wheels.
            sdists[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = "",
                metadata_url = "",
                yanked = yanked,
            )

    return struct(
        sdists = sdists,
        whls = whls,
    )
				282
				283	def _absolute_url(index_url, candidate):
				284	if not candidate.startswith(".."):
				285	return candidate
				286
				287	candidate_parts = candidate.split("..")
				288	last = candidate_parts[-1]
				289	for _ in range(len(candidate_parts) - 1):
				290	index_url, _, _ = index_url.rstrip("/").rpartition("/")
				291
				292	return "{}/{}".format(index_url, last.strip("/"))