blob: 64d908e32b533f353493a86644b6cd60f518d2cf [file] [log] [blame]
Ignas Anikevicius4a615be2024-04-05 17:13:19 +09001# Copyright 2024 The Bazel Authors. All rights reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""
16A file that houses private functions used in the `bzlmod` extension with the same name.
17"""
18
19load("@bazel_features//:features.bzl", "bazel_features")
Ignas Anikevicius4a615be2024-04-05 17:13:19 +090020load(":auth.bzl", "get_auth")
21load(":envsubst.bzl", "envsubst")
22load(":normalize_name.bzl", "normalize_name")
23
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
            struct for better clarity. It must have attributes:
             * index_url: str, the index.
             * index_url_overrides: dict[str, str], the index overrides for
               separate packages. The keys are normalized before lookup.
             * extra_index_urls: Extra index URLs that will be looked up after
               the main is looked up. May be empty or None.
             * sources: list[str], the names of the packages to look up. Each
               value is appended to the index URL to form the page URL.
             * envsubst: list[str], the envsubst vars for performing substitution in index url.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of normalized pkg name to the parsed Simple API page, i.e. the
        `output` of a successful `read_simple_api` call. Fails if a package
        cannot be found on any of the indexes that apply to it.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    async_downloads = {}
    contents = {}
    index_urls = [attr.index_url] + (attr.extra_index_urls or [])
    for pkg in attr.sources:
        pkg_normalized = normalize_name(pkg)

        # An override pins the package to exactly one index. Only request it
        # once instead of once per configured index, which would start several
        # downloads of the same URL into the same output file.
        if pkg_normalized in index_url_overrides:
            pkg_index_urls = [index_url_overrides[pkg_normalized]]
        else:
            pkg_index_urls = index_urls

        success = False
        for index_url in pkg_index_urls:
            result = read_simple_api(
                ctx = ctx,
                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # We will process it in a separate loop:
                async_downloads.setdefault(pkg_normalized, []).append(
                    struct(
                        pkg_normalized = pkg_normalized,
                        index_url = index_url,
                        wait = result.wait,
                    ),
                )
                continue

            if result.success:
                contents[pkg_normalized] = result.output
                success = True
                break

        if not async_downloads and not success:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(pkg_index_urls),
            ))

    if not async_downloads:
        return contents

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg_normalized, downloads in async_downloads.items():
        for download in downloads:
            # Always wait so that every started download is finished, even if
            # the package has already been resolved.
            result = download.wait()

            # The first successful index wins.
            if result.success and pkg_normalized not in contents:
                contents[pkg_normalized] = result.output

        # The package may also have been resolved from the cache in the first
        # loop, so check the collected contents rather than only the results
        # of the downloads that were awaited here.
        if pkg_normalized not in contents:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join([download.index_url for download in downloads]),
            ))

    return contents
124
def read_simple_api(ctx, url, attr, cache, **download_kwargs):
    """Read SimpleAPI.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the url parameter that can be passed to ctx.download.
        attr: The attribute that contains necessary info for downloading. The
            following attributes must be present:
             * envsubst: The envsubst values for performing substitutions in the URL.
             * netrc: The netrc parameter for ctx.download, see http_file for docs.
             * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results, keyed by the URL after the
            environment variables have been substituted.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        A struct with a `success` field and, when successful, an `output` field
        holding the parsed simple api contents. If `block = False` was passed
        and the URL was not in the cache, a struct with a single `wait` field
        is returned instead; calling it yields the struct described above.
    """
    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    getenv = ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get
    real_url = envsubst(url, attr.envsubst, getenv)

    cache_key = real_url
    if cache_key in cache:
        return struct(success = True, output = cache[cache_key])

    # Substitute the env var names (not their values) when deriving the file
    # name - this will be unique over the lifetime of the execution of this
    # function and we also use `~` as the separator to ensure that we don't
    # get clashes.
    placeholders = {e: "~{}~".format(e) for e in attr.envsubst}
    file_stem = envsubst(url, attr.envsubst, placeholders.get)

    # Transform the URL into a valid filename
    for char in [".", ":", "/", "\\", "-"]:
        file_stem = file_stem.replace(char, "_")

    output = ctx.path(file_stem.strip("_").lower() + ".html")

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = get_auth(ctx, [real_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        # Blocking download: the result is available right away.
        return _read_index_result(ctx, download, output, url, cache, cache_key)

    # Simulate the same API as ctx.download has
    return struct(
        wait = lambda: _read_index_result(ctx, download.wait(), output, url, cache, cache_key),
    )
191
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Parse a finished index download and remember it in the cache.

    Args:
        ctx: The module_ctx or repository_ctx, used to read the downloaded file.
        result: The finished download result; only `success` is inspected.
        output: The path that the index page was downloaded to.
        url: str, the URL of the index page, passed on to the HTML parser.
        cache: A dict for storing the parsed results.
        cache_key: The key under which the parsed result is stored.

    Returns:
        `struct(success = False)` if the download failed or nothing was parsed,
        otherwise a struct with `success = True`, the parsed `output` and the
        `cache_key`.
    """
    if not result.success:
        return struct(success = False)

    parsed = parse_simple_api_html(url = url, content = ctx.read(output))
    if not parsed:
        return struct(success = False)

    # setdefault keeps an entry that is already present for this key.
    cache.setdefault(cache_key, parsed)
    return struct(success = True, output = parsed, cache_key = cache_key)
204
def parse_simple_api_html(*, url, content):
    """Get the package URLs for given shas by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from.
        content(str): The Simple API HTML content.

    Returns:
        A struct with two dicts, `sdists` and `whls`, both keyed by the sha256
        of the artifact. Artifacts whose filename ends with `.whl` go to `whls`
        and everything else goes to `sdists`. Each value is a struct with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
          present, then the 'metadata_url' is also present. Defaults to "" and is
          always "" for sdists.
        * metadata_url: The URL for the METADATA if we can download it. Defaults to ""
          and is always "" for sdists.
        * yanked: bool, True if the link carries the `data-yanked` attribute.
    """
    sdists = {}
    whls = {}

    # Everything before the first link is the page header, every following
    # chunk starts with the href value of one link.
    chunks = content.split("<a href=\"")

    declared_version = chunks[0].partition("name=\"pypi:repository-version\" content=\"")[2]
    declared_version = declared_version.partition("\"")[0]

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = tuple([int(part) for part in (declared_version or "1.0").split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    for anchor in chunks[1:]:
        dist_url, _, rest = anchor.partition("#sha256=")
        sha256, _, rest = rest.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in anchor

        # The remaining attributes of the tag come before `>`, the link text
        # (the filename) sits between `>` and the next `<`.
        tag_attrs, _, rest = rest.partition(">")
        filename = rest.partition("<")[0]

        metadata_sha256 = ""
        metadata_url = ""
        for attr_name in ["data-core-metadata", "data-dist-info-metadata"]:
            marker = attr_name + "=\"sha256="
            if marker not in tag_attrs:
                continue

            # Implement https://peps.python.org/pep-0714/
            metadata_sha256 = tag_attrs.partition(marker)[2].partition("\"")[0]
            metadata_url = dist_url + ".metadata"
            break

        # The metadata is only reported for wheels.
        is_whl = filename.endswith(".whl")
        dist = struct(
            filename = filename,
            url = _absolute_url(url, dist_url),
            sha256 = sha256,
            metadata_sha256 = metadata_sha256 if is_whl else "",
            metadata_url = _absolute_url(url, metadata_url) if is_whl else "",
            yanked = yanked,
        )
        if is_whl:
            whls[sha256] = dist
        else:
            sdists[sha256] = dist

    return struct(
        sdists = sdists,
        whls = whls,
    )
282
def _absolute_url(index_url, candidate):
    """Resolve a link found on an index page against the URL of that page.

    Args:
        index_url: str, the URL of the index page that the link was found on.
        candidate: str, the link target. It may be empty, already absolute
            (`scheme://...`), scheme-relative (`//host/path`), host-relative
            (`/path`) or relative to the index page (`../path`, `./path`, `path`).

    Returns:
        The candidate unchanged if it is empty or already absolute, otherwise
        the candidate resolved against `index_url`.
    """
    if not candidate:
        return candidate

    scheme, scheme_sep, _ = candidate.partition("://")
    if scheme_sep and scheme and "/" not in scheme:
        # Already an absolute URL.
        return candidate

    index_scheme, index_sep, index_rest = index_url.partition("://")

    if candidate.startswith("//"):
        # Scheme-relative link: reuse the scheme of the index page.
        if not index_sep:
            return candidate
        return "{}:{}".format(index_scheme, candidate)

    if candidate.startswith("/"):
        # Host-relative link: keep only the scheme and the host of the index.
        if index_sep:
            root = "{}://{}".format(index_scheme, index_rest.partition("/")[0])
        else:
            root = index_url.partition("/")[0]
        return root + candidate

    # Only the leading `..` segments walk up the directory tree. Splitting the
    # whole candidate on ".." would also count dots that are part of a filename.
    segments = candidate.split("/")
    ups = 0
    consumed = 0
    for segment in segments:
        if segment == "..":
            ups += 1
        elif segment != ".":
            break
        consumed += 1

    base = index_url.rstrip("/")
    for _ in range(ups):
        base = base.rstrip("/").rpartition("/")[0]

    return "{}/{}".format(base, "/".join(segments[consumed:]).strip("/"))
292 return "{}/{}".format(index_url, last.strip("/"))