Ignas Anikevicius | 4a615be | 2024-04-05 17:13:19 +0900 | [diff] [blame] | 1 | # Copyright 2024 The Bazel Authors. All rights reserved. |
| 2 | # |
| 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | # you may not use this file except in compliance with the License. |
| 5 | # You may obtain a copy of the License at |
| 6 | # |
| 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | # |
| 9 | # Unless required by applicable law or agreed to in writing, software |
| 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | # See the License for the specific language governing permissions and |
| 13 | # limitations under the License. |
| 14 | |
| 15 | """ |
| 16 | A file that houses private functions used in the `bzlmod` extension with the same name. |
| 17 | """ |
| 18 | |
| 19 | load("@bazel_features//:features.bzl", "bazel_features") |
Ignas Anikevicius | 4a615be | 2024-04-05 17:13:19 +0900 | [diff] [blame] | 20 | load(":auth.bzl", "get_auth") |
| 21 | load(":envsubst.bzl", "envsubst") |
| 22 | load(":normalize_name.bzl", "normalize_name") |
| 23 | |
def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
    """Download Simple API HTML.

    Args:
        ctx: The module_ctx or repository_ctx.
        attr: Contains the parameters for the download. They are grouped into a
          struct for better clarity. It must have attributes:
           * index_url: str, the index.
           * index_url_overrides: dict[str, str], the index overrides for
             separate packages. Keys are normalized before lookup.
           * extra_index_urls: Extra index URLs that will be looked up after
             the main is looked up.
           * sources: list[str], the package names to fetch the index pages
             for. Each value is appended to the index URL as-is.
           * envsubst: list[str], the envsubst vars for performing substitution in index url.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dictionary that can be used as a cache between calls during a
            single evaluation of the extension. We use a dictionary as a cache
            so that we can reuse calls to the simple API when evaluating the
            extension. Using the canonical_id parameter of the module_ctx would
            deposit the simple API responses to the bazel cache and that is
            undesirable because additions to the PyPI index would not be
            reflected when re-evaluating the extension unless we do
            `bazel clean --expunge`.
        parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.

    Returns:
        dict of normalized pkg name to the parsed index contents, i.e. the
        value that `read_simple_api` reports as `output` for that package.
        Calls `fail` if a package cannot be read from any of its index URLs.
    """
    index_url_overrides = {
        normalize_name(p): i
        for p, i in (attr.index_url_overrides or {}).items()
    }

    download_kwargs = {}
    if bazel_features.external_deps.download_has_block_param:
        download_kwargs["block"] = not parallel_download

    # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
    # to replicate how `pip` would handle this case.
    async_downloads = {}
    contents = {}
    index_urls = [attr.index_url] + attr.extra_index_urls
    for pkg in attr.sources:
        pkg_normalized = normalize_name(pkg)

        # An override replaces every configured index for this package, so it
        # must be queried exactly once. Looping over all of the index URLs
        # would issue identical requests that write to the same output file,
        # which is racy when the downloads are non-blocking.
        if pkg_normalized in index_url_overrides:
            pkg_index_urls = [index_url_overrides[pkg_normalized]]
        else:
            pkg_index_urls = index_urls

        success = False
        for index_url in pkg_index_urls:
            result = read_simple_api(
                ctx = ctx,
                url = "{}/{}/".format(index_url.rstrip("/"), pkg),
                attr = attr,
                cache = cache,
                **download_kwargs
            )
            if hasattr(result, "wait"):
                # The download is still in flight, it is collected in the
                # second loop below.
                async_downloads.setdefault(pkg_normalized, []).append(result)
                continue

            if result.success:
                contents[pkg_normalized] = result.output
                success = True
                break

        # Only give up here if there is nothing pending for this package.
        if not success and pkg_normalized not in async_downloads:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(index_urls),
            ))

    if not async_downloads:
        return contents

    # If we use `block` == False, then we need to have a second loop that is
    # collecting all of the results as they were being downloaded in parallel.
    for pkg_normalized, downloads in async_downloads.items():
        for download in downloads:
            # Always wait, even if we already have a result, so that no
            # download is left pending.
            result = download.wait()

            # The first successful index wins; later ones are not merged in.
            if result.success and pkg_normalized not in contents:
                contents[pkg_normalized] = result.output

        # The package may have been satisfied from the cache in the first loop
        # while another index was queued asynchronously, so check `contents`
        # rather than only the results awaited above.
        if pkg_normalized not in contents:
            fail("Failed to download metadata from urls: {}".format(
                ", ".join(index_urls),
            ))

    return contents
| 124 | |
def read_simple_api(ctx, url, attr, cache, **download_kwargs):
    """Fetch a single Simple API index page and parse it.

    Args:
        ctx: The module_ctx or repository_ctx.
        url: str, the index page URL; it may contain envsubst placeholders.
        attr: The attribute that contains necessary info for downloading. The
          following attributes must be present:
           * envsubst: The envsubst values for performing substitutions in the URL.
           * netrc: The netrc parameter for ctx.download, see http_file for docs.
           * auth_patterns: The auth_patterns parameter for ctx.download, see
               http_file for docs.
        cache: A dict for storing the results, keyed by the substituted URL.
        **download_kwargs: Any extra params to ctx.download.
            Note that output and auth will be passed for you.

    Returns:
        A struct with a `success` field and, when successful, the parsed simple
        api contents in `output`. If `block = False` is passed in
        `download_kwargs` and the URL is not cached, a struct with a single
        `wait` callable is returned instead; calling it yields the struct
        described above.
    """
    # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for
    # the whl location and we cannot handle multiple URLs at once by passing
    # them to ctx.download if we want to correctly handle the relative URLs.
    # TODO: Add a test that env subbed index urls do not leak into the lock file.

    if hasattr(ctx, "getenv"):
        getenv = ctx.getenv
    else:
        getenv = ctx.os.environ.get
    real_url = envsubst(url, attr.envsubst, getenv)

    # The fully substituted URL identifies the page within the cache.
    cache_key = real_url
    if cache_key in cache:
        return struct(success = True, output = cache[cache_key])

    # Build the output file name from the URL with the env var *names* in place
    # of their values - this will be unique over the lifetime of the execution
    # of this function and `~` is used as the separator to avoid clashes.
    placeholders = {name: "~{}~".format(name) for name in attr.envsubst}
    file_stem = envsubst(url, attr.envsubst, placeholders.get)

    # Replace the characters that are not wanted in a file name.
    for unwanted in (".", ":", "/", "\\", "-"):
        file_stem = file_stem.replace(unwanted, "_")
    file_stem = file_stem.strip("_").lower()

    output = ctx.path(file_stem + ".html")

    # NOTE: this may have block = True or block = False in the download_kwargs
    download = ctx.download(
        url = [real_url],
        output = output,
        auth = get_auth(ctx, [real_url], ctx_attr = attr),
        allow_fail = True,
        **download_kwargs
    )

    if download_kwargs.get("block") != False:
        return _read_index_result(ctx, download, output, url, cache, cache_key)

    # Non-blocking download: expose the same `wait` API as ctx.download has and
    # parse the page only once the caller asks for the result.
    return struct(
        wait = lambda: _read_index_result(ctx, download.wait(), output, url, cache, cache_key),
    )
| 191 | |
def _read_index_result(ctx, result, output, url, cache, cache_key):
    """Turn a finished download into a parsed index result.

    Args:
        ctx: The module_ctx or repository_ctx used to read the downloaded file.
        result: The finished download result; only `success` is inspected.
        output: The path that the index page was downloaded to.
        url: str, the index page URL that is handed to the HTML parser.
        cache: A dict for storing the parsed results.
        cache_key: The key under which the parsed result is stored in `cache`.

    Returns:
        `struct(success = False)` if the download failed or parsing yielded a
        falsy value, otherwise a struct with `success = True`, the parsed
        `output` and the `cache_key`.
    """
    if result.success:
        parsed = parse_simple_api_html(url = url, content = ctx.read(output))
        if parsed:
            # Do not overwrite an entry that has been stored in the meantime.
            cache.setdefault(cache_key, parsed)
            return struct(success = True, output = parsed, cache_key = cache_key)

    return struct(success = False)
| 204 | |
def parse_simple_api_html(*, url, content):
    """Get the package URLs for given shas by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from. It is
            used to resolve relative distribution URLs.
        content(str): The Simple API HTML content.

    Returns:
        A struct with two dicts, `sdists` and `whls`, both keyed by the sha256
        of the artifact. Anchors whose text ends with `.whl` go to `whls`,
        everything else goes to `sdists`. The values are structs with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
          present, then the 'metadata_url' is also present. Defaults to "".
          Always "" for sdists.
        * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
          Always "" for sdists.
        * yanked: bool, whether the anchor carries a `data-yanked` marker.

        Calls `fail` if the page declares a repository version of 2.0 or newer.
    """
    sdists = {}
    whls = {}

    # Element 0 is the page header, every following element starts with the
    # href value of one anchor.
    lines = content.split("<a href=\"")

    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
    api_version, _, _ = api_version.partition("\"")

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = api_version or "1.0"
    api_version = tuple([int(i) for i in api_version.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    for line in lines[1:]:
        dist_url, _, tail = line.partition("#sha256=")
        sha256, _, tail = tail.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in line

        # Bound the anchor at its closing tag and take the text after the
        # *last* `>` as the filename. Splitting at the first `>` breaks on
        # indexes that do not HTML-escape attribute values, e.g.
        # `data-requires-python=">=3.7"`.
        anchor, _, _ = tail.partition("</a>")
        maybe_metadata, _, filename = anchor.rpartition(">")

        metadata_sha256 = ""
        metadata_url = ""
        for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
            metadata_marker = metadata_marker + "=\"sha256="
            if metadata_marker in maybe_metadata:
                # Implement https://peps.python.org/pep-0714/
                _, _, tail = maybe_metadata.partition(metadata_marker)
                metadata_sha256, _, _ = tail.partition("\"")
                metadata_url = dist_url + ".metadata"
                break

        if filename.endswith(".whl"):
            whls[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = metadata_sha256,
                metadata_url = _absolute_url(url, metadata_url),
                yanked = yanked,
            )
        else:
            sdists[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = "",
                metadata_url = "",
                yanked = yanked,
            )

    return struct(
        sdists = sdists,
        whls = whls,
    )
| 282 | |
def _absolute_url(index_url, candidate):
    """Resolve a distribution href found on an index page.

    Args:
        index_url: str, the URL of the index page that contained the href.
        candidate: str, the href value; may be absolute, root-relative
            (`/path`) or parent-relative (`../path`).

    Returns:
        The resolved URL for root-relative and parent-relative candidates,
        otherwise the candidate unchanged (this includes the empty string and
        already absolute URLs).
    """
    if candidate.startswith("/") and not candidate.startswith("//"):
        # Root-relative path: keep only the scheme and authority of the index.
        scheme, sep, rest = index_url.partition("://")
        if not sep:
            return candidate
        authority, _, _ = rest.partition("/")
        return "{}://{}{}".format(scheme, authority, candidate)

    if not candidate.startswith(".."):
        return candidate

    # Count only the leading `..` path segments. Splitting the whole string on
    # ".." would also match dots inside file names or later path components
    # and walk too far up the index URL.
    segments = candidate.split("/")
    levels_up = 0
    for segment in segments:
        if segment != "..":
            break
        levels_up += 1

    if levels_up == 0:
        # Starts with ".." but it is not a parent reference, e.g. "..foo".
        return candidate

    for _ in range(levels_up):
        index_url = index_url.rstrip("/").rpartition("/")[0]

    return "{}/{}".format(index_url, "/".join(segments[levels_up:]).strip("/"))