blob: 28c6cbeafbc3657f5ab808b77b86b933d69206f0 [file] [log] [blame]
"""A cache for the PyPI index contents evaluation.
This is designed to work as follows:
- in-memory cache for results of PyPI index queries, so that we are not calling PyPI multiple times
for the same package for different hub repos.
In the future the same will be used to:
- Store PyPI index query results as facts in the MODULE.bazel.lock file
"""
load(":version_from_filename.bzl", "version_from_filename")
# The version of the storage format used for the facts in the lock file.
# This value should be changed whenever the storage format changes.
# Changing it simply means the information cached in the lockfile has to be
# recomputed, because facts stored with a different version are not read.
_FACT_VERSION = "v1"
def pypi_cache(mctx = None, store = None):
    """Create the cache used for PyPI index queries.

    The keys passed to `get` and `setdefault` are tuples of the following
    structure:
        (index_url, real_url, versions)

    Args:
        mctx: The module context. Its `facts` attribute, if there is one, is
            what the facts cache reads from.
        store: The in-memory store, should implement dict interface for get and setdefault

    Returns:
        A cache struct with the `get`, `setdefault` and `get_facts` functions.
    """

    # The lambdas refer to `self` before the assignment is finished, hence the
    # buildifier suppression around the struct.
    # buildifier: disable=uninitialized
    self = struct(
        _mcache = memory_cache(store),
        _facts = facts_cache(getattr(mctx, "facts", None)),
        get = lambda key: _pypi_cache_get(self, key),
        setdefault = lambda key, parsed_result: _pypi_cache_setdefault(self, key, parsed_result),
        get_facts = lambda: _pypi_cache_get_facts(self),
    )
    # buildifier: enable=uninitialized

    return self
def _pypi_cache_setdefault(self, key, parsed_result):
    """Record a parsed index result in the caches.

    Args:
        self: {type}`struct` The self of this implementation.
        key: {type}`tuple` The cache key of the form
            `(index_url, real_url, versions)`:
            * `real_url` is the key for the in-memory cache, which ensures that
              there are no duplicate calls to the PyPI indexes.
            * `index_url` is the key for the facts written to the
              MODULE.bazel.lock file.
            * `versions` selects which distributions are written as facts.
        parsed_result: {type}`struct` The result of `parse_simpleapi_html` function.

    Returns:
        The `parsed_result` if there are no versions to filter by or there is
        no facts cache, otherwise whatever the facts cache returns for the
        result filtered down to `versions`.
    """
    index_url, real_url, versions = key

    # The in-memory cache always receives the unfiltered result.
    self._mcache.setdefault(real_url, parsed_result)

    if versions and self._facts:
        # Filter the packages to only what is needed before writing to the facts cache
        return self._facts.setdefault(
            index_url,
            _filter_packages(parsed_result, versions),
        )

    return parsed_result
def _pypi_cache_get(self, key):
    """Look up a parsed index result in the caches.

    Args:
        self: {type}`struct` The self of this implementation.
        key: {type}`tuple` The cache key of the form
            `(index_url, real_url, versions)`.

    Returns:
        The {type}`struct` or `None` based on if the result is in the cache or not.
    """
    index_url, real_url, versions = key

    # The in-memory cache is consulted first and its result is filtered down
    # to only what is needed. Reading from memory first allows us to not parse
    # the contents of the lock file that may add up.
    in_memory = _filter_packages(self._mcache.get(real_url), versions)

    if in_memory or not versions or not self._facts:
        return in_memory

    # Could not get from in-memory, read from lockfile facts
    return self._facts.get(index_url, versions)
def _pypi_cache_get_facts(self):
    """Return the facts collected by the facts cache.

    Args:
        self: {type}`struct` The self of this implementation.

    Returns:
        The `facts` attribute of the facts cache, or an empty dict if there is
        no facts cache.
    """
    return self._facts.facts if self._facts else {}
def memory_cache(cache = None):
    """SimpleAPI in-memory cache, used to make fewer calls to the index.

    The `real_url` is used as the key in the cache functions on purpose in
    order to get the best possible cache hits.

    Args:
        cache: the storage to store things in memory; a new dict is used when
            it is `None`.

    Returns:
        struct with 2 methods, `get` and `setdefault`.
    """
    store = cache if cache != None else {}

    return struct(
        setdefault = lambda real_url, value: store.setdefault(real_url, value),
        get = lambda real_url: store.get(real_url),
    )
def _filter_packages(dists, requested_versions):
    """Keep only the distributions that are of the requested versions.

    Args:
        dists: {type}`struct | None` The parsed index result with the `sdists`
            and `whls` dicts, both keyed by sha256.
        requested_versions: The versions to keep; if it is empty or `None`,
            nothing is filtered.

    Returns:
        `dists` unchanged if there is nothing to filter, `None` if no
        distribution matches the requested versions, otherwise a new struct
        with the matching `whls`, `sdists` and the `sha256s_by_version`.
    """
    if dists == None or not requested_versions:
        return dists

    kept_sdists = {}
    kept_whls = {}
    shas = {}

    # sdists are visited before whls
    for source, kept in [(dists.sdists, kept_sdists), (dists.whls, kept_whls)]:
        for sha256, dist in source.items():
            if dist.version in requested_versions:
                kept[sha256] = dist
                shas.setdefault(dist.version, []).append(sha256)

    if not (kept_whls or kept_sdists):
        # TODO @aignas 2026-03-08: add logging
        #print("WARN: no dists matched for versions {}".format(requested_versions))
        return None

    return struct(
        whls = kept_whls,
        sdists = kept_sdists,
        sha256s_by_version = {
            version: sorted(values)
            for version, values in shas.items()
        },
    )
def facts_cache(known_facts, facts_version = _FACT_VERSION):
    """The cache that is backed by the facts.

    Things stored here are keyed by the index URL. The main thing to keep in
    mind is that we should not use the real_url in case it contains credentials
    in it (e.g. is of form `https://<username>:<password>@<host>`).

    Args:
        known_facts: An opaque object coming from {obj}`module_ctx.facts`.
        facts_version: {type}`str` the version of the facts schema, used for short-circuiting.

    Returns:
        `None` if `known_facts` is `None`, otherwise a struct that has:
        * `get` method for getting values from the facts cache.
        * `setdefault` method for setting values in the cache.
        * `known_facts` attribute holding the value that was passed in.
        * `facts` attribute that should be passed to the {obj}`module_ctx.extension_metadata` to persist facts.
    """
    if known_facts == None:
        return None

    # The facts that are collected during this evaluation.
    collected = {}

    return struct(
        facts = collected,
        known_facts = known_facts,
        get = lambda index_url, versions: _get_from_facts(
            collected,
            known_facts,
            index_url,
            versions,
            facts_version,
        ),
        setdefault = lambda url, value: _store_facts(collected, facts_version, url, value),
    )
def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
    """Rebuild the parsed index result of a distribution from the known facts.

    Args:
        facts: {type}`dict` The facts collected in this run; the facts that
            get used are stored in it again via `_store_facts`.
        known_facts: The previously stored facts; only `get` is called on it.
        index_url: {type}`str` The index URL of the distribution. The last path
            segment is treated as the distribution name and the rest as the
            index root.
        requested_versions: The versions that have to be found in the facts.
        facts_version: {type}`str` The facts schema version that can be parsed.

    Returns:
        A struct with `whls`, `sdists` and `sha256s_by_version`, or `None` if
        the known facts are of a different schema version, have nothing for
        the requested versions or do not cover all of them.
    """
    if known_facts.get("fact_version") != facts_version:
        # cannot trust known facts, different version that we know how to parse
        return None

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    # These lookups do not depend on the loop variables, do them only once.
    hashes = known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {})
    filenames = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {})
    yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {})

    known_sources = {}
    retrieved_versions = {}

    for url, sha256 in hashes.items():
        # `_store_facts` writes the filenames keyed by the URL of the
        # distribution, so that is the key to read them back with. The lookup
        # by sha256 is kept as a fallback for facts that are keyed that way.
        filename = filenames.get(url) or filenames.get(sha256)
        if not filename:
            _, _, filename = url.rpartition("/")

        version = version_from_filename(filename)
        if version not in requested_versions:
            # TODO @aignas 2026-01-21: do the check by requested shas at some point
            # We don't have sufficient info in the lock file, need to call the API
            #
            continue

        retrieved_versions[version] = True

        if filename.endswith(".whl"):
            dists = known_sources.setdefault("whls", {})
        else:
            dists = known_sources.setdefault("sdists", {})

        known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256)

        dists.setdefault(sha256, struct(
            sha256 = sha256,
            filename = filename,
            version = version,
            metadata_url = "",
            metadata_sha256 = "",
            url = url,
            yanked = yanked.get(sha256),
        ))

    if not known_sources:
        # We found nothing in facts
        return None

    # If the results are incomplete, then return None, so that we can fetch sources from the
    # internet again. Membership is checked instead of comparing lengths so that a
    # version that is requested more than once is not treated as a missing one.
    for version in requested_versions:
        if version not in retrieved_versions:
            return None

    output = struct(
        whls = known_sources.get("whls", {}),
        sdists = known_sources.get("sdists", {}),
        sha256s_by_version = {
            k: sorted(v)
            for k, v in known_sources.get("sha256s_by_version", {}).items()
        },
    )

    # Persist these facts for the next run because we have used them.
    return _store_facts(facts, facts_version, index_url, output)
def _store_facts(facts, fact_version, index_url, value):
    """Store values as facts in the lock file.

    The main idea is to ensure that the lock file is small and it is only
    storing what we would need to fetch from the internet. Any derivative
    information we can get from this that can be achieved using pure Starlark
    functions should be done in Starlark.

    Args:
        facts: {type}`dict` The facts to write to, modified in place.
        fact_version: {type}`str` The schema version to record with the facts.
        index_url: {type}`str` The index URL of the distribution. The last path
            segment is used as the distribution name and the rest as the index
            root.
        value: {type}`struct | None` The value with the `sdists` and `whls`
            dicts that are keyed by sha256.

    Returns:
        The `value` that was passed in.
    """
    if not value:
        return value

    facts["fact_version"] = fact_version

    parts = index_url.rstrip("/").rpartition("/")
    root_url = parts[0].rstrip("/")
    distribution = parts[2].rstrip("/")

    # The nested dicts are created only at the point where there is something
    # to put in them, so that no empty sections end up in the facts.
    section = lambda name: facts.setdefault(name, {}).setdefault(root_url, {}).setdefault(distribution, {})

    # sdists come first and a whl with the same sha256 replaces the sdist.
    dists = dict(value.sdists)
    dists.update(value.whls)

    for sha256, dist in dists.items():
        section("dist_hashes").setdefault(dist.url, sha256)

        # The filename is stored only if it cannot be derived from the URL.
        if not dist.url.endswith(dist.filename):
            section("dist_filenames").setdefault(dist.url, dist.filename)

        if dist.yanked != None:
            section("dist_yanked").setdefault(sha256, dist.yanked)

    return value