feat(whlmaker): introduce an internal _WhlFile class and stop sorting RECORD (#1488)

This class makes it easier to recreate a wheel file after extracting
it. It is not intended for use outside the rules_python project. Also
stop sorting the entries when writing the RECORD file, so that the
order of the RECORD entries matches the order in which the files are
added to the zip archive.

Towards #1076
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddfed3f..0e01615 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,8 +31,12 @@
 * Skip aliases for unloaded toolchains. Some Python versions that don't have full
   platform support, and referencing their undefined repositories can break operations
   like `bazel query rdeps(...)`.
+
 * Python code generated from `proto_library` with `strip_import_prefix` can be imported now.
 
+* (py_wheel) Produce deterministic wheel files and make `RECORD` file entries
+  follow the order of files written to the `.whl` archive.
+
 ## [0.26.0] - 2023-10-06
 
 ### Changed
@@ -106,8 +110,6 @@
 
 * (gazelle) Improve runfiles lookup hermeticity.
 
-* (py_wheel) Produce deterministic wheel files
-
 ## [0.25.0] - 2023-08-22
 
 ### Changed
diff --git a/examples/wheel/wheel_test.py b/examples/wheel/wheel_test.py
index 23b1c8a..ab7b59d 100644
--- a/examples/wheel/wheel_test.py
+++ b/examples/wheel/wheel_test.py
@@ -44,7 +44,7 @@
         else:
             return path
 
-    def assertFileSha256Equal(self, filename, sha):
+    def assertFileSha256Equal(self, filename, want):
         hash = hashlib.sha256()
         with open(filename, "rb") as f:
             while True:
@@ -52,7 +52,7 @@
                 if not buf:
                     break
                 hash.update(buf)
-        self.assertEqual(hash.hexdigest(), sha)
+        self.assertEqual(want, hash.hexdigest())
 
     def assertAllEntriesHasReproducibleMetadata(self, zf):
         for zinfo in zf.infolist():
@@ -78,7 +78,7 @@
                 ],
             )
         self.assertFileSha256Equal(
-            filename, "6da8e06a3fdd9ae5ee9fa8f796610723c05a4b0d7fde0ec5179401e956204139"
+            filename, "2818e70fdebd148934f41820f8c54d5d7676d783c0d66c7c8af2ee9141e7ddc7"
         )
 
     def test_py_package_wheel(self):
@@ -100,7 +100,7 @@
                 ],
             )
         self.assertFileSha256Equal(
-            filename, "2948b0b5e0aa421e0b40f78b74018bbc2f218165f211da0a4609e431e8e52bee"
+            filename, "273e27adf9bf90287a42ac911dcece8aa95f2905c37d786725477b26de23627c"
         )
 
     def test_customized_wheel(self):
@@ -135,16 +135,16 @@
                 record_contents,
                 # The entries are guaranteed to be sorted.
                 b"""\
-example_customized-0.0.1.dist-info/METADATA,sha256=QYQcDJFQSIqan8eiXqL67bqsUfgEAwf2hoK_Lgi1S-0,559
-example_customized-0.0.1.dist-info/NOTICE,sha256=Xpdw-FXET1IRgZ_wTkx1YQfo1-alET0FVf6V1LXO4js,76
-example_customized-0.0.1.dist-info/README,sha256=WmOFwZ3Jga1bHG3JiGRsUheb4UbLffUxyTdHczS27-o,40
-example_customized-0.0.1.dist-info/RECORD,,
-example_customized-0.0.1.dist-info/WHEEL,sha256=sobxWSyDDkdg_rinUth-jxhXHqoNqlmNMJY3aTZn2Us,91
-example_customized-0.0.1.dist-info/entry_points.txt,sha256=pqzpbQ8MMorrJ3Jp0ntmpZcuvfByyqzMXXi2UujuXD0,137
 examples/wheel/lib/data.txt,sha256=9vJKEdfLu8bZRArKLroPZJh1XKkK3qFMXiM79MBL2Sg,12
 examples/wheel/lib/module_with_data.py,sha256=8s0Khhcqz3yVsBKv2IB5u4l4TMKh7-c_V6p65WVHPms,637
 examples/wheel/lib/simple_module.py,sha256=z2hwciab_XPNIBNH8B1Q5fYgnJvQTeYf0ZQJpY8yLLY,637
 examples/wheel/main.py,sha256=sgg5iWN_9inYBjm6_Zw27hYdmo-l24fA-2rfphT-IlY,909
+example_customized-0.0.1.dist-info/WHEEL,sha256=sobxWSyDDkdg_rinUth-jxhXHqoNqlmNMJY3aTZn2Us,91
+example_customized-0.0.1.dist-info/METADATA,sha256=QYQcDJFQSIqan8eiXqL67bqsUfgEAwf2hoK_Lgi1S-0,559
+example_customized-0.0.1.dist-info/entry_points.txt,sha256=pqzpbQ8MMorrJ3Jp0ntmpZcuvfByyqzMXXi2UujuXD0,137
+example_customized-0.0.1.dist-info/NOTICE,sha256=Xpdw-FXET1IRgZ_wTkx1YQfo1-alET0FVf6V1LXO4js,76
+example_customized-0.0.1.dist-info/README,sha256=WmOFwZ3Jga1bHG3JiGRsUheb4UbLffUxyTdHczS27-o,40
+example_customized-0.0.1.dist-info/RECORD,,
 """,
             )
             self.assertEqual(
@@ -189,7 +189,7 @@
 second = second.main:s""",
             )
         self.assertFileSha256Equal(
-            filename, "66f0c1bfe2cedb2f4cf08d4fe955096860186c0a2f3524e0cb02387a55ac3e63"
+            filename, "48eed93258bba0bb366c879b77917d947267d89e7e60005d1766d844fb909118"
         )
 
     def test_legacy_filename_escaping(self):
@@ -227,7 +227,7 @@
 """,
             )
         self.assertFileSha256Equal(
-            filename, "593c6ab58627f2446d0f1ef2956fd6d42104eedce4493c72d462f7ebf8cb74fa"
+            filename, "ace5fab6458f8c3b4b50801b8e8214288bba786472e81547fced743a67531312"
         )
 
     def test_filename_escaping(self):
@@ -293,7 +293,7 @@
             for line in record_contents.splitlines():
                 self.assertFalse(line.startswith("/"))
         self.assertFileSha256Equal(
-            filename, "1b1fa3a4e840211084ef80049d07947b845c99bedb2778496d30e0c1524686ac"
+            filename, "16e0345c102c6866fed34999d8de5aed7f351adbf372b27adef3bc15161db65e"
         )
 
     def test_custom_package_root_multi_prefix_wheel(self):
@@ -324,7 +324,7 @@
             for line in record_contents.splitlines():
                 self.assertFalse(line.startswith("/"))
         self.assertFileSha256Equal(
-            filename, "f0422d7a338de3c76bf2525927fd93c0f47f2e9c60ecc0944e3e32b642c28137"
+            filename, "d2031eb21c69e290db5eac76b0dc026858e9dbdb3da2dc0314e4e9f69eab2e1a"
         )
 
     def test_custom_package_root_multi_prefix_reverse_order_wheel(self):
@@ -355,7 +355,7 @@
             for line in record_contents.splitlines():
                 self.assertFalse(line.startswith("/"))
         self.assertFileSha256Equal(
-            filename, "4f9e8c917b4050f121ac81e9a2bb65723ef09a1b90b35d93792ac3a62a60efa3"
+            filename, "a37b90685600ccfa56cc5405d1e9a3729ed21dfb31c76fd356e491e2af989566"
         )
 
     def test_python_requires_wheel(self):
@@ -380,7 +380,7 @@
 """,
             )
         self.assertFileSha256Equal(
-            filename, "9bfe8197d379f88715458a75e45c1f521a8b9d3cc43fe19b407c4ab207228b7c"
+            filename, "529afa454113572e6cd91f069cc9cfe5c28369f29cd495fff19d0ecce389d8e4"
         )
 
     def test_python_abi3_binary_wheel(self):
@@ -445,7 +445,7 @@
                 ],
             )
         self.assertFileSha256Equal(
-            filename, "8ad5f639cc41ac6ac67eb70f6553a7fdecabaf3a1b952c3134eaea59610c2a64"
+            filename, "cc9484d527075f07651ca0e7dff4a185c1314020726bcad55fe28d1bba0fec2e"
         )
 
     def test_rule_expands_workspace_status_keys_in_wheel_metadata(self):
diff --git a/tools/wheelmaker.py b/tools/wheelmaker.py
index f2ecbaf..b051564 100644
--- a/tools/wheelmaker.py
+++ b/tools/wheelmaker.py
@@ -84,118 +84,38 @@
     except packaging.version.InvalidVersion:
         pass
 
-    sanitized = re.sub(r'[^a-z0-9]+', '.', version.lower()).strip('.')
-    substituted = re.sub(r'\{\w+\}', '0', version)
-    delimiter = '.' if '+' in substituted else '+'
+    sanitized = re.sub(r"[^a-z0-9]+", ".", version.lower()).strip(".")
+    substituted = re.sub(r"\{\w+\}", "0", version)
+    delimiter = "." if "+" in substituted else "+"
     try:
-        return str(
-            packaging.version.Version(f'{substituted}{delimiter}{sanitized}')
-        )
+        return str(packaging.version.Version(f"{substituted}{delimiter}{sanitized}"))
     except packaging.version.InvalidVersion:
-        return str(packaging.version.Version(f'0+{sanitized}'))
+        return str(packaging.version.Version(f"0+{sanitized}"))
 
 
-class WheelMaker(object):
+class _WhlFile(zipfile.ZipFile):
     def __init__(
         self,
-        name,
-        version,
-        build_tag,
-        python_tag,
-        abi,
-        platform,
-        outfile=None,
+        filename,
+        *,
+        mode,
+        distinfo_dir,
         strip_path_prefixes=None,
-        incompatible_normalize_name=False,
-        incompatible_normalize_version=False,
+        compression=zipfile.ZIP_DEFLATED,
+        **kwargs,
     ):
-        self._name = name
-        self._version = version
-        self._build_tag = build_tag
-        self._python_tag = python_tag
-        self._abi = abi
-        self._platform = platform
-        self._outfile = outfile
-        self._strip_path_prefixes = (
-            strip_path_prefixes if strip_path_prefixes is not None else []
-        )
-
-        if incompatible_normalize_version:
-            self._version = normalize_pep440(self._version)
-            self._escaped_version = self._version
-        else:
-            self._escaped_version = escape_filename_segment(self._version)
-
-        if incompatible_normalize_name:
-            escaped_name = escape_filename_distribution_name(self._name)
-            self._distinfo_dir = (
-                escaped_name + "-" + self._escaped_version + ".dist-info/"
-            )
-            self._wheelname_fragment_distribution_name = escaped_name
-        else:
-            # The legacy behavior escapes the distinfo dir but not the
-            # wheel name. Enable incompatible_normalize_name to fix it.
-            # https://github.com/bazelbuild/rules_python/issues/1132
-            self._distinfo_dir = (
-                escape_filename_segment(self._name)
-                + "-"
-                + self._escaped_version
-                + ".dist-info/"
-            )
-            self._wheelname_fragment_distribution_name = self._name
-
-        self._zipfile = None
+        self._distinfo_dir = distinfo_dir
+        if not self._distinfo_dir.endswith("/"):
+            self._distinfo_dir += "/"
+        self._strip_path_prefixes = strip_path_prefixes or []
         # Entries for the RECORD file as (filename, hash, size) tuples.
         self._record = []
 
-    def __enter__(self):
-        self._zipfile = zipfile.ZipFile(
-            self.filename(), mode="w", compression=zipfile.ZIP_DEFLATED
-        )
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self._zipfile.close()
-        self._zipfile = None
-
-    def wheelname(self) -> str:
-        components = [
-            self._wheelname_fragment_distribution_name,
-            self._version,
-        ]
-        if self._build_tag:
-            components.append(self._build_tag)
-        components += [self._python_tag, self._abi, self._platform]
-        return "-".join(components) + ".whl"
-
-    def filename(self) -> str:
-        if self._outfile:
-            return self._outfile
-        return self.wheelname()
-
-    def disttags(self):
-        return ["-".join([self._python_tag, self._abi, self._platform])]
+        super().__init__(filename, mode=mode, compression=compression, **kwargs)
 
     def distinfo_path(self, basename):
         return self._distinfo_dir + basename
 
-    def _serialize_digest(self, hash):
-        # https://www.python.org/dev/peps/pep-0376/#record
-        # "base64.urlsafe_b64encode(digest) with trailing = removed"
-        digest = base64.urlsafe_b64encode(hash.digest())
-        digest = b"sha256=" + digest.rstrip(b"=")
-        return digest
-
-    def add_string(self, filename, contents):
-        """Add given 'contents' as filename to the distribution."""
-        if sys.version_info[0] > 2 and isinstance(contents, str):
-            contents = contents.encode("utf-8", "surrogateescape")
-        zinfo = self._zipinfo(filename)
-        self._zipfile.writestr(zinfo, contents)
-        hash = hashlib.sha256()
-        hash.update(contents)
-        self._add_to_record(filename, self._serialize_digest(hash), len(contents))
-
     def add_file(self, package_filename, real_filename):
         """Add given file to the distribution."""
 
@@ -227,7 +147,7 @@
         hash = hashlib.sha256()
         size = 0
         with open(real_filename, "rb") as fsrc:
-            with self._zipfile.open(zinfo, "w") as fdst:
+            with self.open(zinfo, "w") as fdst:
                 while True:
                     block = fsrc.read(2**20)
                     if not block:
@@ -237,6 +157,27 @@
                     size += len(block)
         self._add_to_record(arcname, self._serialize_digest(hash), size)
 
+    def add_string(self, filename, contents):
+        """Add given 'contents' as filename to the distribution."""
+        if sys.version_info[0] > 2 and isinstance(contents, str):
+            contents = contents.encode("utf-8", "surrogateescape")
+        zinfo = self._zipinfo(filename)
+        self.writestr(zinfo, contents)
+        hash = hashlib.sha256()
+        hash.update(contents)
+        self._add_to_record(filename, self._serialize_digest(hash), len(contents))
+
+    def _serialize_digest(self, hash):
+        # https://www.python.org/dev/peps/pep-0376/#record
+        # "base64.urlsafe_b64encode(digest) with trailing = removed"
+        digest = base64.urlsafe_b64encode(hash.digest())
+        digest = b"sha256=" + digest.rstrip(b"=")
+        return digest
+
+    def _add_to_record(self, filename, hash, size):
+        size = str(size).encode("ascii")
+        self._record.append((filename, hash, size))
+
     def _zipinfo(self, filename):
         """Construct deterministic ZipInfo entry for a file named filename"""
         # Strip leading path separators to mirror ZipInfo.from_file behavior
@@ -248,9 +189,110 @@
         zinfo = zipfile.ZipInfo(filename=arcname, date_time=_ZIP_EPOCH)
         zinfo.create_system = 3  # ZipInfo entry created on a unix-y system
         zinfo.external_attr = 0o777 << 16  # permissions: rwxrwxrwx
-        zinfo.compress_type = self._zipfile.compression
+        zinfo.compress_type = self.compression
         return zinfo
 
+    def add_recordfile(self):
+        """Write RECORD file to the distribution."""
+        record_path = self.distinfo_path("RECORD")
+        entries = self._record + [(record_path, b"", b"")]
+        contents = b""
+        for filename, digest, size in entries:
+            if sys.version_info[0] > 2 and isinstance(filename, str):
+                filename = filename.lstrip("/").encode("utf-8", "surrogateescape")
+            contents += b"%s,%s,%s\n" % (filename, digest, size)
+
+        self.add_string(record_path, contents)
+        return contents
+
+
+class WheelMaker(object):
+    def __init__(
+        self,
+        name,
+        version,
+        build_tag,
+        python_tag,
+        abi,
+        platform,
+        outfile=None,
+        strip_path_prefixes=None,
+        incompatible_normalize_name=False,
+        incompatible_normalize_version=False,
+    ):
+        self._name = name
+        self._version = version
+        self._build_tag = build_tag
+        self._python_tag = python_tag
+        self._abi = abi
+        self._platform = platform
+        self._outfile = outfile
+        self._strip_path_prefixes = strip_path_prefixes
+
+        if incompatible_normalize_version:
+            self._version = normalize_pep440(self._version)
+            self._escaped_version = self._version
+        else:
+            self._escaped_version = escape_filename_segment(self._version)
+
+        if incompatible_normalize_name:
+            escaped_name = escape_filename_distribution_name(self._name)
+            self._distinfo_dir = (
+                escaped_name + "-" + self._escaped_version + ".dist-info/"
+            )
+            self._wheelname_fragment_distribution_name = escaped_name
+        else:
+            # The legacy behavior escapes the distinfo dir but not the
+            # wheel name. Enable incompatible_normalize_name to fix it.
+            # https://github.com/bazelbuild/rules_python/issues/1132
+            self._distinfo_dir = (
+                escape_filename_segment(self._name)
+                + "-"
+                + self._escaped_version
+                + ".dist-info/"
+            )
+            self._wheelname_fragment_distribution_name = self._name
+
+        self._whlfile = None
+
+    def __enter__(self):
+        self._whlfile = _WhlFile(
+            self.filename(),
+            mode="w",
+            distinfo_dir=self._distinfo_dir,
+            strip_path_prefixes=self._strip_path_prefixes,
+        )
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._whlfile.close()
+        self._whlfile = None
+
+    def wheelname(self) -> str:
+        components = [
+            self._wheelname_fragment_distribution_name,
+            self._version,
+        ]
+        if self._build_tag:
+            components.append(self._build_tag)
+        components += [self._python_tag, self._abi, self._platform]
+        return "-".join(components) + ".whl"
+
+    def filename(self) -> str:
+        if self._outfile:
+            return self._outfile
+        return self.wheelname()
+
+    def disttags(self):
+        return ["-".join([self._python_tag, self._abi, self._platform])]
+
+    def distinfo_path(self, basename):
+        return self._whlfile.distinfo_path(basename)
+
+    def add_file(self, package_filename, real_filename):
+        """Add given file to the distribution."""
+        self._whlfile.add_file(package_filename, real_filename)
+
     def add_wheelfile(self):
         """Write WHEEL file to the distribution"""
         # TODO(pstradomski): Support non-purelib wheels.
@@ -263,7 +305,7 @@
         )
         for tag in self.disttags():
             wheel_contents += "Tag: %s\n" % tag
-        self.add_string(self.distinfo_path("WHEEL"), wheel_contents)
+        self._whlfile.add_string(self.distinfo_path("WHEEL"), wheel_contents)
 
     def add_metadata(self, metadata, name, description, version):
         """Write METADATA file to the distribution."""
@@ -275,23 +317,11 @@
         # provided.
         metadata += description if description else "UNKNOWN"
         metadata += "\n"
-        self.add_string(self.distinfo_path("METADATA"), metadata)
+        self._whlfile.add_string(self.distinfo_path("METADATA"), metadata)
 
     def add_recordfile(self):
         """Write RECORD file to the distribution."""
-        record_path = self.distinfo_path("RECORD")
-        entries = self._record + [(record_path, b"", b"")]
-        entries.sort()
-        contents = b""
-        for filename, digest, size in entries:
-            if sys.version_info[0] > 2 and isinstance(filename, str):
-                filename = filename.lstrip("/").encode("utf-8", "surrogateescape")
-            contents += b"%s,%s,%s\n" % (filename, digest, size)
-        self.add_string(record_path, contents)
-
-    def _add_to_record(self, filename, hash, size):
-        size = str(size).encode("ascii")
-        self._record.append((filename, hash, size))
+        self._whlfile.add_recordfile()
 
 
 def get_files_to_package(input_files):