feat: gazelle manifest exclude_patterns (#917)

* feat: exclude_patterns for gazelle manifest

Signed-off-by: Thulio Ferraz Assis <3149049+f0rmiga@users.noreply.github.com>

* feat: force gazelle manifest update on logic change

Signed-off-by: Thulio Ferraz Assis <3149049+f0rmiga@users.noreply.github.com>

Signed-off-by: Thulio Ferraz Assis <3149049+f0rmiga@users.noreply.github.com>
diff --git a/examples/build_file_generation/BUILD b/examples/build_file_generation/BUILD
index ef9e967..9204a0e 100644
--- a/examples/build_file_generation/BUILD
+++ b/examples/build_file_generation/BUILD
@@ -4,11 +4,23 @@
 load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest")
 load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping")
 load("@rules_python//python:defs.bzl", "py_binary", "py_library")
+load("@rules_python//python:pip.bzl", "compile_pip_requirements")
+
+compile_pip_requirements(
+    name = "requirements",
+    extra_args = ["--allow-unsafe"],
+    requirements_in = "requirements.txt",
+    requirements_txt = "requirements_lock.txt",
+)
 
 # This rule fetches the metadata for python packages we depend on. That data is
 # required for the gazelle_python_manifest rule to update our manifest file.
 modules_mapping(
     name = "modules_map",
+    exclude_patterns = [
+        "^_|(\\._)+",  # This is the default.
+        "(\\.tests)+",  # Add a custom one to get rid of the psutil tests.
+    ],
     wheels = all_whl_requirements,
 )
 
diff --git a/examples/build_file_generation/gazelle_python.yaml b/examples/build_file_generation/gazelle_python.yaml
index 8e68c1d..f25f59e 100644
--- a/examples/build_file_generation/gazelle_python.yaml
+++ b/examples/build_file_generation/gazelle_python.yaml
@@ -6,18 +6,14 @@
 manifest:
   modules_mapping:
     certifi: certifi
-    certifi.__init__: certifi
-    certifi.__main__: certifi
     certifi.core: certifi
     chardet: chardet
-    chardet.__init__: chardet
     chardet.big5freq: chardet
     chardet.big5prober: chardet
     chardet.chardistribution: chardet
     chardet.charsetgroupprober: chardet
     chardet.charsetprober: chardet
     chardet.cli: chardet
-    chardet.cli.__init__: chardet
     chardet.cli.chardetect: chardet
     chardet.codingstatemachine: chardet
     chardet.compat: chardet
@@ -53,7 +49,6 @@
     chardet.utf8prober: chardet
     chardet.version: chardet
     idna: idna
-    idna.__init__: idna
     idna.codec: idna
     idna.compat: idna
     idna.core: idna
@@ -61,10 +56,8 @@
     idna.intranges: idna
     idna.package_data: idna
     idna.uts46data: idna
+    psutil: psutil
     requests: requests
-    requests.__init__: requests
-    requests.__version__: requests
-    requests._internal_utils: requests
     requests.adapters: requests
     requests.api: requests
     requests.auth: requests
@@ -81,18 +74,9 @@
     requests.structures: requests
     requests.utils: requests
     urllib3: urllib3
-    urllib3.__init__: urllib3
-    urllib3._collections: urllib3
-    urllib3._version: urllib3
     urllib3.connection: urllib3
     urllib3.connectionpool: urllib3
     urllib3.contrib: urllib3
-    urllib3.contrib.__init__: urllib3
-    urllib3.contrib._appengine_environ: urllib3
-    urllib3.contrib._securetransport: urllib3
-    urllib3.contrib._securetransport.__init__: urllib3
-    urllib3.contrib._securetransport.bindings: urllib3
-    urllib3.contrib._securetransport.low_level: urllib3
     urllib3.contrib.appengine: urllib3
     urllib3.contrib.ntlmpool: urllib3
     urllib3.contrib.pyopenssl: urllib3
@@ -102,19 +86,14 @@
     urllib3.fields: urllib3
     urllib3.filepost: urllib3
     urllib3.packages: urllib3
-    urllib3.packages.__init__: urllib3
     urllib3.packages.backports: urllib3
-    urllib3.packages.backports.__init__: urllib3
     urllib3.packages.backports.makefile: urllib3
     urllib3.packages.six: urllib3
     urllib3.packages.ssl_match_hostname: urllib3
-    urllib3.packages.ssl_match_hostname.__init__: urllib3
-    urllib3.packages.ssl_match_hostname._implementation: urllib3
     urllib3.poolmanager: urllib3
     urllib3.request: urllib3
     urllib3.response: urllib3
     urllib3.util: urllib3
-    urllib3.util.__init__: urllib3
     urllib3.util.connection: urllib3
     urllib3.util.proxy: urllib3
     urllib3.util.queue: urllib3
@@ -129,4 +108,4 @@
   pip_repository:
     name: pip
     incremental: true
-integrity: 4b3eed2cb51741419e11bd12a4533f285d059fda8029deaf6fedfe0fcda1b782
+integrity: 91adaddb7e2d3eb7234e78979ff40b666101ab4df91c62659b954cc9376c2f86
diff --git a/examples/build_file_generation/requirements.txt b/examples/build_file_generation/requirements.txt
index 9d84d35..2851c1e 100644
--- a/examples/build_file_generation/requirements.txt
+++ b/examples/build_file_generation/requirements.txt
@@ -1 +1,2 @@
 requests==2.25.1
+psutil==5.9.4
diff --git a/examples/build_file_generation/requirements_lock.txt b/examples/build_file_generation/requirements_lock.txt
index b66c41f..07ff2ec 100644
--- a/examples/build_file_generation/requirements_lock.txt
+++ b/examples/build_file_generation/requirements_lock.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with python 3.9
 # To update, run:
 #
-#    pip-compile --generate-hashes --output-file=requirements_lock.txt requirements.txt
+#    bazel run //:requirements.update
 #
 certifi==2020.12.5 \
     --hash=sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c \
@@ -16,10 +16,26 @@
     --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 \
     --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0
     # via requests
+psutil==5.9.4 \
+    --hash=sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff \
+    --hash=sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1 \
+    --hash=sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62 \
+    --hash=sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549 \
+    --hash=sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08 \
+    --hash=sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7 \
+    --hash=sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e \
+    --hash=sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe \
+    --hash=sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24 \
+    --hash=sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad \
+    --hash=sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94 \
+    --hash=sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8 \
+    --hash=sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7 \
+    --hash=sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4
+    # via -r ./requirements.txt
 requests==2.25.1 \
     --hash=sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804 \
     --hash=sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e
-    # via -r requirements.txt
+    # via -r ./requirements.txt
 urllib3==1.26.5 \
     --hash=sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c \
     --hash=sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098
diff --git a/gazelle/manifest/defs.bzl b/gazelle/manifest/defs.bzl
index 8439319..a5bbe56 100644
--- a/gazelle/manifest/defs.bzl
+++ b/gazelle/manifest/defs.bzl
@@ -2,7 +2,7 @@
 for updating and testing the Gazelle manifest file.
 """
 
-load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("@io_bazel_rules_go//go:def.bzl", "GoSource", "go_binary")
 
 def gazelle_python_manifest(
         name,
@@ -38,7 +38,11 @@
     update_target = "{}.update".format(name)
     update_target_label = "//{}:{}".format(native.package_name(), update_target)
 
+    manifest_generator_hash = Label("//gazelle/manifest/generate:generate_lib_sources_hash")
+
     update_args = [
+        "--manifest-generator-hash",
+        "$(rootpath {})".format(manifest_generator_hash),
         "--requirements",
         "$(rootpath {})".format(requirements),
         "--pip-repository-name",
@@ -55,11 +59,12 @@
 
     go_binary(
         name = update_target,
-        embed = ["@rules_python//gazelle/manifest/generate:generate_lib"],
+        embed = [Label("//gazelle/manifest/generate:generate_lib")],
         data = [
             manifest,
             modules_mapping,
             requirements,
+            manifest_generator_hash,
         ],
         args = update_args,
         visibility = ["//visibility:private"],
@@ -70,21 +75,23 @@
 
     go_binary(
         name = test_binary,
-        embed = ["@rules_python//gazelle/manifest/test:test_lib"],
+        embed = [Label("//gazelle/manifest/test:test_lib")],
         visibility = ["//visibility:private"],
     )
 
     native.sh_test(
         name = "{}.test".format(name),
-        srcs = ["@rules_python//gazelle/manifest/test:run.sh"],
+        srcs = [Label("//gazelle/manifest/test:run.sh")],
         data = [
             ":{}".format(test_binary),
             manifest,
             requirements,
+            manifest_generator_hash,
         ],
         env = {
             "_TEST_BINARY": "$(rootpath :{})".format(test_binary),
             "_TEST_MANIFEST": "$(rootpath {})".format(manifest),
+            "_TEST_MANIFEST_GENERATOR_HASH": "$(rootpath {})".format(manifest_generator_hash),
             "_TEST_REQUIREMENTS": "$(rootpath {})".format(requirements),
         },
         visibility = ["//visibility:private"],
@@ -97,3 +104,56 @@
         tags = ["manual"],
         visibility = ["//visibility:public"],
     )
+
+# buildifier: disable=provider-params
+AllSourcesInfo = provider(fields = {"all_srcs": "All sources collected from the target and dependencies."})
+
+_rules_python_workspace = Label("//:WORKSPACE")
+
+def _get_all_sources_impl(target, ctx):
+    is_rules_python = target.label.workspace_name == _rules_python_workspace.workspace_name
+    if not is_rules_python:
+        # Avoid adding third-party dependency files to the checksum of the srcs.
+        return [AllSourcesInfo(all_srcs = depset())]  # List of providers, same shape as the return below.
+    srcs = depset(
+        target[GoSource].orig_srcs,
+        transitive = [dep[AllSourcesInfo].all_srcs for dep in ctx.rule.attr.deps],
+    )
+    return [AllSourcesInfo(all_srcs = srcs)]
+
+_get_all_sources = aspect(
+    implementation = _get_all_sources_impl,
+    attr_aspects = ["deps"],
+)
+
+def _sources_hash_impl(ctx):
+    all_srcs = ctx.attr.go_library[AllSourcesInfo].all_srcs
+    hash_file = ctx.actions.declare_file(ctx.attr.name + ".hash")
+    args = ctx.actions.args()
+    args.add(hash_file)
+    args.add_all(all_srcs)
+    ctx.actions.run(
+        outputs = [hash_file],
+        inputs = all_srcs,
+        arguments = [args],
+        executable = ctx.executable._hasher,
+    )
+    return [DefaultInfo(
+        files = depset([hash_file]),
+        runfiles = ctx.runfiles([hash_file]),
+    )]
+
+sources_hash = rule(
+    _sources_hash_impl,
+    attrs = {
+        "go_library": attr.label(
+            aspects = [_get_all_sources],
+            providers = [GoSource],
+        ),
+        "_hasher": attr.label(
+            cfg = "exec",
+            default = Label("//gazelle/manifest/hasher"),
+            executable = True,
+        ),
+    },
+)
diff --git a/gazelle/manifest/generate/BUILD.bazel b/gazelle/manifest/generate/BUILD.bazel
index a8b9cd5..7a5d27f 100644
--- a/gazelle/manifest/generate/BUILD.bazel
+++ b/gazelle/manifest/generate/BUILD.bazel
@@ -1,4 +1,5 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+load("//gazelle/manifest:defs.bzl", "sources_hash")
 
 go_library(
     name = "generate_lib",
@@ -8,6 +9,12 @@
     deps = ["//gazelle/manifest"],
 )
 
+sources_hash(
+    name = "generate_lib_sources_hash",
+    go_library = ":generate_lib",
+    visibility = ["//visibility:public"],
+)
+
 go_binary(
     name = "generate",
     embed = [":generate_lib"],
diff --git a/gazelle/manifest/generate/generate.go b/gazelle/manifest/generate/generate.go
index 04d7441..54e8813 100644
--- a/gazelle/manifest/generate/generate.go
+++ b/gazelle/manifest/generate/generate.go
@@ -24,6 +24,7 @@
 }
 
 func main() {
+	var manifestGeneratorHashPath string
 	var requirementsPath string
 	var pipRepositoryName string
 	var pipRepositoryIncremental bool
@@ -31,6 +32,12 @@
 	var outputPath string
 	var updateTarget string
 	flag.StringVar(
+		&manifestGeneratorHashPath,
+		"manifest-generator-hash",
+		"",
+		"The file containing the hash for the source code of the manifest generator."+
+			"This is important to force manifest updates when the generator logic changes.")
+	flag.StringVar(
 		&requirementsPath,
 		"requirements",
 		"",
@@ -92,7 +99,13 @@
 			Incremental: pipRepositoryIncremental,
 		},
 	})
-	if err := writeOutput(outputPath, header, manifestFile, requirementsPath); err != nil {
+	if err := writeOutput(
+		outputPath,
+		header,
+		manifestFile,
+		manifestGeneratorHashPath,
+		requirementsPath,
+	); err != nil {
 		log.Fatalf("ERROR: %v\n", err)
 	}
 }
@@ -129,6 +142,7 @@
 	outputPath string,
 	header string,
 	manifestFile *manifest.File,
+	manifestGeneratorHashPath string,
 	requirementsPath string,
 ) error {
 	stat, err := os.Stat(outputPath)
@@ -146,7 +160,19 @@
 		return fmt.Errorf("failed to write output: %w", err)
 	}
 
-	if err := manifestFile.Encode(outputFile, requirementsPath); err != nil {
+	manifestGeneratorHash, err := os.Open(manifestGeneratorHashPath)
+	if err != nil {
+		return fmt.Errorf("failed to write output: %w", err)
+	}
+	defer manifestGeneratorHash.Close()
+
+	requirements, err := os.Open(requirementsPath)
+	if err != nil {
+		return fmt.Errorf("failed to write output: %w", err)
+	}
+	defer requirements.Close()
+
+	if err := manifestFile.Encode(outputFile, manifestGeneratorHash, requirements); err != nil {
 		return fmt.Errorf("failed to write output: %w", err)
 	}
 
diff --git a/gazelle/manifest/hasher/BUILD.bazel b/gazelle/manifest/hasher/BUILD.bazel
new file mode 100644
index 0000000..5e67b2f
--- /dev/null
+++ b/gazelle/manifest/hasher/BUILD.bazel
@@ -0,0 +1,14 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+
+go_library(
+    name = "hasher_lib",
+    srcs = ["main.go"],
+    importpath = "github.com/bazelbuild/rules_python/gazelle/manifest/hasher",
+    visibility = ["//visibility:private"],
+)
+
+go_binary(
+    name = "hasher",
+    embed = [":hasher_lib"],
+    visibility = ["//visibility:public"],
+)
diff --git a/gazelle/manifest/hasher/main.go b/gazelle/manifest/hasher/main.go
new file mode 100644
index 0000000..6e88335
--- /dev/null
+++ b/gazelle/manifest/hasher/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+	"crypto/sha256"
+	"io"
+	"log"
+	"os"
+)
+
+func main() {
+	h := sha256.New() // One running digest fed by every input file, in argument order.
+	out, err := os.Create(os.Args[1]) // os.Args[1] is the output path; os.Args[2:] are the inputs.
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer out.Close()
+	for _, filename := range os.Args[2:] {
+		f, err := os.Open(filename)
+		if err != nil {
+			log.Fatal(err)
+		}
+		if _, err := io.Copy(h, f); err != nil {
+			log.Fatal(err)
+		}
+		f.Close() // Close per iteration; a defer here would keep every input open until main returns.
+	}
+	if _, err := out.Write(h.Sum(nil)); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/gazelle/manifest/manifest.go b/gazelle/manifest/manifest.go
index e19162b..640effc 100644
--- a/gazelle/manifest/manifest.go
+++ b/gazelle/manifest/manifest.go
@@ -26,12 +26,8 @@
 }
 
 // Encode encodes the manifest file to the given writer.
-func (f *File) Encode(w io.Writer, requirementsPath string) error {
-	requirementsChecksum, err := sha256File(requirementsPath)
-	if err != nil {
-		return fmt.Errorf("failed to encode manifest file: %w", err)
-	}
-	integrityBytes, err := f.calculateIntegrity(requirementsChecksum)
+func (f *File) Encode(w io.Writer, manifestGeneratorHashFile, requirements io.Reader) error {
+	integrityBytes, err := f.calculateIntegrity(manifestGeneratorHashFile, requirements)
 	if err != nil {
 		return fmt.Errorf("failed to encode manifest file: %w", err)
 	}
@@ -45,12 +41,8 @@
 }
 
 // VerifyIntegrity verifies if the integrity set in the File is valid.
-func (f *File) VerifyIntegrity(requirementsPath string) (bool, error) {
-	requirementsChecksum, err := sha256File(requirementsPath)
-	if err != nil {
-		return false, fmt.Errorf("failed to verify integrity: %w", err)
-	}
-	integrityBytes, err := f.calculateIntegrity(requirementsChecksum)
+func (f *File) VerifyIntegrity(manifestGeneratorHashFile, requirements io.Reader) (bool, error) {
+	integrityBytes, err := f.calculateIntegrity(manifestGeneratorHashFile, requirements)
 	if err != nil {
 		return false, fmt.Errorf("failed to verify integrity: %w", err)
 	}
@@ -62,7 +54,9 @@
 // provided checksum for the requirements.txt file used as input to the modules
 // mapping, plus the manifest structure in the manifest file. This integrity
 // calculation ensures the manifest files are kept up-to-date.
-func (f *File) calculateIntegrity(requirementsChecksum []byte) ([]byte, error) {
+func (f *File) calculateIntegrity(
+	manifestGeneratorHash, requirements io.Reader,
+) ([]byte, error) {
 	hash := sha256.New()
 
 	// Sum the manifest part of the file.
@@ -72,8 +66,13 @@
 		return nil, fmt.Errorf("failed to calculate integrity: %w", err)
 	}
 
+	// Sum the manifest generator checksum bytes.
+	if _, err := io.Copy(hash, manifestGeneratorHash); err != nil {
+		return nil, fmt.Errorf("failed to calculate integrity: %w", err)
+	}
+
 	// Sum the requirements.txt checksum bytes.
-	if _, err := hash.Write(requirementsChecksum); err != nil {
+	if _, err := io.Copy(hash, requirements); err != nil {
 		return nil, fmt.Errorf("failed to calculate integrity: %w", err)
 	}
 
@@ -134,19 +133,3 @@
 	// The incremental property of pip_repository.
 	Incremental bool
 }
-
-// sha256File calculates the checksum of a given file path.
-func sha256File(filePath string) ([]byte, error) {
-	file, err := os.Open(filePath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to calculate sha256 sum for file: %w", err)
-	}
-	defer file.Close()
-
-	hash := sha256.New()
-	if _, err := io.Copy(hash, file); err != nil {
-		return nil, fmt.Errorf("failed to calculate sha256 sum for file: %w", err)
-	}
-
-	return hash.Sum(nil), nil
-}
diff --git a/gazelle/manifest/manifest_test.go b/gazelle/manifest/manifest_test.go
index 3b50fd1..174d999 100644
--- a/gazelle/manifest/manifest_test.go
+++ b/gazelle/manifest/manifest_test.go
@@ -4,7 +4,9 @@
 	"bytes"
 	"io/ioutil"
 	"log"
+	"os"
 	"reflect"
+	"strings"
 	"testing"
 
 	"github.com/bazelbuild/rules_python/gazelle/manifest"
@@ -31,7 +33,14 @@
 			PipDepsRepositoryName: pipDepsRepositoryName,
 		})
 		var b bytes.Buffer
-		if err := f.Encode(&b, "testdata/requirements.txt"); err != nil {
+		manifestGeneratorHashFile := strings.NewReader("")
+		requirements, err := os.Open("testdata/requirements.txt")
+		if err != nil {
+			log.Println(err)
+			t.FailNow()
+		}
+		defer requirements.Close()
+		if err := f.Encode(&b, manifestGeneratorHashFile, requirements); err != nil {
 			log.Println(err)
 			t.FailNow()
 		}
@@ -66,7 +75,14 @@
 			log.Println(err)
 			t.FailNow()
 		}
-		valid, err := f.VerifyIntegrity("testdata/requirements.txt")
+		manifestGeneratorHashFile := strings.NewReader("")
+		requirements, err := os.Open("testdata/requirements.txt")
+		if err != nil {
+			log.Println(err)
+			t.FailNow()
+		}
+		defer requirements.Close()
+		valid, err := f.VerifyIntegrity(manifestGeneratorHashFile, requirements)
 		if err != nil {
 			log.Println(err)
 			t.FailNow()
diff --git a/gazelle/manifest/test/run.sh b/gazelle/manifest/test/run.sh
index 4b24b51..524e9b5 100755
--- a/gazelle/manifest/test/run.sh
+++ b/gazelle/manifest/test/run.sh
@@ -5,4 +5,7 @@
 
 set -o errexit -o nounset
 
-"${_TEST_BINARY}" --requirements "${_TEST_REQUIREMENTS}" --manifest "${_TEST_MANIFEST}"
\ No newline at end of file
+"${_TEST_BINARY}" \
+    --manifest-generator-hash "${_TEST_MANIFEST_GENERATOR_HASH}" \
+    --requirements "${_TEST_REQUIREMENTS}" \
+    --manifest "${_TEST_MANIFEST}"
diff --git a/gazelle/manifest/test/test.go b/gazelle/manifest/test/test.go
index 518fe06..8b580b1 100644
--- a/gazelle/manifest/test/test.go
+++ b/gazelle/manifest/test/test.go
@@ -10,15 +10,23 @@
 import (
 	"flag"
 	"log"
+	"os"
 	"path/filepath"
 
 	"github.com/bazelbuild/rules_python/gazelle/manifest"
 )
 
 func main() {
+	var manifestGeneratorHashPath string
 	var requirementsPath string
 	var manifestPath string
 	flag.StringVar(
+		&manifestGeneratorHashPath,
+		"manifest-generator-hash",
+		"",
+		"The file containing the hash for the source code of the manifest generator."+
+			"This is important to force manifest updates when the generator logic changes.")
+	flag.StringVar(
 		&requirementsPath,
 		"requirements",
 		"",
@@ -47,7 +55,19 @@
 		log.Fatalln("ERROR: failed to find the Gazelle manifest file integrity")
 	}
 
-	valid, err := manifestFile.VerifyIntegrity(requirementsPath)
+	manifestGeneratorHash, err := os.Open(manifestGeneratorHashPath)
+	if err != nil {
+		log.Fatalf("ERROR: %v\n", err)
+	}
+	defer manifestGeneratorHash.Close()
+
+	requirements, err := os.Open(requirementsPath)
+	if err != nil {
+		log.Fatalf("ERROR: %v\n", err)
+	}
+	defer requirements.Close()
+
+	valid, err := manifestFile.VerifyIntegrity(manifestGeneratorHash, requirements)
 	if err != nil {
 		log.Fatalf("ERROR: %v\n", err)
 	}
@@ -60,4 +80,4 @@
 			"ERROR: %q is out-of-date, follow the intructions on this file for updating.\n",
 			manifestRealpath)
 	}
-}
\ No newline at end of file
+}
diff --git a/gazelle/manifest/testdata/gazelle_python.yaml b/gazelle/manifest/testdata/gazelle_python.yaml
index 4dc1f2c..70f7aff 100644
--- a/gazelle/manifest/testdata/gazelle_python.yaml
+++ b/gazelle/manifest/testdata/gazelle_python.yaml
@@ -10,4 +10,4 @@
     arrow.parser: arrow
     arrow.util: arrow
   pip_deps_repository_name: test_repository_name
-integrity: 624f5f6c078eb44b907efd5a64e308354ac3620c568232b815668bcdf3e3366a
+integrity: eedf187f8b7ec27cdfc682feee4206e063b51d13d78f77c05d3a30ec11bd7411
diff --git a/gazelle/modules_mapping/def.bzl b/gazelle/modules_mapping/def.bzl
index 04ea50f..9b1352c 100644
--- a/gazelle/modules_mapping/def.bzl
+++ b/gazelle/modules_mapping/def.bzl
@@ -12,8 +12,9 @@
 def _modules_mapping_impl(ctx):
     modules_mapping = ctx.actions.declare_file(ctx.attr.modules_mapping_name)
     args = ctx.actions.args()
-    args.add(modules_mapping.path)
-    args.add_all([whl.path for whl in ctx.files.wheels])
+    args.add("--output_file", modules_mapping.path)
+    args.add_all("--exclude_patterns", ctx.attr.exclude_patterns)
+    args.add_all("--wheels", [whl.path for whl in ctx.files.wheels])
     ctx.actions.run(
         inputs = ctx.files.wheels,
         outputs = [modules_mapping],
@@ -26,6 +27,11 @@
 modules_mapping = rule(
     _modules_mapping_impl,
     attrs = {
+        "exclude_patterns": attr.string_list(
+            default = ["^_|(\\._)+"],
+            doc = "A set of regex patterns to match against each calculated module path. By default, exclude the modules starting with underscores.",
+            mandatory = False,
+        ),
         "modules_mapping_name": attr.string(
             default = "modules_mapping.json",
             doc = "The name for the output JSON file.",
diff --git a/gazelle/modules_mapping/generator.py b/gazelle/modules_mapping/generator.py
index ec3133a..51b81e7 100644
--- a/gazelle/modules_mapping/generator.py
+++ b/gazelle/modules_mapping/generator.py
@@ -1,5 +1,7 @@
+import argparse
 import json
 import pathlib
+import re
 import sys
 import zipfile
 
@@ -8,36 +10,69 @@
 class Generator:
     stderr = None
     output_file = None
+    excluded_patterns = None
+    mapping = {}
 
-    def __init__(self, stderr, output_file):
+    def __init__(self, stderr, output_file, excluded_patterns):
         self.stderr = stderr
         self.output_file = output_file
+        self.excluded_patterns = [re.compile(pattern) for pattern in excluded_patterns]
 
     # dig_wheel analyses the wheel .whl file determining the modules it provides
     # by looking at the directory structure.
     def dig_wheel(self, whl):
-        mapping = {}
         with zipfile.ZipFile(whl, "r") as zip_file:
             for path in zip_file.namelist():
                 if is_metadata(path):
                     if data_has_purelib_or_platlib(path):
-                        module_for_path(path, whl, mapping)
+                        self.module_for_path(path, whl)
                     else:
                         continue
                 else:
-                    module_for_path(path, whl, mapping)
-        return mapping
+                    self.module_for_path(path, whl)
+
+    def module_for_path(self, path, whl):
+        ext = pathlib.Path(path).suffix
+        if ext == ".py" or ext == ".so":
+            if "purelib" in path or "platlib" in path:
+                root = "/".join(path.split("/")[2:])
+            else:
+                root = path
+
+            wheel_name = get_wheel_name(whl)
+
+            if root.endswith("/__init__.py"):
+                # Note the '/' here means that the __init__.py is not in the
+                # root of the wheel, therefore we can index the directory
+                # where this file is as an importable package.
+                module = root[: -len("/__init__.py")].replace("/", ".")
+                if not self.is_excluded(module):
+                    self.mapping[module] = wheel_name
+
+            # Always index the module file.
+            if ext == ".so":
+                # Also remove extra metadata that is embeded as part of
+                # the file name as an extra extension.
+                ext = "".join(pathlib.Path(root).suffixes)
+            module = root[: -len(ext)].replace("/", ".")
+            if not self.is_excluded(module):
+                self.mapping[module] = wheel_name
+
+    def is_excluded(self, module):
+        for pattern in self.excluded_patterns:
+            if pattern.search(module):
+                return True
+        return False
 
     # run is the entrypoint for the generator.
     def run(self, wheels):
-        mapping = {}
         for whl in wheels:
             try:
-                mapping.update(self.dig_wheel(whl))
+                self.dig_wheel(whl)
             except AssertionError as error:
                 print(error, file=self.stderr)
                 return 1
-        mapping_json = json.dumps(mapping)
+        mapping_json = json.dumps(self.mapping)
         with open(self.output_file, "w") as f:
             f.write(mapping_json)
         return 0
@@ -71,34 +106,14 @@
     return is_metadata(path) and (maybe_lib == "purelib" or maybe_lib == "platlib")
 
 
-def module_for_path(path, whl, mapping):
-    ext = pathlib.Path(path).suffix
-    if ext == ".py" or ext == ".so":
-        if "purelib" in path or "platlib" in path:
-            root = "/".join(path.split("/")[2:])
-        else:
-            root = path
-
-        wheel_name = get_wheel_name(whl)
-
-        if root.endswith("/__init__.py"):
-            # Note the '/' here means that the __init__.py is not in the
-            # root of the wheel, therefore we can index the directory
-            # where this file is as an importable package.
-            module = root[: -len("/__init__.py")].replace("/", ".")
-            mapping[module] = wheel_name
-
-        # Always index the module file.
-        if ext == ".so":
-            # Also remove extra metadata that is embeded as part of
-            # the file name as an extra extension.
-            ext = "".join(pathlib.Path(root).suffixes)
-        module = root[: -len(ext)].replace("/", ".")
-        mapping[module] = wheel_name
-
-
 if __name__ == "__main__":
-    output_file = sys.argv[1]
-    wheels = sys.argv[2:]
-    generator = Generator(sys.stderr, output_file)
-    exit(generator.run(wheels))
+    parser = argparse.ArgumentParser(
+        prog="generator",
+        description="Generates the modules mapping used by the Gazelle manifest.",
+    )
+    parser.add_argument("--output_file", type=str)
+    parser.add_argument("--exclude_patterns", nargs="+", default=[])
+    parser.add_argument("--wheels", nargs="+", default=[])
+    args = parser.parse_args()
+    generator = Generator(sys.stderr, args.output_file, args.exclude_patterns)
+    exit(generator.run(args.wheels))