pw_presubmit: Add banned words check

Add check for non-inclusive language.

Change-Id: Iea485b4e2a40f9db56ef84b97930848eeb9114cc
Bug: 386
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/46360
Pigweed-Auto-Submit: Rob Mohr <mohrr@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
Reviewed-by: Keir Mierle <keir@google.com>
diff --git a/pw_presubmit/docs.rst b/pw_presubmit/docs.rst
index 2173f6e..e53238e 100644
--- a/pw_presubmit/docs.rst
+++ b/pw_presubmit/docs.rst
@@ -127,8 +127,9 @@
       sys.exit(2)
 
   import pw_presubmit
-  from pw_presubmit import build, cli, environment, format_code, git_repo
-  from pw_presubmit import python_checks, filter_paths, PresubmitContext
+  from pw_presubmit import banned_words, build, cli, environment, format_code
+  from pw_presubmit import git_repo, python_checks, filter_paths
+  from pw_presubmit import PresubmitContext
   from pw_presubmit.install_hook import install_hook
 
   # Set up variables for key project paths.
@@ -184,14 +185,13 @@
   # Presubmit check programs
   #
   QUICK = (
-      # Initialize an environment for running presubmit checks.
-      init_cipd,
-      init_virtualenv,
       # List some presubmit checks to run
       pragma_once,
       host_tests,
       # Use the upstream formatting checks, with custom path filters applied.
       format_code.presubmit_checks(exclude=PATH_EXCLUSIONS),
+      # Include the upstream inclusive language check.
+      banned_words.banned_words,
   )
 
   FULL = (
@@ -238,4 +238,4 @@
 ---------------------
 The ``pw_presubmit.format_code`` module formats supported source files using
 external code format tools. The file ``format_code.py`` can be invoked directly
-from the command line or from ``pw`` as ``pw format``.
\ No newline at end of file
+from the command line or from ``pw`` as ``pw format``.
diff --git a/pw_presubmit/py/BUILD.gn b/pw_presubmit/py/BUILD.gn
index cfaa150..444ab0f 100644
--- a/pw_presubmit/py/BUILD.gn
+++ b/pw_presubmit/py/BUILD.gn
@@ -20,6 +20,7 @@
   setup = [ "setup.py" ]
   sources = [
     "pw_presubmit/__init__.py",
+    "pw_presubmit/banned_words.py",
     "pw_presubmit/build.py",
     "pw_presubmit/cli.py",
     "pw_presubmit/environment.py",
diff --git a/pw_presubmit/py/pw_presubmit/banned_words.py b/pw_presubmit/py/pw_presubmit/banned_words.py
new file mode 100644
index 0000000..eae763b
--- /dev/null
+++ b/pw_presubmit/py/pw_presubmit/banned_words.py
@@ -0,0 +1,141 @@
+# Copyright 2021 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Banned words presubmit check."""
+
+import collections
+from pathlib import Path
+import re
+from typing import Dict, List
+
+from . import presubmit
+
+# Regex fragments for banned words. List borrowed from Android:
+# https://source.android.com/setup/contribute/respectful-code
+# banned-words: disable
+BANNED_WORDS = [
+    r'master',
+    r'slave',
+    r'(white|gr[ae]y|black)\s*(list|hat)',
+    r'craz(y|ie)',
+    r'insane',
+    r'crip+led?',
+    r'sanity',
+    r'sane',
+    r'dummy',
+    r'grandfather',
+    r's?he',
+    r'his',
+    r'her',
+    r'm[ae]n[-\s]*in[-\s]*the[-\s]*middle',
+    r'mitm',
+]
+# banned-words: enable
+
+# Exercises the ignore marker on this file: master  # banned-words: ignore
+# The marker above also covers this line: master
+
+
+def _process_banned_words(*words):
+    """Turn banned-words list into one big regex with common inflections.
+
+    Args:
+      *words: Regex strings, or lists/tuples of them (flattened one level).
+        When none are given, BANNED_WORDS is used.
+
+    Returns:
+      A compiled, case-insensitive pattern matching any entry as a whole
+      word, optionally followed by 's', 'd', 'es' or 'ed'.
+    """
+    all_words = []
+    for entry in words or BANNED_WORDS:
+        # Add each entry exactly once: a string is taken as-is, while a list
+        # or tuple contributes its members instead of itself.
+        all_words.extend([entry] if isinstance(entry, str) else entry)
+
+    # Compile each word on its own so a bad entry fails with re.error here.
+    for word in all_words:
+        re.compile(word)
+
+    pattern = r"\b({})(\b|e?[sd]\b)".format('|'.join(all_words))
+    return re.compile(pattern, re.IGNORECASE)
+
+
+BANNED_WORDS_REGEX = _process_banned_words()
+
+# If seen on a line, banned words on that line and the next are ignored.
+_IGNORE = 'banned-words: ignore'
+
+# Ignore a whole section. Keep this order; the check scans this file too.
+_DISABLE = 'banned-words: disable'
+_ENABLE = 'banned-words: enable'
+
+
+def banned_words(ctx: presubmit.PresubmitContext,
+                 words_regex=BANNED_WORDS_REGEX):
+    """Presubmit check that ensures files do not contain banned words.
+
+    Args:
+      ctx: Presubmit context; every path in ctx.paths is scanned.
+      words_regex: Compiled pattern; the first match per line is reported.
+
+    Raises:
+      presubmit.PresubmitFailure: At least one line matched words_regex.
+    """
+    Match = collections.namedtuple('Match', 'line word')
+    found_words: Dict[Path, List[Match]] = {}
+
+    for path in ctx.paths:
+        try:
+            # Explicit UTF-8 so decoding does not depend on the locale.
+            with open(path, 'r', encoding='utf-8') as ins:
+                enabled = True
+                prev = ''
+                for i, line in enumerate(ins, start=1):
+                    if _DISABLE in line:
+                        enabled = False
+                    if _ENABLE in line:
+                        enabled = True
+
+                    # The ignore marker covers its own line and the next.
+                    ignored = _IGNORE in prev or _IGNORE in line
+                    if enabled and not ignored:
+                        match = words_regex.search(line)
+                        if match:
+                            hit = Match(i, match.group(0))
+                            found_words.setdefault(path, []).append(hit)
+                    prev = line
+
+        except UnicodeDecodeError:
+            # File is not UTF-8 text (for example a gif); skip it.
+            pass
+
+    for path, matches in found_words.items():
+        print('=' * 40)
+        print(path)
+        for match in matches:
+            print(f'Found banned word "{match.word}" on line {match.line}')
+
+    if found_words:
+        raise presubmit.PresubmitFailure
+
+
+def banned_words_checker(*words):
+    """Build a presubmit check that flags the supplied banned words."""
+    compiled = _process_banned_words(*words)
+
+    # globals() reaches the module-level check shadowed by the nested def.
+    def banned_words(  # pylint: disable=redefined-outer-name
+        ctx: presubmit.PresubmitContext):
+        globals()['banned_words'](ctx, compiled)
+
+    return banned_words
diff --git a/pw_presubmit/py/pw_presubmit/pigweed_presubmit.py b/pw_presubmit/py/pw_presubmit/pigweed_presubmit.py
index 4e6676b..d2e0040 100755
--- a/pw_presubmit/py/pw_presubmit/pigweed_presubmit.py
+++ b/pw_presubmit/py/pw_presubmit/pigweed_presubmit.py
@@ -35,7 +35,7 @@
 
 import pw_package.pigweed_packages
 
-from pw_presubmit import build, cli, format_code, git_repo
+from pw_presubmit import banned_words, build, cli, format_code, git_repo
 from pw_presubmit import call, filter_paths, plural, PresubmitContext
 from pw_presubmit import PresubmitFailure, Programs
 from pw_presubmit.install_hook import install_hook
@@ -621,6 +621,7 @@
 #
 
 OTHER_CHECKS = (
+    banned_words.banned_words,
     # TODO(pwbug/45): Remove clang-tidy from OTHER_CHECKS when it passes.
     clang_tidy,
     # Build that attempts to duplicate the build OSS-Fuzz does. Currently