pw_build: Relative source file name extraction

Adds a helper template that can extract source file names from pw_*
build targets as represented by the __FILE__ macro with
-ffile-prefix-map transformations applied.

Change-Id: I16cb930c6677debb1b8e20dd57ffc905f7fd0022
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/79540
Pigweed-Auto-Submit: Armando Montanez <amontanez@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
diff --git a/pw_build/BUILD.gn b/pw_build/BUILD.gn
index 41069a9..a6d0cd7 100644
--- a/pw_build/BUILD.gn
+++ b/pw_build/BUILD.gn
@@ -15,6 +15,7 @@
 import("//build_overrides/pigweed.gni")
 
 import("$dir_pw_build/python.gni")
+import("$dir_pw_build/relative_source_file_names.gni")
 import("$dir_pw_docgen/docs.gni")
 import("target_types.gni")
 
@@ -120,24 +121,34 @@
 
 # Removes system-dependent prefixes from macros like __FILE__ and debug symbols.
 config("relative_paths") {
-  cflags = [
+  _transformations = [
     # Remap absolute paths to the build directory to "out", in case absolute
     # paths to files in the build directory are created.
     #
     # Clang and GCC apply these flags in opposite order. The build directory is
     # often nested under //. To ensure that both compilers removed it before
     # removing the absolute path to //, apply the option both first and last.
-    "-ffile-prefix-map=" + rebase_path(root_build_dir) + "=out",
+    rebase_path(root_build_dir) + "=out",
 
     # Remove absolute paths to the repo root.
-    "-ffile-prefix-map=" + rebase_path("//") + "=",
+    rebase_path("//") + "=",
 
     # Remove relative paths from the build directory to the source tree.
-    "-ffile-prefix-map=" + rebase_path("//", root_build_dir) + "=",
+    rebase_path("//", root_build_dir) + "=",
 
     # Repeat option to remap absolute paths to the build directory.
-    "-ffile-prefix-map=" + rebase_path(root_build_dir) + "=out",
+    rebase_path(root_build_dir) + "=out",
   ]
+  cflags = []
+
+  foreach(transform, _transformations) {
+    cflags += [ "-ffile-prefix-map=" + transform ]
+  }
+
+  # Write the transformations to a well-known path so that other utilities
+  # that need to present file names that match the compiler's __FILE__
+  # macro can apply the same transformations.
+  write_file(pw_build_RELATIVE_PATH_TRANSFORM_JSON, _transformations, "json")
 }
 
 # This group is linked into all pw_executable, pw_static_library, and
diff --git a/pw_build/docs.rst b/pw_build/docs.rst
index 44d1fa5..7d2033c 100644
--- a/pw_build/docs.rst
+++ b/pw_build/docs.rst
@@ -470,6 +470,100 @@
   ├── file1.txt
   └── renamed.txt
 
+pw_relative_source_file_names
+-----------------------------
+This template recursively walks the listed dependencies and collects the names
+of all the headers and source files required by the targets, and then transforms
+them to match the value of ``__FILE__`` when pw_build's ``relative_paths``
+config is applied. This is primarily intended for side-band generation of
+pw_tokenizer tokens so file name tokens can be utilized in places where
+pw_tokenizer is unable to embed token information as part of C/C++ compilation.
+
+This template produces a JSON file containing an array of strings (file paths
+with ``-ffile-prefix-map``-like transformations applied) that can be used to
+:ref:`generate a token database <module-pw_tokenizer-database-creation>`.
+
+**Arguments**
+
+* ``deps``: A required list of targets to recursively extract file names from.
+* ``outputs``: A required array with a single element: the path to write the
+  final JSON file to.
+
+**Example**
+
+Let's say we have the following project structure:
+
+.. code-block::
+
+  project root
+  ├── foo/
+  │   ├── foo.h
+  │   └── foo.cc
+  ├── bar/
+  │   ├── bar.h
+  │   └── bar.cc
+  ├── unused/
+  │   ├── unused.h
+  │   └── unused.cc
+  └── main.cc
+
+And a BUILD.gn at the root:
+
+.. code-block::
+
+  pw_source_set("bar") {
+    public_configs = [ ":bar_headers" ]
+    public = [ "bar/bar.h" ]
+    sources = [ "bar/bar.cc" ]
+  }
+
+  pw_source_set("foo") {
+    public_configs = [ ":foo_headers" ]
+    public = [ "foo/foo.h" ]
+    sources = [ "foo/foo.cc" ]
+    deps = [ ":bar" ]
+  }
+
+
+  pw_source_set("unused") {
+    public_configs = [ ":unused_headers" ]
+    public = [ "unused/unused.h" ]
+    sources = [ "unused/unused.cc" ]
+    deps = [ ":bar" ]
+  }
+
+  pw_executable("main") {
+    sources = [ "main.cc" ]
+    deps = [ ":foo" ]
+  }
+
+  pw_relative_source_file_names("main_source_files") {
+    deps = [ ":main" ]
+    outputs = [ "$target_gen_dir/main_source_files.json" ]
+  }
+
+The JSON file written to ``out/gen/main_source_files.json`` will contain:
+
+.. code-block::
+
+  [
+    "bar/bar.cc",
+    "bar/bar.h",
+    "foo/foo.cc",
+    "foo/foo.h",
+    "main.cc"
+  ]
+
+Since ``unused`` isn't a transitive dependency of ``main``, its source files
+are not included. Similarly, even though ``bar`` is not a direct dependency of
+``main``, its source files *are* included because ``foo`` brings in ``bar`` as
+a transitive dependency.
+
+Note how the file paths in this example are relative to the project root rather
+than being absolute paths (e.g. ``/home/user/ralph/coding/my_proj/main.cc``).
+This is a result of transformations applied to strip absolute path prefixes,
+matching the behavior of pw_build's ``$dir_pw_build:relative_paths`` config.
+
 CMake
 =====
 Pigweed's `CMake`_ support is provided primarily for projects that have an
diff --git a/pw_build/py/BUILD.gn b/pw_build/py/BUILD.gn
index 1f4a973..92aa41c 100644
--- a/pw_build/py/BUILD.gn
+++ b/pw_build/py/BUILD.gn
@@ -29,6 +29,7 @@
     "pw_build/create_python_tree.py",
     "pw_build/error.py",
     "pw_build/exec.py",
+    "pw_build/file_prefix_map.py",
     "pw_build/generate_python_package.py",
     "pw_build/generate_python_package_gn.py",
     "pw_build/generated_tests.py",
@@ -42,6 +43,7 @@
   ]
   tests = [
     "create_python_tree_test.py",
+    "file_prefix_map_test.py",
     "python_runner_test.py",
     "zip_test.py",
   ]
diff --git a/pw_build/py/file_prefix_map_test.py b/pw_build/py/file_prefix_map_test.py
new file mode 100644
index 0000000..72630b6
--- /dev/null
+++ b/pw_build/py/file_prefix_map_test.py
@@ -0,0 +1,71 @@
+# Copyright 2022 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Tests for the file_prefix_map utility"""
+
+from io import StringIO
+import json
+import unittest
+
+from pw_build import file_prefix_map
+
+# pylint: disable=line-too-long
+JSON_SOURCE_FILES = json.dumps([
+    "../pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
+    "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
+    "../pigweed/pw_rpc/client_server.cc",
+    "../pigweed/pw_rpc/public/pw_rpc/client_server.h",
+    "/home/user/pigweed/out/../gen/generated_build_info.cc",
+    "/home/user/pigweed/pw_protobuf/encoder.cc",
+])
+
+JSON_PATH_TRANSFORMATIONS = json.dumps([
+    "/home/user/pigweed/out=out",
+    "/home/user/pigweed/=",
+    "../=",
+    "/home/user/pigweed/out=out",
+])
+
+EXPECTED_TRANSFORMED_PATHS = json.dumps([
+    "pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
+    "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
+    "pigweed/pw_rpc/client_server.cc",
+    "pigweed/pw_rpc/public/pw_rpc/client_server.h",
+    "out/../gen/generated_build_info.cc",
+    "pw_protobuf/encoder.cc",
+])
+
+
+class FilePrefixMapTest(unittest.TestCase):
+    """Covers direct prefix remapping and the JSON stream front end."""
+    def test_prefix_remap(self):
+        """Maps are applied in order, each to the previous map's result."""
+        rules = [('/foo_root/root_subdir/', ''), ('out/../', 'out/')]
+        sources = [
+            '/foo_root/root_subdir/source.cc',
+            '/foo_root/root_subdir/out/../gen.cc',
+        ]
+        remapped = list(file_prefix_map.remap_paths(sources, rules))
+        self.assertEqual(remapped, ['source.cc', 'out/gen.cc'])
+
+    def test_json_prefix_map(self):
+        """JSON paths and "from=to" rules in, remapped JSON array out."""
+        result = StringIO()
+        file_prefix_map.remap_json_paths(StringIO(JSON_SOURCE_FILES), result,
+                                         StringIO(JSON_PATH_TRANSFORMATIONS))
+        self.assertEqual(json.loads(result.getvalue()),
+                         json.loads(EXPECTED_TRANSFORMED_PATHS))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pw_build/py/pw_build/file_prefix_map.py b/pw_build/py/pw_build/file_prefix_map.py
new file mode 100644
index 0000000..69daff2
--- /dev/null
+++ b/pw_build/py/pw_build/file_prefix_map.py
@@ -0,0 +1,70 @@
+# Copyright 2022 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Transforms a JSON list of paths using -ffile-prefix-map style rules."""
+
+import argparse
+import json
+from typing import Iterator, List, TextIO
+
+# Note: This should be List[Tuple[str, str]], but using string.split()
+# produces Tuple[Any,...], so this permits that typing for convenience.
+PrefixMaps = List[tuple]
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parses and returns the command line arguments."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('in_json',
+                        type=argparse.FileType('r'),
+                        help='The JSON file containing a list of file names '
+                        'that the prefix map operations should be applied to')
+    parser.add_argument(
+        '--prefix-map-json',
+        type=argparse.FileType('r'),
+        required=True,
+        help=
+        'JSON file containing an array of prefix map transformations to apply '
+        'to the strings before tokenizing. These string literal '
+        'transformations are of the form "from=to". All strings with the '
+        'prefix `from` will have the prefix replaced with `to`. '
+        'Transformations are applied in the order they are listed in the JSON '
+        'file.')
+    # Mandatory: otherwise argparse yields output=None, which is unwritable.
+    parser.add_argument('--output',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='File path to write transformed paths to.')
+    return parser.parse_args()
+
+
+def remap_paths(paths: List[str], prefix_maps: PrefixMaps) -> Iterator[str]:
+    """Yields each path with every matching (from, to) prefix map applied."""
+    for path in paths:
+        for old, new in prefix_maps:
+            path = new + path[len(old):] if path.startswith(old) else path
+        yield path
+
+
+def remap_json_paths(in_json: TextIO, output: TextIO,
+                     prefix_map_json: TextIO) -> None:
+    """Remaps the JSON path array in in_json; dumps the result to output."""
+    paths = json.load(in_json)
+    # Rules are "from=to" strings; only the first '=' separates the halves.
+    rules = json.load(prefix_map_json)
+    prefix_maps: PrefixMaps = [tuple(r.split('=', 1)) for r in rules]
+    json.dump(list(remap_paths(paths, prefix_maps)), output)
+
+
+if __name__ == '__main__':
+    remap_json_paths(**vars(_parse_args()))
diff --git a/pw_build/relative_source_file_names.gni b/pw_build/relative_source_file_names.gni
new file mode 100644
index 0000000..0aa1c17
--- /dev/null
+++ b/pw_build/relative_source_file_names.gni
@@ -0,0 +1,67 @@
+# Copyright 2022 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+import("//build_overrides/pigweed.gni")
+
+import("$dir_pw_build/python_action.gni")
+
+# Path of the JSON file listing the relative_paths prefix transformations. This
+# isn't in a declare_args() block since it is unlikely to need changing.
+pw_build_RELATIVE_PATH_TRANSFORM_JSON =
+    "$root_build_dir/relative_path_transformations.json"
+
+# Creates a JSON file containing an array of source file names with
+# -ffile-prefix-map style transformations applied to match __FILE__ as seen in
+# C/C++ sources when using pw_build's `relative_paths` config.
+#
+# Args:
+#    deps (required): A list of targets to recursively extract file names from.
+#    outputs (required): An array with a single element: the path to write the
+#        transformed file paths to. The output format is a JSON array of
+#        strings.
+template("pw_relative_source_file_names") {
+  _raw_file_names_json = "$target_gen_dir/${target_name}.raw.json"
+
+  # Aggregate the pw_source_files metadata that the pw_* templates add.
+  generated_file("${target_name}.raw_source_files") {
+    forward_variables_from(invoker, [ "deps" ])
+
+    # Rebase the paths so that they match those that are passed to the
+    # compiler.
+    rebase = root_build_dir
+    outputs = [ _raw_file_names_json ]
+    data_keys = [ "pw_source_files" ]
+    output_conversion = "json"
+  }
+
+  pw_python_action(target_name) {
+    deps = [ ":${target_name}.raw_source_files" ]
+    python_deps = [ "$dir_pw_build/py" ]
+    module = "pw_build.file_prefix_map"
+
+    # GN-ism: You can't do invoker.outputs[0], but _outs[0] works.
+    _outs = invoker.outputs
+    args = [
+      rebase_path(_raw_file_names_json, root_build_dir),
+      "--prefix-map-json",
+      rebase_path(pw_build_RELATIVE_PATH_TRANSFORM_JSON, root_build_dir),
+      # Use the full flag name rather than an argparse prefix abbreviation.
+      "--output",
+      rebase_path(_outs[0], root_build_dir),
+    ]
+
+    inputs = [ _raw_file_names_json ]
+    forward_variables_from(invoker, [ "outputs" ])
+  }
+}