pw_log: Update proto

- Update proto to use a single bytes field for optionally tokenized
  fields.
- Start with a minimal set of fields, but document fields planned for
  future expansion.

Change-Id: I5af8e65335001ffda03acc31e5077edbca9deff3
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/47780
Commit-Queue: Wyatt Hepler <hepler@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
diff --git a/pw_env_setup/BUILD.gn b/pw_env_setup/BUILD.gn
index 48be4eb..afd46b7 100644
--- a/pw_env_setup/BUILD.gn
+++ b/pw_env_setup/BUILD.gn
@@ -35,6 +35,7 @@
     "$dir_pw_doctor/py",
     "$dir_pw_env_setup/py",
     "$dir_pw_hdlc/py",
+    "$dir_pw_log:protos.python",
     "$dir_pw_log_tokenized/py",
     "$dir_pw_module/py",
     "$dir_pw_package/py",
diff --git a/pw_log/BUILD.gn b/pw_log/BUILD.gn
index 2b73b97..25b5d0d 100644
--- a/pw_log/BUILD.gn
+++ b/pw_log/BUILD.gn
@@ -78,8 +78,13 @@
 pw_proto_library("protos") {
   sources = [ "log.proto" ]
   prefix = "pw_log/proto"
+  deps = [ "$dir_pw_tokenizer:proto" ]
 }
 
 pw_doc_group("docs") {
-  sources = [ "docs.rst" ]
+  sources = [
+    "docs.rst",
+    "protobuf.rst",
+  ]
+  inputs = [ "log.proto" ]
 }
diff --git a/pw_log/docs.rst b/pw_log/docs.rst
index 8924fe1..9816681 100644
--- a/pw_log/docs.rst
+++ b/pw_log/docs.rst
@@ -10,6 +10,15 @@
 1. The facade (this module) which is only a macro interface layer
 2. The backend, provided elsewhere, that implements the low level logging
 
+``pw_log`` also defines a logging protobuf and RPC service for efficiently
+storing and transmitting log messages. See :ref:`module-pw_log-protobuf` for
+details.
+
+.. toctree::
+  :hidden:
+
+  protobuf
+
 Usage examples
 --------------
 Here is a typical usage example, showing setting the module name, and using the
diff --git a/pw_log/log.proto b/pw_log/log.proto
index 2e38f42..5975ad2 100644
--- a/pw_log/log.proto
+++ b/pw_log/log.proto
@@ -1,4 +1,4 @@
-// Copyright 2020 The Pigweed Authors
+// Copyright 2021 The Pigweed Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy of
@@ -12,36 +12,32 @@
 // License for the specific language governing permissions and limitations under
 // the License.
 
-syntax = "proto2";
+syntax = "proto3";
 
 package pw.log;
 
-option java_package = "pw.rpc.proto";
+import "pw_tokenizer/proto/options.proto";
+
 option java_outer_classname = "Log";
 
-// A log with a tokenized message, a string message, or dropped indicator.  A
-// message can be one of three types:
+// A log message and metadata. Logs come in a few different forms:
 //
 //  1. A tokenized log message (recommended for production)
 //  2. A non-tokenized log message (good for development)
 //  3. A "log missed" tombstone, indicating that some logs were dropped
 //
-// Size analysis:
+// Size analysis for tokenized log messages, including each field's proto tag:
 //
-// For tokenized log messages in the common case; including the proto tag for
-// the field (so adding the fields gives the total proto message size):
+//  - message     - 6-12 bytes, depending on number and value of arguments
+//  - line_level  - 3 bytes; 4 bytes if line > 2048 (uncommon)
+//  - timestamp   - 3 bytes; assuming delta encoding
+//  - thread_name - 6 bytes
 //
-//  - message_tokenized  - 6-12 bytes, depending on # and value of arguments
-//  - line_level         - 3 bytes; 4 bytes if line > 2048 (uncommon)
-//  - timestamp          - 3 bytes; assuming delta encoding
-//  - thread_tokenized   - 3 bytes
-//
-// Total:
+// Adding the fields gives the total proto message size:
 //
 //    6-12 bytes - log
 //    9-15 bytes - log + level + line
 //   12-18 bytes - log + level + line + timestamp
-//   15-21 bytes - log + level + line + timestamp + task
 //
 // An analysis of a project's log token database revealed the following
 // distribution of the number of arguments to log messages:
@@ -58,30 +54,35 @@
 // enable storing more logs in limited memory. This is why, for example, there
 // is no separate "DroppedLog" type, or a "TokenizedLog" and "StringLog", which
 // would add at least 2 extra bytes per message
-// Note: Time-related fields will likely support specifying the time as a ratio
-// (period) and an absolute time separate from the current delta fields.
 message LogEntry {
-  // The tokenized log message. Internally, the format has a 32-bit token
-  // followed by the arguments for that message. The unformatted log string
-  // corresponding to the token in the token database must follow this format:
+  // The log message, which may be tokenized.
   //
-  //   file|module|message
+  // If tokenized logging is used, implementations may encode metadata in the
+  // log message rather than as separate proto fields. This reduces the size of
+  // the protobuf with no overhead.
   //
-  // For example:
+  // The standard format for encoding metadata in the log message is defined by
+  // the pw_log_tokenized module. The message and metadata are encoded as
+  // key-value pairs using ■ and ♦ as delimiters. For example:
   //
-  //   ../boot/bluetooth.cc|BOOT|Bluetooth is on the fritz; error code: %d
+  //  ■msg♦This is the log message: %d■module♦wifi■file♦../path/to/file.cc
   //
-  // Note: The level and flags are not included since level and flags are
-  // runtime values and so cannot be tokenized.
+  // See http://pigweed.dev/pw_log_tokenized for full details. When
+  // pw_log_tokenized is used, this metadata is automatically included as
+  // described.
   //
-  // Size analysis:
+  // The level and flags are not included since they may be runtime values and
+  // thus cannot always be tokenized. The line number is not included because
+  // line numbers change frequently and a new token is created for each line.
+  //
+  // Size analysis when tokenized:
   //
   //   tag+wire = 1 byte
   //   size     = 1 byte; payload will almost always be < 127 bytes
   //   payload  = N bytes; typically 4-10 in practice
   //
   // Total: 2 + N ~= 6-12 bytes
-  optional bytes message_tokenized = 1;
+  optional bytes message = 1 [(tokenizer.format) = TOKENIZATION_OPTIONAL];
 
   // Packed log level and line number. Structure:
   //
@@ -93,73 +94,87 @@
   // omits the line number.
   optional uint32 line_level = 2;
 
-  // Some log messages have flags to indicate for example assert or PII. The
-  // particular flags are product- and implementation-dependent. When no flags
-  // are present, the field is omitted entirely.
+  // Some log messages have flags to indicate attributes such as whether they
+  // are from an assert or if they contain PII. The particular flags are
+  // product- and implementation-dependent.
   optional uint32 flags = 3;
 
-  // The task or thread that created the log message.
-  //
-  // In practice, the task token and tag should be just 3 bytes, since a 14 bit
-  // token for the task name should be enough.
-  optional uint32 thread_tokenized = 4;
+  // The time is specified either as an absolute timestamp or as an offset
+  // relative to the previous log entry.
+  oneof time {
+    // The absolute timestamp in implementation-defined ticks. Applications
+    // determine how to interpret this on the receiving end. In the simplest
+    // case, these ticks might be milliseconds or microseconds since boot.
+    // Applications could also access clock information out-of-band with a
+    // ClockParameters protobuf.
+    int64 timestamp = 4;
 
-  // Timestamp. Note: The units here are TBD and will likely require a separate
-  // mechanism to indicate units. This field is likely to change as we figure
-  // out the right strategy for timestamps in Pigweed. This is a variable-sized
-  // integer to enable scaling this up to a uint64 later on without impacting
-  // the wire format.
-  optional int64 timestamp = 5;
+    // Time since the last entry in implementation-defined ticks, as for the
+    // timestamp field. This enables delta encoding when batching entries
+    // together.
+    //
+    // Size analysis for this field including tag and varint, assuming 1 kHz
+    // ticks:
+    //
+    //           < 127 ms gap == 127 ms      ==  7 bits == 2 bytes
+    //        < 16,000 ms gap ==  16 seconds == 14 bits == 3 bytes
+    //     < 2,000,000 ms gap ==  35 minutes == 21 bits == 4 bytes
+    //   < 300,000,000 ms gap ==  74 hours   == 28 bits == 5 bytes
+    //
+    // Log bursts will thus consume just 2 bytes (tag + up to 127ms delta) for
+    // the timestamp, which is a good improvement over an absolute timestamp.
+    int64 time_since_last_entry = 5;
+  }
 
-  // Time since the last entry. Generally, one of timestamp or this field will
-  // be specified. This enables delta encoding when batching entries together.
-  //
-  // Size analysis for this field including tag and varint:
-  //
-  //           < 127 ms gap == 127 ms      ==  7 bits == 2 bytes
-  //        < 16,000 ms gap ==  16 seconds == 14 bits == 3 bytes
-  //     < 2,000,000 ms gap ==  35 minutes == 21 bits == 4 bytes
-  //   < 300,000,000 ms gap ==  74 hours   == 28 bits == 5 bytes
-  //
-  // Log bursts will thus consume just 2 bytes (tag + up to 127ms delta) for
-  // the timestamp, which is a good improvement over timestamp in many cases.
-  // Note: The units of this field are TBD and will likely require a separate
-  // mechanism to indicate units. The calculations above assume milliseconds
-  // and may change if the units differ.
-  optional int64 elapsed_time_since_last_entry = 6;
-
-  // Fully formatted textual log message.
-  optional string message_string = 16;
-
-  // For non-tokenized logging, the file name.
-  optional string file = 17;
-
-  // String representation of the task that created the log message.
-  optional string thread_string = 18;
+  // The following fields are planned but will not be added until they are
+  // needed. Protobuf field numbers over 15 use an extra byte, so these fields
+  // are left out for now to avoid reserving field numbers unnecessarily.
 
   // When the log buffers are full but more logs come in, the logs are counted
-  // and a special log message is omitted with only counts for the number of
+  // and a special log message is emitted with only counts for the number of
   // messages dropped. The timestamp indicates the time that the "missed logs"
   // message was inserted into the queue.
   //
-  // Missed logs messages will only have one of the timestamp fields and these
-  // counters specified.
-  optional uint32 dropped = 19;
-  optional uint32 dropped_warning_or_above = 20;
+  // As an alternative to these fields, implementations may simply send a
+  // message stating the drop count.
+  // optional uint32 dropped = ?;
+  // optional uint32 dropped_warning_or_above = ?;
+
+  // Represents the device from which the log originated. The meaning of this
+  // field is implementation-defined.
+  // optional uint32 source_id = ?;
+
+  // The task or thread name that created the log message. If the log was not
+  // created on a thread, it should use a name appropriate to that context.
+  // optional bytes thread_name = ?
+  //     [(tokenizer.format) = TOKENIZATION_OPTIONAL];
+
+  // The file path where this log was created, if not encoded in the message.
+  // optional bytes file = ? [(tokenizer.format) = TOKENIZATION_OPTIONAL];
+
+  // The PW_LOG_MODULE_NAME for this log message, if it is not encoded in the
+  // message.
+  // optional bytes module_name = ?
+  //    [(tokenizer.format) = TOKENIZATION_OPTIONAL];
 
   // Some messages are associated with trace events, which may carry additional
   // contextual data. This is a tuple of a data format string which could be
   // used by the decoder to identify the data (e.g. printf-style tokens) and the
   // data itself in bytes.
-  optional string data_format_string = 21;
-  optional bytes data = 22;
+  // optional bytes data_format = ?
+  //     [(tokenizer.format) = TOKENIZATION_OPTIONAL];
+  // optional bytes data = ?;
 }
 
-message LogRequest {}
+message LogRequest {
+  // This will include fields for configuring log filtering.
+}
+
 message LogEntries {
   repeated LogEntry entries = 1;
 }
 
+// RPC service for accessing logs.
 service Logs {
-  rpc Get(LogRequest) returns (stream LogEntries) {}
+  rpc Get(LogRequest) returns (stream LogEntries);
 }
diff --git a/pw_log/protobuf.rst b/pw_log/protobuf.rst
new file mode 100644
index 0000000..2ee7ea3
--- /dev/null
+++ b/pw_log/protobuf.rst
@@ -0,0 +1,52 @@
+.. _module-pw_log-protobuf:
+
+-------------------
+The pw_log protobuf
+-------------------
+``pw_log`` defines a protocol buffer for storing and transmitting log messages.
+The protobuf is optimized to minimize the size of encoded log messages.
+
+Fields
+======
+The ``pw_log`` protobuf is defined in ``log.proto``.
+
+.. literalinclude:: log.proto
+  :language: protobuf
+  :lines: 14-
+
+Timestamps
+----------
+Timestamps are specified in implementation-defined ticks. Ticks could be
+milliseconds, microseconds, or any arbitrary duration derived from the device’s
+clock.
+
+For many applications, the timestamp field can be interpreted based on prior
+knowledge of the system. For example, ``timestamp`` might be known to be
+milliseconds since boot for one core or microseconds since boot for another.
+
+If desired, a project could collect information about clock parameters
+separately from ``pw_log``, and use that to interpret log timestamps. For
+example, it may call an RPC that returns a ``pw_chrono`` ``ClockParameters``
+protobuf. The values from that protobuf could then be used to interpret
+timestamps from that device.
+
+The ``pw_log`` proto contains two timestamp fields, only one of which may be set
+at a time:
+
+- ``timestamp`` – Absolute time when the message was logged.
+- ``time_since_last_entry`` – When the message was logged, relative to the
+  previously encoded log message. This is used when multiple log entries are
+  sent in a single ``LogEntries`` proto, as in the sketch below. The previous
+  log entry must use the same time source. If logs with multiple time sources
+  are intermingled in a single ``LogEntries`` proto, an absolute ``timestamp``
+  must be used each time the time source changes.
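+
+The following sketch shows how a sender might batch entries with delta
+encoding. It is illustrative only: the ``EncodeBatch`` helper and ``QueuedLog``
+struct are hypothetical, the generated header path is assumed, and
+``WriteTimeSinceLastEntry`` is assumed to follow the ``Write<FieldName>``
+naming convention of the generated ``LogEntry::Encoder``.
+
+.. code-block:: cpp
+
+  #include <span>
+
+  #include "pw_bytes/span.h"
+  #include "pw_log/proto/log.pwpb.h"  // Assumed path of the generated header.
+  #include "pw_protobuf/encoder.h"
+
+  // Hypothetical representation of a buffered log awaiting transmission.
+  struct QueuedLog {
+    pw::ConstByteSpan message;  // Tokenized or plain-text message bytes.
+    int64_t ticks;              // Capture time in implementation-defined ticks.
+  };
+
+  // Encodes each queued log as a LogEntry, mirroring the encoder usage in
+  // pw_log_multisink. The first entry carries an absolute timestamp; later
+  // entries carry only the delta from the previous entry, which typically
+  // takes just 2-3 bytes for closely spaced logs.
+  void EncodeBatch(std::span<const QueuedLog> logs,
+                   pw::ByteSpan scratch_buffer) {
+    int64_t previous_ticks = 0;
+    bool first_entry = true;
+
+    for (const QueuedLog& log : logs) {
+      pw::protobuf::NestedEncoder nested_encoder(scratch_buffer);
+      pw::log::LogEntry::Encoder encoder(&nested_encoder);
+      encoder.WriteMessage(log.message);
+
+      if (first_entry) {
+        encoder.WriteTimestamp(log.ticks);  // Absolute time for the first log.
+        first_entry = false;
+      } else {
+        // Assumed setter name for the time_since_last_entry field.
+        encoder.WriteTimeSinceLastEntry(log.ticks - previous_ticks);
+      }
+      previous_ticks = log.ticks;
+
+      pw::ConstByteSpan log_entry;
+      if (nested_encoder.Encode(&log_entry).ok()) {
+        // Append log_entry to a LogEntries proto or RPC response (not shown).
+      }
+    }
+  }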
+
+Optionally tokenized text fields
+--------------------------------
+Several fields in the ``pw_log`` proto store text. Examples include ``message``
+and ``thread_name``. These fields may contain either plain or tokenized text,
+and either form is stored in a single ``bytes`` field. These fields are marked
+with a protocol buffer option so the ``pw_tokenizer.proto`` module can detect
+and detokenize tokenized fields as appropriate.
+
+See :ref:`module-pw_tokenizer-proto` for details.
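+
+As a minimal illustration, both forms are written to the same ``message`` field
+through the generated ``LogEntry::Encoder``, as ``pw_log_sink`` and
+``pw_log_multisink`` do. The helper functions below are hypothetical, and the
+generated header path is assumed.
+
+.. code-block:: cpp
+
+  #include <span>
+  #include <string_view>
+
+  #include "pw_bytes/span.h"
+  #include "pw_log/proto/log.pwpb.h"  // Assumed path of the generated header.
+
+  // Plain text: write the string's bytes directly into the message field.
+  void WritePlainTextMessage(pw::log::LogEntry::Encoder& encoder,
+                             std::string_view text) {
+    encoder.WriteMessage(std::as_bytes(std::span(text)));
+  }
+
+  // Tokenized: write the pw_tokenizer-encoded buffer (token plus arguments)
+  // produced by the tokenizing backend, with no additional framing.
+  void WriteTokenizedMessage(pw::log::LogEntry::Encoder& encoder,
+                             pw::ConstByteSpan tokenized_message) {
+    encoder.WriteMessage(tokenized_message);
+  }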
diff --git a/pw_log_multisink/log_queue.cc b/pw_log_multisink/log_queue.cc
index 0547449..d9a6242 100644
--- a/pw_log_multisink/log_queue.cc
+++ b/pw_log_multisink/log_queue.cc
@@ -34,24 +34,26 @@
                                       uint32_t flags,
                                       uint32_t level,
                                       uint32_t line,
-                                      uint32_t thread,
+                                      uint32_t /* thread */,
                                       int64_t timestamp) {
   pw::protobuf::NestedEncoder nested_encoder(encode_buffer_);
   pw::log::LogEntry::Encoder encoder(&nested_encoder);
   Status status;
 
-  encoder.WriteMessageTokenized(message);
+  encoder.WriteMessage(message);
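+  // line_level packs the level into the low PW_LOG_LEVEL_BITS bits and the
+  // line number into the bits above them.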
   encoder.WriteLineLevel((level & PW_LOG_LEVEL_BITMASK) |
                          ((line << PW_LOG_LEVEL_BITS) & ~PW_LOG_LEVEL_BITMASK));
   encoder.WriteFlags(flags);
-  encoder.WriteThreadTokenized(thread);
+  // TODO(prashanthsw): Update when the thread_name field is added.
+  // encoder.WriteThreadName(bytes::CopyInOrder(std::endian::little, &thread));
 
   // TODO(prashanthsw): Add support for delta encoding of the timestamp.
   encoder.WriteTimestamp(timestamp);
 
-  if (dropped_entries_ > 0) {
-    encoder.WriteDropped(dropped_entries_);
-  }
+  // TODO(prashanthsw): Handle dropped messages.
+  // if (dropped_entries_ > 0) {
+  //   encoder.WriteDropped(dropped_entries_);
+  // }
 
   ConstByteSpan log_entry;
   status = nested_encoder.Encode(&log_entry);
diff --git a/pw_log_multisink/log_queue_test.cc b/pw_log_multisink/log_queue_test.cc
index 3d77148..9708576 100644
--- a/pw_log_multisink/log_queue_test.cc
+++ b/pw_log_multisink/log_queue_test.cc
@@ -38,7 +38,7 @@
                     const uint32_t expected_flags,
                     const uint32_t expected_level,
                     const uint32_t expected_line,
-                    const uint32_t expected_tokenized_thread,
+                    const uint32_t /* expected_tokenized_thread */,
                     const int64_t expected_timestamp) {
   ConstByteSpan log_entry_message;
   EXPECT_TRUE(log_decoder.Next().ok());  // preamble
@@ -68,15 +68,16 @@
   EXPECT_TRUE(entry_decoder.ReadUint32(&flags).ok());
   EXPECT_EQ(expected_flags, flags);
 
-  uint32_t tokenized_thread;
-  EXPECT_TRUE(entry_decoder.Next().ok());  // tokenized_thread
-  EXPECT_EQ(4U, entry_decoder.FieldNumber());
-  EXPECT_TRUE(entry_decoder.ReadUint32(&tokenized_thread).ok());
-  EXPECT_EQ(expected_tokenized_thread, tokenized_thread);
+  // TODO(prashanthsw): Add thread name when supported.
+  // uint32_t tokenized_thread;
+  // EXPECT_TRUE(entry_decoder.Next().ok());  // tokenized_thread
+  // EXPECT_EQ(4U, entry_decoder.FieldNumber());
+  // EXPECT_TRUE(entry_decoder.ReadUint32(&tokenized_thread).ok());
+  // EXPECT_EQ(expected_tokenized_thread, tokenized_thread);
 
   int64_t timestamp;
   EXPECT_TRUE(entry_decoder.Next().ok());  // timestamp
-  EXPECT_EQ(5U, entry_decoder.FieldNumber());
+  EXPECT_EQ(4U, entry_decoder.FieldNumber());
   EXPECT_TRUE(entry_decoder.ReadInt64(&timestamp).ok());
   EXPECT_EQ(expected_timestamp, timestamp);
 }
diff --git a/pw_log_rpc/logs_rpc.cc b/pw_log_rpc/logs_rpc.cc
index bf5f352..d548c84 100644
--- a/pw_log_rpc/logs_rpc.cc
+++ b/pw_log_rpc/logs_rpc.cc
@@ -21,13 +21,14 @@
 namespace pw::log_rpc {
 namespace {
 
-Result<ConstByteSpan> GenerateDroppedEntryMessage(ByteSpan encode_buffer,
-                                                  size_t dropped_entries) {
-  pw::protobuf::NestedEncoder nested_encoder(encode_buffer);
-  pw::log::LogEntry::Encoder encoder(&nested_encoder);
-  encoder.WriteDropped(dropped_entries);
-  return nested_encoder.Encode();
-}
+// TODO(prashanthsw): Handle dropped messages.
+// Result<ConstByteSpan> GenerateDroppedEntryMessage(ByteSpan encode_buffer,
+//                                                   size_t dropped_entries) {
+//   pw::protobuf::NestedEncoder nested_encoder(encode_buffer);
+//   pw::log::LogEntry::Encoder encoder(&nested_encoder);
+//   encoder.WriteDropped(dropped_entries);
+//   return nested_encoder.Encode();
+// }
 
 }  // namespace
 
@@ -44,13 +45,14 @@
 
   // If previous calls to flush resulted in dropped entries, generate a
   // dropped entry message and write it before further log messages.
-  if (dropped_entries_ > 0) {
-    ByteSpan payload = response_writer_.PayloadBuffer();
-    Result dropped_log = GenerateDroppedEntryMessage(payload, dropped_entries_);
-    PW_TRY(dropped_log.status());
-    PW_TRY(response_writer_.Write(dropped_log.value()));
-    dropped_entries_ = 0;
-  }
+  // TODO(prashanthsw): Handle dropped messages.
+  // if (dropped_entries_ > 0) {
+  //   ByteSpan payload = response_writer_.PayloadBuffer();
+  //   Result dropped_log = GenerateDroppedEntryMessage(payload, dropped_entries_);
+  //   PW_TRY(dropped_log.status());
+  //   PW_TRY(response_writer_.Write(dropped_log.value()));
+  //   dropped_entries_ = 0;
+  // }
 
   // Write logs to the response writer. An important limitation of this
   // implementation is that if this RPC call fails, the logs are lost -
diff --git a/pw_log_sink/log_sink.cc b/pw_log_sink/log_sink.cc
index b3ef96b..98bb086 100644
--- a/pw_log_sink/log_sink.cc
+++ b/pw_log_sink/log_sink.cc
@@ -79,8 +79,7 @@
   va_start(args, message);
   buffer.FormatVaList(message, args);
   va_end(args);
-  encoder.WriteMessageString(buffer.c_str());
-  encoder.WriteThreadString("");
+  encoder.WriteMessage(std::as_bytes(std::span(buffer.view())));
 
   ConstByteSpan log_entry;
   Status status = nested_encoder.Encode(&log_entry);
diff --git a/pw_log_sink/log_sink_test.cc b/pw_log_sink/log_sink_test.cc
index f3432c1..2fb2982 100644
--- a/pw_log_sink/log_sink_test.cc
+++ b/pw_log_sink/log_sink_test.cc
@@ -35,8 +35,8 @@
   EXPECT_TRUE(log_decoder.Next().ok());  // line_level - 2
   EXPECT_TRUE(log_decoder.Next().ok());  // flags - 3
-  EXPECT_TRUE(log_decoder.Next().ok());  // timestamp - 5
+  EXPECT_TRUE(log_decoder.Next().ok());  // timestamp - 4
-  EXPECT_TRUE(log_decoder.Next().ok());  // message_string - 16
-  EXPECT_EQ(16U, log_decoder.FieldNumber());
+  EXPECT_TRUE(log_decoder.Next().ok());  // message - 1
+  EXPECT_EQ(1u, log_decoder.FieldNumber());
   EXPECT_TRUE(log_decoder.ReadString(&log_entry_message).ok());
 
   return std::string(log_entry_message);