pw_metric: Metric RPC service
This adds a metric RPC service, which exposes a metric tree. The user
selects which metrics to expose. The metric tree is sent via streaming
RPC to enable streaming potentially large sets of metrics.
Change-Id: Ib308995b8f7f91ce439ca42fe298f802d4bdf7e3
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/16003
Commit-Queue: Keir Mierle <keir@google.com>
Reviewed-by: Alexei Frolov <frolv@google.com>
Reviewed-by: Ewout van Bekkum <ewout@google.com>
diff --git a/pw_metric/BUILD b/pw_metric/BUILD
index 3cbf66c..051fc27 100644
--- a/pw_metric/BUILD
+++ b/pw_metric/BUILD
@@ -50,6 +50,17 @@
],
)
+# Nanopb-based RPC service library for exporting metrics.
+pw_cc_library(
+    name = "metric_service_nanopb",
+    srcs = ["metric_service_nanopb.cc"],
+    hdrs = ["public/pw_metric/metric_service_nanopb.h"],
+    deps = [":metric"],
+)
+
pw_cc_test(
name = "metric_test",
srcs = [
@@ -69,3 +80,13 @@
":global",
],
)
+
+# Unit tests for the nanopb metric RPC service.
+pw_cc_test(
+    name = "metric_service_nanopb_test",
+    srcs = ["metric_service_nanopb_test.cc"],
+    deps = [":metric_service_nanopb"],
+)
diff --git a/pw_metric/BUILD.gn b/pw_metric/BUILD.gn
index a0148de..9a0ff2b 100644
--- a/pw_metric/BUILD.gn
+++ b/pw_metric/BUILD.gn
@@ -17,6 +17,8 @@
import("$dir_pw_build/target_types.gni")
import("$dir_pw_docgen/docs.gni")
+import("$dir_pw_protobuf_compiler/proto.gni")
+import("$dir_pw_third_party/nanopb/nanopb.gni")
import("$dir_pw_unit_test/test.gni")
config("default_config") {
include_dirs = [ "public" ]
@@ -47,6 +49,43 @@
]
}
+################################################################################
+# Service
+# Proto definitions for the metric RPC service. The nanopb .options file is
+# listed as an input next to the .proto source (NOTE(review): presumably so
+# that edits to the options retrigger code generation -- confirm).
+pw_proto_library("metric_service_proto") {
+ sources = [ "pw_metric_proto/metric_service.proto" ]
+ inputs = [ "pw_metric_proto/metric_service.options" ]
+}
+
+# TODO(keir): Consider moving the nanopb service into the nanopb/ directory
+# instead of having it directly inside pw_metric/.
+# The nanopb-based service and its test are only defined when nanopb is
+# available in the build.
+if (dir_pw_third_party_nanopb != "") {
+  # Nanopb implementation of the metric RPC service.
+  pw_source_set("metric_service_nanopb") {
+    public_configs = [ ":default_config" ]
+    public = [ "public/pw_metric/metric_service_nanopb.h" ]
+
+    # The public header includes the generated RPC header and pw_metric's
+    # metric.h, so both are exported. Targets listed here do not need to be
+    # repeated in deps.
+    public_deps = [
+      ":metric_service_proto_nanopb_rpc",
+      ":pw_metric",
+    ]
+    sources = [ "metric_service_nanopb.cc" ]
+    deps = [
+      "$dir_pw_containers:vector",
+      dir_pw_tokenizer,
+    ]
+  }
+
+  pw_test("metric_service_nanopb_test") {
+    sources = [ "metric_service_nanopb_test.cc" ]
+    deps = [
+      ":global",
+      ":metric_service_nanopb",
+      "$dir_pw_rpc/nanopb:test_method_context",
+    ]
+  }
+}
+
+################################################################################
+
pw_doc_group("docs") {
sources = [ "docs.rst" ]
}
@@ -56,6 +95,9 @@
":metric_test",
":global_test",
]
+ if (dir_pw_third_party_nanopb != "") {
+ tests += [ ":metric_service_nanopb_test" ]
+ }
}
pw_test("metric_test") {
diff --git a/pw_metric/docs.rst b/pw_metric/docs.rst
index 65136ee..23ef778 100644
--- a/pw_metric/docs.rst
+++ b/pw_metric/docs.rst
@@ -615,6 +615,102 @@
structured upfront, then manipulated during a device's active phase. They do
not support destruction.
+-----------------
+Exporting metrics
+-----------------
+Collecting metrics on a device is not useful without a mechanism to export
+those metrics for analysis and debugging. ``pw_metric`` offers an optional RPC
+service library (``:metric_service_nanopb``) that enables exporting a
+user-supplied set of on-device metrics via RPC. This facility is intended to
+function from the early stages of device bringup through production in the
+field.
+
+The metrics are fetched by calling the ``MetricService.Get`` RPC method, which
+streams all registered metrics to the caller in batches (server streaming RPC).
+Batching the returned metrics avoids requiring a large buffer or large RPC MTU.
+
+The returned metric objects have flattened paths to the root. For example, the
+returned metrics (post detokenization and jsonified) might look something like:
+
+.. code:: none
+
+ {
+ "/i2c1/failed_txns": 17,
+ "/i2c1/total_txns": 2013,
+ "/i2c1/gyro/resets": 24,
+ "/i2c1/gyro/hangs": 1,
+ "/spi1/thermocouple/reads": 242,
+ "/spi1/thermocouple/temp_celsius": 34.52,
+ }
+
+Note that there is no nesting of the groups; the nesting is implied from the
+path.
+
+RPC service setup
+-----------------
+To expose a ``MetricService`` in your application, do the following:
+
+1. Define metrics around the system, and put them in a group or list of
+ metrics. Easy choices include, for example, the ``global_groups`` and
+ ``global_metrics`` variables; or create your own.
+2. Create an instance of ``pw::metric::MetricService``.
+3. Register the service with your RPC server.
+
+For example:
+
+.. code::
+
+ #include "pw_rpc/server.h"
+ #include "pw_metric/metric.h"
+ #include "pw_metric/global.h"
+ #include "pw_metric/metric_service_nanopb.h"
+
+ // Note: You must customize the RPC server setup; see pw_rpc.
+ Channel channels[] = {
+ Channel::Create<1>(&uart_output),
+ };
+ Server server(channels);
+
+ // Metric service instance, pointing to the global metric objects.
+ // This could also point to custom per-product or application objects.
+ pw::metric::MetricService metric_service(
+ pw::metric::global_metrics,
+ pw::metric::global_groups);
+
+ void RegisterServices() {
+ server.RegisterService(metric_service);
+ // Register other services here.
+ }
+
+ int main() {
+ // ... system initialization ...
+
+ RegisterServices();
+
+ // ... start your application ...
+ }
+
+.. attention::
+
+ Take care when exporting metrics. Ensure **appropriate access control** is in
+ place. In some cases it may make sense to entirely disable metrics export for
+ production builds. Although reading metrics via RPC won't influence the
+ device, in some cases the metrics could expose sensitive information if
+ product owners are not careful.
+
+.. attention::
+
+ **MetricService::Get is a synchronous RPC method**
+
+ Calls to ``MetricService::Get`` are blocking and will send all metrics
+ immediately, even though it is a server-streaming RPC. This will work fine if
+ the device doesn't have too many metrics, or doesn't have concurrent RPCs like
+ logging, but could be a problem in some cases.
+
+ We plan to offer an async version where the application is responsible for
+ pumping the metrics into the streaming response. This gives flow control to
+ the application.
+
----------------
Design tradeoffs
----------------
@@ -695,8 +791,10 @@
metrics are enabled or disabled at compile time. This may rely on of C++20's
support for zero-sized members to fully remove the cost.
-- **Exposing metrics via RPC** - We plan to add a ``pw_rpc`` service to export
- metrics
+- **Async RPC** - The current RPC service exports the metrics by streaming
+ them to the client in batches. However, the current solution streams all the
+ metrics to completion; this may block the RPC thread. In the future we will
+ have an async solution where the user is in control of flow priority.
- **Timer integration** - We would like to add a stopwatch type mechanism to
time multiple in-flight events.
diff --git a/pw_metric/metric_service_nanopb.cc b/pw_metric/metric_service_nanopb.cc
new file mode 100644
index 0000000..6b4bf4c
--- /dev/null
+++ b/pw_metric/metric_service_nanopb.cc
@@ -0,0 +1,149 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "pw_metric/metric_service_nanopb.h"
+
+#include <algorithm>
+#include <cstring>
+#include <span>
+
+#include "pw_assert/assert.h"
+#include "pw_containers/vector.h"
+#include "pw_metric/metric.h"
+#include "pw_preprocessor/util.h"
+
+namespace pw::metric {
+namespace {
+
+// Batches metrics into MetricResponse protos and sends them over a
+// server-streaming RPC. A response is sent as soon as its metrics array is
+// full; call Flush() after the final Write() to send a partially filled batch.
+class MetricWriter {
+ public:
+  // Explicit to prevent accidental implicit conversion from a ServerWriter.
+  explicit MetricWriter(
+      rpc::ServerWriter<pw_metric_MetricResponse>& response_writer)
+      : response_(pw_metric_MetricResponse_init_zero),
+        response_writer_(response_writer) {}
+
+  // Appends one metric, identified by its token path, to the pending response
+  // and sends the response if that fills it. Crashes (PW_CHECK) if the path
+  // has more tokens than the proto's token_path array can hold.
+  //
+  // TODO(keir): Figure out a pw_rpc mechanism to fill a streaming packet based
+  // on transport MTU, rather than having this as a static knob. For example,
+  // some transports may be able to fit 30 metrics; others, only 5.
+  void Write(const Metric& metric, const Vector<Token>& path) {
+    // Nanopb doesn't offer an easy way to do bounds checking, so use span's
+    // type deduction magic to figure out the max size.
+    std::span<pw_metric_Metric> metrics(response_.metrics);
+    PW_CHECK_INT_LT(response_.metrics_count, metrics.size());
+
+    // Grab the next available Metric slot to write to in the response.
+    pw_metric_Metric& proto_metric = response_.metrics[response_.metrics_count];
+
+    // Copy the path.
+    std::span<Token> proto_path(proto_metric.token_path);
+    PW_CHECK_INT_LE(path.size(), proto_path.size());
+    std::copy(path.begin(), path.end(), proto_path.begin());
+    proto_metric.token_path_count = path.size();
+
+    // Copy the metric value into the matching member of the oneof.
+    if (metric.is_float()) {
+      proto_metric.value.as_float = metric.as_float();
+      proto_metric.which_value = pw_metric_Metric_as_float_tag;
+    } else {
+      proto_metric.value.as_int = metric.as_int();
+      proto_metric.which_value = pw_metric_Metric_as_int_tag;
+    }
+
+    // Move write head to the next slot.
+    response_.metrics_count++;
+
+    // If the metric response object is full, send the response and reset.
+    // TODO(keir): Support runtime batch sizes < max proto size.
+    if (response_.metrics_count == metrics.size()) {
+      Flush();
+    }
+  }
+
+  // Sends the pending response if it holds at least one metric, then resets
+  // it to empty. Does nothing when no metrics are pending.
+  void Flush() {
+    if (response_.metrics_count) {
+      response_writer_.Write(response_);
+      response_ = pw_metric_MetricResponse_init_zero;
+    }
+  }
+
+ private:
+  // The batch currently being filled.
+  pw_metric_MetricResponse response_;
+  // This RPC stream writer handle must be valid for the metric writer lifetime.
+  rpc::ServerWriter<pw_metric_MetricResponse>& response_writer_;
+};
+
+// Walks a metric tree recursively, passing each metric together with its full
+// token path (the enclosing group names followed by the metric's own name) to
+// a MetricWriter.
+//
+// TODO(keir): Generalize this to support a generic visitor.
+class MetricWalker {
+ public:
+  explicit MetricWalker(MetricWriter& writer) : writer_(writer) {}
+
+  // Writes every metric in the list, with the metric's name appended to the
+  // current path for the duration of the write.
+  void Walk(const IntrusiveList<Metric>& metrics) {
+    for (const auto& m : metrics) {
+      // The guard must be a named local. An unnamed temporary would be
+      // destroyed at the end of its own statement, popping the name off the
+      // path before Write() gets to see it.
+      ScopedName scoped_name(m.name(), *this);
+      writer_.Write(m, path_);
+    }
+  }
+
+  // Walks each group in the list in turn.
+  void Walk(const IntrusiveList<Group>& groups) {
+    for (const auto& g : groups) {
+      Walk(g);
+    }
+  }
+
+  // Walks a single group: child groups first, then the group's own metrics.
+  // The group's name stays on the path for the whole traversal.
+  void Walk(const Group& group) {
+    ScopedName scoped_name(group.name(), *this);
+    Walk(group.children());
+    Walk(group.metrics());
+  }
+
+ private:
+  // Scope guard that pushes a name onto the walker's path on construction and
+  // pops it on destruction, keeping the explicit stack balanced.
+  struct ScopedName {
+    ScopedName(Token name, MetricWalker& walker) : walker(walker) {
+      PW_CHECK_INT_LT(walker.path_.size(),
+                      walker.path_.capacity(),
+                      "Metrics are too deep; bump path_ capacity");
+      walker.path_.push_back(name);
+    }
+    ~ScopedName() { walker.path_.pop_back(); }
+
+    // Copying a guard would pop the path twice.
+    ScopedName(const ScopedName&) = delete;
+    ScopedName& operator=(const ScopedName&) = delete;
+
+    MetricWalker& walker;
+  };
+
+  // Tokens from the outermost walked group down to the current element.
+  Vector<Token, 4 /* max depth */> path_;
+  MetricWriter& writer_;
+};
+
+} // namespace
+
+void MetricService::Get(ServerContext&,
+                        const pw_metric_MetricRequest& /* request */,
+                        ServerWriter<pw_metric_MetricResponse>& response) {
+  // The request is not examined yet; every metric is streamed back.
+  //
+  // All batches are sent before this method returns, which blocks the RPC
+  // thread for the duration. That is likely to cause problems if there are
+  // many metrics, or if other RPCs are higher priority and should complete
+  // first.
+  //
+  // In the future, this should be replaced with an optional async solution
+  // that puts the application in control of when the response batches are sent.
+  MetricWriter batch_writer(response);
+  MetricWalker tree_walker(batch_writer);
+
+  tree_walker.Walk(metrics_);
+  tree_walker.Walk(groups_);
+
+  // Send whatever is left in the final, partially filled batch.
+  batch_writer.Flush();
+}
+
+} // namespace pw::metric
diff --git a/pw_metric/metric_service_nanopb_test.cc b/pw_metric/metric_service_nanopb_test.cc
new file mode 100644
index 0000000..b8b15b0
--- /dev/null
+++ b/pw_metric/metric_service_nanopb_test.cc
@@ -0,0 +1,135 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "pw_metric/metric_service_nanopb.h"
+
+#include "gtest/gtest.h"
+#include "pw_log/log.h"
+#include "pw_rpc/test_method_context.h"
+
+namespace pw::metric {
+namespace {
+
+// Test context for MetricService::Get.
+using MetricMethodContext =
+    pw::rpc::TestMethodContext<&MetricService::Get,
+                               4,
+                               sizeof(pw_metric_MetricResponse)>;
+
+TEST(MetricService, EmptyGroupAndNoMetrics) {
+  // A root group with neither metrics nor child groups.
+  PW_METRIC_GROUP(root, "/");
+
+  // Invoke Get() and verify that the call finishes successfully.
+  MetricMethodContext ctx(root.metrics(), root.children());
+  ctx.call({});
+  EXPECT_TRUE(ctx.done());
+  EXPECT_EQ(Status::OK, ctx.status());
+
+  // With nothing to report, no response should have been sent.
+  EXPECT_EQ(0u, ctx.responses().size());
+}
+
+TEST(MetricService, FlatMetricsNoGroupsOneResponseOnly) {
+  // Five metrics directly under the root group; no child groups.
+  PW_METRIC_GROUP(root, "/");
+  PW_METRIC(root, a, "a", 1.0);
+  PW_METRIC(root, b, "b", 1.0);
+  PW_METRIC(root, c, "c", 1.0);
+  PW_METRIC(root, d, "d", 1.0);
+  PW_METRIC(root, e, "e", 1.0);
+
+  // Invoke Get() and verify that the call finishes successfully.
+  MetricMethodContext ctx(root.metrics(), root.children());
+  ctx.call({});
+  EXPECT_TRUE(ctx.done());
+  EXPECT_EQ(Status::OK, ctx.status());
+
+  // Five metrics fit in a single response proto.
+  EXPECT_EQ(1u, ctx.responses().size());
+  EXPECT_EQ(5, ctx.responses()[0].metrics_count);
+}
+
+TEST(MetricService, NestedGroupsButOnlyOneBatch) {
+  // Three metrics at the root plus three in a child group: six in total,
+  // which fits within the default batch (10).
+  PW_METRIC_GROUP(root, "/");
+  PW_METRIC(root, a, "a", 1.0);
+  PW_METRIC(root, b, "b", 1.0);
+  PW_METRIC(root, c, "c", 1.0);
+
+  PW_METRIC_GROUP(inner, "inner");
+  PW_METRIC(inner, x, "x", 1.0);
+  PW_METRIC(inner, y, "y", 1.0);
+  PW_METRIC(inner, z, "z", 1.0);
+
+  root.Add(inner);
+
+  // Invoke Get() and verify that the call finishes successfully.
+  MetricMethodContext ctx(root.metrics(), root.children());
+  ctx.call({});
+  EXPECT_TRUE(ctx.done());
+  EXPECT_EQ(Status::OK, ctx.status());
+
+  // All six metrics should arrive in a single response proto.
+  EXPECT_EQ(1u, ctx.responses().size());
+  EXPECT_EQ(6, ctx.responses()[0].metrics_count);
+}
+
+TEST(MetricService, NestedGroupsWithBatches) {
+  // Set up a nested group of metrics that will not fit in a single batch.
+  // Every metric gets a distinct name and a distinct value in 1..12.
+  PW_METRIC_GROUP(root, "/");
+  PW_METRIC(root, a, "a", 1u);
+  PW_METRIC(root, d, "d", 2u);
+  PW_METRIC(root, f, "f", 3u);
+
+  PW_METRIC_GROUP(inner_1, "inner1");
+  PW_METRIC(inner_1, x, "x", 4u);
+  PW_METRIC(inner_1, y, "y", 5u);
+  PW_METRIC(inner_1, z, "z", 6u);
+
+  PW_METRIC_GROUP(inner_2, "inner2");
+  PW_METRIC(inner_2, p, "p", 7u);
+  PW_METRIC(inner_2, q, "q", 8u);
+  PW_METRIC(inner_2, r, "r", 9u);
+  PW_METRIC(inner_2, s, "s", 10u);  // Note: Max # per response is 10.
+  PW_METRIC(inner_2, t, "t", 11u);
+  PW_METRIC(inner_2, u, "u", 12u);
+
+  root.Add(inner_1);
+  root.Add(inner_2);
+
+  // Run the RPC and ensure it completes.
+  MetricMethodContext context(root.metrics(), root.children());
+  context.call({});
+  EXPECT_TRUE(context.done());
+  EXPECT_EQ(Status::OK, context.status());
+
+  // The response had to be split into two parts; check that they have the
+  // appropriate sizes.
+  EXPECT_EQ(2u, context.responses().size());
+  EXPECT_EQ(10, context.responses()[0].metrics_count);
+  EXPECT_EQ(2, context.responses()[1].metrics_count);
+
+  // The metrics are the numbers 1..12; sum them and compare.
+  uint32_t metric_sum = 0;
+  for (const auto& response : context.responses()) {
+    for (unsigned i = 0; i < response.metrics_count; ++i) {
+      metric_sum += response.metrics[i].value.as_int;
+    }
+  }
+  EXPECT_EQ(78u, metric_sum);
+
+  // TODO(keir): Properly check all the fields.
+}
+
+} // namespace
+} // namespace pw::metric
diff --git a/pw_metric/public/pw_metric/metric_service_nanopb.h b/pw_metric/public/pw_metric/metric_service_nanopb.h
new file mode 100644
index 0000000..15fd6ee
--- /dev/null
+++ b/pw_metric/public/pw_metric/metric_service_nanopb.h
@@ -0,0 +1,49 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+#pragma once
+
+#include <cstring>
+#include <span>
+
+#include "pw_log/log.h"
+#include "pw_metric/metric.h"
+#include "pw_metric_proto/metric_service.rpc.pb.h"
+
+namespace pw::metric {
+
+// The MetricService will send metrics when requested by Get(). For now, each
+// Get() request results in a stream of responses, containing the metrics from
+// the supplied list of groups and metrics. This includes recursive traversal
+// of subgroups. In the future, filtering will be supported.
+//
+// An important limitation of the current implementation is that the Get()
+// method is blocking, and sends all metrics at once (though batched). In the
+// future, we may switch to offering an async version where the Get() method
+// returns immediately, and someone else is responsible for pumping the queue.
+class MetricService final : public generated::MetricService<MetricService> {
+ public:
+ // Both lists are stored by reference, not copied, so they must remain valid
+ // for as long as this service is in use.
+ MetricService(const IntrusiveList<Metric>& metrics,
+ const IntrusiveList<Group>& groups)
+ : metrics_(metrics), groups_(groups) {}
+
+ // Server-streaming RPC handler: writes the metrics from the supplied lists
+ // to the response stream.
+ void Get(ServerContext&,
+ const pw_metric_MetricRequest& request,
+ ServerWriter<pw_metric_MetricResponse>& response);
+
+ private:
+ // Metrics exported directly, outside of any group.
+ const IntrusiveList<Metric>& metrics_;
+ // Groups whose metrics and subgroups are exported.
+ const IntrusiveList<Group>& groups_;
+};
+
+} // namespace pw::metric
diff --git a/pw_metric/pw_metric_proto/metric_service.options b/pw_metric/pw_metric_proto/metric_service.options
new file mode 100644
index 0000000..65eb3b3
--- /dev/null
+++ b/pw_metric/pw_metric_proto/metric_service.options
@@ -0,0 +1,18 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+// TODO(keir): Figure out appropriate options.
+// Maximum number of tokens in one metric's path (group names plus the metric
+// name).
+pw.metric.Metric.token_path max_count:4
+// Maximum number of metrics batched into a single MetricResponse.
+pw.metric.MetricResponse.metrics max_count:10
+
diff --git a/pw_metric/pw_metric_proto/metric_service.proto b/pw_metric/pw_metric_proto/metric_service.proto
new file mode 100644
index 0000000..27cb371
--- /dev/null
+++ b/pw_metric/pw_metric_proto/metric_service.proto
@@ -0,0 +1,70 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+syntax = "proto3";
+
+package pw.metric;
+
+// A metric, described by the name (path + name), and the value.
+//
+// This flattened representation, while more complicated than the obvious tree
+// structure alternative, enables streaming metrics from the device in low
+// memory or low compute situations.
+message Metric {
+  // The token path from the root. The last token is the metric name, and
+  // previous tokens are the parent group names. This could be converted from
+  // the tokens into a string; for example the token path {0xfaff, 0xabcd}:
+  //
+  //  - The group is 0xfaff (root, parent)
+  //  - The metric is 0xabcd
+  //
+  // Given the token database, this might be converted into:
+  //
+  //   /i2c_bus_1/failed_transactions
+  //
+  // Note: This uses a repeated fixed32 instead of a "Oneof" with the string
+  // path to reduce the encoded size. Using a repeated Oneof name { str,
+  // fixed32 } would cost approximately 6N bytes for N path elements, vs 2 + 4N
+  // bytes in the packed case.
+  repeated fixed32 token_path = 1;
+
+  // The string path from the root. Similar to token path, but with strings.
+  // Note: This is currently unsupported.
+  repeated string string_path = 2;
+
+  // The metric value. This field should be omitted when used as a query.
+  oneof value {
+    float as_float = 3;
+    uint32 as_int = 4;
+  }
+}
+
+// Request message for MetricService.Get.
+message MetricRequest {
+ // Metrics or the groups matched to the given paths are returned. The intent
+ // is to support matching semantics, with at least subsetting to e.g. collect
+ // all the metrics in a group and its children. We may also implement
+ // wildcard matchers.
+ //
+ // Value fields in the metrics will be ignored, since this is a query.
+ //
+ // Note: This is currently unsupported.
+ repeated Metric metrics = 1;
+}
+
+// One batch of metrics in the response stream of MetricService.Get.
+message MetricResponse {
+ // The metrics in this batch, each carrying its flattened path and value.
+ repeated Metric metrics = 1;
+}
+
+// Service for exporting on-device metrics.
+service MetricService {
+ // Server-streaming RPC that returns metrics as a sequence of MetricResponse
+ // batches.
+ rpc Get(MetricRequest) returns (stream MetricResponse) {}
+}