No public description PiperOrigin-RevId: 668035717
diff --git a/centipede/BUILD b/centipede/BUILD index 8ef1ee0..37d94a0 100644 --- a/centipede/BUILD +++ b/centipede/BUILD
@@ -58,6 +58,7 @@ srcs = ["seed_corpus_maker.cc"], deps = [ ":config_init", + ":seed_corpus_config_proto_lib", ":seed_corpus_maker_flags", ":seed_corpus_maker_lib", ":util", @@ -65,7 +66,9 @@ "@com_google_absl//absl/flags:flag", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_fuzztest//common:remote_file", + "@com_google_fuzztest//common:status_macros", ], ) @@ -799,7 +802,6 @@ ":pc_info", ":periodic_action", ":runner_result", - ":seed_corpus_config_cc_proto", ":seed_corpus_maker_lib", ":stats", ":thread_pool", @@ -1091,7 +1093,6 @@ ":corpus_io", ":feature", ":rusage_profiler", - ":seed_corpus_config_cc_proto", ":thread_pool", ":util", ":workdir", @@ -1108,6 +1109,25 @@ "@com_google_fuzztest//common:logging", "@com_google_fuzztest//common:remote_file", "@com_google_fuzztest//common:status_macros", + ], +) + +# Utilities for seed corpus config proto. +cc_library( + name = "seed_corpus_config_proto_lib", + srcs = ["seed_corpus_config_proto_lib.cc"], + hdrs = ["seed_corpus_config_proto_lib.h"], + deps = [ + ":seed_corpus_config_cc_proto", + ":seed_corpus_maker_lib", + ":workdir", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_fuzztest//common:logging", + "@com_google_fuzztest//common:remote_file", + "@com_google_fuzztest//common:status_macros", "@com_google_protobuf//:protobuf", ], ) @@ -1640,6 +1660,7 @@ deps = [ ":feature", ":seed_corpus_config_cc_proto", + ":seed_corpus_config_proto_lib", ":seed_corpus_maker_lib", ":workdir", "@com_google_absl//absl/log:check", @@ -1653,6 +1674,24 @@ ) cc_test( + name = "seed_corpus_config_proto_lib_test", + srcs = ["seed_corpus_config_proto_lib_test.cc"], + deps = [ + ":seed_corpus_config_cc_proto", + ":seed_corpus_config_proto_lib", + ":workdir", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + 
"@com_google_fuzztest//common:logging", + "@com_google_fuzztest//common:status_macros", + "@com_google_fuzztest//common:test_util", + "@com_google_fuzztest//fuzztest", + "@com_google_fuzztest//fuzztest:fuzztest_gtest_main", + "@com_google_protobuf//:protobuf", + ], +) + +cc_test( name = "coverage_test", srcs = ["coverage_test.cc"], data = [
diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc index 88abafc..67435e3 100644 --- a/centipede/centipede_interface.cc +++ b/centipede/centipede_interface.cc
@@ -55,7 +55,6 @@ #include "./centipede/pc_info.h" #include "./centipede/periodic_action.h" #include "./centipede/runner_result.h" -#include "./centipede/seed_corpus_config.pb.h" #include "./centipede/seed_corpus_maker_lib.h" #include "./centipede/stats.h" #include "./centipede/thread_pool.h" @@ -348,21 +347,21 @@ std::string_view src_dir) { const WorkDir workdir{env}; SeedCorpusConfig seed_corpus_config; - SeedCorpusSource &src = *seed_corpus_config.mutable_sources()->Add(); - src.set_dir_glob(src_dir); - src.set_num_recent_dirs(1); + SeedCorpusSource &src = seed_corpus_config.sources.emplace_back(); + src.dir_glob = src_dir; + src.num_recent_dirs = 1; // We're using the previously distilled corpus files as seeds. - src.set_shard_rel_glob( + src.shard_rel_glob = std::filesystem::path{workdir.DistilledCorpusFiles().AllShardsGlob()} - .filename()); - src.set_sampled_fraction(1.0); - SeedCorpusDestination &dst = *seed_corpus_config.mutable_destination(); - dst.set_dir_path(env.workdir); + .filename(); + src.sampled_fraction_or_count = 1.0f; + SeedCorpusDestination &dst = seed_corpus_config.destination; + dst.dir_path = env.workdir; // We're seeding the current corpus files. - dst.set_shard_rel_glob( - std::filesystem::path{workdir.CorpusFiles().AllShardsGlob()}.filename()); - dst.set_shard_index_digits(WorkDir::kDigitsInShardIndex); - dst.set_num_shards(env.num_threads); + dst.shard_rel_glob = + std::filesystem::path{workdir.CorpusFiles().AllShardsGlob()}.filename(); + dst.shard_index_digits = WorkDir::kDigitsInShardIndex; + dst.num_shards = env.num_threads; return seed_corpus_config; }
diff --git a/centipede/seed_corpus_config.proto b/centipede/seed_corpus_config.proto index 8abc201..44e1ae7 100644 --- a/centipede/seed_corpus_config.proto +++ b/centipede/seed_corpus_config.proto
@@ -18,7 +18,7 @@ syntax = "proto3"; -package centipede; +package centipede.proto; // Describes a seed corpus source as a set of directories matching a glob in // combination with a relative shard file glob searched under each of those
diff --git a/centipede/seed_corpus_config_proto_lib.cc b/centipede/seed_corpus_config_proto_lib.cc new file mode 100644 index 0000000..3a7a00c --- /dev/null +++ b/centipede/seed_corpus_config_proto_lib.cc
@@ -0,0 +1,135 @@ +// Copyright 2024 The Centipede Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "./centipede/seed_corpus_config_proto_lib.h" + +#include <filesystem> // NOLINT +#include <string> +#include <string_view> +#include <utility> +#include <variant> + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "./centipede/seed_corpus_config.pb.h" +#include "./centipede/seed_corpus_maker_lib.h" +#include "./centipede/workdir.h" +#include "./common/logging.h" +#include "./common/remote_file.h" +#include "./common/status_macros.h" +#include "google/protobuf/text_format.h" + +namespace centipede { + +namespace fs = std::filesystem; + +absl::StatusOr<proto::SeedCorpusConfig> ResolveSeedCorpusConfigProto( // + std::string_view config_spec, // + std::string_view override_out_dir) { + std::string config_str; + std::string base_dir; + + if (config_spec.empty()) { + return absl::InvalidArgumentError( + "Unable to ResolveSeedCorpusConfig() with empty config_spec"); + } + + if (RemotePathExists(config_spec)) { + LOG(INFO) << "Config spec points at an existing file; trying to parse " + "textproto config from it: " + << VV(config_spec); + RETURN_IF_NOT_OK(RemoteFileGetContents(config_spec, config_str)); + LOG(INFO) << "Raw config read from file:\n" << config_str; + base_dir = std::filesystem::path{config_spec}.parent_path(); + } else { + LOG(INFO) << "Config 
spec is not a file, or file doesn't exist; trying to " + "parse textproto config verbatim: " + << VV(config_spec); + config_str = config_spec; + base_dir = fs::current_path(); + } + + proto::SeedCorpusConfig config; + if (!google::protobuf::TextFormat::ParseFromString(config_str, &config)) { + return absl::InvalidArgumentError( + absl::StrCat("Unable to parse config_str: ", config_str)); + } + if (config.sources_size() > 0 != config.has_destination()) { + return absl::InvalidArgumentError( + absl::StrCat("Non-empty config must have both source(s) and " + "destination, config_spec: ", + config_spec, ", config: ", config)); + } + LOG(INFO) << "Parsed config:\n" << config; + + // Resolve relative `source.dir_glob`s in the config to absolute ones. + for (auto& src : *config.mutable_sources()) { + auto* dir = src.mutable_dir_glob(); + if (dir->empty() || !fs::path{*dir}.is_absolute()) { + *dir = fs::path{base_dir} / *dir; + } + } + + // Set `destination.dir_path` to `override_out_dir`, if the latter is + // non-empty, or resolve a relative `destination.dir_path` to an absolute one. 
+ if (config.has_destination()) { + auto* dir = config.mutable_destination()->mutable_dir_path(); + if (!override_out_dir.empty()) { + *dir = override_out_dir; + } else if (dir->empty() || !fs::path{*dir}.is_absolute()) { + *dir = fs::path{base_dir} / *dir; + } + } + + if (config.destination().shard_index_digits() == 0) { + config.mutable_destination()->set_shard_index_digits( + WorkDir::kDigitsInShardIndex); + } + + LOG(INFO) << "Resolved config:\n" << config; + + return config; +} + +SeedCorpusConfig CreateSeedCorpusConfigFromProto( + const proto::SeedCorpusConfig& config_proto) { + SeedCorpusConfig config; + for (const auto& source_proto : config_proto.sources()) { + SeedCorpusSource source; + source.dir_glob = source_proto.dir_glob(); + source.num_recent_dirs = source_proto.num_recent_dirs(); + source.shard_rel_glob = source_proto.shard_rel_glob(); + switch (source_proto.sample_size_case()) { + case proto::SeedCorpusSource::kSampledFraction: + source.sampled_fraction_or_count = source_proto.sampled_fraction(); + break; + case proto::SeedCorpusSource::kSampledCount: + source.sampled_fraction_or_count = source_proto.sampled_count(); + break; + case proto::SeedCorpusSource::SAMPLE_SIZE_NOT_SET: + break; + } + config.sources.push_back(std::move(source)); + } + config.destination.dir_path = config_proto.destination().dir_path(); + config.destination.shard_rel_glob = + config_proto.destination().shard_rel_glob(); + config.destination.shard_index_digits = + config_proto.destination().shard_index_digits(); + config.destination.num_shards = config_proto.destination().num_shards(); + return config; +} + +} // namespace centipede
diff --git a/centipede/seed_corpus_config_proto_lib.h b/centipede/seed_corpus_config_proto_lib.h new file mode 100644 index 0000000..3a494c7 --- /dev/null +++ b/centipede/seed_corpus_config_proto_lib.h
@@ -0,0 +1,44 @@ +// Copyright 2024 The Centipede Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef THIRD_PARTY_CENTIPEDE_SEED_CORPUS_CONFIG_PROTO_LIB_H_ +#define THIRD_PARTY_CENTIPEDE_SEED_CORPUS_CONFIG_PROTO_LIB_H_ + +#include <string_view> + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "./centipede/seed_corpus_config.pb.h" +#include "./centipede/seed_corpus_maker_lib.h" + +namespace centipede { + +// If a file with `config_spec` path exists, tries to parse it as a +// `SeedCorpusConfig` textproto. Otherwise, tries to parse `config_spec` as a +// verbatim `SeedCorpusConfig` textproto. Resolves any relative paths and globs +// in the config fields to absolute ones, using as the base dir either the +// file's parent dir (if `config_spec` is a file) or the current dir otherwise. +// If `override_out_dir` is non-empty, it overrides `destination.dir_path` in +// the resolved config. +absl::StatusOr<proto::SeedCorpusConfig> ResolveSeedCorpusConfigProto( // + std::string_view config_spec, // + std::string_view override_out_dir = ""); + +// Creates the native `SeedCorpusConfig` from `config_proto`. +SeedCorpusConfig CreateSeedCorpusConfigFromProto( + const proto::SeedCorpusConfig& config_proto); + +} // namespace centipede + +#endif // THIRD_PARTY_CENTIPEDE_SEED_CORPUS_CONFIG_PROTO_LIB_H_
diff --git a/centipede/seed_corpus_config_proto_lib_test.cc b/centipede/seed_corpus_config_proto_lib_test.cc new file mode 100644 index 0000000..a6aa722 --- /dev/null +++ b/centipede/seed_corpus_config_proto_lib_test.cc
@@ -0,0 +1,132 @@ +// Copyright 2024 The Centipede Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "./centipede/seed_corpus_config_proto_lib.h" + +#include <cstddef> +#include <sstream> + +#include "gtest/gtest.h" +#include "./fuzztest/fuzztest.h" +#include "absl/log/check.h" +#include "absl/strings/substitute.h" +#include "./centipede/seed_corpus_config.pb.h" +#include "./centipede/workdir.h" +#include "./common/logging.h" // IWYU pragma: keep +#include "./common/status_macros.h" +#include "./common/test_util.h" +#include "google/protobuf/text_format.h" +#include "google/protobuf/util/message_differencer.h" + +namespace centipede { +namespace { + +using ::google::protobuf::TextFormat; +using ::google::protobuf::util::DefaultFieldComparator; +using ::google::protobuf::util::MessageDifferencer; + +inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex; + +proto::SeedCorpusConfig ParseSeedCorpusConfigProto( + std::string_view config_str) { + proto::SeedCorpusConfig config_proto; + CHECK(TextFormat::ParseFromString(config_str, &config_proto)); + return config_proto; +} + +std::string PrintSeedCorpusConfigProtoToString( + const proto::SeedCorpusConfig& config_proto) { + std::string config_str; + TextFormat::PrintToString(config_proto, &config_str); + return config_str; +} + +TEST(SeedCorpusMakerLibTest, ResolveConfig) { + const std::string test_dir = GetTestTempDir(test_info_->name()); + + // `ResolveSeedCorpusConfig()` should use 
the CWD to resolve relative paths. + chdir(test_dir.c_str()); + + constexpr size_t kNumShards = 3; + constexpr std::string_view kSrcSubDir = "src/dir"; + constexpr std::string_view kDstSubDir = "dest/dir"; + const std::string_view kConfigStr = R"pb( + sources { + dir_glob: "./$0" + shard_rel_glob: "corpus.*" + num_recent_dirs: 1 + sampled_fraction: 0.5 + } + destination { + # + dir_path: "./$1" + shard_rel_glob: "corpus.*" + num_shards: $2 + } + )pb"; + const std::string_view kExpectedConfigStr = R"pb( + sources { + dir_glob: "$0/./$1" + shard_rel_glob: "corpus.*" + num_recent_dirs: 1 + sampled_fraction: 0.5 + } + destination { + dir_path: "$0/./$2" + shard_rel_glob: "corpus.*" + num_shards: $3 + shard_index_digits: $4 + } + )pb"; + + const proto::SeedCorpusConfig resolved_config_proto = + ValueOrDie(ResolveSeedCorpusConfigProto( // + absl::Substitute(kConfigStr, kSrcSubDir, kDstSubDir, kNumShards))); + + const proto::SeedCorpusConfig expected_config_proto = + ParseSeedCorpusConfigProto( // + absl::Substitute(kExpectedConfigStr, test_dir, kSrcSubDir, kDstSubDir, + kNumShards, kIdxDigits)); + + ASSERT_EQ(PrintSeedCorpusConfigProtoToString(resolved_config_proto), + PrintSeedCorpusConfigProtoToString(expected_config_proto)); +} + +void SeedCorpusConfigProtoConversionRoundTrip( + const proto::SeedCorpusConfig& config_proto) { + std::ostringstream os; + os << CreateSeedCorpusConfigFromProto(config_proto); + const std::string stringified_config = os.str(); + const proto::SeedCorpusConfig parsed_config_proto = + ParseSeedCorpusConfigProto(stringified_config); + MessageDifferencer diff; + diff.set_message_field_comparison(MessageDifferencer::EQUIVALENT); + DefaultFieldComparator comparator; + comparator.set_treat_nan_as_equal(true); + comparator.set_float_comparison( + DefaultFieldComparator::FloatComparison::APPROXIMATE); + comparator.SetDefaultFractionAndMargin(0.0001, 0.0001); + diff.set_field_comparator(&comparator); + std::string diff_out; + 
diff.ReportDifferencesToString(&diff_out); + const bool is_equal = diff.Compare(config_proto, parsed_config_proto); + ASSERT_TRUE(is_equal) << config_proto << " is different than " + << parsed_config_proto << ": " << diff_out; +} + +FUZZ_TEST(SeedCorpusConfigProtoLibFuzzTest, + SeedCorpusConfigProtoConversionRoundTrip); + +} // namespace +} // namespace centipede
diff --git a/centipede/seed_corpus_maker.cc b/centipede/seed_corpus_maker.cc index 4cc6a45..fab1352 100644 --- a/centipede/seed_corpus_maker.cc +++ b/centipede/seed_corpus_maker.cc
@@ -15,16 +15,45 @@ #include <cstdlib> #include <filesystem> // NOLINT #include <string> +#include <string_view> #include "absl/base/nullability.h" #include "absl/flags/flag.h" #include "absl/log/check.h" #include "absl/log/log.h" +#include "absl/status/status.h" #include "./centipede/config_init.h" +#include "./centipede/seed_corpus_config_proto_lib.h" #include "./centipede/seed_corpus_maker_flags.h" #include "./centipede/seed_corpus_maker_lib.h" #include "./centipede/util.h" #include "./common/remote_file.h" +#include "./common/status_macros.h" + +namespace centipede { +namespace { + +absl::Status GenerateSeedCorpusFromConfigProto( // + std::string_view config_spec, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // + std::string_view override_out_dir) { + // Resolve the config. + ASSIGN_OR_RETURN_IF_NOT_OK( + const proto::SeedCorpusConfig config_proto, + ResolveSeedCorpusConfigProto(config_spec, override_out_dir)); + if (config_proto.sources_size() == 0 || !config_proto.has_destination()) { + LOG(WARNING) << "Config is empty: skipping seed corpus generation"; + return absl::OkStatus(); + } + RETURN_IF_NOT_OK(GenerateSeedCorpusFromConfig( // + CreateSeedCorpusConfigFromProto(config_proto), coverage_binary_name, + coverage_binary_hash)); + return absl::OkStatus(); +} + +} // namespace +} // namespace centipede int main(int argc, absl::Nonnull<char**> argv) { (void)centipede::config::InitRuntime(argc, argv); @@ -46,7 +75,7 @@ << " from actual file at --coverage_binary_path=" << binary_path; } - QCHECK_OK(centipede::GenerateSeedCorpusFromConfig( // + QCHECK_OK(centipede::GenerateSeedCorpusFromConfigProto( // config, binary_name, binary_hash, override_out_dir)); return EXIT_SUCCESS;
diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index 4ebd677..ce73a4b 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc
@@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// The Centipede seed corpus maker. Following the input text proto config in the -// ./seed_corpus_config.proto format, selects a sample of fuzzing inputs from N +// The Centipede seed corpus maker. It selects a sample of fuzzing inputs from N // Centipede workdirs and writes them out to a new set of Centipede corpus file // shards. @@ -23,16 +22,20 @@ #include <atomic> #include <cmath> #include <cstddef> +#include <cstdint> #include <cstdio> #include <cstdlib> #include <filesystem> // NOLINT #include <functional> +#include <iostream> #include <iterator> #include <memory> #include <numeric> +#include <sstream> #include <string> #include <string_view> #include <utility> +#include <variant> #include <vector> #include "absl/log/check.h" @@ -40,6 +43,7 @@ #include "absl/random/random.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/escaping.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" @@ -48,7 +52,6 @@ #include "./centipede/corpus_io.h" #include "./centipede/feature.h" #include "./centipede/rusage_profiler.h" -#include "./centipede/seed_corpus_config.pb.h" #include "./centipede/thread_pool.h" #include "./centipede/util.h" #include "./centipede/workdir.h" @@ -57,7 +60,6 @@ #include "./common/logging.h" #include "./common/remote_file.h" #include "./common/status_macros.h" -#include "google/protobuf/text_format.h" // TODO(ussuri): Implement a smarter on-the-fly sampling to avoid having to // load all of a source's elements into RAM only to pick some of them. 
That @@ -81,72 +83,36 @@ } // namespace -absl::StatusOr<SeedCorpusConfig> ResolveSeedCorpusConfig( // - std::string_view config_spec, // - std::string_view override_out_dir) { - std::string config_str; - std::string base_dir; - - if (config_spec.empty()) { - return absl::InvalidArgumentError( - "Unable to ResolveSeedCorpusConfig() with empty config_spec"); +std::ostream& operator<<(std::ostream& os, const SeedCorpusSource& source) { + os << "dir_glob: \"" << absl::CEscape(source.dir_glob) + << "\" num_recent_dirs: " << source.num_recent_dirs + << " shard_rel_glob: \"" << absl::CEscape(source.shard_rel_glob) << "\""; + if (std::holds_alternative<float>(source.sampled_fraction_or_count)) { + os << " sampled_fraction: " + << std::get<float>(source.sampled_fraction_or_count); + } else if (std::holds_alternative<uint32_t>( + source.sampled_fraction_or_count)) { + os << " sampled_count: " + << std::get<uint32_t>(source.sampled_fraction_or_count); } + return os; +} - if (RemotePathExists(config_spec)) { - LOG(INFO) << "Config spec points at an existing file; trying to parse " - "textproto config from it: " - << VV(config_spec); - RETURN_IF_NOT_OK(RemoteFileGetContents(config_spec, config_str)); - LOG(INFO) << "Raw config read from file:\n" << config_str; - base_dir = std::filesystem::path{config_spec}.parent_path(); - } else { - LOG(INFO) << "Config spec is not a file, or file doesn't exist; trying to " - "parse textproto config verbatim: " - << VV(config_spec); - config_str = config_spec; - base_dir = fs::current_path(); +std::ostream& operator<<(std::ostream& os, + const SeedCorpusDestination& destination) { + os << "dir_path: \"" << absl::CEscape(destination.dir_path) + << "\" shard_rel_glob: \"" << absl::CEscape(destination.shard_rel_glob) + << "\" shard_index_digits: " << destination.shard_index_digits + << " num_shards: " << destination.num_shards; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SeedCorpusConfig& config) { + for (const auto& 
source : config.sources) { + os << "sources { " << source << " }"; } - - SeedCorpusConfig config; - if (!google::protobuf::TextFormat::ParseFromString(config_str, &config)) { - return absl::InvalidArgumentError( - absl::StrCat("Unable to parse config_str: ", config_str)); - } - if (config.sources_size() > 0 != config.has_destination()) { - return absl::InvalidArgumentError( - absl::StrCat("Non-empty config must have both source(s) and " - "destination, config_spec: ", - config_spec, ", config: ", config)); - } - LOG(INFO) << "Parsed config:\n" << config; - - // Resolve relative `source.dir_glob`s in the config to absolute ones. - for (auto& src : *config.mutable_sources()) { - auto* dir = src.mutable_dir_glob(); - if (dir->empty() || !fs::path{*dir}.is_absolute()) { - *dir = fs::path{base_dir} / *dir; - } - } - - // Set `destination.dir_path` to `override_out_dir`, if the latter is - // non-empty, or resolve a relative `destination.dir_path` to an absolute one. - if (config.has_destination()) { - auto* dir = config.mutable_destination()->mutable_dir_path(); - if (!override_out_dir.empty()) { - *dir = override_out_dir; - } else if (dir->empty() || !fs::path{*dir}.is_absolute()) { - *dir = fs::path{base_dir} / *dir; - } - } - - if (config.destination().shard_index_digits() == 0) { - config.mutable_destination()->set_shard_index_digits( - WorkDir::kDigitsInShardIndex); - } - - LOG(INFO) << "Resolved config:\n" << config; - - return config; + os << "destination { " << config.destination << " }"; + return os; } // TODO(ussuri): Refactor into smaller functions. @@ -173,14 +139,14 @@ // `source.num_recent_dirs()` most recent ones. std::vector<std::string> src_dirs; - RETURN_IF_NOT_OK(RemoteGlobMatch(source.dir_glob(), src_dirs)); + RETURN_IF_NOT_OK(RemoteGlobMatch(source.dir_glob, src_dirs)); LOG(INFO) << "Found " << src_dirs.size() << " corpus dir(s) matching " - << source.dir_glob(); + << source.dir_glob; // Sort in the ascending lexicographical order. 
We expect that dir names // contain timestamps and therefore will be sorted from oldest to newest. std::sort(src_dirs.begin(), src_dirs.end(), std::less<std::string>()); - if (source.num_recent_dirs() < src_dirs.size()) { - src_dirs.erase(src_dirs.begin(), src_dirs.end() - source.num_recent_dirs()); + if (source.num_recent_dirs < src_dirs.size()) { + src_dirs.erase(src_dirs.begin(), src_dirs.end() - source.num_recent_dirs); LOG(INFO) << "Selected " << src_dirs.size() << " corpus dir(s)"; } @@ -188,7 +154,7 @@ std::vector<std::string> corpus_shard_fnames; for (const auto& dir : src_dirs) { - const std::string shards_glob = fs::path{dir} / source.shard_rel_glob(); + const std::string shards_glob = fs::path{dir} / source.shard_rel_glob; // NOTE: `RemoteGlobMatch` appends to the output list. const auto prev_num_shards = corpus_shard_fnames.size(); RETURN_IF_NOT_OK(RemoteGlobMatch(shards_glob, corpus_shard_fnames)); @@ -196,10 +162,10 @@ << " shard(s) matching " << shards_glob; } LOG(INFO) << "Found " << corpus_shard_fnames.size() - << " shard(s) total in source " << source.dir_glob(); + << " shard(s) total in source " << source.dir_glob; if (corpus_shard_fnames.empty()) { - LOG(WARNING) << "Skipping empty source " << source.dir_glob(); + LOG(WARNING) << "Skipping empty source " << source.dir_glob; return absl::OkStatus(); } @@ -288,27 +254,25 @@ LOG(INFO) << "Read total of " << src_elts.size() << " elements (" << src_num_features << " with features) from source " - << source.dir_glob(); + << source.dir_glob; // Extract a sample of the elements of the size specified in - // `source.sample_size()`. + // `source.sample_size`. 
size_t sample_size = 0; - switch (source.sample_size_case()) { - case SeedCorpusSource::kSampledFraction: - if (source.sampled_fraction() <= 0.0 || source.sampled_fraction() > 1) { - return absl::InvalidArgumentError( - absl::StrCat("sampled_fraction must be in (0, 1], got ", - source.sampled_fraction())); - } - sample_size = std::llrint(src_elts.size() * source.sampled_fraction()); - break; - case SeedCorpusSource::kSampledCount: - sample_size = std::min<size_t>(src_elts.size(), source.sampled_count()); - break; - case SeedCorpusSource::SAMPLE_SIZE_NOT_SET: - sample_size = src_elts.size(); - break; + if (std::holds_alternative<float>(source.sampled_fraction_or_count)) { + const auto& fraction = std::get<float>(source.sampled_fraction_or_count); + if (fraction <= 0.0 || fraction > 1) { + return absl::InvalidArgumentError( + absl::StrCat("sampled_fraction must be in (0, 1], got ", fraction)); + } + sample_size = std::llrint(src_elts.size() * fraction); + } else if (std::holds_alternative<uint32_t>( + source.sampled_fraction_or_count)) { + const auto count = std::get<uint32_t>(source.sampled_fraction_or_count); + sample_size = std::min<size_t>(src_elts.size(), count); + } else { + sample_size = src_elts.size(); } if (sample_size < src_elts.size()) { @@ -357,7 +321,7 @@ "Collected seed corpus turned out to be empty: verify config / " "sources"); } - if (destination.dir_path().empty()) { + if (destination.dir_path.empty()) { return absl::InvalidArgumentError( "Unable to write seed corpus to empty destination path"); } @@ -371,21 +335,21 @@ << " seed corpus elements to destination:\n" << destination; - if (destination.num_shards() <= 0) { + if (destination.num_shards <= 0) { return absl::InvalidArgumentError( "Requested number of destination shards must be > 0"); } - if (!absl::StrContains(destination.shard_rel_glob(), "*")) { + if (!absl::StrContains(destination.shard_rel_glob, "*")) { return absl::InvalidArgumentError( absl::StrCat("Destination shard pattern must 
contain '*', got ", - destination.shard_rel_glob())); + destination.shard_rel_glob)); } // Compute shard sizes. If the elements can't be evenly divided between the // requested number of shards, distribute the N excess elements between the // first N shards. const size_t num_shards = - std::min<size_t>(destination.num_shards(), elements.size()); + std::min<size_t>(destination.num_shards, elements.size()); CHECK_GT(num_shards, 0); const size_t shard_size = elements.size() / num_shards; std::vector<size_t> shard_sizes(num_shards, shard_size); @@ -419,11 +383,11 @@ // them, and possibly retire // `SeedCorpusDestination::shard_index_digits`). const std::string shard_idx = - absl::StrFormat("%0*d", destination.shard_index_digits(), shard); - const std::string corpus_rel_fname = absl::StrReplaceAll( - destination.shard_rel_glob(), {{"*", shard_idx}}); + absl::StrFormat("%0*d", destination.shard_index_digits, shard); + const std::string corpus_rel_fname = + absl::StrReplaceAll(destination.shard_rel_glob, {{"*", shard_idx}}); const std::string corpus_fname = - fs::path{destination.dir_path()} / corpus_rel_fname; + fs::path{destination.dir_path} / corpus_rel_fname; const auto work_dir = WorkDir::FromCorpusShardPath( // corpus_fname, coverage_binary_name, coverage_binary_hash); @@ -450,9 +414,9 @@ << ShardPathsForLogging(corpus_fname, features_fname); // Features files are always saved in a subdir of the workdir - // (== `destination.dir_path()` here), which might not exist yet, so we + // (== `destination.dir_path` here), which might not exist yet, so we // create it. Corpus files are saved in the workdir directly, but we - // also create it in case `destination.shard_rel_glob()` contains some + // also create it in case `destination.shard_rel_glob` contains some // dirs (not really intended for that, but the end-user may do that). 
for (const auto& fname : {corpus_fname, features_fname}) { if (!fname.empty()) { @@ -516,59 +480,42 @@ LOG(INFO) << "Wrote total of " << elements.size() << " elements (" << dst_elts_with_features << " with precomputed features) to destination " - << destination.dir_path(); - return absl::OkStatus(); -} - -absl::Status GenerateSeedCorpusFromConfig( // - std::string_view config_spec, // - std::string_view coverage_binary_name, // - std::string_view coverage_binary_hash, // - std::string_view override_out_dir) { - // Resolve the config. - ASSIGN_OR_RETURN_IF_NOT_OK( - const SeedCorpusConfig config, - ResolveSeedCorpusConfig(config_spec, override_out_dir)); - if (config.sources_size() == 0 || !config.has_destination()) { - LOG(WARNING) << "Config is empty: skipping seed corpus generation"; - return absl::OkStatus(); - } - RETURN_IF_NOT_OK(GenerateSeedCorpusFromConfig( // - config, coverage_binary_name, coverage_binary_hash, override_out_dir)); + << destination.dir_path; return absl::OkStatus(); } absl::Status GenerateSeedCorpusFromConfig( // const SeedCorpusConfig& config, // std::string_view coverage_binary_name, // - std::string_view coverage_binary_hash, // - std::string_view override_out_dir) { + std::string_view coverage_binary_hash) { // Pre-create the destination dir early to catch possible misspellings etc. - if (!RemotePathExists(config.destination().dir_path())) { - RETURN_IF_NOT_OK(RemoteMkdir(config.destination().dir_path())); + if (!RemotePathExists(config.destination.dir_path)) { + RETURN_IF_NOT_OK(RemoteMkdir(config.destination.dir_path)); } // Dump the config to the debug info dir in the destination. 
const WorkDir workdir{ - config.destination().dir_path(), + config.destination.dir_path, coverage_binary_name, coverage_binary_hash, /*my_shard_index=*/0, }; const std::filesystem::path debug_info_dir = workdir.DebugInfoDirPath(); RETURN_IF_NOT_OK(RemoteMkdir(debug_info_dir.c_str())); + std::ostringstream os; + os << config; RETURN_IF_NOT_OK(RemoteFileSetContents( - (debug_info_dir / "seeding.cfg").c_str(), absl::StrCat(config))); + (debug_info_dir / "seeding.cfg").c_str(), os.str())); InputAndFeaturesVec elements; // Read and sample elements from the sources. - for (const auto& source : config.sources()) { + for (const auto& source : config.sources) { RETURN_IF_NOT_OK(SampleSeedCorpusElementsFromSource( // source, coverage_binary_name, coverage_binary_hash, elements)); } LOG(INFO) << "Sampled " << elements.size() << " elements from " - << config.sources_size() << " seed corpus source(s)"; + << config.sources.size() << " seed corpus source(s)"; // Write the sampled elements to the destination. if (elements.empty()) { @@ -577,7 +524,7 @@ } else { RETURN_IF_NOT_OK(WriteSeedCorpusElementsToDestination( // elements, coverage_binary_name, coverage_binary_hash, - config.destination())); + config.destination)); LOG(INFO) << "Wrote " << elements.size() << " elements to seed corpus destination"; }
diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index fe1736d..8299818 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h
@@ -15,32 +15,62 @@ #ifndef THIRD_PARTY_CENTIPEDE_SEED_CORPUS_MAKER_LIB_H_ #define THIRD_PARTY_CENTIPEDE_SEED_CORPUS_MAKER_LIB_H_ +#include <iostream> #include <string_view> #include <utility> +#include <variant> #include <vector> #include "absl/status/status.h" #include "absl/status/statusor.h" #include "./centipede/feature.h" -#include "./centipede/seed_corpus_config.pb.h" #include "./common/defs.h" namespace centipede { +// Native struct used by the seed corpus library for seed corpus source. +// +// Currently this is mirroring the `proto::SeedCorpusSource` proto. But in the +// future it may change with the core seeding API. +struct SeedCorpusSource { + std::string dir_glob; + uint32_t num_recent_dirs; + std::string shard_rel_glob; + std::variant<float, uint32_t> sampled_fraction_or_count; + + friend std::ostream& operator<<(std::ostream& os, + const SeedCorpusSource& source); +}; + +// Native struct used by the seed corpus library for seed corpus destination. +// +// Currently this is mirroring the `proto::SeedCorpusDestination` proto. But in +// the future it may change with the core seeding API. +struct SeedCorpusDestination { + std::string dir_path; + std::string shard_rel_glob; + uint32_t shard_index_digits; + uint32_t num_shards; + + friend std::ostream& operator<<(std::ostream& os, + const SeedCorpusDestination& destination); +}; + +// Native struct used by the seed corpus library for seed corpus configuration. +// +// Currently this is mirroring the `proto::SeedCorpusConfig` proto. But in the +// future it may change with the core seeding API. 
+struct SeedCorpusConfig { + std::vector<SeedCorpusSource> sources; + SeedCorpusDestination destination; + + friend std::ostream& operator<<(std::ostream& os, + const SeedCorpusConfig& config); +}; + using InputAndFeatures = std::pair<ByteArray, FeatureVec>; using InputAndFeaturesVec = std::vector<InputAndFeatures>; -// If a file with `config_spec` path exists, tries to parse it as a -// `SeedCorpusConfig` textproto. Otherwise, tries to parse `config_spec` as a -// verbatim `SeedCorpusConfig` textproto. Resolves any relative paths and globs -// in the config fields to absolute ones, using as the base dir either the -// file's parent dir (if `config_spec` is a file) or the current dir otherwise. -// If `override_out_dir` is non-empty, it overrides `destination.dir_path` in -// the resolved config. -absl::StatusOr<SeedCorpusConfig> ResolveSeedCorpusConfig( // - std::string_view config_spec, // - std::string_view override_out_dir = ""); - // Extracts a sample of corpus elements from `source` and appends the results to // `elements`. `source` defines the locations of the corpus shards and the size // of the sample. @@ -74,14 +104,9 @@ const SeedCorpusDestination& destination); // Reads and samples seed corpus elements from all the sources and writes the -// results to the destination, as defined in `config_spec`. `config_spec` can be -// either a `silifuzz.ccmp.SeedCorpusConfig` textproto file (local or remote) or -// a verbatim `silifuzz.ccmp.SeedCorpusConfig` string. The paths and globs in -// the proto can be relative paths: in that case, they are resolved to absolute -// using either the file's parent dir (if `config_spec` is a file) or the -// current dir (if `config_spec` is a verbatim string) as the base dir. If -// `override_out_dir` is non-empty, it overrides `destination.dir_path` -// specified in `config_spec`. +// results to the destination, as defined in `config`. 
The paths and globs in +// `config` can be relative paths: in that case, they are resolved to absolute +// using the current dir as the base dir. // // `coverage_binary_name` should be the basename of the coverage binary for // which the seed corpus is to be created, and the `coverage_binary_hash` should @@ -90,17 +115,9 @@ // <coverage_binary_name>-<coverage_binary_hash> subdir of the source to the // same subdir of the destination. absl::Status GenerateSeedCorpusFromConfig( // - std::string_view config_spec, // - std::string_view coverage_binary_name, // - std::string_view coverage_binary_hash, // - std::string_view override_out_dir = ""); - -// Same as above but accepts a `SeedCorpusConfig` directly. -absl::Status GenerateSeedCorpusFromConfig( // const SeedCorpusConfig& config, // std::string_view coverage_binary_name, // - std::string_view coverage_binary_hash, // - std::string_view override_out_dir = ""); + std::string_view coverage_binary_hash); } // namespace centipede
diff --git a/centipede/seed_corpus_maker_lib_test.cc b/centipede/seed_corpus_maker_lib_test.cc index 22a7aa2..8fa3660 100644 --- a/centipede/seed_corpus_maker_lib_test.cc +++ b/centipede/seed_corpus_maker_lib_test.cc
@@ -28,6 +28,7 @@ #include "absl/strings/substitute.h" #include "./centipede/feature.h" #include "./centipede/seed_corpus_config.pb.h" +#include "./centipede/seed_corpus_config_proto_lib.h" #include "./centipede/workdir.h" #include "./common/logging.h" // IWYU pragma: keep #include "./common/status_macros.h" @@ -38,23 +39,18 @@ namespace { namespace fs = std::filesystem; -using google::protobuf::TextFormat; -using testing::IsSubsetOf; + +using ::google::protobuf::TextFormat; +using ::testing::IsSubsetOf; inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex; enum ShardType { kNormal, kDistilled }; -SeedCorpusConfig ParseSeedCorpusConfig(std::string_view config_str) { - SeedCorpusConfig config; - CHECK(TextFormat::ParseFromString(config_str, &config)); - return config; -} - -std::string PrintSeedCorpusConfigToString(const SeedCorpusConfig& config) { - std::string config_str; - TextFormat::PrintToString(config, &config_str); - return config_str; +SeedCorpusConfig CreateTestSeedCorpusConfig(std::string_view config_str) { + proto::SeedCorpusConfig config_proto; + CHECK(TextFormat::ParseFromString(config_str, &config_proto)); + return CreateSeedCorpusConfigFromProto(config_proto); } void VerifyShardsExist( // @@ -103,59 +99,8 @@ << VV(workdir); } -TEST(SeedCorpusMakerLibTest, ResolveConfig) { - const std::string test_dir = GetTestTempDir(test_info_->name()); - - // `ResolveSeedCorpusConfig()` should use the CWD to resolve relative paths. 
- chdir(test_dir.c_str()); - - constexpr size_t kNumShards = 3; - constexpr std::string_view kSrcSubDir = "src/dir"; - constexpr std::string_view kDstSubDir = "dest/dir"; - const std::string_view kConfigStr = R"pb( - sources { - dir_glob: "./$0" - shard_rel_glob: "corpus.*" - num_recent_dirs: 1 - sampled_fraction: 0.5 - } - destination { - # - dir_path: "./$1" - shard_rel_glob: "corpus.*" - num_shards: $2 - } - )pb"; - const std::string_view kExpectedConfigStr = R"pb( - sources { - dir_glob: "$0/./$1" - shard_rel_glob: "corpus.*" - num_recent_dirs: 1 - sampled_fraction: 0.5 - } - destination { - dir_path: "$0/./$2" - shard_rel_glob: "corpus.*" - num_shards: $3 - shard_index_digits: $4 - } - )pb"; - - const SeedCorpusConfig resolved_config = - ValueOrDie(ResolveSeedCorpusConfig( // - absl::Substitute(kConfigStr, kSrcSubDir, kDstSubDir, kNumShards))); - - const SeedCorpusConfig expected_config = ParseSeedCorpusConfig( // - absl::Substitute(kExpectedConfigStr, test_dir, kSrcSubDir, kDstSubDir, - kNumShards, kIdxDigits)); - - ASSERT_EQ(PrintSeedCorpusConfigToString(resolved_config), - PrintSeedCorpusConfigToString(expected_config)); -} - TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) { const fs::path test_dir = GetTestTempDir(test_info_->name()); - // `ResolveSeedCorpusConfig()` should use the CWD to resolve relative paths. chdir(test_dir.c_str()); const InputAndFeaturesVec kElements = { @@ -171,7 +116,6 @@ constexpr std::string_view kCovHash = "hash"; constexpr std::string_view kRelDir1 = "dir/foo"; constexpr std::string_view kRelDir2 = "dir/bar"; - constexpr std::string_view kRelDir3 = "dir/kuq"; // Test `WriteSeedCorpusElementsToDestination()`. This also creates a seed // source for the subsequent tests. 
@@ -185,10 +129,10 @@ shard_index_digits: $3 } )pb"; - const SeedCorpusConfig config = ParseSeedCorpusConfig(absl::Substitute( + const SeedCorpusConfig config = CreateTestSeedCorpusConfig(absl::Substitute( kConfigStr, kRelDir1, kCovBin, kNumShards, kIdxDigits)); ASSERT_OK(WriteSeedCorpusElementsToDestination( // - kElements, kCovBin, kCovHash, config.destination())); + kElements, kCovBin, kCovHash, config.destination)); const std::string workdir = (test_dir / kRelDir1).c_str(); ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled)); @@ -207,11 +151,11 @@ )pb"; for (const double fraction : {1.0, 0.5, 0.2}) { - const SeedCorpusConfig config = ParseSeedCorpusConfig( + const SeedCorpusConfig config = CreateTestSeedCorpusConfig( absl::Substitute(kConfigStr, kRelDir1, kCovBin, fraction)); InputAndFeaturesVec elements; ASSERT_OK(SampleSeedCorpusElementsFromSource( // - config.sources(0), kCovBin, kCovHash, elements)); + config.sources[0], kCovBin, kCovHash, elements)); // NOTE: 1.0 has a precise double representation, so `==` is fine. 
ASSERT_EQ(elements.size(), std::llrint(kElements.size() * fraction)) << VV(fraction); @@ -238,26 +182,18 @@ } )pb"; - const std::string config_str = absl::Substitute( // - kConfigStr, kRelDir1, kCovBin, kRelDir2, kNumShards, kIdxDigits); + const SeedCorpusConfig config = + CreateTestSeedCorpusConfig(absl::Substitute( // + kConfigStr, kRelDir1, kCovBin, kRelDir2, kNumShards, kIdxDigits)); { ASSERT_OK(GenerateSeedCorpusFromConfig( // - config_str, kCovBin, kCovHash, "")); + config, kCovBin, kCovHash)); const std::string workdir = (test_dir / kRelDir2).c_str(); ASSERT_NO_FATAL_FAILURE(VerifyDumpedConfig(workdir, kCovBin, kCovHash)); ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // workdir, kCovBin, kCovHash, kNumShards, ShardType::kNormal)); } - - { - ASSERT_OK(GenerateSeedCorpusFromConfig( // - config_str, kCovBin, kCovHash, kRelDir3)); - const std::string workdir = (test_dir / kRelDir3).c_str(); - ASSERT_NO_FATAL_FAILURE(VerifyDumpedConfig(workdir, kCovBin, kCovHash)); - ASSERT_NO_FATAL_FAILURE(VerifyShardsExist( // - workdir, kCovBin, kCovHash, kNumShards, ShardType::kNormal)); - } } }
diff --git a/codelab/escaping_test.cc b/codelab/escaping_test.cc index 5e5310e..65caf4d 100644 --- a/codelab/escaping_test.cc +++ b/codelab/escaping_test.cc
@@ -15,7 +15,7 @@ #include "./escaping.h" #include "gtest/gtest.h" -#include "fuzztest/fuzztest.h" +#include "./fuzztest/fuzztest.h" namespace codelab { namespace {