#Centipede Add unit tests for seed corpus maker (redux) PiperOrigin-RevId: 582764763
diff --git a/centipede/BUILD b/centipede/BUILD index 57866ba..f24d8bb 100644 --- a/centipede/BUILD +++ b/centipede/BUILD
@@ -1533,6 +1533,23 @@ ], ) +cc_test( + name = "seed_corpus_maker_lib_test", + srcs = ["seed_corpus_maker_lib_test.cc"], + deps = [ + ":feature", + ":logging", + ":seed_corpus_config_cc_proto", + ":seed_corpus_maker_lib", + ":test_util", + ":workdir", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@com_google_protobuf//:protobuf", + ], +) + ################################################################################ # Other tests ################################################################################
diff --git a/centipede/seed_corpus_maker_lib.cc b/centipede/seed_corpus_maker_lib.cc index 5b46ccc..455518c 100644 --- a/centipede/seed_corpus_maker_lib.cc +++ b/centipede/seed_corpus_maker_lib.cc
@@ -17,6 +17,8 @@ // Centipede workdirs and writes them out to a new set of Centipede corpus file // shards. +#include "./centipede/seed_corpus_maker_lib.h" + #include <algorithm> #include <cmath> #include <cstddef> @@ -51,7 +53,6 @@ #include "./centipede/workdir.h" #include "google/protobuf/text_format.h" -// TODO(ussuri): Add unit tests. // TODO(ussuri): Implement a smarter on-the-fly sampling to avoid having to // load all of a source's elements into RAM only to pick some of them. That // would be trivial if the number of elements in a corpus file could be @@ -63,9 +64,6 @@ namespace fs = std::filesystem; -using InputAndFeatures = std::pair<ByteArray, FeatureVec>; -using InputAndFeaturesVec = std::vector<InputAndFeatures>; - SeedCorpusConfig ResolveSeedCorpusConfig( // std::string_view config_spec, // std::string_view override_out_dir) {
diff --git a/centipede/seed_corpus_maker_lib.h b/centipede/seed_corpus_maker_lib.h index 0d34880..3573116 100644 --- a/centipede/seed_corpus_maker_lib.h +++ b/centipede/seed_corpus_maker_lib.h
@@ -25,6 +25,9 @@ namespace centipede { +using InputAndFeatures = std::pair<ByteArray, FeatureVec>; +using InputAndFeaturesVec = std::vector<InputAndFeatures>; + // If a file with `config_spec` path exists, tries to parse it as a // `SeedCorpusConfig` textproto. Otherwise, tries to parse `config_spec` as a // verbatim `SeedCorpusConfig` textproto. Resolves any relative paths and globs @@ -51,7 +54,7 @@ const SeedCorpusSource& source, // std::string_view coverage_binary_name, // std::string_view coverage_binary_hash, // - std::vector<std::pair<ByteArray, FeatureVec>>& elements); + InputAndFeaturesVec& elements); // Writes seed corpus `elements` to `destination`. Any previously existing // corpus shard files matching `destination.shard_glob()` will be deleted @@ -62,10 +65,10 @@ // be the hash of that binary. The features in each `FeatureVec` of the // `elements` will be saved to a features shard file under // <coverage_binary_name>-<coverage_binary_hash> subdir of the destination. -void WriteSeedCorpusElementsToDestination( // - const std::vector<std::pair<ByteArray, FeatureVec>>& elements, // - std::string_view coverage_binary_name, // - std::string_view coverage_binary_hash, // +void WriteSeedCorpusElementsToDestination( // + const InputAndFeaturesVec& elements, // + std::string_view coverage_binary_name, // + std::string_view coverage_binary_hash, // const SeedCorpusDestination& destination); // Reads and samples seed corpus elements from all the sources and writes the
diff --git a/centipede/seed_corpus_maker_lib_test.cc b/centipede/seed_corpus_maker_lib_test.cc new file mode 100644 index 0000000..4b52676 --- /dev/null +++ b/centipede/seed_corpus_maker_lib_test.cc
@@ -0,0 +1,267 @@
+// Copyright 2023 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "./centipede/seed_corpus_maker_lib.h"
+
+#include <unistd.h>
+
+#include <cstddef>
+#include <filesystem>  // NOLINT
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/log/check.h"
+#include "absl/strings/substitute.h"
+#include "./centipede/feature.h"
+#include "./centipede/logging.h"  // IWYU pragma: keep
+#include "./centipede/seed_corpus_config.pb.h"
+#include "./centipede/test_util.h"
+#include "./centipede/workdir.h"
+#include "google/protobuf/text_format.h"
+
+namespace centipede {
+namespace {
+
+namespace fs = std::filesystem;
+using google::protobuf::TextFormat;
+using testing::IsSubsetOf;
+using testing::UnorderedElementsAreArray;
+
+inline constexpr auto kIdxDigits = WorkDir::kDigitsInShardIndex;
+
+// Tells `VerifyShardsExist()` which `WorkDir` files to check: the regular
+// corpus/features shards or the distilled ones.
+enum class ShardType { kNormal, kDistilled };
+
+// Parses `config_str` as a `SeedCorpusConfig` textproto. CHECK-fails if the
+// string cannot be parsed.
+SeedCorpusConfig ParseSeedCorpusConfig(std::string_view config_str) {
+  SeedCorpusConfig config;
+  CHECK(TextFormat::ParseFromString(config_str, &config));
+  return config;
+}
+
+// Prints `config` in the textproto format. CHECK-fails if printing fails, so
+// that a comparison of two printed configs never silently compares partial
+// output.
+std::string PrintSeedCorpusConfigToString(const SeedCorpusConfig& config) {
+  std::string config_str;
+  CHECK(TextFormat::PrintToString(config, &config_str));
+  return config_str;
+}
+
+// Asserts that the corpus and features shard files of the given `shard_type`
+// exist for shard indices [0, `num_shards`) and do not exist for the two
+// indices right after that. Uses fatal assertions, so wrap calls in
+// `ASSERT_NO_FATAL_FAILURE()` to stop the calling test on a failure.
+void VerifyShardsExist(            //
+    std::string_view workdir,      //
+    std::string_view binary_name,  //
+    std::string_view binary_hash,  //
+    size_t num_shards,             //
+    ShardType shard_type) {
+  const WorkDir wd{
+      std::string{workdir},
+      std::string{binary_name},
+      std::string{binary_hash},
+      /*my_shard_index=*/0,
+  };
+  const bool is_normal = shard_type == ShardType::kNormal;
+  const WorkDir::ShardedFileInfo corpus_files =
+      is_normal ? wd.CorpusFiles() : wd.DistilledCorpusFiles();
+  const WorkDir::ShardedFileInfo features_files =
+      is_normal ? wd.FeaturesFiles() : wd.DistilledFeaturesFiles();
+  // NOTE: `shard` is a `size_t` to match `num_shards` and avoid a
+  // signed/unsigned comparison.
+  for (size_t shard = 0; shard < num_shards + 2; ++shard) {
+    if (shard < num_shards) {
+      ASSERT_TRUE(fs::exists(corpus_files.ShardPath(shard)))
+          << VV(shard) << VV(corpus_files.ShardPath(shard));
+      ASSERT_TRUE(fs::exists(features_files.ShardPath(shard)))
+          << VV(shard) << VV(features_files.ShardPath(shard));
+    } else {
+      ASSERT_FALSE(fs::exists(corpus_files.ShardPath(shard)))
+          << VV(shard) << VV(corpus_files.ShardPath(shard));
+      ASSERT_FALSE(fs::exists(features_files.ShardPath(shard)))
+          << VV(shard) << VV(features_files.ShardPath(shard));
+    }
+  }
+}
+
+TEST(SeedCorpusMakerLibTest, ResolveConfig) {
+  const std::string test_dir = fs::canonical(GetTestTempDir());
+
+  // `ResolveSeedCorpusConfig()` should use the CWD to resolve relative paths.
+  // Fail the test right away if the CWD can't be changed: otherwise, the
+  // expectations below would be checked against some unrelated directory.
+  ASSERT_EQ(chdir(test_dir.c_str()), 0) << "Failed to chdir to " << test_dir;
+
+  constexpr size_t kNumShards = 3;
+  constexpr std::string_view kSrcSubDir = "src/dir";
+  constexpr std::string_view kDstSubDir = "dest/dir";
+  constexpr std::string_view kConfigStr = R"pb(
+    sources {
+      dir_glob: "./$0"
+      shard_rel_glob: "corpus.*"
+      num_recent_dirs: 1
+      sampled_fraction: 0.5
+    }
+    destination {
+      #
+      dir_path: "./$1"
+      shard_rel_glob: "corpus.*"
+      num_shards: $2
+    }
+  )pb";
+  constexpr std::string_view kExpectedConfigStr = R"pb(
+    sources {
+      dir_glob: "$0/./$1"
+      shard_rel_glob: "corpus.*"
+      num_recent_dirs: 1
+      sampled_fraction: 0.5
+    }
+    destination {
+      dir_path: "$0/./$2"
+      shard_rel_glob: "corpus.*"
+      num_shards: $3
+      shard_index_digits: $4
+    }
+  )pb";
+
+  const SeedCorpusConfig resolved_config = ResolveSeedCorpusConfig(  //
+      absl::Substitute(kConfigStr, kSrcSubDir, kDstSubDir, kNumShards));
+
+  const SeedCorpusConfig expected_config = ParseSeedCorpusConfig(  //
+      absl::Substitute(kExpectedConfigStr, test_dir, kSrcSubDir, kDstSubDir,
+                       kNumShards, kIdxDigits));
+
+  ASSERT_EQ(PrintSeedCorpusConfigToString(resolved_config),
+            PrintSeedCorpusConfigToString(expected_config));
+}
+
+TEST(SeedCorpusMakerLibTest, RoundTripWriteReadWrite) {
+  const fs::path test_dir = fs::canonical(GetTestTempDir());
+  // `ResolveSeedCorpusConfig()` should use the CWD to resolve relative paths.
+  // Fail the test right away if the CWD can't be changed: otherwise, the
+  // relative paths below would resolve against some unrelated directory.
+  ASSERT_EQ(chdir(test_dir.c_str()), 0) << "Failed to chdir to " << test_dir;
+
+  const InputAndFeaturesVec kElements = {
+      {{0}, {}},
+      {{1}, {feature_domains::kNoFeature}},
+      {{0, 1}, {0x11, 0x23}},
+      {{1, 2, 3}, {0x11, 0x23, 0xfe}},
+      {{3, 4, 5, 6}, {0x111, 0x234, 0x345, 0x56}},
+      {{5, 6, 7, 9}, {0x1111, 0x2345, 0x3456, 0x5678}},
+      {{7, 8, 9, 10, 111}, {0x11111, 0x23456, 0x34567, 0x56789, 0xffaf}},
+  };
+  constexpr std::string_view kCovBin = "bin";
+  constexpr std::string_view kCovHash = "hash";
+  constexpr std::string_view kRelDir1 = "dir/foo";
+  constexpr std::string_view kRelDir2 = "dir/bar";
+  constexpr std::string_view kRelDir3 = "dir/kuq";
+
+  // Test `WriteSeedCorpusElementsToDestination()`. This also creates a seed
+  // source for the subsequent tests.
+  {
+    constexpr size_t kNumShards = 2;
+    constexpr std::string_view kConfigStr = R"pb(
+      destination {
+        dir_path: "./$0"
+        shard_rel_glob: "distilled-$1.*"
+        num_shards: $2
+        shard_index_digits: $3
+      }
+    )pb";
+    const SeedCorpusConfig config = ParseSeedCorpusConfig(absl::Substitute(
+        kConfigStr, kRelDir1, kCovBin, kNumShards, kIdxDigits));
+    WriteSeedCorpusElementsToDestination(  //
+        kElements, kCovBin, kCovHash, config.destination());
+    const std::string workdir = (test_dir / kRelDir1).c_str();
+    ASSERT_NO_FATAL_FAILURE(VerifyShardsExist(  //
+        workdir, kCovBin, kCovHash, kNumShards, ShardType::kDistilled));
+  }
+
+  // Test that `SampleSeedCorpusElementsFromSource()` correctly reads a
+  // subsample of elements from the seed source created by the previous step.
+  {
+    constexpr std::string_view kConfigStr = R"pb(
+      sources {
+        dir_glob: "./$0"
+        shard_rel_glob: "distilled-$1.*"
+        num_recent_dirs: 2  # Intentionally specify more than we actually have
+        sampled_fraction: $2
+      }
+    )pb";
+
+    for (const double fraction : {1.0, 0.5}) {
+      const SeedCorpusConfig config = ParseSeedCorpusConfig(
+          absl::Substitute(kConfigStr, kRelDir1, kCovBin, fraction));
+      InputAndFeaturesVec elements;
+      SampleSeedCorpusElementsFromSource(  //
+          config.sources(0), kCovBin, kCovHash, elements);
+      // NOTE: 1.0 has a precise double representation, so `==` is fine.
+      if (fraction == 1.0) {
+        ASSERT_THAT(elements, UnorderedElementsAreArray(kElements))
+            << VV(fraction);
+      } else {
+        ASSERT_THAT(elements, IsSubsetOf(kElements)) << VV(fraction);
+      }
+    }
+  }
+
+  // Test that `GenerateSeedCorpusFromConfig()` correctly samples seed elements
+  // from the source and writes expected shards to the destination.
+  {
+    constexpr size_t kNumShards = 3;
+    constexpr std::string_view kConfigStr = R"pb(
+      sources {
+        dir_glob: "./$0"
+        shard_rel_glob: "distilled-$1.*"
+        num_recent_dirs: 1
+        sampled_fraction: 1.0
+      }
+      destination {
+        dir_path: "./$2"
+        shard_rel_glob: "corpus.*"
+        num_shards: $3
+        shard_index_digits: $4
+      }
+    )pb";
+
+    const std::string config_str = absl::Substitute(  //
+        kConfigStr, kRelDir1, kCovBin, kRelDir2, kNumShards, kIdxDigits);
+
+    {
+      GenerateSeedCorpusFromConfig(  //
+          config_str, kCovBin, kCovHash, "");
+      const std::string workdir = (test_dir / kRelDir2).c_str();
+      ASSERT_NO_FATAL_FAILURE(VerifyShardsExist(  //
+          workdir, kCovBin, kCovHash, kNumShards, ShardType::kNormal));
+    }
+
+    {
+      GenerateSeedCorpusFromConfig(  //
+          config_str, kCovBin, kCovHash, kRelDir3);
+      const std::string workdir = (test_dir / kRelDir3).c_str();
+      ASSERT_NO_FATAL_FAILURE(VerifyShardsExist(  //
+          workdir, kCovBin, kCovHash, kNumShards, ShardType::kNormal));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace centipede