blob: cd3e987bcad5ccc38b673ee0061fcba238d15219 [file] [log] [blame]
// Copyright 2022 The Centipede Authors.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
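// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.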
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <set>
#include <string>
#include <string_view>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/container/flat_hash_set.h"
#include "./centipede/blob_file.h"
#include "./centipede/centipede_callbacks.h"
#include "./centipede/centipede_interface.h"
#include "./centipede/corpus.h"
#include "./centipede/defs.h"
#include "./centipede/environment.h"
#include "./centipede/execution_result.h"
#include "./centipede/feature.h"
#include "./centipede/logging.h"
#include "./centipede/shard_reader.h"
#include "./centipede/test_util.h"
#include "./centipede/util.h"
namespace centipede {
namespace {
// A mock for CentipedeCallbacks.
// Execute() runs no binary: it derives features from the bytes of each input
// and records statistics in the members declared at the end of the class.
// Mutate() emits a fixed, predictable sequence of mutants.
class CentipedeMock : public CentipedeCallbacks {
// `explicit`, matching the other callback mocks in this file, so that an
// Environment is never implicitly converted into a mock.
explicit CentipedeMock(const Environment &env) : CentipedeCallbacks(env) {}
// Doesn't execute anything
// Sets `batch_result.results()` based on the values of `inputs`:
// Collects various stats about the inputs, to be checked in tests.
// Always returns true.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
// For every input, we create a 256-element array `counters`, where
// i-th element is the number of bytes with the value 'i' in the input.
// `counters` is converted to FeatureVec and added to
// `batch_result.results()`.
for (auto &input : inputs) {
ByteArray counters(256);
for (uint8_t byte : input) {
FeatureVec features;
for (size_t i = 0; i < counters.size(); ++i) {
// Byte values that do not occur in the input produce no feature.
if (counters[i] == 0) continue;
Convert8bitCounterToNumber(i, counters[i])));
// Inputs are expected to be exactly 1 or 2 bytes long.
if (input.size() == 1) {
} else {
EXPECT_EQ(input.size(), 2);
// Pack both bytes into one 16-bit key, first byte in the high half.
uint16_t input2bytes = (input[0] << 8) | input[1];
// Track the largest and the smallest batch passed to Execute().
max_batch_size_ = std::max(max_batch_size_, inputs.size());
min_batch_size_ = std::min(min_batch_size_, inputs.size());
return true;
// Makes predictable mutants:
// first 255 mutations are 1-byte sequences {1} ... {255}.
// (the value {0} is produced by DummyValidInput()).
// Next 65536 mutations are 2-byte sequences {0,0} ... {255, 255}.
// Then repeat 2-byte sequences.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
for (auto &mutant : mutants) {
// While fewer than 256 mutations have been made, emit a 1-byte mutant.
if (num_mutations_ < 256) {
mutant = {static_cast<uint8_t>(num_mutations_)};
// Otherwise split the offset past 256 into a high byte and a low byte;
// the conversion to uint8_t makes the high byte wrap around.
uint8_t byte0 = (num_mutations_ - 256) / 256;
uint8_t byte1 = (num_mutations_ - 256) % 256;
mutant = {byte0, byte1};
// Statistics inspected by the tests after CentipedeMain() returns.
// The tests expect these two sets to end up with 256 and 65536 entries.
absl::flat_hash_set<uint8_t> observed_1byte_inputs_;
absl::flat_hash_set<uint16_t> observed_2byte_inputs_;
size_t num_executions_ = 0;
size_t num_inputs_ = 0;
size_t num_mutations_ = 0;
size_t max_batch_size_ = 0;
// -1 converts to the largest size_t value, so the first std::min() call in
// Execute() replaces it with a real batch size.
size_t min_batch_size_ = -1;
// Returns the same CentipedeCallbacks object every time, never destroys it.
class MockFactory : public CentipedeCallbacksFactory {
// Stores a reference to `cb`; the factory does not own the callbacks object.
explicit MockFactory(CentipedeCallbacks &cb) : cb_(cb) {}
// Ignores `env` and hands out the address of the wrapped object.
CentipedeCallbacks *create(const Environment &env) override { return &cb_; }
// Only verifies that `cb` is the wrapped object; nothing is deleted.
void destroy(CentipedeCallbacks *cb) override { EXPECT_EQ(cb, &cb_); }
// The single callbacks object returned by every create() call.
CentipedeCallbacks &cb_;
} // namespace
// Runs one fuzzing session against CentipedeMock and checks the statistics the
// mock collected as well as the size of the resulting corpus file.
TEST(Centipede, MockTest) {
TempCorpusDir tmp_dir{test_info_->name()};
Environment env; // Reads the flags. We override some members below.
env.log_level = 0; // Disable most of the logging in the test.
env.workdir = tmp_dir.path();
env.num_runs = 100000; // Enough to run through all 1- and 2-byte inputs.
env.batch_size = 7; // Just some small number.
env.require_pc_table = false; // No PC table here.
CentipedeMock mock(env);
MockFactory factory(mock);
CentipedeMain(env, factory); // Run fuzzing with num_runs inputs.
EXPECT_EQ(mock.num_inputs_, env.num_runs + 1); // num_runs and one dummy.
EXPECT_EQ(mock.num_mutations_, env.num_runs);
// No batch may be larger than the configured batch size.
EXPECT_EQ(mock.max_batch_size_, env.batch_size);
EXPECT_EQ(mock.min_batch_size_, 1); // 1 for dummy.
// The corpus file of shard 0 must hold exactly 512 elements.
EXPECT_EQ(tmp_dir.CountElementsInCorpusFile(0), 512);
EXPECT_EQ(mock.observed_1byte_inputs_.size(), 256); // all 1-byte seqs.
EXPECT_EQ(mock.observed_2byte_inputs_.size(), 65536); // all 2-byte seqs.
// Returns the number of entries in the directory `dir_path`, computed as an
// iterator distance over a (non-recursive) std::filesystem::directory_iterator.
static size_t CountFilesInDir(std::string_view dir_path) {
const std::filesystem::directory_iterator dir_iter{dir_path};
return std::distance(std::filesystem::begin(dir_iter),
// Tests fuzzing and distilling in multiple shards.
// Round one fuzzes every shard with a shared CentipedeMock; round two runs
// with num_runs == 0 so that only distillation happens.
TEST(Centipede, ShardsAndDistillTest) {
TempCorpusDir tmp_dir{test_info_->name()};
Environment env; // Reads the flags. We override some members below.
env.workdir = tmp_dir.path();
env.log_level = 0; // Disable most of the logging in the test.
size_t combined_num_runs = 100000; // Enough to run through all inputs.
env.total_shards = 20;
// Split the total number of runs evenly between the shards.
env.num_runs = combined_num_runs / env.total_shards;
env.require_pc_table = false; // No PC table here.
// Create two empty dirs and add them to corpus_dir.
CentipedeMock mock(env);
// First round of runs: do the actual fuzzing, compute the features.
size_t max_shard_size = 0;
for (size_t shard_index = 0; shard_index < env.total_shards; shard_index++) {
env.my_shard_index = shard_index;
MockFactory factory(mock);
CentipedeMain(env, factory); // Run fuzzing in shard `shard_index`.
auto corpus_size = tmp_dir.CountElementsInCorpusFile(shard_index);
// Every byte should be present at least once.
// With 2-byte inputs, we get at least 128 inputs covering 256 features.
EXPECT_GT(corpus_size, 128);
// Remember the largest shard for the comparison in the second round.
max_shard_size = std::max(max_shard_size, corpus_size);
EXPECT_EQ(mock.observed_1byte_inputs_.size(), 256); // all 1-byte seqs.
EXPECT_EQ(mock.observed_2byte_inputs_.size(), 65536); // all 2-byte seqs.
// After fuzzing, corpus_dir[0] must have more than 128 entries and
// corpus_dir[1] must still be empty.
EXPECT_GT(CountFilesInDir(env.corpus_dir[0]), 128);
EXPECT_EQ(CountFilesInDir(env.corpus_dir[1]), 0);
// Second round of runs. Don't fuzz, only distill.
// Don't distill in the last one to test the flag behaviour.
env.distill_shards = env.total_shards - 1;
env.num_runs = 0; // No fuzzing.
for (size_t shard_index = 0; shard_index < env.total_shards; shard_index++) {
env.my_shard_index = shard_index;
// Empty the corpus_dir[0]
MockFactory factory(mock);
CentipedeMain(env, factory); // Run distilling in shard `shard_index`.
auto distilled_size =
tmp_dir.CountElementsInCorpusFile(shard_index, "distilled-.");
if (shard_index == env.total_shards - 1) {
EXPECT_EQ(distilled_size, 0); // Didn't distill in the last shard.
EXPECT_EQ(CountFilesInDir(env.corpus_dir[0]), 0);
} else {
// Distillation is expected to find more inputs than any individual shard.
EXPECT_GT(distilled_size, max_shard_size);
// And since we are expecting 512 features, with 2-byte inputs,
// we get at least 512/2 corpus elements after distillation.
EXPECT_GT(distilled_size, 256);
EXPECT_GT(CountFilesInDir(env.corpus_dir[0]), 256);
// Tests --input_filter. test_input_filter filters out inputs with 'b' in them.
TEST(Centipede, InputFilter) {
TempCorpusDir tmp_dir{test_info_->name()};
Environment env; // Reads the flags. We override some members below.
env.workdir = tmp_dir.path();
env.num_runs = 256; // Enough to run through all 1-byte inputs.
env.log_level = 0; // Disable most of the logging in the test.
env.require_pc_table = false; // No PC table here.
// Add %f so that test_input_filter doesn't need to be linked with forkserver.
env.input_filter = "%f" + std::string{GetDataDependencyFilepath(
CentipedeMock mock(env);
MockFactory factory(mock);
CentipedeMain(env, factory); // Run fuzzing.
// Load the corpus of shard 0 into an ordered set of its inputs.
auto corpus = tmp_dir.GetCorpus(0);
std::set<ByteArray> corpus_set(corpus.begin(), corpus.end());
// Callbacks for MutateViaExternalBinary test.
// Execute() and Mutate() are stubs; the class exists to expose the protected
// MutateViaExternalBinary() of CentipedeCallbacks to the test.
class MutateCallbacks : public CentipedeCallbacks {
explicit MutateCallbacks(const Environment &env) : CentipedeCallbacks(env) {}
// Will not be called. Returns false unconditionally.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
return false;
// Will not be called.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
// Redeclare a protected member function as public so the tests can call it.
using CentipedeCallbacks::MutateViaExternalBinary;
// Exercises MutateViaExternalBinary() with crossover enabled and disabled and
// checks the produced mutants against lists of expected ones.
TEST(Centipede, MutateViaExternalBinary) {
// This binary contains a test-friendly custom mutator.
const std::string binary_with_custom_mutator =
// This binary does not contain a custom mutator.
const std::string binary_without_custom_mutator =
// Mutate a couple of different inputs.
std::vector<ByteArray> inputs = {{0, 1, 2}, {3, 4}};
// The custom mutator in the test binary will reverse the order of bytes
// and sometimes add a number in [100-107) at the end.
// Periodically, the custom mutator will fall back to LLVMFuzzerMutate,
// which in turn will sometimes shrink the inputs.
std::vector<ByteArray> some_of_expected_mutants = {
// Reversed inputs, sometimes with an extra byte at the end.
{2, 1, 0},
{2, 1, 0, 100},
{2, 1, 0, 101},
{2, 1, 0, 102},
{4, 3},
{4, 3, 103},
{4, 3, 104},
{4, 3, 105},
// Shrunk inputs.
{0, 1},
std::vector<ByteArray> expected_crossover_mutants = {
// Crossed-over mutants.
{0, 1, 2, 42, 3, 4},
{3, 4, 42, 0, 1, 2},
// Starts as a copy of the non-crossover expectations.
auto all_expected_mutants = some_of_expected_mutants;
std::vector<ByteArray> mutants;
// Test with crossover enabled (default).
Environment env;
MutateCallbacks callbacks(env);
// Expect to fail on the binary w/o a custom mutator.
GetMutationInputRefsFromDataInputs(inputs), mutants));
// Expect to succeed on the binary with a custom mutator.
binary_with_custom_mutator, GetMutationInputRefsFromDataInputs(inputs),
// Check that we see all expected mutants, and that they are non-empty.
for (auto &mutant : mutants) {
EXPECT_THAT(mutants, testing::IsSupersetOf(all_expected_mutants));
// Test with crossover disabled.
Environment env_no_crossover;
env_no_crossover.crossover_level = 0;
MutateCallbacks callbacks_no_crossover(env_no_crossover);
binary_with_custom_mutator, GetMutationInputRefsFromDataInputs(inputs),
// Must contain normal mutants, but not the ones from crossover.
EXPECT_THAT(mutants, testing::IsSupersetOf(some_of_expected_mutants));
for (const auto &crossover_mutant : expected_crossover_mutants) {
// Times(0): the crossover mutant must not appear even once.
EXPECT_THAT(mutants, testing::Contains(crossover_mutant).Times(0));
// A mock for MergeFromOtherCorpus test.
// Inputs and features are tied together: input {X} yields the feature {X}, and
// mutants are numbered by a counter that Reset() sets back to 0.
class MergeMock : public CentipedeCallbacks {
explicit MergeMock(const Environment &env) : CentipedeCallbacks(env) {}
// Doesn't execute anything.
// All inputs are 1-byte long.
// For an input {X}, the feature output is {X}.
// Always returns true.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
for (size_t i = 0, n = inputs.size(); i < n; ++i) {
// Aborts if an input is not exactly one byte long.
CHECK_EQ(inputs[i].size(), 1);
batch_result.results()[i].mutable_features() = {inputs[i][0]};
return true;
// Every consecutive mutation is {number_of_mutations_}.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
for (auto &mutant : mutants) {
mutant[0] = number_of_mutations_++; // first mutation is {0}.
// Restarts the mutant numbering from 0.
void Reset() { number_of_mutations_ = 0; }
// Number of mutants produced so far; also the value of the next mutant.
size_t number_of_mutations_ = 0;
// Builds two independent 2-shard corpora with MergeMock, then merges the
// second into the first via `env.merge_from` and checks the merged shards.
TEST(Centipede, MergeFromOtherCorpus) {
using Corpus = std::vector<ByteArray>;
// Set up the workdir, create a 2-shard corpus with 3 inputs each.
TempCorpusDir work_tmp_dir{test_info_->name(), "workdir"};
Environment env;
env.workdir = work_tmp_dir.path();
env.num_runs = 3; // Just a few runs.
env.require_pc_table = false; // No PC table here.
MergeMock mock(env);
MockFactory factory(mock);
for (env.my_shard_index = 0; env.my_shard_index < 2; ++env.my_shard_index) {
CentipedeMain(env, factory);
CentipedeMain(env, factory);
EXPECT_EQ(work_tmp_dir.GetCorpus(0), Corpus({{0}, {1}, {2}}));
EXPECT_EQ(work_tmp_dir.GetCorpus(1), Corpus({{3}, {4}, {5}}));
// Set up another workdir, create a 2-shard corpus there, with 4 inputs each.
TempCorpusDir merge_tmp_dir(test_info_->name(), "merge_from");
Environment merge_env;
merge_env.workdir = merge_tmp_dir.path();
merge_env.num_runs = 4;
merge_env.require_pc_table = false; // No PC table here.
for (merge_env.my_shard_index = 0; merge_env.my_shard_index < 2;
++merge_env.my_shard_index) {
CentipedeMain(merge_env, factory);
EXPECT_EQ(merge_tmp_dir.GetCorpus(0), Corpus({{0}, {1}, {2}, {3}}));
EXPECT_EQ(merge_tmp_dir.GetCorpus(1), Corpus({{4}, {5}, {6}, {7}}));
// Merge shards of `merge_env` into shards of `env`.
// Shard 0 will receive one extra input: {3}
// Shard 1 will receive two extra inputs: {6}, {7}
env.merge_from = merge_tmp_dir.path();
env.num_runs = 0; // No fuzzing in the merge pass.
for (env.my_shard_index = 0; env.my_shard_index < 2; ++env.my_shard_index) {
CentipedeMain(env, factory);
EXPECT_EQ(work_tmp_dir.GetCorpus(0), Corpus({{0}, {1}, {2}, {3}}));
EXPECT_EQ(work_tmp_dir.GetCorpus(1), Corpus({{3}, {4}, {5}, {6}, {7}}));
// A mock for FunctionFilter test.
// Unlike the other mocks in this file, Execute() runs the real target binary;
// Mutate() cycles through three fixed inputs.
class FunctionFilterMock : public CentipedeCallbacks {
explicit FunctionFilterMock(const Environment &env)
: CentipedeCallbacks(env) {}
// Executes the target in the normal way.
// Runs `env_.binary` (the `binary` argument is not used) and returns true
// iff the execution reports EXIT_SUCCESS.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
return ExecuteCentipedeSancovBinaryWithShmem(env_.binary, inputs,
batch_result) == EXIT_SUCCESS;
// Sets the inputs to one of 3 pre-defined values.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
for (auto &input : inputs) {
if ( != DummyValidInput()) {
for (auto &mutant : mutants) {
// Pre-increment: the first mutant produced is GetMutant(1).
mutant = GetMutant(++number_of_mutations_);
// Returns one of 3 pre-defined values, that trigger different code paths in
// the test target.
// `idx` is reduced modulo 3, so any value is accepted.
static ByteArray GetMutant(size_t idx) {
const char *mutants[3] = {"func1", "func2-A", "foo"};
const char *mutant = mutants[idx % 3];
// Copy the characters of the C string, without the terminating NUL.
return {mutant, mutant + strlen(mutant)};
// Set of inputs observed by Mutate(), except for DummyValidInput().
absl::flat_hash_set<ByteArray> observed_inputs_;
// Number of mutants produced so far.
size_t number_of_mutations_ = 0;
// Runs a short fuzzing session with the provided `function_filter`.
// The session uses the test_fuzz_target binary, a fixed seed, and `tmp_dir` as
// the workdir. Aborts (CHECK) if llvm-symbolizer or objdump is not on PATH.
// Returns a sorted array of observed inputs.
static std::vector<ByteArray> RunWithFunctionFilter(
std::string_view function_filter, const TempDir &tmp_dir) {
Environment env;
env.workdir = tmp_dir.path();
env.seed = 1; // make the runs predictable.
env.num_runs = 100;
env.batch_size = 10;
env.binary = GetDataDependencyFilepath("centipede/testing/test_fuzz_target");
// The same binary is used for coverage.
env.coverage_binary = env.binary;
// Must symbolize in order for the filter to work.
CHECK_EQ(system("which llvm-symbolizer"), EXIT_SUCCESS)
<< "llvm-symbolizer should be installed and findable via PATH";
CHECK_EQ(system("which objdump"), EXIT_SUCCESS)
<< "objdump should be installed and findable via PATH";
env.objdump_path = "objdump";
env.log_level = 0;
env.function_filter = function_filter;
FunctionFilterMock mock(env);
MockFactory factory(mock);
CentipedeMain(env, factory);
LOG(INFO) << mock.observed_inputs_.size();
// Copy the hash set into a vector so the result has a deterministic order.
std::vector<ByteArray> res(mock.observed_inputs_.begin(),
std::sort(res.begin(), res.end());
return res;
// Tests --function_filter.
// Each scenario uses its own TempDir and compares the inputs observed by
// FunctionFilterMock against the fixed mutants from GetMutant().
TEST(Centipede, FunctionFilter) {
// Run with empty function filter.
TempDir tmp_dir{test_info_->name(), "none"};
auto observed_empty = RunWithFunctionFilter("", tmp_dir);
// Without a filter all three pre-defined mutants are observed.
ASSERT_EQ(observed_empty.size(), 3);
// Run with a one-function filter
TempDir tmp_dir{test_info_->name(), "single"};
auto observed_single = RunWithFunctionFilter("SingleEdgeFunc", tmp_dir);
ASSERT_EQ(observed_single.size(), 1);
// GetMutant(0) is "func1".
EXPECT_EQ(observed_single[0], FunctionFilterMock::GetMutant(0));
// Run with a two-function filter.
TempDir tmp_dir{test_info_->name(), "single_multi"};
auto observed_both =
RunWithFunctionFilter("SingleEdgeFunc,MultiEdgeFunc", tmp_dir);
ASSERT_EQ(observed_both.size(), 2);
// The result is sorted, so "func1" comes before "func2-A".
EXPECT_EQ(observed_both[0], FunctionFilterMock::GetMutant(0));
EXPECT_EQ(observed_both[1], FunctionFilterMock::GetMutant(1));
namespace {
// A mock for ExtraBinaries test.
// Simulates crashes: Execute() fails for specific {binary, input} pairs.
class ExtraBinariesMock : public CentipedeCallbacks {
explicit ExtraBinariesMock(const Environment &env)
: CentipedeCallbacks(env) {}
// Doesn't execute anything.
// On certain combinations of {binary,input} returns false.
// The failing pairs are {"b1", 10}, {"b2", 30} and {"b3", 50}.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
bool res = true;
for (const auto &input : inputs) {
// Only 1-byte inputs can trigger a failure.
if (input.size() != 1) continue;
if (binary == "b1" && input[0] == 10) res = false;
if (binary == "b2" && input[0] == 30) res = false;
if (binary == "b3" && input[0] == 50) res = false;
return res;
// Sets the mutants to different 1-byte values.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
for (auto &mutant : mutants) {
// Pre-increment: the first mutant byte is 1.
mutant[0] = ++number_of_mutations_;
// Number of mutants produced so far.
size_t number_of_mutations_ = 0;
} // namespace
// Tests --extra_binaries.
// Executes one main binary (--binary) and 3 extra ones (--extra_binaries).
// Expects the main binary and two extra ones to generate one crash each.
TEST(Centipede, ExtraBinaries) {
TempDir tmp_dir{test_info_->name()};
Environment env;
env.workdir = tmp_dir.path();
env.num_runs = 100;
env.batch_size = 10;
env.log_level = 1;
// The names are only matched as strings by ExtraBinariesMock::Execute().
env.binary = "b1";
env.extra_binaries = {"b2", "b3", "b4"};
env.require_pc_table = false; // No PC table here.
ExtraBinariesMock mock(env);
MockFactory factory(mock);
CentipedeMain(env, factory);
// Verify that we see the expected crashes.
// The "crashes" dir must contain 3 crashy inputs, one for each binary.
// We simply match their file names, because they are hashes of the contents.
std::vector<std::string> found_crash_file_names;
auto crashes_dir_path = env.MakeCrashReproducerDirPath();
<< VV(crashes_dir_path);
for (const auto &dir_ent :
std::filesystem::directory_iterator(crashes_dir_path)) {
// 10, 30 and 50 are the input bytes for which the mock reports failure.
EXPECT_THAT(found_crash_file_names, testing::UnorderedElementsAre(
Hash({10}), Hash({30}), Hash({50})));
namespace {
// A mock for UndetectedCrashingInput test.
// Mutants are 1-byte inputs holding their own sequential number; Execute()
// reports a failure for the input whose byte equals `crashing_input_idx_`.
class UndetectedCrashingInputMock : public CentipedeCallbacks {
explicit UndetectedCrashingInputMock(const Environment &env,
size_t crashing_input_idx)
: CentipedeCallbacks{env}, crashing_input_idx_{crashing_input_idx} {
// The index is compared against a single input byte, so it must fit in one.
CHECK_LE(crashing_input_idx_, std::numeric_limits<uint8_t>::max());
// Doesn't execute anything.
// Reports a failure when the byte of an input equals `crashing_input_idx_`,
// but only the first time such an input is seen.
bool Execute(std::string_view binary, const std::vector<ByteArray> &inputs,
BatchResult &batch_result) override {
bool res = true;
for (const auto &input : inputs) {
CHECK_EQ(input.size(), 1); // By construction in `Mutate()`.
// The contents of each mutant is its sequential number.
if (input[0] == crashing_input_idx_) {
if (first_pass_) {
first_pass_ = false;
// Remember the crasher so the test can retrieve it via crashing_input().
crashing_input_ = input;
// TODO(b/274705740): `num_outputs_read()` is the number of outputs
// that Centipede engine *expects* to have been read from *the
// current BatchResult* by the *particular* implementation of
// `CentipedeCallbacks` (and `DefaultCentipedeCallbacks` fits the
// bill). `Centipede::ReportCrash()` then uses this value as a hint
// for the crashing input's index, and in our case saves the batch's
// inputs from 0 up to and including the crasher to a subdir. See the
// bug for details. All of this is horribly convoluted and misplaced
// here. Implement a cleaner solution.
batch_result.num_outputs_read() =
crashing_input_idx_ % env_.batch_size;
res = false;
return res;
// Sets the mutants to different 1-byte values.
void Mutate(const std::vector<MutationInputRef> &inputs, size_t num_mutants,
std::vector<ByteArray> &mutants) override {
for (auto &mutant : mutants) {
// The contents of each mutant is simply its sequential number.
mutant = {static_cast<uint8_t>(curr_input_idx_++)};
// Gets the input that triggered the crash.
// Returns a copy; it is empty until Execute() has recorded a crasher.
ByteArray crashing_input() const { return crashing_input_; }
// Sequential number of the input that triggers the failure.
const size_t crashing_input_idx_;
// Sequential number assigned to the next mutant.
size_t curr_input_idx_ = 0;
ByteArray crashing_input_ = {};
// True until the first failure has been reported.
bool first_pass_ = true;
} // namespace
// Test for preserving a crashing batch when 1-by-1 exec fails to reproduce.
// Executes one main binary (--binary).
// Expects the binary to crash once and 1-by-1 reproduction to fail.
TEST(Centipede, UndetectedCrashingInput) {
constexpr size_t kNumBatches = 7;
constexpr size_t kBatchSize = 11;
// Integer division: 11 / 2 == 5, i.e. the middle of the batch.
constexpr size_t kCrashingInputIdxInBatch = kBatchSize / 2;
// (7 / 2) * 11 + 5 == 38: the middle input of the middle batch.
constexpr size_t kCrashingInputIdx =
(kNumBatches / 2) * kBatchSize + kCrashingInputIdxInBatch;
LOG(INFO) << VV(kNumBatches) << VV(kBatchSize)
<< VV(kCrashingInputIdxInBatch) VV(kCrashingInputIdx);
TempDir temp_dir{test_info_->name()};
Environment env;
env.workdir = temp_dir.path();
env.num_runs = kBatchSize * kNumBatches;
env.batch_size = kBatchSize;
// No real binary: prevent attempts by Centipede to read a PCtable from it.
env.require_pc_table = false;
UndetectedCrashingInputMock mock(env, kCrashingInputIdx);
MockFactory factory(mock);
CentipedeMain(env, factory);
// Verify that we see the expected inputs from the batch.
// The "crashes/unreliable_batch-<HASH>" dir must contain all inputs from the
// batch that were executing during the session.
// We simply verify the number of saved inputs matches the number of executed
// inputs.
const auto crashing_input_hash = Hash(mock.crashing_input());
const auto crashes_dir_path = std::filesystem::path(temp_dir.path())
// Stop here if the directory is missing; iterating over it would fail.
ASSERT_TRUE(std::filesystem::exists(crashes_dir_path)) << crashes_dir_path;
std::vector<std::string> found_crash_file_names;
for (auto const &dir_ent :
std::filesystem::directory_iterator(crashes_dir_path)) {
// TODO(ussuri): Verify exact names/contents of the files, not just count.
// Inputs 0 through the crasher's in-batch index, inclusive, hence the + 1.
ASSERT_EQ(found_crash_file_names.size(), kCrashingInputIdxInBatch + 1);
// Writes `blobs` to the blob file at `path` using the default blob file
// writer, opened in mode "a". Aborts (CHECK_OK) if the file cannot be opened.
static void WriteBlobsToFile(const std::vector<ByteArray> &blobs,
const std::string_view path) {
auto appender = DefaultBlobFileWriterFactory();
CHECK_OK(appender->Open(path, "a"));
for (const auto &blob : blobs) {
// Writes a corpus file and a features file, reads them back with ReadShard(),
// and checks that every input is paired with the expected features.
TEST(Centipede, ShardReader) {
ByteArray data1 = {1, 2, 3};
ByteArray data2 = {3, 4, 5, 6};
ByteArray data3 = {7, 8, 9, 10, 11};
ByteArray data4 = {12, 13, 14};
ByteArray data5 = {15, 16};
FeatureVec fv1 = {100, 200, 300};
FeatureVec fv2 = {300, 400, 500, 600};
FeatureVec fv3 = {700, 800, 900, 1000, 1100};
FeatureVec fv4 = {}; // empty.
std::vector<ByteArray> corpus_blobs;
std::vector<ByteArray> features_blobs;
// Only data1..data4 get a features blob; data5 has none.
features_blobs.push_back(PackFeaturesAndHash(data1, fv1));
features_blobs.push_back(PackFeaturesAndHash(data2, fv2));
features_blobs.push_back(PackFeaturesAndHash(data3, fv3));
features_blobs.push_back(PackFeaturesAndHash(data4, fv4));
TempDir tmp_dir{test_info_->name()};
std::string corpus_path = tmp_dir.GetFilePath("corpus");
std::string features_path = tmp_dir.GetFilePath("features");
WriteBlobsToFile(corpus_blobs, corpus_path);
WriteBlobsToFile(features_blobs, features_path);
// Collect every {input, features} pair reported by ReadShard().
std::vector<CorpusRecord> res;
ReadShard(corpus_path, features_path,
[&res](const ByteArray &input, const FeatureVec &features) {
res.push_back(CorpusRecord{input, features});
// ASSERT, not EXPECT: the checks below index res[0]..res[4], which would read
// out of bounds if fewer than five records came back.
ASSERT_EQ(res.size(), 5UL);
EXPECT_EQ(res[0].data, data1);
EXPECT_EQ(res[1].data, data2);
EXPECT_EQ(res[2].data, data3);
EXPECT_EQ(res[3].data, data4);
EXPECT_EQ(res[4].data, data5);
EXPECT_EQ(res[0].features, fv1);
EXPECT_EQ(res[1].features, fv2);
EXPECT_EQ(res[2].features, fv3);
// An input stored with an empty feature vector is expected to read back as
// {kNoFeature}; an input with no features blob reads back with no features.
EXPECT_EQ(res[3].features, FeatureVec{feature_domains::kNoFeature});
EXPECT_EQ(res[4].features, FeatureVec());
} // namespace centipede