pw_kvs: Reinit KVS metadata as part of the repair process

- Split out the KVS metadata initialization out of Init to
  a new method InitializeMetadata.
- Split out the fixing errors out of Repair to a new method FixErrors.
- Use InitializeMetadata followed up by FixErrors as the process to
  recover from corruption. This does the most robust integrity/error
  checking, and is the simplest path to an intact and coherent KVS.
- Add KVS error test for recovering after losing all copies of an entry.

Change-Id: Ifca540886738d378428162074ac186e2dc0d3d7c
diff --git a/pw_kvs/in_memory_fake_flash.cc b/pw_kvs/in_memory_fake_flash.cc
index 245e329..897e5db 100644
--- a/pw_kvs/in_memory_fake_flash.cc
+++ b/pw_kvs/in_memory_fake_flash.cc
@@ -33,7 +33,7 @@
 Status FlashError::Check(FlashMemory::Address start_address, size_t size) {
   // Check if the event overlaps with this address range.
   if (begin_ != kAnyAddress &&
-      (start_address >= end_ || (start_address + size) < begin_)) {
+      (start_address >= end_ || (start_address + size) <= begin_)) {
     return Status::OK;
   }
 
diff --git a/pw_kvs/key_value_store.cc b/pw_kvs/key_value_store.cc
index 2bc6ddf..55d030d 100644
--- a/pw_kvs/key_value_store.cc
+++ b/pw_kvs/key_value_store.cc
@@ -57,8 +57,6 @@
   initialized_ = InitializationState::kNotInitialized;
   error_detected_ = false;
   last_transaction_id_ = 0;
-  sectors_.Reset();
-  entry_cache_.Reset();
 
   INF("Initializing key value store");
   if (partition_.sector_count() > sectors_.max_size()) {
@@ -80,6 +78,53 @@
     return Status::FAILED_PRECONDITION;
   }
 
+  InitializeMetadata();
+
+  if (!error_detected_) {
+    initialized_ = InitializationState::kReady;
+  } else {
+    if (options_.recovery != ErrorRecovery::kManual) {
+      Status recovery_status = FixErrors();
+
+      if (recovery_status.ok()) {
+        WRN("KVS init: Corruption detected and fully repaired");
+        initialized_ = InitializationState::kReady;
+      } else if (recovery_status == Status::RESOURCE_EXHAUSTED) {
+        WRN("KVS init: Unable to maintain required free sector");
+        initialized_ = InitializationState::kNeedsMaintenance;
+      } else {
+        WRN("KVS init: Corruption detected and unable repair");
+        initialized_ = InitializationState::kNeedsMaintenance;
+      }
+    } else {
+      WRN("KVS init: Corruption detected, no repair attempted due to options");
+      initialized_ = InitializationState::kNeedsMaintenance;
+    }
+  }
+
+  INF("KeyValueStore init complete: active keys %zu, deleted keys %zu, sectors "
+      "%zu, logical sector size %zu bytes",
+      size(),
+      (entry_cache_.total_entries() - size()),
+      sectors_.size(),
+      partition_.sector_size_bytes());
+
+  // Report any corruption was not repaired.
+  if (error_detected_) {
+    WRN("KVS init: Corruption found but not repaired, KVS unavailable until "
+        "successful maintenance.");
+    return Status::DATA_LOSS;
+  }
+
+  return Status::OK;
+}
+
+void KeyValueStore::InitializeMetadata() {
+  const size_t sector_size_bytes = partition_.sector_size_bytes();
+
+  sectors_.Reset();
+  entry_cache_.Reset();
+
   DBG("First pass: Read all entries from all sectors");
   Address sector_address = 0;
 
@@ -93,10 +138,10 @@
     size_t sector_corrupt_bytes = 0;
 
     for (int num_entries_in_sector = 0; true; num_entries_in_sector++) {
-      DBG("Load entry: sector=%" PRIx32 ", entry#=%d, address=%" PRIx32,
-          sector_address,
+      DBG("Load entry: sector=%u, entry#=%d, address=%u",
+          unsigned(sector_address),
           num_entries_in_sector,
-          entry_address);
+          unsigned(entry_address));
 
       if (!sectors_.AddressInSector(sector, entry_address)) {
         DBG("Fell off end of sector; moving to the next sector");
@@ -108,13 +153,10 @@
       if (status == Status::NOT_FOUND) {
         DBG("Hit un-written data in sector; moving to the next sector");
         break;
-      }
-      if (status == Status::DATA_LOSS) {
-        // The entry could not be read, indicating data corruption within the
-        // sector. Try to scan the remainder of the sector for other entries.
-        WRN("KVS init: data loss detected in sector %u at address %zu",
-            sectors_.Index(sector),
-            size_t(entry_address));
+      } else if (!status.ok()) {
+        // The entry could not be read, indicating likely data corruption within
+        // the sector. Try to scan the remainder of the sector for other
+        // entries.
 
         error_detected_ = true;
         corrupt_entries++;
@@ -122,7 +164,7 @@
         status = ScanForEntry(sector,
                               entry_address + Entry::kMinAlignmentBytes,
                               &next_entry_address);
-        if (status == Status::NOT_FOUND) {
+        if (!status.ok()) {
           // No further entries in this sector. Mark the remaining bytes in the
           // sector as corrupt (since we can't reliably know the size of the
           // corrupt entry).
@@ -131,15 +173,7 @@
           break;
         }
 
-        if (!status.ok()) {
-          ERR("Unexpected error in KVS initialization: %s", status.str());
-          return Status::UNKNOWN;
-        }
-
         sector_corrupt_bytes += next_entry_address - entry_address;
-      } else if (!status.ok()) {
-        ERR("Unexpected error in KVS initialization: %s", status.str());
-        return Status::UNKNOWN;
       }
 
       // Entry loaded successfully; so get ready to load the next one.
@@ -218,47 +252,11 @@
     error_detected_ = true;
   }
 
-  if (!error_detected_) {
-    initialized_ = InitializationState::kReady;
-  } else {
-    if (options_.recovery != ErrorRecovery::kManual) {
-      WRN("KVS init: Corruption detected, beginning repair. Found %zu corrupt "
-          "bytes and %d corrupt entries.",
-          total_corrupt_bytes,
-          corrupt_entries);
-      Status recovery_status = Repair();
-
-      if (recovery_status.ok()) {
-        WRN("KVS init: Corruption detected and fully repaired");
-        initialized_ = InitializationState::kReady;
-      } else if (recovery_status == Status::RESOURCE_EXHAUSTED) {
-        WRN("KVS init: Unable to maintain required free sector");
-        initialized_ = InitializationState::kNeedsMaintenance;
-      } else {
-        WRN("KVS init: Corruption detected and unable repair");
-        initialized_ = InitializationState::kNeedsMaintenance;
-      }
-    } else {
-      WRN("KVS init: Corruption detected, no repair attempted due to options");
-      initialized_ = InitializationState::kNeedsMaintenance;
-    }
+  if (total_corrupt_bytes != 0 || corrupt_entries != 0) {
+    WRN("Corruption detected. Found %zu corrupt bytes and %d corrupt entries.",
+        total_corrupt_bytes,
+        corrupt_entries);
   }
-
-  INF("KeyValueStore init complete: active keys %zu, deleted keys %zu, sectors "
-      "%zu, logical sector size %zu bytes",
-      size(),
-      (entry_cache_.total_entries() - size()),
-      sectors_.size(),
-      partition_.sector_size_bytes());
-
-  // Report any corruption was not repaired.
-  if (error_detected_) {
-    WRN("KVS init: Corruption found but not repaired, KVS unavailable until "
-        "successful maintenance.");
-    return Status::DATA_LOSS;
-  }
-
-  return Status::OK;
 }
 
 KeyValueStore::StorageStats KeyValueStore::GetStorageStats() const {
@@ -336,9 +334,9 @@
 Status KeyValueStore::ScanForEntry(const SectorDescriptor& sector,
                                    Address start_address,
                                    Address* next_entry_address) {
-  DBG("Scanning sector %u for entries starting from address %zx",
+  DBG("Scanning sector %u for entries starting from address %u",
       sectors_.Index(sector),
-      size_t(start_address));
+      unsigned(start_address));
 
   // Entries must start at addresses which are aligned on a multiple of
   // Entry::kMinAlignmentBytes. However, that multiple can vary between entries.
@@ -348,9 +346,13 @@
        sectors_.AddressInSector(sector, address);
        address += Entry::kMinAlignmentBytes) {
     uint32_t magic;
-    TRY(partition_.Read(address, as_writable_bytes(span(&magic, 1))));
+    StatusWithSize read_result =
+        partition_.Read(address, as_writable_bytes(span(&magic, 1)));
+    if (!read_result.ok()) {
+      continue;
+    }
     if (formats_.KnownMagic(magic)) {
-      DBG("Found entry magic at address %zx", size_t(address));
+      DBG("Found entry magic at address %u", unsigned(address));
       *next_entry_address = address;
       return Status::OK;
     }
@@ -823,16 +825,16 @@
     return Status::FAILED_PRECONDITION;
   }
 
-  // Do automatic repair, if KVS options allow for it.
-  if (error_detected_ && options_.recovery != ErrorRecovery::kManual) {
-    TRY(Repair());
-  }
-
   DBG("Garbage Collect a single sector");
   for (Address address : reserved_addresses) {
     DBG("   Avoid address %u", unsigned(address));
   }
 
+  // Do automatic repair, if KVS options allow for it.
+  if (error_detected_ && options_.recovery != ErrorRecovery::kManual) {
+    TRY(Repair());
+  }
+
   // Step 1: Find the sector to garbage collect
   SectorDescriptor* sector_to_gc =
       sectors_.FindSectorToGarbageCollect(reserved_addresses);
@@ -848,7 +850,7 @@
 
 Status KeyValueStore::RelocateKeyAddressesInSector(
     SectorDescriptor& sector_to_gc,
-    EntryMetadata& metadata,
+    const EntryMetadata& metadata,
     span<const Address> reserved_addresses) {
   for (FlashPartition::Address& address : metadata.addresses()) {
     if (sectors_.AddressInSector(sector_to_gc, address)) {
@@ -1004,23 +1006,17 @@
   return repair_status;
 }
 
-Status KeyValueStore::Repair() {
-  // Collect and return the first error encountered.
-  Status overall_status = Status::OK;
-
-  DBG("KVS repair");
+Status KeyValueStore::FixErrors() {
+  DBG("Fixing KVS errors");
 
   // Step 1: Garbage collect any sectors marked as corrupt.
-  Status repair_status = RepairCorruptSectors();
-  if (overall_status.ok()) {
-    overall_status = repair_status;
-  }
+  Status overall_status = RepairCorruptSectors();
 
   // Step 2: Make sure there is at least 1 empty sector. This needs to be a
   // seperate check of sectors from step 1, because a found empty sector might
   // get written to by a later GC that fails and does not result in a free
   // sector.
-  repair_status = EnsureFreeSectorExists();
+  Status repair_status = EnsureFreeSectorExists();
   if (overall_status.ok()) {
     overall_status = repair_status;
   }
@@ -1039,6 +1035,17 @@
   return overall_status;
 }
 
+Status KeyValueStore::Repair() {
+  // If errors have been detected, just reinit the KVS metadata. This does a
+  // full deep error check and any needed repairs. Then repair any errors.
+  INF("Starting KVS repair");
+
+  DBG("Reinitialize KVS metadata");
+  InitializeMetadata();
+
+  return FixErrors();
+}
+
 KeyValueStore::Entry KeyValueStore::CreateEntry(Address address,
                                                 string_view key,
                                                 span<const byte> value,
diff --git a/pw_kvs/key_value_store_binary_format_test.cc b/pw_kvs/key_value_store_binary_format_test.cc
index 3119edc..ff826d8 100644
--- a/pw_kvs/key_value_store_binary_format_test.cc
+++ b/pw_kvs/key_value_store_binary_format_test.cc
@@ -114,6 +114,13 @@
     .verify_on_write = true,
 };
 
+constexpr Options kRecoveryLazyGcOptions{
+    .gc_on_write = GargbageCollectOnWrite::kOneSector,
+    .recovery = ErrorRecovery::kLazy,
+    .verify_on_read = true,
+    .verify_on_write = true,
+};
+
 constexpr auto kEntry1 = MakeValidEntry(kMagic, 1, "key1", ByteStr("value1"));
 constexpr auto kEntry2 = MakeValidEntry(kMagic, 3, "k2", ByteStr("value2"));
 constexpr auto kEntry3 = MakeValidEntry(kMagic, 4, "k3y", ByteStr("value3"));
@@ -198,13 +205,13 @@
   ASSERT_EQ(1024u, stats.writable_bytes);
 }
 
-TEST_F(KvsErrorHandling, Init_ReadError_IsNotInitialized) {
+TEST_F(KvsErrorHandling, Init_ReadError_InitializedWithSingleEntryError) {
   InitFlashTo(AsBytes(kEntry1, kEntry2));
 
   flash_.InjectReadError(
       FlashError::InRange(Status::UNAUTHENTICATED, kEntry1.size()));
 
-  EXPECT_EQ(Status::UNKNOWN, kvs_.Init());
+  EXPECT_EQ(Status::DATA_LOSS, kvs_.Init());
   EXPECT_FALSE(kvs_.initialized());
 }
 
@@ -390,14 +397,20 @@
   ASSERT_EQ(1u, stats.corrupt_sectors_recovered);
 }
 
-TEST_F(KvsErrorRecovery, Init_ReadError_IsNotInitialized) {
+TEST_F(KvsErrorRecovery, Init_ReadError_InitializedWithSingleEntryError) {
   InitFlashTo(AsBytes(kEntry1, kEntry2));
 
   flash_.InjectReadError(
       FlashError::InRange(Status::UNAUTHENTICATED, kEntry1.size()));
 
-  EXPECT_EQ(Status::UNKNOWN, kvs_.Init());
-  EXPECT_FALSE(kvs_.initialized());
+  EXPECT_EQ(Status::OK, kvs_.Init());
+  EXPECT_TRUE(kvs_.initialized());
+  auto stats = kvs_.GetStorageStats();
+  ASSERT_EQ(32u, stats.in_use_bytes);
+  ASSERT_EQ(0u, stats.reclaimable_bytes);
+  ASSERT_EQ(3 * 512u - 32u, stats.writable_bytes);
+  ASSERT_EQ(1u, stats.corrupt_sectors_recovered);
+  ASSERT_EQ(0u, stats.missing_redundant_entries_recovered);
 }
 
 TEST_F(KvsErrorRecovery, Init_CorruptSectors_ShouldBeUnwritable) {
@@ -423,7 +436,7 @@
   EXPECT_EQ(96u, stats.in_use_bytes);
   EXPECT_EQ(0u, stats.reclaimable_bytes);
   EXPECT_EQ(1440u, stats.writable_bytes);
-  ASSERT_EQ(3u, stats.corrupt_sectors_recovered);
+  EXPECT_EQ(3u, stats.corrupt_sectors_recovered);
 }
 
 TEST_F(KvsErrorRecovery, Init_CorruptSectors_ShouldRecoverOne) {
@@ -443,6 +456,7 @@
   EXPECT_EQ(64u, stats.in_use_bytes);
   EXPECT_EQ(0u, stats.reclaimable_bytes);
   EXPECT_EQ(3 * 512u - 64u, stats.writable_bytes);
+  EXPECT_EQ(4u, stats.corrupt_sectors_recovered);
 }
 
 TEST_F(KvsErrorRecovery, Init_CorruptKey_RevertsToPreviousVersion) {
@@ -597,8 +611,8 @@
   EXPECT_EQ(stats.in_use_bytes, (160u * kvs_.redundancy()));
   EXPECT_EQ(stats.reclaimable_bytes, 0u);
   EXPECT_EQ(stats.writable_bytes, 512u * 3 - (160 * kvs_.redundancy()));
-  EXPECT_EQ(stats.corrupt_sectors_recovered, 1u);
-  EXPECT_EQ(stats.missing_redundant_entries_recovered, 5u);
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 0u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 10u);
 }
 
 TEST_F(InitializedMultiMagicKvs, RecoversLossOfSecondSector) {
@@ -674,8 +688,8 @@
   EXPECT_EQ(stats.in_use_bytes, (192u * kvs_.redundancy()));
   EXPECT_EQ(stats.reclaimable_bytes, 0u);
   EXPECT_EQ(stats.writable_bytes, 512u * 3 - (192 * kvs_.redundancy()));
-  EXPECT_EQ(stats.corrupt_sectors_recovered, 1u);
-  EXPECT_EQ(stats.missing_redundant_entries_recovered, 6u);
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 0u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 5u);
 
   EXPECT_EQ(Status::OK,
             kvs_.Get("new key", as_writable_bytes(span(val))).status());
@@ -706,6 +720,99 @@
   EXPECT_EQ(stats.missing_redundant_entries_recovered, 5u);
 }
 
+class RedundantKvsInitializedSingleCopyData : public ::testing::Test {
+ protected:
+  static constexpr auto kInitialContents =
+      AsBytes(kEntry1, kEntry2, kEntry3, kEntry4);
+
+  RedundantKvsInitializedSingleCopyData()
+      : flash_(internal::Entry::kMinAlignmentBytes),
+        partition_(&flash_),
+        kvs_(&partition_,
+             {.magic = kMagic, .checksum = &checksum},
+             kRecoveryLazyGcOptions) {
+    partition_.Erase();
+    std::memcpy(flash_.buffer().data(),
+                kInitialContents.data(),
+                kInitialContents.size());
+
+    EXPECT_EQ(Status::OK, kvs_.Init());
+  }
+
+  FakeFlashBuffer<512, 4, 3> flash_;
+  FlashPartition partition_;
+  KeyValueStoreBuffer<kMaxEntries, kMaxUsableSectors, 2> kvs_;
+};
+
+TEST_F(RedundantKvsInitializedSingleCopyData, WriteAfterDataLoss) {
+  EXPECT_EQ(Status::OK, partition_.Erase(0, 4));
+
+  char val[20] = {};
+  EXPECT_EQ(Status::DATA_LOSS,
+            kvs_.Get("key1", as_writable_bytes(span(val))).status());
+  EXPECT_EQ(Status::DATA_LOSS,
+            kvs_.Get("k2", as_writable_bytes(span(val))).status());
+  EXPECT_EQ(Status::DATA_LOSS,
+            kvs_.Get("k3y", as_writable_bytes(span(val))).status());
+  EXPECT_EQ(Status::DATA_LOSS,
+            kvs_.Get("4k", as_writable_bytes(span(val))).status());
+
+  EXPECT_EQ(true, kvs_.error_detected());
+
+  auto stats = kvs_.GetStorageStats();
+  EXPECT_EQ(stats.in_use_bytes, (128u * kvs_.redundancy()));
+  EXPECT_EQ(stats.reclaimable_bytes, 2 * 384u);
+  EXPECT_EQ(stats.writable_bytes, 512u);
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 0u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 4u);
+
+  ASSERT_EQ(Status::DATA_LOSS, kvs_.Put("key1", 1000));
+
+  EXPECT_EQ(Status::OK, kvs_.FullMaintenance());
+  stats = kvs_.GetStorageStats();
+  EXPECT_EQ(stats.in_use_bytes, 0u);
+  EXPECT_EQ(stats.reclaimable_bytes, 0u);
+  EXPECT_EQ(stats.writable_bytes, 3 * 512u);
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 0u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 4u);
+}
+
+TEST_F(RedundantKvsInitializedSingleCopyData,
+       TwoSectorsCorruptWithGoodEntries) {
+  ASSERT_CONTAINS_ENTRY("key1", "value1");
+  ASSERT_CONTAINS_ENTRY("k2", "value2");
+  ASSERT_CONTAINS_ENTRY("k3y", "value3");
+  ASSERT_CONTAINS_ENTRY("4k", "value4");
+
+  EXPECT_EQ(false, kvs_.error_detected());
+
+  auto stats = kvs_.GetStorageStats();
+  EXPECT_EQ(stats.in_use_bytes, (128u * kvs_.redundancy()));
+  EXPECT_EQ(stats.reclaimable_bytes, 0u);
+  EXPECT_EQ(stats.writable_bytes, 3 * 512u - (128u * kvs_.redundancy()));
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 0u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 4u);
+
+  // Corrupt all the keys, alternating which copy gets corrupted.
+  flash_.buffer()[0x10] = byte(0xef);
+  flash_.buffer()[0x230] = byte(0xef);
+  flash_.buffer()[0x50] = byte(0xef);
+  flash_.buffer()[0x270] = byte(0xef);
+
+  ASSERT_CONTAINS_ENTRY("key1", "value1");
+  ASSERT_CONTAINS_ENTRY("k2", "value2");
+  ASSERT_CONTAINS_ENTRY("k3y", "value3");
+  ASSERT_CONTAINS_ENTRY("4k", "value4");
+
+  EXPECT_EQ(Status::OK, kvs_.FullMaintenance());
+  stats = kvs_.GetStorageStats();
+  EXPECT_EQ(stats.in_use_bytes, (128u * kvs_.redundancy()));
+  EXPECT_EQ(stats.reclaimable_bytes, 0u);
+  EXPECT_EQ(stats.writable_bytes, 3 * 512u - (128u * kvs_.redundancy()));
+  EXPECT_EQ(stats.corrupt_sectors_recovered, 2u);
+  EXPECT_EQ(stats.missing_redundant_entries_recovered, 8u);
+}
+
 TEST_F(InitializedMultiMagicKvs, PutNewEntry_UsesFirstFormat) {
   EXPECT_EQ(Status::OK, kvs_.Put("new key", ByteStr("abcd?")));
 
diff --git a/pw_kvs/public/pw_kvs/key_value_store.h b/pw_kvs/public/pw_kvs/key_value_store.h
index 1fa5297..f5d7d69 100644
--- a/pw_kvs/public/pw_kvs/key_value_store.h
+++ b/pw_kvs/public/pw_kvs/key_value_store.h
@@ -339,6 +339,7 @@
         "as_bytes(span(&value, 1)) or as_writable_bytes(span(&value, 1)).");
   }
 
+  void InitializeMetadata();
   Status LoadEntry(Address entry_address, Address* next_entry_address);
   Status ScanForEntry(const SectorDescriptor& sector,
                       Address start_address,
@@ -430,7 +431,7 @@
   Status GarbageCollect(span<const Address> addresses_to_skip);
 
   Status RelocateKeyAddressesInSector(SectorDescriptor& sector_to_gc,
-                                      EntryMetadata& descriptor,
+                                      const EntryMetadata& descriptor,
                                       span<const Address> addresses_to_skip);
 
   Status GarbageCollectSector(SectorDescriptor& sector_to_gc,
@@ -444,6 +445,8 @@
 
   Status EnsureEntryRedundancy();
 
+  Status FixErrors();
+
   Status Repair();
 
   internal::Entry CreateEntry(Address address,