pw_kvs: Remove deleted keys as part of heavy maintenance

Remove/garbage collect all the deleted key entries as part of
doing heavy maintenance.

Change-Id: Ie55ea51211eef1cb81e6959a819e3756091eeb5d
Co-authored-by: David Rempel <drempel@google.com>
Co-authored-by: David Rogers <davidrogers@google.com>
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/95021
Reviewed-by: Keir Mierle <keir@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
diff --git a/pw_kvs/docs.rst b/pw_kvs/docs.rst
index fd21996..fdf443a 100644
--- a/pw_kvs/docs.rst
+++ b/pw_kvs/docs.rst
@@ -157,9 +157,10 @@
 Full Maintenance does garbage collection of all sectors except those that have
 current valid KV entries.
 
-Heavy Maintenance does garbage collection of all sectors. Use strong caution
-when doing Heavy Maintenance as it can, compared to Full Maintenance, result
-in a significant amount of moving valid entries,
+Heavy Maintenance does garbage collection of all sectors, including removing
+entries for deleted keys. Use strong caution when doing Heavy Maintenance, as
+it can, compared to Full Maintenance, result in significantly more movement of
+valid entries.
 
 Garbage collection can be performed by request of higher level software or
 automatically as needed to make space available to write new entries.
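
For reference, a minimal usage sketch of the two maintenance levels described
above (illustrative only, not part of this change; RunMaintenance is a
hypothetical helper):

    #include "pw_kvs/key_value_store.h"
    #include "pw_status/status.h"

    pw::Status RunMaintenance(pw::kvs::KeyValueStore& kvs) {
      // Full Maintenance: garbage collects sectors but leaves entries for
      // deleted keys in place.
      pw::Status status = kvs.FullMaintenance();
      if (!status.ok()) {
        return status;
      }
      // Heavy Maintenance: also removes entries for deleted keys; use
      // sparingly, since it can move many valid entries.
      return kvs.HeavyMaintenance();
    }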
diff --git a/pw_kvs/entry_cache.cc b/pw_kvs/entry_cache.cc
index ece008b..bac1607 100644
--- a/pw_kvs/entry_cache.cc
+++ b/pw_kvs/entry_cache.cc
@@ -19,6 +19,7 @@
 
 #include <cinttypes>
 
+#include "pw_assert/check.h"
 #include "pw_kvs/flash_memory.h"
 #include "pw_kvs/internal/entry.h"
 #include "pw_kvs/internal/hash.h"
@@ -125,6 +126,32 @@
   return EntryMetadata(descriptors_.back(), std::span(first_address, 1));
 }
 
+// Removes an existing entry from the cache.
+EntryCache::iterator EntryCache::RemoveEntry(iterator& entry_it) {
+  PW_DCHECK_PTR_EQ(entry_it.entry_cache_, this);
+
+  const unsigned int index_to_remove =
+      entry_it.metadata_.descriptor_ - &descriptors_.front();
+  const KeyDescriptor last_desc = descriptors_[descriptors_.size() - 1];
+
+  // Since order is not important, this copies the last descriptor into the
+  // deleted descriptor's space and then pops the last entry.
+  Address* addresses_at_end = first_address(descriptors_.size() - 1);
+
+  if (index_to_remove < descriptors_.size() - 1) {
+    Address* addresses_to_remove = first_address(index_to_remove);
+    for (unsigned int i = 0; i < redundancy_; i++) {
+      addresses_to_remove[i] = addresses_at_end[i];
+    }
+    descriptors_[index_to_remove] = last_desc;
+  }
+
+  // Erase the last entry since it was copied over the entry being deleted.
+  descriptors_.pop_back();
+
+  return {this, descriptors_.data() + index_to_remove};
+}
+
 // TODO: This method is the trigger of the O(valid_entries * all_entries) time
 // complexity for reading. At some cost to memory, this could be optimized by
 // using a hash table instead of scanning, but in practice this should be fine
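
The RemoveEntry added above uses the classic swap-with-last (unordered erase)
idiom so that removal is O(1) and no descriptors need to shift. A standalone
sketch of the same idea on a plain vector (hypothetical helper, illustration
only):

    #include <cstddef>
    #include <vector>

    // Removes items[index] in O(1) without preserving order: copy the last
    // element over the removed slot, then drop the last slot. This mirrors how
    // EntryCache::RemoveEntry handles descriptors and their address arrays.
    template <typename T>
    void UnorderedErase(std::vector<T>& items, size_t index) {
      if (index < items.size() - 1) {
        items[index] = items.back();
      }
      items.pop_back();
    }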
diff --git a/pw_kvs/key_value_store.cc b/pw_kvs/key_value_store.cc
index 2b0bbe8..a3e2262 100644
--- a/pw_kvs/key_value_store.cc
+++ b/pw_kvs/key_value_store.cc
@@ -400,6 +400,36 @@
   return Status::NotFound();
 }
 
+Status KeyValueStore::RemoveDeletedKeyEntries() {
+  for (internal::EntryCache::iterator it = entry_cache_.begin();
+       it != entry_cache_.end();
+       ++it) {
+    EntryMetadata& entry_metadata = *it;
+
+    // The iterator returned by RemoveEntry may itself point to a deleted
+    // entry, so loop until it points to one that isn't deleted.
+    while (entry_metadata.state() == EntryState::kDeleted) {
+      // Read the original entry to get the size for sector accounting purposes.
+      Entry entry;
+      PW_TRY(ReadEntry(entry_metadata, entry));
+
+      for (Address address : entry_metadata.addresses()) {
+        sectors_.FromAddress(address).RemoveValidBytes(entry.size());
+      }
+
+      it = entry_cache_.RemoveEntry(it);
+
+      if (it == entry_cache_.end()) {
+        return OkStatus();  // new iterator is the end, bail
+      }
+
+      entry_metadata = *it;  // set entry_metadata to check for deletion again
+    }
+  }
+
+  return OkStatus();
+}
+
 StatusWithSize KeyValueStore::Get(Key key,
                                   std::span<byte> value_buffer,
                                   size_t offset_bytes) const {
@@ -634,6 +664,17 @@
 
 Status KeyValueStore::WriteEntryForNewKey(Key key,
                                           std::span<const byte> value) {
+  // If writes are configured to garbage collect as many sectors as needed and
+  // the entry cache is full, attempt heavy maintenance now.
+  if (options_.gc_on_write == GargbageCollectOnWrite::kAsManySectorsNeeded &&
+      entry_cache_.full()) {
+    Status maintenance_status = HeavyMaintenance();
+    if (!maintenance_status.ok()) {
+      WRN("KVS Maintenance failed for write: %s", maintenance_status.str());
+      return maintenance_status;
+    }
+  }
+
   if (entry_cache_.full()) {
     WRN("KVS full: trying to store a new entry, but can't. Have %u entries",
         unsigned(entry_cache_.total_entries()));
@@ -878,13 +919,15 @@
   INF("Beginning full maintenance");
   CheckForErrors();
 
+  // Step 1: Repair errors
   if (error_detected_) {
     PW_TRY(Repair());
   }
+
+  // Step 2: Make sure all the entries are on the primary format.
   StatusWithSize update_status = UpdateEntriesToPrimaryFormat();
   Status overall_status = update_status.status();
 
-  // Make sure all the entries are on the primary format.
   if (!overall_status.ok()) {
     ERR("Failed to update all entries to the primary format");
   }
@@ -901,26 +944,50 @@
   bool heavy = (maintenance_type == MaintenanceType::kHeavy);
   bool force_gc = heavy || over_usage_threshold || (update_status.size() > 0);
 
-  // TODO: look in to making an iterator method for cycling through sectors
-  // starting from last_new_sector_.
-  Status gc_status;
-  for (size_t j = 0; j < sectors_.size(); j++) {
-    sector += 1;
-    if (sector == sectors_.end()) {
-      sector = sectors_.begin();
-    }
+  auto do_garbage_collect_pass = [&]() {
+    // TODO: look in to making an iterator method for cycling through
+    // sectors starting from last_new_sector_.
+    Status gc_status;
+    for (size_t j = 0; j < sectors_.size(); j++) {
+      sector += 1;
+      if (sector == sectors_.end()) {
+        sector = sectors_.begin();
+      }
 
-    if (sector->RecoverableBytes(partition_.sector_size_bytes()) > 0 &&
-        (force_gc || sector->valid_bytes() == 0)) {
-      gc_status = GarbageCollectSector(*sector, {});
-      if (!gc_status.ok()) {
-        ERR("Failed to garbage collect all sectors");
-        break;
+      if (sector->RecoverableBytes(partition_.sector_size_bytes()) > 0 &&
+          (force_gc || sector->valid_bytes() == 0)) {
+        gc_status = GarbageCollectSector(*sector, {});
+        if (!gc_status.ok()) {
+          ERR("Failed to garbage collect all sectors");
+          break;
+        }
       }
     }
-  }
-  if (overall_status.ok()) {
-    overall_status = gc_status;
+    if (overall_status.ok()) {
+      overall_status = gc_status;
+    }
+  };
+
+  // Step 3: Do a full garbage collect pass over all sectors. This erases all
+  // old/stale entries from flash and leaves only current/valid entries.
+  do_garbage_collect_pass();
+
+  // Step 4: (if heavy maintenance) garbage collect all the deleted keys.
+  if (heavy) {
+    // Remove deleted keys from the entry cache, including freeing sector bytes
+    // used by those keys. This must only be done directly after a full garbage
+    // collection; otherwise the current deleted entry could be garbage
+    // collected before the older stale entry, opening a window for an
+    // invalid/corrupted KVS state if a power fault, crash, or other
+    // interruption occurs.
+    Status rdk_status = RemoveDeletedKeyEntries();
+    overall_status.Update(rdk_status);
+
+    // Do another garbage collect pass that will fully remove the deleted keys
+    // from flash. Garbage collection will only touch sectors that have
+    // something to collect, which at this point is only the sectors containing
+    // deleted keys.
+    do_garbage_collect_pass();
   }
 
   if (overall_status.ok()) {
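
The new branch in WriteEntryForNewKey only attempts heavy maintenance when the
KVS is configured to garbage collect as many sectors as needed on write. A
configuration sketch (illustrative only; the partition, format, and template
sizes are placeholders, and it assumes the Options overload of the
KeyValueStoreBuffer constructor):

    pw::kvs::Options options;
    options.gc_on_write =
        pw::kvs::GargbageCollectOnWrite::kAsManySectorsNeeded;

    pw::kvs::KeyValueStoreBuffer<64, 8> kvs(&partition, format, options);
    PW_CHECK_OK(kvs.Init());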
diff --git a/pw_kvs/key_value_store_fuzz_test.cc b/pw_kvs/key_value_store_fuzz_test.cc
index d35a128..4b14ed1 100644
--- a/pw_kvs/key_value_store_fuzz_test.cc
+++ b/pw_kvs/key_value_store_fuzz_test.cc
@@ -80,7 +80,8 @@
       // Rewrite a single key many times, can fill up a sector
       ASSERT_EQ(OkStatus(), kvs_.Put("some_data", j));
     }
-    // Delete and re-add everything
+
+    // Delete and re-add everything except "some_data".
     ASSERT_EQ(OkStatus(), kvs_.Delete(key1));
     ASSERT_EQ(OkStatus(), kvs_.Put(key1, std::span(buf1, size1)));
     ASSERT_EQ(OkStatus(), kvs_.Delete(key2));
@@ -105,4 +106,98 @@
   }
 }
 
+TEST(KvsFuzz, FuzzTestWithGC) {
+  FlashPartition& test_partition = FlashTestPartition();
+  ASSERT_EQ(OkStatus(), test_partition.Erase());
+
+  KeyValueStoreBuffer<kMaxEntries, kMaxUsableSectors> kvs_(&test_partition,
+                                                           default_format);
+
+  ASSERT_EQ(OkStatus(), kvs_.Init());
+
+  if (test_partition.sector_size_bytes() < 4 * 1024 ||
+      test_partition.sector_count() < 4) {
+    PW_LOG_INFO("Sectors too small, skipping test.");
+    return;  // TODO: Test could be generalized
+  }
+  const char* key1 = "Buf1";
+  const char* key2 = "Buf2";
+  const size_t kLargestBufSize = 3 * 1024;
+  static byte buf1[kLargestBufSize];
+  static byte buf2[kLargestBufSize];
+  std::memset(buf1, 1, sizeof(buf1));
+  std::memset(buf2, 2, sizeof(buf2));
+
+  // Start with things in KVS
+  ASSERT_EQ(OkStatus(), kvs_.Put(key1, buf1));
+  ASSERT_EQ(OkStatus(), kvs_.Put(key2, buf2));
+  for (size_t j = 0; j < keys.size(); j++) {
+    ASSERT_EQ(OkStatus(), kvs_.Put(keys[j], j));
+  }
+
+  for (size_t i = 0; i < 100; i++) {
+    // Vary two sizes
+    size_t size1 = (kLargestBufSize) / (i + 1);
+    size_t size2 = (kLargestBufSize) / (100 - i);
+    for (size_t j = 0; j < 50; j++) {
+      // Rewrite a single key many times, can fill up a sector
+      ASSERT_EQ(OkStatus(), kvs_.Put("some_data", j));
+    }
+
+    // Delete and re-add everything except "some_data".
+    ASSERT_EQ(OkStatus(), kvs_.Delete(key1));
+    ASSERT_EQ(OkStatus(), kvs_.Put(key1, std::span(buf1, size1)));
+    ASSERT_EQ(OkStatus(), kvs_.Delete(key2));
+
+    // Throw some heavy maintenance in the middle to trigger some GC before
+    // moving forward.
+    EXPECT_EQ(OkStatus(), kvs_.HeavyMaintenance());
+
+    // check for expected stats
+    KeyValueStore::StorageStats stats = kvs_.GetStorageStats();
+    EXPECT_GT(stats.sector_erase_count, 1u);
+    EXPECT_EQ(stats.reclaimable_bytes, 0u);
+
+    // Write, read, and delete a rotating key kMaxEntries * 4 times. This
+    // exercises garbage collection on write.
+    for (size_t j = 0; j < kMaxEntries * 4; j++) {
+      size_t readj;
+      StringBuffer<6> keyVal;
+      keyVal << j;
+      ASSERT_EQ(OkStatus(), kvs_.Put(keyVal.c_str(), j));
+      ASSERT_EQ(OkStatus(), kvs_.Get(keyVal.c_str(), &readj));
+      ASSERT_EQ(j, readj);
+      ASSERT_EQ(OkStatus(), kvs_.Delete(keyVal.c_str()));
+      ASSERT_EQ(Status::NotFound(), kvs_.Get(keyVal.c_str(), &readj));
+    }
+
+    // The KVS should contain key1, "some_data", and all of keys[].
+    ASSERT_EQ(kvs_.size(), 2u + keys.size());
+
+    ASSERT_EQ(OkStatus(), kvs_.Put(key2, std::span(buf2, size2)));
+    for (size_t j = 0; j < keys.size(); j++) {
+      ASSERT_EQ(OkStatus(), kvs_.Delete(keys[j]));
+      ASSERT_EQ(OkStatus(), kvs_.Put(keys[j], j));
+    }
+
+    // Do some more heavy maintenance, ensure we have the right number
+    // of keys.
+    EXPECT_EQ(OkStatus(), kvs_.HeavyMaintenance());
+    // The KVS should contain key1, key2, "some_data", and all of keys[].
+    ASSERT_EQ(kvs_.size(), 3u + keys.size());
+
+    // Re-init and verify (final check on store).
+    ASSERT_EQ(OkStatus(), kvs_.Init());
+    static byte buf[4 * 1024];
+    ASSERT_EQ(OkStatus(), kvs_.Get(key1, std::span(buf, size1)).status());
+    ASSERT_EQ(std::memcmp(buf, buf1, size1), 0);
+    ASSERT_EQ(OkStatus(), kvs_.Get(key2, std::span(buf, size2)).status());
+    ASSERT_EQ(std::memcmp(buf, buf2, size2), 0);
+    for (size_t j = 0; j < keys.size(); j++) {
+      size_t ret = 1000;
+      ASSERT_EQ(OkStatus(), kvs_.Get(keys[j], &ret));
+      ASSERT_EQ(ret, j);
+    }
+  }
+}
 }  // namespace pw::kvs
diff --git a/pw_kvs/key_value_store_test.cc b/pw_kvs/key_value_store_test.cc
index 004684e..0aaea1f 100644
--- a/pw_kvs/key_value_store_test.cc
+++ b/pw_kvs/key_value_store_test.cc
@@ -413,6 +413,57 @@
   EXPECT_EQ(stats.reclaimable_bytes, 0u);
 }
 
+TEST_F(LargeEmptyInitializedKvs, KeyDeletionMaintenance) {
+  const uint8_t kValue1 = 0xDA;
+  const uint8_t kValue2 = 0x12;
+  uint8_t val = 0;
+
+  // Write and delete a key. The key should be gone, but its deleted entry remains.
+  ASSERT_EQ(OkStatus(), kvs_.Put(keys[0], kValue1));
+  ASSERT_EQ(kvs_.size(), 1u);
+  ASSERT_EQ(OkStatus(), kvs_.Delete(keys[0]));
+
+  // Ensure the key is indeed gone and only its deleted entry remains.
+  ASSERT_EQ(kvs_.Get(keys[0], &val), Status::NotFound());
+  ASSERT_EQ(kvs_.size(), 0u);
+  ASSERT_EQ(kvs_.total_entries_with_deleted(), 1u);
+
+  KeyValueStore::StorageStats stats = kvs_.GetStorageStats();
+  EXPECT_EQ(stats.sector_erase_count, 0u);
+  EXPECT_GT(stats.reclaimable_bytes, 0u);
+
+  // Do HeavyMaintenance, which should GC the sectors and remove the deleted
+  // key entry, resulting in no reclaimable bytes and erased sectors.
+  EXPECT_EQ(OkStatus(), kvs_.HeavyMaintenance());
+  stats = kvs_.GetStorageStats();
+  EXPECT_GT(stats.sector_erase_count, 1u);
+  EXPECT_EQ(stats.reclaimable_bytes, 0u);
+  ASSERT_EQ(kvs_.size(), 0u);
+  ASSERT_EQ(kvs_.total_entries_with_deleted(), 0u);
+
+  // Do it again but with 2 keys and keep one.
+  ASSERT_EQ(OkStatus(), kvs_.Put(keys[0], kValue1));
+  ASSERT_EQ(OkStatus(), kvs_.Put(keys[1], kValue2));
+  ASSERT_EQ(kvs_.size(), 2u);
+  ASSERT_EQ(OkStatus(), kvs_.Delete(keys[0]));
+
+  // Ensure keys[0] is indeed gone and the size is 1 before continuing.
+  ASSERT_EQ(kvs_.Get(keys[0], &val), Status::NotFound());
+  ASSERT_EQ(kvs_.size(), 1u);
+  ASSERT_EQ(kvs_.total_entries_with_deleted(), 2u);
+
+  // Do HeavyMaintenance again, which should remove the deleted key entry
+  // while preserving the remaining valid key.
+  EXPECT_EQ(OkStatus(), kvs_.HeavyMaintenance());
+  stats = kvs_.GetStorageStats();
+  ASSERT_EQ(kvs_.size(), 1u);
+  ASSERT_EQ(kvs_.total_entries_with_deleted(), 1u);
+
+  // Read back the second key to make sure it is still valid.
+  ASSERT_EQ(kvs_.Get(keys[1], &val), OkStatus());
+  ASSERT_EQ(val, kValue2);
+}
+
 TEST(InMemoryKvs, Put_MaxValueSize) {
   // Create and erase the fake flash.
   Flash flash;
diff --git a/pw_kvs/public/pw_kvs/internal/entry_cache.h b/pw_kvs/public/pw_kvs/internal/entry_cache.h
index ae2acb7..a4a5414 100644
--- a/pw_kvs/public/pw_kvs/internal/entry_cache.h
+++ b/pw_kvs/public/pw_kvs/internal/entry_cache.h
@@ -188,6 +188,10 @@
                                 Address address,
                                 size_t sector_size_bytes) const;
 
+  // Removes an existing entry from the cache. Returns an iterator to the
+  // next entry so that iteration can continue.
+  iterator RemoveEntry(iterator& entry_it);
+
   // Returns a pointer to an array of redundancy() addresses for temporary use.
   // This is used by the KeyValueStore to track reserved addresses when finding
   // space for a new entry.
diff --git a/pw_kvs/public/pw_kvs/key_value_store.h b/pw_kvs/public/pw_kvs/key_value_store.h
index 086dcb8..d59aae4 100644
--- a/pw_kvs/public/pw_kvs/key_value_store.h
+++ b/pw_kvs/public/pw_kvs/key_value_store.h
@@ -305,6 +305,11 @@
   // Returns the number of valid entries in the KeyValueStore.
   size_t size() const { return entry_cache_.present_entries(); }
 
+  // Returns the number of valid entries plus deleted entries not yet collected.
+  size_t total_entries_with_deleted() const {
+    return entry_cache_.total_entries();
+  }
+
   size_t max_size() const { return entry_cache_.max_entries(); }
 
   size_t empty() const { return size() == 0u; }
@@ -383,6 +388,14 @@
                       Address start_address,
                       Address* next_entry_address);
 
+  // Remove deleted keys from the entry cache, including freeing sector bytes
+  // used by those keys. This must only be done directly after a full garbage
+  // collection; otherwise the current deleted entry could be garbage
+  // collected before the older stale entry, opening a window for an
+  // invalid/corrupted KVS state if a power fault, crash, or other
+  // interruption occurs.
+  Status RemoveDeletedKeyEntries();
+
   Status PutBytes(Key key, std::span<const std::byte> value);
 
   StatusWithSize ValueSize(const EntryMetadata& metadata) const;
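
Taken together, the new total_entries_with_deleted() accessor and
RemoveDeletedKeyEntries() give the behavior exercised by the
KeyDeletionMaintenance test above. A condensed sketch (illustrative only;
assumes an initialized KeyValueStore named kvs and a hypothetical key name):

    PW_CHECK_OK(kvs.Put("my_key", uint8_t{0xDA}));
    PW_CHECK_OK(kvs.Delete("my_key"));
    // The key no longer counts as valid, but its deleted entry is still
    // cached: kvs.size() == 0, kvs.total_entries_with_deleted() == 1.
    PW_CHECK_OK(kvs.HeavyMaintenance());
    // Heavy maintenance garbage collects the deleted entry:
    // kvs.total_entries_with_deleted() == 0.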