upb: replumb upb_Arena to be substantially more opaque

PiperOrigin-RevId: 592345066
diff --git a/upb/BUILD b/upb/BUILD
index cc55caa..180cc77 100644
--- a/upb/BUILD
+++ b/upb/BUILD
@@ -202,12 +202,6 @@
 )
 
 alias(
-    name = "mem_internal",
-    actual = "//upb/mem:internal",
-    visibility = ["//visibility:public"],
-)
-
-alias(
     name = "message",
     actual = "//upb/message",
     visibility = ["//visibility:public"],
@@ -403,7 +397,6 @@
         ":hash",
         ":lex",
         ":mem",
-        ":mem_internal",
         ":message",
         ":message_accessors",
         ":message_copy",
@@ -454,7 +447,6 @@
         ":json",
         ":lex",
         ":mem",
-        ":mem_internal",
         ":message",
         ":message_accessors",
         ":message_copy",
@@ -505,7 +497,6 @@
         ":json",
         ":lex",
         ":mem",
-        ":mem_internal",
         ":message",
         ":message_accessors",
         ":message_copy",
diff --git a/upb/mem/BUILD b/upb/mem/BUILD
index f6bc19e..f4b3226 100644
--- a/upb/mem/BUILD
+++ b/upb/mem/BUILD
@@ -9,32 +9,18 @@
 
 cc_library(
     name = "mem",
+    srcs = [
+        "alloc.c",
+        "arena.c",
+    ],
     hdrs = [
         "alloc.h",
         "arena.h",
         "arena.hpp",
-    ],
-    copts = UPB_DEFAULT_COPTS,
-    visibility = ["//visibility:public"],
-    deps = [
-        ":internal",
-        "//upb:port",
-    ],
-)
-
-cc_library(
-    name = "internal",
-    srcs = [
-        "alloc.c",
-        "alloc.h",
-        "arena.c",
-        "arena.h",
-    ],
-    hdrs = [
         "internal/arena.h",
     ],
     copts = UPB_DEFAULT_COPTS,
-    visibility = ["//upb:__pkg__"],
+    visibility = ["//visibility:public"],
     deps = [
         "//upb:port",
     ],
@@ -44,9 +30,8 @@
     name = "arena_test",
     srcs = ["arena_test.cc"],
     deps = [
+        ":mem",
         "@com_google_googletest//:gtest_main",
-        "//upb:mem",
-        "//upb:mem_internal",
         "//upb:port",
         "@com_google_absl//absl/random",
         "@com_google_absl//absl/random:distributions",
diff --git a/upb/mem/arena.c b/upb/mem/arena.c
index 1bb1b01..a01421e 100644
--- a/upb/mem/arena.c
+++ b/upb/mem/arena.c
@@ -17,21 +17,61 @@
 // Must be last.
 #include "upb/port/def.inc"
 
-struct _upb_MemBlock {
+typedef struct upb_MemBlock {
   // Atomic only for the benefit of SpaceAllocated().
-  UPB_ATOMIC(_upb_MemBlock*) next;
+  UPB_ATOMIC(struct upb_MemBlock*) next;
   uint32_t size;
   // Data follows.
-};
+} upb_MemBlock;
 
-static const size_t kUpb_MemblockReserve =
-    UPB_ALIGN_UP(sizeof(_upb_MemBlock), UPB_MALLOC_ALIGN);
+typedef struct upb_ArenaInternal {
+  // upb_alloc* together with a low bit which signals if there is an initial
+  // block.
+  uintptr_t block_alloc;
+
+  // When multiple arenas are fused together, each arena points to a parent
+  // arena (root points to itself). The root tracks how many live arenas
+  // reference it.
+
+  // The low bit is tagged:
+  //   0: pointer to parent
+  //   1: count, left shifted by one
+  UPB_ATOMIC(uintptr_t) parent_or_count;
+
+  // All nodes that are fused together are in a singly-linked list.
+  // == NULL at end of list.
+  UPB_ATOMIC(struct upb_ArenaInternal*) next;
+
+  // The last element of the linked list. This is present only as an
+  // optimization, so that we do not have to iterate over all members for every
+  // fuse.  Only significant for an arena root. In other cases it is ignored.
+  // == self when no other list members.
+  UPB_ATOMIC(struct upb_ArenaInternal*) tail;
+
+  // Linked list of blocks to free/cleanup. Atomic only for the benefit of
+  // upb_Arena_SpaceAllocated().
+  UPB_ATOMIC(upb_MemBlock*) blocks;
+} upb_ArenaInternal;
+
+// All public + private state for an arena.
+typedef struct {
+  upb_Arena head;
+  upb_ArenaInternal body;
+} upb_ArenaState;
 
 typedef struct {
-  upb_Arena* root;
+  upb_ArenaInternal* root;
   uintptr_t tagged_count;
 } upb_ArenaRoot;
 
+static const size_t kUpb_MemblockReserve =
+    UPB_ALIGN_UP(sizeof(upb_MemBlock), UPB_MALLOC_ALIGN);
+
+// Extracts the (upb_ArenaInternal*) from a (upb_Arena*)
+static upb_ArenaInternal* upb_Arena_Internal(const upb_Arena* a) {
+  return &((upb_ArenaState*)a)->body;
+}
+
 static bool _upb_Arena_IsTaggedRefcount(uintptr_t parent_or_count) {
   return (parent_or_count & 1) == 1;
 }
@@ -51,19 +91,20 @@
   return parent_or_count;
 }
 
-static upb_Arena* _upb_Arena_PointerFromTagged(uintptr_t parent_or_count) {
+static upb_ArenaInternal* _upb_Arena_PointerFromTagged(
+    uintptr_t parent_or_count) {
   UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
-  return (upb_Arena*)parent_or_count;
+  return (upb_ArenaInternal*)parent_or_count;
 }
 
-static uintptr_t _upb_Arena_TaggedFromPointer(upb_Arena* a) {
-  uintptr_t parent_or_count = (uintptr_t)a;
+static uintptr_t _upb_Arena_TaggedFromPointer(upb_ArenaInternal* ai) {
+  uintptr_t parent_or_count = (uintptr_t)ai;
   UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
   return parent_or_count;
 }
 
-static upb_alloc* _upb_Arena_BlockAlloc(upb_Arena* arena) {
-  return (upb_alloc*)(arena->block_alloc & ~0x1);
+static upb_alloc* _upb_ArenaInternal_BlockAlloc(upb_ArenaInternal* ai) {
+  return (upb_alloc*)(ai->block_alloc & ~0x1);
 }
 
 static uintptr_t _upb_Arena_MakeBlockAlloc(upb_alloc* alloc, bool has_initial) {
@@ -72,15 +113,16 @@
   return alloc_uint | (has_initial ? 1 : 0);
 }
 
-static bool _upb_Arena_HasInitialBlock(upb_Arena* arena) {
-  return arena->block_alloc & 0x1;
+static bool _upb_ArenaInternal_HasInitialBlock(upb_ArenaInternal* ai) {
+  return ai->block_alloc & 0x1;
 }
 
 static upb_ArenaRoot _upb_Arena_FindRoot(upb_Arena* a) {
-  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
+  uintptr_t poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
   while (_upb_Arena_IsTaggedPointer(poc)) {
-    upb_Arena* next = _upb_Arena_PointerFromTagged(poc);
-    UPB_ASSERT(a != next);
+    upb_ArenaInternal* next = _upb_Arena_PointerFromTagged(poc);
+    UPB_ASSERT(ai != next);
     uintptr_t next_poc =
         upb_Atomic_Load(&next->parent_or_count, memory_order_acquire);
 
@@ -104,64 +146,67 @@
       // further away over time, but the path towards that root will continue to
       // be valid and the creation of the path carries all the memory orderings
       // required.
-      UPB_ASSERT(a != _upb_Arena_PointerFromTagged(next_poc));
-      upb_Atomic_Store(&a->parent_or_count, next_poc, memory_order_relaxed);
+      UPB_ASSERT(ai != _upb_Arena_PointerFromTagged(next_poc));
+      upb_Atomic_Store(&ai->parent_or_count, next_poc, memory_order_relaxed);
     }
-    a = next;
+    ai = next;
     poc = next_poc;
   }
-  return (upb_ArenaRoot){.root = a, .tagged_count = poc};
+  return (upb_ArenaRoot){.root = ai, .tagged_count = poc};
 }
 
 size_t upb_Arena_SpaceAllocated(upb_Arena* arena) {
-  arena = _upb_Arena_FindRoot(arena).root;
+  upb_ArenaInternal* ai = _upb_Arena_FindRoot(arena).root;
   size_t memsize = 0;
 
-  while (arena != NULL) {
-    _upb_MemBlock* block =
-        upb_Atomic_Load(&arena->blocks, memory_order_relaxed);
+  while (ai != NULL) {
+    upb_MemBlock* block = upb_Atomic_Load(&ai->blocks, memory_order_relaxed);
     while (block != NULL) {
-      memsize += sizeof(_upb_MemBlock) + block->size;
+      memsize += sizeof(upb_MemBlock) + block->size;
       block = upb_Atomic_Load(&block->next, memory_order_relaxed);
     }
-    arena = upb_Atomic_Load(&arena->next, memory_order_relaxed);
+    ai = upb_Atomic_Load(&ai->next, memory_order_relaxed);
   }
 
   return memsize;
 }
 
 uint32_t upb_Arena_DebugRefCount(upb_Arena* a) {
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
   // These loads could probably be relaxed, but given that this is debug-only,
   // it's not worth introducing a new variant for it.
-  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
+  uintptr_t poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
   while (_upb_Arena_IsTaggedPointer(poc)) {
-    a = _upb_Arena_PointerFromTagged(poc);
-    poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
+    ai = _upb_Arena_PointerFromTagged(poc);
+    poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
   }
   return _upb_Arena_RefCountFromTagged(poc);
 }
 
 static void _upb_Arena_AddBlock(upb_Arena* a, void* ptr, size_t size) {
-  _upb_MemBlock* block = ptr;
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
+  upb_MemBlock* block = ptr;
 
   // Insert into linked list.
   block->size = (uint32_t)size;
-  upb_Atomic_Init(&block->next, a->blocks);
-  upb_Atomic_Store(&a->blocks, block, memory_order_release);
+  upb_Atomic_Init(&block->next, ai->blocks);
+  upb_Atomic_Store(&ai->blocks, block, memory_order_release);
 
-  a->head.UPB_PRIVATE(ptr) = UPB_PTR_AT(block, kUpb_MemblockReserve, char);
-  a->head.UPB_PRIVATE(end) = UPB_PTR_AT(block, size, char);
+  a->UPB_PRIVATE(ptr) = UPB_PTR_AT(block, kUpb_MemblockReserve, char);
+  a->UPB_PRIVATE(end) = UPB_PTR_AT(block, size, char);
 
-  UPB_POISON_MEMORY_REGION(a->head.UPB_PRIVATE(ptr),
-                           a->head.UPB_PRIVATE(end) - a->head.UPB_PRIVATE(ptr));
+  UPB_POISON_MEMORY_REGION(a->UPB_PRIVATE(ptr),
+                           a->UPB_PRIVATE(end) - a->UPB_PRIVATE(ptr));
 }
 
 static bool _upb_Arena_AllocBlock(upb_Arena* a, size_t size) {
-  if (!a->block_alloc) return false;
-  _upb_MemBlock* last_block = upb_Atomic_Load(&a->blocks, memory_order_acquire);
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
+  if (!ai->block_alloc) return false;
+  upb_MemBlock* last_block = upb_Atomic_Load(&ai->blocks, memory_order_acquire);
   size_t last_size = last_block != NULL ? last_block->size : 128;
   size_t block_size = UPB_MAX(size, last_size * 2) + kUpb_MemblockReserve;
-  _upb_MemBlock* block = upb_malloc(_upb_Arena_BlockAlloc(a), block_size);
+  upb_MemBlock* block =
+      upb_malloc(_upb_ArenaInternal_BlockAlloc(ai), block_size);
 
   if (!block) return false;
   _upb_Arena_AddBlock(a, block, block_size);
@@ -175,8 +220,9 @@
 }
 
 static upb_Arena* _upb_Arena_InitSlow(upb_alloc* alloc) {
-  const size_t first_block_overhead = sizeof(upb_Arena) + kUpb_MemblockReserve;
-  upb_Arena* a;
+  const size_t first_block_overhead =
+      sizeof(upb_ArenaState) + kUpb_MemblockReserve;
+  upb_ArenaState* a;
 
   // We need to malloc the initial block.
   char* mem;
@@ -185,22 +231,23 @@
     return NULL;
   }
 
-  a = UPB_PTR_AT(mem, n - sizeof(*a), upb_Arena);
-  n -= sizeof(*a);
+  a = UPB_PTR_AT(mem, n - sizeof(upb_ArenaState), upb_ArenaState);
+  n -= sizeof(upb_ArenaState);
 
-  a->block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 0);
-  upb_Atomic_Init(&a->parent_or_count, _upb_Arena_TaggedFromRefcount(1));
-  upb_Atomic_Init(&a->next, NULL);
-  upb_Atomic_Init(&a->tail, a);
-  upb_Atomic_Init(&a->blocks, NULL);
+  a->body.block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 0);
+  upb_Atomic_Init(&a->body.parent_or_count, _upb_Arena_TaggedFromRefcount(1));
+  upb_Atomic_Init(&a->body.next, NULL);
+  upb_Atomic_Init(&a->body.tail, &a->body);
+  upb_Atomic_Init(&a->body.blocks, NULL);
 
-  _upb_Arena_AddBlock(a, mem, n);
+  _upb_Arena_AddBlock(&a->head, mem, n);
 
-  return a;
+  return &a->head;
 }
 
 upb_Arena* upb_Arena_Init(void* mem, size_t n, upb_alloc* alloc) {
-  upb_Arena* a;
+  UPB_ASSERT(sizeof(void*) * UPB_ARENA_SIZE_HACK >= sizeof(upb_ArenaState));
+  upb_ArenaState* a;
 
   if (n) {
     /* Align initial pointer up so that we return properly-aligned pointers. */
@@ -212,63 +259,65 @@
 
   /* Round block size down to alignof(*a) since we will allocate the arena
    * itself at the end. */
-  n = UPB_ALIGN_DOWN(n, UPB_ALIGN_OF(upb_Arena));
+  n = UPB_ALIGN_DOWN(n, UPB_ALIGN_OF(upb_ArenaState));
 
-  if (UPB_UNLIKELY(n < sizeof(upb_Arena))) {
+  if (UPB_UNLIKELY(n < sizeof(upb_ArenaState))) {
     return _upb_Arena_InitSlow(alloc);
   }
 
-  a = UPB_PTR_AT(mem, n - sizeof(*a), upb_Arena);
+  a = UPB_PTR_AT(mem, n - sizeof(upb_ArenaState), upb_ArenaState);
 
-  upb_Atomic_Init(&a->parent_or_count, _upb_Arena_TaggedFromRefcount(1));
-  upb_Atomic_Init(&a->next, NULL);
-  upb_Atomic_Init(&a->tail, a);
-  upb_Atomic_Init(&a->blocks, NULL);
-  a->block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 1);
+  upb_Atomic_Init(&a->body.parent_or_count, _upb_Arena_TaggedFromRefcount(1));
+  upb_Atomic_Init(&a->body.next, NULL);
+  upb_Atomic_Init(&a->body.tail, &a->body);
+  upb_Atomic_Init(&a->body.blocks, NULL);
+
+  a->body.block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 1);
   a->head.UPB_PRIVATE(ptr) = mem;
-  a->head.UPB_PRIVATE(end) = UPB_PTR_AT(mem, n - sizeof(*a), char);
+  a->head.UPB_PRIVATE(end) = UPB_PTR_AT(mem, n - sizeof(upb_ArenaState), char);
 
-  return a;
+  return &a->head;
 }
 
-static void _upb_Arena_DoFree(upb_Arena* a) {
-  UPB_ASSERT(_upb_Arena_RefCountFromTagged(a->parent_or_count) == 1);
+static void _upb_Arena_DoFree(upb_ArenaInternal* ai) {
+  UPB_ASSERT(_upb_Arena_RefCountFromTagged(ai->parent_or_count) == 1);
 
-  while (a != NULL) {
+  while (ai != NULL) {
     // Load first since arena itself is likely from one of its blocks.
-    upb_Arena* next_arena =
-        (upb_Arena*)upb_Atomic_Load(&a->next, memory_order_acquire);
-    upb_alloc* block_alloc = _upb_Arena_BlockAlloc(a);
-    _upb_MemBlock* block = upb_Atomic_Load(&a->blocks, memory_order_acquire);
+    upb_ArenaInternal* next_arena =
+        (upb_ArenaInternal*)upb_Atomic_Load(&ai->next, memory_order_acquire);
+    upb_alloc* block_alloc = _upb_ArenaInternal_BlockAlloc(ai);
+    upb_MemBlock* block = upb_Atomic_Load(&ai->blocks, memory_order_acquire);
     while (block != NULL) {
       // Load first since we are deleting block.
-      _upb_MemBlock* next_block =
+      upb_MemBlock* next_block =
           upb_Atomic_Load(&block->next, memory_order_acquire);
       upb_free(block_alloc, block);
       block = next_block;
     }
-    a = next_arena;
+    ai = next_arena;
   }
 }
 
 void upb_Arena_Free(upb_Arena* a) {
-  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
+  uintptr_t poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
 retry:
   while (_upb_Arena_IsTaggedPointer(poc)) {
-    a = _upb_Arena_PointerFromTagged(poc);
-    poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
+    ai = _upb_Arena_PointerFromTagged(poc);
+    poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
   }
 
   // compare_exchange or fetch_sub are RMW operations, which are more
   // expensive then direct loads.  As an optimization, we only do RMW ops
   // when we need to update things for other threads to see.
   if (poc == _upb_Arena_TaggedFromRefcount(1)) {
-    _upb_Arena_DoFree(a);
+    _upb_Arena_DoFree(ai);
     return;
   }
 
   if (upb_Atomic_CompareExchangeWeak(
-          &a->parent_or_count, &poc,
+          &ai->parent_or_count, &poc,
           _upb_Arena_TaggedFromRefcount(_upb_Arena_RefCountFromTagged(poc) - 1),
           memory_order_release, memory_order_acquire)) {
     // We were >1 and we decremented it successfully, so we are done.
@@ -280,12 +329,14 @@
   goto retry;
 }
 
-static void _upb_Arena_DoFuseArenaLists(upb_Arena* const parent,
-                                        upb_Arena* child) {
-  upb_Arena* parent_tail = upb_Atomic_Load(&parent->tail, memory_order_relaxed);
+static void _upb_Arena_DoFuseArenaLists(upb_ArenaInternal* const parent,
+                                        upb_ArenaInternal* child) {
+  upb_ArenaInternal* parent_tail =
+      upb_Atomic_Load(&parent->tail, memory_order_relaxed);
+
   do {
     // Our tail might be stale, but it will always converge to the true tail.
-    upb_Arena* parent_tail_next =
+    upb_ArenaInternal* parent_tail_next =
         upb_Atomic_Load(&parent_tail->next, memory_order_relaxed);
     while (parent_tail_next != NULL) {
       parent_tail = parent_tail_next;
@@ -293,7 +344,7 @@
           upb_Atomic_Load(&parent_tail->next, memory_order_relaxed);
     }
 
-    upb_Arena* displaced =
+    upb_ArenaInternal* displaced =
         upb_Atomic_Exchange(&parent_tail->next, child, memory_order_relaxed);
     parent_tail = upb_Atomic_Load(&child->tail, memory_order_relaxed);
 
@@ -305,8 +356,8 @@
   upb_Atomic_Store(&parent->tail, parent_tail, memory_order_relaxed);
 }
 
-static upb_Arena* _upb_Arena_DoFuse(upb_Arena* a1, upb_Arena* a2,
-                                    uintptr_t* ref_delta) {
+static upb_ArenaInternal* _upb_Arena_DoFuse(upb_Arena* a1, upb_Arena* a2,
+                                            uintptr_t* ref_delta) {
   // `parent_or_count` has two disctint modes
   // -  parent pointer mode
   // -  refcount mode
@@ -364,7 +415,8 @@
   return r1.root;
 }
 
-static bool _upb_Arena_FixupRefs(upb_Arena* new_root, uintptr_t ref_delta) {
+static bool _upb_Arena_FixupRefs(upb_ArenaInternal* new_root,
+                                 uintptr_t ref_delta) {
   if (ref_delta == 0) return true;  // No fixup required.
   uintptr_t poc =
       upb_Atomic_Load(&new_root->parent_or_count, memory_order_relaxed);
@@ -379,28 +431,33 @@
 bool upb_Arena_Fuse(upb_Arena* a1, upb_Arena* a2) {
   if (a1 == a2) return true;  // trivial fuse
 
+  upb_ArenaInternal* ai1 = upb_Arena_Internal(a1);
+  upb_ArenaInternal* ai2 = upb_Arena_Internal(a2);
+
   // Do not fuse initial blocks since we cannot lifetime extend them.
   // Any other fuse scenario is allowed.
-  if (_upb_Arena_HasInitialBlock(a1) || _upb_Arena_HasInitialBlock(a2)) {
+  if (_upb_ArenaInternal_HasInitialBlock(ai1) ||
+      _upb_ArenaInternal_HasInitialBlock(ai2)) {
     return false;
   }
 
   // The number of refs we ultimately need to transfer to the new root.
   uintptr_t ref_delta = 0;
   while (true) {
-    upb_Arena* new_root = _upb_Arena_DoFuse(a1, a2, &ref_delta);
+    upb_ArenaInternal* new_root = _upb_Arena_DoFuse(a1, a2, &ref_delta);
     if (new_root != NULL && _upb_Arena_FixupRefs(new_root, ref_delta)) {
       return true;
     }
   }
 }
 
-bool upb_Arena_IncRefFor(upb_Arena* arena, const void* owner) {
+bool upb_Arena_IncRefFor(upb_Arena* a, const void* owner) {
+  upb_ArenaInternal* ai = upb_Arena_Internal(a);
+  if (_upb_ArenaInternal_HasInitialBlock(ai)) return false;
   upb_ArenaRoot r;
-  if (_upb_Arena_HasInitialBlock(arena)) return false;
 
 retry:
-  r = _upb_Arena_FindRoot(arena);
+  r = _upb_Arena_FindRoot(a);
   if (upb_Atomic_CompareExchangeWeak(
           &r.root->parent_or_count, &r.tagged_count,
           _upb_Arena_TaggedFromRefcount(
@@ -413,6 +470,23 @@
   goto retry;
 }
 
-void upb_Arena_DecRefFor(upb_Arena* arena, const void* owner) {
-  upb_Arena_Free(arena);
+void upb_Arena_DecRefFor(upb_Arena* a, const void* owner) { upb_Arena_Free(a); }
+
+void UPB_PRIVATE(_upb_Arena_SwapIn)(upb_Arena* des, const upb_Arena* src) {
+  upb_ArenaInternal* desi = upb_Arena_Internal(des);
+  upb_ArenaInternal* srci = upb_Arena_Internal(src);
+
+  *des = *src;
+  desi->block_alloc = srci->block_alloc;
+  upb_MemBlock* blocks = upb_Atomic_Load(&srci->blocks, memory_order_relaxed);
+  upb_Atomic_Init(&desi->blocks, blocks);
+}
+
+void UPB_PRIVATE(_upb_Arena_SwapOut)(upb_Arena* des, const upb_Arena* src) {
+  upb_ArenaInternal* desi = upb_Arena_Internal(des);
+  upb_ArenaInternal* srci = upb_Arena_Internal(src);
+
+  *des = *src;
+  upb_MemBlock* blocks = upb_Atomic_Load(&srci->blocks, memory_order_relaxed);
+  upb_Atomic_Store(&desi->blocks, blocks, memory_order_relaxed);
 }
diff --git a/upb/mem/arena.h b/upb/mem/arena.h
index 1e1f87f..385b14e 100644
--- a/upb/mem/arena.h
+++ b/upb/mem/arena.h
@@ -22,22 +22,15 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <string.h>
 
 #include "upb/mem/alloc.h"
+#include "upb/mem/internal/arena.h"
 
 // Must be last.
 #include "upb/port/def.inc"
 
 typedef struct upb_Arena upb_Arena;
 
-// LINT.IfChange(struct_definition)
-typedef struct {
-  char* UPB_ONLYBITS(ptr);
-  char* UPB_ONLYBITS(end);
-} _upb_ArenaHead;
-// LINT.ThenChange(//depot/google3/third_party/upb/bits/typescript/arena.ts)
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,33 +46,20 @@
 bool upb_Arena_IncRefFor(upb_Arena* a, const void* owner);
 void upb_Arena_DecRefFor(upb_Arena* a, const void* owner);
 
-void* UPB_PRIVATE(_upb_Arena_SlowMalloc)(upb_Arena* a, size_t size);
-
 size_t upb_Arena_SpaceAllocated(upb_Arena* a);
 uint32_t upb_Arena_DebugRefCount(upb_Arena* a);
 
-UPB_INLINE size_t UPB_PRIVATE(_upb_ArenaHas)(upb_Arena* a) {
-  const _upb_ArenaHead* h = (_upb_ArenaHead*)a;
-  return (size_t)(h->UPB_ONLYBITS(end) - h->UPB_ONLYBITS(ptr));
+UPB_API_INLINE upb_Arena* upb_Arena_New(void) {
+  return upb_Arena_Init(NULL, 0, &upb_alloc_global);
 }
 
-UPB_API_INLINE void* upb_Arena_Malloc(upb_Arena* a, size_t size) {
-  size = UPB_ALIGN_MALLOC(size);
-  const size_t span = size + UPB_ASAN_GUARD_SIZE;
-  if (UPB_UNLIKELY(UPB_PRIVATE(_upb_ArenaHas)(a) < span)) {
-    return UPB_PRIVATE(_upb_Arena_SlowMalloc)(a, span);
-  }
+UPB_API_INLINE void* upb_Arena_Malloc(struct upb_Arena* a, size_t size) {
+  return UPB_PRIVATE(_upb_Arena_Malloc)(a, size);
+}
 
-  // We have enough space to do a fast malloc.
-  _upb_ArenaHead* h = (_upb_ArenaHead*)a;
-  void* ret = h->UPB_ONLYBITS(ptr);
-  UPB_ASSERT(UPB_ALIGN_MALLOC((uintptr_t)ret) == (uintptr_t)ret);
-  UPB_ASSERT(UPB_ALIGN_MALLOC(size) == size);
-  UPB_UNPOISON_MEMORY_REGION(ret, size);
-
-  h->UPB_ONLYBITS(ptr) += span;
-
-  return ret;
+UPB_API_INLINE void* upb_Arena_Realloc(upb_Arena* a, void* ptr, size_t oldsize,
+                                       size_t size) {
+  return UPB_PRIVATE(_upb_Arena_Realloc)(a, ptr, oldsize, size);
 }
 
 // Shrinks the last alloc from arena.
@@ -88,45 +68,7 @@
 // this was not the last alloc.
 UPB_API_INLINE void upb_Arena_ShrinkLast(upb_Arena* a, void* ptr,
                                          size_t oldsize, size_t size) {
-  _upb_ArenaHead* h = (_upb_ArenaHead*)a;
-  oldsize = UPB_ALIGN_MALLOC(oldsize);
-  size = UPB_ALIGN_MALLOC(size);
-  // Must be the last alloc.
-  UPB_ASSERT((char*)ptr + oldsize ==
-             h->UPB_ONLYBITS(ptr) - UPB_ASAN_GUARD_SIZE);
-  UPB_ASSERT(size <= oldsize);
-  h->UPB_ONLYBITS(ptr) = (char*)ptr + size;
-}
-
-UPB_API_INLINE void* upb_Arena_Realloc(upb_Arena* a, void* ptr, size_t oldsize,
-                                       size_t size) {
-  _upb_ArenaHead* h = (_upb_ArenaHead*)a;
-  oldsize = UPB_ALIGN_MALLOC(oldsize);
-  size = UPB_ALIGN_MALLOC(size);
-  bool is_most_recent_alloc =
-      (uintptr_t)ptr + oldsize == (uintptr_t)h->UPB_ONLYBITS(ptr);
-
-  if (is_most_recent_alloc) {
-    ptrdiff_t diff = size - oldsize;
-    if ((ptrdiff_t)UPB_PRIVATE(_upb_ArenaHas)(a) >= diff) {
-      h->UPB_ONLYBITS(ptr) += diff;
-      return ptr;
-    }
-  } else if (size <= oldsize) {
-    return ptr;
-  }
-
-  void* ret = upb_Arena_Malloc(a, size);
-
-  if (ret && oldsize > 0) {
-    memcpy(ret, ptr, UPB_MIN(oldsize, size));
-  }
-
-  return ret;
-}
-
-UPB_API_INLINE upb_Arena* upb_Arena_New(void) {
-  return upb_Arena_Init(NULL, 0, &upb_alloc_global);
+  return UPB_PRIVATE(_upb_Arena_ShrinkLast)(a, ptr, oldsize, size);
 }
 
 #ifdef __cplusplus
diff --git a/upb/mem/internal/arena.h b/upb/mem/internal/arena.h
index 1689573..421988e 100644
--- a/upb/mem/internal/arena.h
+++ b/upb/mem/internal/arena.h
@@ -8,43 +8,104 @@
 #ifndef UPB_MEM_INTERNAL_ARENA_H_
 #define UPB_MEM_INTERNAL_ARENA_H_
 
-#include "upb/mem/arena.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
 
 // Must be last.
 #include "upb/port/def.inc"
 
-typedef struct _upb_MemBlock _upb_MemBlock;
+// This is QUITE an ugly hack, which specifies the number of pointers needed
+// to equal (or exceed) the storage required for one upb_Arena.
+//
+// We need this because the decoder inlines a upb_Arena for performance but
+// the full struct is not visible outside of arena.c. Yes, I know, it's awful.
+#define UPB_ARENA_SIZE_HACK 7
 
-// LINT.IfChange(struct_definition)
+// LINT.IfChange(upb_Arena)
+
 struct upb_Arena {
-  _upb_ArenaHead head;
-
-  // upb_alloc* together with a low bit which signals if there is an initial
-  // block.
-  uintptr_t block_alloc;
-
-  // When multiple arenas are fused together, each arena points to a parent
-  // arena (root points to itself). The root tracks how many live arenas
-  // reference it.
-
-  // The low bit is tagged:
-  //   0: pointer to parent
-  //   1: count, left shifted by one
-  UPB_ATOMIC(uintptr_t) parent_or_count;
-
-  // All nodes that are fused together are in a singly-linked list.
-  UPB_ATOMIC(upb_Arena*) next;  // NULL at end of list.
-
-  // The last element of the linked list.  This is present only as an
-  // optimization, so that we do not have to iterate over all members for every
-  // fuse.  Only significant for an arena root.  In other cases it is ignored.
-  UPB_ATOMIC(upb_Arena*) tail;  // == self when no other list members.
-
-  // Linked list of blocks to free/cleanup.  Atomic only for the benefit of
-  // upb_Arena_SpaceAllocated().
-  UPB_ATOMIC(_upb_MemBlock*) blocks;
+  char* UPB_ONLYBITS(ptr);
+  char* UPB_ONLYBITS(end);
 };
-// LINT.ThenChange(//depot/google3/third_party/upb/bits/typescript/arena.ts)
+
+// LINT.ThenChange(//depot/google3/third_party/upb/bits/typescript/arena.ts:upb_Arena)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void UPB_PRIVATE(_upb_Arena_SwapIn)(struct upb_Arena* des,
+                                    const struct upb_Arena* src);
+void UPB_PRIVATE(_upb_Arena_SwapOut)(struct upb_Arena* des,
+                                     const struct upb_Arena* src);
+
+UPB_INLINE size_t UPB_PRIVATE(_upb_ArenaHas)(const struct upb_Arena* a) {
+  return (size_t)(a->UPB_ONLYBITS(end) - a->UPB_ONLYBITS(ptr));
+}
+
+UPB_INLINE void* UPB_PRIVATE(_upb_Arena_Malloc)(struct upb_Arena* a,
+                                                size_t size) {
+  void* UPB_PRIVATE(_upb_Arena_SlowMalloc)(struct upb_Arena * a, size_t size);
+
+  size = UPB_ALIGN_MALLOC(size);
+  const size_t span = size + UPB_ASAN_GUARD_SIZE;
+  if (UPB_UNLIKELY(UPB_PRIVATE(_upb_ArenaHas)(a) < span)) {
+    return UPB_PRIVATE(_upb_Arena_SlowMalloc)(a, span);
+  }
+
+  // We have enough space to do a fast malloc.
+  void* ret = a->UPB_ONLYBITS(ptr);
+  UPB_ASSERT(UPB_ALIGN_MALLOC((uintptr_t)ret) == (uintptr_t)ret);
+  UPB_ASSERT(UPB_ALIGN_MALLOC(size) == size);
+  UPB_UNPOISON_MEMORY_REGION(ret, size);
+
+  a->UPB_ONLYBITS(ptr) += span;
+
+  return ret;
+}
+
+UPB_INLINE void* UPB_PRIVATE(_upb_Arena_Realloc)(struct upb_Arena* a, void* ptr,
+                                                 size_t oldsize, size_t size) {
+  oldsize = UPB_ALIGN_MALLOC(oldsize);
+  size = UPB_ALIGN_MALLOC(size);
+  bool is_most_recent_alloc =
+      (uintptr_t)ptr + oldsize == (uintptr_t)a->UPB_ONLYBITS(ptr);
+
+  if (is_most_recent_alloc) {
+    ptrdiff_t diff = size - oldsize;
+    if ((ptrdiff_t)UPB_PRIVATE(_upb_ArenaHas)(a) >= diff) {
+      a->UPB_ONLYBITS(ptr) += diff;
+      return ptr;
+    }
+  } else if (size <= oldsize) {
+    return ptr;
+  }
+
+  void* ret = UPB_PRIVATE(_upb_Arena_Malloc)(a, size);
+
+  if (ret && oldsize > 0) {
+    memcpy(ret, ptr, UPB_MIN(oldsize, size));
+  }
+
+  return ret;
+}
+
+UPB_INLINE void UPB_PRIVATE(_upb_Arena_ShrinkLast)(struct upb_Arena* a,
+                                                   void* ptr, size_t oldsize,
+                                                   size_t size) {
+  oldsize = UPB_ALIGN_MALLOC(oldsize);
+  size = UPB_ALIGN_MALLOC(size);
+  // Must be the last alloc.
+  UPB_ASSERT((char*)ptr + oldsize ==
+             a->UPB_ONLYBITS(ptr) - UPB_ASAN_GUARD_SIZE);
+  UPB_ASSERT(size <= oldsize);
+  a->UPB_ONLYBITS(ptr) = (char*)ptr + size;
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
 
 #include "upb/port/undef.inc"
 
diff --git a/upb/wire/BUILD b/upb/wire/BUILD
index 6115b86..fab4e54 100644
--- a/upb/wire/BUILD
+++ b/upb/wire/BUILD
@@ -51,7 +51,6 @@
         "//upb:base",
         "//upb:hash",
         "//upb:mem",
-        "//upb:mem_internal",
         "//upb:message",
         "//upb:message_accessors_internal",
         "//upb:message_internal",
diff --git a/upb/wire/decode.c b/upb/wire/decode.c
index f5ad1cf..7fd5e08 100644
--- a/upb/wire/decode.c
+++ b/upb/wire/decode.c
@@ -17,7 +17,6 @@
 #include "upb/base/string_view.h"
 #include "upb/hash/common.h"
 #include "upb/mem/arena.h"
-#include "upb/mem/internal/arena.h"
 #include "upb/message/array.h"
 #include "upb/message/internal/accessors.h"
 #include "upb/message/internal/array.h"
@@ -1349,10 +1348,8 @@
     UPB_ASSERT(decoder->status != kUpb_DecodeStatus_Ok);
   }
 
-  _upb_MemBlock* blocks =
-      upb_Atomic_Load(&decoder->arena.blocks, memory_order_relaxed);
-  arena->head = decoder->arena.head;
-  upb_Atomic_Store(&arena->blocks, blocks, memory_order_relaxed);
+  UPB_PRIVATE(_upb_Arena_SwapOut)(arena, &decoder->arena);
+
   return decoder->status;
 }
 
@@ -1379,10 +1376,7 @@
   // done.  The temporary arena only needs to be able to handle allocation,
   // not fuse or free, so it does not need many of the members to be initialized
   // (particularly parent_or_count).
-  _upb_MemBlock* blocks = upb_Atomic_Load(&arena->blocks, memory_order_relaxed);
-  decoder.arena.head = arena->head;
-  decoder.arena.block_alloc = arena->block_alloc;
-  upb_Atomic_Init(&decoder.arena.blocks, blocks);
+  UPB_PRIVATE(_upb_Arena_SwapIn)(&decoder.arena, arena);
 
   return upb_Decoder_Decode(&decoder, buf, msg, l, arena);
 }
diff --git a/upb/wire/decode_fast.c b/upb/wire/decode_fast.c
index 889bb64..4062e2b 100644
--- a/upb/wire/decode_fast.c
+++ b/upb/wire/decode_fast.c
@@ -662,7 +662,7 @@
 static void fastdecode_docopy(upb_Decoder* d, const char* ptr, uint32_t size,
                               int copy, char* data, size_t data_offset,
                               upb_StringView* dst) {
-  d->arena.head.UPB_PRIVATE(ptr) += copy;
+  d->arena.UPB_PRIVATE(ptr) += copy;
   dst->data = data + data_offset;
   UPB_UNPOISON_MEMORY_REGION(data, copy);
   memcpy(data, ptr, copy);
@@ -694,7 +694,7 @@
   ptr += tagbytes + 1;                                                         \
   dst->size = size;                                                            \
                                                                                \
-  buf = d->arena.head.UPB_PRIVATE(ptr);                                        \
+  buf = d->arena.UPB_PRIVATE(ptr);                                             \
   arena_has = UPB_PRIVATE(_upb_ArenaHas)(&d->arena);                           \
   common_has = UPB_MIN(arena_has,                                              \
                        upb_EpsCopyInputStream_BytesAvailable(&d->input, ptr)); \
@@ -874,8 +874,8 @@
   if (UPB_LIKELY(msg_ceil_bytes > 0 &&
                  UPB_PRIVATE(_upb_ArenaHas)(&d->arena) >= msg_ceil_bytes)) {
     UPB_ASSERT(size <= (size_t)msg_ceil_bytes);
-    msg_data = d->arena.head.UPB_PRIVATE(ptr);
-    d->arena.head.UPB_PRIVATE(ptr) += size;
+    msg_data = d->arena.UPB_PRIVATE(ptr);
+    d->arena.UPB_PRIVATE(ptr) += size;
     UPB_UNPOISON_MEMORY_REGION(msg_data, msg_ceil_bytes);
     memset(msg_data, 0, msg_ceil_bytes);
     UPB_POISON_MEMORY_REGION(msg_data + size, msg_ceil_bytes - size);
diff --git a/upb/wire/internal/decode.h b/upb/wire/internal/decode.h
index f36b2b7..fcc04cf 100644
--- a/upb/wire/internal/decode.h
+++ b/upb/wire/internal/decode.h
@@ -33,7 +33,10 @@
   uint32_t end_group;  // field number of END_GROUP tag, else DECODE_NOGROUP.
   uint16_t options;
   bool missing_required;
-  upb_Arena arena;
+  union {
+    upb_Arena arena;
+    void* foo[UPB_ARENA_SIZE_HACK];
+  };
   upb_DecodeStatus status;
   jmp_buf err;