Add UPB_FORCEINLINE for varint32 decoding.

This speeds up the decoder by more than 20% on the ParseDescriptor benchmarks and also slightly reduces code size (see the tables below).

name                       old time/op  new time/op  delta
ArenaOneAlloc              20.4ns ± 0%  20.2ns ± 0%   -1.10%  (p=0.000 n=12+11)
ArenaInitialBlockOneAlloc  5.25ns ± 0%  5.25ns ± 0%     ~     (p=0.786 n=11+12)
ParseDescriptorNoHeap      17.1µs ± 0%  13.1µs ± 0%  -23.29%  (p=0.000 n=11+12)
ParseDescriptor            17.4µs ± 1%  13.5µs ± 1%  -22.51%  (p=0.000 n=12+12)
SerializeDescriptor        10.7µs ± 0%  10.9µs ± 0%   +1.95%  (p=0.000 n=12+12)

    FILE SIZE        VM SIZE
 --------------  --------------
  +2.7%     +16   +2.7%     +16    [LOAD #2 [RX]]
  +0.5%     +16   [ = ]       0    [Unmapped]
  -1.4%     -72   -0.7%     -32    upb/decode.c
      +3.1%     +98   +3.1%     +98    decode_msg
      [DEL]    -170   [DEL]    -130    decode_varint32
  -0.0%     -40   -0.0%     -16    TOTAL
diff --git a/benchmark.py b/benchmark.py
index 4a6e9f8..b75dd4d 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -15,7 +15,7 @@
 @contextlib.contextmanager
 def GitWorktree(commit):
   tmpdir = tempfile.mkdtemp()
-  subprocess.run(['git', 'worktree', 'add', '-q', tmpdir, commit], check=True)
+  subprocess.run(['git', 'worktree', 'add', '-q', '-d', tmpdir, commit], check=True)
   cwd = os.getcwd()
   os.chdir(tmpdir)
   try: