cdc_uart: cater for Windows driver deficiencies

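It's possible for the Windows CDC-ACM driver to ignore the IN endpoint
for long periods of time - multiple USB frames - if the host application
doesn't consume UART RX data. Boost buffer sizes to compensate, and count
the occasions on which received UART data does not fit in the CDC TX
buffer (cdc_tx_oe, reset on disconnect).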

Also prevent usb_thread from potentially being idle for a tick when
there's work to do.
diff --git a/src/cdc_uart.c b/src/cdc_uart.c
index 525b012..b936496 100644
--- a/src/cdc_uart.c
+++ b/src/cdc_uart.c
@@ -34,8 +34,9 @@
 TaskHandle_t uart_taskhandle;
 TickType_t last_wake, interval = 100;
 
-static uint8_t tx_buf[CFG_TUD_CDC_TX_BUFSIZE];
-static uint8_t rx_buf[CFG_TUD_CDC_RX_BUFSIZE];
+/* Max 1 FIFO worth of data */
+static uint8_t tx_buf[32];
+static uint8_t rx_buf[32];
 // Actually s^-1 so 25ms
 #define DEBOUNCE_MS 40
 static uint debounce_ticks = 5;
@@ -59,6 +60,7 @@
 void cdc_task(void)
 {
     static int was_connected = 0;
+    static uint cdc_tx_oe = 0;
     uint rx_len = 0;
 
     // Consume uart fifo regardless even if not connected
@@ -77,6 +79,9 @@
           rx_led_debounce = debounce_ticks;
 #endif
           written = MIN(tud_cdc_write_available(), rx_len);
+          if (rx_len > written)
+              cdc_tx_oe++;
+
           if (written > 0) {
             tud_cdc_write(rx_buf, written);
             tud_cdc_write_flush();
@@ -113,6 +118,7 @@
     } else if (was_connected) {
       tud_cdc_write_clear();
       was_connected = 0;
+      cdc_tx_oe = 0;
     }
 }
 
diff --git a/src/main.c b/src/main.c
index 1b8838b..3616486 100644
--- a/src/main.c
+++ b/src/main.c
@@ -58,6 +58,8 @@
 
 void usb_thread(void *ptr)
 {
+    TickType_t wake;
+    wake = xTaskGetTickCount();
     do {
         tud_task();
 #ifdef PICOPROBE_USB_CONNECTED_LED
@@ -66,8 +68,9 @@
         else
             gpio_put(PICOPROBE_USB_CONNECTED_LED, 0);
 #endif
-        // Trivial delay to save power
-        vTaskDelay(1);
+        // Go to sleep for up to a tick if nothing to do
+        if (!tud_task_event_ready())
+            xTaskDelayUntil(&wake, 1);
     } while (1);
 }
 
diff --git a/src/tusb_config.h b/src/tusb_config.h
index cc9f80c..a1b92b0 100644
--- a/src/tusb_config.h
+++ b/src/tusb_config.h
@@ -68,8 +68,15 @@
 #define CFG_TUD_MIDI            0
 #define CFG_TUD_VENDOR          1
 
-#define CFG_TUD_CDC_RX_BUFSIZE 64
-#define CFG_TUD_CDC_TX_BUFSIZE 64
+/*
+ * TX bufsize (actually UART RX) is oversized because the Windows CDC-ACM
+ * driver submits a grand total of _one_ URB at any one time.
+ * This means the application must consume the data before the next IN token
+ * is issued. At high datarates this leads to huge variation in instantaneous
+ * throughput on USB, so a large runway is needed.
+ */
+#define CFG_TUD_CDC_RX_BUFSIZE 128
+#define CFG_TUD_CDC_TX_BUFSIZE 4096
 
 #define CFG_TUD_VENDOR_RX_BUFSIZE 8192
 #define CFG_TUD_VENDOR_TX_BUFSIZE 8192