kernel: add page frame management
Initialize the page frame ontology at boot and update it
when we do memory mappings.
Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
diff --git a/kernel/mmu.c b/kernel/mmu.c
index b618061..200ae42 100644
--- a/kernel/mmu.c
+++ b/kernel/mmu.c
@@ -9,28 +9,144 @@
#include <stdint.h>
#include <kernel_arch_interface.h>
#include <spinlock.h>
+#include <mmu.h>
+#include <init.h>
+#include <kernel_internal.h>
+#include <linker/linker-defs.h>
#include <logging/log.h>
LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
+/*
+ * General terminology:
+ * - A page frame is a page-sized physical memory region in RAM. It is a
+ * container where a data page may be placed. It is always referred to by
+ * physical address. We have a convention of using uintptr_t for physical
+ * addresses. We instantiate a struct z_page_frame to store metadata for
+ * every page frame.
+ *
+ * - A data page is a page-sized region of data. It may exist in a page frame,
+ * or be paged out to some backing store. Its location can always be looked
+ * up in the CPU's page tables (or equivalent) by virtual address.
+ * The data type will always be void * or in some cases uint8_t * when we
+ * want to do pointer arithmetic.
+ */
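+
+/* Every page frame in RAM has a corresponding entry in the z_page_frames[]
+ * array declared below. The helpers z_phys_to_page_frame() and
+ * z_page_frame_to_phys() used throughout this file live in the MMU headers;
+ * a rough sketch of the conversion they perform:
+ *
+ *   pf   = &z_page_frames[(phys - Z_PHYS_RAM_START) / CONFIG_MMU_PAGE_SIZE];
+ *   phys = Z_PHYS_RAM_START + (pf - z_page_frames) * CONFIG_MMU_PAGE_SIZE;
+ */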
+
/* Spinlock to protect any globals in this file and serialize page table
* updates in arch code
*/
-static struct k_spinlock mm_lock;
+struct k_spinlock z_mm_lock;
/*
- * Overall virtual memory map. When the kernel starts, it is expected that all
- * memory regions are mapped into one large virtual region at the beginning of
- * CONFIG_KERNEL_VM_BASE. Unused virtual memory up to the limit noted by
- * CONFIG_KERNEL_VM_SIZE may be used for runtime memory mappings.
+ * General page frame management
+ */
+
+/* Database of all RAM page frames */
+struct z_page_frame z_page_frames[Z_NUM_PAGE_FRAMES];
+
+#if __ASSERT_ON
+/* Indicator that z_page_frames has been initialized; many of these APIs do
+ * not work before POST_KERNEL
+ */
+static bool page_frames_initialized;
+#endif
+
+/* Add colors to page frame dumps to indicate mapping type */
+#define COLOR_PAGE_FRAMES 1
+
+#if COLOR_PAGE_FRAMES
+#define ANSI_DEFAULT "\x1B[0m"
+#define ANSI_RED "\x1B[1;31m"
+#define ANSI_GREEN "\x1B[1;32m"
+#define ANSI_YELLOW "\x1B[1;33m"
+#define ANSI_BLUE "\x1B[1;34m"
+#define ANSI_MAGENTA "\x1B[1;35m"
+#define ANSI_CYAN "\x1B[1;36m"
+#define ANSI_GREY "\x1B[1;90m"
+
+#define COLOR(x) printk(_CONCAT(ANSI_, x))
+#else
+#define COLOR(x) do { } while (0)
+#endif
+
+static void page_frame_dump(struct z_page_frame *pf)
+{
+ if (z_page_frame_is_reserved(pf)) {
+ COLOR(CYAN);
+ printk("R");
+ } else if (z_page_frame_is_busy(pf)) {
+ COLOR(MAGENTA);
+ printk("B");
+ } else if (z_page_frame_is_pinned(pf)) {
+ COLOR(YELLOW);
+ printk("P");
+ } else if (z_page_frame_is_available(pf)) {
+ COLOR(GREY);
+ printk(".");
+ } else if (z_page_frame_is_mapped(pf)) {
+ COLOR(DEFAULT);
+ printk("M");
+ } else {
+ COLOR(RED);
+ printk("?");
+ }
+}
+
+void z_page_frames_dump(void)
+{
+ int column = 0;
+
+ __ASSERT(page_frames_initialized, "%s called too early", __func__);
+ printk("Physical memory from 0x%lx to 0x%lx\n",
+ Z_PHYS_RAM_START, Z_PHYS_RAM_END);
+
+ for (int i = 0; i < Z_NUM_PAGE_FRAMES; i++) {
+ struct z_page_frame *pf = &z_page_frames[i];
+
+ page_frame_dump(pf);
+
+ column++;
+ if (column == 64) {
+ column = 0;
+ printk("\n");
+ }
+ }
+
+ COLOR(DEFAULT);
+ if (column != 0) {
+ printk("\n");
+ }
+}
+
+#define VIRT_FOREACH(_base, _size, _pos) \
+ for (_pos = _base; \
+ _pos < ((uint8_t *)_base + _size); _pos += CONFIG_MMU_PAGE_SIZE)
+
+#define PHYS_FOREACH(_base, _size, _pos) \
+ for (_pos = _base; \
+ _pos < ((uintptr_t)_base + _size); _pos += CONFIG_MMU_PAGE_SIZE)
+
+/*
+ * Virtual address space management
*
- * +--------------+ <- CONFIG_KERNEL_VM_BASE
+ * Call all of these functions with z_mm_lock held.
+ *
+ * Overall virtual memory map: When the kernel starts, it resides in
+ * virtual memory in the region Z_KERNEL_VIRT_START to
+ * Z_KERNEL_VIRT_END. Unused virtual memory past this, up to the limit
+ * noted by CONFIG_KERNEL_VM_SIZE, may be used for runtime memory mappings.
+ *
+ * +--------------+ <- Z_VIRT_RAM_START
+ * | Undefined VM | <- May contain ancillary regions like x86_64's locore
+ * +--------------+ <- Z_KERNEL_VIRT_START (often == Z_VIRT_RAM_START)
* | Mapping for |
- * | all RAM |
+ * | main kernel |
+ * | image |
+ * | |
+ * | |
+ * +--------------+ <- Z_KERNEL_VIRT_END
* | |
- * | |
- * +--------------+ <- CONFIG_KERNEL_VM_BASE + CONFIG_KERNEL_RAM_SIZE
- * | Available | also the mapping limit as mappings grown downward
- * | virtual mem |
+ * | Unused, |
+ * | Available VM |
* | |
* |..............| <- mapping_pos (grows downward as more mappings are made)
* | Mapping |
@@ -40,31 +156,183 @@
* | ... |
* +--------------+
* | Mapping |
- * +--------------+ <- CONFIG_KERNEL_VM_BASE + CONFIG_KERNEL_VM_SIZE
+ * +--------------+ <- mappings start here
+ * | Reserved | <- special purpose virtual page(s) of size Z_VM_RESERVED
+ * +--------------+ <- Z_VIRT_RAM_END
*
- * At the moment we just have one area for mappings and they are permanent.
- * This is under heavy development and may change.
+ * At the moment we just have one downward-growing area for mappings.
+ * There is currently no support for un-mapping memory; see #28900.
+ */
+static uint8_t *mapping_pos = Z_VIRT_RAM_END - Z_VM_RESERVED;
+
+/* Get a chunk of virtual memory and mark it as being in-use.
+ *
+ * This may be called from arch early boot code before z_cstart() is invoked.
+ * Data will be copied and BSS zeroed, but this must not rely on any
+ * initialization functions having been called beforehand in order to work
+ * correctly.
+ */
+static void *virt_region_get(size_t size)
+{
+ uint8_t *dest_addr;
+
+ if ((mapping_pos - size) < Z_KERNEL_VIRT_END) {
+ LOG_ERR("insufficient virtual address space (requested %zu)",
+ size);
+ return NULL;
+ }
+
+ mapping_pos -= size;
+ dest_addr = mapping_pos;
+
+ return dest_addr;
+}
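+
+/* Illustrative sketch of the downward-growing allocation above (addresses
+ * are hypothetical, not from any real board configuration): suppose
+ * Z_VIRT_RAM_END is 0xd0000000 and Z_VM_RESERVED is 0. Then:
+ *
+ *   virt_region_get(0x2000);   returns 0xcfffe000, mapping_pos = 0xcfffe000
+ *   virt_region_get(0x1000);   returns 0xcfffd000, mapping_pos = 0xcfffd000
+ *
+ * and so on, until a request would cross Z_KERNEL_VIRT_END, at which point
+ * NULL is returned.
+ */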
+
+/*
+ * Free page frames management
+ *
+ * Call all of these functions with z_mm_lock held.
*/
- /* Current position for memory mappings in kernel memory.
- * At the moment, all kernel memory mappings are permanent.
- * Memory mappings start at the end of the address space, and grow
- * downward.
- *
- * All of this is under heavy development and is subject to change.
- */
-static uint8_t *mapping_pos =
- (uint8_t *)((uintptr_t)CONFIG_KERNEL_VM_BASE +
- (uintptr_t)CONFIG_KERNEL_VM_SIZE);
-
-/* Lower-limit of virtual address mapping. Immediately below this is the
- * permanent identity mapping for all SRAM.
+/* Linked list of unused and available page frames.
+ *
+ * TODO: This is very simple and treats all free page frames as being equal.
+ * However, there are use-cases to consolidate free pages such that entire
+ * SRAM banks can be switched off to save power, and so obtaining free pages
+ * may require a more complex ontology which prefers page frames in RAM banks
+ * which are still active.
+ *
+ * This implies in the future there may be multiple slists managing physical
+ * pages. Each page frame will still just have one snode link.
*/
-static uint8_t *mapping_limit =
- (uint8_t *)((uintptr_t)CONFIG_KERNEL_VM_BASE +
- (size_t)CONFIG_KERNEL_RAM_SIZE);
+static sys_slist_t free_page_frame_list;
-size_t k_mem_region_align(uintptr_t *aligned_addr, size_t *aligned_size,
+/* Number of unused and available free page frames */
+size_t z_free_page_count;
+
+#define PF_ASSERT(pf, expr, fmt, ...) \
+ __ASSERT(expr, "page frame 0x%lx: " fmt, z_page_frame_to_phys(pf), \
+ ##__VA_ARGS__)
+
+/* Get an unused page frame; we don't care which one. Returns NULL if there
+ * are none.
+ */
+static struct z_page_frame *free_page_frame_list_get(void)
+{
+ sys_snode_t *node;
+ struct z_page_frame *pf = NULL;
+
+ node = sys_slist_get(&free_page_frame_list);
+ if (node != NULL) {
+ z_free_page_count--;
+ pf = CONTAINER_OF(node, struct z_page_frame, node);
+ PF_ASSERT(pf, z_page_frame_is_available(pf),
+ "unavailable but somehow on free list");
+ }
+
+ return pf;
+}
+
+/* Release a page frame back into the list of free pages */
+static void free_page_frame_list_put(struct z_page_frame *pf)
+{
+ PF_ASSERT(pf, z_page_frame_is_available(pf),
+ "unavailable page put on free list");
+ sys_slist_append(&free_page_frame_list, &pf->node);
+ z_free_page_count++;
+}
+
+static void free_page_frame_list_init(void)
+{
+ sys_slist_init(&free_page_frame_list);
+}
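+
+/* The helpers above perform no locking of their own; per the section comment,
+ * callers must hold z_mm_lock. A minimal calling sketch (illustrative only):
+ *
+ *   k_spinlock_key_t key = k_spin_lock(&z_mm_lock);
+ *   struct z_page_frame *pf = free_page_frame_list_get();
+ *
+ *   if (pf == NULL) {
+ *           (handle the out-of-frames case)
+ *   }
+ *   k_spin_unlock(&z_mm_lock, key);
+ */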
+
+/*
+ * Memory Mapping
+ */
+
+/* Called after the frame is mapped in the arch layer, to update our
+ * local ontology (and do some assertions while we're at it)
+ */
+static void frame_mapped_set(struct z_page_frame *pf, void *addr)
+{
+ PF_ASSERT(pf, !z_page_frame_is_reserved(pf),
+ "attempted to map a reserved page frame");
+
+ /* We do allow multiple mappings for pinned page frames
+ * since we will never need to reverse map them.
+ * This is uncommon; use-cases are for things like the
+ * Zephyr equivalent of VDSOs.
+ */
+ PF_ASSERT(pf, !z_page_frame_is_mapped(pf) || z_page_frame_is_pinned(pf),
+ "non-pinned and already mapped to %p", pf->addr);
+
+ pf->flags |= Z_PAGE_FRAME_MAPPED;
+ pf->addr = addr;
+ pf->refcount++;
+}
+
+
+/* This may be called from arch early boot code before z_cstart() is invoked.
+ * Data will be copied and BSS zeroed, but this must not rely on any
+ * initialization functions having been called beforehand in order to work
+ * correctly.
+ */
+void z_phys_map(uint8_t **virt_ptr, uintptr_t phys, size_t size, uint32_t flags)
+{
+ uintptr_t aligned_phys, addr_offset;
+ size_t aligned_size;
+ int ret;
+ k_spinlock_key_t key;
+ uint8_t *dest_addr;
+
+ addr_offset = k_mem_region_align(&aligned_phys, &aligned_size,
+ phys, size,
+ CONFIG_MMU_PAGE_SIZE);
+ __ASSERT(aligned_size != 0, "0-length mapping at 0x%lx", aligned_phys);
+ __ASSERT(aligned_phys < (aligned_phys + (aligned_size - 1)),
+ "wraparound for physical address 0x%lx (size %zu)",
+ aligned_phys, aligned_size);
+
+ key = k_spin_lock(&z_mm_lock);
+ /* Obtain an appropriately sized chunk of virtual memory */
+ dest_addr = virt_region_get(aligned_size);
+ if (dest_addr == NULL) {
+ goto fail;
+ }
+
+ /* If this fails there's something amiss with virt_region_get */
+ __ASSERT((uintptr_t)dest_addr <
+ ((uintptr_t)dest_addr + (size - 1)),
+ "wraparound for virtual address %p (size %zu)",
+ dest_addr, size);
+
+ LOG_DBG("arch_mem_map(%p, 0x%lx, %zu, %x) offset %lu", dest_addr,
+ aligned_phys, aligned_size, flags, addr_offset);
+
+ ret = arch_mem_map(dest_addr, aligned_phys, aligned_size, flags);
+ if (ret != 0) {
+ LOG_ERR("arch_mem_map() failed with %d", ret);
+ goto fail;
+ }
+ k_spin_unlock(&z_mm_lock, key);
+
+ *virt_ptr = dest_addr + addr_offset;
+ return;
+fail:
+ /* May re-visit this in the future, but for now running out of
+ * virtual address space or failing the arch_mem_map() call is
+ * an unrecoverable situation.
+ *
+ * Other problems not related to resource exhaustion we leave as
+ * assertions since they are clearly programming mistakes.
+ */
+ LOG_ERR("memory mapping 0x%lx (size %zu, flags 0x%x) failed",
+ phys, size, flags);
+ k_panic();
+}
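+
+/* Usage sketch (illustrative only; UART0_MMIO_PHYS is a hypothetical physical
+ * address, and the flags shown are the generic ones from <sys/mem_manage.h>):
+ * map a device MMIO region and access it through the returned virtual
+ * address.
+ *
+ *   uint8_t *regs;
+ *
+ *   z_phys_map(&regs, UART0_MMIO_PHYS, 0x1000,
+ *              K_MEM_PERM_RW | K_MEM_CACHE_NONE);
+ *   *(volatile uint32_t *)(regs + 0x4) = 0x1;
+ */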
+
+/*
+ * Miscellaneous
+ */
+
+size_t k_mem_region_align(uintptr_t *aligned_phys, size_t *aligned_size,
uintptr_t phys_addr, size_t size, size_t align)
{
size_t addr_offset;
@@ -72,66 +340,58 @@
/* The actual mapped region must be page-aligned. Round down the
* physical address and pad the region size appropriately
*/
- *aligned_addr = ROUND_DOWN(phys_addr, align);
- addr_offset = phys_addr - *aligned_addr;
+ *aligned_phys = ROUND_DOWN(phys_addr, align);
+ addr_offset = phys_addr - *aligned_phys;
*aligned_size = ROUND_UP(size + addr_offset, align);
return addr_offset;
}
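+
+/* Worked example (hypothetical values, 4K pages): for phys_addr = 0x3a215,
+ * size = 0x2100 and align = 0x1000, the results are
+ *
+ *   *aligned_phys = 0x3a000   (phys_addr rounded down to a page boundary)
+ *   addr_offset   = 0x215     (returned to the caller)
+ *   *aligned_size = 0x3000    (0x2100 + 0x215 rounded up to whole pages)
+ */
+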
-void z_phys_map(uint8_t **virt_ptr, uintptr_t phys, size_t size, uint32_t flags)
+#define VM_OFFSET ((CONFIG_KERNEL_VM_BASE + CONFIG_KERNEL_VM_OFFSET) - \
+ CONFIG_SRAM_BASE_ADDRESS)
+
+/* Only applies to boot RAM mappings within the Zephyr image that have never
+ * been remapped or paged out. Never use this unless you know exactly what you
+ * are doing.
+ */
+#define BOOT_VIRT_TO_PHYS(virt) ((uintptr_t)(((uint8_t *)virt) - VM_OFFSET))
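+
+/* For example (hypothetical configuration values): with CONFIG_KERNEL_VM_BASE
+ * at 0xc0000000, CONFIG_KERNEL_VM_OFFSET of 0 and CONFIG_SRAM_BASE_ADDRESS at
+ * 0x10000000, VM_OFFSET is 0xb0000000 and boot virtual address 0xc0001000
+ * corresponds to physical address 0x10001000.
+ */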
+
+void z_mem_manage_init(void)
{
- uintptr_t aligned_addr, addr_offset;
- size_t aligned_size;
- int ret;
- k_spinlock_key_t key;
- uint8_t *dest_virt;
+ uintptr_t phys;
+ uint8_t *addr;
+ struct z_page_frame *pf;
+ k_spinlock_key_t key = k_spin_lock(&z_mm_lock);
- addr_offset = k_mem_region_align(&aligned_addr, &aligned_size,
- phys, size,
- CONFIG_MMU_PAGE_SIZE);
+ free_page_frame_list_init();
- key = k_spin_lock(&mm_lock);
-
- /* Carve out some unused virtual memory from the top of the
- * address space
+#ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
+ /* If some page frames are unavailable for use as memory, arch
+ * code will mark Z_PAGE_FRAME_RESERVED in their flags
*/
- if ((mapping_pos - aligned_size) < mapping_limit) {
- LOG_ERR("insufficient kernel virtual address space");
- goto fail;
+ arch_reserved_pages_update();
+#endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */
+
+ /* All pages composing the Zephyr image are mapped at boot in a
+ * predictable way. This can change at runtime.
+ */
+ VIRT_FOREACH(Z_KERNEL_VIRT_START, Z_KERNEL_VIRT_SIZE, addr)
+ {
+ frame_mapped_set(z_phys_to_page_frame(BOOT_VIRT_TO_PHYS(addr)),
+ addr);
}
- mapping_pos -= aligned_size;
- dest_virt = mapping_pos;
- LOG_DBG("arch_mem_map(%p, 0x%lx, %zu, %x) offset %lu\n", dest_virt,
- aligned_addr, aligned_size, flags, addr_offset);
- __ASSERT(dest_virt != NULL, "NULL page memory mapping");
- __ASSERT(aligned_size != 0, "0-length mapping at 0x%lx", aligned_addr);
- __ASSERT((uintptr_t)dest_virt <
- ((uintptr_t)dest_virt + (aligned_size - 1)),
- "wraparound for virtual address %p (size %zu)",
- dest_virt, size);
- __ASSERT(aligned_addr < (aligned_addr + (size - 1)),
- "wraparound for physical address 0x%lx (size %zu)",
- aligned_addr, size);
-
- ret = arch_mem_map(dest_virt, aligned_addr, aligned_size, flags);
- k_spin_unlock(&mm_lock, key);
-
- if (ret == 0) {
- *virt_ptr = dest_virt + addr_offset;
- } else {
- /* This happens if there is an insurmountable problem
- * with the selected cache modes or access flags
- * with no safe fallback
- */
-
- LOG_ERR("arch_mem_map() to %p returned %d", dest_virt, ret);
- goto fail;
+ /* Any remaining pages that aren't mapped, reserved, or pinned get
+ * added to the free pages list
+ */
+ Z_PAGE_FRAME_FOREACH(phys, pf) {
+ if (z_page_frame_is_available(pf)) {
+ free_page_frame_list_put(pf);
+ }
}
- return;
-fail:
- LOG_ERR("memory mapping 0x%lx (size %zu, flags 0x%x) failed",
- phys, size, flags);
- k_panic();
+ LOG_DBG("free page frames: %zu", z_free_page_count);
+#if __ASSERT_ON
+ page_frames_initialized = true;
+#endif
+ k_spin_unlock(&z_mm_lock, key);
}