| /* |
| * Copyright (c) 2011-2014 Wind River Systems, Inc. |
| * Copyright (c) 2017 Intel Corporation |
| * |
| * SPDX-License-Identifier: Apache-2.0 |
| */ |
| #include <kernel.h> |
| #include <ia32/mmustructs.h> |
| #include <linker/linker-defs.h> |
| #include <kernel_internal.h> |
| #include <kernel_structs.h> |
| #include <init.h> |
| #include <ctype.h> |
| #include <string.h> |
| |
| /* Despite our use of PAE page tables, we do not (and will never) actually |
| * support PAE. Use a 64-bit x86 target if you have that much RAM. |
| */ |
| BUILD_ASSERT(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024ULL) - 1ULL <= |
| (unsigned long long)UINTPTR_MAX); |
| |
| /* Common regions for all x86 processors. |
| * Peripheral I/O ranges are configured at the SoC level. |
| */ |
| |
| /* Mark text and rodata as read-only. |
| * Userspace may read all text and rodata. |
| */ |
| MMU_BOOT_REGION((u32_t)&_image_text_start, (u32_t)&_image_text_size, |
| MMU_ENTRY_READ | MMU_ENTRY_USER); |
| |
| MMU_BOOT_REGION((u32_t)&_image_rodata_start, (u32_t)&_image_rodata_size, |
| MMU_ENTRY_READ | MMU_ENTRY_USER | MMU_ENTRY_EXECUTE_DISABLE); |
| |
| #ifdef CONFIG_USERSPACE |
| MMU_BOOT_REGION((u32_t)&_app_smem_start, (u32_t)&_app_smem_size, |
| MMU_ENTRY_WRITE | MMU_ENTRY_EXECUTE_DISABLE); |
| #endif |
| |
| #ifdef CONFIG_COVERAGE_GCOV |
| MMU_BOOT_REGION((u32_t)&__gcov_bss_start, (u32_t)&__gcov_bss_size, |
| MMU_ENTRY_WRITE | MMU_ENTRY_USER | MMU_ENTRY_EXECUTE_DISABLE); |
| #endif |
| |
| /* __kernel_ram_size includes all unused memory, which is used for heaps. |
| * User threads cannot access this region unless access is granted at |
| * runtime; this is done automatically for thread stacks. |
| */ |
| MMU_BOOT_REGION((u32_t)&__kernel_ram_start, (u32_t)&__kernel_ram_size, |
| MMU_ENTRY_WRITE | MMU_ENTRY_EXECUTE_DISABLE); |
| |
| /* Return one character summarizing a paging structure entry's permissions. |
| * This works for PDPT, PD, and PT entries alike; the bits checked here are |
| * the same in all of them. |
| * |
| * Not trying to capture every flag, just the most interesting ones: |
| * present, write, XD, and user, in typically encountered combinations. |
| * Uppercase indicates the entry is accessible from user mode. |
| */ |
| static char get_entry_code(u64_t value) |
| { |
| char ret; |
| |
| if ((value & MMU_ENTRY_PRESENT) == 0) { |
| ret = '.'; |
| } else { |
| if ((value & MMU_ENTRY_WRITE) != 0) { |
| /* Writable page */ |
| if ((value & MMU_ENTRY_EXECUTE_DISABLE) != 0) { |
| /* RW */ |
| ret = 'w'; |
| } else { |
| /* RWX */ |
| ret = 'a'; |
| } |
| } else { |
| if ((value & MMU_ENTRY_EXECUTE_DISABLE) != 0) { |
| /* R */ |
| ret = 'r'; |
| } else { |
| /* RX */ |
| ret = 'x'; |
| } |
| } |
| |
| if ((value & MMU_ENTRY_USER) != 0) { |
| /* Uppercase indicates user mode access */ |
| ret = toupper(ret); |
| } |
| } |
| |
| return ret; |
| } |
| |
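| /* Print a one-character permission summary for every PTE in this page |
| * table, 64 entries per line (see get_entry_code() for the encoding). |
| */ |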
| static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index) |
| { |
| int column = 0; |
| |
| printk("Page table %d for 0x%08lX - 0x%08lX at %p\n", |
| index, base, base + Z_X86_PT_AREA - 1, pt); |
| |
| for (int i = 0; i < Z_X86_NUM_PT_ENTRIES; i++) { |
| printk("%c", get_entry_code(pt->entry[i].value)); |
| |
| column++; |
| if (column == 64) { |
| column = 0; |
| printk("\n"); |
| } |
| } |
| } |
| |
| static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index) |
| { |
| int column = 0; |
| |
| printk("Page directory %d for 0x%08lX - 0x%08lX at %p\n", |
| index, base, base + Z_X86_PD_AREA - 1, pd); |
| |
| for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) { |
| printk("%c", get_entry_code(pd->entry[i].pt.value)); |
| |
| column++; |
| if (column == 64) { |
| column = 0; |
| printk("\n"); |
| } |
| } |
| |
| for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) { |
| struct x86_mmu_pt *pt; |
| union x86_mmu_pde_pt *pde = &pd->entry[i].pt; |
| |
| if (pde->p == 0 || pde->ps == 1) { |
| /* Skip non-present or 2MB directory entries; there is |
| * no page table to examine |
| */ |
| continue; |
| } |
| pt = (struct x86_mmu_pt *)(pde->pt << MMU_PAGE_SHIFT); |
| |
| z_x86_dump_pt(pt, base + (i * Z_X86_PT_AREA), i); |
| } |
| } |
| |
| static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base, |
| int index) |
| { |
| printk("Page directory pointer table %d for 0x%08lX - 0x%08lX at %p\n", |
| index, base, base + Z_X86_PDPT_AREA - 1, pdpt); |
| |
| for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) { |
| printk("%c", get_entry_code(pdpt->entry[i].value)); |
| } |
| printk("\n"); |
| for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) { |
| struct x86_mmu_pd *pd; |
| |
| if (pdpt->entry[i].p == 0) { |
| continue; |
| } |
| pd = (struct x86_mmu_pd *)(pdpt->entry[i].pd << MMU_PAGE_SHIFT); |
| |
| z_x86_dump_pd(pd, base + (i * Z_X86_PD_AREA), i); |
| } |
| } |
| |
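| /* Dump a summary of an entire set of page tables, starting at the PDPT; |
| * for example, z_x86_dump_page_tables(&z_x86_kernel_pdpt) dumps the |
| * kernel's boot-time tables. |
| */ |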
| void z_x86_dump_page_tables(struct x86_mmu_pdpt *pdpt) |
| { |
| z_x86_dump_pdpt(pdpt, 0, 0); |
| } |
| |
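| /* Fetch the flag bits (everything outside the address field) of the PDE |
| * and PTE that map 'addr'. If the PDE is not present, *pte_flags is set |
| * to zero, since there is no page table to consult. |
| */ |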
| void z_x86_mmu_get_flags(struct x86_mmu_pdpt *pdpt, void *addr, |
| x86_page_entry_data_t *pde_flags, |
| x86_page_entry_data_t *pte_flags) |
| { |
| *pde_flags = |
| (x86_page_entry_data_t)(X86_MMU_GET_PDE(pdpt, addr)->value & |
| ~(x86_page_entry_data_t)MMU_PDE_PAGE_TABLE_MASK); |
| |
| if ((*pde_flags & MMU_ENTRY_PRESENT) != 0) { |
| *pte_flags = (x86_page_entry_data_t) |
| (X86_MMU_GET_PTE(pdpt, addr)->value & |
| ~(x86_page_entry_data_t)MMU_PTE_PAGE_MASK); |
| } else { |
| *pte_flags = 0; |
| } |
| } |
| |
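| /* Check whether the buffer at 'addr' of 'size' bytes is accessible from |
| * user mode (and writable, if 'write' is non-zero) according to the |
| * provided page tables. Returns 0 if the entire buffer is accessible, |
| * -EPERM otherwise. |
| */ |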
| int z_x86_mmu_validate(struct x86_mmu_pdpt *pdpt, void *addr, size_t size, |
| int write) |
| { |
| u32_t start_pde_num; |
| u32_t end_pde_num; |
| u32_t starting_pte_num; |
| u32_t ending_pte_num; |
| u32_t pde; |
| u32_t pte; |
| union x86_mmu_pte pte_value; |
| u32_t start_pdpte_num = MMU_PDPTE_NUM(addr); |
| u32_t end_pdpte_num = MMU_PDPTE_NUM((char *)addr + size - 1); |
| u32_t pdpte; |
| struct x86_mmu_pt *pte_address; |
| int ret = -EPERM; |
| |
| start_pde_num = MMU_PDE_NUM(addr); |
| end_pde_num = MMU_PDE_NUM((char *)addr + size - 1); |
| starting_pte_num = MMU_PAGE_NUM((char *)addr); |
| |
| for (pdpte = start_pdpte_num; pdpte <= end_pdpte_num; pdpte++) { |
| if (pdpte != start_pdpte_num) { |
| start_pde_num = 0U; |
| } |
| |
| if (pdpte != end_pdpte_num) { |
| end_pde_num = 511U; |
| } else { |
| end_pde_num = MMU_PDE_NUM((char *)addr + size - 1); |
| } |
| |
| /* Ensure page directory pointer table entry is present */ |
| if (X86_MMU_GET_PDPTE_INDEX(pdpt, pdpte)->p == 0) { |
| goto out; |
| } |
| |
| struct x86_mmu_pd *pd_address = |
| X86_MMU_GET_PD_ADDR_INDEX(pdpt, pdpte); |
| |
| /* Iterate over all the PDEs the buffer might span, which |
| * depends on the buffer's size and start address. |
| */ |
| for (pde = start_pde_num; pde <= end_pde_num; pde++) { |
| union x86_mmu_pde_pt pde_value = |
| pd_address->entry[pde].pt; |
| |
| if ((pde_value.p) == 0 || |
| (pde_value.us) == 0 || |
| ((write != 0) && (pde_value.rw == 0))) { |
| goto out; |
| } |
| |
| pte_address = (struct x86_mmu_pt *) |
| (pde_value.pt << MMU_PAGE_SHIFT); |
| |
| /* Determine the range of PTEs to check within this page |
| * table. For every PDE except the last one, the buffer |
| * covers the table through its final entry (511); for the |
| * last PDE, the ending PTE is derived from the buffer's |
| * last byte. |
| */ |
| if (pde != end_pde_num) { |
| ending_pte_num = 511U; |
| } else { |
| ending_pte_num = |
| MMU_PAGE_NUM((char *)addr + size - 1); |
| } |
| |
| /* Every PDE other than the starting one begins checking |
| * at PTE number zero. |
| */ |
| if (pde != start_pde_num) { |
| starting_pte_num = 0U; |
| } |
| |
| pte_value.value = 0xFFFFFFFFU; |
| |
| /* Bitwise AND all the PTE values together so the |
| * permission bits only need to be compared once. |
| */ |
| for (pte = starting_pte_num; |
| pte <= ending_pte_num; |
| pte++) { |
| pte_value.value &= |
| pte_address->entry[pte].value; |
| } |
| |
| if ((pte_value.p) == 0 || |
| (pte_value.us) == 0 || |
| ((write != 0) && (pte_value.rw == 0))) { |
| goto out; |
| } |
| } |
| } |
| ret = 0; |
| out: |
| #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION |
| __asm__ volatile ("lfence" : : : "memory"); |
| #endif |
| |
| return ret; |
| } |
| |
| static inline void tlb_flush_page(void *addr) |
| { |
| /* Invalidate TLB entries corresponding to the page containing the |
| * specified address |
| */ |
| char *page = (char *)addr; |
| |
| __asm__ ("invlpg %0" :: "m" (*page)); |
| } |
| |
| #define PDPTE_FLAGS_MASK MMU_ENTRY_PRESENT |
| |
| #define PDE_FLAGS_MASK (MMU_ENTRY_WRITE | MMU_ENTRY_USER | \ |
| PDPTE_FLAGS_MASK) |
| |
| #define PTE_FLAGS_MASK (PDE_FLAGS_MASK | MMU_ENTRY_EXECUTE_DISABLE | \ |
| MMU_ENTRY_WRITE_THROUGH | \ |
| MMU_ENTRY_CACHING_DISABLE) |
| |
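| /* Update the flags of every page in the region ['ptr', 'ptr' + 'size'), |
| * replacing the bits selected by 'mask' with 'flags' in each PTE and |
| * propagating the relevant flags up to the PDE and PDPTE. Optionally |
| * flushes the TLB for each page touched. |
| */ |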
| void z_x86_mmu_set_flags(struct x86_mmu_pdpt *pdpt, void *ptr, size_t size, |
| x86_page_entry_data_t flags, |
| x86_page_entry_data_t mask, bool flush) |
| { |
| u32_t addr = (u32_t)ptr; |
| |
| __ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided"); |
| __ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided"); |
| |
| /* L1TF mitigation: non-present PTEs will have address fields |
| * zeroed. Expand the mask to include address bits if we are changing |
| * the present bit. |
| */ |
| if ((mask & MMU_PTE_P_MASK) != 0) { |
| mask |= MMU_PTE_PAGE_MASK; |
| } |
| |
| while (size != 0) { |
| union x86_mmu_pte *pte; |
| union x86_mmu_pde_pt *pde; |
| union x86_mmu_pdpte *pdpte; |
| x86_page_entry_data_t cur_flags = flags; |
| |
| pdpte = X86_MMU_GET_PDPTE(pdpt, addr); |
| __ASSERT(pdpte->p == 1, "set flags on non-present PDPTE"); |
| pdpte->value |= (flags & PDPTE_FLAGS_MASK); |
| |
| pde = X86_MMU_GET_PDE(pdpt, addr); |
| __ASSERT(pde->p == 1, "set flags on non-present PDE"); |
| pde->value |= (flags & PDE_FLAGS_MASK); |
| /* If any flags enable execution, clear execute disable at the |
| * page directory level |
| */ |
| if ((flags & MMU_ENTRY_EXECUTE_DISABLE) == 0) { |
| pde->value &= ~MMU_ENTRY_EXECUTE_DISABLE; |
| } |
| |
| pte = X86_MMU_GET_PTE(pdpt, addr); |
| /* If we're setting the present bit, restore the address |
| * field. If we're clearing it, then the address field |
| * will be zeroed instead, mapping the PTE to the NULL page. |
| */ |
| if (((mask & MMU_PTE_P_MASK) != 0) && |
| ((flags & MMU_ENTRY_PRESENT) != 0)) { |
| cur_flags |= addr; |
| } |
| |
| pte->value = (pte->value & ~mask) | cur_flags; |
| if (flush) { |
| tlb_flush_page((void *)addr); |
| } |
| |
| size -= MMU_PAGE_SIZE; |
| addr += MMU_PAGE_SIZE; |
| } |
| } |
| |
| static char __aligned(MMU_PAGE_SIZE) |
| page_pool[MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES]; |
| |
| static char *page_pos = page_pool + sizeof(page_pool); |
| |
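| /* Hand out page-table pages from a static pool, working backwards from |
| * the end of page_pool. Pages are never returned to the pool. |
| */ |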
| static void *get_page(void) |
| { |
| page_pos -= MMU_PAGE_SIZE; |
| |
| __ASSERT(page_pos >= page_pool, "out of MMU pages\n"); |
| |
| return page_pos; |
| } |
| |
| __aligned(0x20) struct x86_mmu_pdpt z_x86_kernel_pdpt; |
| #ifdef CONFIG_X86_KPTI |
| __aligned(0x20) struct x86_mmu_pdpt z_x86_user_pdpt; |
| #endif |
| |
| extern char z_shared_kernel_page_start[]; |
| |
| static inline bool is_within_system_ram(uintptr_t addr) |
| { |
| return (addr >= DT_PHYS_RAM_ADDR) && |
| (addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))); |
| } |
| |
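| /* Install an identity mapping for a single page at 'addr', creating the |
| * page directory and page table on demand, and OR-ing the provided flags |
| * into the relevant PDPTE, PDE, and PTE. |
| */ |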
| static void add_mmu_region_page(struct x86_mmu_pdpt *pdpt, uintptr_t addr, |
| u64_t flags, bool user_table) |
| { |
| union x86_mmu_pdpte *pdpte; |
| struct x86_mmu_pd *pd; |
| union x86_mmu_pde_pt *pde; |
| struct x86_mmu_pt *pt; |
| union x86_mmu_pte *pte; |
| |
| #ifdef CONFIG_X86_KPTI |
| /* If we are generating page tables for user mode, and this address |
| * does not have the user flag set, and it falls outside of system |
| * RAM, then don't bother generating any tables for it; we will never |
| * need them later, as memory domains are limited to regions within |
| * system RAM. |
| */ |
| if (user_table && (flags & MMU_ENTRY_USER) == 0 && |
| !is_within_system_ram(addr)) { |
| return; |
| } |
| #endif |
| |
| /* Set up the PDPTE entry for the address, creating a page directory |
| * if one didn't exist |
| */ |
| pdpte = &pdpt->entry[MMU_PDPTE_NUM(addr)]; |
| if (pdpte->p == 0) { |
| pd = get_page(); |
| pdpte->pd = ((uintptr_t)pd) >> MMU_PAGE_SHIFT; |
| } else { |
| pd = (struct x86_mmu_pd *)(pdpte->pd << MMU_PAGE_SHIFT); |
| } |
| pdpte->value |= (flags & PDPTE_FLAGS_MASK); |
| |
| /* Set up the PDE entry for the address, creating a page table |
| * if necessary |
| */ |
| pde = &pd->entry[MMU_PDE_NUM(addr)].pt; |
| if (pde->p == 0) { |
| pt = get_page(); |
| pde->pt = ((uintptr_t)pt) >> MMU_PAGE_SHIFT; |
| } else { |
| pt = (struct x86_mmu_pt *)(pde->pt << MMU_PAGE_SHIFT); |
| } |
| pde->value |= (flags & PDE_FLAGS_MASK); |
| |
| /* The execute-disable bit needs special handling: only set it at the |
| * page directory level if ALL pages covered by the entry have XD set, |
| * not just one of them. |
| * |
| * Use the 'ignored2' field to record whether any configured region |
| * allows execution; the CPU never looks at or modifies this field. |
| */ |
| if ((flags & MMU_ENTRY_EXECUTE_DISABLE) == 0) { |
| pde->ignored2 = 1; |
| pde->value &= ~MMU_ENTRY_EXECUTE_DISABLE; |
| } else if (pde->ignored2 == 0) { |
| pde->value |= MMU_ENTRY_EXECUTE_DISABLE; |
| } |
| |
| #ifdef CONFIG_X86_KPTI |
| if (user_table && (flags & MMU_ENTRY_USER) == 0 && |
| addr != (uintptr_t)(&z_shared_kernel_page_start)) { |
| /* All non-user accessible pages except the shared page |
| * are marked non-present in the page table. |
| */ |
| return; |
| } |
| #else |
| ARG_UNUSED(user_table); |
| #endif |
| |
| /* Finally set up the page table entry */ |
| pte = &pt->entry[MMU_PAGE_NUM(addr)]; |
| pte->page = addr >> MMU_PAGE_SHIFT; |
| pte->value |= (flags & PTE_FLAGS_MASK); |
| } |
| |
| static void add_mmu_region(struct x86_mmu_pdpt *pdpt, struct mmu_region *rgn, |
| bool user_table) |
| { |
| size_t size; |
| u64_t flags; |
| uintptr_t addr; |
| |
| __ASSERT((rgn->address & MMU_PAGE_MASK) == 0U, |
| "unaligned address provided"); |
| __ASSERT((rgn->size & MMU_PAGE_MASK) == 0U, |
| "unaligned size provided"); |
| |
| addr = rgn->address; |
| |
| /* Add the present flag, and filter out 'runtime user' since this |
| * has no meaning to the actual MMU |
| */ |
| flags = rgn->flags | MMU_ENTRY_PRESENT; |
| |
| /* Iterate through the region a page at a time, creating entries as |
| * necessary. |
| */ |
| size = rgn->size; |
| while (size > 0) { |
| add_mmu_region_page(pdpt, addr, flags, user_table); |
| |
| size -= MMU_PAGE_SIZE; |
| addr += MMU_PAGE_SIZE; |
| } |
| } |
| |
| extern struct mmu_region z_x86_mmulist_start[]; |
| extern struct mmu_region z_x86_mmulist_end[]; |
| |
| /* Called from x86's kernel_arch_init() */ |
| void z_x86_paging_init(void) |
| { |
| size_t pages_free; |
| |
| for (struct mmu_region *rgn = z_x86_mmulist_start; |
| rgn < z_x86_mmulist_end; rgn++) { |
| add_mmu_region(&z_x86_kernel_pdpt, rgn, false); |
| #ifdef CONFIG_X86_KPTI |
| add_mmu_region(&z_x86_user_pdpt, rgn, true); |
| #endif |
| } |
| |
| pages_free = (page_pos - page_pool) / MMU_PAGE_SIZE; |
| |
| if (pages_free != 0) { |
| printk("Optimal CONFIG_X86_MMU_PAGE_POOL_PAGES %zu\n", |
| CONFIG_X86_MMU_PAGE_POOL_PAGES - pages_free); |
| } |
| |
| z_x86_enable_paging(); |
| } |
| |
| #ifdef CONFIG_X86_USERSPACE |
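| /* System call buffer validation hook: check 'addr'/'size' against the |
| * page tables of the current thread. |
| */ |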
| int z_arch_buffer_validate(void *addr, size_t size, int write) |
| { |
| return z_x86_mmu_validate(z_x86_pdpt_get(_current), addr, size, write); |
| } |
| |
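| /* Copy each page directory from the master tables into the per-thread |
| * area starting at 'pages', and point the per-thread PDPT entries at the |
| * copies. Returns the address of the next free page in the area. |
| */ |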
| static uintptr_t thread_pd_create(uintptr_t pages, |
| struct x86_mmu_pdpt *thread_pdpt, |
| struct x86_mmu_pdpt *master_pdpt) |
| { |
| uintptr_t pos = pages, phys_addr = Z_X86_PD_START; |
| |
| for (int i = 0; i < Z_X86_NUM_PD; i++, phys_addr += Z_X86_PD_AREA) { |
| union x86_mmu_pdpte *pdpte; |
| struct x86_mmu_pd *master_pd, *dest_pd; |
| |
| /* Obtain PD in master tables for the address range and copy |
| * into the per-thread PD for this range |
| */ |
| master_pd = X86_MMU_GET_PD_ADDR(master_pdpt, phys_addr); |
| dest_pd = (struct x86_mmu_pd *)pos; |
| |
| (void)memcpy(dest_pd, master_pd, sizeof(struct x86_mmu_pd)); |
| |
| /* Update pointer in per-thread pdpt to point to the per-thread |
| * directory we just copied |
| */ |
| pdpte = X86_MMU_GET_PDPTE(thread_pdpt, phys_addr); |
| pdpte->pd = pos >> MMU_PAGE_SHIFT; |
| pos += MMU_PAGE_SIZE; |
| } |
| |
| return pos; |
| } |
| |
| /* thread_pdpt must be initialized, as well as all the page directories */ |
| static uintptr_t thread_pt_create(uintptr_t pages, |
| struct x86_mmu_pdpt *thread_pdpt, |
| struct x86_mmu_pdpt *master_pdpt) |
| { |
| uintptr_t pos = pages, phys_addr = Z_X86_PT_START; |
| |
| for (int i = 0; i < Z_X86_NUM_PT; i++, phys_addr += Z_X86_PT_AREA) { |
| union x86_mmu_pde_pt *pde; |
| struct x86_mmu_pt *master_pt, *dest_pt; |
| |
| /* Same as we did with the directories, obtain PT in master |
| * tables for the address range and copy into per-thread PT |
| * for this range |
| */ |
| master_pt = X86_MMU_GET_PT_ADDR(master_pdpt, phys_addr); |
| dest_pt = (struct x86_mmu_pt *)pos; |
| (void)memcpy(dest_pt, master_pt, sizeof(struct x86_mmu_pt)); |
| |
| /* And then wire this up to the relevant per-thread |
| * page directory entry |
| */ |
| pde = X86_MMU_GET_PDE(thread_pdpt, phys_addr); |
| pde->pt = pos >> MMU_PAGE_SHIFT; |
| pos += MMU_PAGE_SIZE; |
| } |
| |
| return pos; |
| } |
| |
| /* Initialize the page tables for a thread. Once this is done, they will |
| * contain the boot-time memory configuration for a user thread's page |
| * tables. There are no pre-conditions on the existing state of the |
| * per-thread tables. |
| */ |
| static void copy_page_tables(struct k_thread *thread, |
| struct x86_mmu_pdpt *master_pdpt) |
| { |
| uintptr_t pos, start; |
| struct x86_mmu_pdpt *thread_pdpt = z_x86_pdpt_get(thread); |
| struct z_x86_thread_stack_header *header = |
| (struct z_x86_thread_stack_header *)thread->stack_obj; |
| |
| __ASSERT(thread->stack_obj != NULL, "no stack object assigned"); |
| __ASSERT(z_x86_page_tables_get() != thread_pdpt, "PDPT is active"); |
| __ASSERT(((uintptr_t)thread_pdpt & 0x1f) == 0, "unaligned pdpt at %p", |
| thread_pdpt); |
| |
| (void)memcpy(thread_pdpt, master_pdpt, sizeof(struct x86_mmu_pdpt)); |
| |
| /* pos represents the page we are working with in the reserved area |
| * in the stack buffer for per-thread tables. As we create tables in |
| * this area, pos is incremented to the next free page. |
| * |
| * The layout of the stack object, when this is done: |
| * |
| * +---------------------------+ <- thread->stack_obj |
| * | PDE(0) | |
| * +---------------------------+ |
| * | ... | |
| * +---------------------------+ |
| * | PDE(Z_X86_NUM_PD - 1) | |
| * +---------------------------+ |
| * | PTE(0) | |
| * +---------------------------+ |
| * | ... | |
| * +---------------------------+ |
| * | PTE(Z_X86_NUM_PT - 1) | |
| * +---------------------------+ <- pos once this logic completes |
| * | Stack guard | |
| * +---------------------------+ |
| * | Privilege elevation stack | |
| * | PDPT | |
| * +---------------------------+ <- thread->stack_info.start |
| * | Thread stack | |
| * | ... | |
| * |
| */ |
| start = (uintptr_t)(&header->page_tables); |
| pos = thread_pd_create(start, thread_pdpt, master_pdpt); |
| pos = thread_pt_create(pos, thread_pdpt, master_pdpt); |
| |
| __ASSERT(pos == (start + Z_X86_THREAD_PT_AREA), |
| "wrong amount of stack object memory used"); |
| } |
| |
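| /* Restore a partition's address range in a thread's page tables to the |
| * boot-time policy found in USER_PDPT. |
| */ |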
| static void reset_mem_partition(struct x86_mmu_pdpt *thread_pdpt, |
| struct k_mem_partition *partition) |
| { |
| uintptr_t addr = partition->start; |
| size_t size = partition->size; |
| |
| __ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided"); |
| __ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided"); |
| |
| while (size != 0) { |
| union x86_mmu_pte *thread_pte, *master_pte; |
| |
| thread_pte = X86_MMU_GET_PTE(thread_pdpt, addr); |
| master_pte = X86_MMU_GET_PTE(&USER_PDPT, addr); |
| |
| (void)memcpy(thread_pte, master_pte, sizeof(union x86_mmu_pte)); |
| |
| size -= MMU_PAGE_SIZE; |
| addr += MMU_PAGE_SIZE; |
| } |
| } |
| |
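| /* Apply a memory partition's access attributes to the given page tables. |
| * With KPTI enabled, the present bit is set as well, since non-user pages |
| * are marked non-present in the user copy of the tables. |
| */ |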
| static void apply_mem_partition(struct x86_mmu_pdpt *pdpt, |
| struct k_mem_partition *partition) |
| { |
| x86_page_entry_data_t x86_attr; |
| x86_page_entry_data_t mask; |
| |
| if (IS_ENABLED(CONFIG_X86_KPTI)) { |
| x86_attr = partition->attr | MMU_ENTRY_PRESENT; |
| mask = K_MEM_PARTITION_PERM_MASK | MMU_PTE_P_MASK; |
| } else { |
| x86_attr = partition->attr; |
| mask = K_MEM_PARTITION_PERM_MASK; |
| } |
| |
| __ASSERT(partition->start >= DT_PHYS_RAM_ADDR, |
| "region at %08lx[%u] extends below system ram start 0x%08x", |
| partition->start, partition->size, DT_PHYS_RAM_ADDR); |
| __ASSERT(((partition->start + partition->size) <= |
| (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))), |
| "region at %08lx[%u] end at %08lx extends beyond system ram end 0x%08x", |
| partition->start, partition->size, |
| partition->start + partition->size, |
| (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))); |
| |
| z_x86_mmu_set_flags(pdpt, (void *)partition->start, partition->size, |
| x86_attr, mask, false); |
| } |
| |
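| /* Apply every non-empty partition in a memory domain to the given page |
| * tables. |
| */ |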
| void z_x86_apply_mem_domain(struct x86_mmu_pdpt *pdpt, |
| struct k_mem_domain *mem_domain) |
| { |
| for (int i = 0, pcount = 0; pcount < mem_domain->num_partitions; i++) { |
| struct k_mem_partition *partition; |
| |
| partition = &mem_domain->partitions[i]; |
| if (partition->size == 0) { |
| continue; |
| } |
| pcount++; |
| |
| apply_mem_partition(pdpt, partition); |
| } |
| } |
| |
| /* Called on creation of a user thread or when a supervisor thread drops to |
| * user mode. |
| * |
| * Sets up the per-thread page tables, such that when they are activated on |
| * context switch, everything is ready to go. |
| */ |
| void z_x86_thread_pt_init(struct k_thread *thread) |
| { |
| struct x86_mmu_pdpt *pdpt = z_x86_pdpt_get(thread); |
| |
| /* USER_PDPT contains the page tables with the boot time memory |
| * policy. We use it as a template to set up the per-thread page |
| * tables. |
| * |
| * With KPTI, this is a set of tables (z_x86_user_pdpt) distinct from |
| * the kernel page tables in z_x86_kernel_pdpt; in it, all pages that |
| * are not user-accessible, except the trampoline page, are marked |
| * non-present. Without KPTI, they are the same object. |
| */ |
| copy_page_tables(thread, &USER_PDPT); |
| |
| /* Enable access to the thread's own stack buffer */ |
| z_x86_mmu_set_flags(pdpt, (void *)thread->stack_info.start, |
| ROUND_UP(thread->stack_info.size, MMU_PAGE_SIZE), |
| MMU_ENTRY_PRESENT | K_MEM_PARTITION_P_RW_U_RW, |
| MMU_PTE_P_MASK | K_MEM_PARTITION_PERM_MASK, |
| false); |
| } |
| |
| /* |
| * Memory domain interface |
| * |
| * In all cases, if one of these APIs is called on a supervisor thread, |
| * we don't need to do anything. If the thread later drops to user mode, |
| * the per-thread page tables will be generated and the memory domain |
| * configuration applied. |
| */ |
| void z_arch_mem_domain_partition_remove(struct k_mem_domain *domain, |
| u32_t partition_id) |
| { |
| sys_dnode_t *node, *next_node; |
| |
| /* Removing a partition. Need to reset the relevant memory range |
| * to the defaults in USER_PDPT for each thread. |
| */ |
| SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) { |
| struct k_thread *thread = |
| CONTAINER_OF(node, struct k_thread, mem_domain_info); |
| |
| if ((thread->base.user_options & K_USER) == 0) { |
| continue; |
| } |
| |
| reset_mem_partition(z_x86_pdpt_get(thread), |
| &domain->partitions[partition_id]); |
| } |
| } |
| |
| void z_arch_mem_domain_destroy(struct k_mem_domain *domain) |
| { |
| for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) { |
| struct k_mem_partition *partition; |
| |
| partition = &domain->partitions[i]; |
| if (partition->size == 0) { |
| continue; |
| } |
| pcount++; |
| |
| z_arch_mem_domain_partition_remove(domain, i); |
| } |
| } |
| |
| void z_arch_mem_domain_thread_remove(struct k_thread *thread) |
| { |
| struct k_mem_domain *domain = thread->mem_domain_info.mem_domain; |
| |
| /* Non-user threads don't have per-thread page tables set up */ |
| if ((thread->base.user_options & K_USER) == 0) { |
| return; |
| } |
| |
| for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) { |
| struct k_mem_partition *partition; |
| |
| partition = &domain->partitions[i]; |
| if (partition->size == 0) { |
| continue; |
| } |
| pcount++; |
| |
| reset_mem_partition(z_x86_pdpt_get(thread), partition); |
| } |
| } |
| |
| void z_arch_mem_domain_partition_add(struct k_mem_domain *domain, |
| u32_t partition_id) |
| { |
| sys_dnode_t *node, *next_node; |
| |
| SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) { |
| struct k_thread *thread = |
| CONTAINER_OF(node, struct k_thread, mem_domain_info); |
| |
| if ((thread->base.user_options & K_USER) == 0) { |
| continue; |
| } |
| |
| apply_mem_partition(z_x86_pdpt_get(thread), |
| &domain->partitions[partition_id]); |
| } |
| } |
| |
| void z_arch_mem_domain_thread_add(struct k_thread *thread) |
| { |
| if ((thread->base.user_options & K_USER) == 0) { |
| return; |
| } |
| |
| z_x86_apply_mem_domain(z_x86_pdpt_get(thread), |
| thread->mem_domain_info.mem_domain); |
| } |
| |
| int z_arch_mem_domain_max_partitions_get(void) |
| { |
| return CONFIG_MAX_DOMAIN_PARTITIONS; |
| } |
| #endif /* CONFIG_X86_USERSPACE */ |