/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <zephyr/zephyr.h>
#include <string.h>
#include <zephyr/sys/printk.h>
#include <xtensa-asm2.h>

#ifdef CONFIG_MULTITHREADING
#error Disable multithreading for this unit test!
#endif

/* Arbitrary constants used to "whiten" the register contents during
 * the spill test: spill_fn() XORs them into the values it hands down
 * the call chain so that every bit of every register in every call is
 * significant, in an attempt to catch any mistakes/swaps/etc...
 * interrupt_test() additionally uses them to vary its timer delay.
 */
int white[] = {
	       0x5fad484a,
	       0xc23e88f7,
	       0xfff301fb,
	       0xf1189ba7,
	       0x88bffad6,
	       0xaabb96fa,
	       0x629619d5,
	       0x246bee82
};

/* Return the current value of the CCOUNT special register. */
static inline unsigned int ccount(void)
{
	unsigned int cycles;

	__asm__ volatile("rsr.ccount %0" : "=r"(cycles));

	return cycles;
}

/* We call spill_fn() through a pointer to prevent the compiler from
 * detecting and optimizing out the tail recursion in spill_fn(), and
 * to force a real function call using CALLn instructions.
 */
int (*spill_fnp)(int level, int a, int b, int c);

/* WINDOWBASE/WINDOWSTART register values sampled by spill_fn() before
 * (suffix 0) and after (suffix 1) the spill
 */
unsigned int spill_wb0, spill_ws0, spill_wb1, spill_ws1;

/* CCOUNT values sampled by spill_fn() right before and right after
 * the spill
 */
unsigned int spill_start, spill_end;

/* Reference result for spill_fn(), recorded by test_reg_spill() from
 * its NO_SPILL pass
 */
int spill_expect;

/* Selects what spill_fn() does at the bottom of its recursion;
 * NUM_MODES is the number of modes, not a mode itself.
 */
enum {
	NO_SPILL, HAL_SPILL, ZEPHYR_SPILL, NUM_MODES
} spill_mode;

/* Recursive workload for the register window spill tests.
 *
 * While level is below ARRAY_SIZE(white), mixes a, b and c with
 * entries of white[], recurses through spill_fnp with level + 1 and
 * folds the callee's result together with the local values on the way
 * back up.  At the bottom of the recursion it samples WINDOWBASE,
 * WINDOWSTART and CCOUNT around the operation selected by spill_mode
 * (nothing at all for NO_SPILL), storing them in the spill_* globals,
 * and returns (a + b) | c.
 */
static int spill_fn(int level, int a, int b, int c)
{
	/* Be very careful when debugging, note that a printk() call
	 * tends to push all the registers out of the windows on its
	 * own, leaving no frames for us to test against!
	 */
	if (level >= ARRAY_SIZE(white)) {
		/* Window state before the spill */
		__asm__ volatile ("rsr.WINDOWBASE %0" : "=r"(spill_wb0));
		__asm__ volatile ("rsr.WINDOWSTART %0" : "=r"(spill_ws0));

		spill_start = ccount();

		if (spill_mode == NO_SPILL) {
			/* Just here to test the cycle count overhead
			 * and get the baseline function result.
			 */
		} else if (spill_mode == ZEPHYR_SPILL) {
			/* FIXME: the a0_save hack should be needless.  It
			 * *should* be enough to list "a0" in the clobber list
			 * of the __asm__ statement (and let the compiler
			 * decide on how to save the value), but that's not
			 * working for me...
			 */
			int a0_save;

			/* Stash a0 in a compiler-chosen register, since
			 * CALL0 overwrites a0, and put it back afterwards.
			 */
			__asm__ volatile
				("mov %0, a0"			"\n\t"
				 "call0 spill_reg_windows"	"\n\t"
				 "mov a0, %0"			"\n\t"
				 : "=r"(a0_save));
		} else if (spill_mode == HAL_SPILL) {
			/* Strictly there is a xthal_window_spill_nw
			 * routine that is called with special setup
			 * (use CALL0, spill A2/A3, clear WOE) and
			 * supposed to be faster, but I couldn't make
			 * that work.
			 */
			extern void xthal_window_spill(void);
			xthal_window_spill();
		}

		/* Window state and cycle count after the spill */
		spill_end = ccount();
		__asm__ volatile ("rsr.WINDOWBASE %0" : "=r" (spill_wb1));
		__asm__ volatile ("rsr.WINDOWSTART %0" : "=r" (spill_ws1));

		return ((a + b) | c);
	}

	/* Arguments for the next level, whitened with three
	 * consecutive (wrapping) entries of white[]
	 */
	int val1 = (a - (b & c)) ^ white[level];
	int val2 = ((a | b) + c) ^ white[(level + 1) % ARRAY_SIZE(white)];
	int val3 = (a - (b - c)) ^ white[(level + 2) % ARRAY_SIZE(white)];

	int x = spill_fnp(level+1, val1, val2, val3);

	/* FIXME: as it happens, the compiler seems not to be
	 * optimizing components of this addition before the function
	 * call, which is what we want: the desire is that the
	 * individual values be held in registers across the call so
	 * that they can be checked to have been spilled/filled
	 * properly as we return up the stack.  But the compiler
	 * certainly COULD reorder this addition (it would actually be
	 * a good optimization: you could reduce the number of
	 * registers used before the tail return and use a smaller
	 * call frame).  For now, I'm happy enough simply having read
	 * the generated code, but long term this should be a more
	 * robust test if possible.  Maybe write the values to some
	 * extern volatile spots...
	 */
	return x + val1 + val2 + val3 + a + b + c;
}

/* Run spill_fn() once per spill mode and validate the spilling modes.
 *
 * The NO_SPILL pass only records the reference result in spill_expect.
 * For every other mode, the WINDOWSTART value sampled after the spill
 * must have exactly the bit selected by the sampled WINDOWBASE set,
 * and the function result must match the reference.
 *
 * Returns 1 if every check passed, 0 otherwise.
 */
int test_reg_spill(void)
{
	spill_fnp = spill_fn;

	int ok = 1;

	for (spill_mode = NO_SPILL; spill_mode < NUM_MODES; spill_mode++) {
		printk("Testing %s\n",
		       spill_mode == NO_SPILL ? "NO_SPILL"
		       : (spill_mode == HAL_SPILL ? "HAL_SPILL"
			  : "ZEPHYR_SPILL"));

		int result = spill_fnp(0, 1, 2, 3);

		/* All five values are unsigned int, so use unsigned
		 * conversions to match the argument types.
		 */
		printk("  WINDOWBASE %u -> %u, WINDOWSTART 0x%x -> 0x%x (%u cycles)\n",
		       spill_wb0, spill_wb1, spill_ws0, spill_ws1,
		       spill_end - spill_start);

		if (spill_mode == NO_SPILL) {
			/* Baseline run: nothing to validate yet */
			spill_expect = result;
			continue;
		}

		/* Shift an unsigned one so the comparison with the
		 * unsigned WINDOWSTART value involves no signed shift.
		 */
		if (spill_ws1 != (1U << spill_wb1)) {
			printk("WINDOWSTART should show exactly one frame at WINDOWBASE\n");
			ok = 0;
		}

		if (result != spill_expect) {
			printk("Unexpected fn(1, 2, 3) result, got %d want %d\n",
			       result, spill_expect);
			ok = 0;
		}
	}

	return ok;
}

/* Read by test_highreg_save() after each highreg test run.  Nothing in
 * the C code visible here writes it; presumably the assembly helpers
 * do -- TODO confirm.
 */
int *test_highreg_handle;

/* Simple save locations for some context needed by the test assembly */
void *_test_highreg_sp_save;
void *_test_highreg_a0_save;

/* Scratch area inspected by test_highreg_save() */
int test_highreg_stack[64];

/* One-past-the-end pointer of test_highreg_stack */
int *test_highreg_sp_top = &test_highreg_stack[ARRAY_SIZE(test_highreg_stack)];

/* External function, defined in assembly */
void fill_window(void (*fn)(void));

/* Test rig for fill_window, maybe remove as a metatest */
int testfw_wb, testfw_ws;
void testfw(void);

/* Assembly-defined leaf functions for fill_window which poke the
 * specified number of high GPRs before calling xtensa_save_high_regs
 * to spill them into the test_highreg_stack area for inspection.
 */
void test_highreg_0(void);
void test_highreg_4(void);
void test_highreg_8(void);
void test_highreg_12(void);

/* The leaf functions above, in the order test_highreg_save() runs them */
typedef void (*test_fn_t)(void);
test_fn_t highreg_tests[] = {
	test_highreg_0,
	test_highreg_4,
	test_highreg_8,
	test_highreg_12,
};

/* Validate fill_window() itself and then every entry of
 * highreg_tests[].
 *
 * First, running testfw through fill_window() must leave testfw_ws
 * with its low XCHAL_NUM_AREGS / 4 bits (and only those) set.  Then,
 * for each highreg test, the word at test_highreg_handle must equal
 * the address test_highreg_sp_top, and the groups of four words
 * stored below test_highreg_sp_top must hold the values 4, 5, 6, ...
 * in order.
 *
 * Returns 1 if everything matched, 0 otherwise.
 */
int test_highreg_save(void)
{
	const int full_ws = (1 << (XCHAL_NUM_AREGS / 4)) - 1;
	int ok;

	fill_window(testfw);
	printk("testfw wb %d ws 0x%x\n", testfw_wb, testfw_ws);
	ok = (testfw_ws == full_ws);

	for (int t = 0; t < ARRAY_SIZE(highreg_tests); t++) {
		printk("\nHighreg test %d\n", t);

		fill_window(highreg_tests[t]);

		/* Only look at the handle while still passing */
		if (ok && *test_highreg_handle != (int)test_highreg_sp_top) {
			ok = 0;
		}

		int spilled_words = test_highreg_sp_top - test_highreg_handle;
		int n_quads = (spilled_words - 1) / 4;

		/* Stop walking quads as soon as a failure is recorded */
		for (int quad = 0; ok && quad < n_quads; quad++) {
			int *qbase = test_highreg_sp_top - (quad + 1) * 4;

			/* A started quad is always dumped in full */
			for (int ri = 0; ri < 4; ri++) {
				int reg = 4 + quad * 4 + ri;

				if (qbase[ri] != reg) {
					ok = 0;
				}
				printk("  q %d reg %d qb[%d] %d\n",
				       quad, reg, ri, qbase[ri]);
			}
		}
	}

	return ok;
}

/* Context handles exchanged through xtensa_switch(): handle0 is the
 * one test_switch_top() switches back to, handle1 the one
 * test_switch() switches into.
 */
void *switch_handle0, *switch_handle1;

/* Defined outside this file.  As used here, the first argument is the
 * context to switch to; presumably the outgoing context's handle is
 * stored through old_handle -- TODO confirm against the definition.
 */
void xtensa_switch(void *handle, void **old_handle);

/* Assembly trampoline used as the entry point of the second context:
 * it simply does a CALL4 into test_switch_top().
 */
void test_switch_bounce(void);
__asm__("test_switch_bounce:"	"\n\t"
	"call4 test_switch_top"	"\n\t");

/* Written by test_switch_top() before every switch back; read by
 * test_switch()
 */
volatile int switch_count;

/* Body of the second context: publishes an incrementing pass number
 * (starting at 1) in switch_count, then switches back to handle0
 * (which is the main thread).  Never returns.
 */
void test_switch_top(void)
{
	for (int pass = 1; ; pass++) {
		switch_count = pass;
		xtensa_switch(switch_handle0, &switch_handle1);
	}
}

/* Context switch test: builds an initial frame for
 * test_switch_bounce on a private, zeroed stack and switches into it
 * n_switch times.  test_switch_top() bumps switch_count once per
 * visit before switching back.
 *
 * Returns 1 if switch_count ended up equal to the number of switches,
 * 0 otherwise.
 */
int test_switch(void)
{
	static int stack2[512];
	const int n_switch = 10;

	printk("%s\n", __func__);

	(void)memset(stack2, 0, sizeof(stack2));

	/* The initial stack pointer is one past the end of stack2 */
	int *sp = xtensa_init_stack(k_current_get(),
				    stack2 + ARRAY_SIZE(stack2),
				    (void *)test_switch_bounce,
				    0, 0, 0);

#if 0
	/* DEBUG: dump the stack contents for manual inspection */
	for (int i = 0; i < 64; i++) {
		int idx = ARRAY_SIZE(stack2) - (i+1);
		int off = (i+1) * -4;
		int *addr = &stack2[idx];

		if (addr < sp) {
			break;
		}
		printk("%p (%d): 0x%x\n", addr, off, stack2[idx]);
	}
	printk("sp: %p\n", sp);
#endif

	switch_handle1 = sp;

	for (int done = 0; done != n_switch; done++) {
		xtensa_switch(switch_handle1, &switch_handle0);
		/* printk("switch %d count %d\n", done, switch_count); */
	}

	int final_count = switch_count;

	return final_count == n_switch;
}

void rfi_jump(void);

/* Read the PS special register and print it in hex.
 *
 * Not called from the C code visible here; presumably it is the
 * target that the externally defined rfi_jump() transfers to --
 * TODO confirm in the assembly helpers.
 */
void rfi_jump_c(void)
{
	/* Unsigned to match the %x conversion below */
	unsigned int ps;

	__asm__ volatile ("rsr.PS %0" : "=r"(ps));
	printk("%s, PS = %xh\n", __func__, ps);
}

/* Set to 1 by xstack_bottom(); checked by test_xstack() */
int xstack_ok;

#define XSTACK_SIZE 1024
#define XSTACK_CANARY 0x5a5aa5a5
/* One element larger than XSTACK_SIZE: test_xstack() stores the canary
 * in the extra, topmost slot.
 */
static int xstack_stack2[XSTACK_SIZE + 1];

void do_xstack_call(void *new_stack); /* in asmhelp.S */

/* Innermost function of the cross-stack test (run via fill_window()
 * from xstack_top()): just records that it was reached.
 */
void xstack_bottom(void)
{
	xstack_ok = 1;
}

/* Prints the address of one of its own locals (to show which stack it
 * is running on) and then runs xstack_bottom() through fill_window().
 *
 * Not called from the C code visible here; presumably
 * do_xstack_call() invokes it on the new stack -- TODO confirm in
 * asmhelp.S.
 */
void xstack_top(void)
{
	int on_my_stack;

	/* %p expects a void pointer, hence the explicit cast */
	printk("%s oms %p\n", __func__, (void *)&on_my_stack);

	/* Do this via fill_window() to be absolutely sure the whole
	 * call stack across both physical stacks got spilled and
	 * filled properly.
	 */
	fill_window(xstack_bottom);
}

int test_xstack(void)
{
	/* Make the stack one element big and put a canary above it to
	 * check nothing underflows
	 */

	int *new_stack = &xstack_stack2[XSTACK_SIZE];
	*new_stack = XSTACK_CANARY;

	printk("%s new_stack = %p\n", __func__, new_stack);

	do_xstack_call(new_stack);

	printk("xstack_ok %d stack2[%d] 0x%x\n",
	       xstack_ok, XSTACK_SIZE, xstack_stack2[XSTACK_SIZE]);

	return xstack_ok && xstack_stack2[XSTACK_SIZE] == XSTACK_CANARY;
}

/* Bit number in INTENABLE that enable_timer()/disable_timer() toggle */
#ifdef CONFIG_SOC_ESP32
#define TIMER_INT 16
#else
#define TIMER_INT 13
#endif

/* Set to 1 by handle_int5_c(); cleared and polled by interrupt_test() */
volatile int timer2_fired;

/* Stack area for the interrupt test and its one-past-the-end address */
int excint_stack[8192];
void *excint_stack_top = &excint_stack[ARRAY_SIZE(excint_stack)];

/* interrupt_test() fills this in and writes its address to MISC0;
 * presumably the interrupt entry code reads it from there -- TODO
 * confirm.
 */
static struct { int nest; void *stack_top; } excint_cpu;

/* Result of the spill_fn() run performed inside handle_int5_c() */
volatile int int5_result;

/* Clear bit TIMER_INT in the INTENABLE special register. */
void disable_timer(void)
{
	int intenable;

	__asm__ volatile("rsr.intenable %0" : "=r"(intenable));
	intenable = intenable & ~(1 << TIMER_INT);
	__asm__ volatile("wsr.intenable %0; rsync" : : "r"(intenable));
}

/* Set bit TIMER_INT in the INTENABLE special register. */
void enable_timer(void)
{
	int intenable;

	__asm__ volatile("rsr.intenable %0" : "=r"(intenable));
	intenable = intenable | (1 << TIMER_INT);
	__asm__ volatile("wsr.intenable %0; rsync" : : "r"(intenable));
}

/* C part of the test interrupt handler.
 *
 * Runs the same spill_fn() computation as interrupt_test() and
 * publishes the result in int5_result, rewrites CCOMPARE2 with a value
 * one behind the current CCOUNT, disables the timer interrupt again
 * and finally raises timer2_fired.
 *
 * NOTE(review): the CCOMPARE2 write presumably acknowledges the
 * interrupt and pushes the next match far into the future -- confirm
 * against the Xtensa ISA documentation.
 *
 * Returns the handle it was given, unchanged.
 */
void *handle_int5_c(void *handle)
{
	int5_result = spill_fnp(0, 3, 2, 1);

	int just_passed = ccount() - 1;

	__asm__ volatile("wsr.ccompare2 %0; rsync" : : "r"(just_passed));

	disable_timer();

	/* Flag last, once the timer has been dealt with */
	timer2_fired = 1;

	return handle;
}

/* Exercise the timer interrupt while the main thread is executing
 * register-sensitive code.
 *
 * Publishes the address of excint_cpu through MISC0, measures how many
 * cycles one spill_fn() run takes, and then, ten thousand times, arms
 * CCOMPARE2 with a varying delay, enables the timer interrupt and
 * spins until handle_int5_c() reports that it fired.  After each
 * interrupt the result computed inside the handler (int5_result) must
 * equal the one computed up front.
 *
 * Returns 1 if every iteration matched, 0 otherwise.
 */
int interrupt_test(void)
{
	int ok = 1;

	excint_cpu.nest = 0;
	excint_cpu.stack_top = &excint_stack[ARRAY_SIZE(excint_stack)];

	void *cpuptr = &excint_cpu;

	__asm__ volatile("wsr.MISC0 %0" : : "r"(cpuptr));

	/* We reuse the "spill_fn" logic from above to get a
	 * stack-sensitive, deeply-recursive computation going that
	 * will be sensitive to interrupt bugs
	 */
	spill_mode = NO_SPILL;

	unsigned int start = ccount();
	int expect = spill_fnp(0, 3, 2, 1);
	unsigned int spill_time = ccount() - start;

	/* Ten thousand iterations is still pretty quick */
	for (int i = 0; i < 10000; i++) {
		/* Alternate the nest value between 0 and 1 */
		int nest = i & 1;

		excint_cpu.nest = nest;
		timer2_fired = 0;

		/* Vaguely random delay between 2-8 iterations of
		 * spill_fn().  Maybe improve with a real PRNG.
		 *
		 * The multiplication is done on unsigned values: the
		 * product routinely exceeds the range of int, and
		 * signed overflow is undefined, whereas unsigned
		 * arithmetic wraps.
		 */
		const int max_reps = 8;
		unsigned int wh = (unsigned int)white[i % ARRAY_SIZE(white)];
		int delay = spill_time * 2U
			+ ((wh * (unsigned int)(i + 1))
			   % (spill_time * (max_reps - 2)));

		int alarm = ccount() + delay;

		__asm__ volatile("wsr.ccompare2 %0; rsync" : : "r"(alarm));

		enable_timer();

#if 0
		/* This is what I want to test: run the spill_fn test
		 * repeatedly in the main thread so that it can be
		 * interrupted and restored, and validate that it
		 * returns the same result every time.  But this can't
		 * work, even in principle: the timer interrupt we are
		 * using is "high priority", which means that it can
		 * interrupt the window exceptions being thrown in the
		 * main thread.  And by design, Xtensa window
		 * exceptions CANNOT be made reentrant (they don't
		 * save the interrupted state, so can be interrupted
		 * again before they can mask off exceptions, which
		 * will then lose/clobber the OWB field in PS when the
		 * interrupt handler throws another window exception).
		 * So this doesn't work, in fact it fails every 2-10
		 * iterations as spill_fn spends a lot of its time
		 * spill/filling stack frames (by design, of course).
		 *
		 * This could be made to work if we could repurpose
		 * the existing medium priority timer interrupt (which
		 * is hard in a unit test: that's an important
		 * interrupt!) or use the low priority timer which
		 * delivers to the global exception handler (basically
		 * impossible in a unit test).  Frustrating.
		 */
		int reps = 0;

		while (!timer2_fired && reps < (max_reps+2)) {
			int result = spill_fnp(0, 3, 2, 1);

			reps++;
			if (result != expect) {
				ok = 0;
			}
		}
		if (reps >= max_reps+2) {
			printk("Interrupt didn't arrive\n");
			ok = 0;
		}
		if (int5_result != expect) {
			printk("Unexpected int spill_fn() result\n");
			ok = 0;
		}
		printk("INT test delay %d nest %d reps %d\n",
		       delay, nest, reps);
#else
		/* So this is what we do instead: just spin in the
		 * main thread calling functions that don't involve
		 * exceptions.  By experiment, calling spill_fn with a
		 * first (depth) argument of 6 or 7 results in a
		 * shallow call tree that won't throw exceptions.  At
		 * least we're executing real code which depends on
		 * its register state and validating that interrupts
		 * don't hurt.
		 */
		volatile int dummy = 1;

		while (!timer2_fired) {
			dummy = spill_fnp(6, dummy, 2, 3);
		}
		if (int5_result != expect) {
			printk("Unexpected int spill_fn() result\n");
			ok = 0;
		}
#endif
	}
	return ok;
}

/* Test entry point: runs every test in sequence, stopping at the
 * first failure, and prints "OK" or "Failed".
 */
void main(void)
{
	/* Turn off interrupts and leave disabled, otherwise the
	 * "userspace" context switching tests might not be reliable.
	 * Stack pointers can exist in indeterminate states here.
	 * (Note: the interrupt test below is using a high priority
	 * interrupt which is not masked by irq_lock(), so it doesn't
	 * care).
	 */
	int key = irq_lock();

	/* Strictly not a "test", we just want to know that the jump
	 * worked.  If the rest of the code runs, this must have
	 * "passed".
	 */
	rfi_jump();

	/* Short-circuit: tests after the first failing one do not run */
	int ok = test_reg_spill()
		&& test_highreg_save()
		&& test_switch()
		&& test_xstack()
		&& interrupt_test();

	irq_unlock(key);

	const char *verdict = ok ? "OK" : "Failed";

	printk("%s\n", verdict);
}
