From 958a89272dfb00e94dcd487c2d3876ff09a917e4 Mon Sep 17 00:00:00 2001 From: veclav talica Date: Tue, 19 Sep 2023 09:13:46 +0500 Subject: [PATCH] opCall, return stack, r12 for thread pointer --- idea.md | 6 ++++ src/arch/x86-64.zig | 81 ++++++++++++++++++++++++++++++++------------- src/interpreter.zig | 2 +- src/main.zig | 19 +++++++---- 4 files changed, 78 insertions(+), 30 deletions(-) diff --git a/idea.md b/idea.md index 2d7192f..d652a8f 100644 --- a/idea.md +++ b/idea.md @@ -1,2 +1,8 @@ # .nmvm Near Metal Virtual Machine Exercise in building low overhead VM via architecture specific means. + +## Cases +- Native stack usage for virtual machine. +- Specific permutations of instructions for case optimizations. +- Array processing instructions. +- Absolute addressing for interpreter state, in TLS. diff --git a/src/arch/x86-64.zig b/src/arch/x86-64.zig index 62b821a..821ba30 100644 --- a/src/arch/x86-64.zig +++ b/src/arch/x86-64.zig @@ -1,5 +1,10 @@ +// todo: Use r12, r13, ... instead? They're preserved in Sys V abi which might make it more comfortable, +// but they might increase binary size, gotta test. + +// todo: Try using something else that has lesser opcode size. // Execution thread convention: -// rdi <- binary thread +// r12 <- binary thread pointer +// r13 <- return stack pointer // Resources used: // https://mort.coffee/home/fast-interpreters/ @@ -10,14 +15,17 @@ // https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm pub const Word = u64; +pub const RecursionLimit = 1024; + +threadlocal var return_stack: [RecursionLimit + 1]Word = undefined; // todo: Variant that pushes array of words. 
/// (iw | -- iw) pub fn opPushWord() callconv(.Naked) noreturn { asm volatile ( - \\ add $0x10, %%rdi - \\ pushq -8(%%rdi) - \\ jmpq *(%%rdi) + \\ add $0x10, %%r12 + \\ pushq -8(%%r12) + \\ jmpq *(%%r12) ); } @@ -25,9 +33,9 @@ pub fn opPushWord() callconv(.Naked) noreturn { /// (w --) pub fn opSinkWord() callconv(.Naked) noreturn { asm volatile ( - \\ add $0x08, %%rdi + \\ add $0x08, %%r12 \\ addq $0x08, %%rsp - \\ jmpq *(%%rdi) + \\ jmpq *(%%r12) ); } @@ -53,13 +61,13 @@ pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn { // https://www.felixcloutier.com/x86/setcc // idea: Could https://www.felixcloutier.com/x86/cmovcc be better for overflow push? asm volatile ( + \\ addq $0x08, %%r12 \\ movq (%%rsp), %%rax \\ adcq 8(%%rsp), %%rax \\ movq %%rax, 8(%%rsp) \\ setc %%al - \\ movb %%al, 7(%%rsp) - \\ addq $0x08, %%rdi - \\ jmpq *(%%rdi) + \\ movb %%al, (%%rsp) + \\ jmpq *(%%r12) ); } @@ -72,20 +80,47 @@ pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn { // @call(.always_tail, binary[offset].function, .{ &binary[offset], cond }); // } -/// (addr) -pub fn opReturn() callconv(.Naked) noreturn { - // https://www.felixcloutier.com/x86/ret - asm volatile ("ret"); -} - -pub fn execute(binary: []const Word, entry_addr: usize) void { - // todo: Ensure correctness. - // https://wiki.osdev.org/System_V_ABI - // https://www.felixcloutier.com/x86/call +// todo: Complex call op that would receive immediate mask that would tell +// which positions of stack to duplicate, as well as mixing of plain immediate operands. +// Or we could decouple it from call, it might be useful at other places. 
+/// (iw |) +pub fn opCall() callconv(.Naked) noreturn { asm volatile ( - \\ call *(%%rdi) - : - : [thread] "rdi" (&binary[entry_addr]), - : "rflags", "rax", "rbx", "rsp", "rbp", "r12", "r13", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory" + \\ leaq 16(%%r12), %%rax + \\ subq $0x8, %%r13 + \\ movq %%rax, (%%r13) + \\ movq 8(%%r12), %%r12 + \\ jmpq *(%%r12) + ); +} + +/// (addr) +pub fn opReturn() callconv(.Naked) noreturn { + asm volatile ( + \\ movq (%%r13), %%r12 + \\ addq $0x08, %%r13 + \\ jmpq *(%%r12) + ); +} + +// todo: Make sure it's non reentry in one given thread. +pub fn execute(binary: []const Word, entry_addr: usize) void { + @setCold(true); + // todo: Ensure correctness. + // https://wiki.osdev.org/System_V_ABI + + // todo: Use remaining stack as return. + + // Such device is used so that opReturn could be used for return. + asm volatile ( + \\ movq $0f, 8(%%r13) + \\ leaq 8(%%r13), %%rax + \\ movq %%rax, (%%r13) + \\ jmpq *(%%r12) + \\ 0: + : + : [thread] "r" (&binary[entry_addr]), + [retstk] "r" (&return_stack[return_stack.len - 2]), + : "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory" ); } diff --git a/src/interpreter.zig b/src/interpreter.zig index e303f30..20a0b5f 100644 --- a/src/interpreter.zig +++ b/src/interpreter.zig @@ -14,4 +14,4 @@ // idea: 'JIT' could be done by simple op* compiled binary copying up until `jmpq *(%%rdi)`, // with immediate operand prelude modified, which could be done procedurally. 
-usingnamespace @import("arch/x86-64.zig"); +pub usingnamespace @import("arch/x86-64.zig"); diff --git a/src/main.zig b/src/main.zig index 873ffa9..7a67c96 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,16 +1,23 @@ const int = @import("interpreter.zig"); pub fn main() !void { - const binary = [_]int.Word{ - @as(int.Word, @intFromPtr(&int.opPushWord)), - ~@as(int.Word, 1), - @as(int.Word, @intFromPtr(&int.opPushWord)), - ~@as(int.Word, 1), + // todo: Mixing return addresses in stack poses a challenge, hm. + const add = [_]int.Word{ @as(int.Word, @intFromPtr(&int.opSumWordsWithOverflow)), + @as(int.Word, @intFromPtr(&int.opReturn)), + }; + + const entry = [_]int.Word{ + @as(int.Word, @intFromPtr(&int.opPushWord)), + 1, + @as(int.Word, @intFromPtr(&int.opPushWord)), + 2, + @as(int.Word, @intFromPtr(&int.opCall)), + @as(int.Word, @intFromPtr(&add)), @as(int.Word, @intFromPtr(&int.opSinkWord)), @as(int.Word, @intFromPtr(&int.opSinkWord)), @as(int.Word, @intFromPtr(&int.opReturn)), }; - int.execute(&binary, 0); + int.execute(&entry, 0); }