diff --git a/src/arch/x86-64.zig b/src/arch/x86-64.zig
index 821ba30..8c25a22 100644
--- a/src/arch/x86-64.zig
+++ b/src/arch/x86-64.zig
@@ -1,126 +1 @@
-// todo: Use r12, r13, ... instead? They're preserved in Sys V abi which might make it more confortable,
-// but they might increase binary size, gotta test.
-
-// todo: Try using something else that has lesser opcode size.
-// Execution thread convention:
-// r12 <- binary thread pointer
-// r13 <- return stack pointer
-
-// Resources used:
-// https://mort.coffee/home/fast-interpreters/
-// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
-// https://en.wikibooks.org/wiki/X86_Assembly/GNU_assembly_syntax
-// https://www.cs.princeton.edu/courses/archive/spr18/cos217/lectures/15_AssemblyFunctions.pdf
-// https://ziglang.org/documentation/master/#toc-Assembly
-// https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm
-
-pub const Word = u64;
-pub const RecursionLimit = 1024;
-
-threadlocal var return_stack: [RecursionLimit + 1]Word = undefined;
-
-// todo: Variant that pushes array of words.
-/// (iw | -- iw)
-pub fn opPushWord() callconv(.Naked) noreturn {
-    asm volatile (
-        \\ add $0x10, %%r12
-        \\ pushq -8(%%r12)
-        \\ jmpq *(%%r12)
-    );
-}
-
-// todo: Variant that discards array of words.
-/// (w --)
-pub fn opSinkWord() callconv(.Naked) noreturn {
-    asm volatile (
-        \\ add $0x08, %%r12
-        \\ addq $0x08, %%rsp
-        \\ jmpq *(%%r12)
-    );
-}
-
-/// (iw | -- (iw'nth word from stack) )
-// fn opTakeWord(binary: [*]const Word, cond: bool) noreturn {
-//     @setRuntimeSafety(false);
-//     takeWord(binary[1].word);
-//     @call(.always_tail, binary[2].function, .{ &binary[2], cond });
-// }
-
-/// (iw | w)
-// fn opSetWord(binary: [*]const Word, cond: bool) noreturn {
-//     @setRuntimeSafety(false);
-//     setWord(binary[1].word, popWord());
-//     @call(.always_tail, binary[2].function, .{ &binary[2], cond });
-// }
-
-// todo: Generate operation permutations procedurally.
-// todo: Jump on overflow instead of cond setting?
-/// (w1 w2 -- sum overflow)
-pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn {
-    // https://www.felixcloutier.com/x86/adc
-    // https://www.felixcloutier.com/x86/setcc
-    // idea: Could https://www.felixcloutier.com/x86/cmovcc be better for overflow push?
-    asm volatile (
-        \\ addq $0x08, %%r12
-        \\ movq (%%rsp), %%rax
-        \\ adcq 8(%%rsp), %%rax
-        \\ movq %%rax, 8(%%rsp)
-        \\ setc %%al
-        \\ movb %%al, (%%rsp)
-        \\ jmpq *(%%r12)
-    );
-}
-
-// todo: Generate operation permutations procedurally.
-// todo: We might not need cond register if conditions and jumps are combined?
-/// (w1 w2)
-// fn opRelativeJumpIfGreaterThan(binary: [*]const Word, cond: bool) noreturn {
-//     @setRuntimeSafety(false);
-//     const offset = if (popWord() > popWord()) binary[1].word else 2;
-//     @call(.always_tail, binary[offset].function, .{ &binary[offset], cond });
-// }
-
-// todo: Complex call op that would receive immediate mask that would tell
-// which positions of stack to duplicate, as well as mixing of plain immediate operands.
-// Or we could decouple it from call, it might be useful at other places.
-/// (iw |)
-pub fn opCall() callconv(.Naked) noreturn {
-    asm volatile (
-        \\ leaq 16(%%r12), %%rax
-        \\ subq $0x8, %%r13
-        \\ movq %%rax, (%%r13)
-        \\ movq 8(%%r12), %%r12
-        \\ jmpq *(%%r12)
-    );
-}
-
-/// (addr)
-pub fn opReturn() callconv(.Naked) noreturn {
-    asm volatile (
-        \\ movq (%%r13), %%r12
-        \\ addq $0x08, %%r13
-        \\ jmpq *(%%r12)
-    );
-}
-
-// todo: Make sure it's non reentry in one given thread.
-pub fn execute(binary: []const Word, entry_addr: usize) void {
-    @setCold(true);
-    // todo: Ensure correctness.
-    // https://wiki.osdev.org/System_V_ABI
-
-    // todo: Use remaining stack as return.
-
-    // Such device is used so that opReturn could be used for return.
-    asm volatile (
-        \\ movq $0f, 8(%%r13)
-        \\ leaq 8(%%r13), %%rax
-        \\ movq %%rax, (%%r13)
-        \\ jmpq *(%%r12)
-        \\ 0:
-        :
-        : [thread] "r" (&binary[entry_addr]),
-          [retstk] "r" (&return_stack[return_stack.len - 2]),
-        : "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory"
-    );
-}
+pub usingnamespace @import("x86-64/jedino-jedro.zig");
diff --git a/src/arch/x86-64/jedino-jedro.zig b/src/arch/x86-64/jedino-jedro.zig
new file mode 100644
index 0000000..9b5b15d
--- /dev/null
+++ b/src/arch/x86-64/jedino-jedro.zig
@@ -0,0 +1,142 @@
+//! jedino jedro (.jj:x86-64)
+//!
+//! Desired properties:
+//! - OS agnosticism, meaning it tries to respect conventions posed by target OSes.
+//!   For extensions based upon it there should be an enum value indicating host,
+//!   for example, when dealing with extern C functions of shared objects.
+//!   Stack/thread pointers are chosen so that SysV and MS abis callee side preserve those,
+//!   so that we don't need to constantly push and restore on procedure call.
+
+// todo: Use r12, r13, ... instead? They're preserved in Sys V abi which might make it more comfortable,
+// but they might increase binary size, gotta test.
+
+// todo: Try using something else that has lesser opcode size.
+// Execution thread convention:
+// r12 <- binary thread pointer
+// r13 <- return stack pointer
+
+// Resources used:
+// https://mort.coffee/home/fast-interpreters/
+// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
+// https://en.wikibooks.org/wiki/X86_Assembly/GNU_assembly_syntax
+// https://www.cs.princeton.edu/courses/archive/spr18/cos217/lectures/15_AssemblyFunctions.pdf
+// https://ziglang.org/documentation/master/#toc-Assembly
+// https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm
+
+// Neat things:
+// https://joryanick.com/retro-fast-x86-memcpy.php
+// https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE
+
+const int = @import("../../interpreter.zig");
+const Word = int.Word;
+pub const RecursionLimit = int.RecursionLimit;
+
+threadlocal var return_stack: [RecursionLimit + 1]Word = undefined;
+
+// todo: Variant that pushes array of words.
+/// (iw | -- iw) Pushes the immediate word stored right after the op cell onto the data stack (rsp), then dispatches the next op.
+pub fn opPushWord() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ add $0x10, %%r12
+        \\ pushq -8(%%r12)
+        \\ jmpq *(%%r12)
+    );
+}
+
+// todo: Variant that discards array of words.
+/// (w) Drops the top word of the data stack (rsp), then dispatches the next op.
+pub fn opSinkWord() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ add $0x08, %%r12
+        \\ addq $0x08, %%rsp
+        \\ jmpq *(%%r12)
+    );
+}
+
+// (iw | -- (iw'nth word from stack) ) -- plain comment on purpose: the op below is disabled, and a doc comment here would attach to the next live declaration.
+// fn opTakeWord(binary: [*]const Word, cond: bool) noreturn {
+//     @setRuntimeSafety(false);
+//     takeWord(binary[1].word);
+//     @call(.always_tail, binary[2].function, .{ &binary[2], cond });
+// }
+
+// (iw | w) -- plain comment for the same reason: this op is disabled too.
+// fn opSetWord(binary: [*]const Word, cond: bool) noreturn {
+//     @setRuntimeSafety(false);
+//     setWord(binary[1].word, popWord());
+//     @call(.always_tail, binary[2].function, .{ &binary[2], cond });
+// }
+
+// todo: Generate operation permutations procedurally.
+// todo: Jump on overflow instead of cond setting?
+/// (w1 w2 -- sum overflow) Replaces the two top data-stack words with their sum; the carry-out is left on top as a full 0/1 word.
+pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn {
+    // Plain add (https://www.felixcloutier.com/x86/add) instead of adc: the carry-in must not depend on the r12 bump above it.
+    // https://www.felixcloutier.com/x86/setcc -- the flag byte is zero-extended so the whole overflow word is written, not just its low byte.
+    // idea: Could https://www.felixcloutier.com/x86/cmovcc be better for overflow push?
+    asm volatile (
+        \\ addq $0x08, %%r12
+        \\ movq (%%rsp), %%rax
+        \\ addq %%rax, 8(%%rsp)
+        \\ setc %%al
+        \\ movzbl %%al, %%eax
+        \\ movq %%rax, (%%rsp)
+        \\ jmpq *(%%r12)
+    );
+}
+
+// todo: Generate operation permutations procedurally.
+// todo: We might not need cond register if conditions and jumps are combined?
+// (w1 w2) -- plain comment on purpose: the op below is disabled, and a doc comment here would attach to opCall.
+// fn opRelativeJumpIfGreaterThan(binary: [*]const Word, cond: bool) noreturn {
+//     @setRuntimeSafety(false);
+//     const offset = if (popWord() > popWord()) binary[1].word else 2;
+//     @call(.always_tail, binary[offset].function, .{ &binary[offset], cond });
+// }
+
+// todo: Complex call op that would receive immediate mask that would tell
+// which positions of stack to duplicate, as well as mixing of plain immediate operands.
+// Or we could decouple it from call, it might be useful at other places.
+/// (iw |) Pushes the address of the cell after the immediate onto the return stack (r13) and continues at the thread the immediate points to.
+pub fn opCall() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ leaq 16(%%r12), %%rax
+        \\ subq $0x8, %%r13
+        \\ movq %%rax, (%%r13)
+        \\ movq 8(%%r12), %%r12
+        \\ jmpq *(%%r12)
+    );
+}
+
+/// (addr) Pops a thread address off the return stack (r13) into r12 and dispatches the op stored there.
+pub fn opReturn() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ movq (%%r13), %%r12
+        \\ addq $0x08, %%r13
+        \\ jmpq *(%%r12)
+    );
+}
+
+// todo: Make sure it's non reentry in one given thread.
+// todo: Allow passing initial stack via array of words.
+// todo: Ensure correctness.
+// todo: Use remaining stack as return.
+// todo: Make it .C callconv and extern.
+// todo: Permute by calling conventions.
+pub fn execute(binary: []const Word, entry_addr: usize) void {
+    // https://wiki.osdev.org/System_V_ABI
+    @setCold(true);
+    // Starts the thread at binary[entry_addr]. The template names r12/r13 literally, so both operands are pinned to exactly those registers.
+    // Such device is used so that opReturn could be used for return: the slot at retstk points at the next slot, which holds the address of label 0.
+    asm volatile (
+        \\ movq $0f, 8(%%r13)
+        \\ leaq 8(%%r13), %%rax
+        \\ movq %%rax, (%%r13)
+        \\ jmpq *(%%r12)
+        \\ 0:
+        :
+        : [thread] "{r12}" (&binary[entry_addr]),
+          [retstk] "{r13}" (&return_stack[return_stack.len - 2]),
+        : "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory"
+    );
+}
diff --git a/src/interpreter.zig b/src/interpreter.zig
index 20a0b5f..b87e83f 100644
--- a/src/interpreter.zig
+++ b/src/interpreter.zig
@@ -14,4 +14,7 @@
 // idea: 'JIT' could be done by simple op* compiled binary copying up until `jmpq *(%%rdi)`,
 // with immediate operand prelude modified, which could be done procedurally.
 
+pub const Word = u64;
+pub const RecursionLimit = 1024;
+
 pub usingnamespace @import("arch/x86-64.zig");