diff --git a/src/arch/x86-64/jedino-jedro.zig b/src/arch/x86-64/jedino-jedro.zig index 43a3a5b..bc0f906 100644 --- a/src/arch/x86-64/jedino-jedro.zig +++ b/src/arch/x86-64/jedino-jedro.zig @@ -14,6 +14,9 @@ // todo: Use ZF flag as conditional register so to not involve stack? // Alternatively we could keep boolean word, but implement it in vector semantics. +// +// ZF flag route can be achieved with using LEA instead of ADD as it doesn't touch flags, +// but we would need to store the flag when doing other calling convention calls. // Resources: // https://mort.coffee/home/fast-interpreters/ @@ -23,10 +26,25 @@ // https://ziglang.org/documentation/master/#toc-Assembly // https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm // https://stackoverflow.com/questions/37639993/is-this-assembly-function-call-safe-complete +// https://groups.csail.mit.edu/pag/OLD/parg/piumarta98optimizing.pdf +// https://dl.acm.org/doi/pdf/10.1145/1328195.1328197 +// https://www.agner.org/optimize/instruction_tables.pdf +// https://stackoverflow.com/questions/6323027/lea-or-add-instruction // Neat things: // https://joryanick.com/retro-fast-x86-memcpy.php // https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE +// https://www.usenix.org/legacy/publications/library/proceedings/jvm01/gagnon/gagnon_html/node19.html#piumarta +// https://www.agner.org/optimize/optimizing_assembly.pdf +// http://sebastianmihai.com/x86-assembly-optimization.html + +// todo: Opcode for fast integer divisions with statically known divider. +// todo: Opcodes taking operands based on offset from top of stack, without consuming. 
+// todo: Certain constant multiplication optimization by LEA: +// LEA EAX, [EAX * 2 + EAX] ;EAX = EAX * 3 +// LEA EAX, [EAX * 4 + EAX] ;EAX = EAX * 5 +// LEA EAX, [EAX * 8 + EAX] ;EAX = EAX * 9 +// const tolmac = @import("../../tolmac.zig"); const Word = tolmac.Word; @@ -35,7 +53,7 @@ const Word = tolmac.Word; /// (iw | -- iw) pub fn opPushWord() callconv(.Naked) noreturn { asm volatile ( - \\ add $0x10, %%r12 + \\ addq $0x10, %%r12 \\ pushq -8(%%r12) \\ jmpq *(%%r12) ); @@ -45,7 +63,7 @@ pub fn opPushWord() callconv(.Naked) noreturn { /// (w) pub fn opSinkWord() callconv(.Naked) noreturn { asm volatile ( - \\ add $0x08, %%r12 + \\ addq $0x08, %%r12 \\ addq $0x08, %%rsp \\ jmpq *(%%r12) ); @@ -65,20 +83,29 @@ pub fn opSinkWord() callconv(.Naked) noreturn { // @call(.always_tail, binary[2].function, .{ &binary[2], cond }); // } -// todo: Generate operation permutations procedurally. -// todo: Jump on overflow instead of cond setting? /// (w1 w2 -- sum overflow) pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn { - // https://www.felixcloutier.com/x86/adc + // https://www.felixcloutier.com/x86/add // https://www.felixcloutier.com/x86/setcc - // idea: Could https://www.felixcloutier.com/x86/cmovcc be better for overflow push? 
asm volatile ( \\ addq $0x08, %%r12 - \\ movq (%%rsp), %%rax - \\ adcq 8(%%rsp), %%rax + \\ movq (%%rsp), %%rbx + \\ movq %%rbx, %%rax + \\ addq 8(%%rsp), %%rax \\ movq %%rax, 8(%%rsp) - \\ setc %%al - \\ movb %%al, (%%rsp) + \\ movq $0, (%%rsp) # MOV keeps CF intact; XOR would clear it before setc + \\ setc (%%rsp) + \\ jmpq *(%%r12) + ); +} + +/// (w1 w2 -- sum) +pub fn opSumWords() callconv(.Naked) noreturn { + asm volatile ( + \\ addq $0x08, %%r12 + \\ popq %%rax + \\ addq (%%rsp), %%rax + \\ movq %%rax, (%%rsp) \\ jmpq *(%%r12) ); } @@ -92,9 +119,6 @@ pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn { // @call(.always_tail, binary[offset].function, .{ &binary[offset], cond }); // } -// todo: Complex call op that would receive immediate mask that would tell -// which positions of stack to duplicate, as well as mixing of plain immediate operands. -// Or we could decouple it from call, it might be useful at other places. /// (iw |) pub fn opCall() callconv(.Naked) noreturn { asm volatile (