fixes and optimizations in x86-64 opcodes

2023-09-22 21:52:57 +05:00
parent 511936220e
commit 95e4d54042
1 changed files with 37 additions and 13 deletions
@@ -14,6 +14,9 @@
 // todo: Use the ZF flag as the condition register so as not to involve the stack?
 //       Alternatively, we could keep the boolean word but implement it with vector semantics.
 //
 //       The ZF flag route can be achieved by using LEA instead of ADD, as LEA doesn't touch flags,
 //       but we would need to save the flag when making calls that use another calling convention.
 // Resources:
 // https://mort.coffee/home/fast-interpreters/
@@ -23,10 +26,25 @@
 // https://ziglang.org/documentation/master/#toc-Assembly
 // https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm
 // https://stackoverflow.com/questions/37639993/is-this-assembly-function-call-safe-complete
 // https://groups.csail.mit.edu/pag/OLD/parg/piumarta98optimizing.pdf
 // https://dl.acm.org/doi/pdf/10.1145/1328195.1328197
 // https://www.agner.org/optimize/instruction_tables.pdf
 // https://stackoverflow.com/questions/6323027/lea-or-add-instruction
 // Neat things:
 // https://joryanick.com/retro-fast-x86-memcpy.php
 // https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE
 // https://www.usenix.org/legacy/publications/library/proceedings/jvm01/gagnon/gagnon_html/node19.html#piumarta
 // https://www.agner.org/optimize/optimizing_assembly.pdf
 // http://sebastianmihai.com/x86-assembly-optimization.html
 // todo: Opcode for fast integer division by a statically known divisor.
 // todo: Opcodes that take operands at an offset from the top of the stack, without consuming them.
 // todo: Optimize multiplication by certain constants with LEA:
 //   LEA EAX, [EAX * 2 + EAX]   ;EAX = EAX * 3
 //   LEA EAX, [EAX * 4 + EAX]   ;EAX = EAX * 5
 //   LEA EAX, [EAX * 8 + EAX]   ;EAX = EAX * 9
 //
 const tolmac = @import("../../tolmac.zig");
 const Word = tolmac.Word;
@@ -35,7 +53,7 @@ const Word = tolmac.Word;
 /// Pushes the 8-byte word found at (old) `r12 + 8` onto the machine stack and
 /// then jumps through the pointer stored at the advanced `r12`.
 /// NOTE(review): r12 appears to be the threaded-code instruction pointer, with
 /// the immediate word stored right after the handler address — confirm.
 /// (iw | -- iw)
 pub fn opPushWord() callconv(.Naked) noreturn {
    // Advance r12 by 0x10, push the qword at -8(%r12) (i.e. old r12 + 8),
    // then jump indirectly through the address now stored at (%r12).
    asm volatile (
-        \\ add $0x10, %%r12
+        \\ addq $0x10, %%r12
        \\ pushq -8(%%r12)
        \\ jmpq *(%%r12)
    );
@@ -45,7 +63,7 @@ pub fn opPushWord() callconv(.Naked) noreturn {
 /// Discards the top 8-byte word of the machine stack and then jumps through
 /// the pointer stored at the advanced `r12`.
 /// NOTE(review): r12 appears to be the threaded-code instruction pointer — confirm.
 /// (w)
 pub fn opSinkWord() callconv(.Naked) noreturn {
    // Advance r12 by 8, drop one word by adding 8 to rsp (the value is not read),
    // then jump indirectly through the address stored at (%r12).
    asm volatile (
-        \\ add $0x08, %%r12
+        \\ addq $0x08, %%r12
        \\ addq $0x08, %%rsp
        \\ jmpq *(%%r12)
    );
@@ -65,20 +83,29 @@ pub fn opSinkWord() callconv(.Naked) noreturn {
 //     @call(.always_tail, binary[2].function, .{ &binary[2], cond });
 // }
 // todo: Generate operation permutations procedurally.
 // todo: Jump on overflow instead of cond setting?
 /// Adds the two topmost stack words and reports whether the addition carried.
 ///
 /// After the op, `8(%rsp)` holds the wrapped 64-bit sum and `(%rsp)` (the top
 /// of the stack) holds a full word that is 1 if the unsigned addition carried
 /// out of bit 63 and 0 otherwise. The stack depth is unchanged.
 /// (w1 w2 -- sum overflow)
 pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn {
-    // https://www.felixcloutier.com/x86/adc
+    // https://www.felixcloutier.com/x86/add
    // https://www.felixcloutier.com/x86/setcc
    // idea: Could https://www.felixcloutier.com/x86/cmovcc be better for overflow push?
    //
    // The flag word is zeroed with MOV rather than XOR: XOR always clears CF, so
    // a SETC placed after it would store 0 regardless of the ADDQ result. MOV
    // does not modify flags, so the carry produced by ADDQ survives until SETC,
    // which then writes only the low byte of the already-zeroed word.
    asm volatile (
        \\ addq $0x08, %%r12
-        \\ movq (%%rsp), %%rax
+        \\ movq (%%rsp), %%rbx
-        \\ adcq 8(%%rsp), %%rax
+        \\ movq %%rbx, %%rax
        \\ addq 8(%%rsp), %%rax
        \\ movq %%rax, 8(%%rsp)
-        \\ setc %%al
+        \\ movq $0x00, (%%rsp)
-        \\ movb %%al, (%%rsp)
+        \\ setc (%%rsp)
        \\ jmpq *(%%r12)
    );
 }
 /// Replaces the two topmost stack words with their wrapped 64-bit sum and then
 /// jumps through the pointer stored at the advanced `r12`. No carry/overflow
 /// indication is left on the stack.
 /// NOTE(review): r12 appears to be the threaded-code instruction pointer — confirm.
 /// (w1 w2 -- sum)
 pub fn opSumWords() callconv(.Naked) noreturn {
    // Advance r12 by 8, pop the top word into rax, add the new top word to it,
    // overwrite that word with the sum, then jump through the address at (%r12).
    asm volatile (
        \\ addq $0x08, %%r12
        \\ popq %%rax
        \\ addq (%%rsp), %%rax
        \\ movq %%rax, (%%rsp)
        \\ jmpq *(%%r12)
    );
 }
@@ -92,9 +119,6 @@ pub fn opSumWordsWithOverflow() callconv(.Naked) noreturn {
 //     @call(.always_tail, binary[offset].function, .{ &binary[offset], cond });
 // }
 // todo: Complex call op that would receive an immediate mask telling
 //       which stack positions to duplicate, and that could mix in plain immediate operands.
 //       Or we could decouple it from call, as it might be useful in other places.
 /// (iw |)
 pub fn opCall() callconv(.Naked) noreturn {
    asm volatile (