From b66c3dca14e3bf84c366034d707c2cee28b38e25 Mon Sep 17 00:00:00 2001 From: veclav talica Date: Tue, 19 Sep 2023 22:54:27 +0500 Subject: [PATCH] sysv zov generation for low overhead ffi --- gdb.sh | 2 +- src/arch/x86-64.zig | 1 + src/arch/x86-64/jedino-jedro.zig | 39 +++++--- src/arch/x86-64/ve-sistema.zig | 162 +++++++++++++++++++++++++++++++ src/interpreter.zig | 20 ---- src/main.zig | 42 +++++--- 6 files changed, 218 insertions(+), 48 deletions(-) create mode 100644 src/arch/x86-64/ve-sistema.zig delete mode 100644 src/interpreter.zig diff --git a/gdb.sh b/gdb.sh index d646690..fa2e07b 100755 --- a/gdb.sh +++ b/gdb.sh @@ -1,3 +1,3 @@ #!/bin/sh -gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.execute' -ex 'layout asm' -ex 'r' +gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.jedino-jedro.execute' -ex 'layout asm' -ex 'r' diff --git a/src/arch/x86-64.zig b/src/arch/x86-64.zig index 8c25a22..dbb8c71 100644 --- a/src/arch/x86-64.zig +++ b/src/arch/x86-64.zig @@ -1 +1,2 @@ pub usingnamespace @import("x86-64/jedino-jedro.zig"); +pub usingnamespace @import("x86-64/ve-sistema.zig"); diff --git a/src/arch/x86-64/jedino-jedro.zig b/src/arch/x86-64/jedino-jedro.zig index b9095a6..d23f7a6 100644 --- a/src/arch/x86-64/jedino-jedro.zig +++ b/src/arch/x86-64/jedino-jedro.zig @@ -7,29 +7,29 @@ //! Stack/thread pointers are chosen so that SysV and MS abis callee side preserve those, //! so that we don't need to constantly push and restore on procedure call. -// todo: Use r12, r13, ... instead? They're preserved in Sys V abi which might make it more confortable, -// but they might increase binary size, gotta test. - -// todo: Try using something else that has lesser opcode size. // Execution thread convention: // r12 <- binary thread pointer // r13 <- return stack pointer +// r14 <- extension context pointer -// Resources used: +// todo: Use ZF flag as conditional register so to not involve stack? +// Alternatively we could keep boolean word, but implement it in vector semantics. + +// Resources: // https://mort.coffee/home/fast-interpreters/ // https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html // https://en.wikibooks.org/wiki/X86_Assembly/GNU_assembly_syntax // https://www.cs.princeton.edu/courses/archive/spr18/cos217/lectures/15_AssemblyFunctions.pdf // https://ziglang.org/documentation/master/#toc-Assembly // https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm +// https://stackoverflow.com/questions/37639993/is-this-assembly-function-call-safe-complete // Neat things: // https://joryanick.com/retro-fast-x86-memcpy.php // https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE -const int = @import("../../interpreter.zig"); -const Word = int.Word; -pub const RecursionLimit = int.RecursionLimit; +const tolmac = @import("../../tolmac.zig"); +const Word = tolmac.Word; // todo: Variant that pushes array of words. /// (iw | -- iw) @@ -124,18 +124,29 @@ pub fn execute(binary: []const Word, entry_addr: usize) void { // https://wiki.osdev.org/System_V_ABI @setCold(true); - var return_stack: [RecursionLimit + 1]Word = undefined; + var return_stack: [tolmac.RecursionLimit + 1]Word = undefined; - // Such device is used so that opReturn could be used for return. + jumpstartSysV(&binary[entry_addr], &return_stack[return_stack.len - 2]); +} + +const jumpstartSysV = @as(*const fn (thread: *const Word, return_stack: *Word) callconv(.SysV) void, @ptrCast(&jumpstartNakedSysV)); + +fn jumpstartNakedSysV() callconv(.Naked) void { asm volatile ( + \\ pushq %%rbp + \\ movq %%rsp, %%rbp + \\ + \\ movq %%rdi, %%r12 + \\ movq %%rsi, %%r13 + \\ + \\ # Such device is used so that opReturn could be used for return. \\ movq $0f, 8(%%r13) \\ leaq 8(%%r13), %%rax \\ movq %%rax, (%%r13) \\ jmpq *(%%r12) \\ 0: - : - : [thread] "r" (&binary[entry_addr]), - [retstk] "r" (&return_stack[return_stack.len - 2]), - : "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory" + \\ + \\ popq %%rbp + \\ ret ); } diff --git a/src/arch/x86-64/ve-sistema.zig b/src/arch/x86-64/ve-sistema.zig new file mode 100644 index 0000000..5f8b7d9 --- /dev/null +++ b/src/arch/x86-64/ve-sistema.zig @@ -0,0 +1,162 @@ +//! ve sistema (.ve-sistema:x86-64) +//! +//! Provides entry opcodes for System V calling convention, optimized for specific prototypes. +//! + +// https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf + +const std = @import("std"); + +/// Used for stack parameter passing. +pub const WordLimit = 128; +const AsmBufferLimit = 4096; +const ClassBufferLimit = 256; + +const Class = enum { + void, // Denotes empty types. + integer, + sse, + sseup, + x87, + x87up, + no_class, + memory, +}; + +fn determiteClass(comptime T: type, buffer: []Class) []Class { + switch (@typeInfo(T)) { + .Void => &[1]Class{.void}, + .Int => |int| { + switch (int.bits) { + 0 => buffer[0] = .void, + 1...64 => buffer[0] = .integer, + 65...128 => @compileError("unimplemented"), + else => @compileError("unimplemented"), + } + }, + .Float => |float| { + switch (float.bits) { + 0 => buffer[0] = .void, + 1...64 => buffer[0] = .sse, + 65...80 => @compileError("unimplemented"), + 81...128 => @compileError("unimplemented"), + else => @compileError("unimplemented"), + } + }, + .Bool => buffer[0] = .integer, + .Pointer => |ptr| { + switch (ptr.size) { + .Slice => { + buffer[0] = .integer; + buffer[1] = .integer; + }, + else => buffer[0] = .integer, + } + }, + .Fn => buffer[0] = .integer, + else => @compileError("unimplemented"), + } + + return buffer[0 .. (@sizeOf(T) - 1) / 8 + 1]; +} + +// todo: Make sure duplicates are not made. +// todo: Cache results for identical in effect devices. +// +/// (iw | -- (arbitrary amount of words)) +pub fn generateOpZovSysvFromPrototype(prototype: anytype) !*const fn () callconv(.Naked) noreturn { + // todo: Should we care about this? + // > The direction flag DF in the %rFLAGS register must be clear (set to “forward” + // > direction) on function entry and return. + + comptime { + const func = @typeInfo(@TypeOf(prototype)).Fn; + + var source_buffer = [_]u8{0} ** AsmBufferLimit; + var source_needle: usize = 0; + + // todo: Align callee frame to 16? + // > shrq $4, %%rsp + // > addq $1, %%rsp + // > shlq $4, %%rsp + + // idea: Try using REP for big consequent memory pushes. + + // todo: In-stack returns by pointing %rdi directly to final destination. + + const Prelude = + \\ movq %%rsp, %%rbp # Move stack pointer in non-volatile %rbp to restore later + \\ subq $0x8, %%rsp + \\ + ; + + const Call = + \\ call *8(%%r12) + \\ + ; + + const Epilogue = + \\ movq %%rbp, %%rsp # Restore stack pointer + \\ addq $0x10, %%r12 + \\ jmpq *(%%r12) + \\ + ; + + @memcpy(source_buffer[source_needle .. source_needle + Prelude.len], Prelude[0..]); + source_needle += Prelude.len; + + var integer_allocation: usize = 0; + const IntegerAllocations = [_][]const u8{ "rdi", "rsi", "rdx", "rcx", "r8", "r9", "stack" }; + // var sse_allocation: enum { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, stack } = .xmm0; + + var class_buffer = [_]Class{.void} ** ClassBufferLimit; + + // Calculate stack space used by parameters. + var parameter_stack_size: usize = 0; + for (func.params) |param| { + const classes = determiteClass(param.type.?, &class_buffer); + parameter_stack_size += 8 * classes.len; + } + + // Push parameters to appropriate registers and stack positions. + var stack_offset: usize = parameter_stack_size; + for (func.params) |param| { + const classes = determiteClass(param.type.?, &class_buffer); + for (classes) |class| { + stack_offset -= 8; + switch (class) { + .integer => { + if (integer_allocation < IntegerAllocations.len - 1) { + source_needle += (try std.fmt.bufPrint( + source_buffer[source_needle..], + "movq {}(%%rbp), %%{s}\n", + .{ stack_offset, IntegerAllocations[integer_allocation] }, + )).len; + integer_allocation += 1; + } else { + source_needle += (try std.fmt.bufPrint( + source_buffer[source_needle..], + "pushq {}(%%rbp)\n", + .{stack_offset}, + )).len; + } + }, + .void => {}, + else => @compileError("unimplemented"), + } + } + } + + @memcpy(source_buffer[source_needle .. source_needle + Call.len], Call[0..]); + source_needle += Call.len; + + @memcpy(source_buffer[source_needle .. source_needle + Epilogue.len], Epilogue[0..]); + source_needle += Epilogue.len; + + return &struct { + fn op() callconv(.Naked) noreturn { + asm volatile (source_buffer[0..source_needle]); + } + }.op; + } +} diff --git a/src/interpreter.zig b/src/interpreter.zig deleted file mode 100644 index b87e83f..0000000 --- a/src/interpreter.zig +++ /dev/null @@ -1,20 +0,0 @@ -// todo: Interpreter context as binary local variable. -// It would hold memory mappings, as well as error stack. -// todo: Define procedure call for user code. -// todo: Instruction set extensions, such as memory management schemes, non-exhaustive logging, -// exception mechanism, coroutines via yield/resume and etc. -// todo: Threading scheme. -// todo: Extension for native floating point stack ops. -// todo: Try using small code model with nopie/nopic binary. - -// idea: Specialized opcodes that have side effects on read and write, such as -// zero-check on push/pop, or jump if condition bit met. This would create a lot -// of permutations tho, we might try to discover which code devices are most used. - -// idea: 'JIT' could be done by simple op* compiled binary copying up until `jmpq *(%%rdi)`, -// with immediate operand prelude modified, which could be done procedurally. - -pub const Word = u64; -pub const RecursionLimit = 1024; - -pub usingnamespace @import("arch/x86-64.zig"); diff --git a/src/main.zig b/src/main.zig index 7a67c96..bfe27cf 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,23 +1,39 @@ -const int = @import("interpreter.zig"); +const std = @import("std"); +const tolmac = @import("tolmac.zig"); + +fn printInt(int: u64, other: u32, another: u16) callconv(.SysV) void { + @setAlignStack(16); + std.debug.print("test: {}, {}, {}\n", .{ int, other, another }); +} + +const opPrintIntZov = tolmac.generateOpZovSysvFromPrototype(printInt) catch unreachable; pub fn main() !void { // todo: Mixing return addresses in stack poses a challenge, hm. - const add = [_]int.Word{ - @as(int.Word, @intFromPtr(&int.opSumWordsWithOverflow)), - @as(int.Word, @intFromPtr(&int.opReturn)), + const add = [_]tolmac.Word{ + @as(tolmac.Word, @intFromPtr(&tolmac.opSumWordsWithOverflow)), + @as(tolmac.Word, @intFromPtr(&tolmac.opReturn)), }; - const entry = [_]int.Word{ - @as(int.Word, @intFromPtr(&int.opPushWord)), + const entry = [_]tolmac.Word{ + @as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)), 1, - @as(int.Word, @intFromPtr(&int.opPushWord)), + @as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)), 2, - @as(int.Word, @intFromPtr(&int.opCall)), - @as(int.Word, @intFromPtr(&add)), - @as(int.Word, @intFromPtr(&int.opSinkWord)), - @as(int.Word, @intFromPtr(&int.opSinkWord)), - @as(int.Word, @intFromPtr(&int.opReturn)), + @as(tolmac.Word, @intFromPtr(&tolmac.opCall)), + @as(tolmac.Word, @intFromPtr(&add)), + @as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)), + @as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)), + 10, + @as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)), + 20, + @as(tolmac.Word, @intFromPtr(opPrintIntZov)), + @as(tolmac.Word, @intFromPtr(&printInt)), + @as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)), + @as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)), + @as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)), + @as(tolmac.Word, @intFromPtr(&tolmac.opReturn)), }; - int.execute(&entry, 0); + tolmac.execute(&entry, 0); }