sysv zov generation for low overhead ffi

This commit is contained in:
veclav talica 2023-09-19 22:54:27 +05:00
parent 0d5bd46412
commit b66c3dca14
6 changed files with 218 additions and 48 deletions

2
gdb.sh
View File

@ -1,3 +1,3 @@
#!/bin/sh
gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.execute' -ex 'layout asm' -ex 'r'
gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.jedino-jedro.execute' -ex 'layout asm' -ex 'r'

View File

@ -1 +1,2 @@
pub usingnamespace @import("x86-64/jedino-jedro.zig");
pub usingnamespace @import("x86-64/ve-sistema.zig");

View File

@ -7,29 +7,29 @@
//! Stack/thread pointers are chosen so that SysV and MS abis callee side preserve those,
//! so that we don't need to constantly push and restore on procedure call.
// todo: Use r12, r13, ... instead? They're preserved in Sys V abi which might make it more comfortable,
// but they might increase binary size, gotta test.
// todo: Try using something else that has lesser opcode size.
// Execution thread convention:
// r12 <- binary thread pointer
// r13 <- return stack pointer
// r14 <- extension context pointer
// Resources used:
// todo: Use ZF flag as conditional register so as not to involve the stack?
// Alternatively we could keep boolean word, but implement it in vector semantics.
// Resources:
// https://mort.coffee/home/fast-interpreters/
// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
// https://en.wikibooks.org/wiki/X86_Assembly/GNU_assembly_syntax
// https://www.cs.princeton.edu/courses/archive/spr18/cos217/lectures/15_AssemblyFunctions.pdf
// https://ziglang.org/documentation/master/#toc-Assembly
// https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm
// https://stackoverflow.com/questions/37639993/is-this-assembly-function-call-safe-complete
// Neat things:
// https://joryanick.com/retro-fast-x86-memcpy.php
// https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE
const int = @import("../../interpreter.zig");
const Word = int.Word;
pub const RecursionLimit = int.RecursionLimit;
const tolmac = @import("../../tolmac.zig");
const Word = tolmac.Word;
// todo: Variant that pushes array of words.
/// (iw | -- iw)
@ -124,18 +124,29 @@ pub fn execute(binary: []const Word, entry_addr: usize) void {
// https://wiki.osdev.org/System_V_ABI
@setCold(true);
var return_stack: [RecursionLimit + 1]Word = undefined;
var return_stack: [tolmac.RecursionLimit + 1]Word = undefined;
// Such device is used so that opReturn could be used for return.
jumpstartSysV(&binary[entry_addr], &return_stack[return_stack.len - 2]);
}
/// Typed view of `jumpstartNakedSysV`: the naked routine reinterpreted as a SysV function
/// taking the thread pointer and the return stack pointer, so that the arguments arrive
/// in %rdi and %rsi where the assembly below reads them.
const jumpstartSysV = @as(*const fn (thread: *const Word, return_stack: *Word) callconv(.SysV) void, @ptrCast(&jumpstartNakedSysV));
/// Naked entry trampoline into threaded code.
///
/// Saves %rbp, copies %rdi into %r12 and %rsi into %r13, stores the address of local
/// label `0` at 8(%r13) and the address of that slot at (%r13), then jumps through the
/// word at (%r12). Execution resumes at label `0`, where %rbp is restored and the routine
/// returns to its caller.
///
/// NOTE(review): in this listing the lines between `0:` and the final `popq`/`ret` look like
/// leftovers of the previous inline-asm form (input and clobber lists) — confirm against the
/// actual file.
fn jumpstartNakedSysV() callconv(.Naked) void {
asm volatile (
\\ pushq %%rbp
\\ movq %%rsp, %%rbp
\\
\\ movq %%rdi, %%r12
\\ movq %%rsi, %%r13
\\
\\ # Such device is used so that opReturn could be used for return.
\\ movq $0f, 8(%%r13)
\\ leaq 8(%%r13), %%rax
\\ movq %%rax, (%%r13)
\\ jmpq *(%%r12)
\\ 0:
:
: [thread] "r" (&binary[entry_addr]),
[retstk] "r" (&return_stack[return_stack.len - 2]),
: "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory"
\\
\\ popq %%rbp
\\ ret
);
}

View File

@ -0,0 +1,162 @@
//! ve sistema (.ve-sistema:x86-64)
//!
//! Provides entry opcodes for System V calling convention, optimized for specific prototypes.
//!
// https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf
const std = @import("std");
/// Used for stack parameter passing.
pub const WordLimit = 128;
/// Size in bytes of the comptime buffer in which the generated assembly source is assembled.
const AsmBufferLimit = 4096;
/// Number of `Class` entries in the scratch buffer handed to `determiteClass`.
const ClassBufferLimit = 256;
/// Classification of one 8-byte chunk of a parameter, used to decide how it is passed.
/// The names follow the argument classes of the System V x86-64 ABI document linked at the
/// top of this file; `void` is an addition for empty types.
/// Only `void` and `integer` are handled by the code generator in this file; every other
/// class currently ends in an "unimplemented" compile error there.
const Class = enum {
void, // Denotes empty types.
integer, // Passed in a general-purpose register, or on the stack once those run out.
sse, // Assigned to floats of up to 64 bits by `determiteClass`.
sseup, // Not assigned anywhere in this file yet.
x87, // Not assigned anywhere in this file yet.
x87up, // Not assigned anywhere in this file yet.
no_class, // Not assigned anywhere in this file yet.
memory, // Not assigned anywhere in this file yet.
};
/// Classifies the comptime-known type `T` into one `Class` per 8-byte chunk it occupies
/// when passed as a parameter.
///
/// The classes are written to the front of `buffer` and the filled prefix is returned, so
/// the result aliases `buffer` and is overwritten by the next call that reuses it.
/// `buffer` must have room for at least two entries (slices occupy two chunks).
///
/// Supported types: `void`, integers and floats of up to 64 bits, `bool`, pointers
/// (including slices) and functions. Anything else is rejected with an "unimplemented"
/// compile error.
fn determiteClass(comptime T: type, buffer: []Class) []Class {
    // Every prong fills the buffer and yields how many entries it wrote. Tracking the count
    // explicitly avoids deriving it from `@sizeOf(T)`, which is not usable for zero-sized
    // types such as `void` or `u0`.
    const eightbytes: usize = switch (@typeInfo(T)) {
        .Void => blk: {
            // An empty type still yields a single `.void` entry so callers always get a
            // non-empty, initialized result.
            buffer[0] = .void;
            break :blk 1;
        },
        .Int => |info| blk: {
            buffer[0] = switch (info.bits) {
                0 => .void,
                1...64 => .integer,
                // Wider integers span several chunks; not supported yet.
                else => @compileError("unimplemented"),
            };
            break :blk 1;
        },
        .Float => |info| blk: {
            buffer[0] = switch (info.bits) {
                0 => .void,
                1...64 => .sse,
                // 80-bit and 128-bit floats need the x87/sseup classes; not supported yet.
                else => @compileError("unimplemented"),
            };
            break :blk 1;
        },
        .Bool => blk: {
            buffer[0] = .integer;
            break :blk 1;
        },
        .Pointer => |info| switch (info.size) {
            // A slice is a pointer plus a length: two integer chunks.
            .Slice => blk: {
                buffer[0] = .integer;
                buffer[1] = .integer;
                break :blk 2;
            },
            else => blk: {
                buffer[0] = .integer;
                break :blk 1;
            },
        },
        .Fn => blk: {
            buffer[0] = .integer;
            break :blk 1;
        },
        else => @compileError("unimplemented"),
    };
    return buffer[0..eightbytes];
}
// todo: Make sure duplicates are not made.
// todo: Cache results for identical in effect devices.
//
/// (iw | -- (arbitrary amount of words))
///
/// Generates, at compile time, an opcode that calls a System V function with the given
/// `prototype`. The returned value points to a naked function whose whole body is the
/// assembly text assembled here: a prelude, one move or push per parameter chunk, an
/// indirect call and an epilogue that dispatches to the next opcode.
///
/// Parameters are read from the stack relative to the stack pointer at opcode entry, the
/// first parameter from the highest offset. The callee address is taken from the word at
/// offset 8 from %r12, and %r12 is advanced by 0x10 before dispatching onward.
/// The stack pointer is restored to its value at entry, so the parameter words are still
/// on the stack after the opcode has run.
///
/// Only parameters classified as `.integer` or `.void` are supported; any other class is
/// an "unimplemented" compile error. Errors returned by `std.fmt.bufPrint` (such as
/// running out of room in the assembly buffer) are propagated.
pub fn generateOpZovSysvFromPrototype(prototype: anytype) !*const fn () callconv(.Naked) noreturn {
// todo: Should we care about this?
// > The direction flag DF in the %rFLAGS register must be clear (set to forward
// > direction) on function entry and return.
comptime {
const func = @typeInfo(@TypeOf(prototype)).Fn;
// Assembly source accumulates in `source_buffer`; `source_needle` is the write position.
var source_buffer = [_]u8{0} ** AsmBufferLimit;
var source_needle: usize = 0;
// todo: Align callee frame to 16?
// > shrq $4, %%rsp
// > addq $1, %%rsp
// > shlq $4, %%rsp
// idea: Try using REP for big consequent memory pushes.
// todo: In-stack returns by pointing %rdi directly to final destination.
// NOTE(review): the `subq $0x8` below presumably adjusts stack alignment for the call — confirm.
const Prelude =
\\ movq %%rsp, %%rbp # Move stack pointer in non-volatile %rbp to restore later
\\ subq $0x8, %%rsp
\\
;
// Indirect call through the word stored at offset 8 from %r12.
const Call =
\\ call *8(%%r12)
\\
;
// Undo every stack adjustment made by this opcode, skip two words of the thread and
// jump through the word %r12 then points at.
const Epilogue =
\\ movq %%rbp, %%rsp # Restore stack pointer
\\ addq $0x10, %%r12
\\ jmpq *(%%r12)
\\
;
@memcpy(source_buffer[source_needle .. source_needle + Prelude.len], Prelude[0..]);
source_needle += Prelude.len;
// Index of the next free integer register; the last entry, "stack", is never used as a
// register name and only marks the point where pushes take over.
var integer_allocation: usize = 0;
const IntegerAllocations = [_][]const u8{ "rdi", "rsi", "rdx", "rcx", "r8", "r9", "stack" };
// var sse_allocation: enum { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, stack } = .xmm0;
// Scratch space reused by every `determiteClass` call below.
var class_buffer = [_]Class{.void} ** ClassBufferLimit;
// Calculate stack space used by parameters.
var parameter_stack_size: usize = 0;
for (func.params) |param| {
const classes = determiteClass(param.type.?, &class_buffer);
parameter_stack_size += 8 * classes.len;
}
// Push parameters to appropriate registers and stack positions.
// `stack_offset` walks downwards from the total size, so the first parameter chunk is
// read from the highest offset relative to %rbp.
var stack_offset: usize = parameter_stack_size;
for (func.params) |param| {
const classes = determiteClass(param.type.?, &class_buffer);
for (classes) |class| {
stack_offset -= 8;
switch (class) {
.integer => {
if (integer_allocation < IntegerAllocations.len - 1) {
// A register is still free: load the chunk straight into it.
source_needle += (try std.fmt.bufPrint(
source_buffer[source_needle..],
"movq {}(%%rbp), %%{s}\n",
.{ stack_offset, IntegerAllocations[integer_allocation] },
)).len;
integer_allocation += 1;
} else {
// All six integer registers are taken: pass the chunk on the stack.
// NOTE(review): chunks are pushed in declaration order, while the System V ABI
// expects the rightmost stack argument to be pushed first — confirm the
// ordering once more than six integer chunks are used.
source_needle += (try std.fmt.bufPrint(
source_buffer[source_needle..],
"pushq {}(%%rbp)\n",
.{stack_offset},
)).len;
}
},
.void => {},
else => @compileError("unimplemented"),
}
}
}
@memcpy(source_buffer[source_needle .. source_needle + Call.len], Call[0..]);
source_needle += Call.len;
@memcpy(source_buffer[source_needle .. source_needle + Epilogue.len], Epilogue[0..]);
source_needle += Epilogue.len;
// Wrap the finished source in a naked function and hand out its address.
return &struct {
fn op() callconv(.Naked) noreturn {
asm volatile (source_buffer[0..source_needle]);
}
}.op;
}
}

View File

@ -1,20 +0,0 @@
// todo: Interpreter context as binary local variable.
// It would hold memory mappings, as well as error stack.
// todo: Define procedure call for user code.
// todo: Instruction set extensions, such as memory management schemes, non-exhaustive logging,
// exception mechanism, coroutines via yield/resume and etc.
// todo: Threading scheme.
// todo: Extension for native floating point stack ops.
// todo: Try using small code model with nopie/nopic binary.
// idea: Specialized opcodes that have side effects on read and write, such as
// zero-check on push/pop, or jump if condition bit met. This would create a lot
// of permutations tho, we might try to discover which code devices are most used.
// idea: 'JIT' could be done by simple op* compiled binary copying up until `jmpq *(%%rdi)`,
// with immediate operand prelude modified, which could be done procedurally.
pub const Word = u64;
pub const RecursionLimit = 1024;
pub usingnamespace @import("arch/x86-64.zig");

View File

@ -1,23 +1,39 @@
const int = @import("interpreter.zig");
const std = @import("std");
const tolmac = @import("tolmac.zig");
/// Demo callee for the generated System V call opcode: prints its three integer
/// arguments on one line through `std.debug.print`.
fn printInt(int: u64, other: u32, another: u16) callconv(.SysV) void {
    // Request a 16-byte aligned stack for this function.
    @setAlignStack(16);
    const format = "test: {}, {}, {}\n";
    std.debug.print(format, .{ int, other, another });
}
// Call opcode generated at compile time from the prototype of `printInt`.
const opPrintIntZov = tolmac.generateOpZovSysvFromPrototype(printInt) catch unreachable;
/// Builds two small threaded-code arrays and runs the second one.
///
/// `add` sums words and returns. `entry` pushes 1 and 2, calls `add`, sinks a word,
/// pushes 10 and 20, invokes `opPrintIntZov` with `&printInt` as its operand word, sinks
/// three words and returns.
///
/// NOTE(review): this listing appears to interleave the previous `int.*` lines with their
/// `tolmac.*` replacements — confirm against the actual file.
pub fn main() !void {
// todo: Mixing return addresses in stack poses a challenge, hm.
const add = [_]int.Word{
@as(int.Word, @intFromPtr(&int.opSumWordsWithOverflow)),
@as(int.Word, @intFromPtr(&int.opReturn)),
const add = [_]tolmac.Word{
@as(tolmac.Word, @intFromPtr(&tolmac.opSumWordsWithOverflow)),
@as(tolmac.Word, @intFromPtr(&tolmac.opReturn)),
};
const entry = [_]int.Word{
@as(int.Word, @intFromPtr(&int.opPushWord)),
const entry = [_]tolmac.Word{
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
1,
@as(int.Word, @intFromPtr(&int.opPushWord)),
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
2,
@as(int.Word, @intFromPtr(&int.opCall)),
@as(int.Word, @intFromPtr(&add)),
@as(int.Word, @intFromPtr(&int.opSinkWord)),
@as(int.Word, @intFromPtr(&int.opSinkWord)),
@as(int.Word, @intFromPtr(&int.opReturn)),
@as(tolmac.Word, @intFromPtr(&tolmac.opCall)),
@as(tolmac.Word, @intFromPtr(&add)),
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
10,
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
20,
@as(tolmac.Word, @intFromPtr(opPrintIntZov)),
@as(tolmac.Word, @intFromPtr(&printInt)),
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
@as(tolmac.Word, @intFromPtr(&tolmac.opReturn)),
};
int.execute(&entry, 0);
tolmac.execute(&entry, 0);
}