sysv zov generation for low overhead ffi
This commit is contained in:
parent
0d5bd46412
commit
b66c3dca14
2
gdb.sh
2
gdb.sh
@ -1,3 +1,3 @@
|
||||
#!/bin/sh
|
||||
|
||||
gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.execute' -ex 'layout asm' -ex 'r'
|
||||
gdb ./zig-out/bin/nmvm -ex 'b arch.x86-64.jedino-jedro.execute' -ex 'layout asm' -ex 'r'
|
||||
|
@ -1 +1,2 @@
|
||||
pub usingnamespace @import("x86-64/jedino-jedro.zig");
|
||||
pub usingnamespace @import("x86-64/ve-sistema.zig");
|
||||
|
@ -7,29 +7,29 @@
|
||||
//! Stack/thread pointers are chosen so that callees preserve them under both the SysV and MS ABIs,
|
||||
//! so that we don't need to constantly push and restore on procedure call.
|
||||
|
||||
// todo: Use r12, r13, ... instead? They're preserved in the Sys V ABI, which might make it more comfortable,
|
||||
// but they might increase binary size, gotta test.
|
||||
|
||||
// todo: Try using something else that has lesser opcode size.
|
||||
// Execution thread convention:
|
||||
// r12 <- binary thread pointer
|
||||
// r13 <- return stack pointer
|
||||
// r14 <- extension context pointer
|
||||
|
||||
// Resources used:
|
||||
// todo: Use the ZF flag as a conditional register so as not to involve the stack?
|
||||
// Alternatively we could keep boolean word, but implement it in vector semantics.
|
||||
|
||||
// Resources:
|
||||
// https://mort.coffee/home/fast-interpreters/
|
||||
// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
|
||||
// https://en.wikibooks.org/wiki/X86_Assembly/GNU_assembly_syntax
|
||||
// https://www.cs.princeton.edu/courses/archive/spr18/cos217/lectures/15_AssemblyFunctions.pdf
|
||||
// https://ziglang.org/documentation/master/#toc-Assembly
|
||||
// https://csiflabs.cs.ucdavis.edu/~ssdavis/50/att-syntax.htm
|
||||
// https://stackoverflow.com/questions/37639993/is-this-assembly-function-call-safe-complete
|
||||
|
||||
// Neat things:
|
||||
// https://joryanick.com/retro-fast-x86-memcpy.php
|
||||
// https://www.codeproject.com/Articles/1110153/Apex-memmove-the-fastest-memcpy-memmove-on-x-x-EVE
|
||||
|
||||
const int = @import("../../interpreter.zig");
|
||||
const Word = int.Word;
|
||||
pub const RecursionLimit = int.RecursionLimit;
|
||||
const tolmac = @import("../../tolmac.zig");
|
||||
const Word = tolmac.Word;
|
||||
|
||||
// todo: Variant that pushes array of words.
|
||||
/// (iw | -- iw)
|
||||
@ -124,18 +124,29 @@ pub fn execute(binary: []const Word, entry_addr: usize) void {
|
||||
// https://wiki.osdev.org/System_V_ABI
|
||||
@setCold(true);
|
||||
|
||||
var return_stack: [RecursionLimit + 1]Word = undefined;
|
||||
var return_stack: [tolmac.RecursionLimit + 1]Word = undefined;
|
||||
|
||||
// This device is used so that opReturn can be used for return.
|
||||
jumpstartSysV(&binary[entry_addr], &return_stack[return_stack.len - 2]);
|
||||
}
|
||||
|
||||
const jumpstartSysV = @as(*const fn (thread: *const Word, return_stack: *Word) callconv(.SysV) void, @ptrCast(&jumpstartNakedSysV));
|
||||
|
||||
fn jumpstartNakedSysV() callconv(.Naked) void {
|
||||
asm volatile (
|
||||
\\ pushq %%rbp
|
||||
\\ movq %%rsp, %%rbp
|
||||
\\
|
||||
\\ movq %%rdi, %%r12
|
||||
\\ movq %%rsi, %%r13
|
||||
\\
|
||||
\\ # Such device is used so that opReturn could be used for return.
|
||||
\\ movq $0f, 8(%%r13)
|
||||
\\ leaq 8(%%r13), %%rax
|
||||
\\ movq %%rax, (%%r13)
|
||||
\\ jmpq *(%%r12)
|
||||
\\ 0:
|
||||
:
|
||||
: [thread] "r" (&binary[entry_addr]),
|
||||
[retstk] "r" (&return_stack[return_stack.len - 2]),
|
||||
: "rflags", "rax", "rbx", "rsp", "rdi", "rbp", "r14", "r15", "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11", "memory"
|
||||
\\
|
||||
\\ popq %%rbp
|
||||
\\ ret
|
||||
);
|
||||
}
|
||||
|
162
src/arch/x86-64/ve-sistema.zig
Normal file
162
src/arch/x86-64/ve-sistema.zig
Normal file
@ -0,0 +1,162 @@
|
||||
//! ve sistema (.ve-sistema:x86-64)
|
||||
//!
|
||||
//! Provides entry opcodes for System V calling convention, optimized for specific prototypes.
|
||||
//!
|
||||
|
||||
// https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf
|
||||
|
||||
const std = @import("std");
|
||||
|
||||
/// Used for stack parameter passing.
|
||||
pub const WordLimit = 128;
|
||||
const AsmBufferLimit = 4096;
|
||||
const ClassBufferLimit = 256;
|
||||
|
||||
/// Parameter classes, named after the classification scheme of the
/// System V x86-64 ABI. One class is produced per eightbyte of a value.
const Class = enum {
    /// Denotes empty types.
    void,
    /// Value travels in a general purpose register (or an integer stack slot).
    integer,
    /// Value travels in a vector register. Not yet handled by the opcode generator.
    sse,
    /// Upper part of a vector register value. Not yet handled by the opcode generator.
    sseup,
    /// x87 value. Not yet handled by the opcode generator.
    x87,
    /// Upper part of an x87 value. Not yet handled by the opcode generator.
    x87up,
    /// NOTE(review): presumably the ABI's NO_CLASS; how it differs from `void` here is not
    /// visible in this file. Confirm before relying on it.
    no_class,
    /// Value is passed in memory. Not yet handled by the opcode generator.
    memory,
};

/// Classifies `T` into one `Class` per eightbyte and writes the result to the front of `buffer`.
///
/// Returns the prefix of `buffer` that was filled in: one entry per 8 bytes of `@sizeOf(T)`,
/// and a single `.void` entry for zero-sized types (`void`, `u0`).
///
/// `buffer` must have room for at least two entries (the largest classification
/// produced here, for slices). Unsupported types are rejected with a compile error.
fn determiteClass(comptime T: type, buffer: []Class) []Class {
    switch (@typeInfo(T)) {
        // Store the class in the caller's buffer like every other prong does,
        // instead of producing a discarded temporary.
        .Void => buffer[0] = .void,
        .Int => |int| switch (int.bits) {
            0 => buffer[0] = .void,
            1...64 => buffer[0] = .integer,
            else => @compileError("unimplemented"),
        },
        .Float => |float| switch (float.bits) {
            1...64 => buffer[0] = .sse,
            else => @compileError("unimplemented"),
        },
        .Bool => buffer[0] = .integer,
        .Pointer => |ptr| switch (ptr.size) {
            // A slice is a pointer word followed by a length word.
            .Slice => {
                buffer[0] = .integer;
                buffer[1] = .integer;
            },
            else => buffer[0] = .integer,
        },
        .Fn => buffer[0] = .integer,
        else => @compileError("unimplemented"),
    }

    // Zero-sized types still report their single `.void` entry; computing
    // `(@sizeOf(T) - 1)` for them would underflow.
    const eightbytes = if (@sizeOf(T) == 0) 1 else (@sizeOf(T) - 1) / 8 + 1;
    return buffer[0..eightbytes];
}
|
||||
|
||||
// todo: Make sure duplicates are not made.
|
||||
// todo: Cache results for identical in effect devices.
|
||||
//
|
||||
/// (iw | -- (arbitrary amount of words))
///
/// Generates, at compile time, a naked opcode that calls a System V function whose
/// parameter list matches `prototype`.
///
/// The emitted code:
///   1. saves `%rsp` in `%rbp` and reads the arguments as 8-byte words relative to it,
///      the first parameter being the word furthest from the top of the stack;
///   2. loads the first six integer-class words into `rdi, rsi, rdx, rcx, r8, r9` and
///      pushes the remaining ones as stack arguments;
///   3. calls the address stored in the word following the opcode (`8(%r12)`);
///   4. restores `%rsp`, advances `%r12` by two words and dispatches the next opcode.
///
/// Arguments are left on the stack; the caller is expected to sink them afterwards.
/// Only `.integer` and `.void` classes are supported; anything else is a compile error.
///
/// Returns an error if the generated assembly does not fit into `AsmBufferLimit` bytes.
pub fn generateOpZovSysvFromPrototype(prototype: anytype) !*const fn () callconv(.Naked) noreturn {
    // todo: Should we care about this?
    // > The direction flag DF in the %rFLAGS register must be clear (set to “forward”
    // > direction) on function entry and return.

    comptime {
        const func = @typeInfo(@TypeOf(prototype)).Fn;

        var source_buffer = [_]u8{0} ** AsmBufferLimit;
        var source_needle: usize = 0;

        // todo: Align callee frame to 16?
        // > shrq $4, %%rsp
        // > addq $1, %%rsp
        // > shlq $4, %%rsp

        // idea: Try using REP for big consecutive memory pushes.

        // todo: In-stack returns by pointing %rdi directly to final destination.

        const Prelude =
            \\ movq %%rsp, %%rbp # Move stack pointer in non-volatile %rbp to restore later
            \\ subq $0x8, %%rsp
            \\
        ;

        const Call =
            \\ call *8(%%r12)
            \\
        ;

        const Epilogue =
            \\ movq %%rbp, %%rsp # Restore stack pointer
            \\ addq $0x10, %%r12
            \\ jmpq *(%%r12)
            \\
        ;

        @memcpy(source_buffer[source_needle .. source_needle + Prelude.len], Prelude[0..]);
        source_needle += Prelude.len;

        var integer_allocation: usize = 0;
        // The trailing "stack" entry is a sentinel: once the six registers are used up,
        // further integer words are passed on the stack.
        const IntegerAllocations = [_][]const u8{ "rdi", "rsi", "rdx", "rcx", "r8", "r9", "stack" };
        // var sse_allocation: enum { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, stack } = .xmm0;

        var class_buffer = [_]Class{.void} ** ClassBufferLimit;

        // Calculate stack space used by parameters.
        var parameter_stack_size: usize = 0;
        for (func.params) |param| {
            const classes = determiteClass(param.type.?, &class_buffer);
            parameter_stack_size += 8 * classes.len;
        }

        // Offsets (relative to %rbp) of the words that did not get a register,
        // recorded in parameter order.
        const total_words = parameter_stack_size / 8;
        var spilled_offsets: [total_words]usize = undefined;
        var spilled_count: usize = 0;

        // Move parameters to appropriate registers; remember the ones bound for the stack.
        var stack_offset: usize = parameter_stack_size;
        for (func.params) |param| {
            const classes = determiteClass(param.type.?, &class_buffer);
            for (classes) |class| {
                stack_offset -= 8;
                switch (class) {
                    .integer => {
                        if (integer_allocation < IntegerAllocations.len - 1) {
                            source_needle += (try std.fmt.bufPrint(
                                source_buffer[source_needle..],
                                "movq {}(%%rbp), %%{s}\n",
                                .{ stack_offset, IntegerAllocations[integer_allocation] },
                            )).len;
                            integer_allocation += 1;
                        } else {
                            spilled_offsets[spilled_count] = stack_offset;
                            spilled_count += 1;
                        }
                    },
                    .void => {},
                    else => @compileError("unimplemented"),
                }
            }
        }

        // System V expects stack arguments pushed right-to-left, so that the first
        // stack-passed argument ends up at the lowest address (right above the return
        // address). Emit the recorded pushes in reverse parameter order.
        var remaining: usize = spilled_count;
        while (remaining > 0) {
            remaining -= 1;
            source_needle += (try std.fmt.bufPrint(
                source_buffer[source_needle..],
                "pushq {}(%%rbp)\n",
                .{spilled_offsets[remaining]},
            )).len;
        }

        @memcpy(source_buffer[source_needle .. source_needle + Call.len], Call[0..]);
        source_needle += Call.len;

        @memcpy(source_buffer[source_needle .. source_needle + Epilogue.len], Epilogue[0..]);
        source_needle += Epilogue.len;

        // Snapshot the finished text into an immutable constant so the emitted function
        // does not refer to the mutable comptime buffer.
        const source = source_buffer[0..source_needle].*;

        return &struct {
            fn op() callconv(.Naked) noreturn {
                asm volatile (&source);
            }
        }.op;
    }
}
|
@ -1,20 +0,0 @@
|
||||
// todo: Interpreter context as binary local variable.
|
||||
// It would hold memory mappings, as well as error stack.
|
||||
// todo: Define procedure call for user code.
|
||||
// todo: Instruction set extensions, such as memory management schemes, non-exhaustive logging,
|
||||
// exception mechanism, coroutines via yield/resume, etc.
|
||||
// todo: Threading scheme.
|
||||
// todo: Extension for native floating point stack ops.
|
||||
// todo: Try using small code model with nopie/nopic binary.
|
||||
|
||||
// idea: Specialized opcodes that have side effects on read and write, such as
|
||||
// zero-check on push/pop, or jump if condition bit met. This would create a lot
|
||||
// of permutations though; we might try to discover which code devices are most used.
|
||||
|
||||
// idea: 'JIT' could be done by simple op* compiled binary copying up until `jmpq *(%%rdi)`,
|
||||
// with immediate operand prelude modified, which could be done procedurally.
|
||||
|
||||
pub const Word = u64;
|
||||
pub const RecursionLimit = 1024;
|
||||
|
||||
pub usingnamespace @import("arch/x86-64.zig");
|
42
src/main.zig
42
src/main.zig
@ -1,23 +1,39 @@
|
||||
const int = @import("interpreter.zig");
|
||||
const std = @import("std");
|
||||
const tolmac = @import("tolmac.zig");
|
||||
|
||||
/// Demo System V callee: prints its three arguments via `std.debug.print`.
fn printInt(int: u64, other: u32, another: u16) callconv(.SysV) void {
    // Request a 16-byte aligned stack for this function.
    // NOTE(review): presumably needed because the generated calling opcode does not yet
    // guarantee frame alignment — confirm.
    @setAlignStack(16);

    const values = .{ int, other, another };
    std.debug.print("test: {}, {}, {}\n", values);
}
|
||||
|
||||
const opPrintIntZov = tolmac.generateOpZovSysvFromPrototype(printInt) catch unreachable;
|
||||
|
||||
pub fn main() !void {
|
||||
// todo: Mixing return addresses in stack poses a challenge, hm.
|
||||
const add = [_]int.Word{
|
||||
@as(int.Word, @intFromPtr(&int.opSumWordsWithOverflow)),
|
||||
@as(int.Word, @intFromPtr(&int.opReturn)),
|
||||
const add = [_]tolmac.Word{
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opSumWordsWithOverflow)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opReturn)),
|
||||
};
|
||||
|
||||
const entry = [_]int.Word{
|
||||
@as(int.Word, @intFromPtr(&int.opPushWord)),
|
||||
const entry = [_]tolmac.Word{
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
|
||||
1,
|
||||
@as(int.Word, @intFromPtr(&int.opPushWord)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
|
||||
2,
|
||||
@as(int.Word, @intFromPtr(&int.opCall)),
|
||||
@as(int.Word, @intFromPtr(&add)),
|
||||
@as(int.Word, @intFromPtr(&int.opSinkWord)),
|
||||
@as(int.Word, @intFromPtr(&int.opSinkWord)),
|
||||
@as(int.Word, @intFromPtr(&int.opReturn)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opCall)),
|
||||
@as(tolmac.Word, @intFromPtr(&add)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
|
||||
10,
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opPushWord)),
|
||||
20,
|
||||
@as(tolmac.Word, @intFromPtr(opPrintIntZov)),
|
||||
@as(tolmac.Word, @intFromPtr(&printInt)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opSinkWord)),
|
||||
@as(tolmac.Word, @intFromPtr(&tolmac.opReturn)),
|
||||
};
|
||||
|
||||
int.execute(&entry, 0);
|
||||
tolmac.execute(&entry, 0);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user