From 8e7391d4704f8a01f395547bc8132550f825cf89 Mon Sep 17 00:00:00 2001
From: veclav talica
Date: Sun, 24 Sep 2023 22:49:51 +0500
Subject: [PATCH] simd-rect article

---
 articles/simd-rect/page.mmd | 183 ++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 articles/simd-rect/page.mmd

diff --git a/articles/simd-rect/page.mmd b/articles/simd-rect/page.mmd
new file mode 100644
index 0000000..4df1b8e
--- /dev/null
+++ b/articles/simd-rect/page.mmd
@@ -0,0 +1,183 @@

Title: Vectorized Axis-Aligned Rect Ops
Brief: A small detour into building a rect type and its operations with SIMD semantics.
Date: 1695570693
Tags: Programming, Zig, Optimization
CSS: /style.css

### Code ###
Zig's `@shuffle` makes this rather arcane to look at, so be prepared. In the shuffle masks, a non-negative index selects from the first operand, while `-1`, `-2`, ... select elements `0`, `1`, ... of the second.

```zig
pub fn RectSIMD(comptime T: type) type {
    return struct {
        // Layout: { min_x, min_y, max_x, max_y }.
        xyxy: @Vector(4, T),

        pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool {
            // q = { min_x, min_y, p.x, p.y }, w = { p.x, p.y, max_x, max_y }:
            // all four lanes satisfy q <= w exactly when p lies inside.
            const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });
            const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });
            return @reduce(.And, q <= w);
        }

        pub fn isRectWithin(self: @This(), a: @This()) bool {
            // One point-within test per 4-lane group, one group for
            // each corner of `a`.
            const q = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ -1, -2, 0, 1, -1, -2, 2, 1, -1, -2, 0, 3, -1, -2, 2, 3 });
            const w = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ 0, 1, -3, -4, 2, 1, -3, -4, 0, 3, -3, -4, 2, 3, -3, -4 });
            return @reduce(.And, q <= w);
        }

        // todo: Handle zero area cases?
        pub fn isRectIntersecting(self: @This(), a: @This()) bool {
            // Overlap test: each rect's min <= the other's max, on both axes.
            const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, -1, -2, 1 });
            const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, 2, 3, -4 });
            return @reduce(.And, q <= w);
        }
    };
}
```
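To make the semantics concrete, here is a small usage sketch (an addition, not from the original patch), assuming the `{ min_x, min_y, max_x, max_y }` layout noted above; bounds are inclusive on both ends because of the `<=` compares:

```zig
const std = @import("std");

test "RectSIMD semantics" {
    const Rect = RectSIMD(f32);
    const r = Rect{ .xyxy = .{ 0.0, 0.0, 4.0, 2.0 } };

    // Point tests: both the interior and the max corner count as within.
    try std.testing.expect(r.isPointWithin(.{ 1.0, 1.0 }));
    try std.testing.expect(r.isPointWithin(.{ 4.0, 2.0 }));
    try std.testing.expect(!r.isPointWithin(.{ 5.0, 1.0 }));

    // Containment: fully inside vs. poking out on the right.
    try std.testing.expect(r.isRectWithin(.{ .xyxy = .{ 1.0, 0.5, 3.0, 1.5 } }));
    try std.testing.expect(!r.isRectWithin(.{ .xyxy = .{ 3.0, 0.5, 5.0, 1.5 } }));

    // Intersection: a shared edge counts as intersecting.
    try std.testing.expect(r.isRectIntersecting(.{ .xyxy = .{ 4.0, 0.0, 6.0, 2.0 } }));
    try std.testing.expect(!r.isRectIntersecting(.{ .xyxy = .{ 4.5, 0.0, 6.0, 2.0 } }));
}
```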
### Assembly ###
This is produced by godbolt, whose target CPU apparently has AVX-512 extensions, so it's extremely compact.

Note: the SysV prologue and epilogue are omitted; with inlining you can expect the code to look similar.

For 32-bit floating point:
```asm
"example.RectSIMD(f32).isPointWithin":
        vmovaps xmm2, xmm0
        vmovapd xmm1, xmmword ptr [rdi]
        vunpcklpd xmm0, xmm1, xmm2
        vblendpd xmm1, xmm1, xmm2, 1
        vcmpleps k0, xmm0, xmm1
        kmovd eax, k0
        sub al, 15
        sete al

"example.RectSIMD(f32).isRectWithin":
        vmovaps xmm2, xmmword ptr [rsi]
        vmovaps xmm0, xmmword ptr [rdi]
        vmovaps xmm1, xmm2
        vmovaps xmm3, xmm0
        vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0]
        vmovaps zmm0, zmm3
        vpermt2ps zmm0, zmm2, zmm1
        vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1]
        vpermt2ps zmm1, zmm2, zmm3
        vcmpleps k0, zmm0, zmm1
        kortestw k0, k0
        setb al

"example.RectSIMD(f32).isRectIntersecting":
        vmovaps xmm1, xmmword ptr [rsi]
        vmovaps xmm3, xmmword ptr [rdi]
        vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0]
        vmovaps xmm0, xmm1
        vpermt2ps xmm0, xmm2, xmm3
        vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
        vpermt2ps xmm1, xmm2, xmm3
        vcmpleps k0, xmm0, xmm1
        kmovd eax, k0
        sub al, 15
        sete al
```

For 32-bit signed integers it fares amazingly well too:
```asm
"example.RectSIMD(i32).isPointWithin":
        vmovaps xmm1, xmm0
        vmovdqa xmm2, xmmword ptr [rdi]
        vpunpcklqdq xmm0, xmm2, xmm1
        vpblendd xmm1, xmm1, xmm2, 12
        vpcmpled k0, xmm0, xmm1
        kmovd eax, k0
        sub al, 15
        sete al

"example.RectSIMD(i32).isRectWithin":
        vmovdqa xmm2, xmmword ptr [rsi]
        vmovdqa xmm0, xmmword ptr [rdi]
        vmovaps xmm1, xmm2
        vmovaps xmm3, xmm0
        vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0]
        vmovaps zmm0, zmm3
        vpermt2d zmm0, zmm2, zmm1
        vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1]
        vpermt2d zmm1, zmm2, zmm3
        vpcmpled k0, zmm0, zmm1
        kortestw k0, k0
        setb al

"example.RectSIMD(i32).isRectIntersecting":
        vmovdqa xmm1, xmmword ptr [rsi]
        vmovdqa xmm3, xmmword ptr [rdi]
        vpbroadcastq xmm0, qword ptr [rsi]
        vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0]
        vpermt2d xmm0, xmm2, xmm3
        vpbroadcastq xmm3, qword ptr [rdi + 8]
        vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
        vpermt2d xmm1, xmm2, xmm3
        vpcmpled k0, xmm0, xmm1
        kmovd eax, k0
        sub al, 15
        sete al
```

For 64-bit floating point, even on AVX-512 some ops are no longer done at once: the 16-lane compare in `isRectWithin` splits into two 8-lane `zmm` compares whose results are stitched back together with `kunpckbw`:
```asm
"example.RectSIMD(f64).isPointWithin":
        vmovaps xmm3, xmm0
        vmovapd ymm1, ymmword ptr [rdi]
        vinsertf128 ymm0, ymm1, xmm3, 1
        vmovaps xmm2, xmm3
        vblendpd ymm1, ymm1, ymm2, 3
        vcmplepd k0, ymm0, ymm1
        kmovd eax, k0
        sub al, 15
        sete al

"example.RectSIMD(f64).isRectWithin":
        vmovapd ymm3, ymmword ptr [rsi]
        vmovapd ymm5, ymmword ptr [rdi]
        vblendpd ymm0, ymm5, ymm3, 8
        vperm2f128 ymm4, ymm5, ymm3, 32
        vblendpd ymm1, ymm4, ymm0, 10
        vmovaps ymm0, ymm1
        vblendpd ymm1, ymm5, ymm3, 12
        vinsertf64x4 zmm0, zmm0, ymm1, 1
        vblendpd ymm1, ymm5, ymm3, 4
        vblendpd ymm2, ymm1, ymm4, 10
        vmovaps ymm1, ymm4
        vinsertf64x4 zmm2, zmm1, ymm2, 1
        vperm2f128 ymm4, ymm3, ymm5, 49
        vblendpd ymm1, ymm3, ymm5, 4
        vblendpd ymm6, ymm1, ymm4, 10
        vmovaps ymm1, ymm6
        vinsertf64x4 zmm1, zmm1, ymm4, 1
        vblendpd ymm6, ymm3, ymm5, 8
        vblendpd ymm4, ymm4, ymm6, 10
        vblendpd ymm5, ymm3, ymm5, 12
        vmovaps ymm3, ymm5
        vinsertf64x4 zmm3, zmm3, ymm4, 1
        vcmplepd k1, zmm2, zmm3
        vcmplepd k0, zmm0, zmm1
        kunpckbw k0, k0, k1
        kortestw k0, k0
        setb al

"example.RectSIMD(f64).isRectIntersecting":
        vmovapd ymm3, ymmword ptr [rsi]
        vmovapd ymm1, ymmword ptr [rdi]
        vmovdqa ymm2, ymmword ptr [rip + .LCPI3_0]
        vmovaps ymm0, ymm3
        vpermt2pd ymm0, ymm2, ymm1
        vmovdqa ymm2, ymmword ptr [rip + .LCPI3_1]
        vpermt2pd ymm1, ymm2, ymm3
        vcmplepd k0, ymm0, ymm1
        kmovd eax, k0
        sub al, 15
        sete al
```
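The patch doesn't include the driver that forces these generic instantiations. A minimal sketch of one possible harness (the wrapper names and build flags here are assumptions, not the author's setup) could look like this; in real code the methods would be inlined into such call sites, which is what the note about the omitted prologue anticipates:

```zig
// Hypothetical harness: exported wrappers make the compiler emit real
// code for the generic methods so it can be inspected on godbolt.
// Built along the lines of (flags are a guess):
//   zig build-obj -O ReleaseFast -mcpu=x86_64_v4 example.zig
const Rect32 = RectSIMD(f32);

export fn rectContainsPoint(r: *const Rect32, x: f32, y: f32) bool {
    return r.isPointWithin(.{ x, y });
}

export fn rectsIntersect(a: *const Rect32, b: *const Rect32) bool {
    return a.isRectIntersecting(b.*);
}
```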
So, the selection of the coordinate data type plays quite a big role, especially when wide vector extensions are not available.

Note that the permutation masks are also supplied alongside the code (the `.LCPI*` constants above), which increases binary size. With inlining this could become quite substantial if per-object %rip-relative addressing is used.
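On the point about targets without vector extensions: for comparison, here is a scalar sketch of the same intersection predicate (an illustrative addition, same `{ min_x, min_y, max_x, max_y }` layout); four independent compares with short-circuiting branches is roughly what such a target falls back to:

```zig
pub fn isRectIntersectingScalar(comptime T: type, a: [4]T, b: [4]T) bool {
    // Same condition as the vector version: each rect's min must not
    // exceed the other's max, on both axes.
    return a[0] <= b[2] and b[0] <= a[2] and
        a[1] <= b[3] and b[1] <= a[3];
}
```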