Title: Vectorized Axis-Aligned Rect Ops Brief: Small detour in making rect type and operations on it in SIMD semantics. Date: 1695570693 Tags: Programming, Zig, Optimization CSS: /style.css ### Code ### Zig's `@shuffle` makes it rather arcane to look at, so be prepared. ```zig pub fn RectSIMD(comptime T: type) type { return struct { xyxy: @Vector(4, T), pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool { const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 }); const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 }); return @reduce(.And, q <= w); } pub fn isRectWithin(self: @This(), a: @This()) bool { const q = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ -1, -2, 0, 1, -1, -2, 2, 1, -1, -2, 0, 3, -1, -2, 2, 3 }); const w = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ 0, 1, -3, -4, 2, 1, -3, -4, 0, 3, -3, -4, 2, 3, -3, -4 }); return @reduce(.And, q <= w); } // todo: Handle zero area cases? pub fn isRectIntersecting(self: @This(), a: @This()) bool { const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, -1, -2, 1 }); const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, 2, 3, -4 }); return @reduce(.And, q <= w); } }; } ``` ### Assembly ### This is produced by godbolt, which apparently has AVX512 extensions, so, it's extremely compact. note: SysV prelude and outro are omitted, with inlining you can expect it looking similarly. For 32bit floating point: ```asm "example.RectSIMD(f32).isPointWithin": vmovaps xmm2, xmm0 vmovapd xmm1, xmmword ptr [rdi] vunpcklpd xmm0, xmm1, xmm2 vblendpd xmm1, xmm1, xmm2, 1 vcmpleps k0, xmm0, xmm1 kmovd eax, k0 sub al, 15 sete al "example.RectSIMD(f32).isRectWithin": vmovaps xmm2, xmmword ptr [rsi] vmovaps xmm0, xmmword ptr [rdi] vmovaps xmm1, xmm2 vmovaps xmm3, xmm0 vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0] vmovaps zmm0, zmm3 vpermt2ps zmm0, zmm2, zmm1 vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1] vpermt2ps zmm1, zmm2, zmm3 vcmpleps k0, zmm0, zmm1 kortestw k0, k0 setb al "example.RectSIMD(f32).isRectIntersecting": vmovaps xmm1, xmmword ptr [rsi] vmovaps xmm3, xmmword ptr [rdi] vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0] vmovaps xmm0, xmm1 vpermt2ps xmm0, xmm2, xmm3 vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1] vpermt2ps xmm1, xmm2, xmm3 vcmpleps k0, xmm0, xmm1 kmovd eax, k0 sub al, 15 sete al ``` For 32bit signed integers it fares amazing too: ```asm "example.RectSIMD(i32).isPointWithin": vmovaps xmm1, xmm0 vmovdqa xmm2, xmmword ptr [rdi] vpunpcklqdq xmm0, xmm2, xmm1 vpblendd xmm1, xmm1, xmm2, 12 vpcmpled k0, xmm0, xmm1 kmovd eax, k0 sub al, 15 sete al "example.RectSIMD(i32).isRectWithin": vmovdqa xmm2, xmmword ptr [rsi] vmovdqa xmm0, xmmword ptr [rdi] vmovaps xmm1, xmm2 vmovaps xmm3, xmm0 vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0] vmovaps zmm0, zmm3 vpermt2d zmm0, zmm2, zmm1 vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1] vpermt2d zmm1, zmm2, zmm3 vpcmpled k0, zmm0, zmm1 kortestw k0, k0 setb al "example.RectSIMD(i32).isRectIntersecting": vmovdqa xmm1, xmmword ptr [rsi] vmovdqa xmm3, xmmword ptr [rdi] vpbroadcastq xmm0, qword ptr [rsi] vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0] vpermt2d xmm0, xmm2, xmm3 vpbroadcastq xmm3, qword ptr [rdi + 8] vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1] vpermt2d xmm1, xmm2, xmm3 vpcmpled k0, xmm0, xmm1 kmovd eax, k0 sub al, 15 sete al ``` 64bit floating point, now even on AVX512 some ops are not done at once: ```asm "example.RectSIMD(f64).isPointWithin": vmovaps xmm3, xmm0 vmovapd ymm1, ymmword ptr [rdi] vinsertf128 ymm0, ymm1, xmm3, 1 vmovaps xmm2, xmm3 vblendpd ymm1, ymm1, ymm2, 3 vcmplepd k0, ymm0, ymm1 kmovd eax, k0 sub al, 15 sete al pop rbp "example.RectSIMD(f64).isRectWithin": vmovapd ymm3, ymmword ptr [rsi] vmovapd ymm5, ymmword ptr [rdi] vblendpd ymm0, ymm5, ymm3, 8 vperm2f128 ymm4, ymm5, ymm3, 32 vblendpd ymm1, ymm4, ymm0, 10 vmovaps ymm0, ymm1 vblendpd ymm1, ymm5, ymm3, 12 vinsertf64x4 zmm0, zmm0, ymm1, 1 vblendpd ymm1, ymm5, ymm3, 4 vblendpd ymm2, ymm1, ymm4, 10 vmovaps ymm1, ymm4 vinsertf64x4 zmm2, zmm1, ymm2, 1 vperm2f128 ymm4, ymm3, ymm5, 49 vblendpd ymm1, ymm3, ymm5, 4 vblendpd ymm6, ymm1, ymm4, 10 vmovaps ymm1, ymm6 vinsertf64x4 zmm1, zmm1, ymm4, 1 vblendpd ymm6, ymm3, ymm5, 8 vblendpd ymm4, ymm4, ymm6, 10 vblendpd ymm5, ymm3, ymm5, 12 vmovaps ymm3, ymm5 vinsertf64x4 zmm3, zmm3, ymm4, 1 vcmplepd k1, zmm2, zmm3 vcmplepd k0, zmm0, zmm1 kunpckbw k0, k0, k1 kortestw k0, k0 setb al "example.RectSIMD(f64).isRectIntersecting": vmovapd ymm3, ymmword ptr [rsi] vmovapd ymm1, ymmword ptr [rdi] vmovdqa ymm2, ymmword ptr [rip + .LCPI3_0] vmovaps ymm0, ymm3 vpermt2pd ymm0, ymm2, ymm1 vmovdqa ymm2, ymmword ptr [rip + .LCPI3_1] vpermt2pd ymm1, ymm2, ymm3 vcmplepd k0, ymm0, ymm1 kmovd eax, k0 sub al, 15 sete al ``` So, selection of coordinate data type plays quite big role, especially when extensions are not provided. Note that permutation masks are also supplied along side code which increase binary size. With inlining it could be quite substantial if per object %rip relative addressing is used.