mjestecko/articles/simd-rect/page.mmd

Title:  Vectorized Axis-Aligned Rect Ops
Brief:  A small detour into building a rect type, and the operations on it, with SIMD semantics.
Date:   1695570693
Tags:   Programming, Zig, Optimization
CSS:    /style.css

### Code ###
Zig's `@shuffle` makes it rather arcane to look at, so be prepared.

```zig
/// Builds an axis-aligned rectangle type over scalar `T` whose four
/// coordinates are packed into a single SIMD vector, so every query below
/// boils down to two shuffles, one lane-wise `<=` and an AND-reduction.
pub fn RectSIMD(comptime T: type) type {
    return struct {
        // Shorthand for the struct type being declared.
        const Self = @This();

        /// Packed bounds: lanes 0 and 1 are the lower x/y bounds,
        /// lanes 2 and 3 are the upper x/y bounds.
        xyxy: @Vector(4, T),

        /// True when `p` lies inside the rectangle; both edges are inclusive.
        ///
        /// In `@shuffle` masks a non-negative index picks a lane from the
        /// first operand and a negative index `-k` picks lane `k - 1` from
        /// the second operand.
        pub fn isPointWithin(self: Self, p: @Vector(2, T)) bool {
            // lower = { self.min_x, self.min_y, p.x,        p.y        }
            const lower = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });
            // upper = { p.x,        p.y,        self.max_x, self.max_y }
            const upper = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });
            const in_bounds = lower <= upper;
            return @reduce(.And, in_bounds);
        }

        /// True when rectangle `a` is fully contained in `self` (inclusive).
        ///
        /// All four coordinates of `a` are tested against both bounds of
        /// `self`, hence the 8-lane comparison.
        pub fn isRectWithin(self: Self, a: Self) bool {
            // lower = { a[0..4], self.min_x, self.min_y, self.min_x, self.min_y }
            const lower = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ 0, 1, 2, 3, -1, -2, -1, -2 });
            // upper = { self.max_x, self.max_y, self.max_x, self.max_y, a[0..4] }
            const upper = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ -3, -4, -3, -4, 0, 1, 2, 3 });
            const in_bounds = lower <= upper;
            return @reduce(.And, in_bounds);
        }

        // todo: Handle zero area cases?
        /// True when `a` and `self` overlap; rectangles that merely touch
        /// along an edge or at a corner count as intersecting.
        pub fn isRectIntersecting(self: Self, a: Self) bool {
            // lower = { a.min_x,    a.min_y,    self.min_x, self.min_y }
            const lower = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, 1, -1, -2 });
            // upper = { self.max_x, self.max_y, a.max_x,    a.max_y    }
            const upper = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, -4, 2, 3 });
            const overlaps = lower <= upper;
            return @reduce(.And, overlaps);
        }
    };
}
```

### Assembly ###
This was produced on Godbolt (Compiler Explorer), whose target apparently has AVX-512 extensions, so the output is extremely compact.

Note: The function prologue and epilogue are omitted; with inlining you can expect the code to look similar.
The Zig calling convention is used, which is roughly equivalent to that of a C function marked `static`.

For 32bit floating point:
```asm
"example.RectSIMD(f32).isPointWithin":
        vmovaps xmm2, xmm0
        vmovapd xmm1, xmmword ptr [rdi]
        vunpcklpd       xmm0, xmm1, xmm2
        vblendpd        xmm1, xmm1, xmm2, 1
        vcmpleps        k0, xmm0, xmm1
        kmovd   eax, k0
        sub     al, 15
        sete    al

"example.RectSIMD(f32).isRectWithin":
        vmovaps xmm2, xmmword ptr [rsi]
        vmovaps xmm0, xmm2
        vmovddup        xmm1, qword ptr [rdi]
        vinsertf128     ymm0, ymm0, xmm1, 1
        vmovddup        xmm3, qword ptr [rdi + 8]
        vmovaps xmm1, xmm3
        vinsertf128     ymm1, ymm1, xmm2, 1
        vcmpleps        k0, ymm0, ymm1
        kortestb        k0, k0
        setb    al

"example.RectSIMD(f32).isRectIntersecting":
        vmovapd xmm2, xmmword ptr [rsi]
        vmovapd xmm1, xmmword ptr [rdi]
        vunpcklpd       xmm0, xmm2, xmm1
        vunpckhpd       xmm1, xmm1, xmm2
        vcmpleps        k0, xmm0, xmm1
        kmovd   eax, k0
        sub     al, 15
        sete    al
```

For 32-bit signed integers it fares amazingly well too:
```asm
"example.RectSIMD(i32).isPointWithin":
        vmovaps xmm1, xmm0
        vmovdqa xmm2, xmmword ptr [rdi]
        vpunpcklqdq     xmm0, xmm2, xmm1
        vpblendd        xmm1, xmm1, xmm2, 12
        vpcmpled        k0, xmm0, xmm1
        kmovd   eax, k0
        sub     al, 15
        sete    al

"example.RectSIMD(i32).isRectWithin":
        vmovdqa xmm2, xmmword ptr [rsi]
        vmovaps xmm0, xmm2
        vpbroadcastq    xmm1, qword ptr [rdi]
        vinserti128     ymm0, ymm0, xmm1, 1
        vpbroadcastq    xmm3, qword ptr [rdi + 8]
        vmovaps xmm1, xmm3
        vinserti128     ymm1, ymm1, xmm2, 1
        vpcmpled        k0, ymm0, ymm1
        kortestb        k0, k0
        setb    al

"example.RectSIMD(i32).isRectIntersecting":
        vmovdqa xmm2, xmmword ptr [rsi]
        vmovdqa xmm1, xmmword ptr [rdi]
        vpunpcklqdq     xmm0, xmm2, xmm1
        vpunpckhqdq     xmm1, xmm1, xmm2
        vpcmpled        k0, xmm0, xmm1
        kmovd   eax, k0
        sub     al, 15
        sete    al
```

64bit floating point:
```asm
"example.RectSIMD(f64).isPointWithin":
        vmovaps xmm3, xmm0
        vmovapd ymm1, ymmword ptr [rdi]
        vinsertf128     ymm0, ymm1, xmm3, 1
        vmovaps xmm2, xmm3
        vblendpd        ymm1, ymm1, ymm2, 3
        vcmplepd        k0, ymm0, ymm1
        kmovd   eax, k0
        sub     al, 15
        sete    al

"example.RectSIMD(f64).isRectWithin":
        vmovapd ymm2, ymmword ptr [rsi]
        vmovapd ymm1, ymmword ptr [rdi]
        vmovaps ymm0, ymm2
        vpermpd ymm3, ymm1, 68
        vinsertf64x4    zmm0, zmm0, ymm3, 1
        vpermpd ymm3, ymm1, 238
        vmovaps ymm1, ymm3
        vinsertf64x4    zmm1, zmm1, ymm2, 1
        vcmplepd        k0, zmm0, zmm1
        kortestb        k0, k0
        setb    al

"example.RectSIMD(f64).isRectIntersecting":
        vmovapd ymm2, ymmword ptr [rsi]
        vmovapd ymm1, ymmword ptr [rdi]
        vperm2f128      ymm0, ymm2, ymm1, 32
        vperm2f128      ymm1, ymm1, ymm2, 49
        vcmplepd        k0, ymm0, ymm1
        kmovd   eax, k0
        sub     al, 15
        sete    al
```

AVX512 makes it so that there's no big penalty for double precision types, which is nice.

### Edits ###
- Reordered to use packed vectors without swizzling when possible.
- Eliminated redundant computations.
- Calling convention notice.
simd-rect article 2023-09-24 17:49:51 +00:00			`Title: Vectorized Axis-Aligned Rect Ops`
			`Brief: Small detour in making rect type and operations on it in SIMD semantics.`
			`Date: 1695570693`
			`Tags: Programming, Zig, Optimization`
			`CSS: /style.css`

			`### Code ###`
			Zig's `@shuffle` makes it rather arcane to look at, so be prepared.

			```zig
			`pub fn RectSIMD(comptime T: type) type {`
			`return struct {`
			`xyxy: @Vector(4, T),`

			`pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool {`
			`const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });`
			`const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });`
			`return @reduce(.And, q <= w);`
			`}`

			`pub fn isRectWithin(self: @This(), a: @This()) bool {`
update to simd-rect 2023-09-25 11:41:37 +00:00			`const q = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ 0, 1, 2, 3, -1, -2, -1, -2 });`
			`const w = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ -3, -4, -3, -4, 0, 1, 2, 3 });`
simd-rect article 2023-09-24 17:49:51 +00:00			`return @reduce(.And, q <= w);`
			`}`

			`// todo: Handle zero area cases?`
			`pub fn isRectIntersecting(self: @This(), a: @This()) bool {`
update to simd-rect 2023-09-25 11:41:37 +00:00			`const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, 1, -1, -2 });`
			`const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, -4, 2, 3 });`
simd-rect article 2023-09-24 17:49:51 +00:00			`return @reduce(.And, q <= w);`
			`}`
			`};`
			`}`
			```

			`### Assembly ###`
			`This is produced by godbolt, which apparently has AVX512 extensions, so, it's extremely compact.`

calling convention notice 2023-11-09 16:59:28 +00:00			`Note: Calling prelude and outro are omitted, with inlining you can expect it looking similarly.`
			`Zig calling convention is used, which is roughly equal to C's static marked procedure.`
simd-rect article 2023-09-24 17:49:51 +00:00
			`For 32bit floating point:`
			```asm
			`"example.RectSIMD(f32).isPointWithin":`
			`vmovaps xmm2, xmm0`
			`vmovapd xmm1, xmmword ptr [rdi]`
			`vunpcklpd xmm0, xmm1, xmm2`
			`vblendpd xmm1, xmm1, xmm2, 1`
			`vcmpleps k0, xmm0, xmm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`

			`"example.RectSIMD(f32).isRectWithin":`
			`vmovaps xmm2, xmmword ptr [rsi]`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovaps xmm0, xmm2`
			`vmovddup xmm1, qword ptr [rdi]`
			`vinsertf128 ymm0, ymm0, xmm1, 1`
			`vmovddup xmm3, qword ptr [rdi + 8]`
			`vmovaps xmm1, xmm3`
			`vinsertf128 ymm1, ymm1, xmm2, 1`
			`vcmpleps k0, ymm0, ymm1`
			`kortestb k0, k0`
simd-rect article 2023-09-24 17:49:51 +00:00			`setb al`

			`"example.RectSIMD(f32).isRectIntersecting":`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovapd xmm2, xmmword ptr [rsi]`
			`vmovapd xmm1, xmmword ptr [rdi]`
			`vunpcklpd xmm0, xmm2, xmm1`
			`vunpckhpd xmm1, xmm1, xmm2`
simd-rect article 2023-09-24 17:49:51 +00:00			`vcmpleps k0, xmm0, xmm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`
			```

			`For 32bit signed integers it fares amazing too:`
			```asm
			`"example.RectSIMD(i32).isPointWithin":`
			`vmovaps xmm1, xmm0`
			`vmovdqa xmm2, xmmword ptr [rdi]`
			`vpunpcklqdq xmm0, xmm2, xmm1`
			`vpblendd xmm1, xmm1, xmm2, 12`
			`vpcmpled k0, xmm0, xmm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`

			`"example.RectSIMD(i32).isRectWithin":`
			`vmovdqa xmm2, xmmword ptr [rsi]`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovaps xmm0, xmm2`
			`vpbroadcastq xmm1, qword ptr [rdi]`
			`vinserti128 ymm0, ymm0, xmm1, 1`
			`vpbroadcastq xmm3, qword ptr [rdi + 8]`
			`vmovaps xmm1, xmm3`
			`vinserti128 ymm1, ymm1, xmm2, 1`
			`vpcmpled k0, ymm0, ymm1`
			`kortestb k0, k0`
simd-rect article 2023-09-24 17:49:51 +00:00			`setb al`

			`"example.RectSIMD(i32).isRectIntersecting":`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovdqa xmm2, xmmword ptr [rsi]`
			`vmovdqa xmm1, xmmword ptr [rdi]`
			`vpunpcklqdq xmm0, xmm2, xmm1`
			`vpunpckhqdq xmm1, xmm1, xmm2`
simd-rect article 2023-09-24 17:49:51 +00:00			`vpcmpled k0, xmm0, xmm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`
			```

update to simd-rect 2023-09-25 11:41:37 +00:00			`64bit floating point:`
simd-rect article 2023-09-24 17:49:51 +00:00			```asm
			`"example.RectSIMD(f64).isPointWithin":`
			`vmovaps xmm3, xmm0`
			`vmovapd ymm1, ymmword ptr [rdi]`
			`vinsertf128 ymm0, ymm1, xmm3, 1`
			`vmovaps xmm2, xmm3`
			`vblendpd ymm1, ymm1, ymm2, 3`
			`vcmplepd k0, ymm0, ymm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`

			`"example.RectSIMD(f64).isRectWithin":`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovapd ymm2, ymmword ptr [rsi]`
			`vmovapd ymm1, ymmword ptr [rdi]`
			`vmovaps ymm0, ymm2`
			`vpermpd ymm3, ymm1, 68`
			`vinsertf64x4 zmm0, zmm0, ymm3, 1`
			`vpermpd ymm3, ymm1, 238`
			`vmovaps ymm1, ymm3`
			`vinsertf64x4 zmm1, zmm1, ymm2, 1`
simd-rect article 2023-09-24 17:49:51 +00:00			`vcmplepd k0, zmm0, zmm1`
update to simd-rect 2023-09-25 11:41:37 +00:00			`kortestb k0, k0`
simd-rect article 2023-09-24 17:49:51 +00:00			`setb al`

			`"example.RectSIMD(f64).isRectIntersecting":`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vmovapd ymm2, ymmword ptr [rsi]`
simd-rect article 2023-09-24 17:49:51 +00:00			`vmovapd ymm1, ymmword ptr [rdi]`
update to simd-rect 2023-09-25 11:41:37 +00:00			`vperm2f128 ymm0, ymm2, ymm1, 32`
			`vperm2f128 ymm1, ymm1, ymm2, 49`
simd-rect article 2023-09-24 17:49:51 +00:00			`vcmplepd k0, ymm0, ymm1`
			`kmovd eax, k0`
			`sub al, 15`
			`sete al`
			```

update to simd-rect 2023-09-25 11:41:37 +00:00			`AVX512 makes it so that there's no big penalty for double precision types, which is nice.`
simd-rect article 2023-09-24 17:49:51 +00:00
update to simd-rect 2023-09-25 11:41:37 +00:00			`### Edits ###`
			`- Reordered to use packed vectors without swizzling when possible.`
			`- Eliminated redundant computations.`
calling convention notice 2023-11-09 16:59:28 +00:00			`- Calling convention notice.`