mjestecko/articles/simd-rect/page.mmd

157 lines
4.9 KiB
Plaintext
Raw Normal View History

2023-09-24 17:49:51 +00:00
Title: Vectorized Axis-Aligned Rect Ops
Brief: Small detour in making rect type and operations on it in SIMD semantics.
Date: 1695570693
Tags: Programming, Zig, Optimization
CSS: /style.css
### Code ###
Zig's `@shuffle` makes it rather arcane to look at, so be prepared.
```zig
pub fn RectSIMD(comptime T: type) type {
return struct {
xyxy: @Vector(4, T),
pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool {
const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });
const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });
return @reduce(.And, q <= w);
}
pub fn isRectWithin(self: @This(), a: @This()) bool {
2023-09-25 11:41:37 +00:00
const q = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ 0, 1, 2, 3, -1, -2, -1, -2 });
const w = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ -3, -4, -3, -4, 0, 1, 2, 3 });
2023-09-24 17:49:51 +00:00
return @reduce(.And, q <= w);
}
// todo: Handle zero area cases?
pub fn isRectIntersecting(self: @This(), a: @This()) bool {
2023-09-25 11:41:37 +00:00
const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, 1, -1, -2 });
const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, -4, 2, 3 });
2023-09-24 17:49:51 +00:00
return @reduce(.And, q <= w);
}
};
}
```
### Assembly ###
This is produced by godbolt, which apparently has AVX512 extensions, so, it's extremely compact.
2023-11-09 16:59:28 +00:00
Note: Calling prelude and outro are omitted, with inlining you can expect it looking similarly.
Zig calling convention is used, which is roughly equal to C's static marked procedure.
2023-09-24 17:49:51 +00:00
For 32bit floating point:
```asm
"example.RectSIMD(f32).isPointWithin":
vmovaps xmm2, xmm0
vmovapd xmm1, xmmword ptr [rdi]
vunpcklpd xmm0, xmm1, xmm2
vblendpd xmm1, xmm1, xmm2, 1
vcmpleps k0, xmm0, xmm1
kmovd eax, k0
sub al, 15
sete al
"example.RectSIMD(f32).isRectWithin":
vmovaps xmm2, xmmword ptr [rsi]
2023-09-25 11:41:37 +00:00
vmovaps xmm0, xmm2
vmovddup xmm1, qword ptr [rdi]
vinsertf128 ymm0, ymm0, xmm1, 1
vmovddup xmm3, qword ptr [rdi + 8]
vmovaps xmm1, xmm3
vinsertf128 ymm1, ymm1, xmm2, 1
vcmpleps k0, ymm0, ymm1
kortestb k0, k0
2023-09-24 17:49:51 +00:00
setb al
"example.RectSIMD(f32).isRectIntersecting":
2023-09-25 11:41:37 +00:00
vmovapd xmm2, xmmword ptr [rsi]
vmovapd xmm1, xmmword ptr [rdi]
vunpcklpd xmm0, xmm2, xmm1
vunpckhpd xmm1, xmm1, xmm2
2023-09-24 17:49:51 +00:00
vcmpleps k0, xmm0, xmm1
kmovd eax, k0
sub al, 15
sete al
```
For 32bit signed integers it fares amazing too:
```asm
"example.RectSIMD(i32).isPointWithin":
vmovaps xmm1, xmm0
vmovdqa xmm2, xmmword ptr [rdi]
vpunpcklqdq xmm0, xmm2, xmm1
vpblendd xmm1, xmm1, xmm2, 12
vpcmpled k0, xmm0, xmm1
kmovd eax, k0
sub al, 15
sete al
"example.RectSIMD(i32).isRectWithin":
vmovdqa xmm2, xmmword ptr [rsi]
2023-09-25 11:41:37 +00:00
vmovaps xmm0, xmm2
vpbroadcastq xmm1, qword ptr [rdi]
vinserti128 ymm0, ymm0, xmm1, 1
vpbroadcastq xmm3, qword ptr [rdi + 8]
vmovaps xmm1, xmm3
vinserti128 ymm1, ymm1, xmm2, 1
vpcmpled k0, ymm0, ymm1
kortestb k0, k0
2023-09-24 17:49:51 +00:00
setb al
"example.RectSIMD(i32).isRectIntersecting":
2023-09-25 11:41:37 +00:00
vmovdqa xmm2, xmmword ptr [rsi]
vmovdqa xmm1, xmmword ptr [rdi]
vpunpcklqdq xmm0, xmm2, xmm1
vpunpckhqdq xmm1, xmm1, xmm2
2023-09-24 17:49:51 +00:00
vpcmpled k0, xmm0, xmm1
kmovd eax, k0
sub al, 15
sete al
```
2023-09-25 11:41:37 +00:00
64bit floating point:
2023-09-24 17:49:51 +00:00
```asm
"example.RectSIMD(f64).isPointWithin":
vmovaps xmm3, xmm0
vmovapd ymm1, ymmword ptr [rdi]
vinsertf128 ymm0, ymm1, xmm3, 1
vmovaps xmm2, xmm3
vblendpd ymm1, ymm1, ymm2, 3
vcmplepd k0, ymm0, ymm1
kmovd eax, k0
sub al, 15
sete al
"example.RectSIMD(f64).isRectWithin":
2023-09-25 11:41:37 +00:00
vmovapd ymm2, ymmword ptr [rsi]
vmovapd ymm1, ymmword ptr [rdi]
vmovaps ymm0, ymm2
vpermpd ymm3, ymm1, 68
vinsertf64x4 zmm0, zmm0, ymm3, 1
vpermpd ymm3, ymm1, 238
vmovaps ymm1, ymm3
vinsertf64x4 zmm1, zmm1, ymm2, 1
2023-09-24 17:49:51 +00:00
vcmplepd k0, zmm0, zmm1
2023-09-25 11:41:37 +00:00
kortestb k0, k0
2023-09-24 17:49:51 +00:00
setb al
"example.RectSIMD(f64).isRectIntersecting":
2023-09-25 11:41:37 +00:00
vmovapd ymm2, ymmword ptr [rsi]
2023-09-24 17:49:51 +00:00
vmovapd ymm1, ymmword ptr [rdi]
2023-09-25 11:41:37 +00:00
vperm2f128 ymm0, ymm2, ymm1, 32
vperm2f128 ymm1, ymm1, ymm2, 49
2023-09-24 17:49:51 +00:00
vcmplepd k0, ymm0, ymm1
kmovd eax, k0
sub al, 15
sete al
```
2023-09-25 11:41:37 +00:00
AVX512 makes it so that there's no big penalty for double precision types, which is nice.
2023-09-24 17:49:51 +00:00
2023-09-25 11:41:37 +00:00
### Edits ###
- Reordered to use packed vectors without swizzling when possible.
- Eliminated redundant computations.
2023-11-09 16:59:28 +00:00
- Calling convention notice.