158 lines
5.0 KiB
Plaintext
158 lines
5.0 KiB
Plaintext
Title: Vectorized Axis-Aligned Rect Ops
|
|
Brief: Small detour in making rect type and operations on it in SIMD semantics.
|
|
Date: 1695570693
|
|
Tags: Programming, Zig, Optimization
|
|
CSS: /style.css
|
|
|
|
### Code ###
|
|
Zig's `@shuffle` makes it rather arcane to look at, so be prepared.
|
|
|
|
```zig
|
|
pub fn RectSIMD(comptime T: type) type {
|
|
return struct {
|
|
xyxy: @Vector(4, T),
|
|
|
|
pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool {
|
|
const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });
|
|
const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });
|
|
return @reduce(.And, q <= w);
|
|
}
|
|
|
|
pub fn isRectWithin(self: @This(), a: @This()) bool {
|
|
const q = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ 0, 1, 2, 3, -1, -2, -1, -2 });
|
|
const w = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ -3, -4, -3, -4, 0, 1, 2, 3 });
|
|
return @reduce(.And, q <= w);
|
|
}
|
|
|
|
// todo: Handle zero area cases?
|
|
pub fn isRectIntersecting(self: @This(), a: @This()) bool {
|
|
const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, 1, -1, -2 });
|
|
const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, -4, 2, 3 });
|
|
return @reduce(.And, q <= w);
|
|
}
|
|
};
|
|
}
|
|
```
|
|
|
|
### Assembly ###
|
|
This is produced by godbolt, which apparently has AVX512 extensions, so, it's extremely compact.
|
|
|
|
note: SysV prelude and outro are omitted, with inlining you can expect it looking similarly.
|
|
|
|
For 32bit floating point:
|
|
```asm
|
|
"example.RectSIMD(f32).isPointWithin":
|
|
vmovaps xmm2, xmm0
|
|
vmovapd xmm1, xmmword ptr [rdi]
|
|
vunpcklpd xmm0, xmm1, xmm2
|
|
vblendpd xmm1, xmm1, xmm2, 1
|
|
vcmpleps k0, xmm0, xmm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
|
|
"example.RectSIMD(f32).isRectWithin":
|
|
vmovaps xmm2, xmmword ptr [rsi]
|
|
vmovaps xmm0, xmm2
|
|
vmovddup xmm1, qword ptr [rdi]
|
|
vinsertf128 ymm0, ymm0, xmm1, 1
|
|
vmovddup xmm3, qword ptr [rdi + 8]
|
|
vmovaps xmm1, xmm3
|
|
vinsertf128 ymm1, ymm1, xmm2, 1
|
|
vcmpleps k0, ymm0, ymm1
|
|
kortestb k0, k0
|
|
setb al
|
|
|
|
"example.RectSIMD(f32).isRectIntersecting":
|
|
vmovapd xmm2, xmmword ptr [rsi]
|
|
vmovapd xmm1, xmmword ptr [rdi]
|
|
vunpcklpd xmm0, xmm2, xmm1
|
|
vunpckhpd xmm1, xmm1, xmm2
|
|
vcmpleps k0, xmm0, xmm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
```
|
|
|
|
For 32bit signed integers it fares amazing too:
|
|
```asm
|
|
"example.RectSIMD(i32).isPointWithin":
|
|
vmovaps xmm1, xmm0
|
|
vmovdqa xmm2, xmmword ptr [rdi]
|
|
vpunpcklqdq xmm0, xmm2, xmm1
|
|
vpblendd xmm1, xmm1, xmm2, 12
|
|
vpcmpled k0, xmm0, xmm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
|
|
"example.RectSIMD(i32).isRectWithin":
|
|
vmovdqa xmm2, xmmword ptr [rsi]
|
|
vmovaps xmm0, xmm2
|
|
vpbroadcastq xmm1, qword ptr [rdi]
|
|
vinserti128 ymm0, ymm0, xmm1, 1
|
|
vpbroadcastq xmm3, qword ptr [rdi + 8]
|
|
vmovaps xmm1, xmm3
|
|
vinserti128 ymm1, ymm1, xmm2, 1
|
|
vpcmpled k0, ymm0, ymm1
|
|
kortestb k0, k0
|
|
setb al
|
|
|
|
"example.RectSIMD(i32).isRectIntersecting":
|
|
vmovdqa xmm2, xmmword ptr [rsi]
|
|
vmovdqa xmm1, xmmword ptr [rdi]
|
|
vpunpcklqdq xmm0, xmm2, xmm1
|
|
vpunpckhqdq xmm1, xmm1, xmm2
|
|
vpcmpled k0, xmm0, xmm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
```
|
|
|
|
64bit floating point:
|
|
```asm
|
|
"example.RectSIMD(f64).isPointWithin":
|
|
vmovaps xmm3, xmm0
|
|
vmovapd ymm1, ymmword ptr [rdi]
|
|
vinsertf128 ymm0, ymm1, xmm3, 1
|
|
vmovaps xmm2, xmm3
|
|
vblendpd ymm1, ymm1, ymm2, 3
|
|
vcmplepd k0, ymm0, ymm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
pop rbp
|
|
|
|
"example.RectSIMD(f64).isRectWithin":
|
|
vmovapd ymm2, ymmword ptr [rsi]
|
|
vmovapd ymm1, ymmword ptr [rdi]
|
|
vmovaps ymm0, ymm2
|
|
vpermpd ymm3, ymm1, 68
|
|
vinsertf64x4 zmm0, zmm0, ymm3, 1
|
|
vpermpd ymm3, ymm1, 238
|
|
vmovaps ymm1, ymm3
|
|
vinsertf64x4 zmm1, zmm1, ymm2, 1
|
|
vcmplepd k0, zmm0, zmm1
|
|
kortestb k0, k0
|
|
setb al
|
|
|
|
"example.RectSIMD(f64).isRectIntersecting":
|
|
vmovapd ymm2, ymmword ptr [rsi]
|
|
vmovapd ymm1, ymmword ptr [rdi]
|
|
vperm2f128 ymm0, ymm2, ymm1, 32
|
|
vperm2f128 ymm1, ymm1, ymm2, 49
|
|
vcmplepd k0, ymm0, ymm1
|
|
kmovd eax, k0
|
|
sub al, 15
|
|
sete al
|
|
```
|
|
|
|
AVX512 makes it so that there's no big penalty for double precision types, which is nice.
|
|
|
|
Note that permutation masks are also supplied along side code which increase binary size.
|
|
With inlining it could be quite substantial if per object %rip relative addressing is used.
|
|
|
|
### Edits ###
|
|
- Reordered to use packed vectors without swizzling when possible
|
|
- Eliminated redundant computations.
|