Compare commits
No commits in common. "8e7391d4704f8a01f395547bc8132550f825cf89" and "b7727d6aa36cba355b68a25203e1711fa222ff29" have entirely different histories.
8e7391d470
...
b7727d6aa3
@ -1,183 +0,0 @@
|
|||||||
Title: Vectorized Axis-Aligned Rect Ops
|
|
||||||
Brief: Small detour in making rect type and operations on it in SIMD semantics.
|
|
||||||
Date: 1695570693
|
|
||||||
Tags: Programming, Zig, Optimization
|
|
||||||
CSS: /style.css
|
|
||||||
|
|
||||||
### Code ###
|
|
||||||
Zig's `@shuffle` makes it rather arcane to look at, so be prepared.
|
|
||||||
|
|
||||||
```zig
|
|
||||||
pub fn RectSIMD(comptime T: type) type {
|
|
||||||
return struct {
|
|
||||||
xyxy: @Vector(4, T),
|
|
||||||
|
|
||||||
pub fn isPointWithin(self: @This(), p: @Vector(2, T)) bool {
|
|
||||||
const q = @shuffle(T, p, self.xyxy, [4]i32{ -1, -2, 0, 1 });
|
|
||||||
const w = @shuffle(T, p, self.xyxy, [4]i32{ 0, 1, -3, -4 });
|
|
||||||
return @reduce(.And, q <= w);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn isRectWithin(self: @This(), a: @This()) bool {
|
|
||||||
const q = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ -1, -2, 0, 1, -1, -2, 2, 1, -1, -2, 0, 3, -1, -2, 2, 3 });
|
|
||||||
const w = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ 0, 1, -3, -4, 2, 1, -3, -4, 0, 3, -3, -4, 2, 3, -3, -4 });
|
|
||||||
return @reduce(.And, q <= w);
|
|
||||||
}
|
|
||||||
|
|
||||||
// todo: Handle zero area cases?
|
|
||||||
pub fn isRectIntersecting(self: @This(), a: @This()) bool {
|
|
||||||
const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, -1, -2, 1 });
|
|
||||||
const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, 2, 3, -4 });
|
|
||||||
return @reduce(.And, q <= w);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Assembly ###
|
|
||||||
This is produced by godbolt, which apparently has AVX512 extensions, so, it's extremely compact.
|
|
||||||
|
|
||||||
note: SysV prelude and outro are omitted, with inlining you can expect it looking similarly.
|
|
||||||
|
|
||||||
For 32bit floating point:
|
|
||||||
```asm
|
|
||||||
"example.RectSIMD(f32).isPointWithin":
|
|
||||||
vmovaps xmm2, xmm0
|
|
||||||
vmovapd xmm1, xmmword ptr [rdi]
|
|
||||||
vunpcklpd xmm0, xmm1, xmm2
|
|
||||||
vblendpd xmm1, xmm1, xmm2, 1
|
|
||||||
vcmpleps k0, xmm0, xmm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
|
|
||||||
"example.RectSIMD(f32).isRectWithin":
|
|
||||||
vmovaps xmm2, xmmword ptr [rsi]
|
|
||||||
vmovaps xmm0, xmmword ptr [rdi]
|
|
||||||
vmovaps xmm1, xmm2
|
|
||||||
vmovaps xmm3, xmm0
|
|
||||||
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0]
|
|
||||||
vmovaps zmm0, zmm3
|
|
||||||
vpermt2ps zmm0, zmm2, zmm1
|
|
||||||
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1]
|
|
||||||
vpermt2ps zmm1, zmm2, zmm3
|
|
||||||
vcmpleps k0, zmm0, zmm1
|
|
||||||
kortestw k0, k0
|
|
||||||
setb al
|
|
||||||
|
|
||||||
"example.RectSIMD(f32).isRectIntersecting":
|
|
||||||
vmovaps xmm1, xmmword ptr [rsi]
|
|
||||||
vmovaps xmm3, xmmword ptr [rdi]
|
|
||||||
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0]
|
|
||||||
vmovaps xmm0, xmm1
|
|
||||||
vpermt2ps xmm0, xmm2, xmm3
|
|
||||||
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
|
|
||||||
vpermt2ps xmm1, xmm2, xmm3
|
|
||||||
vcmpleps k0, xmm0, xmm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
```
|
|
||||||
|
|
||||||
For 32bit signed integers it fares amazing too:
|
|
||||||
```asm
|
|
||||||
"example.RectSIMD(i32).isPointWithin":
|
|
||||||
vmovaps xmm1, xmm0
|
|
||||||
vmovdqa xmm2, xmmword ptr [rdi]
|
|
||||||
vpunpcklqdq xmm0, xmm2, xmm1
|
|
||||||
vpblendd xmm1, xmm1, xmm2, 12
|
|
||||||
vpcmpled k0, xmm0, xmm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
|
|
||||||
"example.RectSIMD(i32).isRectWithin":
|
|
||||||
vmovdqa xmm2, xmmword ptr [rsi]
|
|
||||||
vmovdqa xmm0, xmmword ptr [rdi]
|
|
||||||
vmovaps xmm1, xmm2
|
|
||||||
vmovaps xmm3, xmm0
|
|
||||||
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0]
|
|
||||||
vmovaps zmm0, zmm3
|
|
||||||
vpermt2d zmm0, zmm2, zmm1
|
|
||||||
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1]
|
|
||||||
vpermt2d zmm1, zmm2, zmm3
|
|
||||||
vpcmpled k0, zmm0, zmm1
|
|
||||||
kortestw k0, k0
|
|
||||||
setb al
|
|
||||||
|
|
||||||
"example.RectSIMD(i32).isRectIntersecting":
|
|
||||||
vmovdqa xmm1, xmmword ptr [rsi]
|
|
||||||
vmovdqa xmm3, xmmword ptr [rdi]
|
|
||||||
vpbroadcastq xmm0, qword ptr [rsi]
|
|
||||||
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0]
|
|
||||||
vpermt2d xmm0, xmm2, xmm3
|
|
||||||
vpbroadcastq xmm3, qword ptr [rdi + 8]
|
|
||||||
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
|
|
||||||
vpermt2d xmm1, xmm2, xmm3
|
|
||||||
vpcmpled k0, xmm0, xmm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
```
|
|
||||||
|
|
||||||
64bit floating point, now even on AVX512 some ops are not done at once:
|
|
||||||
```asm
|
|
||||||
"example.RectSIMD(f64).isPointWithin":
|
|
||||||
vmovaps xmm3, xmm0
|
|
||||||
vmovapd ymm1, ymmword ptr [rdi]
|
|
||||||
vinsertf128 ymm0, ymm1, xmm3, 1
|
|
||||||
vmovaps xmm2, xmm3
|
|
||||||
vblendpd ymm1, ymm1, ymm2, 3
|
|
||||||
vcmplepd k0, ymm0, ymm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
pop rbp
|
|
||||||
|
|
||||||
"example.RectSIMD(f64).isRectWithin":
|
|
||||||
vmovapd ymm3, ymmword ptr [rsi]
|
|
||||||
vmovapd ymm5, ymmword ptr [rdi]
|
|
||||||
vblendpd ymm0, ymm5, ymm3, 8
|
|
||||||
vperm2f128 ymm4, ymm5, ymm3, 32
|
|
||||||
vblendpd ymm1, ymm4, ymm0, 10
|
|
||||||
vmovaps ymm0, ymm1
|
|
||||||
vblendpd ymm1, ymm5, ymm3, 12
|
|
||||||
vinsertf64x4 zmm0, zmm0, ymm1, 1
|
|
||||||
vblendpd ymm1, ymm5, ymm3, 4
|
|
||||||
vblendpd ymm2, ymm1, ymm4, 10
|
|
||||||
vmovaps ymm1, ymm4
|
|
||||||
vinsertf64x4 zmm2, zmm1, ymm2, 1
|
|
||||||
vperm2f128 ymm4, ymm3, ymm5, 49
|
|
||||||
vblendpd ymm1, ymm3, ymm5, 4
|
|
||||||
vblendpd ymm6, ymm1, ymm4, 10
|
|
||||||
vmovaps ymm1, ymm6
|
|
||||||
vinsertf64x4 zmm1, zmm1, ymm4, 1
|
|
||||||
vblendpd ymm6, ymm3, ymm5, 8
|
|
||||||
vblendpd ymm4, ymm4, ymm6, 10
|
|
||||||
vblendpd ymm5, ymm3, ymm5, 12
|
|
||||||
vmovaps ymm3, ymm5
|
|
||||||
vinsertf64x4 zmm3, zmm3, ymm4, 1
|
|
||||||
vcmplepd k1, zmm2, zmm3
|
|
||||||
vcmplepd k0, zmm0, zmm1
|
|
||||||
kunpckbw k0, k0, k1
|
|
||||||
kortestw k0, k0
|
|
||||||
setb al
|
|
||||||
|
|
||||||
"example.RectSIMD(f64).isRectIntersecting":
|
|
||||||
vmovapd ymm3, ymmword ptr [rsi]
|
|
||||||
vmovapd ymm1, ymmword ptr [rdi]
|
|
||||||
vmovdqa ymm2, ymmword ptr [rip + .LCPI3_0]
|
|
||||||
vmovaps ymm0, ymm3
|
|
||||||
vpermt2pd ymm0, ymm2, ymm1
|
|
||||||
vmovdqa ymm2, ymmword ptr [rip + .LCPI3_1]
|
|
||||||
vpermt2pd ymm1, ymm2, ymm3
|
|
||||||
vcmplepd k0, ymm0, ymm1
|
|
||||||
kmovd eax, k0
|
|
||||||
sub al, 15
|
|
||||||
sete al
|
|
||||||
```
|
|
||||||
|
|
||||||
So, selection of coordinate data type plays quite big role, especially when extensions are not provided.
|
|
||||||
|
|
||||||
Note that permutation masks are also supplied along side code which increase binary size.
|
|
||||||
With inlining it could be quite substantial if per object %rip relative addressing is used.
|
|
@ -101,6 +101,8 @@ exit;
|
|||||||
|
|
||||||
#ifdef ELF
|
#ifdef ELF
|
||||||
|
|
||||||
|
/* https://github.com/Jorengarenar/CMObALL/blob/master/cmoball.h */
|
||||||
|
|
||||||
/* https://man7.org/linux/man-pages/man2/exit.2.html */
|
/* https://man7.org/linux/man-pages/man2/exit.2.html */
|
||||||
#define SYS_EXIT(p_return_code) \
|
#define SYS_EXIT(p_return_code) \
|
||||||
{ \
|
{ \
|
||||||
|
Loading…
Reference in New Issue
Block a user