update to simd-rect

This commit is contained in:
veclav talica 2023-09-25 16:41:37 +05:00
parent 8e7391d470
commit cd8d819ed9

View File

@ -19,15 +19,15 @@ pub fn RectSIMD(comptime T: type) type {
} }
pub fn isRectWithin(self: @This(), a: @This()) bool { pub fn isRectWithin(self: @This(), a: @This()) bool {
const q = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ -1, -2, 0, 1, -1, -2, 2, 1, -1, -2, 0, 3, -1, -2, 2, 3 }); const q = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ 0, 1, 2, 3, -1, -2, -1, -2 });
const w = @shuffle(T, a.xyxy, self.xyxy, [16]i32{ 0, 1, -3, -4, 2, 1, -3, -4, 0, 3, -3, -4, 2, 3, -3, -4 }); const w = @shuffle(T, a.xyxy, self.xyxy, [8]i32{ -3, -4, -3, -4, 0, 1, 2, 3 });
return @reduce(.And, q <= w); return @reduce(.And, q <= w);
} }
// todo: Handle zero area cases? // todo: Handle zero area cases?
pub fn isRectIntersecting(self: @This(), a: @This()) bool { pub fn isRectIntersecting(self: @This(), a: @This()) bool {
const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, -1, -2, 1 }); const q = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ 0, 1, -1, -2 });
const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, 2, 3, -4 }); const w = @shuffle(T, a.xyxy, self.xyxy, [4]i32{ -3, -4, 2, 3 });
return @reduce(.And, q <= w); return @reduce(.And, q <= w);
} }
}; };
@ -53,26 +53,21 @@ For 32bit floating point:
"example.RectSIMD(f32).isRectWithin": "example.RectSIMD(f32).isRectWithin":
vmovaps xmm2, xmmword ptr [rsi] vmovaps xmm2, xmmword ptr [rsi]
vmovaps xmm0, xmmword ptr [rdi] vmovaps xmm0, xmm2
vmovaps xmm1, xmm2 vmovddup xmm1, qword ptr [rdi]
vmovaps xmm3, xmm0 vinsertf128 ymm0, ymm0, xmm1, 1
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0] vmovddup xmm3, qword ptr [rdi + 8]
vmovaps zmm0, zmm3 vmovaps xmm1, xmm3
vpermt2ps zmm0, zmm2, zmm1 vinsertf128 ymm1, ymm1, xmm2, 1
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1] vcmpleps k0, ymm0, ymm1
vpermt2ps zmm1, zmm2, zmm3 kortestb k0, k0
vcmpleps k0, zmm0, zmm1
kortestw k0, k0
setb al setb al
"example.RectSIMD(f32).isRectIntersecting": "example.RectSIMD(f32).isRectIntersecting":
vmovaps xmm1, xmmword ptr [rsi] vmovapd xmm2, xmmword ptr [rsi]
vmovaps xmm3, xmmword ptr [rdi] vmovapd xmm1, xmmword ptr [rdi]
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0] vunpcklpd xmm0, xmm2, xmm1
vmovaps xmm0, xmm1 vunpckhpd xmm1, xmm1, xmm2
vpermt2ps xmm0, xmm2, xmm3
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
vpermt2ps xmm1, xmm2, xmm3
vcmpleps k0, xmm0, xmm1 vcmpleps k0, xmm0, xmm1
kmovd eax, k0 kmovd eax, k0
sub al, 15 sub al, 15
@ -93,34 +88,28 @@ For 32bit signed integers it fares amazing too:
"example.RectSIMD(i32).isRectWithin": "example.RectSIMD(i32).isRectWithin":
vmovdqa xmm2, xmmword ptr [rsi] vmovdqa xmm2, xmmword ptr [rsi]
vmovdqa xmm0, xmmword ptr [rdi] vmovaps xmm0, xmm2
vmovaps xmm1, xmm2 vpbroadcastq xmm1, qword ptr [rdi]
vmovaps xmm3, xmm0 vinserti128 ymm0, ymm0, xmm1, 1
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_0] vpbroadcastq xmm3, qword ptr [rdi + 8]
vmovaps zmm0, zmm3 vmovaps xmm1, xmm3
vpermt2d zmm0, zmm2, zmm1 vinserti128 ymm1, ymm1, xmm2, 1
vmovdqa64 zmm2, zmmword ptr [rip + .LCPI2_1] vpcmpled k0, ymm0, ymm1
vpermt2d zmm1, zmm2, zmm3 kortestb k0, k0
vpcmpled k0, zmm0, zmm1
kortestw k0, k0
setb al setb al
"example.RectSIMD(i32).isRectIntersecting": "example.RectSIMD(i32).isRectIntersecting":
vmovdqa xmm1, xmmword ptr [rsi] vmovdqa xmm2, xmmword ptr [rsi]
vmovdqa xmm3, xmmword ptr [rdi] vmovdqa xmm1, xmmword ptr [rdi]
vpbroadcastq xmm0, qword ptr [rsi] vpunpcklqdq xmm0, xmm2, xmm1
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_0] vpunpckhqdq xmm1, xmm1, xmm2
vpermt2d xmm0, xmm2, xmm3
vpbroadcastq xmm3, qword ptr [rdi + 8]
vmovdqa xmm2, xmmword ptr [rip + .LCPI3_1]
vpermt2d xmm1, xmm2, xmm3
vpcmpled k0, xmm0, xmm1 vpcmpled k0, xmm0, xmm1
kmovd eax, k0 kmovd eax, k0
sub al, 15 sub al, 15
sete al sete al
``` ```
64bit floating point; now, even on AVX512, some ops are not done at once: 64bit floating point:
```asm ```asm
"example.RectSIMD(f64).isPointWithin": "example.RectSIMD(f64).isPointWithin":
vmovaps xmm3, xmm0 vmovaps xmm3, xmm0
@ -135,49 +124,34 @@ For 32bit signed integers it fares amazing too:
pop rbp pop rbp
"example.RectSIMD(f64).isRectWithin": "example.RectSIMD(f64).isRectWithin":
vmovapd ymm3, ymmword ptr [rsi] vmovapd ymm2, ymmword ptr [rsi]
vmovapd ymm5, ymmword ptr [rdi] vmovapd ymm1, ymmword ptr [rdi]
vblendpd ymm0, ymm5, ymm3, 8 vmovaps ymm0, ymm2
vperm2f128 ymm4, ymm5, ymm3, 32 vpermpd ymm3, ymm1, 68
vblendpd ymm1, ymm4, ymm0, 10 vinsertf64x4 zmm0, zmm0, ymm3, 1
vmovaps ymm0, ymm1 vpermpd ymm3, ymm1, 238
vblendpd ymm1, ymm5, ymm3, 12 vmovaps ymm1, ymm3
vinsertf64x4 zmm0, zmm0, ymm1, 1 vinsertf64x4 zmm1, zmm1, ymm2, 1
vblendpd ymm1, ymm5, ymm3, 4
vblendpd ymm2, ymm1, ymm4, 10
vmovaps ymm1, ymm4
vinsertf64x4 zmm2, zmm1, ymm2, 1
vperm2f128 ymm4, ymm3, ymm5, 49
vblendpd ymm1, ymm3, ymm5, 4
vblendpd ymm6, ymm1, ymm4, 10
vmovaps ymm1, ymm6
vinsertf64x4 zmm1, zmm1, ymm4, 1
vblendpd ymm6, ymm3, ymm5, 8
vblendpd ymm4, ymm4, ymm6, 10
vblendpd ymm5, ymm3, ymm5, 12
vmovaps ymm3, ymm5
vinsertf64x4 zmm3, zmm3, ymm4, 1
vcmplepd k1, zmm2, zmm3
vcmplepd k0, zmm0, zmm1 vcmplepd k0, zmm0, zmm1
kunpckbw k0, k0, k1 kortestb k0, k0
kortestw k0, k0
setb al setb al
"example.RectSIMD(f64).isRectIntersecting": "example.RectSIMD(f64).isRectIntersecting":
vmovapd ymm3, ymmword ptr [rsi] vmovapd ymm2, ymmword ptr [rsi]
vmovapd ymm1, ymmword ptr [rdi] vmovapd ymm1, ymmword ptr [rdi]
vmovdqa ymm2, ymmword ptr [rip + .LCPI3_0] vperm2f128 ymm0, ymm2, ymm1, 32
vmovaps ymm0, ymm3 vperm2f128 ymm1, ymm1, ymm2, 49
vpermt2pd ymm0, ymm2, ymm1
vmovdqa ymm2, ymmword ptr [rip + .LCPI3_1]
vpermt2pd ymm1, ymm2, ymm3
vcmplepd k0, ymm0, ymm1 vcmplepd k0, ymm0, ymm1
kmovd eax, k0 kmovd eax, k0
sub al, 15 sub al, 15
sete al sete al
``` ```
So, the selection of the coordinate data type plays quite a big role, especially when extensions are not provided. AVX512 makes it so that there's no big penalty for double precision types, which is nice.
Note that permutation masks are also supplied alongside the code, which increases binary size. Note that permutation masks are also supplied alongside the code, which increases binary size.
With inlining it could be quite substantial if per-object %rip-relative addressing is used. With inlining it could be quite substantial if per-object %rip-relative addressing is used.
### Edits ###
- Reordered to use packed vectors without swizzling when possible
- Eliminated redundant computations.