mjestecko/articles/vector-pi-rotation/page.mmd

Title:  Optimized Vector Rotation
Brief:  Specialized rotation methods over Pi and Pi/2 in 2D and 3D.
Date:   1699548646
Tags:   Programming, Zig, Optimization
CSS:    /style.css

Came up with some useful optimizations for 90 and 180 degree rotations while making a grid walker;
the implementations are given below, ripped straight from source, lol.

Compared to the generic cos/sin method of rotation it's orders of magnitude less work in many cases, especially if the glibc implementation is used.

Note: The given examples assume a coordinate system where Y grows downwards and X grows to the right.

## Two dimensions

```zig
/// Quarter turn (Pi/2) clockwise, for a coordinate system where Y grows
/// downwards and X grows to the right: maps (x, y) to (-y, x).
/// Only a swap and one negation are needed, no trigonometry.
pub fn rotateByHalfPiClockwise(self: Self) Self {
    const y = self.y();
    const x = self.x();
    return .{ .components = .{ -y, x } };
}

/// Quarter turn (Pi/2) counter-clockwise, for a coordinate system where Y
/// grows downwards and X grows to the right: maps (x, y) to (y, -x).
/// Inverse of `rotateByHalfPiClockwise`.
pub fn rotateByHalfPiCounterClockwise(self: Self) Self {
    const y = self.y();
    const x = self.x();
    return .{ .components = .{ y, -x } };
}

/// Half turn (Pi): maps (x, y) to (-x, -y).
/// The direction of rotation does not matter for a half turn.
pub fn rotateByPi(self: Self) Self {
    const x = self.x();
    const y = self.y();
    return .{ .components = .{ -x, -y } };
}

```

## Three dimensions

```zig
/// Quarter turn (Pi/2) clockwise around the given axis.
/// The component along `axis` is kept as is; the remaining two components
/// (a, b), taken in x, y, z order, are mapped to (-b, a), which is the same
/// rule as the two-dimensional clockwise quarter turn.
pub fn rotateByHalfPiClockwiseAroundAxis(self: Self, axis: Axis3) Self {
    const x = self.x();
    const y = self.y();
    const z = self.z();
    return .{ .components = switch (axis) {
        .x => .{ x, -z, y },
        .y => .{ -z, y, x },
        .z => .{ -y, x, z },
    } };
}

/// Quarter turn (Pi/2) counter-clockwise around the given axis.
/// The component along `axis` is kept as is; the remaining two components
/// (a, b), taken in x, y, z order, are mapped to (b, -a), which is the same
/// rule as the two-dimensional counter-clockwise quarter turn.
pub fn rotateByHalfPiCounterClockwiseAroundAxis(self: Self, axis: Axis3) Self {
    const x = self.x();
    const y = self.y();
    const z = self.z();
    return .{ .components = switch (axis) {
        .x => .{ x, z, -y },
        .y => .{ z, y, -x },
        .z => .{ y, -x, z },
    } };
}

/// Half turn (Pi) around the given axis.
/// The component along `axis` is kept as is and the other two components
/// are negated; the direction of rotation does not matter for a half turn.
pub fn rotateByPiAroundAxis(self: Self, axis: Axis3) Self {
    return .{ .components = switch (axis) {
        // Around X: x is preserved, y and z flip sign.
        .x => .{ self.x(), -self.y(), -self.z() },
        // Around Y: y is preserved, x and z flip sign.
        .y => .{ -self.x(), self.y(), -self.z() },
        // Around Z: z is preserved, x and y flip sign.
        .z => .{ -self.x(), -self.y(), self.z() },
    } };
}

```

## Generated amd64 assembly
Note: Procedure prologue/epilogue is omitted. Zig's calling convention is used, which in effect is roughly equivalent to a C function marked `static`.

Note: It's for vectors stored packed for use in SSE, array/separate scalar passing produces worse result, at least when not inlined.

### rotateByHalfPiClockwise
Notice how it's one instruction longer than the counter-clockwise case,
so the choice of coordinate system affects the cost of rotating in a particular direction.

```asm
        vmovlpd qword ptr [rsp], xmm0
        vmovshdup       xmm1, xmm0
        vpbroadcastd    xmm2, dword ptr [rip + .LCPI2_0]
        vpxor   xmm1, xmm1, xmm2
        vbroadcastss    xmm0, xmm0
        vblendps        xmm0, xmm0, xmm1, 1
```
### rotateByHalfPiCounterClockwise

```asm
        vmovlpd qword ptr [rsp], xmm0
        vpbroadcastd    xmm1, dword ptr [rip + .LCPI1_0]
        vpxor   xmm1, xmm0, xmm1
        vmovshdup       xmm0, xmm0
        vinsertps       xmm0, xmm0, xmm1, 16
```

### rotateByPi
```asm
        vmovlpd qword ptr [rsp], xmm0
        vpermilps       xmm0, xmm0, 212
        vpbroadcastd    xmm1, dword ptr [rip + .LCPI3_0]
        vpxor   xmm0, xmm0, xmm1
```

### rotateByHalfPiClockwiseAroundAxis (X)
```asm
        sub     rsp, 24
        vmovq   qword ptr [rsp], xmm0
        vpermilpd       xmm1, xmm0, 1
        vmovaps xmm2, xmm1
        vmovss  dword ptr [rsp + 8], xmm2
        vpbroadcastd    xmm2, dword ptr [rip + .LCPI4_0]
        vpxor   xmm1, xmm1, xmm2
        vpermilps       xmm0, xmm0, 212
        vinsertps       xmm0, xmm0, xmm1, 16
        add     rsp, 24
```

### rotateByHalfPiCounterClockwiseAroundAxis (X)
Again, one instruction shorter.

```asm
        sub     rsp, 24
        vextractps      dword ptr [rsp + 8], xmm0, 2
        vmovq   qword ptr [rsp], xmm0
        vmovshdup       xmm1, xmm0
        vpbroadcastd    xmm2, dword ptr [rip + .LCPI5_0]
        vpxor   xmm1, xmm1, xmm2
        vpermilps       xmm0, xmm0, 232
        vinsertps       xmm0, xmm0, xmm1, 32
        add     rsp, 24
```

### rotateByPiAroundAxis (X)
Now it's more work.

```asm
        sub     rsp, 24
        vmovq   qword ptr [rsp], xmm0
        vpermilpd       xmm1, xmm0, 1
        vmovaps xmm2, xmm1
        vmovss  dword ptr [rsp + 8], xmm2
        vmovshdup       xmm2, xmm0
        vbroadcastss    xmm3, dword ptr [rip + .LCPI6_0]
        vpxor   xmm2, xmm2, xmm3
        vpxor   xmm1, xmm1, xmm3
        vinsertps       xmm0, xmm0, xmm2, 16
        vinsertps       xmm0, xmm0, xmm1, 32
        add     rsp, 24
```
vector-pi-rotation article 2023-11-09 16:59:42 +00:00			`Title: Optimized Vector Rotation`
			`Brief: Specialized rotation methods over Pi and Pi/2 in 2D and 3D.`
			`Date: 1699548646`
			`Tags: Programming, Zig, Optimization`
			`CSS: /style.css`

			`Came up with some useful optimization for 90 and 180 degree rotations while making a grid walker,`
			`below implementations are given, ripped straight from source, lol.`

			`Compared to the generic cos/sin method of rotation it's orders of magnitude less work in many cases, especially if the glibc implementation is used.`

			`Note: Given example assumes coordinate system where Y grows downwards and X to the right.`

			`## Two dimensions`

			```zig
			`pub fn rotateByHalfPiClockwise(self: Self) Self {`
			`return .{ .components = .{ -self.y(), self.x() } };`
			`}`

			`pub fn rotateByHalfPiCounterClockwise(self: Self) Self {`
			`return .{ .components = .{ self.y(), -self.x() } };`
			`}`

			`pub fn rotateByPi(self: Self) Self {`
			`return .{ .components = .{ -self.x(), -self.y() } };`
			`}`

			```

			`## Three dimensions`

			```zig
			`pub fn rotateByHalfPiClockwiseAroundAxis(self: Self, axis: Axis3) Self {`
			`return .{ .components = switch (axis) {`
			`.x => .{ self.x(), -self.z(), self.y() },`
			`.y => .{ -self.z(), self.y(), self.x() },`
			`.z => .{ -self.y(), self.x(), self.z() },`
			`} };`
			`}`

			`pub fn rotateByHalfPiCounterClockwiseAroundAxis(self: Self, axis: Axis3) Self {`
			`return .{ .components = switch (axis) {`
			`.x => .{ self.x(), self.z(), -self.y() },`
			`.y => .{ self.z(), self.y(), -self.x() },`
			`.z => .{ self.y(), -self.x(), self.z() },`
			`} };`
			`}`

			`pub fn rotateByPiAroundAxis(self: Self, axis: Axis3) Self {`
			`return .{ .components = switch (axis) {`
			`.x => .{ self.x(), -self.y(), -self.z() },`
			`.y => .{ -self.x(), self.y(), -self.z() },`
			`.z => .{ -self.x(), -self.y(), self.z() },`
			`} };`
			`}`

			```

			`## Generated amd64 assembly`
			`Note: Procedure prologue/epilogue is omitted. Zig's calling convention is used, which in effect is roughly equivalent to a C function marked static.`

			`Note: It's for vectors stored packed for use in SSE, array/separate scalar passing produces worse result, at least when not inlined.`

			`### rotateByHalfPiClockwise`
			`Notice how it's one instruction longer than the counter-clockwise case,`
			`so the choice of coordinate system affects the cost of rotating in a particular direction.`

			```asm
			`vmovlpd qword ptr [rsp], xmm0`
			`vmovshdup xmm1, xmm0`
			`vpbroadcastd xmm2, dword ptr [rip + .LCPI2_0]`
			`vpxor xmm1, xmm1, xmm2`
			`vbroadcastss xmm0, xmm0`
			`vblendps xmm0, xmm0, xmm1, 1`
			```
			`### rotateByHalfPiCounterClockwise`

			```asm
			`vmovlpd qword ptr [rsp], xmm0`
			`vpbroadcastd xmm1, dword ptr [rip + .LCPI1_0]`
			`vpxor xmm1, xmm0, xmm1`
			`vmovshdup xmm0, xmm0`
			`vinsertps xmm0, xmm0, xmm1, 16`
			```

			`### rotateByPi`
			```asm
			`vmovlpd qword ptr [rsp], xmm0`
			`vpermilps xmm0, xmm0, 212`
			`vpbroadcastd xmm1, dword ptr [rip + .LCPI3_0]`
			`vpxor xmm0, xmm0, xmm1`
			```

			`### rotateByHalfPiClockwiseAroundAxis (X)`
			```asm
			`sub rsp, 24`
			`vmovq qword ptr [rsp], xmm0`
			`vpermilpd xmm1, xmm0, 1`
			`vmovaps xmm2, xmm1`
			`vmovss dword ptr [rsp + 8], xmm2`
			`vpbroadcastd xmm2, dword ptr [rip + .LCPI4_0]`
			`vpxor xmm1, xmm1, xmm2`
			`vpermilps xmm0, xmm0, 212`
			`vinsertps xmm0, xmm0, xmm1, 16`
			`add rsp, 24`
			```

			`### rotateByHalfPiCounterClockwiseAroundAxis (X)`
			`Again, one instruction shorter.`

			```asm
			`sub rsp, 24`
			`vextractps dword ptr [rsp + 8], xmm0, 2`
			`vmovq qword ptr [rsp], xmm0`
			`vmovshdup xmm1, xmm0`
			`vpbroadcastd xmm2, dword ptr [rip + .LCPI5_0]`
			`vpxor xmm1, xmm1, xmm2`
			`vpermilps xmm0, xmm0, 232`
			`vinsertps xmm0, xmm0, xmm1, 32`
			`add rsp, 24`
			```

			`### rotateByPiAroundAxis (X)`
			`Now it's more work.`

			```asm
			`sub rsp, 24`
			`vmovq qword ptr [rsp], xmm0`
			`vpermilpd xmm1, xmm0, 1`
			`vmovaps xmm2, xmm1`
			`vmovss dword ptr [rsp + 8], xmm2`
			`vmovshdup xmm2, xmm0`
			`vbroadcastss xmm3, dword ptr [rip + .LCPI6_0]`
			`vpxor xmm2, xmm2, xmm3`
			`vpxor xmm1, xmm1, xmm3`
			`vinsertps xmm0, xmm0, xmm2, 16`
			`vinsertps xmm0, xmm0, xmm1, 32`
			`add rsp, 24`
			```