From 77aabf61c52b0ff0ad43b9279626c396005cf107 Mon Sep 17 00:00:00 2001 From: veclav talica Date: Mon, 10 Jul 2023 18:23:11 +0500 Subject: [PATCH] hand optimized simplex page --- articles/2d-visibility/page.mmd | 2 +- articles/hand-opt-simplex-2d/page.mmd | 116 ++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 articles/hand-opt-simplex-2d/page.mmd diff --git a/articles/2d-visibility/page.mmd b/articles/2d-visibility/page.mmd index ace6222..7e43be9 100644 --- a/articles/2d-visibility/page.mmd +++ b/articles/2d-visibility/page.mmd @@ -8,7 +8,7 @@ CSS: /style.css Based on [Redblobgames' visibility article and Haxe reference implementation](https://www.redblobgames.com/articles/visibility) -Full usable code is [here](/articles/2d-visibility/Visiblity2D.gd.txt). +Full usable code is [here](/articles/2d-visibility/Visibility2D.gd.txt). ### Explanation ### diff --git a/articles/hand-opt-simplex-2d/page.mmd b/articles/hand-opt-simplex-2d/page.mmd new file mode 100644 index 0000000..bc194db --- /dev/null +++ b/articles/hand-opt-simplex-2d/page.mmd @@ -0,0 +1,116 @@ +Title: Hand Optimized Simplex 2D +Brief: Results of messing around with moving and hoisting stuff around. +Date: 1688995095 +Tags: Programming, GLSL, OpenGL +CSS: /style.css + +![](/articles/hand-opt-simplex-2d/noise.png) + +Based on the [webgl-noise repository](https://github.com/ashima/webgl-noise), which is based on [this paper](https://arxiv.org/pdf/1204.1461.pdf). + +Things tried: +* Rearranging operations to reduce register pressure. +* Calculating things as soon as possible. +* Hand inlining. + +### Results ### +For testing, a screen-space *1024x1024* texture is generated, resulting in *1048576* fragment invocations, +with 4-octave fractal Brownian motion.
+ +Hardware: `Mobile Intel® GM45 Express Chipset` + +Driver: `DRI Mesa 21.2.6` + +Original: +``` +Benchmark Iterations Min(ns) Max(ns) Variance Mean(ns) +---------------------------------------------------------------- +full(0) 100 124848395 510494575 1473830605484805 129237053 +``` + +Hand optimized: +``` +Benchmark Iterations Min(ns) Max(ns) Variance Mean(ns) +---------------------------------------------------------------- +full(0) 100 119354512 731397135 3705581696928414 125714847 +``` + +Mean difference is `3ms 522µs 206ns`, min difference is `5ms 493µs 883ns`. + +This suggests that the given driver is suboptimal in its optimizing capabilities, +and I imagine there might be GLSL compilers a lot worse than this. + +Some intermediate shader representation comes to mind as a means for +automatic GLSL source-level, profile-guided and other optimizations; +as well as polyfilling to different extensions, profiles and APIs. But welp. + +### Source ### + +```glsl +#version 120 + +// Author : Ian McEwan, Ashima Arts. +// Maintainer : stegu +// Lastmod : 20110822 (ijm) +// License : Copyright (C) 2011 Ashima Arts. All rights reserved. +// Distributed under the MIT License. See LICENSE file. +// https://github.com/ashima/webgl-noise +// https://github.com/stegu/webgl-noise +// + +#define MOD289(p_x) ((p_x) - floor((p_x) * (1.0 / 289.0)) * 289.0) +#define PERMUTE(p_result, p_x) { vec3 _temp = (((p_x) * 34.0) + 10.0) * (p_x); p_result = MOD289(_temp); } + +float simplex_noise_2d(in vec2 v) { + const vec4 C = vec4(0.211324865405187, // (3.0-sqrt(3.0))/6.0 + 0.366025403784439, // 0.5*(sqrt(3.0)-1.0) + -0.577350269189626, // -1.0 + 2.0 * C.x + 0.024390243902439); // 1.0 / 41.0 + // First corner + vec2 i = floor(v + dot(v, C.yy)); + vec2 x0 = v - i + dot(i, C.xx); + i = MOD289(i); // Avoid truncation effects in permutation + + // i1.x = step( x0.y, x0.x ); // x0.x > x0.y ? 1.0 : 0.0 + // i1.y = 1.0 - i1.x; + vec2 i1 = (x0.x > x0.y) ?
vec2(1.0, 0.0) : vec2(0.0, 1.0); + + // Other corners + // x0 = x0 - 0.0 + 0.0 * C.xx ; + // x1 = x0 - i1 + 1.0 * C.xx ; + // x2 = x0 - 1.0 + 2.0 * C.xx ; + vec4 x12 = x0.xyxy + C.xxzz - vec4(i1.xy, 0.0, 0.0); + + // Permutations + vec3 pp; + vec3 p = i.y + vec3(0.0, i1.y, 1.0); + PERMUTE(pp, p); + pp += i.x + vec3(0.0, i1.x, 1.0); + PERMUTE(p, pp); + p = fract(p * C.www); + + vec3 m = max(0.5 - vec3(dot(x0, x0), dot(x12.xy, x12.xy), dot(x12.zw, x12.zw)), 0.0); + + // Gradients: 41 points uniformly over a line, mapped onto a diamond. + // The ring size 17*17 = 289 is close to a multiple of 41 (41*7 = 287) + vec3 x = 2.0 * p - 1.0; + vec3 a0 = x - floor(x + 0.5); + vec3 h = abs(x) - 0.5; + + m = m * m; + m = m * m; + + // Normalise gradients implicitly by scaling m + // Approximation of: m *= inversesqrt( a0*a0 + h*h ); + m *= 1.79284291400159 - 0.85373472095314 * (a0 * a0 + h * h); + + // Compute final noise value at P + return 130.0 * dot(m, vec3(a0.x * x0.x + h.x * x0.y, a0.yz * x12.xz + h.yz * x12.yw)); +} + +``` + +### Possibilities +[NVidia's TEGRA guide](https://docs.nvidia.com/drive/drive_os_5.1.6.1L/nvvib_docs/DRIVE_OS_Linux_SDK_Development_Guide/baggage/tegra_gles2_performance.pdf) states that uniform access is often better than constants. +On our hardware it only degrades performance, but there's a possibility of other chipsets having preferences similar to TEGRA's. +The `C` constant is eligible for this.