117 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			117 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| Title:  Hand Optimized Simplex 2D
 | |
| Brief:  Results of messing around with moving and hoisting stuff around.
 | |
| Date:   1688995095
 | |
| Tags:   Programming, GLSL, OpenGL
 | |
| CSS:    /style.css
 | |
| 
 | |
| 
 | |
| 
 | |
| Based on [webgl-noise repository](https://github.com/ashima/webgl-noise), which is based on [this paper](https://arxiv.org/pdf/1204.1461.pdf).
 | |
| 
 | |
| Things tried:
 | |
| * Rearranging operations to reduce register pressure.
 | |
| * Calculating things as soon as possible.
 | |
| * Hand inlining.
 | |
| 
 | |
| ### Results ###
 | |
| For testing, a screen-space *1024x1024* texture is generated, resulting in *1048576* fragment invocations,
 | |
| with 4-octave fractal Brownian motion.
 | |
| 
 | |
| Hardware: `Mobile Intel® GM45 Express Chipset`
 | |
| 
 | |
| Driver: `DRI Mesa 21.2.6`
 | |
| 
 | |
| Original:
 | |
| ```
 | |
| Benchmark Iterations    Min(ns)    Max(ns)   Variance   Mean(ns)
 | |
| ----------------------------------------------------------------
 | |
| full(0)          100  124848395  510494575 1473830605484805  129237053
 | |
| ```
 | |
| 
 | |
| Hand optimized:
 | |
| ```
 | |
| Benchmark Iterations    Min(ns)    Max(ns)   Variance   Mean(ns)
 | |
| ----------------------------------------------------------------
 | |
| full(0)          100  119354512  731397135 3705581696928414  125714847
 | |
| ```
 | |
| 
 | |
| Mean difference is `3ms 522µs 206ns (-2.7%)`, min difference is `5ms 493µs 883ns (-4.4%)`
 | |
| 
 | |
| This suggests that the given driver is suboptimal in its optimizing capabilities,
 | |
| and I imagine there might be GLSL compilers a lot worse than this.
 | |
| 
 | |
| Some intermediate shader representation comes to mind as a means for
 | |
| automatic GLSL source level, profile guided and other optimizations;
 | |
| as well as polyfilling to different extensions, profiles and APIs. But welp.
 | |
| 
 | |
| ### Source ###
 | |
| 
 | |
| ```glsl
 | |
| #version 120
 | |
| 
 | |
| //      Author : Ian McEwan, Ashima Arts.
 | |
| //  Maintainer : stegu
 | |
| //     Lastmod : 20110822 (ijm)
 | |
| //     License : Copyright (C) 2011 Ashima Arts. All rights reserved.
 | |
| //               Distributed under the MIT License. See LICENSE file.
 | |
| //               https://github.com/ashima/webgl-noise
 | |
| //               https://github.com/stegu/webgl-noise
 | |
| // 
 | |
| 
 | |
| #define MOD289(p_x) ((p_x) - floor((p_x) * (1.0 / 289.0)) * 289.0) // p_x modulo 289, computed with floor()
 | |
| #define PERMUTE(p_result, p_x) { vec3 _temp = (((p_x) * 34.0) + 10.0) * (p_x); p_result = MOD289(_temp); } // p_result = ((34 * p_x + 10) * p_x) mod 289, on a vec3
 | |
| // Returns 2D simplex noise at v: three per-corner contributions, summed and scaled by 130.
 | |
| float simplex_noise_2d(in vec2 v) {
 | |
|   const vec4 C = vec4(0.211324865405187,  // (3.0-sqrt(3.0))/6.0
 | |
|                       0.366025403784439,  // 0.5*(sqrt(3.0)-1.0)
 | |
|                      -0.577350269189626,  // -1.0 + 2.0 * C.x
 | |
|                       0.024390243902439); // 1.0 / 41.0
 | |
|   // First corner
 | |
|   vec2 i  = floor(v + dot(v, C.yy)); // cell index: floor after adding (v.x + v.y) * C.y to both components
 | |
|   vec2 x0 = v -   i + dot(i, C.xx); // offset of v from the first corner
 | |
|   i = MOD289(i); // Avoid truncation effects in permutation
 | |
| 
 | |
|   // i1.x = step( x0.y, x0.x ); // x0.x > x0.y ? 1.0 : 0.0
 | |
|   // i1.y = 1.0 - i1.x;
 | |
|   vec2 i1 = (x0.x > x0.y) ? vec2(1.0, 0.0) : vec2(0.0, 1.0);
 | |
| 
 | |
|   // Other corners
 | |
|   // x0 = x0 - 0.0 + 0.0 * C.xx ;
 | |
|   // x1 = x0 - i1 + 1.0 * C.xx ;
 | |
|   // x2 = x0 - 1.0 + 2.0 * C.xx ;
 | |
|   vec4 x12 = x0.xyxy + C.xxzz - vec4(i1.xy, 0.0, 0.0); // x12.xy holds x1, x12.zw holds x2
 | |
| 
 | |
|   // Permutations
 | |
|   vec3 pp;
 | |
|   vec3 p = i.y + vec3(0.0, i1.y, 1.0); // y index of each of the three corners
 | |
|   PERMUTE(pp, p);
 | |
|   pp += i.x + vec3(0.0, i1.x, 1.0); // add the x index of each corner to the permuted y
 | |
|   PERMUTE(p, pp); // p = permute(permute(y) + x), one value per corner
 | |
|   p = fract(p * C.www); // fractional part of p / 41
 | |
| 
 | |
|   vec3 m = max(0.5 - vec3(dot(x0, x0), dot(x12.xy, x12.xy), dot(x12.zw, x12.zw)), 0.0); // 0.5 minus squared distance to each corner, clamped at 0
 | |
| 
 | |
|   // Gradients: 41 points uniformly over a line, mapped onto a diamond.
 | |
|   // The ring size 17*17 = 289 is close to a multiple of 41 (41*7 = 287)
 | |
|   vec3 x = 2.0 * p - 1.0;
 | |
|   vec3 a0 = x - floor(x + 0.5);
 | |
|   vec3 h = abs(x) - 0.5;
 | |
| 
 | |
|   m = m * m;
 | |
|   m = m * m; // two squarings: m now holds the fourth power of the clamped falloff
 | |
| 
 | |
|   // Normalise gradients implicitly by scaling m
 | |
|   // Approximation of: m *= inversesqrt( a0*a0 + h*h );
 | |
|   m *= 1.79284291400159 - 0.85373472095314 * (a0 * a0 + h * h);
 | |
| 
 | |
|   // Compute final noise value at P
 | |
|   return 130.0 * dot(m, vec3(a0.x * x0.x + h.x * x0.y, a0.yz * x12.xz + h.yz * x12.yw));
 | |
| }
 | |
| 
 | |
| ```
 | |
| 
 | |
| ### Possibilities ###
 | |
| [NVIDIA's Tegra guide](https://docs.nvidia.com/drive/drive_os_5.1.6.1L/nvvib_docs/DRIVE_OS_Linux_SDK_Development_Guide/baggage/tegra_gles2_performance.pdf) states that uniform access is often better than constants.
 | |
| On our hardware it only degrades performance, but there's a possibility of other chipsets having preferences similar to Tegra's.
 | |
| The `C` constant is eligible for this.
 |