Browse Source

Examples/TSL: Add SubgroupFunctionNode with Compute Reduction Demonstration (#31378)

* init branch

* add example meta data

* remove logs, simplify comments, use more appropriate syntax (instancedArray instead of storage)

* new screenshot

* remove unused function

* add screenshot to puppeteer exceptions

* work

* add subgroup reduction example

* change native select approach

* revert nativeSelect error and permit intent of subgroup operation being stored in a var

* remove deprecated .label() calls

* cleanup

* work

* work

* fix rebase issues

* finish example 4

* add explanatory css animation for subgroup reduction

* remove references to nativeSelect, findLSB, and countTrailingZeros

* update screenshot

* remove unused style

* run lint

* fix tabs

* fix subgroup::before tab

* dummy commit to restart deepscan

* try tests again
Christian Helgeson 6 months ago
parent
commit
63b49831e0

+ 1 - 0
examples/files.json

@@ -312,6 +312,7 @@
 		"webgpu_compute_particles_rain",
 		"webgpu_compute_particles_snow",
 		"webgpu_compute_points",
+		"webgpu_compute_reduce",
 		"webgpu_compute_sort_bitonic",
 		"webgpu_compute_texture",
 		"webgpu_compute_texture_3d",

BIN
examples/screenshots/webgpu_compute_reduce.jpg


+ 1 - 0
examples/tags.json

@@ -125,6 +125,7 @@
 	"webgpu_compute_particles_rain": [ "gpgpu" ],
 	"webgpu_compute_particles_snow_external": [ "gpgpu" ],
 	"webgpu_compute_points": [ "gpgpu" ],
+	"webgpu_compute_reduce": [ "gpgpu" ],
 	"webgpu_compute_sort_bitonic": [ "gpgpu" ],
 	"webgpu_compute_texture": [ "gpgpu" ],
 	"webgpu_compute_texture_pingpong": [ "gpgpu" ],

+ 1383 - 0
examples/webgpu_compute_reduce.html

@@ -0,0 +1,1383 @@
+<html lang="en">
+	<head>
+		<title>three.js webgpu - compute reduction</title>
+		<meta charset="utf-8">
+		<meta name="viewport" content="width=device-width, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
+		<link type="text/css" rel="stylesheet" href="main.css">
+	</head>
+	<body>
+
+	<style>
+
+		#reduction-panel {
+			background-color: #111;
+			width: 100%;
+			display: flex;
+			position: fixed;
+			height: auto;
+			bottom: 0px;
+			z-index: 99;
+			flex-direction: column;
+			justify-content: center;
+			align-items: center;
+			border-left: 2px solid #222;
+			text-align: center;
+		}
+
+		#panel-title {
+			width: fit-content;
+		}
+
+		.thread-row {
+			display: flex;
+			flex-direction: row;
+			align-items: center;
+			margin: 4px 0;
+			position: relative;
+		}
+
+		/* Only the transition is declared here: the later .thread rule in this
+		   stylesheet sets size, margin and background for the same selector. */
+		.thread {
+			transition: background-color 0.5s, transform 0.5s;
+		}
+
+		.stage-display {
+			display: flex;
+			flex-direction: column;
+			justify-content: center;
+			margin-bottom: 5px;
+		}
+		
+		.stage-label {
+			font-size: 1.2em;
+			color: #aaa;
+			/* 'bold' is a font-weight value; it is not a valid font-style value */
+			font-weight: bold;
+			margin-top: 6px;
+			margin-bottom: 20px;
+		}
+
+		.thread {
+			display: flex;
+			justify-content: center;
+			align-items: center;
+			width: 40px;
+			height: 40px;
+			margin: 2px;
+			border: 1px solid rgba(255, 255, 255, 0.2);
+			border-radius: 4px;
+			background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(0,0,0,0.2));
+			box-shadow: inset 0 0 2px rgba(255,255,255,0.1);
+			font-family: monospace;
+			color: white;
+		}
+
+		.thread_data {
+			display: block;
+			max-width: 100%;
+			padding: 0 2px;
+			white-space: nowrap;
+			overflow: hidden;
+			text-overflow: ellipsis;
+			font-size: clamp(8px, 2vw, 14px); 
+			text-align: center;
+		}
+		
+		.subgroup {
+			display: flex;
+			position: relative;
+			margin-left: 10px;
+			margin-right: 10px;
+		}
+
+		.subgroup::before {
+			/* label text for each subgroup label */
+			content: "subgroupAdd()";
+			position: absolute;
+			top: -20px; 
+			/* Hide until animation is displayed */
+			opacity: 0; 
+			z-index: 100;
+			transition: opacity 0.5s ease;
+			font-weight: bold;
+			color: white;
+			width: 100%;
+		}
+
+		.subgroup::after {
+			content: attr(data-label);
+			position: absolute;
+			bottom: -20px;
+			opacity: 1;
+			z-index: 100;
+			color: gray;
+			width: 100%;
+		}
+
+		.reduction-stage {
+			margin-bottom: 20px;
+		}
+		
+		@keyframes labelAbsorb {
+			0% {
+				opacity: 0;
+				transform: translateY(-50%);
+			}
+			40% {
+				opacity: 1;
+				transform: translateY(0%);
+			}
+			60% {
+				opacity: 1;
+				transform: translateY(0%);
+			}
+			80% {
+				opacity: 1;
+				transform: translate(0%, -20%);
+			}
+			100% {
+				opacity: 0;
+				transform: translate(0%, 100%);
+			}
+		}
+
+		.subgroup.anim::before {
+			opacity: 0;
+			animation-name: labelAbsorb;
+			animation-duration: 1.5s;
+			transition:
+			transform 0.6s ease-out,
+			opacity 0.3s ease-in 0.3s; 
+		}
+
+	</style>
+
+		<div id="info">
+			<a href="https://threejs.org" target="_blank" rel="noopener">three.js</a>
+			<br /> This example demonstrates the performance of various simple parallel reduction kernels.
+			<br /> Reference implementations are translated from the CUDA/WGSL code present in the following books/repos:
+			<br /> Impl. 0 - 2: <a href="https://www.cambridge.org/core/books/programming-in-parallel-with-cuda/C43652A69033C25AD6933368CDBE084C"><i>Programming in Parallel with CUDA</i></a> by <a href="https://people.bss.phy.cam.ac.uk/~rea1/">Richard Ansorge</a>
+			<br /> Impl. 3: <a href="https://github.com/frost-beta/betann/blob/main/betann/wgsl/reduce_all.wgsl"><i>betann reduce_all kernel</i></a> by <a href="https://github.com/zcbenz">zcbenz</a>
+			<br /> Impl. 4: <a href="https://github.com/b0nes164/GPUPrefixSums/blob/main/GPUPrefixSumsWebGPUapis/SharedShaders/rts.wgsl"><i>GPUPrefixSums reduction approach</i></a> by <a href="https://github.com/b0nes164">b0nes164</a>
+			<div id="left_side_display" style="position: absolute;top: 150px;left: 0;padding: 10px;background: rgba( 0, 0, 0, 0.5 );color: #fff;font-family: monospace;font-size: 12px;line-height: 1.5;pointer-events: none;text-align: left;"></div>
+			<div id="right_side_display" style="position: absolute;top: 150px;right: 0;padding: 10px;background: rgba( 0, 0, 0, 0.5 );color: #fff;font-family: monospace;font-size: 12px;line-height: 1.5;pointer-events: none;text-align: left;"></div>
+		</div>
+		
+		<div id="reduction-panel">
+			<h3 id="panel-title" style="flex: 0 0 auto;">Subgroup Reduction Explanation</h3>
+			<div class="reduction-stage" id="subgroup-reduction-stage">
+    		<div class="stage-label">Use subgroupAdd() to capture reduction of each workgroup's subgroups (Hover for animation)</div>
+				<div class="stage-display">
+					<div id="workgroup_threads" style="display: flex; justify-content: center; margin-bottom: 20px;"></div>
+					<div id="subgroup_reduction" style="display: flex; justify-content: center; margin-bottom: 5px;"></div>
+				</div>
+			</div>
+    </div>
+
+		<script type="importmap">
+			{
+				"imports": {
+					"three": "../build/three.webgpu.js",
+					"three/webgpu": "../build/three.webgpu.js",
+					"three/tsl": "../build/three.tsl.js",
+					"three/addons/": "./jsm/"
+				}
+			}
+		</script>
+
+		<script type="module">
+
+			import * as THREE from 'three/webgpu';
+			import { instancedArray, Loop, If, vec3, dot, clamp, storage, uvec4, subgroupAdd, uniform, uv, uint, float, Fn, vec2, invocationLocalIndex, invocationSubgroupIndex, uvec2, floor, instanceIndex, workgroupId, workgroupBarrier, workgroupArray, subgroupSize, select, log2 } from 'three/tsl';
+
+			import WebGPU from 'three/addons/capabilities/WebGPU.js';
+
+			import { GUI } from 'three/addons/libs/lil-gui.module.min.js';
+
+			const timestamps = {
+				left_side_display: document.getElementById( 'left_side_display' ),
+				right_side_display: document.getElementById( 'right_side_display' )
+			};
+
+			// Number of parts of length part_size needed to cover size items.
+			// For non-negative integer size and positive integer part_size this equals ceil( size / part_size ).
+			const divRoundUp = ( size, part_size ) => {
+
+				const padded = size + part_size - 1;
+
+				return Math.floor( padded / part_size );
+
+			};
+
+			const cssSubgroupSize = 4;
+			const cssWorkgroupSize = 16;
+
+			const workgroupThreadsContainer = document.getElementById( 'workgroup_threads' );
+			const subgroupReductionContainer = document.getElementById( 'subgroup_reduction' );
+
+			document.getElementById( 'panel-title' ).textContent += ` (Subgroup Size: ${cssSubgroupSize}, Workgroup Size: ${cssWorkgroupSize})`;
+
+			// Builds one cell of the explanation panel: a div.thread wrapping a span.thread_data that shows `data`.
+			const createThreadWithData = ( data ) => {
+
+				const label = document.createElement( 'span' );
+				label.className = 'thread_data';
+				label.textContent = data; // plain text only, so textContent rather than innerHTML
+
+				const cell = document.createElement( 'div' );
+				cell.className = 'thread';
+				cell.append( label );
+
+				return cell;
+
+			};
+
+			// Create thread elements
+			const workgroupThreads = [];
+			const initialSubgroups = [];
+			const initialData = [];
+			let currentSubgroupDiv = null;
+			for ( let i = 0; i < cssWorkgroupSize; i ++ ) {
+
+				if ( i % cssSubgroupSize === 0 ) {
+
+					const currentSubgroupIndex = Math.floor( i / cssSubgroupSize );
+
+					const subgroupReductionThread = createThreadWithData( 0 );
+					subgroupReductionThread.id = `subgroup_reduction_element_${currentSubgroupIndex}`;
+					subgroupReductionContainer.appendChild( subgroupReductionThread );
+
+					currentSubgroupDiv = document.createElement( 'div' );
+					currentSubgroupDiv.className = 'subgroup';
+					currentSubgroupDiv.setAttribute( 'data-label', `Threads ${currentSubgroupIndex * cssSubgroupSize}-${( currentSubgroupIndex + 1 ) * cssSubgroupSize - 1}` );
+					initialSubgroups.push( currentSubgroupDiv );
+					workgroupThreadsContainer.appendChild( currentSubgroupDiv );
+			
+				}
+
+				const data = Math.floor( Math.random() * 9 ) + 1;
+				initialData.push( data );
+
+				const thread = createThreadWithData( data );
+				workgroupThreads.push( thread );
+				currentSubgroupDiv.appendChild( thread );
+			
+			}
+
+			// Timer ids scheduled by activateLabelAnimation(), keyed by subgroup element, so that
+			// an animation interrupted by 'mouseleave' cannot update the display afterwards.
+			const labelAnimationTimers = new Map();
+
+			// Cancels any timeouts still pending for the given subgroup element.
+			const cancelLabelAnimationTimers = ( subgroupDiv ) => {
+
+				const timerIds = labelAnimationTimers.get( subgroupDiv );
+
+				if ( timerIds === undefined ) return;
+
+				timerIds.forEach( ( timerId ) => clearTimeout( timerId ) );
+				labelAnimationTimers.delete( subgroupDiv );
+
+			};
+
+			// Stops the label animation of a subgroup and resets its reduction cell to 0.
+			const deactivateLabelAnimation = ( subgroupDiv, idx ) => {
+
+				// Without this, a timeout scheduled on 'mouseenter' could still fire after the reset below
+				cancelLabelAnimationTimers( subgroupDiv );
+
+				subgroupDiv.classList.remove( 'anim' );
+
+				const subgroupReductionBufferElement = document.getElementById( `subgroup_reduction_element_${idx}` ).querySelector( '.thread_data' );
+
+				subgroupReductionBufferElement.textContent = 0;
+
+			};
+
+			// Plays the 'subgroupAdd()' label animation for a subgroup. After one second every thread of
+			// the subgroup, and the subgroup's reduction cell, displays the sum of the subgroup's initial data.
+			const activateLabelAnimation = ( subgroupDiv, idx ) => {
+
+				cancelLabelAnimationTimers( subgroupDiv );
+
+				const threads = Array.from( subgroupDiv.children );
+				const firstThread = idx * cssSubgroupSize;
+				const total = initialData
+					.slice( firstThread, firstThread + cssSubgroupSize )
+					.reduce( ( sum, value ) => sum + value, 0 );
+
+				subgroupDiv.classList.add( 'anim' );
+
+				const revealTimer = setTimeout( () => {
+
+					threads.forEach( t => {
+
+						t.querySelector( '.thread_data' ).textContent = total;
+
+					} );
+
+					const subgroupReductionBufferElement = document.getElementById( `subgroup_reduction_element_${idx}` ).querySelector( '.thread_data' );
+
+					subgroupReductionBufferElement.textContent = total;
+
+				}, 1000 );
+
+				// Remove the class after the animation ends so it can be triggered again
+				const resetTimer = setTimeout( () => {
+
+					subgroupDiv.classList.remove( 'anim' );
+					labelAnimationTimers.delete( subgroupDiv );
+
+				}, 1500 ); // matches animation duration in CSS
+
+				labelAnimationTimers.set( subgroupDiv, [ revealTimer, resetTimer ] );
+
+			};
+
+			document.getElementById( 'subgroup-reduction-stage' ).addEventListener( 'mouseenter', () => {
+
+				initialSubgroups.forEach( ( subgroupDiv, idx ) => {
+
+					activateLabelAnimation( subgroupDiv, idx );
+
+				} );
+
+			} );
+
+			document.getElementById( 'subgroup-reduction-stage' ).addEventListener( 'mouseleave', () => {
+
+				initialSubgroups.forEach( ( subgroupDiv, idx ) => {
+
+					deactivateLabelAnimation( subgroupDiv, idx );
+
+				} );
+
+				// Restore the per-thread values that the animation replaced with the subgroup total
+				workgroupThreads.forEach( ( thread, idx ) => {
+
+					thread.querySelector( '.thread_data' ).textContent = initialData[ idx ];
+
+				} );
+
+			} );
+			
+			
+			if ( WebGPU.isAvailable() === false ) {
+
+				document.body.appendChild( WebGPU.getErrorMessage() );
+
+				throw new Error( 'No WebGPU support' );
+
+			}
+
+			// Total number of elements and the dimensions of the display grid.
+			const size = 262144;
+			const vecSize = divRoundUp( size, 4 );
+			// Grid display is gridDim x gridDim
+			const gridDim = Math.sqrt( size );
+			let maxWorkgroupSize = 64;
+
+			// Algorithm speed increases as you iterate through the algorithms array
+			const algorithms = [
+				'Reduce 0 (N/2)',
+				'Reduce 1 (Naive Accumulate)',
+				'Reduce 2 (Workgroup Reduction)',
+				'Reduce 3 (Subgroup Reduce)',
+				'Reduce 4 (Subgroup Optimized)',
+				'Incorrect Baseline',
+			];
+
+			// Input Grid: Displays input data in a grid format
+			// Input Log2: Displays input grid data's logarithmic indices horizontally (1, 2, 4, 8, 16, ..., size)
+			// Input Element 0: Displays clamped input[0]
+			const displayModes = [ 'Input Grid', 'Input Log2', 'Input Element 0', 'Workgroup Sum Grid' ];
+
+			// Holds uniforms for both displays as well as debug information
+			const unifiedEffectController = {
+				// Number of elements in the grid
+				gridElementWidth: uniform( gridDim ),
+				gridElementHeight: uniform( gridDim ),
+				// Number of elements in the grid being displayed
+				gridDisplayWidth: uniform( gridDim ),
+				gridDisplayHeight: uniform( gridDim ),
+				// How to display end result of reduction.
+				// Ideally this is unique to the reduction method being deployed
+				'Display Mode': 'Input Log2',
+				loggedBuffer: 'Input Buffer',
+				elementsReduced: size,
+			};
+			
+
+			// GUI state and uniforms for the left-hand display
+			const leftEffectController = {
+				// Current reduction algorithm being executed by this side
+				algo: 'Reduce 0 (N/2)',
+				// Flag indicating whether to highlight element in validation check
+				highlight: uniform( 0 ),
+				// Uniform that corresponds to the index of the current algorithm within the algorithms array
+				currentAlgo: uniform( 0 ),
+				// Current state of reduction (running, validating, resetting)
+				state: 'Run Algo',
+				// Current display mode
+				displayMode: 'Input Log2',
+				// Reduce 0 specific uniform
+				numThreadsDispatched: uniform( size / 2 ),
+			};
+
+			// GUI state and uniforms for the right-hand display
+			const rightEffectController = {
+				// Current reduction algorithm being executed by this side
+				algo: 'Reduce 4 (Subgroup Optimized)',
+				// Index of `algo` within the algorithms array. Derived from the array so the initial
+				// value agrees with what the GUI onChange handler computes for the same selection.
+				currentAlgo: uniform( algorithms.indexOf( 'Reduce 4 (Subgroup Optimized)' ) ),
+				// Flag indicating whether to highlight element in validation check
+				highlight: uniform( 0 ),
+				// Current display mode
+				displayMode: 'Input Element 0',
+				// Current state of reduction
+				state: 'Run Algo',
+				// Reduce 0 specific uniform
+				numThreadsDispatched: uniform( size / 2 )
+			};
+
+			const leftMaterial = new THREE.MeshBasicNodeMaterial( { color: 0x00ff00 } );
+			const rightMaterial = new THREE.MeshBasicNodeMaterial( { color: 0x00ff00 } );
+			const leftDisplayColorNodes = {};
+			const rightDisplayColorNodes = {};
+
+			const gui = new GUI();
+
+			gui.add( leftEffectController, 'algo', algorithms ).onChange( () => {
+
+				leftEffectController.currentAlgo.value = algorithms.findIndex( val => val === leftEffectController.algo );
+			
+			} );
+
+			gui.add( rightEffectController, 'algo', algorithms ).onChange( () => {
+
+				rightEffectController.currentAlgo.value = algorithms.findIndex( val => val === rightEffectController.algo );
+
+			} );
+
+			gui.add( leftEffectController, 'displayMode', displayModes ).name( 'Left Display Mode' ).onChange( () => {
+
+				leftMaterial.colorNode = leftDisplayColorNodes[ leftEffectController.displayMode ];
+				leftMaterial.needsUpdate = true;
+
+			} );
+			gui.add( rightEffectController, 'displayMode', displayModes ).name( 'Right Display Mode' ).onChange( () => {
+
+				rightMaterial.colorNode = rightDisplayColorNodes[ rightEffectController.displayMode ];
+				rightMaterial.needsUpdate = true;
+
+			} );
+
+			const debugFolder = gui.addFolder( 'Debug' );
+			const elementsReducedController = debugFolder.add( unifiedEffectController, 'elementsReduced' ).name( 'Elements Reduced' );
+			elementsReducedController.disable();
+			const stateLeftController = debugFolder.add( leftEffectController, 'state' ).name( 'Left Display State' );
+			const stateRightController = debugFolder.add( rightEffectController, 'state' ).name( 'Right Display State' );
+			stateLeftController.disable();
+			stateRightController.disable();
+			debugFolder.add( unifiedEffectController, 'loggedBuffer', [ 'Input Buffer', 'Input Vectorized Buffer', 'Workgroup Sums Buffer', 'Debug Buffer' ] ).name( 'Buffer to Log' );
+			debugFolder.close();
+
+			// HELPER FUNCTIONS
+			// Rounds a uint up to the next power of two (a value that already is a power of two is unchanged).
+			// Works by copying the highest set bit of ( x - 1 ) into every lower bit and then adding 1.
+			const pow2Ceil = Fn( ( [ x ] ) => {
+
+				// NOTE(review): this appears intended to yield 1 for an input of 0 - confirm that a plain
+				// `return` inside an If() callback acts as an early return of the TSL function.
+				If( x.equal( uint( 0 ) ), () => {
+
+					return uint( 1 );
+
+				} );
+
+				const val = x.sub( 1 ).toVar( 'val' );
+				// OR-ing with right shifts of 1, 2, 4, 8 and 16 sets every bit below the highest set bit
+				val.assign( val.bitOr( val.shiftRight( 1 ) ) );
+				val.assign( val.bitOr( val.shiftRight( 2 ) ) );
+				val.assign( val.bitOr( val.shiftRight( 4 ) ) );
+				val.assign( val.bitOr( val.shiftRight( 8 ) ) );
+				val.assign( val.bitOr( val.shiftRight( 16 ) ) );
+				return val.add( 1 );
+
+			} ).setLayout( {
+				name: 'pow2Ceil',
+				type: 'uint',
+				inputs: [
+					{ name: 'x', type: 'uint' }
+				]
+			} );
+
+			// ALGORITHM CONSTRUCTORS
+
+			// REDUCE 1
+
+			// Thanks to Sam0oneau of Graphics Programming Discord for the explanation.
+			// (Graphics Programming Discord Message Link): https://discord.com/channels/318590007881236480/374061825454768129/1391248956171882597
+
+			/* Reduce 1 Example (Assume Workgroup Size 256, numElements: 262144) -> Initial currentBuffer State: | 1, 1, 1, 1, ... |
+				 *
+				 * KERNEL 1:
+				 * Executes 256 threads by 256 workgroups. Each thread loops 4 times and accesses elements
+				 * at the indices below.
+				 *          Thread 1                        Thread 2                         Thread 3
+				 * | 0, 65536, ..., n * 65536 | 1, 65537, .... (n * 65536) + 1 | 2, 65538, .... (n * 65536) + 2 | etc
+				 * Buffer Values: | 4, 4, 4, 4, ...|
+				 *
+				 * KERNEL 2:
+				 * Executes 256 threads by one workgroup. Each thread loops 1024 times
+				 *          Thread 1                     Thread 2                     Thread 3
+				 * | 0, 256, ...., n * 256    | 1, 257, ... (n * 256) + 1 | 2, 258, ... (n * 256) + 2 | etc
+				 * Buffer Values: | 1024, 1024, 1024, 1024, ... |
+				 *
+				 * KERNEL 3:
+				 * Executes 1 thread by one workgroup. Single thread loops 256 times
+				 *          Thread 1
+				 * | 0, 1, 2, 3, 4, 5, 6 ... etc|
+				 * Buffer Values: [262144, 1024, 1024]
+				 */
+
+
+			// Builds the Reduce 1 compute pass.
+			// Props: dispatchSize (number of invocations, also the stride between the elements one
+			// invocation reads), numElements (exclusive upper bound for the element index),
+			// inputBuffer (storage buffer reduced in place), workgroupSize.
+			// Returns the result of .compute( dispatchSize, [ workgroupSize ] ).
+			const createReduce1Fn = ( createReduce1FnProps ) => {
+
+				const { dispatchSize, numElements, inputBuffer, workgroupSize } = createReduce1FnProps;
+
+				const fnDef = Fn( () => {
+
+					const dispatch = uint( dispatchSize ).toVar( 'dispatchSize' );
+					// Per-invocation running sum
+					const tSum = uint( 0 ).toVar();
+					// Start at this invocation's own index
+					const k = instanceIndex.toVar( 'k' );
+
+					// Accumulate elements k, k + dispatch, k + 2 * dispatch, ... while k < numElements
+					Loop( k.lessThan( uint( numElements ) ), ( ) => {
+
+						tSum.addAssign( inputBuffer.element( k ) );
+						k.addAssign( uint( dispatch ) );
+
+					} );
+
+					// Write the partial sum back to the slot this invocation started from
+					inputBuffer.element( instanceIndex ).assign( tSum );
+
+
+				} )().compute( dispatchSize, [ workgroupSize ] );
+
+				return fnDef;
+			
+			};
+
+			// REDUCE 2
+			
+			// Builds the Reduce 2 compute pass: strided accumulation into a workgroup array followed by
+			// a halving reduction within the workgroup. The halving loop also handles a workgroupSize
+			// that is not a power of two.
+			// Props: workgroupSize, dispatchSize (number of invocations and accumulation stride),
+			// numElements (exclusive upper bound for the element index), inputBuffer (reduced in place).
+			const createReduce2Fn = ( createReduce2FnProps ) => {
+
+				const { workgroupSize, dispatchSize, numElements, inputBuffer } = createReduce2FnProps;
+
+				const fnDef = Fn( () => {
+
+					// One accumulator slot per invocation of the workgroup
+					const tSum = workgroupArray( 'uint', workgroupSize );
+
+					const k = instanceIndex.toVar( 'k' );
+					tSum.element( invocationLocalIndex ).assign( uint( 0 ) );
+
+					// Accumulate elements k, k + dispatchSize, ... into this invocation's slot
+					Loop( k.lessThan( uint( numElements ) ), () => {
+
+						tSum.element( invocationLocalIndex ).addAssign( inputBuffer.element( k ) );
+
+						k.addAssign( uint( dispatchSize ) );
+
+					} );
+
+					workgroupBarrier();
+
+					// Reuse k as the reduction stride: start at half of workgroupSize rounded up to a power of two
+					k.assign( pow2Ceil( uint( workgroupSize ) ).div( 2 ) );
+
+					Loop( k.greaterThan( 0 ), () => {
+
+						// The second condition skips partner slots beyond workgroupSize
+						If( invocationLocalIndex.lessThan( k ).and( invocationLocalIndex.add( k ).lessThan( workgroupSize ) ), () => {
+
+							tSum.element( invocationLocalIndex ).addAssign( tSum.element( invocationLocalIndex.add( k ) ) );
+
+						} );
+						workgroupBarrier();
+						k.divAssign( 2 );
+
+					} );
+
+					// The first invocation of each workgroup writes the workgroup's sum to inputBuffer[ workgroupId.x ]
+					If( invocationLocalIndex.equal( uint( 0 ) ), () => {
+
+						inputBuffer.element( workgroupId.x ).assign( tSum.element( uint( 0 ) ) );
+
+					} );
+
+				} )().compute( dispatchSize, [ workgroupSize ] );
+
+				return fnDef;
+
+			};
+
+			// REDUCE 3
+
+			/* Create array with enough indices for worst-case subgroup size */
+			// type: element type of the array (e.g. 'uint')
+			// workgroupSize: number of invocations per workgroup
+			// minSubgroupSize: smallest subgroup size to allow for (fewer threads per subgroup means more subgroups)
+			const createSubgroupArray = ( type, workgroupSize, minSubgroupSize = 4 ) => {
+
+				// Honor the requested element type and round up so the element count is always an integer
+				return workgroupArray( type, Math.ceil( workgroupSize / minSubgroupSize ) );
+
+			};
+
+			// zcbenz implementation
+			// https://github.com/frost-beta/betann/blob/8aa2701caf63fb29bd4cd2454e656973342c1588/betann/wgsl/reduce_ops.wgsl#L71
+			// Accumulates into `total` the elements of one row that belong to the current invocation.
+			// Props: workgroupSize, inputBuffer, total (uint variable that receives the sum), rowOffset
+			// (index of the row's first element), currentRowSize (number of elements in the row),
+			// workPerThread (consecutive elements read per invocation in each block), vectorized
+			// (when truthy, each buffer element is a uvec4 whose four components are summed).
+			const RowReduce = ( rowReduceProps ) => {
+
+				const { workgroupSize, inputBuffer, total, rowOffset, currentRowSize, workPerThread, vectorized } = rowReduceProps;
+
+				// Number of elements each workgroup ingests per block
+				// At workgroupSize of 256 and workPerThread of 4, blockSize will be 1024
+				const blockSize = uint( workgroupSize ).mul( workPerThread );
+				const block = uint( 0 ).toVar( 'block' );
+
+				// At rowSize of 2048, there will be two blocks
+				const blockLimiter = currentRowSize.div( blockSize ).toVar( 'blockLimiter' );
+				Loop( block.lessThan( blockLimiter ), () => {
+
+					const blockOffset = block.mul( blockSize );
+					const startThread = blockOffset.add( invocationLocalIndex.mul( workPerThread ) );
+					const localThreadOffset = uint( 0 ).toVar( 'localThreadOffset' );
+					Loop( localThreadOffset.lessThan( workPerThread ), () => {
+
+						// Element index = start of row + start of this invocation's span + position within the span
+						const inputElement = inputBuffer.element( rowOffset.add( startThread ).add( localThreadOffset ) );
+
+						if ( vectorized ) {
+
+							// The dot product with ( 1, 1, 1, 1 ) sums the four components of the element
+							total.addAssign( dot( inputElement, uvec4( 1 ) ) );
+
+						} else {
+
+							total.addAssign( inputElement );
+
+						}
+
+						// Move on to the invocation's next element
+						localThreadOffset.addAssign( 1 );
+
+					} );
+
+					// Increment up a block
+					block.addAssign( 1 );
+
+				} );
+
+				// Ignoring left over check for this example, since we know ahead of time the value of leftover (2048 % 1024 === 0)
+
+			};
+
+			// Combines the per-invocation `total` values of a workgroup: first within each subgroup via
+			// subgroupAdd(), then across subgroups through a workgroup array.
+			// Props: total (uint variable, updated in place), workgroupSize.
+			const WorkgroupReduce = ( workgroupReduceProps ) => {
+
+				const { total, workgroupSize } = workgroupReduceProps;
+			
+				const subgroupSums = createSubgroupArray( 'uint', workgroupSize );
+
+				// Assign sum of all values in subgroup to total
+				total.assign( subgroupAdd( total ) );
+
+				// Number of subgroups per workgroup; divided by subgroupSize after every pass of the loop below
+				const delta = uint( workgroupSize ).div( subgroupSize ).toVar( 'delta' );
+
+				// Index of this invocation's subgroup within the workgroup
+				const subgroupMetaRank = invocationLocalIndex.div( subgroupSize );
+
+				Loop( float( delta ).greaterThan( 1.0 ), () => {
+
+					If( invocationSubgroupIndex.equal( 0 ), () => {
+			
+						// Each subgroup will populate the subgroupSums array
+						subgroupSums.element( subgroupMetaRank ).assign( total );
+
+					} );
+
+					// Ensure that all subgroups in the workgroup have populated the workgroup memory array
+					workgroupBarrier();
+
+					// Invocations with a local index below delta load one subgroup sum each, all others load 0
+					total.assign( select( invocationLocalIndex.lessThan( delta ), subgroupSums.element( invocationLocalIndex ), 0 ).uniformFlow() );
+					// # of subgroups in workgroup is invariably less than # of threads in subgroup, so subgroupAdd will still sync here
+					total.assign( subgroupAdd( total ) );
+
+					delta.divAssign( subgroupSize );
+
+				} );
+
+			};
+
+			// Builds the Reduce 3 kernel body: every workgroup reduces one row of inputBuffer and writes
+			// the row's sum to intermediateBuffer[ workgroupId.x ].
+			// Props: workgroupSize, workPerThread, inputBuffer, intermediateBuffer, rowSize (elements per row).
+			// Returns the invoked Fn; .compute() is not called here.
+			const createReduce3Fn = ( createReduce3FnProps ) => {
+
+				const { workgroupSize, workPerThread, inputBuffer, intermediateBuffer, rowSize } = createReduce3FnProps;
+
+				const fnDef = Fn( () => {
+
+					// bufferCount is the element count of the storage buffer (a number, so it has no .length)
+					const inputSize = uint( inputBuffer.bufferCount );
+					const rowOffset = workgroupId.x.mul( rowSize );
+
+					// If the current rows elements exceed the bounds of the input
+					// Select either 0 or number of elements left,
+					// otherwise, select existing ROW_SIZE
+					const currentRowSize = select(
+						( rowOffset.add( rowSize ) ).greaterThan( inputSize ),
+						select( inputSize.greaterThan( rowOffset ), inputSize.sub( rowOffset ), 0 ).uniformFlow(),
+						rowSize,
+					).uniformFlow();
+
+					const total = uint( 0 ).toVar( 'total' );
+
+					// Each invocation sums its share of the row into total
+					RowReduce( {
+						inputBuffer: inputBuffer,
+						total: total,
+						rowOffset: rowOffset,
+						currentRowSize: currentRowSize,
+						workPerThread: workPerThread,
+						workgroupSize: workgroupSize,
+					} );
+
+					// Combine the per-invocation totals of the workgroup
+					WorkgroupReduce( {
+						total: total,
+						workgroupSize: workgroupSize,
+					} );
+
+					// Populate each workgroup with its reduction
+					If( invocationLocalIndex.equal( 0 ), () => {
+
+						intermediateBuffer.element( workgroupId.x ).assign( total );
+
+					} );
+
+				} )();
+
+				return fnDef;
+
+			};
+
+			// REDUCE 4
+			
+			// b0nes164 inspired implementation with vec4
+			const createReduce4Fn = ( props ) => {
+
+				// Can't pass in subgroup size since we can't always be certain what size is at runtime
+				const { size, workPerThread, workgroupSize, inputBuffer, intermediateBuffer } = props;
+
+				const ELEMENTS_PER_VEC4 = 4;
+				// The number of individual elements a single workgroup will access
+				const partitionSize = workgroupSize * workPerThread * ELEMENTS_PER_VEC4;
+				const vecSize = divRoundUp( size, ELEMENTS_PER_VEC4 );
+				// Can also be calculated using divRoundUp( vecSize, workgroupSize * workPerThread );
+				const numWorkgroups = divRoundUp( size, partitionSize );
+				// Currently no way to specify dispatch size in increments of workgroups, so we convert to numInvocations
+				const numInvocations = numWorkgroups * workgroupSize;
+
+				const fnDef = Fn( () => {
+
+					const perSubgroupReductionArray = createSubgroupArray( 'uint', workgroupSize );
+
+					// Get the index of the subgroup within the workgroup
+					const subgroupMetaRank = invocationLocalIndex.div( subgroupSize );
+			
+					// Each subgroup block scans across 4 subgroups. So when we move into a new subgroup,
+					// align that subgroup's accesses to the next 4 subgroups
+					const subgroupOffset = subgroupMetaRank.mul( subgroupSize ).mul( workPerThread );
+					subgroupOffset.addAssign( invocationSubgroupIndex );
+
+					// Per workgroup, offset by number of vectorized elements scanned per workgroup
+					const workgroupOffset = workgroupId.x.mul( uint( maxWorkgroupSize ).mul( workPerThread ) );
+
+					const startThread = subgroupOffset.add( workgroupOffset );
+			
+					const subgroupReduction = uint( 0 );
+
+					// Each thread will accumulate values from across 'workPerThread' subgroups
+					If( workgroupId.x.lessThan( uint( numWorkgroups ).sub( 1 ) ), () => {
+
+						Loop( {
+							start: uint( 0 ),
+							end: workPerThread,
+							type: 'uint',
+							condition: '<',
+							name: 'currentSubgroupInBlock'
+						}, () => {
+
+							// Get vectorized element from input array
+							const val = inputBuffer.element( startThread );
+
+							// Sum values within vec4 together by using result of dot product
+							subgroupReduction.addAssign( dot( uvec4( 1 ), val ) );
+
+							// Increment so thread will scan value in next subgroup
+							startThread.addAssign( subgroupSize );
+			
+						} );
+			
+					} );
+
+					// Ensure that the last workgroup does not access out of bounds indices
+					If( workgroupId.x.equal( uint( numWorkgroups ).sub( 1 ) ), () => {
+
+						Loop( {
+							start: uint( 0 ),
+							end: workPerThread,
+							type: 'uint',
+							condition: '<',
+							name: 'currentSubgroupInBlock'
+						}, () => {
+
+							// Ensure index is less than number of available vectors in inputBuffer
+							const val = select( startThread.lessThan( uint( vecSize ) ), inputBuffer.element( startThread ), uvec4( 0 ) ).uniformFlow();
+			
+							subgroupReduction.addAssign( dot( val, uvec4( 1 ) ) );
+							startThread.addAssign( subgroupSize );
+			
+						} );
+			
+					} );
+
+					subgroupReduction.assign( subgroupAdd( subgroupReduction ) );
+
+					// Assuming that each element in the input buffer is 1, we generally expect each invocation's subgroupReduction
+					// value to be ELEMENTS_PER_VEC4 * workPerThread * subgroupSize
+
+					// Delegate one thread per subgroup to assign each subgroup's reduction to the workgroup array
+					If( invocationSubgroupIndex.equal( uint( 0 ) ), () => {
+
+						perSubgroupReductionArray.element( subgroupMetaRank ).assign( subgroupReduction );
+
+					} );
+
+					// Ensure that each workgroup has populated the perSubgroupReductionArray with data
+					// from each of its subgroups
+					workgroupBarrier();
+
+					if ( props.debugBuffer ) {
+
+						If( invocationLocalIndex.equal( uint( 0 ) ), () => {
+
+							props.debugBuffer.element( workgroupId.x ).assign( subgroupReduction );
+
+						} );
+
+						workgroupBarrier();
+
+					}
+
+					// WORKGROUP LEVEL REDUCE
+
+					// Multiple approaches here
+					// log2(subgroupSize) -> TSL log2 function
+					// countTrailingZeros/findLSB(subgroupSize) -> Currently unsupported function in TSL that counts trailing zeros in number bit representation
+					// Can technically petition GPU for subgroupSize in shader and calculate logs on CPU at cost of shader being generalizable across devices
+					// May also break if subgroupSize changes when device is lost or if program is rerun on lower power device
+					const subgroupSizeLog = uint( log2( float( subgroupSize ) ) ).toVar( 'subgroupSizeLog' );
+					const spineSize = uint( workgroupSize ).shiftRight( subgroupSizeLog );
+					const spineSizeLog = uint( log2( float( spineSize ) ) ).toVar( 'spineSizeLog' );
+
+
+					// Align size to powers of subgroupSize
+					const squaredSubgroupLog = ( spineSizeLog.add( subgroupSizeLog ).sub( 1 ) );
+					squaredSubgroupLog.divAssign( subgroupSizeLog );
+					squaredSubgroupLog.mulAssign( subgroupSizeLog );
+					const alignedSize = ( uint( 1 ).shiftLeft( squaredSubgroupLog ) ).toVar( 'alignedSize' );
+
+					// aligned size 2 * 4
+
+					const offset = uint( 0 );
+
+					// In cases where the number of subgroups in a workgroup is greater than the subgroup size itself,
+					// we need to iterate over the array again to capture all the data in the workgroup array buffer
+					Loop( { start: subgroupSize, end: alignedSize, condition: '<=', name: 'j', type: 'uint', update: '<<= subgroupSizeLog' }, () => {
+
+						const subgroupIndex = ( ( invocationLocalIndex.add( 1 ) ).shiftLeft( offset ) ).sub( 1 );
+
+						const isValidSubgroupIndex = subgroupIndex.lessThan( spineSize ).toVar( 'isValidSubgroupIndex' );
+
+						// Reduce values within the local workgroup memory.
+						// Set toVar to ensure subgroupAdd executes before (not within) the if statement.
+						const t = subgroupAdd(
+							select(
+								isValidSubgroupIndex,
+								perSubgroupReductionArray.element( subgroupIndex ),
+								0
+							).uniformFlow()
+						).toVar( 't' );
+
+						// Can assign back to workgroupArray since all
+						// subgroup threads work in lockstep for subgroupAdd
+						If( isValidSubgroupIndex, () => {
+
+							perSubgroupReductionArray.element( subgroupIndex ).assign( t );
+
+						} );
+
+						// Ensure all threads have completed work
+
+						workgroupBarrier();
+
+						offset.addAssign( subgroupSizeLog );
+
+					} );
+
+					// Assign single thread from workgroup to assign workgroup reduction
+					If( invocationLocalIndex.equal( uint( 0 ) ), () => {
+
+						const reducedWorkgroupSum = perSubgroupReductionArray.element( uint( spineSize ).sub( 1 ) );
+						intermediateBuffer.element( workgroupId.x ).assign( reducedWorkgroupSum );
+
+					} );
+
+				} )().compute( numInvocations, [ maxWorkgroupSize ] );
+
+				return fnDef;
+
+			};
+			
+
+			// INCORRECT BASELINE
+
+			// Builds the 'Incorrect Baseline' kernel body, which overwrites inputBuffer[ instanceIndex ]
+			// with the constant 99999 instead of reducing anything.
+			// Returns the invoked Fn; .compute() is not called here.
+			const createIncorrectBaselineFn = ( incorrectBaselineProps ) => {
+
+				const target = incorrectBaselineProps.inputBuffer;
+
+				return Fn( () => {
+
+					target.element( instanceIndex ).assign( 99999 );
+
+				} )();
+
+			};
+
+
+			init();
+
+			init( false );
+
+			async function init( leftSideDisplay = true ) {
+
+				const effectController = leftSideDisplay ? leftEffectController : rightEffectController;
+
+				const aspect = ( window.innerWidth / 2 ) / window.innerHeight;
+				const camera = new THREE.OrthographicCamera( - aspect, aspect, 1, - 1, 0, 2 );
+				camera.position.z = 1;
+
+				const scene = new THREE.Scene();
+
+				const array = new Uint32Array( Array.from( { length: size }, ( _, i ) => {
+
+					return 1;
+
+				} ) );
+
+				// Represents array of data as uints in compute shader.
+				const inputStorage = instancedArray( array, 'uint', size ).setPBO( true ).setName( `Current_${leftSideDisplay ? 'Left' : 'Right'}` );
+				// Represents array of data as vec4s in compute shader;
+				const inputVec4BufferAttribute = new THREE.StorageInstancedBufferAttribute( array, 4 );
+				const inputVectorizedStorage = storage( inputVec4BufferAttribute, 'uvec4', vecSize ).setPBO( true ).setName( `CurrentVectorized_${leftSideDisplay ? 'Left' : 'Right'}` );
+			
+				// Reduce 3 Calculations
+				const workPerThread = 4;
+				const numRows = workPerThread * 32;
+				const rowSize = divRoundUp( size, numRows );
+
+				const workgroupSumsArray = new Uint32Array( numRows );
+				const workgroupSumsStorage = instancedArray( workgroupSumsArray, 'uint', numRows ).setPBO( true ).setName( `WorkgroupSums_${leftSideDisplay ? 'Left' : 'Right'}` );
+				const debugArray = new Uint32Array( 1024 );
+				const debugStorage = instancedArray( debugArray, 'uint', 1024 ).setPBO( true ).setName( `Debug_${leftSideDisplay ? 'Left' : 'Right'}` );
+
+				const buffers = {
+					'Input Buffer': inputStorage,
+					'Input Vectorized Buffer': inputVectorizedStorage,
+					'Workgroup Sums Buffer': workgroupSumsStorage,
+					'Debug Buffer': debugStorage,
+				};
+
+				const logFunctionName = `Log ${leftSideDisplay ? 'Left' : 'Right'} Side`;
+				const functionObj = {};
+				functionObj[ logFunctionName ] = async() => {
+
+					const selectedBuffer = buffers[ unifiedEffectController.loggedBuffer ];
+					console.log( new Uint32Array( await renderer.getArrayBufferAsync( selectedBuffer.value ) ) );
+
+				};
+
+				debugFolder.add( functionObj, `Log ${leftSideDisplay ? 'Left' : 'Right'} Side` );
+
+				const computeResetBufferFn = Fn( () => {
+
+					inputStorage.element( instanceIndex ).assign( 1 );
+			
+				} );
+
+				const computeResetWorkgroupSumsFn = Fn( () => {
+
+					workgroupSumsStorage.element( instanceIndex ).assign( 0 );
+
+				} );
+			
+
+				// Re-initialize compute buffer
+				const computeResetBuffer = computeResetBufferFn().compute( size );
+				const computeResetWorkgroupSums = computeResetWorkgroupSumsFn().compute( 256 );
+
+				const renderer = new THREE.WebGPURenderer( { antialias: false, trackTimestamp: true } );
+				renderer.setPixelRatio( window.devicePixelRatio );
+				renderer.setSize( window.innerWidth / 2, window.innerHeight );
+
+				// Unfortunately, need to arbitrarily run compute shader to get access to device limits
+				await renderer.computeAsync( computeResetBuffer );
+
+
+				if ( renderer.backend.device !== null ) {
+
+					maxWorkgroupSize = renderer.backend.device.limits.maxComputeWorkgroupSizeX;
+
+				}
+
+				// Create and store dispatches of reduction of certain size. Map each set of dispatches to algorithm name.
+
+				const computeReduce0Fn = Fn( () => {
+
+					const { numThreadsDispatched } = effectController;
+
+					inputStorage.element( instanceIndex ).addAssign( inputStorage.element( instanceIndex.add( numThreadsDispatched ) ) );
+
+				} )();
+
+				const reduce0Calls = [];
+
+				for ( let i = size / 2; i >= 1; i /= 2 ) {
+
+					const reduce0 = computeReduce0Fn.compute( i, [ maxWorkgroupSize ] );
+					reduce0Calls.push( reduce0 );
+
+				}
+
+				const reduce1Calls = [
+					// Accumulation
+					createReduce1Fn( {
+						dispatchSize: maxWorkgroupSize * maxWorkgroupSize,
+						workgroupSize: maxWorkgroupSize,
+						numElements: size,
+						inputBuffer: inputStorage,
+					} ),
+					// 1 Block accumulation
+					createReduce1Fn( {
+						dispatchSize: maxWorkgroupSize,
+						numElements: maxWorkgroupSize * maxWorkgroupSize,
+						workgroupSize: maxWorkgroupSize,
+						inputBuffer: inputStorage,
+					} ),
+					// Final result
+					createReduce1Fn( {
+						dispatchSize: 1,
+						numElements: maxWorkgroupSize,
+						workgroupSize: 1,
+						inputBuffer: inputStorage
+					} ),
+				];
+
+				const reduce2Calls = [
+					// Accumulate within workgroups
+					createReduce2Fn( {
+						workgroupSize: maxWorkgroupSize,
+						dispatchSize: maxWorkgroupSize * maxWorkgroupSize,
+						numElements: size,
+						inputBuffer: inputStorage,
+					} ),
+					// 1 Block accumulation
+					createReduce2Fn( {
+						workgroupSize: maxWorkgroupSize,
+						dispatchSize: maxWorkgroupSize,
+						numElements: maxWorkgroupSize,
+						inputBuffer: inputStorage,
+					} ),
+				];
+
+				const reduce3Calls = [
+					createReduce3Fn( {
+						inputBuffer: inputStorage,
+						intermediateBuffer: workgroupSumsStorage,
+						workgroupSize: maxWorkgroupSize,
+						workPerThread: 4,
+						rowSize: rowSize,
+						vectorized: false,
+					} ).compute( maxWorkgroupSize * numRows, [ maxWorkgroupSize ] ),
+					createReduce3Fn( {
+						inputBuffer: workgroupSumsStorage,
+						intermediateBuffer: inputStorage,
+						workgroupSize: 32,
+						workPerThread: 4,
+						rowSize: rowSize,
+						vectorized: false
+					} ).compute( 32, [ 32 ] )
+				];
+
+				const reduce4Calls = [
+					createReduce4Fn( {
+						size: size,
+						inputBuffer: inputVectorizedStorage,
+						intermediateBuffer: workgroupSumsStorage,
+						workgroupSize: maxWorkgroupSize,
+						workPerThread: 4,
+					} ),
+					createReduce3Fn( {
+						inputBuffer: workgroupSumsStorage,
+						intermediateBuffer: inputStorage,
+						workgroupSize: 32,
+						workPerThread: 4,
+						rowSize: rowSize,
+						vectorized: false
+					} ).compute( 32, [ 32 ] )
+				];
+
+				const incorrectBaselineCalls = [
+					createIncorrectBaselineFn( {
+						inputBuffer: inputStorage,
+					} ).compute( size ),
+				];
+			
+				const calls = {
+					'Reduce 0 (N/2)': reduce0Calls,
+					'Reduce 1 (Naive Accumulate)': reduce1Calls,
+					'Reduce 2 (Workgroup Reduction)': reduce2Calls,
+					'Reduce 3 (Subgroup Reduce)': reduce3Calls,
+					'Reduce 4 (Subgroup Optimized)': reduce4Calls,
+					'Incorrect Baseline': incorrectBaselineCalls
+				};
+
+				// Builds the display color for one element: a grey level of
+				// 1 - colorChanger / ( width * height ), which is replaced by a green or red
+				// tint of the same intensity while the 'highlight' uniform equals 1.
+				const getColor = ( bufferToCheck, colorChanger, width, height ) => {
+
+					const subtracter = float( colorChanger ).div( width.mul( height ) );
+
+					const color = vec3( subtracter.oneMinus() ).toVar();
+
+					const { highlight } = effectController;
+
+					// Validate that element 0 is equal to expected result of reduction
+					If( highlight.equal( 1 ), () => {
+
+						// Element 0 equals `size`: put the shade in the green channel only.
+						If( ( bufferToCheck.element( 0 ) ).equal( size ), () => {
+
+							color.assign( vec3( 0.0, subtracter.oneMinus(), 0.0 ) );
+
+						} ).Else( () => {
+
+							// Any other value: put the shade in the red channel only.
+							color.assign( vec3( subtracter.oneMinus(), 0.0, 0.0 ) );
+
+						} );
+
+					} );
+
+					return color;
+
+				};
+
+				const displayNodes = leftSideDisplay ? leftDisplayColorNodes : rightDisplayColorNodes;
+				displayNodes[ 'Input Grid' ] = Fn( () => {
+
+					const { gridElementWidth, gridElementHeight, gridDisplayWidth, gridDisplayHeight } = unifiedEffectController;
+
+					const newUV = uv().mul( vec2( gridDisplayWidth, gridDisplayHeight ) );
+
+					const pixel = uvec2( uint( floor( newUV.x ) ), uint( floor( newUV.y ) ) );
+
+					const elementIndex = uint( gridDisplayWidth ).mul( pixel.y ).add( pixel.x );
+
+					const colorChanger = uint( 0 ).toVar();
+					const color = vec3( 0 ).toVar( 'color' );
+
+					colorChanger.assign( inputStorage.element( elementIndex ) );
+					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );
+
+					return color;
+
+				} )();
+
+				displayNodes[ 'Input Log2' ] = Fn( () => {
+
+					const { gridElementWidth, gridElementHeight } = unifiedEffectController;
+
+					const newUV = uv().mul( vec2( Math.log2( size ) ), 1 );
+
+					const colorChanger = uint( 0 ).toVar();
+					const color = vec3( 0 ).toVar( 'color' );
+			
+					colorChanger.assign( inputStorage.element( uint( 1 ).shiftLeft( newUV.x ) ) );
+					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );
+
+					return color;
+
+				} )();
+
+				displayNodes[ 'Input Element 0' ] = Fn( () => {
+
+					const { gridElementWidth, gridElementHeight } = unifiedEffectController;
+
+					const colorChanger = uint( 0 ).toVar();
+					const color = vec3( 0 ).toVar( 'color' );
+
+					// Clamp display of single element to shade where green is still readable
+					colorChanger.assign( clamp( inputStorage.element( 0 ), 0, size / 2 ) );
+					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );
+					return color;
+			
+				} )();
+
+				displayNodes[ 'Workgroup Sum Grid' ] = Fn( () => {
+
+					const width = uint( 8 );
+					const height = uint( 16 );
+
+					const newUV = uv().mul( vec2( width, height ) );
+
+					const pixel = uvec2( uint( floor( newUV.x ) ), uint( floor( newUV.y ) ) );
+
+					const elementIndex = uint( width ).mul( pixel.y ).add( pixel.x );
+
+					const colorChanger = uint( 0 ).toVar();
+					const color = vec3( 0 ).toVar( 'color' );
+
+					colorChanger.assign( workgroupSumsStorage.element( elementIndex ) );
+					color.assign( getColor( inputStorage, colorChanger, width, height ) );
+
+					return color;
+
+				} )();
+
+				( leftSideDisplay ? leftMaterial : rightMaterial ).colorNode = displayNodes[ effectController.displayMode ];
+				( leftSideDisplay ? leftMaterial : rightMaterial ).needsUpdate = true;
+
+				const plane = new THREE.Mesh( new THREE.PlaneGeometry( 1, 1 ), ( leftSideDisplay ? leftMaterial : rightMaterial ) );
+				scene.add( plane );
+
+				const animate = () => {
+
+					renderer.render( scene, camera );
+
+				};
+
+				renderer.setAnimationLoop( animate );
+
+				document.body.appendChild( renderer.domElement );
+				renderer.domElement.style.position = 'absolute';
+				renderer.domElement.style.top = '0';
+				renderer.domElement.style.left = '0';
+				renderer.domElement.style.width = '50%';
+				renderer.domElement.style.height = '100%';
+
+				if ( ! leftSideDisplay ) {
+
+					renderer.domElement.style.left = '50%';
+
+					scene.background = new THREE.Color( 0x212121 );
+
+				} else {
+
+					scene.background = new THREE.Color( 0x313131 );
+
+				}
+
+				renderer.info.autoReset = false;
+
+				const stepAnimation = async function () {
+
+					const currentAlgorithm = effectController.algo;
+					const state = effectController.state;
+					const stateController = leftSideDisplay ? stateLeftController : stateRightController;
+
+					if ( state === 'Reset' ) {
+
+						renderer.computeAsync( computeResetBuffer );
+						renderer.computeAsync( computeResetWorkgroupSums );
+
+					} else if ( state === 'Run Algo' ) {
+
+						renderer.info.reset();
+
+						const cpuTime = 0;
+
+						switch ( currentAlgorithm ) {
+			
+							case 'Reduce 0 (N/2)': {
+
+								let m = size / 2;
+
+								for ( let i = 0; i < reduce0Calls.length; i ++ ) {
+
+									effectController.numThreadsDispatched.value = m;
+
+									const reduce0 = reduce0Calls[ i ];
+									// Do a reduction step
+									renderer.computeAsync( reduce0 );
+									renderer.resolveTimestampsAsync( THREE.TimestampQuery.COMPUTE );
+
+									m /= 2;
+
+								}
+
+
+								break;
+
+							}
+
+							default: {
+
+								const currentAlgoCalls = calls[ currentAlgorithm ];
+
+								for ( let i = 0; i < currentAlgoCalls.length; i ++ ) {
+
+									renderer.computeAsync( currentAlgoCalls[ i ] );
+									renderer.resolveTimestampsAsync( THREE.TimestampQuery.COMPUTE );
+
+								}
+
+								break;
+
+							}
+
+						}
+
+						// DEBUG: const reductionResult = new Uint32Array( await renderer.getArrayBufferAsync( currentBuffer ) )[0];
+
+						let passInfoString = '';
+
+						if ( effectController.algo.substring( 0, 3 ) === 'CPU' ) {
+
+							passInfoString = `Ran in ${cpuTime}ms<br>`;
+
+						} else {
+
+							passInfoString = `${renderer.info.compute.frameCalls} pass in ${renderer.info.compute.timestamp.toFixed( 6 )}ms<br>`;
+
+						}
+
+			
+
+
+						timestamps[ leftSideDisplay ? 'left_side_display' : 'right_side_display' ].innerHTML = `
+
+							Compute ${effectController.algo}: ${passInfoString}`;
+
+					}
+
+					renderer.render( scene, camera );
+					renderer.resolveTimestampsAsync( THREE.TimestampQuery.RENDER );
+
+					// Validate next state
+
+					if ( state === 'Run Algo' ) {
+
+						stateController.setValue( 'Validate' );
+
+						effectController.highlight.value = 1;
+
+					} else if ( state === 'Validate' ) {
+
+						stateController.setValue( 'Reset' );
+
+						effectController.highlight.value = 0;
+
+					} else if ( state === 'Reset' ) {
+
+						stateController.setValue( 'Run Algo' );
+
+					}
+
+					setTimeout( stepAnimation, 1000 );
+
+				};
+
+
+				window.addEventListener( 'resize', onWindowResize );
+
+				function onWindowResize() {
+
+					renderer.setSize( window.innerWidth / 2, window.innerHeight );
+
+					const aspect = ( window.innerWidth / 2 ) / window.innerHeight;
+
+					const frustumHeight = camera.top - camera.bottom;
+
+					camera.left = - frustumHeight * aspect / 2;
+					camera.right = frustumHeight * aspect / 2;
+
+					camera.updateProjectionMatrix();
+
+					renderer.render( scene, camera );
+
+				}
+
+				setTimeout( stepAnimation, 1000 );
+
+			}
+
+		</script>
+	</body>
+</html>

+ 21 - 0
src/Three.TSL.js

@@ -506,9 +506,30 @@ export const storageTexture = TSL.storageTexture;
 export const string = TSL.string;
 export const struct = TSL.struct;
 export const sub = TSL.sub;
+export const subgroupAdd = TSL.subgroupAdd;
+export const subgroupAll = TSL.subgroupAll;
+export const subgroupAnd = TSL.subgroupAnd;
+export const subgroupAny = TSL.subgroupAny;
+export const subgroupBallot = TSL.subgroupBallot;
+export const subgroupBroadcast = TSL.subgroupBroadcast;
+export const subgroupBroadcastFirst = TSL.subgroupBroadcastFirst;
 export const subBuild = TSL.subBuild;
+export const subgroupElect = TSL.subgroupElect;
+export const subgroupExclusiveAdd = TSL.subgroupExclusiveAdd;
+export const subgroupExclusiveMul = TSL.subgroupExclusiveMul;
+export const subgroupInclusiveAdd = TSL.subgroupInclusiveAdd;
+export const subgroupInclusiveMul = TSL.subgroupInclusiveMul;
 export const subgroupIndex = TSL.subgroupIndex;
+export const subgroupMax = TSL.subgroupMax;
+export const subgroupMin = TSL.subgroupMin;
+export const subgroupMul = TSL.subgroupMul;
+export const subgroupOr = TSL.subgroupOr;
+export const subgroupShuffle = TSL.subgroupShuffle;
+export const subgroupShuffleDown = TSL.subgroupShuffleDown;
+export const subgroupShuffleUp = TSL.subgroupShuffleUp;
+export const subgroupShuffleXor = TSL.subgroupShuffleXor;
 export const subgroupSize = TSL.subgroupSize;
+export const subgroupXor = TSL.subgroupXor;
 export const tan = TSL.tan;
 export const tangentGeometry = TSL.tangentGeometry;
 export const tangentLocal = TSL.tangentLocal;

+ 1 - 0
src/nodes/TSL.js

@@ -129,6 +129,7 @@ export * from './gpgpu/ComputeBuiltinNode.js';
 export * from './gpgpu/BarrierNode.js';
 export * from './gpgpu/WorkgroupInfoNode.js';
 export * from './gpgpu/AtomicFunctionNode.js';
+export * from './gpgpu/SubgroupFunctionNode.js';
 
 // lighting
 export * from './accessors/Lights.js';

+ 430 - 0
src/nodes/gpgpu/SubgroupFunctionNode.js

@@ -0,0 +1,430 @@
+import TempNode from '../core/TempNode.js';
+import { nodeProxyIntent } from '../tsl/TSLCore.js';
+
+
+/**
+ * This class represents a set of built in WGSL shader functions that sync
+ * synchronously execute an operation across a subgroup, or 'warp', of compute
+ * or fragment shader invocations within a workgroup. Typically, these functions
+ * will synchronously execute an operation using data from all active invocations
+ * within the subgroup, then broadcast that result to all active invocations. In
+ * other graphics APIs, subgroup functions are also referred to as wave intrinsics
+ * (DirectX/HLSL) or warp intrinsics (CUDA).
+ *
+ * @augments TempNode
+ */
+class SubgroupFunctionNode extends TempNode {
+
+	static get type() {
+
+		return 'SubgroupFunctionNode';
+
+	}
+
+	/**
+	 * Constructs a new function node.
+	 *
+	 * @param {string} method - The subgroup/wave intrinsic method to construct.
+	 * @param {Node} [aNode=null] - The method's first argument.
+	 * @param {Node} [bNode=null] - The method's second argument.
+	 */
+	constructor( method, aNode = null, bNode = null ) {
+
+		super();
+
+		/**
+		 * The subgroup/wave intrinsic method to construct.
+		 *
+		 * @type {String}
+		 */
+		this.method = method;
+
+		/**
+		 * The method's first argument.
+		 *
+		 * @type {Node}
+		 */
+		this.aNode = aNode;
+
+		/**
+		 * The method's second argument.
+		 *
+		 * @type {Node}
+		 */
+		this.bNode = bNode;
+
+	}
+
+	getInputType( builder ) {
+
+		const aType = this.aNode ? this.aNode.getNodeType( builder ) : null;
+		const bType = this.bNode ? this.bNode.getNodeType( builder ) : null;
+
+		const aLen = builder.isMatrix( aType ) ? 0 : builder.getTypeLength( aType );
+		const bLen = builder.isMatrix( bType ) ? 0 : builder.getTypeLength( bType );
+
+		if ( aLen > bLen ) {
+
+			return aType;
+
+		} else {
+
+			return bType;
+
+		}
+
+	}
+
+	getNodeType( builder ) {
+
+		const method = this.method;
+
+		if ( method === SubgroupFunctionNode.SUBGROUP_ELECT ) {
+
+			return 'bool';
+
+		} else if ( method === SubgroupFunctionNode.SUBGROUP_BALLOT ) {
+
+			return 'uvec4';
+
+		} else {
+
+			return this.getInputType( builder );
+
+		}
+
+	}
+
+	generate( builder, output ) {
+
+		const method = this.method;
+
+		const type = this.getNodeType( builder );
+		const inputType = this.getInputType( builder );
+
+		const a = this.aNode;
+		const b = this.bNode;
+
+		const params = [];
+
+		if (
+			method === SubgroupFunctionNode.SUBGROUP_BROADCAST ||
+			method === SubgroupFunctionNode.SUBGROUP_SHUFFLE ||
+			method === SubgroupFunctionNode.QUAD_BROADCAST
+		) {
+
+			const bType = b.getNodeType( builder );
+
+			params.push(
+				a.build( builder, type ),
+				b.build( builder, bType === 'float' ? 'int' : type )
+			);
+
+		} else if (
+			method === SubgroupFunctionNode.SUBGROUP_SHUFFLE_XOR ||
+			method === SubgroupFunctionNode.SUBGROUP_SHUFFLE_DOWN ||
+			method === SubgroupFunctionNode.SUBGROUP_SHUFFLE_UP
+		) {
+
+			params.push(
+				a.build( builder, type ),
+				b.build( builder, 'uint' )
+			);
+
+		} else {
+
+			if ( a !== null ) params.push( a.build( builder, inputType ) );
+			if ( b !== null ) params.push( b.build( builder, inputType ) );
+
+		}
+
+		const paramsString = params.length === 0 ? '()' : `( ${params.join( ', ' )} )`;
+
+		return builder.format( `${ builder.getMethod( method, type ) }${paramsString}`, type, output );
+
+
+
+	}
+
+	serialize( data ) {
+
+		super.serialize( data );
+
+		data.method = this.method;
+
+	}
+
+	deserialize( data ) {
+
+		super.deserialize( data );
+
+		this.method = data.method;
+
+	}
+
+}
+
+// 0 inputs
+SubgroupFunctionNode.SUBGROUP_ELECT = 'subgroupElect';
+
+// 1 input
+SubgroupFunctionNode.SUBGROUP_BALLOT = 'subgroupBallot';
+SubgroupFunctionNode.SUBGROUP_ADD = 'subgroupAdd';
+SubgroupFunctionNode.SUBGROUP_INCLUSIVE_ADD = 'subgroupInclusiveAdd';
+SubgroupFunctionNode.SUBGROUP_EXCLUSIVE_AND = 'subgroupExclusiveAdd';
+SubgroupFunctionNode.SUBGROUP_MUL = 'subgroupMul';
+SubgroupFunctionNode.SUBGROUP_INCLUSIVE_MUL = 'subgroupInclusiveMul';
+SubgroupFunctionNode.SUBGROUP_EXCLUSIVE_MUL = 'subgroupExclusiveMul';
+SubgroupFunctionNode.SUBGROUP_AND = 'subgroupAnd';
+SubgroupFunctionNode.SUBGROUP_OR = 'subgroupOr';
+SubgroupFunctionNode.SUBGROUP_XOR = 'subgroupXor';
+SubgroupFunctionNode.SUBGROUP_MIN = 'subgroupMin';
+SubgroupFunctionNode.SUBGROUP_MAX = 'subgroupMax';
+SubgroupFunctionNode.SUBGROUP_ALL = 'subgroupAll';
+SubgroupFunctionNode.SUBGROUP_ANY = 'subgroupAny';
+SubgroupFunctionNode.SUBGROUP_BROADCAST_FIRST = 'subgroupBroadcastFirst';
+SubgroupFunctionNode.QUAD_SWAP_X = 'quadSwapX';
+SubgroupFunctionNode.QUAD_SWAP_Y = 'quadSwapY';
+SubgroupFunctionNode.QUAD_SWAP_DIAGONAL = 'quadSwapDiagonal';
+
+// 2 inputs
+SubgroupFunctionNode.SUBGROUP_BROADCAST = 'subgroupBroadcast';
+SubgroupFunctionNode.SUBGROUP_SHUFFLE = 'subgroupShuffle';
+SubgroupFunctionNode.SUBGROUP_SHUFFLE_XOR = 'subgroupShuffleXor';
+SubgroupFunctionNode.SUBGROUP_SHUFFLE_UP = 'subgroupShuffleUp';
+SubgroupFunctionNode.SUBGROUP_SHUFFLE_DOWN = 'subgroupShuffleDown';
+SubgroupFunctionNode.QUAD_BROADCAST = 'quadBroadcast';
+
+export default SubgroupFunctionNode;
+
+
+
+/**
+ * Returns true if this invocation has the lowest subgroup_invocation_id
+ * among active invocations in the subgroup.
+ *
+ * @method
+ * @return {bool} The result of the computation.
+ */
+export const subgroupElect = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_ELECT ).setParameterLength( 0 );
+
+/**
+ * Returns a set of bitfields where the bit corresponding to subgroup_invocation_id
+ * is 1 if pred is true for that active invocation and 0 otherwise.
+ *
+ * @method
+ * @param {bool} pred - A boolean that sets the bit corresponding to the invocations subgroup invocation id.
+ * @return {vec4<u32>}- A bitfield corresponding to the pred value of each subgroup invocation.
+ */
+export const subgroupBallot = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_BALLOT ).setParameterLength( 1 );
+
+/**
+ * A reduction that adds e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The accumulated result of the reduction operation.
+ */
+export const subgroupAdd = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_ADD ).setParameterLength( 1 );
+
+/**
+ * An inclusive scan returning the sum of e for all active invocations with subgroup_invocation_id less than or equal to this invocation.
+ *
+ * @method
+ * @param {number} e - The value provided to the inclusive scan by the current invocation.
+ * @return {number} The accumulated result of the inclusive scan operation.
+ */
+export const subgroupInclusiveAdd = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_INCLUSIVE_ADD ).setParameterLength( 1 );
+
+/**
+ * An exclusive scan that returns the sum of e for all active invocations with subgroup_invocation_id less than this invocation.
+ *
+ * @method
+ * @param {number} e - The value provided to the exclusive scan by the current invocation.
+ * @return {number} The accumulated result of the exclusive scan operation.
+ */
+export const subgroupExclusiveAdd = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_EXCLUSIVE_AND ).setParameterLength( 1 );
+
+/**
+ * A reduction that multiplies e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The accumulated result of the reduction operation.
+ */
+export const subgroupMul = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_MUL ).setParameterLength( 1 );
+
+/**
+ * An inclusive scan returning the product of e for all active invocations with subgroup_invocation_id less than or equal to this invocation.
+ *
+ * @method
+ * @param {number} e - The value provided to the inclusive scan by the current invocation.
+ * @return {number} The accumulated result of the inclusive scan operation.
+ */
+export const subgroupInclusiveMul = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_INCLUSIVE_MUL ).setParameterLength( 1 );
+
+/**
+ * An exclusive scan that returns the product of e for all active invocations with subgroup_invocation_id less than this invocation.
+ *
+ * @method
+ * @param {number} e - The value provided to the exclusive scan by the current invocation.
+ * @return {number} The accumulated result of the exclusive scan operation.
+ */
+export const subgroupExclusiveMul = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_EXCLUSIVE_MUL ).setParameterLength( 1 );
+
+/**
+ * A reduction that performs a bitwise and of e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The result of the reduction operation.
+ */
+export const subgroupAnd = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_AND ).setParameterLength( 1 );
+
+/**
+ * A reduction that performs a bitwise or of e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The result of the reduction operation.
+ */
+export const subgroupOr = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_OR ).setParameterLength( 1 );
+
+/**
+ * A reduction that performs a bitwise xor of e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The result of the reduction operation.
+ */
+export const subgroupXor = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_XOR ).setParameterLength( 1 );
+
+/**
+ * A reduction that performs a min of e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The result of the reduction operation.
+ */
+export const subgroupMin = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_MIN ).setParameterLength( 1 );
+
+/**
+ * A reduction that performs a max of e among all active invocations and returns that result.
+ *
+ * @method
+ * @param {number} e - The value provided to the reduction by the current invocation.
+ * @return {number} The result of the reduction operation.
+ */
+export const subgroupMax = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_MAX ).setParameterLength( 1 );
+
+/**
+ * Returns true if e is true for all active invocations in the subgroup.
+ *
+ * @method
+ * @return {bool} The result of the computation.
+ */
+export const subgroupAll = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_ALL ).setParameterLength( 0 );
+
+/**
+ * Returns true if e is true for any active invocation in the subgroup
+ *
+ * @method
+ * @return {bool} The result of the computation.
+ */
+export const subgroupAny = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_ANY ).setParameterLength( 0 );
+
+/**
+ * Broadcasts e from the active invocation with the lowest subgroup_invocation_id in the subgroup to all other active invocations.
+ *
+ * @method
+ * @param {number} e - The value to broadcast from the lowest subgroup invocation.
+ * @param {number} id - The subgroup invocation to broadcast from.
+ * @return {number} The broadcast value.
+ */
+export const subgroupBroadcastFirst = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_BROADCAST_FIRST ).setParameterLength( 2 );
+
+/**
+ * Swaps e between invocations in the quad in the X direction.
+ *
+ * @method
+ * @param {number} e - The value to swap from the current invocation.
+ * @return {number} The value received from the swap operation.
+ */
+export const quadSwapX = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.QUAD_SWAP_X ).setParameterLength( 1 );
+
+/**
+ * Swaps e between invocations in the quad in the Y direction.
+ *
+ * @method
+ * @param {number} e - The value to swap from the current invocation.
+ * @return {number} The value received from the swap operation.
+ */
+export const quadSwapY = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.QUAD_SWAP_Y ).setParameterLength( 1 );
+
+/**
+ * Swaps e between invocations in the quad diagonally.
+ *
+ * @method
+ * @param {number} e - The value to swap from the current invocation.
+ * @return {number} The value received from the swap operation.
+ */
+export const quadSwapDiagonal = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.QUAD_SWAP_DIAGONAL ).setParameterLength( 1 );
+
+/**
+ * Broadcasts e from the invocation whose subgroup_invocation_id matches id, to all active invocations.
+ *
+ * @method
+ * @param {number} e - The value to broadcast from subgroup invocation 'id'.
+ * @param {number} id - The subgroup invocation to broadcast from.
+ * @return {number} The broadcast value.
+ */
+export const subgroupBroadcast = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_BROADCAST ).setParameterLength( 2 );
+
+/**
+ * Returns v from the active invocation whose subgroup_invocation_id matches id.
+ *
+ * @method
+ * @param {number} v - The value provided by the current invocation.
+ * @param {number} id - The subgroup invocation whose value of v is returned.
+ * @return {number} The value of v held by invocation 'id'.
+ */
+export const subgroupShuffle = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_SHUFFLE ).setParameterLength( 2 );
+
+/**
+ * Returns v from the active invocation whose subgroup_invocation_id matches subgroup_invocation_id ^ mask.
+ *
+ * @method
+ * @param {number} v - The value to return from subgroup invocation id^mask.
+ * @param {number} mask - A bitmask that determines the target invocation via a XOR operation.
+ * @return {number} The broadcast value.
+ */
+export const subgroupShuffleXor = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_SHUFFLE_XOR ).setParameterLength( 2 );
+
+/**
+ * Returns v from the active invocation whose subgroup_invocation_id matches subgroup_invocation_id - delta.
+ *
+ * @method
+ * @param {number} v - The value provided by the current invocation.
+ * @param {number} delta - The offset subtracted from the current subgroup_invocation_id to select the source invocation.
+ * @return {number} The value of v held by the selected invocation.
+ */
+export const subgroupShuffleUp = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_SHUFFLE_UP ).setParameterLength( 2 );
+
+/**
+ * Returns v from the active invocation whose subgroup_invocation_id matches subgroup_invocation_id + delta.
+ *
+ * @method
+ * @param {number} v - The value provided by the current invocation.
+ * @param {number} delta - The offset added to the current subgroup_invocation_id to select the source invocation.
+ * @return {number} The value of v held by the selected invocation.
+ */
+export const subgroupShuffleDown = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.SUBGROUP_SHUFFLE_DOWN ).setParameterLength( 2 );
+
+/**
+ * Broadcasts e from the quad invocation with id equal to id.
+ *
+ * @method
+ * @param {number} e - The value to broadcast.
+ * @return {number} The broadcast value.
+ */
+export const quadBroadcast = /*@__PURE__*/ nodeProxyIntent( SubgroupFunctionNode, SubgroupFunctionNode.QUAD_BROADCAST ).setParameterLength( 1 );

+ 1 - 0
test/e2e/puppeteer.js

@@ -140,6 +140,7 @@ const exceptionList = [
 	// Awaiting for WebGPU Backend support in Puppeteer
 	'webgpu_storage_buffer',
 	'webgpu_compute_sort_bitonic',
+	'webgpu_compute_reduce',
 	'webgpu_struct_drawindirect',
 
 	// WebGPURenderer: Unknown problem

粤ICP备19079148号