samien
/
three.js
mirror of https://github.com/mrdoob/three.js.git


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275
127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383
							<html lang="en">
	<head>
		<title>three.js webgpu - compute reduction</title>
		<meta charset="utf-8">
		<meta name="viewport" content="width=device-width, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
		<link type="text/css" rel="stylesheet" href="main.css">
	</head>
	<body>

	<style>

		#reduction-panel {
			background-color: #111;
			width: 100%;
			display: flex;
			position: fixed;
			height: auto;
			bottom: 0px;
			z-index: 99;
			flex-direction: column;
			justify-content: center;
			align-items: center;
			border-left: 2px solid #222;
			text-align: center;
		}

		#panel-title {
			width: fit-content;
		}

		.thread-row {
			display: flex;
			flex-direction: row;
			align-items: center;
			margin: 4px 0;
			position: relative;
		}

		.thread {
			width: 16px;
			height: 16px;
			background-color: #444;
			margin-right: 2px;
			transition: background-color 0.5s, transform 0.5s;
		}

		.stage-display {
			display: flex;
			flex-direction: column;
			justify-content: center;
			margin-bottom: 5px;
		}

		/* Caption text shown at the top of a reduction stage. */
		.stage-label {
			font-size: 1.2em;
			color: #aaa;
			/* `bold` is a font-weight value; `font-style: bold` is invalid and was ignored. */
			font-weight: bold;
			margin-top: 6px;
			margin-bottom: 20px;
		}

		.thread {
			display: flex;
			justify-content: center;
			align-items: center;
			width: 40px;
			height: 40px;
			margin: 2px;
			border: 1px solid rgba(255, 255, 255, 0.2);
			border-radius: 4px;
			background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(0,0,0,0.2));
			box-shadow: inset 0 0 2px rgba(255,255,255,0.1);
			font-family: monospace;
			color: white;
		}

		.thread_data {
			display: block;
			max-width: 100%;
			padding: 0 2px;
			white-space: nowrap;
			overflow: hidden;
			text-overflow: ellipsis;
			font-size: clamp(8px, 2vw, 14px);
			text-align: center;
		}

		.subgroup {
			display: flex;
			position: relative;
			margin-left: 10px;
			margin-right: 10px;
		}

		.subgroup::before {
			/* label text for each subgroup label */
			content: "subgroupAdd()";
			position: absolute;
			top: -20px;
			/* Hide until animation is displayed */
			opacity: 0;
			z-index: 100;
			transition: opacity 0.5s ease;
			font-weight: bold;
			color: white;
			width: 100%;
		}

		.subgroup::after {
			content: attr(data-label);
			position: absolute;
			bottom: -20px;
			opacity: 1;
			z-index: 100;
			color: gray;
			width: 100%;
		}

		.reduction-stage {
			margin-bottom: 20px;
		}

		@keyframes labelAbsorb {
			0% {
				opacity: 0;
				transform: translateY(-50%);
			}
			40% {
				opacity: 1;
				transform: translateY(0%);
			}
			60% {
				opacity: 1;
				transform: translateY(0%);
			}
			80% {
				opacity: 1;
				transform: translate(0%, -20%);
			}
			100% {
				opacity: 0;
				transform: translate(0%, 100%);
			}
		}

		.subgroup.anim::before {
			opacity: 0;
			animation-name: labelAbsorb;
			animation-duration: 1.5s;
			transition:
			transform 0.6s ease-out,
			opacity 0.3s ease-in 0.3s;
		}

	</style>

		<div id="info">
			<a href="https://threejs.org" target="_blank" rel="noopener">three.js</a>
			<br /> This example demonstrates the performance of various simple parallel reduction kernels.
			<br /> Reference implementations are translated from the CUDA/WGSL code present in the following books/repos:
			<br /> Impl. 0 - 2: <a href="https://www.cambridge.org/core/books/programming-in-parallel-with-cuda/C43652A69033C25AD6933368CDBE084C"><i>Programming in Parallel with CUDA</i></a> by <a href="https://people.bss.phy.cam.ac.uk/~rea1/">Richard Ansorge</a>
			<br /> Impl. 3: <a href="https://github.com/frost-beta/betann/blob/main/betann/wgsl/reduce_all.wgsl"><i>betann reduce_all kernel</i></a> by <a href="https://github.com/zcbenz">zcbenz</a>
			<br /> Impl. 4: <a href="https://github.com/b0nes164/GPUPrefixSums/blob/main/GPUPrefixSumsWebGPUapis/SharedShaders/rts.wgsl"><i>GPUPrefixSums reduction approach</i></a> by <a href="https://github.com/b0nes164">b0nes164</a>
			<div id="left_side_display" style="position: absolute;top: 150px;left: 0;padding: 10px;background: rgba( 0, 0, 0, 0.5 );color: #fff;font-family: monospace;font-size: 12px;line-height: 1.5;pointer-events: none;text-align: left;"></div>
			<div id="right_side_display" style="position: absolute;top: 150px;right: 0;padding: 10px;background: rgba( 0, 0, 0, 0.5 );color: #fff;font-family: monospace;font-size: 12px;line-height: 1.5;pointer-events: none;text-align: left;"></div>
		</div>

		<div id="reduction-panel">
			<h3 id="panel-title" style="flex: 0 0 auto;">Subgroup Reduction Explanation</h3>
			<div class="reduction-stage" id="subgroup-reduction-stage">
    		<div class="stage-label">Use subgroupAdd() to capture reduction of each workgroup's subgroups (Hover for animation)</div>
				<div class="stage-display">
					<div id="workgroup_threads" style="display: flex; justify-content: center; margin-bottom: 20px;"></div>
					<div id="subgroup_reduction" style="display: flex; justify-content: center; margin-bottom: 5px;"></div>
				</div>
			</div>
    </div>

		<script type="importmap">
			{
				"imports": {
					"three": "../build/three.webgpu.js",
					"three/webgpu": "../build/three.webgpu.js",
					"three/tsl": "../build/three.tsl.js",
					"three/addons/": "./jsm/"
				}
			}
		</script>

		<script type="module">

			import * as THREE from 'three/webgpu';
			import { instancedArray, Loop, If, vec3, dot, clamp, storage, uvec4, subgroupAdd, uniform, uv, uint, float, Fn, vec2, invocationLocalIndex, invocationSubgroupIndex, uvec2, floor, instanceIndex, workgroupId, workgroupBarrier, workgroupArray, subgroupSize, select, log2 } from 'three/tsl';

			import WebGPU from 'three/addons/capabilities/WebGPU.js';

			import { GUI } from 'three/addons/libs/lil-gui.module.min.js';

			// Handles to the two overlay <div>s declared inside #info, keyed by their element ids.
			const timestamps = {
				left_side_display: document.getElementById( 'left_side_display' ),
				right_side_display: document.getElementById( 'right_side_display' )
			};

			/**
			 * Ceiling division: the number of chunks of `part_size` needed to hold `size` items.
			 *
			 * @param {number} size - Total item count.
			 * @param {number} part_size - Capacity of one chunk.
			 * @return {number} `size / part_size`, rounded up.
			 */
			const divRoundUp = ( size, part_size ) => {

				// Pad the numerator so any remainder pushes the floored quotient up by one.
				const padded = size + part_size - 1;

				return Math.floor( padded / part_size );

			};

			// Sizes used only by the explanatory HTML panel (thread cells per subgroup / per workgroup).
			const cssSubgroupSize = 4;
			const cssWorkgroupSize = 16;

			// Containers (declared in #reduction-panel) that receive the generated thread cells.
			const workgroupThreadsContainer = document.getElementById( 'workgroup_threads' );
			const subgroupReductionContainer = document.getElementById( 'subgroup_reduction' );

			// Append the panel's sizes to its heading text.
			document.getElementById( 'panel-title' ).textContent += ` (Subgroup Size: ${cssSubgroupSize}, Workgroup Size: ${cssWorkgroupSize})`;

			/**
			 * Builds one thread cell: a `div.thread` wrapping a `span.thread_data` that shows `data`.
			 *
			 * @param {*} data - Value displayed inside the cell.
			 * @return {HTMLDivElement} The detached cell element.
			 */
			const createThreadWithData = ( data ) => {

				const cell = document.createElement( 'div' );
				const label = document.createElement( 'span' );

				cell.className = 'thread';
				label.className = 'thread_data';

				// textContent keeps the value as plain text (no HTML parsing)
				label.textContent = data;

				cell.append( label );

				return cell;

			};

			// Populate the explanation panel: one cell per thread, grouped into subgroup
			// wrappers, plus one "reduction result" cell per subgroup.
			const workgroupThreads = [];
			const initialSubgroups = [];
			const initialData = [];
			let currentSubgroupDiv = null;

			for ( let threadIdx = 0; threadIdx < cssWorkgroupSize; threadIdx ++ ) {

				const startsNewSubgroup = threadIdx % cssSubgroupSize === 0;

				if ( startsNewSubgroup ) {

					const subgroupIdx = Math.floor( threadIdx / cssSubgroupSize );
					const firstThread = subgroupIdx * cssSubgroupSize;
					const lastThread = ( subgroupIdx + 1 ) * cssSubgroupSize - 1;

					// Result cell for this subgroup, starts out at 0
					const reductionCell = createThreadWithData( 0 );
					reductionCell.id = `subgroup_reduction_element_${subgroupIdx}`;
					subgroupReductionContainer.appendChild( reductionCell );

					// Wrapper that holds this subgroup's thread cells; data-label feeds the CSS ::after caption
					currentSubgroupDiv = document.createElement( 'div' );
					currentSubgroupDiv.className = 'subgroup';
					currentSubgroupDiv.setAttribute( 'data-label', `Threads ${firstThread}-${lastThread}` );
					initialSubgroups.push( currentSubgroupDiv );
					workgroupThreadsContainer.appendChild( currentSubgroupDiv );

				}

				// Random integer in [1, 9]
				const value = Math.floor( Math.random() * 9 ) + 1;
				const threadCell = createThreadWithData( value );

				initialData.push( value );
				workgroupThreads.push( threadCell );
				currentSubgroupDiv.appendChild( threadCell );

			}

			// Timer ids queued by activateLabelAnimation, keyed by subgroup element, so that a
			// hover which ends (or restarts) early can cancel updates that have not fired yet.
			const pendingLabelTimers = new Map();

			/**
			 * Cancels any queued animation callbacks for the given subgroup element.
			 *
			 * @param {Element} subgroupDiv - Subgroup wrapper whose timers should be dropped.
			 */
			const clearLabelTimers = ( subgroupDiv ) => {

				const timers = pendingLabelTimers.get( subgroupDiv );

				if ( timers === undefined ) return;

				for ( const timerId of timers ) {

					clearTimeout( timerId );

				}

				pendingLabelTimers.delete( subgroupDiv );

			};

			/**
			 * Stops the label animation of one subgroup and resets its reduction cell to 0.
			 *
			 * @param {Element} subgroupDiv - Subgroup wrapper element.
			 * @param {number} idx - Index of the subgroup (selects `subgroup_reduction_element_${idx}`).
			 */
			const deactivateLabelAnimation = ( subgroupDiv, idx ) => {

				// Without this, a callback queued by a short hover would still fire and
				// overwrite the values restored on mouseleave.
				clearLabelTimers( subgroupDiv );

				subgroupDiv.classList.remove( 'anim' );

				const subgroupReductionBufferElement = document.getElementById( `subgroup_reduction_element_${idx}` ).querySelector( '.thread_data' );

				subgroupReductionBufferElement.textContent = 0;

			};

			/**
			 * Plays the label animation for one subgroup: after 1s every thread cell of the
			 * subgroup and the subgroup's reduction cell show the subgroup's sum; after 1.5s
			 * the `anim` class is removed again.
			 *
			 * @param {Element} subgroupDiv - Subgroup wrapper element; its children are the thread cells.
			 * @param {number} idx - Index of the subgroup within the workgroup.
			 */
			const activateLabelAnimation = ( subgroupDiv, idx ) => {

				// Restarting the animation must not leave callbacks from a previous run behind
				clearLabelTimers( subgroupDiv );

				const threads = Array.from( subgroupDiv.children );
				const firstThread = idx * cssSubgroupSize;
				let total = 0;

				for ( let i = firstThread; i < firstThread + cssSubgroupSize; i ++ ) {

					total += initialData[ i ];

				}

				subgroupDiv.classList.add( 'anim' );

				const revealTimer = setTimeout( () => {

					threads.forEach( t => {

						t.querySelector( '.thread_data' ).textContent = total;

					} );

					const subgroupReductionBufferElement = document.getElementById( `subgroup_reduction_element_${idx}` ).querySelector( '.thread_data' );

					subgroupReductionBufferElement.textContent = total;

				}, 1000 );

				// Remove the class after the animation ends so it can be triggered again
				const resetTimer = setTimeout( () => {

					subgroupDiv.classList.remove( 'anim' );
					pendingLabelTimers.delete( subgroupDiv );

				}, 1500 ); // matches animation duration in CSS

				pendingLabelTimers.set( subgroupDiv, [ revealTimer, resetTimer ] );

			};

			{

				// Both hover handlers are attached to the stage wrapper element.
				const stage = document.getElementById( 'subgroup-reduction-stage' );

				// Pointer enters the stage: play the animation on every subgroup
				stage.addEventListener( 'mouseenter', () => {

					for ( const [ idx, subgroupDiv ] of initialSubgroups.entries() ) {

						activateLabelAnimation( subgroupDiv, idx );

					}

				} );

				// Pointer leaves the stage: stop the animations and restore each thread's initial value
				stage.addEventListener( 'mouseleave', () => {

					for ( const [ idx, subgroupDiv ] of initialSubgroups.entries() ) {

						deactivateLabelAnimation( subgroupDiv, idx );

					}

					for ( const [ idx, thread ] of workgroupThreads.entries() ) {

						thread.querySelector( '.thread_data' ).textContent = initialData[ idx ];

					}

				} );

			}


			// Stop here when WebGPU is unavailable: show the helper's error element in the page,
			// then throw so the rest of this module does not run.
			if ( WebGPU.isAvailable() === false ) {

				document.body.appendChild( WebGPU.getErrorMessage() );

				throw new Error( 'No WebGPU support' );

			}

			// Total number of elements to reduce.
			const size = 262144;
			// Number of 4-element groups needed to hold `size` elements (rounded up).
			const vecSize = divRoundUp( size, 4 );
			// The display grid is gridDim x gridDim cells.
			const gridDim = Math.sqrt( size );
			// NOTE(review): declared with `let`, so presumably reassigned later; no reassignment is visible here.
			let maxWorkgroupSize = 64;

			// Selectable algorithms. Per the original author's note, speed increases as you move
			// through the array; the last entry, 'Incorrect Baseline', is not a reduction
			// (createIncorrectBaselineFn writes a constant).
			const algorithms = [
				'Reduce 0 (N/2)',
				'Reduce 1 (Naive Accumulate)',
				'Reduce 2 (Workgroup Reduction)',
				'Reduce 3 (Subgroup Reduce)',
				'Reduce 4 (Subgroup Optimized)',
				'Incorrect Baseline',
			];

			// Input Grid: Displays input data in a grid format
			// Input Log2: Displays the input data at power-of-two indices horizontally (1, 2, 4, 8, 16, ..., size)
			// Input Element 0: Displays clamped input[0]
			// Workgroup Sum Grid: NOTE(review) presumably a grid of per-workgroup sums; its color node is not visible here
			const displayModes = [ 'Input Grid', 'Input Log2', 'Input Element 0', 'Workgroup Sum Grid' ];

			// Holds uniforms shared by both displays, as well as debug information
			const unifiedEffectController = {
				// Number of elements in the grid
				gridElementWidth: uniform( gridDim ),
				gridElementHeight: uniform( gridDim ),
				// Number of elements in the grid being displayed
				gridDisplayWidth: uniform( gridDim ),
				gridDisplayHeight: uniform( gridDim ),
				// How to display the end result of the reduction.
				// Ideally this is unique to the reduction method being deployed
				'Display Mode': 'Input Log2',
				// Which buffer the Debug folder's 'Buffer to Log' option refers to
				loggedBuffer: 'Input Buffer',
				// Shown read-only in the Debug folder as 'Elements Reduced'
				elementsReduced: size,
			};


			// State and uniforms for the left-hand display.
			const leftEffectController = {
				// Current reduction algorithm being executed by this side
				algo: 'Reduce 0 (N/2)',
				// Flag indicating whether to highlight element in validation check
				highlight: uniform( 0 ),
				// Uniform that corresponds to the index of the current algorithm within the algorithms array
				currentAlgo: uniform( 0 ),
				// Current state of reduction (Running, validating, resetting)
				state: 'Run Algo',
				// Current display mode
				displayMode: 'Input Log2',
				// Reduce 0 specific uniform
				numThreadsDispatched: uniform( size / 2 ),
				// NOTE(review): a per-side subgroup-size entry was described here, but no such property is defined
			};

			// State and uniforms for the right-hand display (same keys as leftEffectController).
			const rightEffectController = {
				// Current reduction algorithm being executed by this side
				algo: 'Reduce 4 (Subgroup Optimized)',
				// Index of `algo` within the algorithms array. Derived by lookup so the uniform
				// starts in sync with the label, exactly as the GUI onChange handler keeps it
				// (the previous literal 3 pointed at 'Reduce 3 (Subgroup Reduce)').
				currentAlgo: uniform( algorithms.indexOf( 'Reduce 4 (Subgroup Optimized)' ) ),
				// Flag indicating whether to highlight element in validation check
				highlight: uniform( 0 ),
				// Current display mode
				displayMode: 'Input Element 0',
				// Current state of reduction
				state: 'Run Algo',
				// Reduce 0 specific uniform
				numThreadsDispatched: uniform( size / 2 )
			};

			// One material per side; the per-mode color nodes are looked up from these tables by display-mode name.
			const leftMaterial = new THREE.MeshBasicNodeMaterial( { color: 0x00ff00 } );
			const rightMaterial = new THREE.MeshBasicNodeMaterial( { color: 0x00ff00 } );
			const leftDisplayColorNodes = {};
			const rightDisplayColorNodes = {};

			const gui = new GUI();

			// Algorithm pickers: keep each side's `currentAlgo` uniform equal to the index of the chosen name
			gui.add( leftEffectController, 'algo', algorithms ).onChange( () => {

				const chosen = leftEffectController.algo;
				leftEffectController.currentAlgo.value = algorithms.indexOf( chosen );

			} );

			gui.add( rightEffectController, 'algo', algorithms ).onChange( () => {

				const chosen = rightEffectController.algo;
				rightEffectController.currentAlgo.value = algorithms.indexOf( chosen );

			} );

			// Display-mode pickers: swap the material's color node and flag the material for update
			gui.add( leftEffectController, 'displayMode', displayModes ).name( 'Left Display Mode' ).onChange( () => {

				const mode = leftEffectController.displayMode;
				leftMaterial.colorNode = leftDisplayColorNodes[ mode ];
				leftMaterial.needsUpdate = true;

			} );

			gui.add( rightEffectController, 'displayMode', displayModes ).name( 'Right Display Mode' ).onChange( () => {

				const mode = rightEffectController.displayMode;
				rightMaterial.colorNode = rightDisplayColorNodes[ mode ];
				rightMaterial.needsUpdate = true;

			} );

			// Debug folder: three read-only status fields plus the buffer-to-log selector, collapsed by default
			const debugFolder = gui.addFolder( 'Debug' );

			const elementsReducedController = debugFolder.add( unifiedEffectController, 'elementsReduced' ).name( 'Elements Reduced' );
			const stateLeftController = debugFolder.add( leftEffectController, 'state' ).name( 'Left Display State' );
			const stateRightController = debugFolder.add( rightEffectController, 'state' ).name( 'Right Display State' );

			for ( const readOnlyController of [ elementsReducedController, stateLeftController, stateRightController ] ) {

				readOnlyController.disable();

			}

			debugFolder.add( unifiedEffectController, 'loggedBuffer', [ 'Input Buffer', 'Input Vectorized Buffer', 'Workgroup Sums Buffer', 'Debug Buffer' ] ).name( 'Buffer to Log' );
			debugFolder.close();

			// HELPER FUNCTIONS
			// TSL function with a declared layout (uint -> uint) that rounds `x` up to a power of two
			// using the bit-smearing sequence below.
			const pow2Ceil = Fn( ( [ x ] ) => {

				// Special case: an input of 0 yields 1.
				// NOTE(review): relies on TSL honoring a `return` inside an If() callback — confirm.
				If( x.equal( uint( 0 ) ), () => {

					return uint( 1 );

				} );

				// Start from x - 1, OR the value with itself shifted right by 1, 2, 4, 8 and 16
				// so every bit below the highest set bit becomes 1, then add 1.
				const val = x.sub( 1 ).toVar( 'val' );
				val.assign( val.bitOr( val.shiftRight( 1 ) ) );
				val.assign( val.bitOr( val.shiftRight( 2 ) ) );
				val.assign( val.bitOr( val.shiftRight( 4 ) ) );
				val.assign( val.bitOr( val.shiftRight( 8 ) ) );
				val.assign( val.bitOr( val.shiftRight( 16 ) ) );
				return val.add( 1 );

			} ).setLayout( {
				name: 'pow2Ceil',
				type: 'uint',
				inputs: [
					{ name: 'x', type: 'uint' }
				]
			} );

			// ALGORITHM CONSTRUCTORS

			// REDUCE 1

			// Thanks to Sam0oneau of Graphics Programming Discord for the explanation.
			// (Graphics Programming Discord Message Link): https://discord.com/channels/318590007881236480/374061825454768129/1391248956171882597

			/* Reduce 1 Example (Assume Workgroup Size 256, numElements: 262144) -> Initial currentBuffer State: | 1, 1, 1, 1, ... |
				 *
				 * KERNEL 1:
				 * Executes 256 threads by 256 workgroups. Each thread loops 4 times and accesses elements
				 * at the indices below.
				 *          Thread 1                        Thread 2                         Thread 3
				 * | 0, 65536, ..., n * 65536 | 1, 65537, .... (n * 65536) + 1 | 2, 65538, .... (n * 65536) + 2 | etc
				 * Buffer Values: | 4, 4, 4, 4, ...|
				 *
				 * KERNEL 2:
				 * Executes 256 threads by one workgroup. Each thread loops 1024 times
				 *          Thread 1                     Thread 2                     Thread 3
				 * | 0, 256, ...., n * 256    | 1, 257, ... (n * 256) + 1 | 2, 258, ... (n * 256) + 2 | etc
				 * Buffer Values: | 1024, 1024, 1024, 1024, ... |
				 *
				 * KERNEL 3:
				 * Executes 1 thread by one workgroup. Single thread loops 256 times
				 *          Thread 1
				 * | 0, 1, 2, 3, 4, 5, 6 ... etc|
				 * Buffer Values: [262144, 1024, 1024]
				 */


			/**
			 * Builds the "Reduce 1" compute node: every invocation sums the elements of
			 * `inputBuffer` at indices instanceIndex, instanceIndex + dispatchSize, ... (while
			 * below `numElements`) and writes its sum back to `inputBuffer[ instanceIndex ]`.
			 *
			 * @param {Object} createReduce1FnProps
			 * @param {number} createReduce1FnProps.dispatchSize - Invocation count passed to compute(); also the index stride.
			 * @param {number} createReduce1FnProps.numElements - Exclusive upper bound of the indices that are read.
			 * @param {Object} createReduce1FnProps.inputBuffer - Buffer node read from and written to in place.
			 * @param {number} createReduce1FnProps.workgroupSize - Workgroup size passed to compute().
			 * @return {Object} The value returned by `.compute( dispatchSize, [ workgroupSize ] )`.
			 */
			const createReduce1Fn = ( createReduce1FnProps ) => {

				const { dispatchSize, numElements, inputBuffer, workgroupSize } = createReduce1FnProps;

				const fnDef = Fn( () => {

					const dispatch = uint( dispatchSize ).toVar( 'dispatchSize' );
					// Per-invocation running sum
					const tSum = uint( 0 ).toVar();
					// Read cursor, starts at this invocation's own index
					const k = instanceIndex.toVar( 'k' );

					Loop( k.lessThan( uint( numElements ) ), ( ) => {

						tSum.addAssign( inputBuffer.element( k ) );
						// Jump ahead by the total number of invocations
						k.addAssign( uint( dispatch ) );

					} );

					// Store the partial sum in place of this invocation's first element
					inputBuffer.element( instanceIndex ).assign( tSum );


				} )().compute( dispatchSize, [ workgroupSize ] );

				return fnDef;

			};

			// REDUCE 2

			// For non power of 2 # of workgroups
			/**
			 * Builds the "Reduce 2" compute node. Each invocation first accumulates a strided
			 * partial sum into a workgroup array slot, then the workgroup folds that array in
			 * halving steps, and invocation 0 writes the result to `inputBuffer[ workgroupId.x ]`.
			 *
			 * @param {Object} createReduce2FnProps
			 * @param {number} createReduce2FnProps.workgroupSize - Workgroup size; also the length of the workgroup array.
			 * @param {number} createReduce2FnProps.dispatchSize - Invocation count passed to compute(); also the index stride.
			 * @param {number} createReduce2FnProps.numElements - Exclusive upper bound of the indices that are read.
			 * @param {Object} createReduce2FnProps.inputBuffer - Buffer node read from and written to in place.
			 * @return {Object} The value returned by `.compute( dispatchSize, [ workgroupSize ] )`.
			 */
			const createReduce2Fn = ( createReduce2FnProps ) => {

				const { workgroupSize, dispatchSize, numElements, inputBuffer } = createReduce2FnProps;

				const fnDef = Fn( () => {

					// One accumulator slot per invocation of the workgroup
					const tSum = workgroupArray( 'uint', workgroupSize );

					const k = instanceIndex.toVar( 'k' );
					tSum.element( invocationLocalIndex ).assign( uint( 0 ) );

					// Strided accumulation: indices instanceIndex, instanceIndex + dispatchSize, ...
					Loop( k.lessThan( uint( numElements ) ), () => {

						tSum.element( invocationLocalIndex ).addAssign( inputBuffer.element( k ) );

						k.addAssign( uint( dispatchSize ) );

					} );

					// All partial sums must be written before they are combined
					workgroupBarrier();

					// Reuse k as the fold distance, starting at half of workgroupSize rounded up to a
					// power of two, so that workgroup sizes that are not powers of two are covered
					k.assign( pow2Ceil( uint( workgroupSize ) ).div( 2 ) );

					Loop( k.greaterThan( 0 ), () => {

						// The second condition skips partners that would lie past the end of the array
						If( invocationLocalIndex.lessThan( k ).and( invocationLocalIndex.add( k ).lessThan( workgroupSize ) ), () => {

							tSum.element( invocationLocalIndex ).addAssign( tSum.element( invocationLocalIndex.add( k ) ) );

						} );
						workgroupBarrier();
						k.divAssign( 2 );

					} );

					// Slot 0 now holds the workgroup's sum; one invocation publishes it
					If( invocationLocalIndex.equal( uint( 0 ) ), () => {

						inputBuffer.element( workgroupId.x ).assign( tSum.element( uint( 0 ) ) );

					} );

				} )().compute( dispatchSize, [ workgroupSize ] );

				return fnDef;

			};

			// REDUCE 3

			/**
			 * Creates a workgroup array with one slot per subgroup, sized for the worst case
			 * (the smallest subgroup size, i.e. the largest number of subgroups per workgroup).
			 *
			 * @param {string} type - Element type of the array (e.g. 'uint').
			 * @param {number} workgroupSize - Number of invocations per workgroup.
			 * @param {number} [minSubgroupSize=4] - Smallest subgroup size to allow for.
			 * @return {Object} The node returned by `workgroupArray`.
			 */
			const createSubgroupArray = ( type, workgroupSize, minSubgroupSize = 4 ) => {

				// Round up so a workgroup size that is not a multiple of minSubgroupSize still
				// gets a whole number of slots.
				const slotCount = Math.ceil( workgroupSize / minSubgroupSize );

				// Honor the requested element type (it was previously ignored in favor of 'uint').
				return workgroupArray( type, slotCount );

			};

			// zcbenz implementation
			// https://github.com/frost-beta/betann/blob/8aa2701caf63fb29bd4cd2454e656973342c1588/betann/wgsl/reduce_ops.wgsl#L71
			/**
			 * Emits the per-invocation accumulation loop of Reduce 3. The row is walked in blocks
			 * of `workgroupSize * workPerThread` elements; within a block each invocation adds
			 * `workPerThread` consecutive elements to `total`.
			 *
			 * @param {Object} rowReduceProps
			 * @param {number} rowReduceProps.workgroupSize - Invocations per workgroup.
			 * @param {Object} rowReduceProps.inputBuffer - Buffer node that is read.
			 * @param {Object} rowReduceProps.total - Variable node that receives the running sum.
			 * @param {Object} rowReduceProps.rowOffset - Node holding the index of the row's first element.
			 * @param {Object} rowReduceProps.currentRowSize - Node holding the number of elements in this row.
			 * @param {number} rowReduceProps.workPerThread - Elements handled per invocation per block.
			 * @param {boolean} [rowReduceProps.vectorized] - When truthy, each element is summed via dot( element, uvec4( 1 ) ).
			 */
			const RowReduce = ( rowReduceProps ) => {

				const { workgroupSize, inputBuffer, total, rowOffset, currentRowSize, workPerThread, vectorized } = rowReduceProps;

				// Number of unvectorized elements each workgroup can ingest
				// At workgroupSize of 256, blockSize will be 1024
				const blockSize = uint( workgroupSize ).mul( workPerThread );
				const block = uint( 0 ).toVar( 'block' );

				// At rowSize of 2048, there will be two blocks
				const blockLimiter = currentRowSize.div( blockSize ).toVar( 'blockLimiter' );
				Loop( block.lessThan( blockLimiter ), () => {

					const blockOffset = block.mul( blockSize );
					const startThread = blockOffset.add( invocationLocalIndex.mul( workPerThread ) );
					const localThreadOffset = uint( 0 ).toVar( 'localThreadOffset' );
					Loop( localThreadOffset.lessThan( workPerThread ), () => {

						// Element index = row start + this invocation's start within the block + step.
						// (Previously the step was dropped by a stray `.addLocal` property read,
						// which passed `undefined` to element().)
						const inputElement = inputBuffer.element( rowOffset.add( startThread ).add( localThreadOffset ) );

						if ( vectorized ) {

							// Sum the four components of the vector element
							total.addAssign( dot( inputElement, uvec4( 1 ) ) );

						} else {

							total.addAssign( inputElement );

						}

						// Advance to this invocation's next element
						localThreadOffset.addAssign( 1 );

					} );

					// Increment up a block
					block.addAssign( 1 );

				} );

				// Ignoring left over check for this example, since we know ahead of time the value of leftover (2048 % 1024 === 0)

			};

			/**
			 * Emits the workgroup-level stage of Reduce 3: `total` is first replaced by its
			 * subgroupAdd() result, then the per-subgroup results are exchanged through a
			 * workgroup array and summed again with subgroupAdd() until `delta` drops to 1 or below.
			 *
			 * @param {Object} workgroupReduceProps
			 * @param {Object} workgroupReduceProps.total - Variable node holding this invocation's partial sum; updated in place.
			 * @param {number} workgroupReduceProps.workgroupSize - Invocations per workgroup.
			 */
			const WorkgroupReduce = ( workgroupReduceProps ) => {

				const { total, workgroupSize } = workgroupReduceProps;

				const subgroupSums = createSubgroupArray( 'uint', workgroupSize );

				// Assign sum of all values in subgroup to total
				total.assign( subgroupAdd( total ) );

				// Number of values that still have to be combined (initially workgroupSize / subgroupSize)
				const delta = uint( workgroupSize ).div( subgroupSize ).toVar( 'delta' );

				// Index of this invocation's subgroup within the workgroup
				const subgroupMetaRank = invocationLocalIndex.div( subgroupSize );

				Loop( float( delta ).greaterThan( 1.0 ), () => {

					// Only the invocation with subgroup index 0 writes its subgroup's value
					If( invocationSubgroupIndex.equal( 0 ), () => {

						// Each subgroup will populate the subgroupSums array
						subgroupSums.element( subgroupMetaRank ).assign( total );

					} );

					// Ensure that all subgroups in the workgroup have populated the workgroup memory array
					workgroupBarrier();

					// Invocations with a local index below delta read one stored sum; all others contribute 0
					total.assign( select( invocationLocalIndex.lessThan( delta ), subgroupSums.element( invocationLocalIndex ), 0 ).uniformFlow() );
					// # of subgroups in workgroup is invariably less than # of threads in subgroup, so subgroupAdd will still sync here
					total.assign( subgroupAdd( total ) );

					delta.divAssign( subgroupSize );

				} );

			};

			/**
			 * Builds the "Reduce 3" kernel body: each workgroup reduces one row of `rowSize`
			 * elements of `inputBuffer` (RowReduce followed by WorkgroupReduce) and invocation 0
			 * stores the result in `intermediateBuffer[ workgroupId.x ]`.
			 *
			 * NOTE(review): unlike createReduce1Fn / createReduce2Fn no `.compute()` is applied
			 * here; presumably the caller does that — verify.
			 *
			 * @param {Object} createReduce3FnProps
			 * @param {number} createReduce3FnProps.workgroupSize - Invocations per workgroup.
			 * @param {number} createReduce3FnProps.workPerThread - Elements per invocation per block (forwarded to RowReduce).
			 * @param {Object} createReduce3FnProps.inputBuffer - Buffer node that is read.
			 * @param {Object} createReduce3FnProps.intermediateBuffer - Buffer node receiving one sum per workgroup.
			 * @param {number} createReduce3FnProps.rowSize - Elements assigned to each workgroup.
			 * @return {Object} The invoked Fn node.
			 */
			const createReduce3Fn = ( createReduce3FnProps ) => {

				const { workgroupSize, workPerThread, inputBuffer, intermediateBuffer, rowSize } = createReduce3FnProps;

				const fnDef = Fn( () => {

					// NOTE(review): `.length` is read off `bufferCount`; confirm that bufferCount is
					// array-like here and not a plain element count (in which case this is undefined).
					const inputSize = uint( inputBuffer.bufferCount.length );
					const rowOffset = workgroupId.x.mul( rowSize );

					// If the current rows elements exceed the bounds of the input
					// Select either 0 or number of elements left,
					// otherwise, select existing ROW_SIZE
					const currentRowSize = select(
						( rowOffset.add( rowSize ) ).greaterThan( inputSize ),
						select( inputSize.greaterThan( rowOffset ), inputSize.sub( rowOffset ), 0 ).uniformFlow(),
						rowSize,
					).uniformFlow();

					const total = uint( 0 ).toVar( 'total' );

					// `vectorized` is not passed, so RowReduce takes its non-vectorized branch
					RowReduce( {
						inputBuffer: inputBuffer,
						total: total,
						rowOffset: rowOffset,
						currentRowSize: currentRowSize,
						workPerThread: workPerThread,
						workgroupSize: workgroupSize,
					} );

					WorkgroupReduce( {
						total: total,
						workgroupSize: workgroupSize,
					} );

					// Populate each workgroup with its reduction
					If( invocationLocalIndex.equal( 0 ), () => {

						intermediateBuffer.element( workgroupId.x ).assign( total );

					} );

				} )();

				return fnDef;

			};

			// REDUCE 4

			// b0nes164 inspired implementation with vec4
			const createReduce4Fn = ( props ) => {

				// Can't pass in subgroup size since we can't always be certain what size is at runtime
				const { size, workPerThread, workgroupSize, inputBuffer, intermediateBuffer } = props;

				const ELEMENTS_PER_VEC4 = 4;
				// The number of individual elements a single workgroup will access
				const partitionSize = workgroupSize * workPerThread * ELEMENTS_PER_VEC4;
				const vecSize = divRoundUp( size, ELEMENTS_PER_VEC4 );
				// Can also be calculated using divRoundUp( vecSize, workgroupSize * workPerThread );
				const numWorkgroups = divRoundUp( size, partitionSize );
				// Currently no way to specify dispatch size in increments of workgroups, so we convert to numInvocations
				const numInvocations = numWorkgroups * workgroupSize;

				const fnDef = Fn( () => {

					const perSubgroupReductionArray = createSubgroupArray( 'uint', workgroupSize );

					// Get the index of the subgroup within the workgroup
					const subgroupMetaRank = invocationLocalIndex.div( subgroupSize );

					// Each subgroup block scans across 4 subgroups. So when we move into a new subgroup,
					// align that subgroups' accesses to the next 4 subgroups
					const subgroupOffset = subgroupMetaRank.mul( subgroupSize ).mul( workPerThread );
					subgroupOffset.addAssign( invocationSubgroupIndex );

					// Per workgroup, offset by number of vectorized elements scanned per workgroup
					const workgroupOffset = workgroupId.x.mul( uint( maxWorkgroupSize ).mul( workPerThread ) );

					const startThread = subgroupOffset.add( workgroupOffset );

					const subgroupReduction = uint( 0 );

					// Each thread will accumulate values from across 'workPerThread' subgroups
					If( workgroupId.x.lessThan( uint( numWorkgroups ).sub( 1 ) ), () => {

						Loop( {
							start: uint( 0 ),
							end: workPerThread,
							type: 'uint',
							condition: '<',
							name: 'currentSubgroupInBlock'
						}, () => {

							// Get vectorized element from input array
							const val = inputBuffer.element( startThread );

							// Sum values within vec4 together by using result of dot product
							subgroupReduction.addAssign( dot( uvec4( 1 ), val ) );

							// Increment so thread will scan value in next subgroup
							startThread.addAssign( subgroupSize );

						} );

					} );

					// Ensure that the last workgroup does not access out of bounds indices
					If( workgroupId.x.equal( uint( numWorkgroups ).sub( 1 ) ), () => {

						Loop( {
							start: uint( 0 ),
							end: workPerThread,
							type: 'uint',
							condition: '<',
							name: 'currentSubgroupInBlock'
						}, () => {

							// Ensure index is less than number of available vectors in inputBuffer
							const val = select( startThread.lessThan( uint( vecSize ) ), inputBuffer.element( startThread ), uvec4( 0 ) ).uniformFlow();

							subgroupReduction.addAssign( dot( val, uvec4( 1 ) ) );
							startThread.addAssign( subgroupSize );

						} );

					} );

					subgroupReduction.assign( subgroupAdd( subgroupReduction ) );

					// Assuming that each element in the input buffer is 1, we generally expect each invocation's subgroupReduction
					// value to be ELEMENTS_PER_VEC4 * workPerThread * subgroupSize

					// Delegate one thread per subgroup to assign each subgroup's reduction to the workgroup array
					If( invocationSubgroupIndex.equal( uint( 0 ) ), () => {

						perSubgroupReductionArray.element( subgroupMetaRank ).assign( subgroupReduction );

					} );

					// Ensure that each workgroup has populated the perSubgroupReductionArray with data
					// from each of it's subgroups
					workgroupBarrier();

					if ( props.debugBuffer ) {

						If( invocationLocalIndex.equal( uint( 0 ) ), () => {

							props.debugBuffer.element( workgroupId.x ).assign( subgroupReduction );

						} );

						workgroupBarrier();

					}

					// WORKGROUP LEVEL REDUCE

					// Multiple approaches here
					// log2(subgroupSize) -> TSL log2 function
					// countTrailingZeros/findLSB(subgroupSize) -> Currently unsupported function in TSL that counts trailing zeros in number bit representation
					// Can technically petition GPU for subgroupSize in shader and calculate logs on CPU at cost of shader being generalizable across devices
					// May also break if subgroupSize changes when device is lost or if program is rerun on lower power device
					const subgroupSizeLog = uint( log2( float( subgroupSize ) ) ).toVar( 'subgroupSizeLog' );
					const spineSize = uint( workgroupSize ).shiftRight( subgroupSizeLog );
					const spineSizeLog = uint( log2( float( spineSize ) ) ).toVar( 'spineSizeLog' );


					// Align size to powers of subgroupSize
					const squaredSubgroupLog = ( spineSizeLog.add( subgroupSizeLog ).sub( 1 ) );
					squaredSubgroupLog.divAssign( subgroupSizeLog );
					squaredSubgroupLog.mulAssign( subgroupSizeLog );
					const alignedSize = ( uint( 1 ).shiftLeft( squaredSubgroupLog ) ).toVar( 'alignedSize' );

					// aligned size 2 * 4

					const offset = uint( 0 );

					// In cases where the number of subgroups in a workgroup is greater than the subgroup size itself,
					// we need to iterate over the array again to capture all the data in the workgroup array buffer
					Loop( { start: subgroupSize, end: alignedSize, condition: '<=', name: 'j', type: 'uint', update: '<<= subgroupSizeLog' }, () => {

						const subgroupIndex = ( ( invocationLocalIndex.add( 1 ) ).shiftLeft( offset ) ).sub( 1 );

						const isValidSubgroupIndex = subgroupIndex.lessThan( spineSize ).toVar( 'isValidSubgroupIndex' );

						// Reduce values within the local workgroup memory.
						// Set toVar to ensure subgroupAdd executes before (not within) the if statement.
						const t = subgroupAdd(
							select(
								isValidSubgroupIndex,
								perSubgroupReductionArray.element( subgroupIndex ),
								0
							).uniformFlow()
						).toVar( 't' );

						// Can assign back to workgroupArray since all
						// subgroup threads work in lockstep for subgroupAdd
						If( isValidSubgroupIndex, () => {

							perSubgroupReductionArray.element( subgroupIndex ).assign( t );

						} );

						// Ensure all threads have completed work

						workgroupBarrier();

						offset.addAssign( subgroupSizeLog );

					} );

					// Assign single thread from workgroup to assign workgroup reduction
					If( invocationLocalIndex.equal( uint( 0 ) ), () => {

						const reducedWorkgroupSum = perSubgroupReductionArray.element( uint( spineSize ).sub( 1 ) );
						intermediateBuffer.element( workgroupId.x ).assign( reducedWorkgroupSum );

					} );

				} )().compute( numInvocations, [ maxWorkgroupSize ] );

				return fnDef;

			};


			// INCORRECT BASELINE

			/**
			 * Builds a shader call that does not reduce anything: every invocation
			 * overwrites its own slot of the buffer with the constant 99999.
			 *
			 * @param {Object} incorrectBaselineProps - Options bag.
			 * @param {Object} incorrectBaselineProps.inputBuffer - Storage node indexed by `instanceIndex`.
			 * @returns {Object} The result of invoking the `Fn` wrapper around the shader body.
			 */
			const createIncorrectBaselineFn = ( incorrectBaselineProps ) => {

				const target = incorrectBaselineProps.inputBuffer;

				// One write per invocation, addressed by the invocation's own index.
				const overwriteWithSentinel = () => {

					target.element( instanceIndex ).assign( 99999 );

				};

				return Fn( overwriteWithSentinel )();

			};


			// Set up both halves of the split view: left panel first, then right.
			// init is async and neither call is awaited here.
			init( true );
			init( false );

			/**
			 * Sets up one half of the side-by-side reduction demo: creates the storage
			 * buffers, builds the compute dispatches for every algorithm, builds the
			 * display color nodes, creates a renderer covering half the window, and
			 * starts a one-second stepper that cycles Reset -> Run Algo -> Validate.
			 *
			 * @param {boolean} [leftSideDisplay=true] - `true` builds the left panel, `false` the right one.
			 * @returns {Promise<void>} Resolves once the panel is built and the first step is scheduled.
			 */
			async function init( leftSideDisplay = true ) {

				const sideLabel = leftSideDisplay ? 'Left' : 'Right';
				const effectController = leftSideDisplay ? leftEffectController : rightEffectController;
				const material = leftSideDisplay ? leftMaterial : rightMaterial;

				const aspect = ( window.innerWidth / 2 ) / window.innerHeight;
				const camera = new THREE.OrthographicCamera( - aspect, aspect, 1, - 1, 0, 2 );
				camera.position.z = 1;

				const scene = new THREE.Scene();

				// Every input element starts at 1, so a correct reduction leaves `size` in element 0.
				const array = new Uint32Array( size ).fill( 1 );

				// Represents array of data as uints in compute shader.
				const inputStorage = instancedArray( array, 'uint' ).setPBO( true ).setName( `Current_${sideLabel}` );
				// Represents the same array of data as vec4s in compute shader.
				const inputVec4BufferAttribute = new THREE.StorageInstancedBufferAttribute( array, 4 );
				const inputVectorizedStorage = storage( inputVec4BufferAttribute, 'uvec4', vecSize ).setPBO( true ).setName( `CurrentVectorized_${sideLabel}` );

				// Reduce 3 Calculations
				const workPerThread = 4;
				const numRows = workPerThread * 32;
				const rowSize = divRoundUp( size, numRows );

				const workgroupSumsArray = new Uint32Array( numRows );
				const workgroupSumsStorage = instancedArray( workgroupSumsArray, 'uint' ).setPBO( true ).setName( `WorkgroupSums_${sideLabel}` );
				const debugArray = new Uint32Array( 1024 );
				const debugStorage = instancedArray( debugArray, 'uint' ).setPBO( true ).setName( `Debug_${sideLabel}` );

				const buffers = {
					'Input Buffer': inputStorage,
					'Input Vectorized Buffer': inputVectorizedStorage,
					'Workgroup Sums Buffer': workgroupSumsStorage,
					'Debug Buffer': debugStorage,
				};

				// Debug action: read the selected buffer back and print it to the console.
				const logFunctionName = `Log ${sideLabel} Side`;
				const functionObj = {};
				functionObj[ logFunctionName ] = async () => {

					const selectedBuffer = buffers[ unifiedEffectController.loggedBuffer ];
					console.log( new Uint32Array( await renderer.getArrayBufferAsync( selectedBuffer.value ) ) );

				};

				debugFolder.add( functionObj, logFunctionName );

				const computeResetBufferFn = Fn( () => {

					inputStorage.element( instanceIndex ).assign( 1 );

				} );

				const computeResetWorkgroupSumsFn = Fn( () => {

					workgroupSumsStorage.element( instanceIndex ).assign( 0 );

				} );


				// Re-initialize compute buffers. The workgroup sums reset is sized to the
				// buffer it clears (numRows elements) so no invocation indexes past its end.
				const computeResetBuffer = computeResetBufferFn().compute( size );
				const computeResetWorkgroupSums = computeResetWorkgroupSumsFn().compute( numRows );

				const renderer = new THREE.WebGPURenderer( { antialias: false, trackTimestamp: true } );
				renderer.setPixelRatio( window.devicePixelRatio );
				renderer.setSize( window.innerWidth / 2, window.innerHeight );

				// Unfortunately, need to arbitrarily run compute shader to get access to device limits
				await renderer.computeAsync( computeResetBuffer );


				if ( renderer.backend.device !== null ) {

					maxWorkgroupSize = renderer.backend.device.limits.maxComputeWorkgroupSizeX;

				}

				// Create and store dispatches of reduction of certain size. Map each set of dispatches to algorithm name.

				const computeReduce0Fn = Fn( () => {

					const { numThreadsDispatched } = effectController;

					inputStorage.element( instanceIndex ).addAssign( inputStorage.element( instanceIndex.add( numThreadsDispatched ) ) );

				} )();

				// One dispatch per halving step: size / 2, size / 4, ..., 1 invocations.
				const reduce0Calls = [];

				for ( let i = size / 2; i >= 1; i /= 2 ) {

					const reduce0 = computeReduce0Fn.compute( i, [ maxWorkgroupSize ] );
					reduce0Calls.push( reduce0 );

				}

				const reduce1Calls = [
					// Accumulation
					createReduce1Fn( {
						dispatchSize: maxWorkgroupSize * maxWorkgroupSize,
						workgroupSize: maxWorkgroupSize,
						numElements: size,
						inputBuffer: inputStorage,
					} ),
					// 1 Block accumulation
					createReduce1Fn( {
						dispatchSize: maxWorkgroupSize,
						numElements: maxWorkgroupSize * maxWorkgroupSize,
						workgroupSize: maxWorkgroupSize,
						inputBuffer: inputStorage,
					} ),
					// Final result
					createReduce1Fn( {
						dispatchSize: 1,
						numElements: maxWorkgroupSize,
						workgroupSize: 1,
						inputBuffer: inputStorage
					} ),
				];

				const reduce2Calls = [
					// Accumulate within workgroups
					createReduce2Fn( {
						workgroupSize: maxWorkgroupSize,
						dispatchSize: maxWorkgroupSize * maxWorkgroupSize,
						numElements: size,
						inputBuffer: inputStorage,
					} ),
					// 1 Block accumulation
					createReduce2Fn( {
						workgroupSize: maxWorkgroupSize,
						dispatchSize: maxWorkgroupSize,
						numElements: maxWorkgroupSize,
						inputBuffer: inputStorage,
					} ),
				];

				const reduce3Calls = [
					// First pass: input buffer -> one partial sum per workgroup
					createReduce3Fn( {
						inputBuffer: inputStorage,
						intermediateBuffer: workgroupSumsStorage,
						workgroupSize: maxWorkgroupSize,
						workPerThread,
						rowSize: rowSize,
						vectorized: false,
					} ).compute( maxWorkgroupSize * numRows, [ maxWorkgroupSize ] ),
					// Second pass: workgroup sums -> back into the input buffer
					createReduce3Fn( {
						inputBuffer: workgroupSumsStorage,
						intermediateBuffer: inputStorage,
						workgroupSize: 32,
						workPerThread,
						rowSize: rowSize,
						vectorized: false
					} ).compute( 32, [ 32 ] )
				];

				const reduce4Calls = [
					// First pass reads the vectorized (uvec4) view of the input
					createReduce4Fn( {
						size: size,
						inputBuffer: inputVectorizedStorage,
						intermediateBuffer: workgroupSumsStorage,
						workgroupSize: maxWorkgroupSize,
						workPerThread,
					} ),
					// Second pass reuses the Reduce 3 kernel on the workgroup sums
					createReduce3Fn( {
						inputBuffer: workgroupSumsStorage,
						intermediateBuffer: inputStorage,
						workgroupSize: 32,
						workPerThread,
						rowSize: rowSize,
						vectorized: false
					} ).compute( 32, [ 32 ] )
				];

				const incorrectBaselineCalls = [
					createIncorrectBaselineFn( {
						inputBuffer: inputStorage,
					} ).compute( size ),
				];

				const calls = {
					'Reduce 0 (N/2)': reduce0Calls,
					'Reduce 1 (Naive Accumulate)': reduce1Calls,
					'Reduce 2 (Workgroup Reduction)': reduce2Calls,
					'Reduce 3 (Subgroup Reduce)': reduce3Calls,
					'Reduce 4 (Subgroup Optimized)': reduce4Calls,
					'Incorrect Baseline': incorrectBaselineCalls
				};

				// Shades a cell from its value (larger value -> darker). While highlighting is on,
				// the shade is tinted green when element 0 of bufferToCheck equals `size`, red otherwise.
				const getColor = ( bufferToCheck, colorChanger, width, height ) => {

					const subtracter = float( colorChanger ).div( width.mul( height ) );

					const color = vec3( subtracter.oneMinus() ).toVar();

					const { highlight } = effectController;

					// Validate that element 0 is equal to expected result of reduction
					If( highlight.equal( 1 ), () => {

						If( ( bufferToCheck.element( 0 ) ).equal( size ), () => {

							color.assign( vec3( 0.0, subtracter.oneMinus(), 0.0 ) );

						} ).Else( () => {

							color.assign( vec3( subtracter.oneMinus(), 0.0, 0.0 ) );

						} );

					} );

					return color;

				};

				const displayNodes = leftSideDisplay ? leftDisplayColorNodes : rightDisplayColorNodes;

				// Input buffer drawn as a gridDisplayWidth x gridDisplayHeight grid of cells.
				displayNodes[ 'Input Grid' ] = Fn( () => {

					const { gridElementWidth, gridElementHeight, gridDisplayWidth, gridDisplayHeight } = unifiedEffectController;

					const newUV = uv().mul( vec2( gridDisplayWidth, gridDisplayHeight ) );

					const pixel = uvec2( uint( floor( newUV.x ) ), uint( floor( newUV.y ) ) );

					const elementIndex = uint( gridDisplayWidth ).mul( pixel.y ).add( pixel.x );

					const colorChanger = uint( 0 ).toVar();
					const color = vec3( 0 ).toVar( 'color' );

					colorChanger.assign( inputStorage.element( elementIndex ) );
					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );

					return color;

				} )();

				// Input buffer sampled at power-of-two indices along the horizontal axis.
				displayNodes[ 'Input Log2' ] = Fn( () => {

					const { gridElementWidth, gridElementHeight } = unifiedEffectController;

					// Scale only the horizontal axis; the 1 is the vec2's y component.
					const newUV = uv().mul( vec2( Math.log2( size ), 1 ) );

					const colorChanger = uint( 0 ).toVar();
					const color = vec3( 0 ).toVar( 'color' );

					colorChanger.assign( inputStorage.element( uint( 1 ).shiftLeft( newUV.x ) ) );
					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );

					return color;

				} )();

				// Whole panel shaded from element 0 of the input buffer.
				displayNodes[ 'Input Element 0' ] = Fn( () => {

					const { gridElementWidth, gridElementHeight } = unifiedEffectController;

					const colorChanger = uint( 0 ).toVar();
					const color = vec3( 0 ).toVar( 'color' );

					// Clamp display of single element to shade where green is still readable
					colorChanger.assign( clamp( inputStorage.element( 0 ), 0, size / 2 ) );
					color.assign( getColor( inputStorage, colorChanger, gridElementWidth, gridElementHeight ) );
					return color;

				} )();

				// Workgroup sums buffer drawn as an 8 x 16 grid of cells.
				displayNodes[ 'Workgroup Sum Grid' ] = Fn( () => {

					const width = uint( 8 );
					const height = uint( 16 );

					const newUV = uv().mul( vec2( width, height ) );

					const pixel = uvec2( uint( floor( newUV.x ) ), uint( floor( newUV.y ) ) );

					const elementIndex = uint( width ).mul( pixel.y ).add( pixel.x );

					const colorChanger = uint( 0 ).toVar();
					const color = vec3( 0 ).toVar( 'color' );

					colorChanger.assign( workgroupSumsStorage.element( elementIndex ) );
					color.assign( getColor( inputStorage, colorChanger, width, height ) );

					return color;

				} )();

				material.colorNode = displayNodes[ effectController.displayMode ];
				material.needsUpdate = true;

				const plane = new THREE.Mesh( new THREE.PlaneGeometry( 1, 1 ), material );
				scene.add( plane );

				const animate = () => {

					renderer.render( scene, camera );

				};

				renderer.setAnimationLoop( animate );

				// Each panel's canvas covers one half of the window.
				document.body.appendChild( renderer.domElement );
				renderer.domElement.style.position = 'absolute';
				renderer.domElement.style.top = '0';
				renderer.domElement.style.left = '0';
				renderer.domElement.style.width = '50%';
				renderer.domElement.style.height = '100%';

				if ( ! leftSideDisplay ) {

					renderer.domElement.style.left = '50%';

					scene.background = new THREE.Color( 0x212121 );

				} else {

					scene.background = new THREE.Color( 0x313131 );

				}

				renderer.info.autoReset = false;

				// Runs one step of the Reset -> Run Algo -> Validate cycle, then re-arms itself.
				const stepAnimation = async function () {

					// Snapshot the GUI state once so the awaits below cannot observe a mid-step change.
					const currentAlgorithm = effectController.algo;
					const state = effectController.state;
					const stateController = leftSideDisplay ? stateLeftController : stateRightController;

					if ( state === 'Reset' ) {

						await renderer.computeAsync( computeResetBuffer );
						await renderer.computeAsync( computeResetWorkgroupSums );

					} else if ( state === 'Run Algo' ) {

						renderer.info.reset();

						// No CPU algorithm is registered in `calls` above, so this stays 0.
						const cpuTime = 0;

						switch ( currentAlgorithm ) {

							case 'Reduce 0 (N/2)': {

								let m = size / 2;

								for ( const reduce0 of reduce0Calls ) {

									// Each reduction step adds the element m slots ahead into the current one.
									effectController.numThreadsDispatched.value = m;

									await renderer.computeAsync( reduce0 );

									m /= 2;

								}

								break;

							}

							default: {

								for ( const call of calls[ currentAlgorithm ] ) {

									await renderer.computeAsync( call );

								}

								break;

							}

						}

						// Resolve once, after every dispatch has been issued, and wait for it so the
						// readout below reflects this run instead of whatever was resolved previously.
						await renderer.resolveTimestampsAsync( THREE.TimestampQuery.COMPUTE );

						// DEBUG: const reductionResult = new Uint32Array( await renderer.getArrayBufferAsync( currentBuffer ) )[0];

						let passInfoString = '';

						if ( currentAlgorithm.substring( 0, 3 ) === 'CPU' ) {

							passInfoString = `Ran in ${cpuTime}ms<br>`;

						} else {

							passInfoString = `${renderer.info.compute.frameCalls} pass in ${renderer.info.compute.timestamp.toFixed( 6 )}ms<br>`;

						}


						timestamps[ leftSideDisplay ? 'left_side_display' : 'right_side_display' ].innerHTML = `

							Compute ${currentAlgorithm}: ${passInfoString}`;

					}

					renderer.render( scene, camera );
					await renderer.resolveTimestampsAsync( THREE.TimestampQuery.RENDER );

					// Validate next state

					if ( state === 'Run Algo' ) {

						stateController.setValue( 'Validate' );

						effectController.highlight.value = 1;

					} else if ( state === 'Validate' ) {

						stateController.setValue( 'Reset' );

						effectController.highlight.value = 0;

					} else if ( state === 'Reset' ) {

						stateController.setValue( 'Run Algo' );

					}

					setTimeout( stepAnimation, 1000 );

				};


				window.addEventListener( 'resize', onWindowResize );

				// Keeps the half-window canvas and the orthographic frustum in sync with the window size.
				function onWindowResize() {

					renderer.setSize( window.innerWidth / 2, window.innerHeight );

					const aspect = ( window.innerWidth / 2 ) / window.innerHeight;

					const frustumHeight = camera.top - camera.bottom;

					camera.left = - frustumHeight * aspect / 2;
					camera.right = frustumHeight * aspect / 2;

					camera.updateProjectionMatrix();

					renderer.render( scene, camera );

				}

				setTimeout( stepAnimation, 1000 );

			}

		</script>
	</body>
</html>