Cache deposits

- Make the scatter shader more cache-friendly by dispatching one work group per copy and allowing each copy to be up to 64 uints (256 bytes) long
2025-01-27 05:17:56 +01:00 · 2024-03-31 16:04:25 -07:00 · 2024-03-31 16:04:25 -07:00 · d5c5b998bc
commit d5c5b998bc
parent 2ee3944ca6
3 changed files with 20 additions and 12 deletions
--- a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java
+++ b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java
@ -12,10 +12,11 @@ public class ScatterList {
 	private long usedBytes;

 	public ScatterList() {
-		this(64);
+		// Max bytes per scatter: must equal the scatter shader's local_size_x (64) times 4 bytes per uint
+		this(64 * 4);
 	}

-	public ScatterList(long maxBytesPerScatter) {
+	private ScatterList(long maxBytesPerScatter) {
 		if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
 			throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
 		}
--- a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java
+++ b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java
@ -8,7 +8,6 @@ import org.lwjgl.opengl.GL45C;
 import org.lwjgl.system.MemoryUtil;

 import com.jozufozu.flywheel.backend.compile.IndirectPrograms;
-import com.jozufozu.flywheel.backend.gl.GlCompat;
 import com.jozufozu.flywheel.backend.gl.GlFence;
 import com.jozufozu.flywheel.backend.gl.buffer.GlBuffer;
 import com.jozufozu.flywheel.backend.gl.shader.GlProgram;
@ -282,7 +281,7 @@ public class StagingBuffer {

 		GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);

-		GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1);
+		GL45.glDispatchCompute(scatterList.copyCount(), 1, 1);

 		scatterList.reset();
 	}
--- a/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl
+++ b/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl
@ -1,4 +1,4 @@
-layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
+layout(local_size_x = 64) in;

 const uint SRC_OFFSET_MASK = 0xFFFFFF;

@ -24,19 +24,27 @@ layout(std430, binding = 2) restrict writeonly buffer Dst {
 };

 void main() {
-    uint copy = gl_GlobalInvocationID.x;
+    // Each work group is responsible for one of the copies in the buffer.
+    // We dispatch exactly as many work groups as there are copies, so no need to check bounds.
+    uint copy = gl_WorkGroupID.x;

-    if (copy >= copies.length()) {
-        return;
-    }
+    // Each invocation in the work group is responsible for one uint in the copy.
+    uint i = gl_LocalInvocationID.x;

+    // Unpack the copy.
    uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
+    uint dstOffset = copies[copy].dstOffset;
    uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
    uint size = sizeAndSrcOffset >> 24;

-    uint dstOffset = copies[copy].dstOffset;
+    // Fetch the uint to copy before the early exit below; this ordering appears to help instruction scheduling.
+    // With 20 MB going through a 24 MB staging buffer, this made a 1 ms/frame difference.
+    // Note that invocations with i >= size still perform this read. Should properly test with Nsight at some point.
+    uint toCopy = src[srcOffset + i];

-    for (uint i = 0; i < size; i++) {
-        dst[dstOffset + i] = src[srcOffset + i];
+    if (i >= size) {
+        return;
    }
+
+    dst[dstOffset + i] = toCopy;
 }