From d5c5b998bc60d2ad3077d6f5fd5bfc9f2f701316 Mon Sep 17 00:00:00 2001 From: Jozufozu Date: Sun, 31 Mar 2024 16:04:25 -0700 Subject: [PATCH] Cache deposits - Make the scatter shader more cache friendly by doing one copy per workgroup and making copies up to 64 uints long --- .../backend/engine/indirect/ScatterList.java | 5 ++-- .../engine/indirect/StagingBuffer.java | 3 +-- .../flywheel/internal/indirect/scatter.glsl | 24 ++++++++++++------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java index 05ce00d03..468c8a433 100644 --- a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java +++ b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/ScatterList.java @@ -12,10 +12,11 @@ public class ScatterList { private long usedBytes; public ScatterList() { - this(64); + // Max bytes per copy: the local_size_x of the scatter shader (64, one uint per invocation) times 4 bytes per uint + this(64 * 4); } - public ScatterList(long maxBytesPerScatter) { + private ScatterList(long maxBytesPerScatter) { if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) { throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024"); } diff --git a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java index 0556c0c2e..8c0324f38 100644 --- a/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java +++ b/src/main/java/com/jozufozu/flywheel/backend/engine/indirect/StagingBuffer.java @@ -8,7 +8,6 @@ import org.lwjgl.opengl.GL45C; import org.lwjgl.system.MemoryUtil; import com.jozufozu.flywheel.backend.compile.IndirectPrograms; -import com.jozufozu.flywheel.backend.gl.GlCompat; import com.jozufozu.flywheel.backend.gl.GlFence; import 
com.jozufozu.flywheel.backend.gl.buffer.GlBuffer; import com.jozufozu.flywheel.backend.gl.shader.GlProgram; @@ -282,7 +281,7 @@ public class StagingBuffer { GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo); - GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1); + GL45.glDispatchCompute(scatterList.copyCount(), 1, 1); scatterList.reset(); } diff --git a/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl b/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl index 489cace85..ec7910e05 100644 --- a/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl +++ b/src/main/resources/assets/flywheel/flywheel/internal/indirect/scatter.glsl @@ -1,4 +1,4 @@ -layout(local_size_x = _FLW_SUBGROUP_SIZE) in; +layout(local_size_x = 64) in; const uint SRC_OFFSET_MASK = 0xFFFFFF; @@ -24,19 +24,27 @@ layout(std430, binding = 2) restrict writeonly buffer Dst { }; void main() { - uint copy = gl_GlobalInvocationID.x; + // Each work group is responsible for one of the copies in the buffer. + // We dispatch exactly as many work groups as there are copies, so no need to check bounds. + uint copy = gl_WorkGroupID.x; - if (copy >= copies.length()) { - return; - } + // Each invocation in the work group is responsible for one uint in the copy. + uint i = gl_LocalInvocationID.x; + // Unpack the copy. uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset; + uint dstOffset = copies[copy].dstOffset; uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK; uint size = sizeAndSrcOffset >> 24; - uint dstOffset = copies[copy].dstOffset; + // Fetch the uint to copy before the early exit below to make instruction reordering happy; invocations with i >= size read past the end of the copy and discard the value. + // With 20 MB going through a 24 MB staging buffer, this made a 1 ms/frame difference. + // Should properly test with Nsight at some point. 
+ uint toCopy = src[srcOffset + i]; - for (uint i = 0; i < size; i++) { - dst[dstOffset + i] = src[srcOffset + i]; + if (i >= size) { + return; } + + dst[dstOffset + i] = toCopy; }