Cache deposits

- Make the scatter shader more cache-friendly by doing one copy per
  workgroup and making copies up to 64 uints (256 bytes) long
This commit is contained in:
Jozufozu 2024-03-31 16:04:25 -07:00
parent 2ee3944ca6
commit d5c5b998bc
3 changed files with 20 additions and 12 deletions

View File

@ -12,10 +12,11 @@ public class ScatterList {
private long usedBytes;
public ScatterList() {
this(64);
// Should be 4 bytes (one uint per invocation) times the local_size_x of the scatter shader
this(64 * 4);
}
public ScatterList(long maxBytesPerScatter) {
private ScatterList(long maxBytesPerScatter) {
if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
}

View File

@ -8,7 +8,6 @@ import org.lwjgl.opengl.GL45C;
import org.lwjgl.system.MemoryUtil;
import com.jozufozu.flywheel.backend.compile.IndirectPrograms;
import com.jozufozu.flywheel.backend.gl.GlCompat;
import com.jozufozu.flywheel.backend.gl.GlFence;
import com.jozufozu.flywheel.backend.gl.buffer.GlBuffer;
import com.jozufozu.flywheel.backend.gl.shader.GlProgram;
@ -282,7 +281,7 @@ public class StagingBuffer {
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1);
GL45.glDispatchCompute(scatterList.copyCount(), 1, 1);
scatterList.reset();
}

View File

@ -1,4 +1,4 @@
layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
layout(local_size_x = 64) in;
const uint SRC_OFFSET_MASK = 0xFFFFFF;
@ -24,19 +24,27 @@ layout(std430, binding = 2) restrict writeonly buffer Dst {
};
void main() {
uint copy = gl_GlobalInvocationID.x;
// Each work group is responsible for one of the copies in the buffer.
// We dispatch exactly as many work groups as there are copies, so no need to check bounds.
uint copy = gl_WorkGroupID.x;
if (copy >= copies.length()) {
return;
}
// Each invocation in the work group is responsible for one uint in the copy.
uint i = gl_LocalInvocationID.x;
// Unpack the copy.
uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
uint dstOffset = copies[copy].dstOffset;
uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
uint size = sizeAndSrcOffset >> 24;
uint dstOffset = copies[copy].dstOffset;
// Fetch the uint to copy before the early return below to make instruction reordering happy.
// With 20 MB going through a 24 MB staging buffer, this made a 1 ms/frame difference.
// Should properly test with Nsight at some point.
uint toCopy = src[srcOffset + i];
for (uint i = 0; i < size; i++) {
dst[dstOffset + i] = src[srcOffset + i];
if (i >= size) {
return;
}
dst[dstOffset + i] = toCopy;
}