mirror of
https://github.com/Jozufozu/Flywheel.git
synced 2024-12-26 15:06:28 +01:00
Cache deposits
- Make the scatter shader more cache friendly by doing one copy per workgroup and making copies up to 64 uints long
This commit is contained in:
parent
2ee3944ca6
commit
d5c5b998bc
3 changed files with 20 additions and 12 deletions
|
@ -12,10 +12,11 @@ public class ScatterList {
|
|||
private long usedBytes;
|
||||
|
||||
public ScatterList() {
|
||||
this(64);
|
||||
// Max bytes per scatter: 4 bytes per uint times the scatter shader's local_size_x (64), since each invocation copies one uint.
|
||||
this(64 * 4);
|
||||
}
|
||||
|
||||
public ScatterList(long maxBytesPerScatter) {
|
||||
private ScatterList(long maxBytesPerScatter) {
|
||||
if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
|
||||
throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
|
||||
}
|
||||
|
|
|
@ -8,7 +8,6 @@ import org.lwjgl.opengl.GL45C;
|
|||
import org.lwjgl.system.MemoryUtil;
|
||||
|
||||
import com.jozufozu.flywheel.backend.compile.IndirectPrograms;
|
||||
import com.jozufozu.flywheel.backend.gl.GlCompat;
|
||||
import com.jozufozu.flywheel.backend.gl.GlFence;
|
||||
import com.jozufozu.flywheel.backend.gl.buffer.GlBuffer;
|
||||
import com.jozufozu.flywheel.backend.gl.shader.GlProgram;
|
||||
|
@ -282,7 +281,7 @@ public class StagingBuffer {
|
|||
|
||||
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
|
||||
|
||||
GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1);
|
||||
GL45.glDispatchCompute(scatterList.copyCount(), 1, 1);
|
||||
|
||||
scatterList.reset();
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
|
||||
layout(local_size_x = 64) in;
|
||||
|
||||
const uint SRC_OFFSET_MASK = 0xFFFFFF;
|
||||
|
||||
|
@ -24,19 +24,27 @@ layout(std430, binding = 2) restrict writeonly buffer Dst {
|
|||
};
|
||||
|
||||
void main() {
|
||||
uint copy = gl_GlobalInvocationID.x;
|
||||
// Each work group is responsible for one of the copies in the buffer.
|
||||
// We dispatch exactly as many work groups as there are copies, so no need to check bounds.
|
||||
uint copy = gl_WorkGroupID.x;
|
||||
|
||||
if (copy >= copies.length()) {
|
||||
return;
|
||||
}
|
||||
// Each invocation in the work group is responsible for one uint in the copy.
|
||||
uint i = gl_LocalInvocationID.x;
|
||||
|
||||
// Unpack the copy.
|
||||
uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
|
||||
uint dstOffset = copies[copy].dstOffset;
|
||||
uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
|
||||
uint size = sizeAndSrcOffset >> 24;
|
||||
|
||||
uint dstOffset = copies[copy].dstOffset;
|
||||
// Fetch the uint to copy before exiting to make instruction reordering happy.
|
||||
// With 20mb going through a 24mb staging buffer, this made a 1ms/frame difference.
|
||||
// Should properly test with nsight at some point.
|
||||
uint toCopy = src[srcOffset + i];
|
||||
|
||||
for (uint i = 0; i < size; i++) {
|
||||
dst[dstOffset + i] = src[srcOffset + i];
|
||||
if (i >= size) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[dstOffset + i] = toCopy;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue