mirror of
https://github.com/Jozufozu/Flywheel.git
synced 2024-12-27 07:26:48 +01:00
Cache deposits
- Make the scatter shader more cache friendly by doing one copy per workgroup and making copies up to 64 uints long
This commit is contained in:
parent
2ee3944ca6
commit
d5c5b998bc
3 changed files with 20 additions and 12 deletions
|
@ -12,10 +12,11 @@ public class ScatterList {
|
||||||
private long usedBytes;
|
private long usedBytes;
|
||||||
|
|
||||||
public ScatterList() {
|
public ScatterList() {
|
||||||
this(64);
|
// The 64 should be the same as the local_size_x of the scatter shader (one uint per invocation); the * 4 converts uints to bytes.
|
||||||
|
this(64 * 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScatterList(long maxBytesPerScatter) {
|
private ScatterList(long maxBytesPerScatter) {
|
||||||
if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
|
if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
|
||||||
throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
|
throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,6 @@ import org.lwjgl.opengl.GL45C;
|
||||||
import org.lwjgl.system.MemoryUtil;
|
import org.lwjgl.system.MemoryUtil;
|
||||||
|
|
||||||
import com.jozufozu.flywheel.backend.compile.IndirectPrograms;
|
import com.jozufozu.flywheel.backend.compile.IndirectPrograms;
|
||||||
import com.jozufozu.flywheel.backend.gl.GlCompat;
|
|
||||||
import com.jozufozu.flywheel.backend.gl.GlFence;
|
import com.jozufozu.flywheel.backend.gl.GlFence;
|
||||||
import com.jozufozu.flywheel.backend.gl.buffer.GlBuffer;
|
import com.jozufozu.flywheel.backend.gl.buffer.GlBuffer;
|
||||||
import com.jozufozu.flywheel.backend.gl.shader.GlProgram;
|
import com.jozufozu.flywheel.backend.gl.shader.GlProgram;
|
||||||
|
@ -282,7 +281,7 @@ public class StagingBuffer {
|
||||||
|
|
||||||
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
|
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
|
||||||
|
|
||||||
GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1);
|
GL45.glDispatchCompute(scatterList.copyCount(), 1, 1);
|
||||||
|
|
||||||
scatterList.reset();
|
scatterList.reset();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
|
layout(local_size_x = 64) in;
|
||||||
|
|
||||||
const uint SRC_OFFSET_MASK = 0xFFFFFF;
|
const uint SRC_OFFSET_MASK = 0xFFFFFF;
|
||||||
|
|
||||||
|
@ -24,19 +24,27 @@ layout(std430, binding = 2) restrict writeonly buffer Dst {
|
||||||
};
|
};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
uint copy = gl_GlobalInvocationID.x;
|
// Each work group is responsible for one of the copies in the buffer.
|
||||||
|
// We dispatch exactly as many work groups as there are copies, so no need to check bounds.
|
||||||
|
uint copy = gl_WorkGroupID.x;
|
||||||
|
|
||||||
if (copy >= copies.length()) {
|
// Each invocation in the work group is responsible for one uint in the copy.
|
||||||
return;
|
uint i = gl_LocalInvocationID.x;
|
||||||
}
|
|
||||||
|
|
||||||
|
// Unpack the copy.
|
||||||
uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
|
uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
|
||||||
|
uint dstOffset = copies[copy].dstOffset;
|
||||||
uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
|
uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
|
||||||
uint size = sizeAndSrcOffset >> 24;
|
uint size = sizeAndSrcOffset >> 24;
|
||||||
|
|
||||||
uint dstOffset = copies[copy].dstOffset;
|
// Fetch the uint to copy before exiting to make instruction reordering happy.
|
||||||
|
// With 20 MB going through a 24 MB staging buffer, this made a 1 ms/frame difference.
|
||||||
|
// Should properly test with nsight at some point.
|
||||||
|
uint toCopy = src[srcOffset + i];
|
||||||
|
|
||||||
for (uint i = 0; i < size; i++) {
|
if (i >= size) {
|
||||||
dst[dstOffset + i] = src[srcOffset + i];
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dst[dstOffset + i] = toCopy;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue