Poking and prodding

- Precompute the reciprocal of the image size on the CPU so the shader multiplies instead of dividing on the GPU
- Increase the depth reduce compute work group size from 8x8 to 16x16
- Early-out in uploadInstances when no pages have changed (zero cardinality)
  - It's much faster to calculate the cardinality than it is to clear an
    AtomicBitSet, so the check is worth it
- Write the scatter list directly into the staging buffer if there's room,
  falling back to the dedicated scatter buffer otherwise
This commit is contained in:
Jozufozu 2024-09-09 20:39:10 -07:00
parent f12aa15dae
commit 0bfaac7154
5 changed files with 32 additions and 8 deletions

View file

@ -35,7 +35,6 @@ public final class MaterialRenderState {
setupBackfaceCulling(material.backfaceCulling());
setupPolygonOffset(material.polygonOffset());
setupDepthTest(material.depthTest());
// setupDepthTest(DepthTest.OFF);
setupTransparency(material.transparency());
setupWriteMask(material.writeMask());
}

View file

@ -11,6 +11,8 @@ import dev.engine_room.flywheel.lib.math.MoreMath;
import net.minecraft.client.Minecraft;
public class DepthPyramid {
private static final int GROUP_SIZE = 16;
private final GlProgram depthReduceProgram;
public int pyramidTextureId = -1;
@ -50,10 +52,10 @@ public class DepthPyramid {
GL46.glBindImageTexture(0, pyramidTextureId, i, false, 0, GL32.GL_WRITE_ONLY, GL32.GL_R32F);
depthReduceProgram.setVec2("imageSize", mipWidth, mipHeight);
depthReduceProgram.setVec2("oneOverImageSize", 1f / (float) mipWidth, 1f / (float) mipHeight);
depthReduceProgram.setInt("lod", Math.max(0, i - 1));
GL46.glDispatchCompute(MoreMath.ceilingDiv(mipWidth, 8), MoreMath.ceilingDiv(mipHeight, 8), 1);
GL46.glDispatchCompute(MoreMath.ceilingDiv(mipWidth, GROUP_SIZE), MoreMath.ceilingDiv(mipHeight, GROUP_SIZE), 1);
GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT);
}

View file

@ -82,6 +82,11 @@ public class IndirectInstancer<I extends Instance> extends AbstractInstancer<I>
}
public void uploadInstances(StagingBuffer stagingBuffer, int instanceVbo) {
if (changedPages.cardinality() == 0) {
// Early return because checking the cardinality is faster than clearing.
return;
}
int numPages = mapping.pageCount();
var instanceCount = instances.size();

View file

@ -23,6 +23,8 @@ public class StagingBuffer {
private static final int STORAGE_FLAGS = GL45C.GL_MAP_PERSISTENT_BIT | GL45C.GL_MAP_WRITE_BIT | GL45C.GL_CLIENT_STORAGE_BIT;
private static final int MAP_FLAGS = GL45C.GL_MAP_PERSISTENT_BIT | GL45C.GL_MAP_WRITE_BIT | GL45C.GL_MAP_FLUSH_EXPLICIT_BIT | GL45C.GL_MAP_INVALIDATE_BUFFER_BIT;
private static final int SSBO_ALIGNMENT = GL45.glGetInteger(GL45.GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
private final int vbo;
private final long map;
private final long capacity;
@ -254,7 +256,6 @@ public class StagingBuffer {
scatterProgram.bind();
// These bindings don't change between dstVbos.
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, scatterBuffer.handle());
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 1, vbo);
int dstVbo;
@ -276,7 +277,24 @@ public class StagingBuffer {
}
private void dispatchScatter(int dstVbo) {
scatterBuffer.upload(scatterList.ptr(), scatterList.usedBytes());
var scatterSize = scatterList.usedBytes();
long alignedPos = pos + SSBO_ALIGNMENT - 1 - (pos + SSBO_ALIGNMENT - 1) % SSBO_ALIGNMENT;
long remaining = capacity - alignedPos;
if (scatterSize <= remaining && scatterSize <= totalAvailable) {
MemoryUtil.memCopy(scatterList.ptr(), map + alignedPos, scatterSize);
GL45.glBindBufferRange(GL45C.GL_SHADER_STORAGE_BUFFER, 0, vbo, alignedPos, scatterSize);
long alignmentCost = alignedPos - pos;
usedCapacity += scatterSize + alignmentCost;
totalAvailable -= scatterSize + alignmentCost;
pos += scatterSize + alignmentCost;
} else {
scatterBuffer.upload(scatterList.ptr(), scatterSize);
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, scatterBuffer.handle());
}
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);

View file

@ -1,9 +1,9 @@
layout(local_size_x = 8, local_size_y = 8) in;
layout(local_size_x = 16, local_size_y = 16) in;
layout(binding = 0, r32f) uniform writeonly image2D outImage;
layout(binding = 1) uniform sampler2D inImage;
uniform vec2 imageSize;
uniform vec2 oneOverImageSize;
uniform int lod;
uniform int useMin = 0;
@ -13,7 +13,7 @@ void main() {
// Map the output texel to an input texel. Properly do the division because generating mip0 maps from the actual
// full resolution depth buffer and the aspect ratio may be different from our Po2 pyramid.
ivec2 samplePos = ivec2(floor(vec2(pos) * vec2(textureSize(inImage, lod)) / imageSize));
ivec2 samplePos = ivec2(floor(vec2(pos) * vec2(textureSize(inImage, lod)) * oneOverImageSize));
float depth01 = texelFetchOffset(inImage, samplePos, lod, ivec2(0, 1)).r;
float depth11 = texelFetchOffset(inImage, samplePos, lod, ivec2(1, 1)).r;