diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java index 39d211ef9..ad416941f 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java @@ -26,12 +26,15 @@ import net.minecraft.resources.ResourceLocation; public class IndirectPrograms extends AtomicReferenceCounted { private static final ResourceLocation CULL_SHADER_API_IMPL = Flywheel.rl("internal/indirect/cull_api_impl.glsl"); - private static final ResourceLocation CULL_SHADER_MAIN = Flywheel.rl("internal/indirect/cull.glsl"); + private static final ResourceLocation CULL_SHADER_MAIN = Flywheel.rl("internal/indirect/early_cull.glsl"); + private static final ResourceLocation PASS2_SHADER_MAIN = Flywheel.rl("internal/indirect/late_cull.glsl"); private static final ResourceLocation APPLY_SHADER_MAIN = Flywheel.rl("internal/indirect/apply.glsl"); private static final ResourceLocation SCATTER_SHADER_MAIN = Flywheel.rl("internal/indirect/scatter.glsl"); + private static final ResourceLocation READ_VISIBILITY_SHADER_MAIN = Flywheel.rl("internal/indirect/read_visibility.glsl"); + private static final ResourceLocation ZERO_MODELS_SHADER_MAIN = Flywheel.rl("internal/indirect/zero_models.glsl"); private static final ResourceLocation DOWNSAMPLE_FIRST = Flywheel.rl("internal/indirect/downsample_first.glsl"); private static final ResourceLocation DOWNSAMPLE_SECOND = Flywheel.rl("internal/indirect/downsample_second.glsl"); - public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, DOWNSAMPLE_FIRST, DOWNSAMPLE_SECOND); + public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, READ_VISIBILITY_SHADER_MAIN, ZERO_MODELS_SHADER_MAIN, DOWNSAMPLE_FIRST, DOWNSAMPLE_SECOND); private static final Compile> 
CULL = new Compile<>(); private static final Compile UTIL = new Compile<>(); @@ -44,11 +47,13 @@ public class IndirectPrograms extends AtomicReferenceCounted { private final PipelineCompiler pipeline; private final CompilationHarness> culling; + private final CompilationHarness> cullPassTwo; private final CompilationHarness utils; - private IndirectPrograms(PipelineCompiler pipeline, CompilationHarness> culling, CompilationHarness utils) { + private IndirectPrograms(PipelineCompiler pipeline, CompilationHarness> culling, CompilationHarness> cullPassTwo, CompilationHarness utils) { this.pipeline = pipeline; this.culling = culling; + this.cullPassTwo = cullPassTwo; this.utils = utils; } @@ -86,10 +91,11 @@ public class IndirectPrograms extends AtomicReferenceCounted { } var pipelineCompiler = PipelineCompiler.create(sources, Pipelines.INDIRECT, vertexComponents, fragmentComponents, EXTENSIONS); - var cullingCompiler = createCullingCompiler(sources); + var pass1Compiler = createCullingCompiler(sources, CULL_SHADER_MAIN, "early_cull"); + var pass2Compiler = createCullingCompiler(sources, PASS2_SHADER_MAIN, "late_cull"); var utilCompiler = createUtilCompiler(sources); - IndirectPrograms newInstance = new IndirectPrograms(pipelineCompiler, cullingCompiler, utilCompiler); + IndirectPrograms newInstance = new IndirectPrograms(pipelineCompiler, pass1Compiler, pass2Compiler, utilCompiler); setInstance(newInstance); } @@ -97,19 +103,19 @@ public class IndirectPrograms extends AtomicReferenceCounted { /** * A compiler for cull shaders, parameterized by the instance type. 
*/ - private static CompilationHarness> createCullingCompiler(ShaderSources sources) { + private static CompilationHarness> createCullingCompiler(ShaderSources sources, ResourceLocation main, String name) { return CULL.program() .link(CULL.shader(GlCompat.MAX_GLSL_VERSION, ShaderType.COMPUTE) - .nameMapper(instanceType -> "culling/" + ResourceUtil.toDebugFileNameNoExtension(instanceType.cullShader())) + .nameMapper(instanceType -> name + "/" + ResourceUtil.toDebugFileNameNoExtension(instanceType.cullShader())) .requireExtensions(COMPUTE_EXTENSIONS) .define("_FLW_SUBGROUP_SIZE", GlCompat.SUBGROUP_SIZE) .withResource(CULL_SHADER_API_IMPL) .withComponent(InstanceStructComponent::new) .withResource(InstanceType::cullShader) .withComponent(SsboInstanceComponent::new) - .withResource(CULL_SHADER_MAIN)) + .withResource(main)) .postLink((key, program) -> Uniforms.setUniformBlockBindings(program)) - .harness("culling", sources); + .harness(name, sources); } /** @@ -156,10 +162,18 @@ public class IndirectPrograms extends AtomicReferenceCounted { return culling.get(instanceType); } + public GlProgram getCullPassTwoProgram(InstanceType instanceType) { + return cullPassTwo.get(instanceType); + } + public GlProgram getApplyProgram() { return utils.get(APPLY_SHADER_MAIN); } + public GlProgram getZeroModelProgram() { + return utils.get(ZERO_MODELS_SHADER_MAIN); + } + public GlProgram getScatterProgram() { return utils.get(SCATTER_SHADER_MAIN); } @@ -172,10 +186,15 @@ public class IndirectPrograms extends AtomicReferenceCounted { return utils.get(DOWNSAMPLE_SECOND); } + public GlProgram getReadVisibilityProgram() { + return utils.get(READ_VISIBILITY_SHADER_MAIN); + } + @Override protected void _delete() { pipeline.delete(); culling.delete(); + cullPassTwo.delete(); utils.delete(); } } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java 
index 0b9eea6fd..67b0a9dc6 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/BufferBindings.java @@ -1,15 +1,18 @@ package dev.engine_room.flywheel.backend.engine.indirect; public final class BufferBindings { - public static final int PAGE_FRAME_DESCRIPTOR = 0; - public static final int INSTANCE = 1; - public static final int DRAW_INSTANCE_INDEX = 2; - public static final int MODEL = 3; - public static final int DRAW = 4; + public static final int PASS_TWO_DISPATCH = 0; + public static final int PASS_TWO_INSTANCE_INDEX = 1; + public static final int PAGE_FRAME_DESCRIPTOR = 2; + public static final int INSTANCE = 3; + public static final int DRAW_INSTANCE_INDEX = 4; + public static final int MODEL = 5; + public static final int DRAW = 6; - public static final int LIGHT_LUT = 5; - public static final int LIGHT_SECTION = 6; - public static final int MATRICES = 7; + public static final int LIGHT_LUT = 7; + public static final int LIGHT_SECTION = 8; + public static final int MATRICES = 9; + public static final int LAST_FRAME_VISIBILITY = 10; private BufferBindings() { } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java index 866150347..722c89e8f 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectBuffers.java @@ -11,7 +11,7 @@ import dev.engine_room.flywheel.lib.memory.MemoryBlock; public class IndirectBuffers { // Number of vbos created. 
- public static final int BUFFER_COUNT = 5; + public static final int BUFFER_COUNT = 7; public static final long INT_SIZE = Integer.BYTES; public static final long PTR_SIZE = Pointer.POINTER_SIZE; @@ -30,6 +30,8 @@ public class IndirectBuffers { private static final long BUFFERS_SIZE_BYTES = SIZE_OFFSET + BUFFER_COUNT * PTR_SIZE; // Offsets to the vbos + private static final long PASS_TWO_DISPATCH_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * INT_SIZE; + private static final long PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * INT_SIZE; private static final long PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * INT_SIZE; private static final long INSTANCE_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.INSTANCE * INT_SIZE; private static final long DRAW_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * INT_SIZE; @@ -37,6 +39,8 @@ public class IndirectBuffers { private static final long DRAW_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW * INT_SIZE; // Offsets to the sizes + private static final long PASS_TWO_DISPATCH_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * PTR_SIZE; + private static final long PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * PTR_SIZE; private static final long PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * PTR_SIZE; private static final long INSTANCE_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.INSTANCE * PTR_SIZE; private static final long DRAW_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * PTR_SIZE; @@ -62,6 +66,8 @@ public class IndirectBuffers { */ private final MemoryBlock multiBindBlock; + public final ResizableStorageBuffer passTwoDispatch; + public final ResizableStorageArray passTwoInstanceIndex; public final ObjectStorage objectStorage; public final 
ResizableStorageArray drawInstanceIndex; public final ResizableStorageArray model; @@ -70,25 +76,34 @@ public class IndirectBuffers { IndirectBuffers(long instanceStride) { this.multiBindBlock = MemoryBlock.calloc(BUFFERS_SIZE_BYTES, 1); + passTwoDispatch = new ResizableStorageBuffer(); + passTwoInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR); objectStorage = new ObjectStorage(instanceStride); drawInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR); model = new ResizableStorageArray(MODEL_STRIDE, MODEL_GROWTH_FACTOR); draw = new ResizableStorageArray(DRAW_COMMAND_STRIDE, DRAW_GROWTH_FACTOR); + + passTwoDispatch.ensureCapacity(INT_SIZE * 4); } void updateCounts(int instanceCount, int modelCount, int drawCount) { drawInstanceIndex.ensureCapacity(instanceCount); + passTwoInstanceIndex.ensureCapacity(instanceCount); model.ensureCapacity(modelCount); draw.ensureCapacity(drawCount); final long ptr = multiBindBlock.ptr(); + MemoryUtil.memPutInt(ptr + PASS_TWO_DISPATCH_HANDLE_OFFSET, passTwoDispatch.handle()); + MemoryUtil.memPutInt(ptr + PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET, passTwoInstanceIndex.handle()); MemoryUtil.memPutInt(ptr + PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET, objectStorage.frameDescriptorBuffer.handle()); MemoryUtil.memPutInt(ptr + INSTANCE_HANDLE_OFFSET, objectStorage.objectBuffer.handle()); MemoryUtil.memPutInt(ptr + DRAW_INSTANCE_INDEX_HANDLE_OFFSET, drawInstanceIndex.handle()); MemoryUtil.memPutInt(ptr + MODEL_HANDLE_OFFSET, model.handle()); MemoryUtil.memPutInt(ptr + DRAW_HANDLE_OFFSET, draw.handle()); + MemoryUtil.memPutAddress(ptr + PASS_TWO_DISPATCH_SIZE_OFFSET, passTwoDispatch.capacity()); + MemoryUtil.memPutAddress(ptr + PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount); MemoryUtil.memPutAddress(ptr + PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET, objectStorage.frameDescriptorBuffer.capacity()); MemoryUtil.memPutAddress(ptr + INSTANCE_SIZE_OFFSET, objectStorage.objectBuffer.capacity()); 
MemoryUtil.memPutAddress(ptr + DRAW_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount); @@ -96,16 +111,25 @@ public class IndirectBuffers { MemoryUtil.memPutAddress(ptr + DRAW_SIZE_OFFSET, DRAW_COMMAND_STRIDE * drawCount); } - public void bindForCull() { - multiBind(0, 4); + public void bindForCullPassOne() { + multiBind(0, 6); + } + + public void bindForCullPassTwo() { + multiBind(0, 6); + GlBufferType.DISPATCH_INDIRECT_BUFFER.bind(passTwoDispatch.handle()); } public void bindForApply() { - multiBind(3, 2); + multiBind(5, 2); + } + + public void bindForModelReset() { + multiBind(5, 1); } public void bindForDraw() { - multiBind(1, 4); + multiBind(3, 4); GlBufferType.DRAW_INDIRECT_BUFFER.bind(draw.handle()); } @@ -113,12 +137,15 @@ public class IndirectBuffers { * Bind all buffers except the draw command buffer. */ public void bindForCrumbling() { - multiBind(1, 4); + multiBind(3, 3); } private void multiBind(int base, int count) { final long ptr = multiBindBlock.ptr(); - nglBindBuffersRange(GL_SHADER_STORAGE_BUFFER, base, count, ptr + base * INT_SIZE, ptr + OFFSET_OFFSET + base * PTR_SIZE, ptr + SIZE_OFFSET + base * PTR_SIZE); + long handlePtr = ptr + HANDLE_OFFSET + base * INT_SIZE; + long offsetPtr = ptr + OFFSET_OFFSET + base * PTR_SIZE; + long sizePtr = ptr + SIZE_OFFSET + base * PTR_SIZE; + nglBindBuffersRange(GL_SHADER_STORAGE_BUFFER, base, count, handlePtr, offsetPtr, sizePtr); } public void delete() { @@ -128,5 +155,7 @@ public class IndirectBuffers { drawInstanceIndex.delete(); model.delete(); draw.delete(); + passTwoDispatch.delete(); + passTwoInstanceIndex.delete(); } } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java index 9060b5363..b998ea6f8 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java +++ 
b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectCullingGroup.java @@ -6,6 +6,7 @@ import static org.lwjgl.opengl.GL30.glUniform1ui; import static org.lwjgl.opengl.GL42.GL_COMMAND_BARRIER_BIT; import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.glDispatchCompute; +import static org.lwjgl.opengl.GL43.glDispatchComputeIndirect; import java.util.ArrayList; import java.util.Comparator; @@ -13,6 +14,8 @@ import java.util.EnumMap; import java.util.List; import java.util.Map; +import org.lwjgl.opengl.GL46; + import dev.engine_room.flywheel.api.instance.Instance; import dev.engine_room.flywheel.api.instance.InstanceType; import dev.engine_room.flywheel.api.material.Material; @@ -26,6 +29,7 @@ import dev.engine_room.flywheel.backend.engine.MeshPool; import dev.engine_room.flywheel.backend.engine.uniform.Uniforms; import dev.engine_room.flywheel.backend.gl.GlCompat; import dev.engine_room.flywheel.backend.gl.shader.GlProgram; +import dev.engine_room.flywheel.lib.material.LightShaders; import dev.engine_room.flywheel.lib.math.MoreMath; public class IndirectCullingGroup { @@ -43,11 +47,18 @@ public class IndirectCullingGroup { private final Map> multiDraws = new EnumMap<>(VisualType.class); private final IndirectPrograms programs; - private final GlProgram cullProgram; + private final GlProgram earlyCull; + private final GlProgram lateCull; private boolean needsDrawBarrier; private boolean needsDrawSort; - private int instanceCountThisFrame; + public int instanceCountThisFrame; + + private int pagesLastFrame = 0; + private int pagesThisFrame = 0; + + private int visibilityWriteOffsetPages = 0; + private int visibilityReadOffsetPages = 0; IndirectCullingGroup(InstanceType instanceType, IndirectPrograms programs) { this.instanceType = instanceType; @@ -56,7 +67,8 @@ public class IndirectCullingGroup { buffers = new IndirectBuffers(instanceStride); this.programs = programs; - cullProgram = 
programs.getCullingProgram(instanceType); + earlyCull = programs.getCullingProgram(instanceType); + lateCull = programs.getCullPassTwoProgram(instanceType); } public void flushInstancers() { @@ -83,6 +95,17 @@ public class IndirectCullingGroup { } } + public int flipVisibilityOffsets(int visibilityWriteOffsetPages) { + this.visibilityReadOffsetPages = this.visibilityWriteOffsetPages; + this.visibilityWriteOffsetPages = visibilityWriteOffsetPages; + + pagesLastFrame = pagesThisFrame; + + pagesThisFrame = buffers.objectStorage.capacity(); + + return pagesThisFrame; + } + public void upload(StagingBuffer stagingBuffer) { if (nothingToDo()) { return; @@ -105,7 +128,7 @@ public class IndirectCullingGroup { uploadDraws(stagingBuffer); - needsDrawBarrier = true; + GL46.nglClearNamedBufferData(buffers.passTwoDispatch.handle(), GL46.GL_R32UI, GL46.GL_RED, GL46.GL_UNSIGNED_INT, 0); } public void dispatchCull() { @@ -114,12 +137,26 @@ public class IndirectCullingGroup { } Uniforms.bindAll(); - cullProgram.bind(); + earlyCull.bind(); - buffers.bindForCull(); + earlyCull.setUInt("_flw_visibilityReadOffsetPages", visibilityReadOffsetPages); + + buffers.bindForCullPassOne(); glDispatchCompute(buffers.objectStorage.capacity(), 1, 1); } + public void dispatchCullPassTwo() { + if (nothingToDo()) { + return; + } + + Uniforms.bindAll(); + lateCull.bind(); + + buffers.bindForCullPassTwo(); + glDispatchComputeIndirect(0); + } + public void dispatchApply() { if (nothingToDo()) { return; @@ -127,6 +164,17 @@ public class IndirectCullingGroup { buffers.bindForApply(); glDispatchCompute(GlCompat.getComputeGroupCount(indirectDraws.size()), 1, 1); + + needsDrawBarrier = true; + } + + public void dispatchModelReset() { + if (nothingToDo()) { + return; + } + + buffers.bindForModelReset(); + glDispatchCompute(GlCompat.getComputeGroupCount(instancers.size()), 1, 1); } private boolean nothingToDo() { @@ -209,6 +257,8 @@ public class IndirectCullingGroup { // Don't need to do this unless the 
program changes. drawProgram.bind(); baseDrawUniformLoc = drawProgram.getUniformLocation("_flw_baseDraw"); + + drawProgram.setUInt("_flw_visibilityWriteOffsetInstances", visibilityWriteOffsetPages << ObjectStorage.LOG_2_PAGE_SIZE); } glUniform1ui(baseDrawUniformLoc, multiDraw.start); diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java index 4b14df6b2..c898a84dc 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java @@ -4,6 +4,7 @@ import static org.lwjgl.opengl.GL11.GL_TRIANGLES; import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT; import static org.lwjgl.opengl.GL30.glBindBufferRange; import static org.lwjgl.opengl.GL40.glDrawElementsIndirect; +import static org.lwjgl.opengl.GL42.GL_BUFFER_UPDATE_BARRIER_BIT; import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BARRIER_BIT; import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BUFFER; @@ -12,6 +13,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import com.mojang.blaze3d.platform.GlStateManager; + import dev.engine_room.flywheel.api.backend.Engine; import dev.engine_room.flywheel.api.instance.Instance; import dev.engine_room.flywheel.api.instance.InstanceType; @@ -30,6 +33,7 @@ import dev.engine_room.flywheel.backend.engine.MeshPool; import dev.engine_room.flywheel.backend.engine.TextureBinder; import dev.engine_room.flywheel.backend.engine.embed.EnvironmentStorage; import dev.engine_room.flywheel.backend.engine.uniform.Uniforms; +import dev.engine_room.flywheel.backend.gl.GlTextureUnit; import dev.engine_room.flywheel.backend.gl.array.GlVertexArray; import dev.engine_room.flywheel.backend.gl.buffer.GlBuffer; import 
dev.engine_room.flywheel.backend.gl.buffer.GlBufferType; @@ -49,6 +53,9 @@ public class IndirectDrawManager extends DrawManager> { private final MatrixBuffer matrixBuffer; private final DepthPyramid depthPyramid; + private final VisibilityBuffer visibilityBuffer; + + private int totalPagesLastFrame = 0; private boolean needsBarrier = false; @@ -66,6 +73,7 @@ public class IndirectDrawManager extends DrawManager> { matrixBuffer = new MatrixBuffer(); depthPyramid = new DepthPyramid(programs); + visibilityBuffer = new VisibilityBuffer(programs); } @Override @@ -90,7 +98,8 @@ public class IndirectDrawManager extends DrawManager> { } public void render(VisualType visualType) { - if (!hasVisualType(visualType)) { + // FIXME: Two pass occlusion prefers to render everything at once + if (visualType != VisualType.BLOCK_ENTITY) { return; } @@ -101,17 +110,71 @@ public class IndirectDrawManager extends DrawManager> { matrixBuffer.bind(); Uniforms.bindAll(); - if (needsBarrier) { - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - needsBarrier = false; + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT); + + visibilityBuffer.bind(); + + for (var group1 : cullingGroups.values()) { + group1.dispatchCull(); } + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + dispatchApply(); + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + visibilityBuffer.attach(); + + submitDraws(); + + depthPyramid.generate(); + + programs.getZeroModelProgram() + .bind(); + for (var group : cullingGroups.values()) { - group.submit(visualType); + group.dispatchModelReset(); } + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + GlTextureUnit.T0.makeActive(); + GlStateManager._bindTexture(depthPyramid.pyramidTextureId); + + for (var group1 : cullingGroups.values()) { + group1.dispatchCullPassTwo(); + } + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + dispatchApply(); + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + submitDraws(); + MaterialRenderState.reset(); 
TextureBinder.resetLightAndOverlay(); + + visibilityBuffer.detach(); + } + + private void dispatchApply() { + programs.getApplyProgram() + .bind(); + + for (var group1 : cullingGroups.values()) { + group1.dispatchApply(); + } + } + + private void submitDraws() { + for (var group : cullingGroups.values()) { + group.submit(VisualType.BLOCK_ENTITY); + group.submit(VisualType.ENTITY); + group.submit(VisualType.EFFECT); + } } @Override @@ -122,12 +185,20 @@ public class IndirectDrawManager extends DrawManager> { group.flushInstancers(); } + visibilityBuffer.read(totalPagesLastFrame); + visibilityBuffer.clear(); + cullingGroups.values() .removeIf(IndirectCullingGroup::checkEmptyAndDelete); instancers.values() .removeIf(instancer -> instancer.instanceCount() == 0); + int totalPagesThisFrame = 0; + for (var group : cullingGroups.values()) { + totalPagesThisFrame += group.flipVisibilityOffsets(totalPagesThisFrame); + } + meshPool.flush(); stagingBuffer.reclaim(); @@ -142,31 +213,12 @@ public class IndirectDrawManager extends DrawManager> { stagingBuffer.flush(); - depthPyramid.generate(); - // We could probably save some driver calls here when there are // actually zero instances, but that feels like a very rare case - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - - matrixBuffer.bind(); - - depthPyramid.bindForCull(); - - for (var group : cullingGroups.values()) { - group.dispatchCull(); - } - - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - - programs.getApplyProgram() - .bind(); - - for (var group : cullingGroups.values()) { - group.dispatchApply(); - } - needsBarrier = true; + + totalPagesLastFrame = totalPagesThisFrame; } @Override @@ -186,6 +238,8 @@ public class IndirectDrawManager extends DrawManager> { programs.release(); depthPyramid.delete(); + + visibilityBuffer.delete(); } public void renderCrumbling(List crumblingBlocks) { diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/StagingBuffer.java 
b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/StagingBuffer.java index 59a8c3e0a..5838de67e 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/StagingBuffer.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/StagingBuffer.java @@ -22,6 +22,8 @@ public class StagingBuffer { private static final int STORAGE_FLAGS = GL45C.GL_MAP_PERSISTENT_BIT | GL45C.GL_MAP_WRITE_BIT | GL45C.GL_CLIENT_STORAGE_BIT; private static final int MAP_FLAGS = GL45C.GL_MAP_PERSISTENT_BIT | GL45C.GL_MAP_WRITE_BIT | GL45C.GL_MAP_FLUSH_EXPLICIT_BIT | GL45C.GL_MAP_INVALIDATE_BUFFER_BIT; + private static final int SSBO_ALIGNMENT = GL45.glGetInteger(GL45.GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); + private final int vbo; private final long map; private final long capacity; @@ -252,7 +254,6 @@ public class StagingBuffer { .bind(); // These bindings don't change between dstVbos. - GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, scatterBuffer.handle()); GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 1, vbo); int dstVbo; @@ -274,7 +275,24 @@ public class StagingBuffer { } private void dispatchScatter(int dstVbo) { - scatterBuffer.upload(scatterList.ptr(), scatterList.usedBytes()); + var scatterSize = scatterList.usedBytes(); + + long alignedPos = pos + SSBO_ALIGNMENT - 1 - (pos + SSBO_ALIGNMENT - 1) % SSBO_ALIGNMENT; + + long remaining = capacity - alignedPos; + if (scatterSize <= remaining && scatterSize <= totalAvailable) { + MemoryUtil.memCopy(scatterList.ptr(), map + alignedPos, scatterSize); + GL45.glBindBufferRange(GL45C.GL_SHADER_STORAGE_BUFFER, 0, vbo, alignedPos, scatterSize); + + long alignmentCost = alignedPos - pos; + + usedCapacity += scatterSize + alignmentCost; + totalAvailable -= scatterSize + alignmentCost; + pos += scatterSize + alignmentCost; + } else { + scatterBuffer.upload(scatterList.ptr(), scatterSize); + GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, 
scatterBuffer.handle()); + } GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo); diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java new file mode 100644 index 000000000..7a82f7735 --- /dev/null +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java @@ -0,0 +1,132 @@ +package dev.engine_room.flywheel.backend.engine.indirect; + +import org.lwjgl.opengl.GL30; +import org.lwjgl.opengl.GL32; +import org.lwjgl.opengl.GL46; +import org.lwjgl.opengl.GL46C; + +import com.mojang.blaze3d.platform.GlStateManager; + +import dev.engine_room.flywheel.backend.FlwBackend; +import dev.engine_room.flywheel.backend.compile.IndirectPrograms; +import dev.engine_room.flywheel.backend.gl.GlTextureUnit; +import dev.engine_room.flywheel.lib.math.MoreMath; +import it.unimi.dsi.fastutil.ints.IntArraySet; +import it.unimi.dsi.fastutil.ints.IntSet; +import net.minecraft.client.Minecraft; + +public class VisibilityBuffer { + private static final int READ_GROUP_SIZE = 32; + private static final int ATTACHMENT = GL30.GL_COLOR_ATTACHMENT1; + + private final IndirectPrograms programs; + private final ResizableStorageArray lastFrameVisibility; + private int textureId = -1; + + private int lastWidth = -1; + private int lastHeight = -1; + + private final IntSet attached = new IntArraySet(); + + public VisibilityBuffer(IndirectPrograms programs) { + this.programs = programs; + lastFrameVisibility = new ResizableStorageArray(Integer.BYTES, 1.25f); + } + + public void read(int pageCount) { + if (pageCount == 0) { + return; + } + + lastFrameVisibility.ensureCapacity(pageCount); + + GL46.nglClearNamedBufferData(lastFrameVisibility.handle(), GL46.GL_R32UI, GL46.GL_RED_INTEGER, GL46.GL_UNSIGNED_INT, 0); + + if (lastWidth == -1 || lastHeight == -1) { + return; + } + + 
programs.getReadVisibilityProgram() + .bind(); + bind(); + + GlTextureUnit.T0.makeActive(); + GlStateManager._bindTexture(textureId); + + GL46.glDispatchCompute(MoreMath.ceilingDiv(lastWidth, READ_GROUP_SIZE), MoreMath.ceilingDiv(lastHeight, READ_GROUP_SIZE), 1); + } + + public void bind() { + GL46.glBindBufferBase(GL46.GL_SHADER_STORAGE_BUFFER, BufferBindings.LAST_FRAME_VISIBILITY, lastFrameVisibility.handle()); + } + + public void attach() { + var mainRenderTarget = Minecraft.getInstance() + .getMainRenderTarget(); + + setupTexture(mainRenderTarget.width, mainRenderTarget.height); + + if (attached.add(mainRenderTarget.frameBufferId)) { + GL46.glNamedFramebufferTexture(mainRenderTarget.frameBufferId, ATTACHMENT, textureId, 0); + + try { + mainRenderTarget.checkStatus(); + } catch (Exception e) { + FlwBackend.LOGGER.error("Error attaching visbuffer", e); + } + } + + // Enable writes + GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0, ATTACHMENT }); + } + + public void detach() { + var mainRenderTarget = Minecraft.getInstance() + .getMainRenderTarget(); + + // Disable writes + GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0 }); + } + + public void delete() { + deleteTexture(); + lastFrameVisibility.delete(); + } + + private void deleteTexture() { + if (textureId != -1) { + GL32.glDeleteTextures(textureId); + textureId = -1; + } + } + + public void clear() { + if (lastWidth == -1 || lastHeight == -1) { + return; + } + + GL46C.nglClearTexImage(textureId, 0, GL32.GL_RED_INTEGER, GL32.GL_UNSIGNED_INT, 0); + } + + private void setupTexture(int width, int height) { + if (lastWidth == width && lastHeight == height) { + return; + } + + // Need to rebind to all fbos because an attachment becomes incomplete when it's resized + attached.clear(); + + lastWidth = width; + lastHeight = height; + + deleteTexture(); + + textureId = 
GL46.glCreateTextures(GL46.GL_TEXTURE_2D); + GL46.glTextureStorage2D(textureId, 1, GL32.GL_R32UI, width, height); + + GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MIN_FILTER, GL32.GL_NEAREST); + GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MAG_FILTER, GL32.GL_NEAREST); + GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_S, GL32.GL_CLAMP_TO_EDGE); + GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_T, GL32.GL_CLAMP_TO_EDGE); + } +} diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/gl/GlCompat.java b/common/src/backend/java/dev/engine_room/flywheel/backend/gl/GlCompat.java index b7efa23ed..8d80254c3 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/gl/GlCompat.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/gl/GlCompat.java @@ -10,6 +10,7 @@ import org.lwjgl.opengl.GL20C; import org.lwjgl.opengl.GL31C; import org.lwjgl.opengl.GL40; import org.lwjgl.opengl.GL43; +import org.lwjgl.opengl.GL46; import org.lwjgl.opengl.GLCapabilities; import org.lwjgl.opengl.KHRShaderSubgroup; import org.lwjgl.system.MemoryStack; @@ -42,6 +43,8 @@ public final class GlCompat { public static final boolean SUPPORTS_INSTANCING = isInstancingSupported(); public static final boolean SUPPORTS_INDIRECT = isIndirectSupported(); + public static final int MAX_SHADER_STORAGE_BUFFER_BINDINGS = GL46.glGetInteger(GL46.GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS); + private GlCompat() { } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/common.frag b/common/src/backend/resources/assets/flywheel/flywheel/internal/common.frag index 1037ec635..fb2153bcb 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/common.frag +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/common.frag @@ -13,11 +13,7 @@ uniform sampler2D _flw_crumblingTex; in vec2 _flw_crumblingTexCoord; #endif -#ifdef _FLW_DEBUG -flat in uint _flw_instanceID; -#endif - -out vec4 _flw_outputColor; 
+layout(location = 0) out vec4 _flw_outputColor; float _flw_diffuseFactor() { if (flw_material.diffuse) { @@ -35,7 +31,7 @@ float _flw_diffuseFactor() { } } -void _flw_main() { +void _flw_main(uint instanceID) { flw_sampleColor = texture(flw_diffuseTex, flw_vertexTexCoord); flw_fragColor = flw_vertexColor * flw_sampleColor; flw_fragOverlay = flw_vertexOverlay; @@ -81,7 +77,7 @@ void _flw_main() { color = vec4(flw_vertexNormal * .5 + .5, 1.); break; case 2u: - color = _flw_id2Color(_flw_instanceID); + color = _flw_id2Color(instanceID); break; case 3u: color = vec4(vec2((flw_fragLight * 15.0 + 0.5) / 16.), 0., 1.); diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/common.vert b/common/src/backend/resources/assets/flywheel/flywheel/internal/common.vert index 8163c1118..888114d5d 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/common.vert +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/common.vert @@ -71,11 +71,7 @@ mat4 _flw_modelMatrix; mat3 _flw_normalMatrix; #endif -#ifdef _FLW_DEBUG -flat out uint _flw_instanceID; -#endif - -void _flw_main(in FlwInstance instance, in uint stableInstanceID) { +void _flw_main(in FlwInstance instance) { _flw_layoutVertex(); flw_instanceVertex(instance); flw_materialVertex(); @@ -94,8 +90,4 @@ void _flw_main(in FlwInstance instance, in uint stableInstanceID) { flw_distance = fogDistance(flw_vertexPos.xyz, flw_cameraPos, flw_fogShape); gl_Position = flw_viewProjection * flw_vertexPos; - - #ifdef _FLW_DEBUG - _flw_instanceID = stableInstanceID; - #endif } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl index 256bd68eb..87eb99051 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl +++ 
b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/buffer_bindings.glsl @@ -1,12 +1,17 @@ // Per culling group -#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 0// cull -#define _FLW_INSTANCE_BUFFER_BINDING 1// cull, draw -#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 2// cull, draw -#define _FLW_MODEL_BUFFER_BINDING 3// cull, apply -#define _FLW_DRAW_BUFFER_BINDING 4// apply, draw +#define _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING 0 // cull1 +#define _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING 1 // cull1, cull2 +#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 2 // cull1, cull2 +#define _FLW_INSTANCE_BUFFER_BINDING 3 // cull1, cull2, draw +#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 4 // cull1, cull2, draw +#define _FLW_MODEL_BUFFER_BINDING 5 // cull1, cull2, apply +#define _FLW_DRAW_BUFFER_BINDING 6 // apply, draw + // Global to the engine -#define _FLW_LIGHT_LUT_BUFFER_BINDING 5 -#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 6 +#define _FLW_LIGHT_LUT_BUFFER_BINDING 7 +#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 8 -#define _FLW_MATRIX_BUFFER_BINDING 7 +#define _FLW_MATRIX_BUFFER_BINDING 9 + +#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 10 diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl new file mode 100644 index 000000000..35b9b24ba --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl @@ -0,0 +1,31 @@ +layout(local_size_x = 16, local_size_y = 16) in; + +layout(binding = 0, r32f) uniform writeonly image2D outImage; +layout(binding = 1) uniform sampler2D inImage; + +uniform vec2 oneOverImageSize; +uniform int lod; + +uniform int useMin = 0; + +void main() { + uvec2 pos = gl_GlobalInvocationID.xy; + + // Map the output texel to an input texel. 
Properly do the division because generating mip0 maps from the actual + // full resolution depth buffer and the aspect ratio may be different from our Po2 pyramid. + ivec2 samplePos = ivec2(floor(vec2(pos) * vec2(textureSize(inImage, lod)) * oneOverImageSize)); + + float depth01 = texelFetchOffset(inImage, samplePos, lod, ivec2(0, 1)).r; + float depth11 = texelFetchOffset(inImage, samplePos, lod, ivec2(1, 1)).r; + float depth10 = texelFetchOffset(inImage, samplePos, lod, ivec2(1, 0)).r; + float depth00 = texelFetchOffset(inImage, samplePos, lod, ivec2(0, 0)).r; + + float depth; + if (useMin == 0) { + depth = max(max(depth00, depth01), max(depth10, depth11)); + } else { + depth = min(min(depth00, depth01), min(depth10, depth11)); + } + + imageStore(outImage, ivec2(pos), vec4(depth)); +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/dispatch.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/dispatch.glsl new file mode 100644 index 000000000..b0989a7a7 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/dispatch.glsl @@ -0,0 +1,6 @@ +struct _FlwLateCullDispatch { + uint x; + uint y; + uint z; + uint threadCount; +}; diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl new file mode 100644 index 000000000..1c82a935d --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/early_cull.glsl @@ -0,0 +1,122 @@ +#include "flywheel:internal/indirect/buffer_bindings.glsl" +#include "flywheel:internal/indirect/model_descriptor.glsl" +#include "flywheel:internal/uniforms/uniforms.glsl" +#include "flywheel:util/matrix.glsl" +#include "flywheel:internal/indirect/matrices.glsl" +#include "flywheel:internal/indirect/dispatch.glsl" + +layout(local_size_x = 32) in; + +uniform uint 
_flw_visibilityReadOffsetPages; + +layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer { + _FlwLateCullDispatch _flw_lateCullDispatch; +}; + +layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer PassTwoIndexBuffer { + uint _flw_passTwoIndices[]; +}; + +layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer { + uint _flw_drawIndices[]; +}; + +// High 6 bits for the number of instances in the page. +const uint _FLW_PAGE_COUNT_OFFSET = 26u; +// Bottom 26 bits for the model index. +const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF; + +layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer { + uint _flw_pageFrameDescriptors[]; +}; + +layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict readonly buffer LastFrameVisibilityBuffer { + uint _flw_lastFrameVisibility[]; +}; + +layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer { + ModelDescriptor _flw_models[]; +}; + +layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict readonly buffer MatrixBuffer { + Matrices _flw_matrices[]; +}; + +// Disgustingly vectorized sphere frustum intersection taking advantage of ahead of time packing. +// Only uses 6 fmas and some boolean ops. 
+// See also: +// flywheel:uniform/flywheel.glsl +// dev.engine_room.flywheel.lib.math.MatrixMath.writePackedFrustumPlanes +// org.joml.FrustumIntersection.testSphere +bool _flw_testSphere(vec3 center, float radius) { + bvec4 xyInside = greaterThanEqual(fma(flw_frustumPlanes.xyX, center.xxxx, fma(flw_frustumPlanes.xyY, center.yyyy, fma(flw_frustumPlanes.xyZ, center.zzzz, flw_frustumPlanes.xyW))), -radius.xxxx); + bvec2 zInside = greaterThanEqual(fma(flw_frustumPlanes.zX, center.xx, fma(flw_frustumPlanes.zY, center.yy, fma(flw_frustumPlanes.zZ, center.zz, flw_frustumPlanes.zW))), -radius.xx); + + return all(xyInside) && all(zInside); +} + +bool _flw_isVisible(uint instanceIndex, uint modelIndex) { + uint matrixIndex = _flw_models[modelIndex].matrixIndex; + BoundingSphere sphere = _flw_models[modelIndex].boundingSphere; + + vec3 center; + float radius; + _flw_unpackBoundingSphere(sphere, center, radius); + + FlwInstance instance = _flw_unpackInstance(instanceIndex); + + flw_transformBoundingSphere(instance, center, radius); + + if (matrixIndex > 0) { + transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius); + } + + return _flw_testSphere(center, radius); +} + +void main() { + uint pageIndex = gl_WorkGroupID.x; + + if (pageIndex >= _flw_pageFrameDescriptors.length()) { + return; + } + + uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex]; + + uint pageInstanceCount = packedModelIndexAndCount >> _FLW_PAGE_COUNT_OFFSET; + + if (gl_LocalInvocationID.x >= pageInstanceCount) { + return; + } + + uint instanceIndex = gl_GlobalInvocationID.x; + + uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK; + + if (!_flw_isVisible(instanceIndex, modelIndex)) { + return; + } + + uint pageVisibility = _flw_lastFrameVisibility[_flw_visibilityReadOffsetPages + pageIndex]; + + if ((pageVisibility & (1u << gl_LocalInvocationID.x)) != 0u) { + // This instance was visible last frame, it should be rendered early.
+ uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1); + uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex; + _flw_drawIndices[targetIndex] = instanceIndex; + } else { + // Try again later to see if it's been disoccluded. + uint targetIndex = atomicAdd(_flw_lateCullDispatch.threadCount, 1); + _flw_passTwoIndices[targetIndex] = instanceIndex; + + if (targetIndex % 32u == 0u) { + // This thread wrote an index that will be at the start of a new workgroup later + atomicAdd(_flw_lateCullDispatch.x, 1); + + if (targetIndex == 0) { + _flw_lateCullDispatch.y = 1; + _flw_lateCullDispatch.z = 1; + } + } + } +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl new file mode 100644 index 000000000..4a32340b9 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/late_cull.glsl @@ -0,0 +1,137 @@ +#include "flywheel:internal/indirect/buffer_bindings.glsl" +#include "flywheel:internal/indirect/model_descriptor.glsl" +#include "flywheel:internal/uniforms/uniforms.glsl" +#include "flywheel:util/matrix.glsl" +#include "flywheel:internal/indirect/matrices.glsl" +#include "flywheel:internal/indirect/dispatch.glsl" + +layout(local_size_x = 32) in; + + +layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer { + _FlwLateCullDispatch _flw_lateCullDispatch; +}; + +layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict readonly buffer PassTwoIndexBuffer { + uint _flw_passTwoIndices[]; +}; + +layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer { + uint _flw_drawIndices[]; +}; + + +// High 6 bits for the number of instances in the page. +const uint _FLW_PAGE_COUNT_OFFSET = 26u; +// Bottom 26 bits for the model index. 
+const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF; + + +layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer { + uint _flw_pageFrameDescriptors[]; +}; + +layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer { + ModelDescriptor _flw_models[]; +}; + +layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict readonly buffer MatrixBuffer { + Matrices _flw_matrices[]; +}; + +layout(binding = 0) uniform sampler2D _flw_depthPyramid; + +bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 aabb) { + // Closest point on the sphere is between the camera and the near plane, don't even attempt to cull. + if (c.z + r > -znear) { + return false; + } + + vec3 cr = c * r; + float czr2 = c.z * c.z - r * r; + + float vx = sqrt(c.x * c.x + czr2); + float minx = (vx * c.x - cr.z) / (vx * c.z + cr.x); + float maxx = (vx * c.x + cr.z) / (vx * c.z - cr.x); + + float vy = sqrt(c.y * c.y + czr2); + float miny = (vy * c.y - cr.z) / (vy * c.z + cr.y); + float maxy = (vy * c.y + cr.z) / (vy * c.z - cr.y); + + aabb = vec4(minx * P00, miny * P11, maxx * P00, maxy * P11); + aabb = aabb.xwzy * vec4(-0.5f, -0.5f, -0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space + + return true; +} + +bool _flw_isVisible(uint instanceIndex, uint modelIndex) { + uint matrixIndex = _flw_models[modelIndex].matrixIndex; + BoundingSphere sphere = _flw_models[modelIndex].boundingSphere; + + vec3 center; + float radius; + _flw_unpackBoundingSphere(sphere, center, radius); + + FlwInstance instance = _flw_unpackInstance(instanceIndex); + + flw_transformBoundingSphere(instance, center, radius); + + if (matrixIndex > 0) { + transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius); + } + + transformBoundingSphere(flw_view, center, radius); + + vec4 aabb; + if (projectSphere(center, radius, _flw_cullData.znear, _flw_cullData.P00, _flw_cullData.P11, aabb)) + { + float width = (aabb.z - 
aabb.x) * _flw_cullData.pyramidWidth; + float height = (aabb.w - aabb.y) * _flw_cullData.pyramidHeight; + + int level = clamp(int(ceil(log2(max(width, height)))), 0, _flw_cullData.pyramidLevels); + + ivec2 levelSize = textureSize(_flw_depthPyramid, level); + + ivec4 levelSizePair = ivec4(levelSize, levelSize); + + ivec4 bounds = ivec4(aabb * vec4(levelSizePair)); + + float depth01 = texelFetch(_flw_depthPyramid, bounds.xw, level).r; + float depth11 = texelFetch(_flw_depthPyramid, bounds.zw, level).r; + float depth10 = texelFetch(_flw_depthPyramid, bounds.zy, level).r; + float depth00 = texelFetch(_flw_depthPyramid, bounds.xy, level).r; + + float depth; + if (_flw_cullData.useMin == 0) { + depth = max(max(depth00, depth01), max(depth10, depth11)); + } else { + depth = min(min(depth00, depth01), min(depth10, depth11)); + } + + float depthSphere = 1. + _flw_cullData.znear / (center.z + radius); + + return depthSphere <= depth; + } + + return true; +} + +void main() { + if (gl_GlobalInvocationID.x >= _flw_lateCullDispatch.threadCount) { + return; + } + + uint instanceIndex = _flw_passTwoIndices[gl_GlobalInvocationID.x]; + + uint pageIndex = instanceIndex >> 5; + + uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex]; + + uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK; + + if (_flw_isVisible(instanceIndex, modelIndex)) { + uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1); + uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex; + _flw_drawIndices[targetIndex] = instanceIndex; + } +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag index 68113130a..62e3b4d4a 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.frag @@ -4,9 +4,15 @@ flat in uvec2 
_flw_packedMaterial; +flat in uint _flw_instanceID; + +layout(location = 1) out uint _flw_out_instanceID; + void main() { _flw_unpackUint2x16(_flw_packedMaterial.x, _flw_uberFogIndex, _flw_uberCutoutIndex); _flw_unpackMaterialProperties(_flw_packedMaterial.y, flw_material); - _flw_main(); + _flw_main(_flw_instanceID); + + _flw_out_instanceID = _flw_instanceID; } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert index 85e2c8f2b..500a509a2 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/main.vert @@ -5,8 +5,8 @@ #include "flywheel:internal/indirect/light.glsl" #include "flywheel:internal/indirect/matrices.glsl" -layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict readonly buffer TargetBuffer { - uint _flw_instanceIndices[]; +layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict readonly buffer DrawIndexBuffer { + uint _flw_drawIndices[]; }; layout(std430, binding = _FLW_DRAW_BUFFER_BINDING) restrict readonly buffer DrawBuffer { @@ -21,8 +21,14 @@ layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict buffer MatrixBuffe uniform uint _flw_baseDraw; +// We read the visibility buffer for all culling groups into a single shared buffer. +// This offset is used to know where each culling group starts. 
+uniform uint _flw_visibilityWriteOffsetInstances = 0; + flat out uvec2 _flw_packedMaterial; +flat out uint _flw_instanceID; + #if __VERSION__ < 460 #define flw_baseInstance gl_BaseInstanceARB #define flw_drawId gl_DrawIDARB @@ -46,10 +52,13 @@ void main() { #ifdef _FLW_CRUMBLING uint instanceIndex = flw_baseInstance; #else - uint instanceIndex = _flw_instanceIndices[flw_baseInstance + gl_InstanceID]; + uint instanceIndex = _flw_drawIndices[flw_baseInstance + gl_InstanceID]; #endif FlwInstance instance = _flw_unpackInstance(instanceIndex); - _flw_main(instance, instanceIndex); + _flw_main(instance); + + // Add 1 because a 0 instance id means null. + _flw_instanceID = _flw_visibilityWriteOffsetInstances + instanceIndex + 1; } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl new file mode 100644 index 000000000..b4d506f16 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl @@ -0,0 +1,64 @@ +#include "flywheel:internal/indirect/buffer_bindings.glsl" + +layout(local_size_x = 256) in; + +layout(binding = 0) uniform usampler2D visBuffer; + +layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buffer LastFrameVisibilityBuffer { + uint _flw_lastFrameVisibility[]; +}; + +uint extractBits(uint e, uint offset, uint count) { + return (e >> offset) & ((1u << count) - 1u); +} + +uint insertBits(uint e, uint newbits, uint offset, uint count) { + uint countMask = ((1u << count) - 1u); + // zero out the bits we're going to replace first + return (e & ~(countMask << offset)) | ((newbits & countMask) << offset); +} + +uvec2 remap_for_wave_reduction(uint a) { + return uvec2( + insertBits(extractBits(a, 2u, 3u), a, 0u, 1u), + insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u) + ); +} + +void emit(uint instanceID) { + // Null instance id. 
+ if (instanceID == 0) { + return; + } + + // Adjust for null to find the actual index. + instanceID = instanceID - 1; + + uint index = instanceID >> 5; + + uint mask = 1u << (instanceID & 31u); + + atomicOr(_flw_lastFrameVisibility[index], mask); +} + +void main() { + uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); + uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); + uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); + + ivec2 tex = ivec2(gl_WorkGroupID.xy) * 32 + ivec2(x, y) * 2; + + uint instanceID01 = texelFetchOffset(visBuffer, tex, 0, ivec2(0, 1)).r; + uint instanceID11 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 1)).r; + uint instanceID10 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 0)).r; + uint instanceID00 = texelFetch(visBuffer, tex, 0).r; + + if (instanceID00 == instanceID01 && instanceID01 == instanceID10 && instanceID10 == instanceID11) { + emit(instanceID00); + } else { + emit(instanceID00); + emit(instanceID01); + emit(instanceID10); + emit(instanceID11); + } +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/zero_models.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/zero_models.glsl new file mode 100644 index 000000000..c8f7a0b9c --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/zero_models.glsl @@ -0,0 +1,18 @@ +#include "flywheel:internal/indirect/buffer_bindings.glsl" +#include "flywheel:internal/indirect/model_descriptor.glsl" + +layout(local_size_x = _FLW_SUBGROUP_SIZE) in; + +layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict writeonly buffer ModelBuffer { + ModelDescriptor models[]; +}; + +void main() { + uint modelIndex = gl_GlobalInvocationID.x; + + if (modelIndex >= models.length()) { + return; + } + + models[modelIndex].instanceCount = 0; +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.frag 
b/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.frag index 8a927ec04..3a54e62ac 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.frag +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.frag @@ -3,9 +3,11 @@ uniform uvec2 _flw_packedMaterial; +flat in uint _flw_instanceID; + void main() { _flw_unpackUint2x16(_flw_packedMaterial.x, _flw_uberFogIndex, _flw_uberCutoutIndex); _flw_unpackMaterialProperties(_flw_packedMaterial.y, flw_material); - _flw_main(); + _flw_main(_flw_instanceID); } diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.vert b/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.vert index 5e32d028e..2f7dd05b5 100644 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.vert +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/instancing/main.vert @@ -10,6 +10,8 @@ uniform mat4 _flw_modelMatrixUniform; uniform mat3 _flw_normalMatrixUniform; #endif +flat out uint _flw_instanceID; + void main() { _flw_unpackMaterialProperties(_flw_packedMaterial.y, flw_material); @@ -20,5 +22,7 @@ void main() { _flw_normalMatrix = _flw_normalMatrixUniform; #endif - _flw_main(instance, uint(gl_InstanceID)); + _flw_main(instance); + + _flw_instanceID = uint(gl_InstanceID); /* explicit cast: gl_InstanceID is an int and GLSL before 4.00 has no implicit int -> uint conversion */ }