Spherical instances in a vacuum

- Write out instance bounding spheres in pass one
- Read them back in pass two so we don't have to read in the entire
  instance twice
- Cull pass 2 no longer needs to be parameterized by instance type, so
  fewer program binds are needed
- Fix page indexing logic
- Fix visibility sizing logic
This commit is contained in:
Jozufozu 2024-11-03 16:32:29 -08:00
parent 1823a9fa24
commit a6c5f93fb4
8 changed files with 82 additions and 85 deletions

View file

@ -23,6 +23,7 @@ import dev.engine_room.flywheel.backend.glsl.SourceComponent;
import dev.engine_room.flywheel.backend.util.AtomicReferenceCounted;
import dev.engine_room.flywheel.lib.util.ResourceUtil;
import net.minecraft.resources.ResourceLocation;
import net.minecraft.util.Unit;
public class IndirectPrograms extends AtomicReferenceCounted {
private static final ResourceLocation CULL_SHADER_API_IMPL = Flywheel.rl("internal/indirect/cull_api_impl.glsl");
@ -37,6 +38,7 @@ public class IndirectPrograms extends AtomicReferenceCounted {
private static final Compile<InstanceType<?>> CULL = new Compile<>();
private static final Compile<ResourceLocation> UTIL = new Compile<>();
private static final Compile<Unit> UNIT = new Compile<>();
private static final List<String> EXTENSIONS = getExtensions(GlCompat.MAX_GLSL_VERSION);
private static final List<String> COMPUTE_EXTENSIONS = getComputeExtensions(GlCompat.MAX_GLSL_VERSION);
@ -46,10 +48,10 @@ public class IndirectPrograms extends AtomicReferenceCounted {
private final PipelineCompiler pipeline;
private final CompilationHarness<InstanceType<?>> culling;
private final CompilationHarness<InstanceType<?>> cullPassTwo;
private final CompilationHarness<Unit> cullPassTwo;
private final CompilationHarness<ResourceLocation> utils;
private IndirectPrograms(PipelineCompiler pipeline, CompilationHarness<InstanceType<?>> culling, CompilationHarness<InstanceType<?>> cullPassTwo, CompilationHarness<ResourceLocation> utils) {
private IndirectPrograms(PipelineCompiler pipeline, CompilationHarness<InstanceType<?>> culling, CompilationHarness<Unit> cullPassTwo, CompilationHarness<ResourceLocation> utils) {
this.pipeline = pipeline;
this.culling = culling;
this.cullPassTwo = cullPassTwo;
@ -91,7 +93,7 @@ public class IndirectPrograms extends AtomicReferenceCounted {
var pipelineCompiler = PipelineCompiler.create(sources, Pipelines.INDIRECT, vertexComponents, fragmentComponents, EXTENSIONS);
var pass1Compiler = createCullingCompiler(sources, CULL_SHADER_MAIN, "early_cull");
var pass2Compiler = createCullingCompiler(sources, PASS2_SHADER_MAIN, "late_cull");
var pass2Compiler = createPassTwoCompiler(sources, PASS2_SHADER_MAIN, "late_cull");
var utilCompiler = createUtilCompiler(sources);
IndirectPrograms newInstance = new IndirectPrograms(pipelineCompiler, pass1Compiler, pass2Compiler, utilCompiler);
@ -119,6 +121,19 @@ public class IndirectPrograms extends AtomicReferenceCounted {
.harness(name, sources);
}
/**
 * Builds the compilation harness for the second ("late_cull") culling pass.
 *
 * <p>The harness is keyed by {@link Unit} rather than by instance type. The shader
 * name mapper ignores its key and always reports {@code name}.
 *
 * @param sources the shader sources handed to the harness
 * @param main    resource location of the compute shader attached to the program
 * @param name    name reported by the shader name mapper and passed to the harness
 * @return the harness used to obtain the pass two culling program
 */
private static CompilationHarness<Unit> createPassTwoCompiler(ShaderSources sources, ResourceLocation main, String name) {
	// Request the program builder before the shader builder, matching the order of the original call chain.
	var programBuilder = UNIT.program();

	// Compute shader compiled at the maximum supported GLSL version, with subgroup operations enabled.
	var computeShader = UNIT.shader(GlCompat.MAX_GLSL_VERSION, ShaderType.COMPUTE)
			.nameMapper(unit -> name)
			.requireExtensions(COMPUTE_EXTENSIONS)
			.define("_FLW_SUBGROUP_SIZE", GlCompat.SUBGROUP_SIZE)
			.enableExtension("GL_KHR_shader_subgroup_basic")
			.enableExtension("GL_KHR_shader_subgroup_ballot")
			.withResource(main);

	// After linking, set the uniform block bindings on the resulting program.
	return programBuilder.link(computeShader)
			.postLink((unit, linked) -> Uniforms.setUniformBlockBindings(linked))
			.harness(name, sources);
}
/**
* A compiler for utility shaders, directly compiles the shader at the resource location specified by the parameter.
*/
@ -163,8 +178,8 @@ public class IndirectPrograms extends AtomicReferenceCounted {
return culling.get(instanceType);
}
public GlProgram getCullPassTwoProgram(InstanceType<?> instanceType) {
return cullPassTwo.get(instanceType);
public GlProgram getCullPassTwoProgram() {
return cullPassTwo.get(Unit.INSTANCE);
}
public GlProgram getApplyProgram() {

View file

@ -1,16 +1,17 @@
package dev.engine_room.flywheel.backend.engine.indirect;
public final class BufferBindings {
public static final int LAST_FRAME_VISIBILITY = 0;
public static final int PAGE_FRAME_DESCRIPTOR = 1;
public static final int INSTANCE = 2;
public static final int DRAW_INSTANCE_INDEX = 3;
public static final int MODEL = 4;
public static final int DRAW = 5;
public static final int BOUNDING_SPHERES = 0;
public static final int LAST_FRAME_VISIBILITY = 1;
public static final int PAGE_FRAME_DESCRIPTOR = 2;
public static final int INSTANCE = 3;
public static final int DRAW_INSTANCE_INDEX = 4;
public static final int MODEL = 5;
public static final int DRAW = 6;
public static final int LIGHT_LUT = 6;
public static final int LIGHT_SECTION = 7;
public static final int MATRICES = 8;
public static final int LIGHT_LUT = 7;
public static final int LIGHT_SECTION = 8;
public static final int MATRICES = 9;
private BufferBindings() {
}

View file

@ -7,12 +7,11 @@ import org.lwjgl.system.MemoryUtil;
import org.lwjgl.system.Pointer;
import dev.engine_room.flywheel.backend.gl.buffer.GlBufferType;
import dev.engine_room.flywheel.lib.math.MoreMath;
import dev.engine_room.flywheel.lib.memory.MemoryBlock;
public class IndirectBuffers {
// Number of vbos created.
public static final int BUFFER_COUNT = 6;
public static final int BUFFER_COUNT = 7;
public static final long INT_SIZE = Integer.BYTES;
public static final long PTR_SIZE = Pointer.POINTER_SIZE;
@ -31,6 +30,7 @@ public class IndirectBuffers {
private static final long BUFFERS_SIZE_BYTES = SIZE_OFFSET + BUFFER_COUNT * PTR_SIZE;
// Offsets to the vbos
private static final long BOUNDING_SPHERES_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.BOUNDING_SPHERES * INT_SIZE;
private static final long LAST_FRAME_VISIBILITY_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * INT_SIZE;
private static final long PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * INT_SIZE;
private static final long INSTANCE_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.INSTANCE * INT_SIZE;
@ -39,6 +39,7 @@ public class IndirectBuffers {
private static final long DRAW_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW * INT_SIZE;
// Offsets to the sizes
private static final long BOUNDING_SPHERES_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.BOUNDING_SPHERES * PTR_SIZE;
private static final long LAST_FRAME_VISIBILITY_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * PTR_SIZE;
private static final long PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * PTR_SIZE;
private static final long INSTANCE_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.INSTANCE * PTR_SIZE;
@ -65,6 +66,7 @@ public class IndirectBuffers {
*/
private final MemoryBlock multiBindBlock;
public final ResizableStorageArray boundingSpheres;
public final ResizableStorageArray lastFrameVisibility;
public final ObjectStorage objectStorage;
public final ResizableStorageArray drawInstanceIndex;
@ -74,6 +76,7 @@ public class IndirectBuffers {
IndirectBuffers(long instanceStride) {
this.multiBindBlock = MemoryBlock.calloc(BUFFERS_SIZE_BYTES, 1);
boundingSpheres = new ResizableStorageArray(16);
lastFrameVisibility = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR);
objectStorage = new ObjectStorage(instanceStride);
drawInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR);
@ -83,12 +86,14 @@ public class IndirectBuffers {
void updateCounts(int instanceCount, int modelCount, int drawCount) {
drawInstanceIndex.ensureCapacity(instanceCount);
lastFrameVisibility.ensureCapacity(MoreMath.ceilingDiv(instanceCount, 32));
lastFrameVisibility.ensureCapacity(objectStorage.capacity());
boundingSpheres.ensureCapacity(objectStorage.capacity() * 32L);
model.ensureCapacity(modelCount);
draw.ensureCapacity(drawCount);
final long ptr = multiBindBlock.ptr();
MemoryUtil.memPutInt(ptr + BOUNDING_SPHERES_HANDLE_OFFSET, boundingSpheres.handle());
MemoryUtil.memPutInt(ptr + LAST_FRAME_VISIBILITY_HANDLE_OFFSET, lastFrameVisibility.handle());
MemoryUtil.memPutInt(ptr + PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET, objectStorage.frameDescriptorBuffer.handle());
MemoryUtil.memPutInt(ptr + INSTANCE_HANDLE_OFFSET, objectStorage.objectBuffer.handle());
@ -96,7 +101,8 @@ public class IndirectBuffers {
MemoryUtil.memPutInt(ptr + MODEL_HANDLE_OFFSET, model.handle());
MemoryUtil.memPutInt(ptr + DRAW_HANDLE_OFFSET, draw.handle());
MemoryUtil.memPutAddress(ptr + LAST_FRAME_VISIBILITY_SIZE_OFFSET, INT_SIZE * MoreMath.ceilingDiv(instanceCount, 32));
MemoryUtil.memPutAddress(ptr + BOUNDING_SPHERES_SIZE_OFFSET, 16L * objectStorage.capacity() * 32);
MemoryUtil.memPutAddress(ptr + LAST_FRAME_VISIBILITY_SIZE_OFFSET, INT_SIZE * objectStorage.capacity());
MemoryUtil.memPutAddress(ptr + PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET, objectStorage.frameDescriptorBuffer.capacity());
MemoryUtil.memPutAddress(ptr + INSTANCE_SIZE_OFFSET, objectStorage.objectBuffer.capacity());
MemoryUtil.memPutAddress(ptr + DRAW_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount);
@ -105,23 +111,23 @@ public class IndirectBuffers {
}
public void bindForCullPassOne() {
multiBind(0, 5);
multiBind(0, 6);
}
public void bindForCullPassTwo() {
multiBind(0, 5);
multiBind(0, 6);
}
public void bindForApply() {
multiBind(4, 2);
multiBind(5, 2);
}
public void bindForModelReset() {
multiBind(4, 1);
multiBind(5, 1);
}
public void bindForDraw() {
multiBind(2, 4);
multiBind(3, 4);
GlBufferType.DRAW_INDIRECT_BUFFER.bind(draw.handle());
}
@ -129,7 +135,7 @@ public class IndirectBuffers {
* Bind all buffers except the draw command buffer.
*/
public void bindForCrumbling() {
multiBind(3, 3);
multiBind(4, 3);
}
private void multiBind(int base, int count) {

View file

@ -23,7 +23,6 @@ import dev.engine_room.flywheel.backend.compile.IndirectPrograms;
import dev.engine_room.flywheel.backend.engine.InstancerKey;
import dev.engine_room.flywheel.backend.engine.MaterialRenderState;
import dev.engine_room.flywheel.backend.engine.MeshPool;
import dev.engine_room.flywheel.backend.engine.uniform.Uniforms;
import dev.engine_room.flywheel.backend.gl.GlCompat;
import dev.engine_room.flywheel.backend.gl.shader.GlProgram;
import dev.engine_room.flywheel.lib.math.MoreMath;
@ -44,7 +43,6 @@ public class IndirectCullingGroup<I extends Instance> {
private final IndirectPrograms programs;
private final GlProgram earlyCull;
private final GlProgram lateCull;
private boolean needsDrawBarrier;
private boolean needsDrawSort;
@ -58,7 +56,6 @@ public class IndirectCullingGroup<I extends Instance> {
this.programs = programs;
earlyCull = programs.getCullingProgram(instanceType);
lateCull = programs.getCullPassTwoProgram(instanceType);
}
public void flushInstancers() {
@ -113,7 +110,6 @@ public class IndirectCullingGroup<I extends Instance> {
return;
}
Uniforms.bindAll();
earlyCull.bind();
buffers.bindForCullPassOne();
@ -125,9 +121,6 @@ public class IndirectCullingGroup<I extends Instance> {
return;
}
Uniforms.bindAll();
lateCull.bind();
buffers.bindForCullPassTwo();
glDispatchCompute(buffers.objectStorage.capacity(), 1, 1);
}

View file

@ -132,6 +132,9 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
GlTextureUnit.T0.makeActive();
GlStateManager._bindTexture(depthPyramid.pyramidTextureId);
programs.getCullPassTwoProgram()
.bind();
for (var group1 : cullingGroups.values()) {
group1.dispatchCullPassTwo();
}

View file

@ -2,16 +2,17 @@
// A few of these could be combined.
// Per culling group
#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 0// cull1, cull2
#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 1// cull1, cull2
#define _FLW_INSTANCE_BUFFER_BINDING 2// cull1, cull2, draw
#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 3// cull1, cull2, draw
#define _FLW_MODEL_BUFFER_BINDING 4// cull1, cull2, apply
#define _FLW_DRAW_BUFFER_BINDING 5// apply, draw
#define _FLW_BOUNDING_SPHERE_BINDING 0// cull1, cull2
#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 1// cull1, cull2
#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 2// cull1, cull2
#define _FLW_INSTANCE_BUFFER_BINDING 3// cull1, cull2, draw
#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 4// cull1, cull2, draw
#define _FLW_MODEL_BUFFER_BINDING 5// cull1, cull2, apply
#define _FLW_DRAW_BUFFER_BINDING 6// apply, draw
// Global to the engine
#define _FLW_LIGHT_LUT_BUFFER_BINDING 6
#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 7
#define _FLW_LIGHT_LUT_BUFFER_BINDING 7
#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 8
#define _FLW_MATRIX_BUFFER_BINDING 8
#define _FLW_MATRIX_BUFFER_BINDING 9

View file

@ -7,15 +7,14 @@
layout(local_size_x = 32) in;
layout(std430, binding = _FLW_BOUNDING_SPHERE_BINDING) restrict writeonly buffer BoundingSphereBuffer {
vec4 _flw_boundingSpheres[];
};
layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer {
uint _flw_drawIndices[];
};
// High 6 bits for the number of instances in the page.
const uint _FLW_PAGE_COUNT_OFFSET = 26u;
// Bottom 26 bits for the model index.
const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF;
layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer {
uint _flw_pageFrameDescriptors[];
};
@ -61,39 +60,36 @@ bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius);
}
_flw_boundingSpheres[instanceIndex] = vec4(center, radius);
return _flw_testSphere(center, radius);
}
// TODO: There's an opportunity here to write out the transformed bounding spheres to a buffer and use them in pass 2,
// instead of pulling the entire instance again. It would save a lot of memory bandwidth and matrix multiplications in
// pass 2, but it would also be a good bit of writes in pass 1. It's worth investigating, but it would be nice to have
// nsight trace working to be more sure.
void main() {
uint pageIndex = gl_WorkGroupID.x;
uint pageIndex = gl_WorkGroupID.x << 1u;
if (pageIndex >= _flw_pageFrameDescriptors.length()) {
return;
}
uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex];
uint modelIndex = _flw_pageFrameDescriptors[pageIndex];
uint pageInstanceCount = packedModelIndexAndCount >> _FLW_PAGE_COUNT_OFFSET;
uint pageValidity = _flw_pageFrameDescriptors[pageIndex + 1];
if (gl_LocalInvocationID.x >= pageInstanceCount) {
if (((1u << gl_LocalInvocationID.x) & pageValidity) == 0) {
return;
}
uint instanceIndex = gl_GlobalInvocationID.x;
uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK;
if (!_flw_isVisible(instanceIndex, modelIndex)) {
return;
}
uint pageVisibility = _flw_visibility[pageIndex];
uint pageVisibility = _flw_visibility[gl_WorkGroupID.x];
bool visibleLastFrame = (_flw_visibility[gl_WorkGroupID.x] & (1u << gl_LocalInvocationID.x)) != 0u;
if ((pageVisibility & (1u << gl_LocalInvocationID.x)) != 0u) {
if (visibleLastFrame) {
// This instance was visible last frame, it should be rendered early.
uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1);
uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex;

View file

@ -2,7 +2,6 @@
#include "flywheel:internal/indirect/model_descriptor.glsl"
#include "flywheel:internal/uniforms/uniforms.glsl"
#include "flywheel:util/matrix.glsl"
#include "flywheel:internal/indirect/matrices.glsl"
#include "flywheel:internal/indirect/dispatch.glsl"
layout(local_size_x = 32) in;
@ -11,12 +10,9 @@ layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict write
uint _flw_drawIndices[];
};
// High 6 bits for the number of instances in the page.
const uint _FLW_PAGE_COUNT_OFFSET = 26u;
// Bottom 26 bits for the model index.
const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF;
layout(std430, binding = _FLW_BOUNDING_SPHERE_BINDING) restrict readonly buffer BoundingSphereBuffer {
vec4 _flw_boundingSpheres[];
};
layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer {
uint _flw_pageFrameDescriptors[];
@ -30,10 +26,6 @@ layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer
ModelDescriptor _flw_models[];
};
layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict readonly buffer MatrixBuffer {
Matrices _flw_matrices[];
};
layout(binding = 0) uniform sampler2D _flw_depthPyramid;
bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 aabb) {
@ -113,21 +105,11 @@ bool _flw_hizTest(vec3 center, float radius) {
return true;
}
bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
uint matrixIndex = _flw_models[modelIndex].matrixIndex;
BoundingSphere sphere = _flw_models[modelIndex].boundingSphere;
bool _flw_isVisible(uint instanceIndex) {
vec4 boundingSphere = _flw_boundingSpheres[instanceIndex];
vec3 center;
float radius;
_flw_unpackBoundingSphere(sphere, center, radius);
FlwInstance instance = _flw_unpackInstance(instanceIndex);
flw_transformBoundingSphere(instance, center, radius);
if (matrixIndex > 0) {
transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius);
}
vec3 center = boundingSphere.xyz;
float radius = boundingSphere.w;
bool visible = _flw_testSphere(center, radius);
@ -155,8 +137,8 @@ void main() {
uint instanceIndex = gl_GlobalInvocationID.x;
bool visible = _flw_isVisible(instanceIndex, modelIndex);
bool visibleLastFrame = (_flw_visibility[pageIndex] & (1u << gl_LocalInvocationID.x)) != 0u;
bool visible = _flw_isVisible(instanceIndex);
bool visibleLastFrame = (_flw_visibility[gl_WorkGroupID.x] & (1u << gl_LocalInvocationID.x)) != 0u;
if (visible && !visibleLastFrame) {
uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1);
@ -168,6 +150,6 @@ void main() {
uvec4 visibility = subgroupBallot(visible);
if (subgroupElect()) {
_flw_visibility[pageIndex] = visibility.x;
_flw_visibility[gl_WorkGroupID.x] = visibility.x;
}
}