Directly visible

- Don't actually need a framebuffer attachment for visibility
- Instead, process everything in pass 2 and write out the visibility
  bitset directly
- Persist visibility bits between frames for use in pass 1
- No need for indirect dispatch!
- Also saves some ssbo bindings
- Do frustum culling in both passes
This commit is contained in:
Jozufozu 2024-10-18 13:45:37 -07:00
parent afdab92010
commit b90c43ba7e
13 changed files with 109 additions and 513 deletions

View file

@ -109,6 +109,8 @@ public class IndirectPrograms extends AtomicReferenceCounted {
.nameMapper(instanceType -> name + "/" + ResourceUtil.toDebugFileNameNoExtension(instanceType.cullShader()))
.requireExtensions(COMPUTE_EXTENSIONS)
.define("_FLW_SUBGROUP_SIZE", GlCompat.SUBGROUP_SIZE)
.enableExtension("GL_KHR_shader_subgroup_basic")
.enableExtension("GL_KHR_shader_subgroup_ballot")
.withResource(CULL_SHADER_API_IMPL)
.withComponent(InstanceStructComponent::new)
.withResource(InstanceType::cullShader)

View file

@ -1,18 +1,16 @@
package dev.engine_room.flywheel.backend.engine.indirect;
public final class BufferBindings {
public static final int PASS_TWO_DISPATCH = 0;
public static final int PASS_TWO_INSTANCE_INDEX = 1;
public static final int PAGE_FRAME_DESCRIPTOR = 2;
public static final int INSTANCE = 3;
public static final int DRAW_INSTANCE_INDEX = 4;
public static final int MODEL = 5;
public static final int DRAW = 6;
public static final int LAST_FRAME_VISIBILITY = 0;
public static final int PAGE_FRAME_DESCRIPTOR = 1;
public static final int INSTANCE = 2;
public static final int DRAW_INSTANCE_INDEX = 3;
public static final int MODEL = 4;
public static final int DRAW = 5;
public static final int LIGHT_LUT = 7;
public static final int LIGHT_SECTION = 8;
public static final int MATRICES = 9;
public static final int LAST_FRAME_VISIBILITY = 10;
public static final int LIGHT_LUT = 6;
public static final int LIGHT_SECTION = 7;
public static final int MATRICES = 8;
private BufferBindings() {
}

View file

@ -7,11 +7,12 @@ import org.lwjgl.system.MemoryUtil;
import org.lwjgl.system.Pointer;
import dev.engine_room.flywheel.backend.gl.buffer.GlBufferType;
import dev.engine_room.flywheel.lib.math.MoreMath;
import dev.engine_room.flywheel.lib.memory.MemoryBlock;
public class IndirectBuffers {
// Number of vbos created.
public static final int BUFFER_COUNT = 7;
public static final int BUFFER_COUNT = 6;
public static final long INT_SIZE = Integer.BYTES;
public static final long PTR_SIZE = Pointer.POINTER_SIZE;
@ -30,8 +31,7 @@ public class IndirectBuffers {
private static final long BUFFERS_SIZE_BYTES = SIZE_OFFSET + BUFFER_COUNT * PTR_SIZE;
// Offsets to the vbos
private static final long PASS_TWO_DISPATCH_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * INT_SIZE;
private static final long PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * INT_SIZE;
private static final long LAST_FRAME_VISIBILITY_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * INT_SIZE;
private static final long PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * INT_SIZE;
private static final long INSTANCE_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.INSTANCE * INT_SIZE;
private static final long DRAW_INSTANCE_INDEX_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * INT_SIZE;
@ -39,8 +39,7 @@ public class IndirectBuffers {
private static final long DRAW_HANDLE_OFFSET = HANDLE_OFFSET + BufferBindings.DRAW * INT_SIZE;
// Offsets to the sizes
private static final long PASS_TWO_DISPATCH_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_DISPATCH * PTR_SIZE;
private static final long PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PASS_TWO_INSTANCE_INDEX * PTR_SIZE;
private static final long LAST_FRAME_VISIBILITY_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.LAST_FRAME_VISIBILITY * PTR_SIZE;
private static final long PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.PAGE_FRAME_DESCRIPTOR * PTR_SIZE;
private static final long INSTANCE_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.INSTANCE * PTR_SIZE;
private static final long DRAW_INSTANCE_INDEX_SIZE_OFFSET = SIZE_OFFSET + BufferBindings.DRAW_INSTANCE_INDEX * PTR_SIZE;
@ -66,8 +65,7 @@ public class IndirectBuffers {
*/
private final MemoryBlock multiBindBlock;
public final ResizableStorageBuffer passTwoDispatch;
public final ResizableStorageArray passTwoInstanceIndex;
public final ResizableStorageArray lastFrameVisibility;
public final ObjectStorage objectStorage;
public final ResizableStorageArray drawInstanceIndex;
public final ResizableStorageArray model;
@ -76,34 +74,29 @@ public class IndirectBuffers {
IndirectBuffers(long instanceStride) {
this.multiBindBlock = MemoryBlock.calloc(BUFFERS_SIZE_BYTES, 1);
passTwoDispatch = new ResizableStorageBuffer();
passTwoInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR);
lastFrameVisibility = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR);
objectStorage = new ObjectStorage(instanceStride);
drawInstanceIndex = new ResizableStorageArray(INT_SIZE, INSTANCE_GROWTH_FACTOR);
model = new ResizableStorageArray(MODEL_STRIDE, MODEL_GROWTH_FACTOR);
draw = new ResizableStorageArray(DRAW_COMMAND_STRIDE, DRAW_GROWTH_FACTOR);
passTwoDispatch.ensureCapacity(INT_SIZE * 4);
}
void updateCounts(int instanceCount, int modelCount, int drawCount) {
drawInstanceIndex.ensureCapacity(instanceCount);
passTwoInstanceIndex.ensureCapacity(instanceCount);
lastFrameVisibility.ensureCapacity(MoreMath.ceilingDiv(instanceCount, 32));
model.ensureCapacity(modelCount);
draw.ensureCapacity(drawCount);
final long ptr = multiBindBlock.ptr();
MemoryUtil.memPutInt(ptr + PASS_TWO_DISPATCH_HANDLE_OFFSET, passTwoDispatch.handle());
MemoryUtil.memPutInt(ptr + PASS_TWO_INSTANCE_INDEX_HANDLE_OFFSET, passTwoInstanceIndex.handle());
MemoryUtil.memPutInt(ptr + LAST_FRAME_VISIBILITY_HANDLE_OFFSET, lastFrameVisibility.handle());
MemoryUtil.memPutInt(ptr + PAGE_FRAME_DESCRIPTOR_HANDLE_OFFSET, objectStorage.frameDescriptorBuffer.handle());
MemoryUtil.memPutInt(ptr + INSTANCE_HANDLE_OFFSET, objectStorage.objectBuffer.handle());
MemoryUtil.memPutInt(ptr + DRAW_INSTANCE_INDEX_HANDLE_OFFSET, drawInstanceIndex.handle());
MemoryUtil.memPutInt(ptr + MODEL_HANDLE_OFFSET, model.handle());
MemoryUtil.memPutInt(ptr + DRAW_HANDLE_OFFSET, draw.handle());
MemoryUtil.memPutAddress(ptr + PASS_TWO_DISPATCH_SIZE_OFFSET, passTwoDispatch.capacity());
MemoryUtil.memPutAddress(ptr + PASS_TWO_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount);
MemoryUtil.memPutAddress(ptr + LAST_FRAME_VISIBILITY_SIZE_OFFSET, INT_SIZE * MoreMath.ceilingDiv(instanceCount, 32));
MemoryUtil.memPutAddress(ptr + PAGE_FRAME_DESCRIPTOR_SIZE_OFFSET, objectStorage.frameDescriptorBuffer.capacity());
MemoryUtil.memPutAddress(ptr + INSTANCE_SIZE_OFFSET, objectStorage.objectBuffer.capacity());
MemoryUtil.memPutAddress(ptr + DRAW_INSTANCE_INDEX_SIZE_OFFSET, INT_SIZE * instanceCount);
@ -112,24 +105,23 @@ public class IndirectBuffers {
}
public void bindForCullPassOne() {
multiBind(0, 6);
multiBind(0, 5);
}
public void bindForCullPassTwo() {
multiBind(0, 6);
GlBufferType.DISPATCH_INDIRECT_BUFFER.bind(passTwoDispatch.handle());
multiBind(0, 5);
}
public void bindForApply() {
multiBind(5, 2);
multiBind(4, 2);
}
public void bindForModelReset() {
multiBind(5, 1);
multiBind(4, 1);
}
public void bindForDraw() {
multiBind(3, 4);
multiBind(2, 4);
GlBufferType.DRAW_INDIRECT_BUFFER.bind(draw.handle());
}
@ -155,7 +147,6 @@ public class IndirectBuffers {
drawInstanceIndex.delete();
model.delete();
draw.delete();
passTwoDispatch.delete();
passTwoInstanceIndex.delete();
lastFrameVisibility.delete();
}
}

View file

@ -6,7 +6,6 @@ import static org.lwjgl.opengl.GL30.glUniform1ui;
import static org.lwjgl.opengl.GL42.GL_COMMAND_BARRIER_BIT;
import static org.lwjgl.opengl.GL42.glMemoryBarrier;
import static org.lwjgl.opengl.GL43.glDispatchCompute;
import static org.lwjgl.opengl.GL43.glDispatchComputeIndirect;
import java.util.ArrayList;
import java.util.Comparator;
@ -14,8 +13,6 @@ import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import org.lwjgl.opengl.GL46;
import dev.engine_room.flywheel.api.instance.Instance;
import dev.engine_room.flywheel.api.instance.InstanceType;
import dev.engine_room.flywheel.api.material.Material;
@ -29,7 +26,6 @@ import dev.engine_room.flywheel.backend.engine.MeshPool;
import dev.engine_room.flywheel.backend.engine.uniform.Uniforms;
import dev.engine_room.flywheel.backend.gl.GlCompat;
import dev.engine_room.flywheel.backend.gl.shader.GlProgram;
import dev.engine_room.flywheel.lib.material.LightShaders;
import dev.engine_room.flywheel.lib.math.MoreMath;
public class IndirectCullingGroup<I extends Instance> {
@ -54,12 +50,6 @@ public class IndirectCullingGroup<I extends Instance> {
private boolean needsDrawSort;
public int instanceCountThisFrame;
private int pagesLastFrame = 0;
private int pagesThisFrame = 0;
private int visibilityWriteOffsetPages = 0;
private int visibilityReadOffsetPages = 0;
IndirectCullingGroup(InstanceType<I> instanceType, IndirectPrograms programs) {
this.instanceType = instanceType;
instanceStride = MoreMath.align4(instanceType.layout()
@ -95,17 +85,6 @@ public class IndirectCullingGroup<I extends Instance> {
}
}
public int flipVisibilityOffsets(int visibilityWriteOffsetPages) {
this.visibilityReadOffsetPages = this.visibilityWriteOffsetPages;
this.visibilityWriteOffsetPages = visibilityWriteOffsetPages;
pagesLastFrame = pagesThisFrame;
pagesThisFrame = buffers.objectStorage.capacity();
return pagesThisFrame;
}
public void upload(StagingBuffer stagingBuffer) {
if (nothingToDo()) {
return;
@ -127,8 +106,6 @@ public class IndirectCullingGroup<I extends Instance> {
}
uploadDraws(stagingBuffer);
GL46.nglClearNamedBufferData(buffers.passTwoDispatch.handle(), GL46.GL_R32UI, GL46.GL_RED, GL46.GL_UNSIGNED_INT, 0);
}
public void dispatchCull() {
@ -139,8 +116,6 @@ public class IndirectCullingGroup<I extends Instance> {
Uniforms.bindAll();
earlyCull.bind();
earlyCull.setUInt("_flw_visibilityReadOffsetPages", visibilityReadOffsetPages);
buffers.bindForCullPassOne();
glDispatchCompute(buffers.objectStorage.capacity(), 1, 1);
}
@ -154,7 +129,7 @@ public class IndirectCullingGroup<I extends Instance> {
lateCull.bind();
buffers.bindForCullPassTwo();
glDispatchComputeIndirect(0);
glDispatchCompute(buffers.objectStorage.capacity(), 1, 1);
}
public void dispatchApply() {
@ -257,8 +232,6 @@ public class IndirectCullingGroup<I extends Instance> {
// Don't need to do this unless the program changes.
drawProgram.bind();
baseDrawUniformLoc = drawProgram.getUniformLocation("_flw_baseDraw");
drawProgram.setUInt("_flw_visibilityWriteOffsetInstances", visibilityWriteOffsetPages << ObjectStorage.LOG_2_PAGE_SIZE);
}
glUniform1ui(baseDrawUniformLoc, multiDraw.start);

View file

@ -53,11 +53,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
private final MatrixBuffer matrixBuffer;
private final DepthPyramid depthPyramid;
private final VisibilityBuffer visibilityBuffer;
private int totalPagesLastFrame = 0;
private boolean needsBarrier = false;
public IndirectDrawManager(IndirectPrograms programs) {
this.programs = programs;
@ -73,7 +68,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
matrixBuffer = new MatrixBuffer();
depthPyramid = new DepthPyramid(programs);
visibilityBuffer = new VisibilityBuffer(programs);
}
@Override
@ -112,8 +106,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT);
visibilityBuffer.bind();
for (var group1 : cullingGroups.values()) {
group1.dispatchCull();
}
@ -124,8 +116,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
visibilityBuffer.attach();
submitDraws();
depthPyramid.generate();
@ -156,8 +146,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
MaterialRenderState.reset();
TextureBinder.resetLightAndOverlay();
visibilityBuffer.detach();
}
private void dispatchApply() {
@ -185,20 +173,12 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
group.flushInstancers();
}
visibilityBuffer.read(totalPagesLastFrame);
visibilityBuffer.clear();
cullingGroups.values()
.removeIf(IndirectCullingGroup::checkEmptyAndDelete);
instancers.values()
.removeIf(instancer -> instancer.instanceCount() == 0);
int totalPagesThisFrame = 0;
for (var group : cullingGroups.values()) {
totalPagesThisFrame += group.flipVisibilityOffsets(totalPagesThisFrame);
}
meshPool.flush();
stagingBuffer.reclaim();
@ -215,10 +195,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
// We could probably save some driver calls here when there are
// actually zero instances, but that feels like a very rare case
needsBarrier = true;
totalPagesLastFrame = totalPagesThisFrame;
}
@Override
@ -238,8 +214,6 @@ public class IndirectDrawManager extends DrawManager<IndirectInstancer<?>> {
programs.release();
depthPyramid.delete();
visibilityBuffer.delete();
}
public void renderCrumbling(List<Engine.CrumblingBlock> crumblingBlocks) {

View file

@ -1,132 +0,0 @@
package dev.engine_room.flywheel.backend.engine.indirect;
import org.lwjgl.opengl.GL30;
import org.lwjgl.opengl.GL32;
import org.lwjgl.opengl.GL46;
import org.lwjgl.opengl.GL46C;
import com.mojang.blaze3d.platform.GlStateManager;
import dev.engine_room.flywheel.backend.FlwBackend;
import dev.engine_room.flywheel.backend.compile.IndirectPrograms;
import dev.engine_room.flywheel.backend.gl.GlTextureUnit;
import dev.engine_room.flywheel.lib.math.MoreMath;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntSet;
import net.minecraft.client.Minecraft;
/**
 * Owns the GPU resources used to find out which instances ended up on screen:
 * an R32UI texture that is attached to the main render target as a second color
 * attachment, and a storage buffer ({@code lastFrameVisibility}) that a compute
 * program fills from that texture.
 *
 * <p>All methods issue OpenGL calls directly.
 * NOTE(review): presumably must be called on the render thread with a current GL context - confirm.
 */
public class VisibilityBuffer {
// Divisor used in read() to turn the texture width/height into compute work group counts.
private static final int READ_GROUP_SIZE = 32;
// Color attachment point on the main framebuffer that the visibility texture is attached to.
private static final int ATTACHMENT = GL30.GL_COLOR_ATTACHMENT1;
private final IndirectPrograms programs;
// Storage buffer bound at BufferBindings.LAST_FRAME_VISIBILITY; cleared and refilled by read().
private final ResizableStorageArray lastFrameVisibility;
// GL name of the visibility texture, or -1 while no texture exists.
private int textureId = -1;
// Dimensions the current texture was allocated with; -1 until setupTexture() has run once.
private int lastWidth = -1;
private int lastHeight = -1;
// Framebuffer ids the current texture has already been attached to (reset whenever the texture is recreated).
private final IntSet attached = new IntArraySet();
/**
 * Creates the buffer wrapper. No texture is allocated until the first {@link #attach()}.
 *
 * @param programs source of the read-visibility compute program used by {@link #read(int)}
 */
public VisibilityBuffer(IndirectPrograms programs) {
this.programs = programs;
// Element stride is one int; NOTE(review): the 1.25f argument is presumably the growth factor - confirm.
lastFrameVisibility = new ResizableStorageArray(Integer.BYTES, 1.25f);
}
/**
 * Clears the visibility storage buffer and, if a visibility texture exists,
 * dispatches the read-visibility compute program over it.
 *
 * @param pageCount capacity requested for the storage buffer; when 0 the method does nothing
 */
public void read(int pageCount) {
if (pageCount == 0) {
return;
}
lastFrameVisibility.ensureCapacity(pageCount);
// The data pointer argument is 0 (null), so no client-side clear value is supplied.
GL46.nglClearNamedBufferData(lastFrameVisibility.handle(), GL46.GL_R32UI, GL46.GL_RED_INTEGER, GL46.GL_UNSIGNED_INT, 0);
// No texture has been set up yet, so there is nothing to read back.
if (lastWidth == -1 || lastHeight == -1) {
return;
}
programs.getReadVisibilityProgram()
.bind();
bind();
// The texture is bound on texture unit 0 for the compute program.
GlTextureUnit.T0.makeActive();
GlStateManager._bindTexture(textureId);
// One work group per READ_GROUP_SIZE x READ_GROUP_SIZE tile, rounded up in both dimensions.
GL46.glDispatchCompute(MoreMath.ceilingDiv(lastWidth, READ_GROUP_SIZE), MoreMath.ceilingDiv(lastHeight, READ_GROUP_SIZE), 1);
}
/**
 * Binds the visibility storage buffer to the {@code LAST_FRAME_VISIBILITY} SSBO binding point.
 */
public void bind() {
GL46.glBindBufferBase(GL46.GL_SHADER_STORAGE_BUFFER, BufferBindings.LAST_FRAME_VISIBILITY, lastFrameVisibility.handle());
}
/**
 * Makes sure the visibility texture matches the main render target's size, attaches it to that
 * framebuffer if it has not been attached since the last (re)creation, and enables drawing to it.
 */
public void attach() {
var mainRenderTarget = Minecraft.getInstance()
.getMainRenderTarget();
setupTexture(mainRenderTarget.width, mainRenderTarget.height);
// IntSet#add returns true only the first time an id is inserted, so the attachment happens once per framebuffer.
if (attached.add(mainRenderTarget.frameBufferId)) {
GL46.glNamedFramebufferTexture(mainRenderTarget.frameBufferId, ATTACHMENT, textureId, 0);
try {
mainRenderTarget.checkStatus();
} catch (Exception e) {
// A failed status check is logged and otherwise ignored; rendering continues.
FlwBackend.LOGGER.error("Error attaching visbuffer", e);
}
}
// Enable writes to both the regular color attachment and the visibility attachment.
GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0, ATTACHMENT });
}
/**
 * Restores the main render target's draw buffers to color attachment 0 only.
 * The texture itself stays attached.
 */
public void detach() {
var mainRenderTarget = Minecraft.getInstance()
.getMainRenderTarget();
// Disable writes to the visibility attachment.
GL46.glNamedFramebufferDrawBuffers(mainRenderTarget.frameBufferId, new int[] { GL30.GL_COLOR_ATTACHMENT0 });
}
/**
 * Releases the texture (if any) and the storage buffer.
 */
public void delete() {
deleteTexture();
lastFrameVisibility.delete();
}
/**
 * Deletes the visibility texture if one exists and resets {@code textureId} to -1.
 */
private void deleteTexture() {
if (textureId != -1) {
GL32.glDeleteTextures(textureId);
textureId = -1;
}
}
/**
 * Clears mip level 0 of the visibility texture. Does nothing until a texture has been set up.
 */
public void clear() {
if (lastWidth == -1 || lastHeight == -1) {
return;
}
// The data pointer argument is 0 (null), so no client-side clear value is supplied.
GL46C.nglClearTexImage(textureId, 0, GL32.GL_RED_INTEGER, GL32.GL_UNSIGNED_INT, 0);
}
/**
 * (Re)creates the visibility texture when the requested size differs from the current one.
 *
 * @param width  texture width in texels
 * @param height texture height in texels
 */
private void setupTexture(int width, int height) {
if (lastWidth == width && lastHeight == height) {
return;
}
// Need to rebind to all fbos because an attachment becomes incomplete when it's resized
attached.clear();
lastWidth = width;
lastHeight = height;
deleteTexture();
// Single-level R32UI texture with nearest filtering and clamp-to-edge wrapping.
textureId = GL46.glCreateTextures(GL46.GL_TEXTURE_2D);
GL46.glTextureStorage2D(textureId, 1, GL32.GL_R32UI, width, height);
GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MIN_FILTER, GL32.GL_NEAREST);
GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_MAG_FILTER, GL32.GL_NEAREST);
GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_S, GL32.GL_CLAMP_TO_EDGE);
GL46.glTextureParameteri(textureId, GL32.GL_TEXTURE_WRAP_T, GL32.GL_CLAMP_TO_EDGE);
}
}

View file

@ -1,17 +1,17 @@
// FIXME: minimum required SSBO bindings in OpenGL is 8, but we use 9.
// A few of these could be combined.
// Per culling group
#define _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING 0 // cull1
#define _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING 1 // cull1, cull2
#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 2 // cull1, cull2
#define _FLW_INSTANCE_BUFFER_BINDING 3 // cull1, cull2, draw
#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 4 // cull1, cull2, draw
#define _FLW_MODEL_BUFFER_BINDING 5 // cull1, cull2, apply
#define _FLW_DRAW_BUFFER_BINDING 6 // apply, draw
#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 0// cull1, cull2
#define _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING 1// cull1, cull2
#define _FLW_INSTANCE_BUFFER_BINDING 2// cull1, cull2, draw
#define _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING 3// cull1, cull2, draw
#define _FLW_MODEL_BUFFER_BINDING 4// cull1, cull2, apply
#define _FLW_DRAW_BUFFER_BINDING 5// apply, draw
// Global to the engine
#define _FLW_LIGHT_LUT_BUFFER_BINDING 7
#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 8
#define _FLW_LIGHT_LUT_BUFFER_BINDING 6
#define _FLW_LIGHT_SECTIONS_BUFFER_BINDING 7
#define _FLW_MATRIX_BUFFER_BINDING 9
#define _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING 10
#define _FLW_MATRIX_BUFFER_BINDING 8

View file

@ -1,152 +0,0 @@
#include "flywheel:internal/indirect/buffer_bindings.glsl"
#include "flywheel:internal/indirect/model_descriptor.glsl"
#include "flywheel:internal/uniforms/uniforms.glsl"
#include "flywheel:util/matrix.glsl"
#include "flywheel:internal/indirect/matrices.glsl"
layout(local_size_x = 32) in;
layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer TargetBuffer {
uint _flw_instanceIndices[];
};
// High 6 bits for the number of instances in the page.
const uint _FLW_PAGE_COUNT_OFFSET = 26u;
// Bottom 26 bits for the model index.
const uint _FLW_MODEL_INDEX_MASK = 0x3FFFFFF;
layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict readonly buffer PageFrameDescriptorBuffer {
uint _flw_pageFrameDescriptors[];
};
layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer {
ModelDescriptor _flw_models[];
};
layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict readonly buffer MatrixBuffer {
Matrices _flw_matrices[];
};
layout(binding = 0) uniform sampler2D _flw_depthPyramid;
// Vectorized sphere/frustum intersection that relies on the frustum planes being packed ahead of time.
// Each lane evaluates X * center.x + Y * center.y + Z * center.z + W with three chained fmas:
// one vec4 chain for the lanes stored in the xy* fields and one vec2 chain for the z* fields,
// so only 6 fmas and some boolean ops are needed.
// The sphere passes when every lane is >= -radius.
// See also:
// flywheel:uniform/flywheel.glsl
// dev.engine_room.flywheel.lib.math.MatrixMath.writePackedFrustumPlanes
// org.joml.FrustumIntersection.testSphere
bool _flw_testSphere(vec3 center, float radius) {
    // Innermost term first so the fma nesting matches a single nested expression.
    vec4 xyDistance = fma(flw_frustumPlanes.xyZ, center.zzzz, flw_frustumPlanes.xyW);
    xyDistance = fma(flw_frustumPlanes.xyY, center.yyyy, xyDistance);
    xyDistance = fma(flw_frustumPlanes.xyX, center.xxxx, xyDistance);

    vec2 zDistance = fma(flw_frustumPlanes.zZ, center.zz, flw_frustumPlanes.zW);
    zDistance = fma(flw_frustumPlanes.zY, center.yy, zDistance);
    zDistance = fma(flw_frustumPlanes.zX, center.xx, zDistance);

    bvec4 xyInside = greaterThanEqual(xyDistance, -radius.xxxx);
    bvec2 zInside = greaterThanEqual(zDistance, -radius.xx);

    return all(xyInside) && all(zInside);
}
// Projects a view-space sphere to a screen-space bounding rectangle.
//   c      sphere center (NOTE(review): assumed to be in view space with the camera looking down -z - confirm)
//   r      sphere radius
//   znear  near plane distance
//   P00    x scale applied to the projected x extents (NOTE(review): presumably projection[0][0] - confirm)
//   P11    y scale applied to the projected y extents (NOTE(review): presumably projection[1][1] - confirm)
//   aabb   out: rectangle in uv space; only written when the function returns true
// Returns false, leaving aabb untouched, when the sphere reaches past the near plane.
// NOTE(review): the extent formulas look like the Mara & McGuire perspective sphere bounds - confirm.
bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 aabb) {
// Closest point on the sphere is between the camera and the near plane, don't even attempt to cull.
if (c.z + r > -znear) {
return false;
}
// Terms shared by the x and y extent computations below.
vec3 cr = c * r;
float czr2 = c.z * c.z - r * r;
// Extents along x.
float vx = sqrt(c.x * c.x + czr2);
float minx = (vx * c.x - cr.z) / (vx * c.z + cr.x);
float maxx = (vx * c.x + cr.z) / (vx * c.z - cr.x);
// Extents along y, same form as x.
float vy = sqrt(c.y * c.y + czr2);
float miny = (vy * c.y - cr.z) / (vy * c.z + cr.y);
float maxy = (vy * c.y + cr.z) / (vy * c.z - cr.y);
aabb = vec4(minx * P00, miny * P11, maxx * P00, maxy * P11);
// The swizzle swaps the two y components while every component is scaled by -0.5 and offset by 0.5.
aabb = aabb.xwzy * vec4(-0.5f, -0.5f, -0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space
return true;
}
// Decides whether one instance should be drawn.
//   instanceIndex  index passed to _flw_unpackInstance to fetch the instance data
//   modelIndex     index into _flw_models providing the bounding sphere and matrix index
// The model's bounding sphere is transformed by the instance (and by the model's pose matrix when
// matrixIndex is non-zero), tested against the frustum, and finally tested against the depth
// pyramid. When projectSphere() declines to produce a rectangle, the frustum result is returned
// without an occlusion test.
bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
    uint matrixIndex = _flw_models[modelIndex].matrixIndex;
    BoundingSphere sphere = _flw_models[modelIndex].boundingSphere;

    vec3 center;
    float radius;
    _flw_unpackBoundingSphere(sphere, center, radius);

    FlwInstance instance = _flw_unpackInstance(instanceIndex);

    flw_transformBoundingSphere(instance, center, radius);

    // Matrix index 0 is treated as "no extra transform".
    if (matrixIndex > 0) {
        transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius);
    }

    bool isVisible = _flw_testSphere(center, radius);

    if (isVisible) {
        transformBoundingSphere(flw_view, center, radius);

        vec4 aabb;
        if (projectSphere(center, radius, _flw_cullData.znear, _flw_cullData.P00, _flw_cullData.P11, aabb))
        {
            // Size of the projected rectangle measured in pyramid texels.
            float width = (aabb.z - aabb.x) * _flw_cullData.pyramidWidth;
            float height = (aabb.w - aabb.y) * _flw_cullData.pyramidHeight;

            // Pick the mip level from the larger side of the rectangle.
            int level = clamp(int(ceil(log2(max(width, height)))), 0, _flw_cullData.pyramidLevels);

            ivec2 levelSize = textureSize(_flw_depthPyramid, level);

            ivec4 levelSizePair = ivec4(levelSize, levelSize);

            ivec4 bounds = ivec4(aabb * vec4(levelSizePair));

            // Clamp to the texture bounds.
            // texelFetch does not go through a sampler, so coordinates are not wrapped or clamped for us.
            // Valid texel coordinates are [0, size - 1]; clamping to `size` itself would still allow
            // a fetch one texel past the edge.
            bounds = clamp(bounds, ivec4(0), levelSizePair - ivec4(1));

            // Sample the four corners of the rectangle.
            float depth01 = texelFetch(_flw_depthPyramid, bounds.xw, level).r;
            float depth11 = texelFetch(_flw_depthPyramid, bounds.zw, level).r;
            float depth10 = texelFetch(_flw_depthPyramid, bounds.zy, level).r;
            float depth00 = texelFetch(_flw_depthPyramid, bounds.xy, level).r;

            float depth;
            if (_flw_cullData.useMin == 0) {
                depth = max(max(depth00, depth01), max(depth10, depth11));
            } else {
                depth = min(min(depth00, depth01), min(depth10, depth11));
            }

            // Depth of the sphere's point nearest to the camera, compared against the pyramid sample.
            float depthSphere = 1. + _flw_cullData.znear / (center.z + radius);

            isVisible = isVisible && depthSphere <= depth;
        }
    }

    return isVisible;
}
// One work group per page, one invocation per instance slot in that page.
// Each page descriptor packs the live instance count in its high bits and the model index in its
// low bits. Visible instances are appended to their model's range of the instance index buffer.
void main() {
    uint pageIndex = gl_WorkGroupID.x;
    if (pageIndex >= _flw_pageFrameDescriptors.length()) {
        return;
    }

    uint descriptor = _flw_pageFrameDescriptors[pageIndex];

    // Slots at or beyond the page's instance count hold no instance.
    uint liveInstances = descriptor >> _FLW_PAGE_COUNT_OFFSET;
    if (gl_LocalInvocationID.x >= liveInstances) {
        return;
    }

    uint instanceIndex = gl_GlobalInvocationID.x;
    uint modelIndex = descriptor & _FLW_MODEL_INDEX_MASK;

    if (!_flw_isVisible(instanceIndex, modelIndex)) {
        return;
    }

    // Reserve the next slot for this model, then record the instance in it.
    uint slotInModel = atomicAdd(_flw_models[modelIndex].instanceCount, 1);
    uint destination = _flw_models[modelIndex].baseInstance + slotInModel;
    _flw_instanceIndices[destination] = instanceIndex;
}

View file

@ -7,16 +7,6 @@
layout(local_size_x = 32) in;
uniform uint _flw_visibilityReadOffsetPages;
layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer {
_FlwLateCullDispatch _flw_lateCullDispatch;
};
layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer PassTwoIndexBuffer {
uint _flw_passTwoIndices[];
};
layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer {
uint _flw_drawIndices[];
};
@ -31,7 +21,7 @@ layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict rea
};
layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict readonly buffer LastFrameVisibilityBuffer {
uint _flw_lastFrameVisibility[];
uint _flw_visibility[];
};
layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer {
@ -74,6 +64,10 @@ bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
return _flw_testSphere(center, radius);
}
// TODO: There's an opportunity here to write out the transformed bounding spheres to a buffer and use them in pass 2,
// instead of pulling the entire instance again. It would save a lot of memory bandwidth and matrix multiplications in
// pass 2, but it would also be a good bit of writes in pass 1. It's worth investigating, but it would be nice to have
// nsight trace working to be more sure.
void main() {
uint pageIndex = gl_WorkGroupID.x;
@ -97,26 +91,12 @@ void main() {
return;
}
uint pageVisibility = _flw_lastFrameVisibility[_flw_visibilityReadOffsetPages + pageIndex];
uint pageVisibility = _flw_visibility[pageIndex];
if ((pageVisibility & (1u << gl_LocalInvocationID.x)) != 0u) {
// This instance was visible last frame, it should be rendered early.
uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1);
uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex;
_flw_drawIndices[targetIndex] = instanceIndex;
} else {
// Try again later to see if it's been disoccluded.
uint targetIndex = atomicAdd(_flw_lateCullDispatch.threadCount, 1);
_flw_passTwoIndices[targetIndex] = instanceIndex;
if (targetIndex % 32u == 0u) {
// This thread wrote an index that will be at the start of a new workgroup later
atomicAdd(_flw_lateCullDispatch.x, 1);
if (targetIndex == 0) {
_flw_lateCullDispatch.y = 1;
_flw_lateCullDispatch.z = 1;
}
}
}
}

View file

@ -7,15 +7,6 @@
layout(local_size_x = 32) in;
layout(std430, binding = _FLW_PASS_TWO_DISPATCH_BUFFER_BINDING) restrict buffer PassTwoDispatchBuffer {
_FlwLateCullDispatch _flw_lateCullDispatch;
};
layout(std430, binding = _FLW_PASS_TWO_INSTANCE_INDEX_BUFFER_BINDING) restrict readonly buffer PassTwoIndexBuffer {
uint _flw_passTwoIndices[];
};
layout(std430, binding = _FLW_DRAW_INSTANCE_INDEX_BUFFER_BINDING) restrict writeonly buffer DrawIndexBuffer {
uint _flw_drawIndices[];
};
@ -31,6 +22,10 @@ layout(std430, binding = _FLW_PAGE_FRAME_DESCRIPTOR_BUFFER_BINDING) restrict rea
uint _flw_pageFrameDescriptors[];
};
layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buffer LastFrameVisibilityBuffer {
uint _flw_visibility[];
};
layout(std430, binding = _FLW_MODEL_BUFFER_BINDING) restrict buffer ModelBuffer {
ModelDescriptor _flw_models[];
};
@ -64,22 +59,20 @@ bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4
return true;
}
bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
uint matrixIndex = _flw_models[modelIndex].matrixIndex;
BoundingSphere sphere = _flw_models[modelIndex].boundingSphere;
// Disgustingly vectorized sphere frustum intersection taking advantage of ahead of time packing.
// Only uses 6 fmas and some boolean ops.
// See also:
// flywheel:uniform/flywheel.glsl
// dev.engine_room.flywheel.lib.math.MatrixMath.writePackedFrustumPlanes
// org.joml.FrustumIntersection.testSphere
bool _flw_testSphere(vec3 center, float radius) {
bvec4 xyInside = greaterThanEqual(fma(flw_frustumPlanes.xyX, center.xxxx, fma(flw_frustumPlanes.xyY, center.yyyy, fma(flw_frustumPlanes.xyZ, center.zzzz, flw_frustumPlanes.xyW))), -radius.xxxx);
bvec2 zInside = greaterThanEqual(fma(flw_frustumPlanes.zX, center.xx, fma(flw_frustumPlanes.zY, center.yy, fma(flw_frustumPlanes.zZ, center.zz, flw_frustumPlanes.zW))), -radius.xx);
vec3 center;
float radius;
_flw_unpackBoundingSphere(sphere, center, radius);
FlwInstance instance = _flw_unpackInstance(instanceIndex);
flw_transformBoundingSphere(instance, center, radius);
if (matrixIndex > 0) {
transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius);
}
return all(xyInside) && all(zInside);
}
bool _flw_hizTest(vec3 center, float radius) {
transformBoundingSphere(flw_view, center, radius);
vec4 aabb;
@ -116,22 +109,63 @@ bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
return true;
}
// Full visibility test for one instance: frustum test, then the hi-z occlusion test.
//   instanceIndex  index passed to _flw_unpackInstance to fetch the instance data
//   modelIndex     index into _flw_models providing the bounding sphere and matrix index
// The occlusion test only runs for spheres that pass the frustum test.
bool _flw_isVisible(uint instanceIndex, uint modelIndex) {
    uint matrixIndex = _flw_models[modelIndex].matrixIndex;
    BoundingSphere sphere = _flw_models[modelIndex].boundingSphere;

    vec3 center;
    float radius;
    _flw_unpackBoundingSphere(sphere, center, radius);

    FlwInstance instance = _flw_unpackInstance(instanceIndex);
    flw_transformBoundingSphere(instance, center, radius);

    // Matrix index 0 is treated as "no extra transform".
    if (matrixIndex > 0) {
        transformBoundingSphere(_flw_matrices[matrixIndex].pose, center, radius);
    }

    if (!_flw_testSphere(center, radius)) {
        return false;
    }

    return _flw_hizTest(center, radius);
}
void main() {
if (gl_GlobalInvocationID.x >= _flw_lateCullDispatch.threadCount) {
uint pageIndex = gl_WorkGroupID.x;
if (pageIndex >= _flw_pageFrameDescriptors.length()) {
return;
}
uint instanceIndex = _flw_passTwoIndices[gl_GlobalInvocationID.x];
uint pageIndex = instanceIndex >> 5;
uint packedModelIndexAndCount = _flw_pageFrameDescriptors[pageIndex];
uint pageInstanceCount = packedModelIndexAndCount >> _FLW_PAGE_COUNT_OFFSET;
if (gl_LocalInvocationID.x >= pageInstanceCount) {
return;
}
uint instanceIndex = gl_GlobalInvocationID.x;
uint modelIndex = packedModelIndexAndCount & _FLW_MODEL_INDEX_MASK;
if (_flw_isVisible(instanceIndex, modelIndex)) {
bool visible = _flw_isVisible(instanceIndex, modelIndex);
bool visibleLastFrame = (_flw_visibility[pageIndex] & (1u << gl_LocalInvocationID.x)) != 0u;
if (visible && !visibleLastFrame) {
uint localIndex = atomicAdd(_flw_models[modelIndex].instanceCount, 1);
uint targetIndex = _flw_models[modelIndex].baseInstance + localIndex;
_flw_drawIndices[targetIndex] = instanceIndex;
}
// FIXME: need a non-subgroup path
uvec4 visibility = subgroupBallot(visible);
if (subgroupElect()) {
_flw_visibility[pageIndex] = visibility.x;
}
}

View file

@ -6,13 +6,9 @@ flat in uvec2 _flw_packedMaterial;
flat in uint _flw_instanceID;
layout(location = 1) out uint _flw_out_instanceID;
void main() {
_flw_unpackUint2x16(_flw_packedMaterial.x, _flw_uberFogIndex, _flw_uberCutoutIndex);
_flw_unpackMaterialProperties(_flw_packedMaterial.y, flw_material);
_flw_main(_flw_instanceID);
_flw_out_instanceID = _flw_instanceID;
}

View file

@ -21,10 +21,6 @@ layout(std430, binding = _FLW_MATRIX_BUFFER_BINDING) restrict buffer MatrixBuffe
uniform uint _flw_baseDraw;
// We read the visibility buffer for all culling groups into a single shared buffer.
// This offset is used to know where each culling group starts.
uniform uint _flw_visibilityWriteOffsetInstances = 0;
flat out uvec2 _flw_packedMaterial;
flat out uint _flw_instanceID;
@ -60,5 +56,5 @@ void main() {
_flw_main(instance);
// Add 1 because a 0 instance id means null.
_flw_instanceID = _flw_visibilityWriteOffsetInstances + instanceIndex + 1;
_flw_instanceID = instanceIndex + 1;
}

View file

@ -1,64 +0,0 @@
#include "flywheel:internal/indirect/buffer_bindings.glsl"
layout(local_size_x = 256) in;
layout(binding = 0) uniform usampler2D visBuffer;
layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buffer LastFrameVisibilityBuffer {
uint _flw_lastFrameVisibility[];
};
// Returns the `count`-bit field of `e` that starts at bit `offset`, moved down to bit 0.
uint extractBits(uint e, uint offset, uint count) {
    uint lowMask = (1u << count) - 1u;
    uint shifted = e >> offset;
    return shifted & lowMask;
}
// Returns `e` with the `count`-bit field at bit `offset` replaced by the low `count` bits of `newbits`.
uint insertBits(uint e, uint newbits, uint offset, uint count) {
    uint countMask = (1u << count) - 1u;
    // Clear the destination field, then drop the new value into it.
    uint cleared = e & ~(countMask << offset);
    uint inserted = (newbits & countMask) << offset;
    return cleared | inserted;
}
// Rearranges the bits of `a` into a 2D coordinate.
// Writing a's bits as a5 a4 a3 a2 a1 a0, the result is:
//   x = (a4 a3 a0)   - bits 2..4 of a with the lowest bit replaced by a0
//   y = (a5 a2 a1)   - bits 3..5 of a with the two lowest bits replaced by a2 a1
// For a in [0, 63] both components therefore fall in [0, 7].
// NOTE(review): looks like the usual 8x8 lane remap that keeps 2x2 quads in neighbouring lanes - confirm.
uvec2 remap_for_wave_reduction(uint a) {
return uvec2(
insertBits(extractBits(a, 2u, 3u), a, 0u, 1u),
insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u)
);
}
// Marks one instance as visible in the bitset: word = index / 32, bit = index % 32.
// An id of 0 is the null id and is ignored; real ids are stored with a +1 bias.
void emit(uint instanceID) {
    if (instanceID != 0) {
        // Undo the +1 bias to recover the actual index.
        uint instanceIndex = instanceID - 1;

        uint word = instanceIndex >> 5;
        uint bit = 1u << (instanceIndex & 31u);

        atomicOr(_flw_lastFrameVisibility[word], bit);
    }
}
// Each invocation reads a 2x2 quad of the visibility texture and flags every instance id found in it.
// The local invocation index is split into a 6-bit part that is remapped to an 8x8 cell and two
// further bits that select one of four 8x8 blocks, so a work group covers 16x16 quads (32x32 texels).
void main() {
    uint lane = gl_LocalInvocationIndex;

    uvec2 sub_xy = remap_for_wave_reduction(lane % 64u);
    uint x = sub_xy.x + 8u * ((lane >> 6u) % 2u);
    uint y = sub_xy.y + 8u * (lane >> 7u);

    // Top-left texel of this invocation's quad.
    ivec2 tex = ivec2(gl_WorkGroupID.xy) * 32 + ivec2(x, y) * 2;

    uint instanceID00 = texelFetch(visBuffer, tex, 0).r;
    uint instanceID10 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 0)).r;
    uint instanceID01 = texelFetchOffset(visBuffer, tex, 0, ivec2(0, 1)).r;
    uint instanceID11 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 1)).r;

    bool sameAcrossQuad = instanceID00 == instanceID01 && instanceID01 == instanceID10 && instanceID10 == instanceID11;

    // When the whole quad holds one id a single emit is enough.
    emit(instanceID00);
    if (!sameAcrossQuad) {
        emit(instanceID01);
        emit(instanceID10);
        emit(instanceID11);
    }
}