From ba3d84b5ae201e5ec7e4590ff91e32d4cef591b4 Mon Sep 17 00:00:00 2001
From: Jozufozu
Date: Fri, 13 Sep 2024 22:39:31 -0700
Subject: [PATCH] Seeing blue

- Optimize read visibility by having each invocation read a 2x2 area and
  coalescing atomicOrs when all 4 texels are equal
- Also use the fancy remap function for better texture cache locality
---
 .../engine/indirect/VisibilityBuffer.java     |  2 +-
 .../internal/indirect/read_visibility.glsl    | 43 +++++++++++++++++--
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java
index 8cfeaecc9..9b266b65a 100644
--- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java
+++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/VisibilityBuffer.java
@@ -16,7 +16,7 @@ import it.unimi.dsi.fastutil.ints.IntSet;
 import net.minecraft.client.Minecraft;
 
 public class VisibilityBuffer {
-	private static final int READ_GROUP_SIZE = 16;
+	private static final int READ_GROUP_SIZE = 32;
 	private static final int ATTACHMENT = GL30.GL_COLOR_ATTACHMENT1;
 
 	private final GlProgram readVisibilityProgram;
diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl
index 52d4c655f..b4d506f16 100644
--- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl
+++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/read_visibility.glsl
@@ -1,6 +1,6 @@
 #include "flywheel:internal/indirect/buffer_bindings.glsl"
 
-layout(local_size_x = 16, local_size_y = 16) in;
+layout(local_size_x = 256) in;
 
 layout(binding = 0) uniform usampler2D visBuffer;
 
@@ -8,9 +8,24 @@ layout(std430, binding = _FLW_LAST_FRAME_VISIBILITY_BUFFER_BINDING) restrict buf
     uint _flw_lastFrameVisibility[];
 };
 
-void main() {
-    uint instanceID = texelFetch(visBuffer, ivec2(gl_GlobalInvocationID.xy), 0).r;
+uint extractBits(uint e, uint offset, uint count) {
+    return (e >> offset) & ((1u << count) - 1u);
+}
 
+uint insertBits(uint e, uint newbits, uint offset, uint count) {
+    uint countMask = ((1u << count) - 1u);
+    // zero out the bits we're going to replace first
+    return (e & ~(countMask << offset)) | ((newbits & countMask) << offset);
+}
+
+uvec2 remap_for_wave_reduction(uint a) {
+    return uvec2(
+        insertBits(extractBits(a, 2u, 3u), a, 0u, 1u),
+        insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u)
+    );
+}
+
+void emit(uint instanceID) {
     // Null instance id.
     if (instanceID == 0) {
         return;
@@ -25,3 +40,25 @@ void main() {
 
     atomicOr(_flw_lastFrameVisibility[index], mask);
 }
+
+void main() {
+    uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u);
+    uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u);
+    uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u);
+
+    ivec2 tex = ivec2(gl_WorkGroupID.xy) * 32 + ivec2(x, y) * 2;
+
+    uint instanceID01 = texelFetchOffset(visBuffer, tex, 0, ivec2(0, 1)).r;
+    uint instanceID11 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 1)).r;
+    uint instanceID10 = texelFetchOffset(visBuffer, tex, 0, ivec2(1, 0)).r;
+    uint instanceID00 = texelFetch(visBuffer, tex, 0).r;
+
+    if (instanceID00 == instanceID01 && instanceID01 == instanceID10 && instanceID10 == instanceID11) {
+        emit(instanceID00);
+    } else {
+        emit(instanceID00);
+        emit(instanceID01);
+        emit(instanceID10);
+        emit(instanceID11);
+    }
+}