From 861009ed11f73627e6d77f3f465b737e18046e58 Mon Sep 17 00:00:00 2001 From: Jozufozu Date: Thu, 12 Sep 2024 21:32:13 -0700 Subject: [PATCH] Rapid descent - Implement single (but actually 2) pass downsampling --- .../backend/compile/IndirectPrograms.java | 12 +- .../backend/engine/indirect/DepthPyramid.java | 54 ++++++- .../engine/indirect/IndirectDrawManager.java | 5 +- .../internal/indirect/downsample.glsl | 31 ++++ .../internal/indirect/downsample_first.glsl | 150 ++++++++++++++++++ .../internal/indirect/downsample_second.glsl | 134 ++++++++++++++++ 6 files changed, 382 insertions(+), 4 deletions(-) create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java index 47e381eff..d352d3c03 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java @@ -34,7 +34,9 @@ public class IndirectPrograms extends AtomicReferenceCounted { private static final ResourceLocation DEPTH_REDUCE_SHADER_MAIN = Flywheel.rl("internal/indirect/depth_reduce.glsl"); private static final ResourceLocation READ_VISIBILITY_SHADER_MAIN = Flywheel.rl("internal/indirect/read_visibility.glsl"); private static final ResourceLocation ZERO_MODELS_SHADER_MAIN = Flywheel.rl("internal/indirect/zero_models.glsl"); - public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, DEPTH_REDUCE_SHADER_MAIN, READ_VISIBILITY_SHADER_MAIN, ZERO_MODELS_SHADER_MAIN); + private static final ResourceLocation DOWNSAMPLE_FIRST = Flywheel.rl("internal/indirect/downsample_first.glsl"); + private static final ResourceLocation DOWNSAMPLE_SECOND = Flywheel.rl("internal/indirect/downsample_second.glsl"); + public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, DEPTH_REDUCE_SHADER_MAIN, READ_VISIBILITY_SHADER_MAIN, ZERO_MODELS_SHADER_MAIN, DOWNSAMPLE_FIRST, DOWNSAMPLE_SECOND); private static final Compile> CULL = new Compile<>(); private static final Compile UTIL = new Compile<>(); @@ -205,6 +207,14 @@ public class IndirectPrograms extends AtomicReferenceCounted { return utils.get(DEPTH_REDUCE_SHADER_MAIN); } + public GlProgram getDownsampleFirstProgram() { + return utils.get(DOWNSAMPLE_FIRST); + } + + public GlProgram getDownsampleSecondProgram() { + return utils.get(DOWNSAMPLE_SECOND); + } + public GlProgram getReadVisibilityProgram() { return utils.get(READ_VISIBILITY_SHADER_MAIN); } diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java index 789891c57..fdfbadb86 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java @@ -14,14 +14,18 @@ public class DepthPyramid { private static final int GROUP_SIZE = 16; private final GlProgram depthReduceProgram; + private final GlProgram downsampleFirstProgram; + private final GlProgram downsampleSecondProgram; public int pyramidTextureId = -1; private int lastWidth = -1; private int lastHeight = -1; - public DepthPyramid(GlProgram depthReduceProgram) { + public DepthPyramid(GlProgram depthReduceProgram, GlProgram downsampleFirstProgram, GlProgram downsampleSecondProgram) { this.depthReduceProgram = depthReduceProgram; + this.downsampleFirstProgram = downsampleFirstProgram; + this.downsampleSecondProgram = downsampleSecondProgram; } public void generate() { @@ -61,6 +65,54 @@ public class DepthPyramid { } } + public void generateSPD() { + var mainRenderTarget = Minecraft.getInstance() + .getMainRenderTarget(); + + int width = mip0Size(mainRenderTarget.width); + int height = mip0Size(mainRenderTarget.height); + + int mipLevels = getImageMipLevels(width, height); + + createPyramidMips(mipLevels, width, height); + + int depthBufferId = mainRenderTarget.getDepthTextureId(); + + GL46.glMemoryBarrier(GL46.GL_FRAMEBUFFER_BARRIER_BIT); + + GlTextureUnit.T0.makeActive(); + GlStateManager._bindTexture(depthBufferId); + + downsampleFirstProgram.bind(); + downsampleFirstProgram.setUInt("max_mip_level", mipLevels); + + for (int i = 0; i < Math.min(6, mipLevels); i++) { + GL46.glBindImageTexture(i + 1, pyramidTextureId, i, false, 0, GL32.GL_WRITE_ONLY, GL32.GL_R32F); + } + + GL46.glDispatchCompute(MoreMath.ceilingDiv(width << 1, 64), MoreMath.ceilingDiv(height << 1, 64), 1); + + if (mipLevels < 7) { + GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT); + + return; + } + + GL46.glMemoryBarrier(GL46.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + + downsampleSecondProgram.bind(); + downsampleSecondProgram.setUInt("max_mip_level", mipLevels); + + GL46.glBindImageTexture(0, pyramidTextureId, 5, false, 0, GL32.GL_READ_ONLY, GL32.GL_R32F); + for (int i = 6; i < Math.min(12, mipLevels); i++) { + GL46.glBindImageTexture(i - 5, pyramidTextureId, i, false, 0, GL32.GL_WRITE_ONLY, GL32.GL_R32F); + } + + GL46.glDispatchCompute(1, 1, 1); + + GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT); + } + public void delete() { if (pyramidTextureId != -1) { GL32.glDeleteTextures(pyramidTextureId); diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java index 799495a09..de452a1e5 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java @@ -4,6 +4,7 @@ import static org.lwjgl.opengl.GL11.GL_TRIANGLES; import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT; import static org.lwjgl.opengl.GL30.glBindBufferRange; import static org.lwjgl.opengl.GL40.glDrawElementsIndirect; +import static org.lwjgl.opengl.GL42.GL_BUFFER_UPDATE_BARRIER_BIT; import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BARRIER_BIT; import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BUFFER; @@ -67,7 +68,7 @@ public class IndirectDrawManager extends DrawManager> { lightBuffers = new LightBuffers(); matrixBuffer = new MatrixBuffer(); - depthPyramid = new DepthPyramid(programs.getDepthReduceProgram()); + depthPyramid = new DepthPyramid(programs.getDepthReduceProgram(), programs.getDownsampleFirstProgram(), programs.getDownsampleSecondProgram()); visibilityBuffer = new VisibilityBuffer(programs.getReadVisibilityProgram()); } @@ -124,7 +125,7 @@ public class IndirectDrawManager extends DrawManager> { submitDraws(); - depthPyramid.generate(); + depthPyramid.generateSPD(); programs.getZeroModelProgram() .bind(); diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl new file mode 100644 index 000000000..c423431e6 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl @@ -0,0 +1,31 @@ +layout(local_size_x = 256) in; + +uniform uint max_mip_level; + +/// Generates a hierarchical depth buffer. +/// Based on FidelityFX SPD v2.1 https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/d7531ae47d8b36a5d4025663e731a47a38be882f/sdk/include/FidelityFX/gpu/spd/ffx_spd.h#L528 +/// Based on Bevy's more readable implementation https://github.com/JMS55/bevy/blob/ca2c8e63b9562f88c8cd7e1d88a17a4eea20aaf4/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl + +shared float[16][16] intermediate_memory; + +uint extractBits(uint e, uint offset, uint count) { + return (e >> offset) & ((1u << count) - 1u); +} + +uint insertBits(uint e, uint newbits, uint offset, uint count) { + uint countMask = ((1u << count) - 1u); + // zero out the bits we're going to replace first + return (e & ~(countMask << offset)) | ((newbits & countMask) << offset); +} + +uvec2 remap_for_wave_reduction(uint a) { + return uvec2( + insertBits(extractBits(a, 2u, 3u), a, 0u, 1u), + insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u) + ); +} + +float reduce_4(vec4 v) { + return max(max(v.x, v.y), max(v.z, v.w)); +} + diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl new file mode 100644 index 000000000..e3951a45e --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl @@ -0,0 +1,150 @@ +#include "flywheel:internal/indirect/downsample.glsl" + +layout(binding = 0) uniform sampler2D mip_0; +layout(binding = 1, r32f) uniform writeonly image2D mip_1; +layout(binding = 2, r32f) uniform writeonly image2D mip_2; +layout(binding = 3, r32f) uniform writeonly image2D mip_3; +layout(binding = 4, r32f) uniform writeonly image2D mip_4; +layout(binding = 5, r32f) uniform writeonly image2D mip_5; +layout(binding = 6, r32f) uniform writeonly image2D mip_6; + +float reduce_load_mip_0(uvec2 tex) { + vec2 uv = (vec2(tex) + 0.5) / vec2(imageSize(mip_1)) * 0.5; + return reduce_4(textureGather(mip_0, uv)); +} + +void downsample_mips_0_and_1(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + vec4 v; + + ivec2 tex = workgroup_id * 64 + ivec2(x * 2u, y * 2u); + ivec2 pix = workgroup_id * 32 + ivec2(x, y); + v[0] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[0])); + + tex = workgroup_id * 64 + ivec2(x * 2u + 32u, y * 2u); + pix = workgroup_id * 32 + ivec2(x + 16u, y); + v[1] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[1])); + + tex = workgroup_id * 64 + ivec2(x * 2u, y * 2u + 32u); + pix = workgroup_id * 32 + ivec2(x, y + 16u); + v[2] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[2])); + + tex = workgroup_id * 64 + ivec2(x * 2u + 32u, y * 2u + 32u); + pix = workgroup_id * 32 + ivec2(x + 16u, y + 16u); + v[3] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[3])); + + if (max_mip_level <= 1u) { return; } + + for (uint i = 0u; i < 4u; i++) { + intermediate_memory[x][y] = v[i]; + barrier(); + if (local_invocation_index < 64u) { + v[i] = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + pix = (workgroup_id * 16) + ivec2( + x + (i % 2u) * 8u, + y + (i / 2u) * 8u + ); + imageStore(mip_2, pix, vec4(v[i])); + } + barrier(); + } + + if (local_invocation_index < 64u) { + intermediate_memory[x + 0u][y + 0u] = v[0]; + intermediate_memory[x + 8u][y + 0u] = v[1]; + intermediate_memory[x + 0u][y + 8u] = v[2]; + intermediate_memory[x + 8u][y + 8u] = v[3]; + } +} + + +void downsample_mip_2(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 64u) { + float v = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + imageStore(mip_3, (workgroup_id * 8) + ivec2(x, y), vec4(v)); + intermediate_memory[x * 2u + y % 2u][y * 2u] = v; + } +} + +void downsample_mip_3(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 16u) { + float v = reduce_4(vec4( + intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u], + intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u] + )); + imageStore(mip_4, (workgroup_id * 4) + ivec2(x, y), vec4(v)); + intermediate_memory[x * 4u + y][y * 4u] = v; + } +} + +void downsample_mip_4(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 4u) { + float v = reduce_4(vec4( + intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u], + intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u] + )); + imageStore(mip_5, (workgroup_id * 2) + ivec2(x, y), vec4(v)); + intermediate_memory[x + y * 2u][0u] = v; + } +} + +void downsample_mip_5(ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 1u) { + float v = reduce_4(vec4( + intermediate_memory[0u][0u], + intermediate_memory[1u][0u], + intermediate_memory[2u][0u], + intermediate_memory[3u][0u] + )); + imageStore(mip_6, workgroup_id, vec4(v)); + } +} + +void downsample_mips_2_to_5(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (max_mip_level <= 2u) { return; } + barrier(); + downsample_mip_2(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 3u) { return; } + barrier(); + downsample_mip_3(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 4u) { return; } + barrier(); + downsample_mip_4(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 5u) { return; } + barrier(); + downsample_mip_5(workgroup_id, local_invocation_index); +} + +void downsample_depth_first() { + uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); + uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); + uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); + + downsample_mips_0_and_1(x, y, ivec2(gl_WorkGroupID.xy), gl_LocalInvocationIndex); + + downsample_mips_2_to_5(x, y, ivec2(gl_WorkGroupID.xy), gl_LocalInvocationIndex); +} + +void main() { + downsample_depth_first(); +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl new file mode 100644 index 000000000..fe3b64f5a --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl @@ -0,0 +1,134 @@ +#include "flywheel:internal/indirect/downsample.glsl" + +layout(binding = 0, r32f) uniform readonly image2D mip_6; +layout(binding = 1, r32f) uniform writeonly image2D mip_7; +layout(binding = 2, r32f) uniform writeonly image2D mip_8; +layout(binding = 3, r32f) uniform writeonly image2D mip_9; +layout(binding = 4, r32f) uniform writeonly image2D mip_10; +layout(binding = 5, r32f) uniform writeonly image2D mip_11; +layout(binding = 6, r32f) uniform writeonly image2D mip_12; + +float reduce_load_mip_6(ivec2 tex) { + return reduce_4(vec4( + imageLoad(mip_6, tex + ivec2(0u, 0u)).r, + imageLoad(mip_6, tex + ivec2(0u, 1u)).r, + imageLoad(mip_6, tex + ivec2(1u, 0u)).r, + imageLoad(mip_6, tex + ivec2(1u, 1u)).r + )); +} + +void downsample_mips_6_and_7(uint x, uint y) { + vec4 v; + + ivec2 tex = ivec2(x * 4u + 0u, y * 4u + 0u); + ivec2 pix = ivec2(x * 2u + 0u, y * 2u + 0u); + v[0] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[0])); + + tex = ivec2(x * 4u + 2u, y * 4u + 0u); + pix = ivec2(x * 2u + 1u, y * 2u + 0u); + v[1] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[1])); + + tex = ivec2(x * 4u + 0u, y * 4u + 2u); + pix = ivec2(x * 2u + 0u, y * 2u + 1u); + v[2] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[2])); + + tex = ivec2(x * 4u + 2u, y * 4u + 2u); + pix = ivec2(x * 2u + 1u, y * 2u + 1u); + v[3] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[3])); + + if (max_mip_level <= 7u) { return; } + + float vr = reduce_4(v); + imageStore(mip_8, ivec2(x, y), vec4(vr)); + intermediate_memory[x][y] = vr; +} + + +void downsample_mip_8(uint x, uint y, uint local_invocation_index) { + if (local_invocation_index < 64u) { + float v = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + imageStore(mip_9, ivec2(x, y), vec4(v)); + intermediate_memory[x * 2u + y % 2u][y * 2u] = v; + } +} + +void downsample_mip_9(uint x, uint y, uint local_invocation_index) { + if (local_invocation_index < 16u) { + float v = reduce_4(vec4( + intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u], + intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u] + )); + imageStore(mip_10, ivec2(x, y), vec4(v)); + intermediate_memory[x * 4u + y][y * 4u] = v; + } +} + +void downsample_mip_10(uint x, uint y, uint local_invocation_index) { + if (local_invocation_index < 4u) { + float v = reduce_4(vec4( + intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u], + intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u] + )); + imageStore(mip_11, ivec2(x, y), vec4(v)); + intermediate_memory[x + y * 2u][0u] = v; + } +} + +void downsample_mip_11(uint local_invocation_index) { + if (local_invocation_index < 1u) { + float v = reduce_4(vec4( + intermediate_memory[0u][0u], + intermediate_memory[1u][0u], + intermediate_memory[2u][0u], + intermediate_memory[3u][0u] + )); + + imageStore(mip_12, ivec2(0u, 0u), vec4(v)); + } +} + + +void downsample_mips_8_to_11(uint x, uint y, uint local_invocation_index) { + if (max_mip_level <= 8u) { return; } + barrier(); + downsample_mip_8(x, y, local_invocation_index); + + if (max_mip_level <= 9u) { return; } + barrier(); + downsample_mip_9(x, y, local_invocation_index); + + if (max_mip_level <= 10u) { return; } + barrier(); + downsample_mip_10(x, y, local_invocation_index); + + if (max_mip_level <= 11u) { return; } + barrier(); + downsample_mip_11(local_invocation_index); +} + +void downsample_depth_second() { + uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); + uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); + uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); + + downsample_mips_6_and_7(x, y); + + downsample_mips_8_to_11(x, y, gl_LocalInvocationIndex); +} + +void main() { + downsample_depth_second(); +}