From ddb04501052badb72f710aa138f259c7cbfd2d49 Mon Sep 17 00:00:00 2001 From: Jozufozu Date: Thu, 12 Sep 2024 21:32:13 -0700 Subject: [PATCH] Rapid descent - Implement single (but actually 2) pass downsampling --- .../backend/compile/IndirectPrograms.java | 14 +- .../backend/engine/indirect/DepthPyramid.java | 59 ++++--- .../engine/indirect/IndirectDrawManager.java | 9 +- .../internal/indirect/depth_reduce.glsl | 31 ---- .../internal/indirect/downsample.glsl | 33 ++++ .../internal/indirect/downsample_first.glsl | 150 ++++++++++++++++++ .../internal/indirect/downsample_second.glsl | 136 ++++++++++++++++ 7 files changed, 372 insertions(+), 60 deletions(-) delete mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl create mode 100644 common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java index cc43568da..645f00451 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/compile/IndirectPrograms.java @@ -30,8 +30,9 @@ public class IndirectPrograms extends AtomicReferenceCounted { private static final ResourceLocation CULL_SHADER_MAIN = Flywheel.rl("internal/indirect/cull.glsl"); private static final ResourceLocation APPLY_SHADER_MAIN = Flywheel.rl("internal/indirect/apply.glsl"); private static final ResourceLocation SCATTER_SHADER_MAIN = Flywheel.rl("internal/indirect/scatter.glsl"); - private static final ResourceLocation DEPTH_REDUCE_SHADER_MAIN = 
Flywheel.rl("internal/indirect/depth_reduce.glsl"); - public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, DEPTH_REDUCE_SHADER_MAIN); + private static final ResourceLocation DOWNSAMPLE_FIRST = Flywheel.rl("internal/indirect/downsample_first.glsl"); + private static final ResourceLocation DOWNSAMPLE_SECOND = Flywheel.rl("internal/indirect/downsample_second.glsl"); + public static final List UTIL_SHADERS = List.of(APPLY_SHADER_MAIN, SCATTER_SHADER_MAIN, DOWNSAMPLE_FIRST, DOWNSAMPLE_SECOND); private static final Compile> CULL = new Compile<>(); private static final Compile UTIL = new Compile<>(); @@ -184,9 +185,14 @@ public class IndirectPrograms extends AtomicReferenceCounted { return utils.get(SCATTER_SHADER_MAIN); } - public GlProgram getDepthReduceProgram() { - return utils.get(DEPTH_REDUCE_SHADER_MAIN); + public GlProgram getDownsampleFirstProgram() { + return utils.get(DOWNSAMPLE_FIRST); } + + public GlProgram getDownsampleSecondProgram() { + return utils.get(DOWNSAMPLE_SECOND); + } + @Override protected void _delete() { pipeline.values() diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java index 1ba12b86f..56400e6f6 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/DepthPyramid.java @@ -11,15 +11,17 @@ import dev.engine_room.flywheel.lib.math.MoreMath; import net.minecraft.client.Minecraft; public class DepthPyramid { - private final GlProgram depthReduceProgram; + private final GlProgram downsampleFirstProgram; + private final GlProgram downsampleSecondProgram; public int pyramidTextureId = -1; private int lastWidth = -1; private int lastHeight = -1; - public DepthPyramid(GlProgram depthReduceProgram) { - this.depthReduceProgram = depthReduceProgram; + 
public DepthPyramid(GlProgram downsampleFirstProgram, GlProgram downsampleSecondProgram) { + this.downsampleFirstProgram = downsampleFirstProgram; + this.downsampleSecondProgram = downsampleSecondProgram; } public void generate() { @@ -37,26 +39,43 @@ public class DepthPyramid { GL46.glMemoryBarrier(GL46.GL_FRAMEBUFFER_BARRIER_BIT); - GlTextureUnit.T1.makeActive(); + GlTextureUnit.T0.makeActive(); + GlStateManager._bindTexture(depthBufferId); - depthReduceProgram.bind(); + downsampleFirstProgram.bind(); + downsampleFirstProgram.setUInt("max_mip_level", mipLevels); - for (int i = 0; i < mipLevels; i++) { - int mipWidth = mipSize(width, i); - int mipHeight = mipSize(height, i); - - int srcTexture = (i == 0) ? depthBufferId : pyramidTextureId; - GlStateManager._bindTexture(srcTexture); - - GL46.glBindImageTexture(0, pyramidTextureId, i, false, 0, GL32.GL_WRITE_ONLY, GL32.GL_R32F); - - depthReduceProgram.setVec2("imageSize", mipWidth, mipHeight); - depthReduceProgram.setInt("lod", Math.max(0, i - 1)); - - GL46.glDispatchCompute(MoreMath.ceilingDiv(mipWidth, 8), MoreMath.ceilingDiv(mipHeight, 8), 1); - - GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT); + for (int i = 0; i < Math.min(6, mipLevels); i++) { + GL46.glBindImageTexture(i + 1, pyramidTextureId, i, false, 0, GL32.GL_WRITE_ONLY, GL32.GL_R32F); } + + GL46.glDispatchCompute(MoreMath.ceilingDiv(width << 1, 64), MoreMath.ceilingDiv(height << 1, 64), 1); + + if (mipLevels < 7) { + GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT); + + return; + } + + GL46.glMemoryBarrier(GL46.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + + downsampleSecondProgram.bind(); + downsampleSecondProgram.setUInt("max_mip_level", mipLevels); + + // Note: mip_6 in the shader is actually mip level 5 in the texture + GL46.glBindImageTexture(0, pyramidTextureId, 5, false, 0, GL32.GL_READ_ONLY, GL32.GL_R32F); + for (int i = 6; i < Math.min(12, mipLevels); i++) { + GL46.glBindImageTexture(i - 5, pyramidTextureId, i, false, 0, 
GL32.GL_WRITE_ONLY, GL32.GL_R32F); + } + + GL46.glDispatchCompute(1, 1, 1); + + GL46.glMemoryBarrier(GL46.GL_TEXTURE_FETCH_BARRIER_BIT); + } + + public void bindForCull() { + GlTextureUnit.T0.makeActive(); + GlStateManager._bindTexture(pyramidTextureId); } public void delete() { diff --git a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java index 494403045..99ff35707 100644 --- a/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java +++ b/common/src/backend/java/dev/engine_room/flywheel/backend/engine/indirect/IndirectDrawManager.java @@ -12,8 +12,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.lwjgl.opengl.GL46; - import dev.engine_room.flywheel.api.backend.Engine; import dev.engine_room.flywheel.api.instance.Instance; import dev.engine_room.flywheel.api.instance.InstanceType; @@ -63,7 +61,7 @@ public class IndirectDrawManager extends DrawManager> { lightBuffers = new LightBuffers(); matrixBuffer = new MatrixBuffer(); - depthPyramid = new DepthPyramid(programs.getDepthReduceProgram()); + depthPyramid = new DepthPyramid(programs.getDownsampleFirstProgram(), programs.getDownsampleSecondProgram()); } @Override @@ -151,8 +149,7 @@ public class IndirectDrawManager extends DrawManager> { matrixBuffer.bind(); - GL46.glActiveTexture(GL46.GL_TEXTURE0); - GL46.glBindTexture(GL46.GL_TEXTURE_2D, depthPyramid.pyramidTextureId); + depthPyramid.bindForCull(); for (var group : cullingGroups.values()) { group.dispatchCull(); @@ -185,6 +182,8 @@ public class IndirectDrawManager extends DrawManager> { crumblingDrawBuffer.delete(); programs.release(); + + depthPyramid.delete(); } public void renderCrumbling(List crumblingBlocks) { diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl 
b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl deleted file mode 100644 index 49bbbf947..000000000 --- a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/depth_reduce.glsl +++ /dev/null @@ -1,31 +0,0 @@ -layout(local_size_x = 8, local_size_y = 8) in; - -layout(binding = 0, r32f) uniform writeonly image2D outImage; -layout(binding = 1) uniform sampler2D inImage; - -uniform vec2 imageSize; -uniform int lod; - -uniform int useMin = 0; - -void main() { - uvec2 pos = gl_GlobalInvocationID.xy; - - // Map the output texel to an input texel. Properly do the division because generating mip0 maps from the actual - // full resolution depth buffer and the aspect ratio may be different from our Po2 pyramid. - ivec2 samplePos = ivec2(floor(vec2(pos) * vec2(textureSize(inImage, lod)) / imageSize)); - - float depth01 = texelFetchOffset(inImage, samplePos, lod, ivec2(0, 1)).r; - float depth11 = texelFetchOffset(inImage, samplePos, lod, ivec2(1, 1)).r; - float depth10 = texelFetchOffset(inImage, samplePos, lod, ivec2(1, 0)).r; - float depth00 = texelFetchOffset(inImage, samplePos, lod, ivec2(0, 0)).r; - - float depth; - if (useMin == 0) { - depth = max(max(depth00, depth01), max(depth10, depth11)); - } else { - depth = min(min(depth00, depth01), min(depth10, depth11)); - } - - imageStore(outImage, ivec2(pos), vec4(depth)); -} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl new file mode 100644 index 000000000..48fd55ec2 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample.glsl @@ -0,0 +1,33 @@ +layout(local_size_x = 256) in; + +uniform uint max_mip_level; + +/// Generates a hierarchical depth buffer. 
+/// Based on FidelityFX SPD v2.1 https://github.com/GPUOpen-LibrariesAndSDKs/FidelityFX-SDK/blob/d7531ae47d8b36a5d4025663e731a47a38be882f/sdk/include/FidelityFX/gpu/spd/ffx_spd.h#L528 +/// Based on Bevy's more readable implementation https://github.com/JMS55/bevy/blob/ca2c8e63b9562f88c8cd7e1d88a17a4eea20aaf4/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl + +shared float[16][16] intermediate_memory; + +// These are builtins in wgsl but we can trivially emulate them. +uint extractBits(uint e, uint offset, uint count) { + return (e >> offset) & ((1u << count) - 1u); +} + +uint insertBits(uint e, uint newbits, uint offset, uint count) { + uint countMask = ((1u << count) - 1u); + // zero out the bits we're going to replace first + return (e & ~(countMask << offset)) | ((newbits & countMask) << offset); +} + +// I do not understand how this works but it seems cool. +uvec2 remap_for_wave_reduction(uint a) { + return uvec2( + insertBits(extractBits(a, 2u, 3u), a, 0u, 1u), + insertBits(extractBits(a, 3u, 3u), extractBits(a, 1u, 2u), 0u, 2u) + ); +} + +float reduce_4(vec4 v) { + return max(max(v.x, v.y), max(v.z, v.w)); +} + diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl new file mode 100644 index 000000000..351995767 --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_first.glsl @@ -0,0 +1,150 @@ +#include "flywheel:internal/indirect/downsample.glsl" + +layout(binding = 0) uniform sampler2D mip_0; +layout(binding = 1, r32f) uniform writeonly image2D mip_1; +layout(binding = 2, r32f) uniform writeonly image2D mip_2; +layout(binding = 3, r32f) uniform writeonly image2D mip_3; +layout(binding = 4, r32f) uniform writeonly image2D mip_4; +layout(binding = 5, r32f) uniform writeonly image2D mip_5; +layout(binding = 6, r32f) uniform writeonly image2D mip_6; + +float 
reduce_load_mip_0(uvec2 tex) { + // NOTE: mip_0 is the actual depth buffer, and mip_1 is the "base" of our depth pyramid and has the next + // smallest Po2 dimensions to mip_0's dimensions. We dispatch enough invocations to cover the entire mip_1 + // and will very likely oversample mip_0, but that's okay because we need to ensure conservative coverage. + // All following mip levels are proper halvings of their parents and will not waste any work. + vec2 uv = (vec2(tex) + 0.5) / vec2(imageSize(mip_1)) * 0.5; + return reduce_4(textureGather(mip_0, uv)); +} + +void downsample_mips_0_and_1(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + vec4 v; + + ivec2 tex = workgroup_id * 64 + ivec2(x * 2u, y * 2u); + ivec2 pix = workgroup_id * 32 + ivec2(x, y); + v[0] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[0])); + + tex = workgroup_id * 64 + ivec2(x * 2u + 32u, y * 2u); + pix = workgroup_id * 32 + ivec2(x + 16u, y); + v[1] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[1])); + + tex = workgroup_id * 64 + ivec2(x * 2u, y * 2u + 32u); + pix = workgroup_id * 32 + ivec2(x, y + 16u); + v[2] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[2])); + + tex = workgroup_id * 64 + ivec2(x * 2u + 32u, y * 2u + 32u); + pix = workgroup_id * 32 + ivec2(x + 16u, y + 16u); + v[3] = reduce_load_mip_0(tex); + imageStore(mip_1, pix, vec4(v[3])); + + if (max_mip_level <= 1u) { return; } + + for (uint i = 0u; i < 4u; i++) { + intermediate_memory[x][y] = v[i]; + barrier(); + if (local_invocation_index < 64u) { + v[i] = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + pix = (workgroup_id * 16) + ivec2( + x + (i % 2u) * 8u, + y + (i / 2u) * 8u + ); + imageStore(mip_2, pix, vec4(v[i])); + } + barrier(); + } + + if (local_invocation_index < 64u) { + intermediate_memory[x + 0u][y +
0u] = v[0]; + intermediate_memory[x + 8u][y + 0u] = v[1]; + intermediate_memory[x + 0u][y + 8u] = v[2]; + intermediate_memory[x + 8u][y + 8u] = v[3]; + } +} + + +void downsample_mip_2(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 64u) { + float v = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + imageStore(mip_3, (workgroup_id * 8) + ivec2(x, y), vec4(v)); + intermediate_memory[x * 2u + y % 2u][y * 2u] = v; + } +} + +void downsample_mip_3(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 16u) { + float v = reduce_4(vec4( + intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u], + intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u] + )); + imageStore(mip_4, (workgroup_id * 4) + ivec2(x, y), vec4(v)); + intermediate_memory[x * 4u + y][y * 4u] = v; + } +} + +void downsample_mip_4(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 4u) { + float v = reduce_4(vec4( + intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u], + intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u] + )); + imageStore(mip_5, (workgroup_id * 2) + ivec2(x, y), vec4(v)); + intermediate_memory[x + y * 2u][0u] = v; + } +} + +void downsample_mip_5(ivec2 workgroup_id, uint local_invocation_index) { + if (local_invocation_index < 1u) { + float v = reduce_4(vec4( + intermediate_memory[0u][0u], + intermediate_memory[1u][0u], + intermediate_memory[2u][0u], + intermediate_memory[3u][0u] + )); + imageStore(mip_6, workgroup_id, vec4(v)); + } +} + +void 
downsample_mips_2_to_5(uint x, uint y, ivec2 workgroup_id, uint local_invocation_index) { + if (max_mip_level <= 2u) { return; } + barrier(); + downsample_mip_2(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 3u) { return; } + barrier(); + downsample_mip_3(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 4u) { return; } + barrier(); + downsample_mip_4(x, y, workgroup_id, local_invocation_index); + + if (max_mip_level <= 5u) { return; } + barrier(); + downsample_mip_5(workgroup_id, local_invocation_index); +} + +void main() { + uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); + uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); + uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); + + downsample_mips_0_and_1(x, y, ivec2(gl_WorkGroupID.xy), gl_LocalInvocationIndex); + + downsample_mips_2_to_5(x, y, ivec2(gl_WorkGroupID.xy), gl_LocalInvocationIndex); +} diff --git a/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl new file mode 100644 index 000000000..afedc061c --- /dev/null +++ b/common/src/backend/resources/assets/flywheel/flywheel/internal/indirect/downsample_second.glsl @@ -0,0 +1,136 @@ +#include "flywheel:internal/indirect/downsample.glsl" + +layout(binding = 0, r32f) uniform readonly image2D mip_6; +layout(binding = 1, r32f) uniform writeonly image2D mip_7; +layout(binding = 2, r32f) uniform writeonly image2D mip_8; +layout(binding = 3, r32f) uniform writeonly image2D mip_9; +layout(binding = 4, r32f) uniform writeonly image2D mip_10; +layout(binding = 5, r32f) uniform writeonly image2D mip_11; +layout(binding = 6, r32f) uniform writeonly image2D mip_12; + +float reduce_load_mip_6(ivec2 tex) { + // NOTE: We could bind mip_6 as a sampler2D and use textureGather, + // but it's already written to as an image in the first pass so I think 
this is fine. + return reduce_4(vec4( + imageLoad(mip_6, tex + ivec2(0u, 0u)).r, + imageLoad(mip_6, tex + ivec2(0u, 1u)).r, + imageLoad(mip_6, tex + ivec2(1u, 0u)).r, + imageLoad(mip_6, tex + ivec2(1u, 1u)).r + )); +} + +void downsample_mips_6_and_7(uint x, uint y) { + vec4 v; + + ivec2 tex = ivec2(x * 4u + 0u, y * 4u + 0u); + ivec2 pix = ivec2(x * 2u + 0u, y * 2u + 0u); + v[0] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[0])); + + tex = ivec2(x * 4u + 2u, y * 4u + 0u); + pix = ivec2(x * 2u + 1u, y * 2u + 0u); + v[1] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[1])); + + tex = ivec2(x * 4u + 0u, y * 4u + 2u); + pix = ivec2(x * 2u + 0u, y * 2u + 1u); + v[2] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[2])); + + tex = ivec2(x * 4u + 2u, y * 4u + 2u); + pix = ivec2(x * 2u + 1u, y * 2u + 1u); + v[3] = reduce_load_mip_6(tex); + imageStore(mip_7, pix, vec4(v[3])); + + if (max_mip_level <= 7u) { return; } + + float vr = reduce_4(v); + imageStore(mip_8, ivec2(x, y), vec4(vr)); + intermediate_memory[x][y] = vr; +} + + +void downsample_mip_8(uint x, uint y, uint local_invocation_index) { + if (local_invocation_index < 64u) { + float v = reduce_4(vec4( + intermediate_memory[x * 2u + 0u][y * 2u + 0u], + intermediate_memory[x * 2u + 1u][y * 2u + 0u], + intermediate_memory[x * 2u + 0u][y * 2u + 1u], + intermediate_memory[x * 2u + 1u][y * 2u + 1u] + )); + imageStore(mip_9, ivec2(x, y), vec4(v)); + intermediate_memory[x * 2u + y % 2u][y * 2u] = v; + } +} + +void downsample_mip_9(uint x, uint y, uint local_invocation_index) { + if (local_invocation_index < 16u) { + float v = reduce_4(vec4( + intermediate_memory[x * 4u + 0u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 2u + 0u][y * 4u + 0u], + intermediate_memory[x * 4u + 0u + 1u][y * 4u + 2u], + intermediate_memory[x * 4u + 2u + 1u][y * 4u + 2u] + )); + imageStore(mip_10, ivec2(x, y), vec4(v)); + intermediate_memory[x * 4u + y][y * 4u] = v; + } +} + +void downsample_mip_10(uint x, uint y, 
uint local_invocation_index) { + if (local_invocation_index < 4u) { + float v = reduce_4(vec4( + intermediate_memory[x * 8u + 0u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 4u + 0u + y * 2u][y * 8u + 0u], + intermediate_memory[x * 8u + 0u + 1u + y * 2u][y * 8u + 4u], + intermediate_memory[x * 8u + 4u + 1u + y * 2u][y * 8u + 4u] + )); + imageStore(mip_11, ivec2(x, y), vec4(v)); + intermediate_memory[x + y * 2u][0u] = v; + } +} + +void downsample_mip_11(uint local_invocation_index) { + if (local_invocation_index < 1u) { + float v = reduce_4(vec4( + intermediate_memory[0u][0u], + intermediate_memory[1u][0u], + intermediate_memory[2u][0u], + intermediate_memory[3u][0u] + )); + + imageStore(mip_12, ivec2(0u, 0u), vec4(v)); + } +} + + +void downsample_mips_8_to_11(uint x, uint y, uint local_invocation_index) { + if (max_mip_level <= 8u) { return; } + barrier(); + downsample_mip_8(x, y, local_invocation_index); + + if (max_mip_level <= 9u) { return; } + barrier(); + downsample_mip_9(x, y, local_invocation_index); + + if (max_mip_level <= 10u) { return; } + barrier(); + downsample_mip_10(x, y, local_invocation_index); + + if (max_mip_level <= 11u) { return; } + barrier(); + downsample_mip_11(local_invocation_index); +} + +void downsample_depth_second() { + uvec2 sub_xy = remap_for_wave_reduction(gl_LocalInvocationIndex % 64u); + uint x = sub_xy.x + 8u * ((gl_LocalInvocationIndex >> 6u) % 2u); + uint y = sub_xy.y + 8u * (gl_LocalInvocationIndex >> 7u); + + downsample_mips_6_and_7(x, y); + + downsample_mips_8_to_11(x, y, gl_LocalInvocationIndex); +} + +void main() { + downsample_depth_second(); +}