mirror of
https://github.com/Jozufozu/Flywheel.git
synced 2024-12-27 23:47:09 +01:00
Scattered to the winds
- A scatter command is 2 uints: - The first contains the size and source offset in the upper byte and lower 3 bytes respectively. - The second is the destination offset. - All offsets and sizes are in uints, not bytes. - Use ScatterList to write scatter commands. - Use TransferList to collect transfers. - Rather than consolidating transfers in a separate pass, do so as they are collected. - Reorganize StagingBuffer.
This commit is contained in:
parent
1bfb6db6d1
commit
4e782b8dcd
4 changed files with 363 additions and 162 deletions
|
@ -0,0 +1,105 @@
|
||||||
|
package com.jozufozu.flywheel.backend.engine.indirect;
|
||||||
|
|
||||||
|
import org.lwjgl.system.MemoryUtil;
|
||||||
|
|
||||||
|
import com.jozufozu.flywheel.lib.memory.MemoryBlock;
|
||||||
|
|
||||||
|
public class ScatterList {
|
||||||
|
public static final long STRIDE = Integer.BYTES * 2;
|
||||||
|
public final long maxBytesPerScatter;
|
||||||
|
private MemoryBlock block;
|
||||||
|
private int length;
|
||||||
|
private long usedBytes;
|
||||||
|
|
||||||
|
public ScatterList() {
|
||||||
|
this(64);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ScatterList(long maxBytesPerScatter) {
|
||||||
|
if ((maxBytesPerScatter & 0b1111111100L) != maxBytesPerScatter) {
|
||||||
|
throw new IllegalArgumentException("Max bytes per scatter must be a multiple of 4 and less than 1024");
|
||||||
|
}
|
||||||
|
|
||||||
|
this.maxBytesPerScatter = maxBytesPerScatter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Breaks a transfer into many smaller scatter commands if it is too large, and appends them to this list.
|
||||||
|
*
|
||||||
|
* @param transfers The list of transfers to push.
|
||||||
|
* @param transferIndex The index of the transfer to push.
|
||||||
|
*/
|
||||||
|
public void pushTransfer(TransferList transfers, int transferIndex) {
|
||||||
|
long size = transfers.size(transferIndex);
|
||||||
|
long srcOffset = transfers.srcOffset(transferIndex);
|
||||||
|
long dstOffset = transfers.dstOffset(transferIndex);
|
||||||
|
|
||||||
|
long offset = 0;
|
||||||
|
long remaining = size;
|
||||||
|
|
||||||
|
while (offset < size) {
|
||||||
|
long copySize = Math.min(remaining, maxBytesPerScatter);
|
||||||
|
push(copySize, srcOffset + offset, dstOffset + offset);
|
||||||
|
offset += copySize;
|
||||||
|
remaining -= copySize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void push(long sizeBytes, long srcOffsetBytes, long dstOffsetBytes) {
|
||||||
|
reallocIfNeeded(length);
|
||||||
|
|
||||||
|
long ptr = block.ptr() + length * STRIDE;
|
||||||
|
MemoryUtil.memPutInt(ptr, packSizeAndSrcOffset(sizeBytes, srcOffsetBytes));
|
||||||
|
MemoryUtil.memPutInt(ptr + Integer.BYTES, (int) (dstOffsetBytes >> 2));
|
||||||
|
|
||||||
|
length++;
|
||||||
|
usedBytes += STRIDE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int copyCount() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long usedBytes() {
|
||||||
|
return usedBytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return length == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
length = 0;
|
||||||
|
usedBytes = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long ptr() {
|
||||||
|
return block.ptr();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void delete() {
|
||||||
|
if (block != null) {
|
||||||
|
block.free();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void reallocIfNeeded(int index) {
|
||||||
|
if (block == null) {
|
||||||
|
block = MemoryBlock.malloc(neededCapacityForIndex(index + 8));
|
||||||
|
} else if (block.size() < neededCapacityForIndex(index)) {
|
||||||
|
block = block.realloc(neededCapacityForIndex(index + 8));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long neededCapacityForIndex(int index) {
|
||||||
|
return (index + 1) * STRIDE;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int packSizeAndSrcOffset(long sizeBytes, long srcOffsetBytes) {
|
||||||
|
// Divide by 4 and put the offset in the lower 3 bytes.
|
||||||
|
int out = (int) (srcOffsetBytes >>> 2) & 0xFFFFFF;
|
||||||
|
// Place the size divided by 4 in the upper byte.
|
||||||
|
out |= (int) (sizeBytes << 22) & 0xFF000000;
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,7 +1,5 @@
|
||||||
package com.jozufozu.flywheel.backend.engine.indirect;
|
package com.jozufozu.flywheel.backend.engine.indirect;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.LongConsumer;
|
import java.util.function.LongConsumer;
|
||||||
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
@ -18,8 +16,6 @@ import com.jozufozu.flywheel.lib.memory.FlwMemoryTracker;
|
||||||
import com.jozufozu.flywheel.lib.memory.MemoryBlock;
|
import com.jozufozu.flywheel.lib.memory.MemoryBlock;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.PriorityQueue;
|
import it.unimi.dsi.fastutil.PriorityQueue;
|
||||||
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
|
|
||||||
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
|
|
||||||
import it.unimi.dsi.fastutil.objects.ObjectArrayFIFOQueue;
|
import it.unimi.dsi.fastutil.objects.ObjectArrayFIFOQueue;
|
||||||
|
|
||||||
// Used https://github.com/CaffeineMC/sodium-fabric/blob/dev/src/main/java/me/jellysquid/mods/sodium/client/gl/arena/staging/MappedStagingBuffer.java
|
// Used https://github.com/CaffeineMC/sodium-fabric/blob/dev/src/main/java/me/jellysquid/mods/sodium/client/gl/arena/staging/MappedStagingBuffer.java
|
||||||
|
@ -33,18 +29,40 @@ public class StagingBuffer {
|
||||||
private final long map;
|
private final long map;
|
||||||
private final long capacity;
|
private final long capacity;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The position in the buffer at the time of the last flush.
|
||||||
|
*/
|
||||||
private long start = 0;
|
private long start = 0;
|
||||||
|
/**
|
||||||
|
* The current position in the buffer,
|
||||||
|
* incremented as transfers are enqueued.
|
||||||
|
*/
|
||||||
private long pos = 0;
|
private long pos = 0;
|
||||||
|
/**
|
||||||
|
* The number of bytes used in the buffer since the last flush,
|
||||||
|
* incremented as transfers are enqueued.
|
||||||
|
*/
|
||||||
|
private long usedCapacity = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of bytes available in the buffer.
|
||||||
|
* <br>
|
||||||
|
* This decreases as transfers are enqueued and increases as fenced regions are reclaimed.
|
||||||
|
*/
|
||||||
private long totalAvailable;
|
private long totalAvailable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A scratch buffer for when there is not enough contiguous space
|
||||||
|
* in the staging buffer for the write the user wants to make.
|
||||||
|
*/
|
||||||
@Nullable
|
@Nullable
|
||||||
private MemoryBlock scratch;
|
private MemoryBlock scratch;
|
||||||
|
|
||||||
private final GlBuffer copyBuffer = new GlBuffer();
|
|
||||||
private final OverflowStagingBuffer overflow = new OverflowStagingBuffer();
|
private final OverflowStagingBuffer overflow = new OverflowStagingBuffer();
|
||||||
private final PriorityQueue<Transfer> transfers = new ObjectArrayFIFOQueue<>();
|
private final TransferList transfers = new TransferList();
|
||||||
private final PriorityQueue<FencedRegion> fencedRegions = new ObjectArrayFIFOQueue<>();
|
private final PriorityQueue<FencedRegion> fencedRegions = new ObjectArrayFIFOQueue<>();
|
||||||
|
private final GlBuffer scatterBuffer = new GlBuffer();
|
||||||
|
private final ScatterList scatterList = new ScatterList();
|
||||||
|
|
||||||
public StagingBuffer() {
|
public StagingBuffer() {
|
||||||
this(DEFAULT_CAPACITY);
|
this(DEFAULT_CAPACITY);
|
||||||
|
@ -75,7 +93,7 @@ public class StagingBuffer {
|
||||||
*/
|
*/
|
||||||
public void enqueueCopy(long size, int dstVbo, long dstOffset, LongConsumer write) {
|
public void enqueueCopy(long size, int dstVbo, long dstOffset, LongConsumer write) {
|
||||||
// Try to write directly into the staging buffer if there is enough contiguous space.
|
// Try to write directly into the staging buffer if there is enough contiguous space.
|
||||||
var direct = reserveForTransferTo(size, dstVbo, dstOffset);
|
var direct = reserveForCopy(size, dstVbo, dstOffset);
|
||||||
|
|
||||||
if (direct != MemoryUtil.NULL) {
|
if (direct != MemoryUtil.NULL) {
|
||||||
write.accept(direct);
|
write.accept(direct);
|
||||||
|
@ -88,27 +106,6 @@ public class StagingBuffer {
|
||||||
enqueueCopy(block.ptr(), size, dstVbo, dstOffset);
|
enqueueCopy(block.ptr(), size, dstVbo, dstOffset);
|
||||||
}
|
}
|
||||||
|
|
||||||
@NotNull
|
|
||||||
private MemoryBlock getScratch(long size) {
|
|
||||||
if (scratch == null) {
|
|
||||||
scratch = MemoryBlock.malloc(size);
|
|
||||||
} else if (scratch.size() < size) {
|
|
||||||
scratch = scratch.realloc(size);
|
|
||||||
}
|
|
||||||
return scratch;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Enqueue a copy from the given pointer to the given VBO.
|
|
||||||
*
|
|
||||||
* @param block The block to copy from.
|
|
||||||
* @param dstVbo The VBO to copy to.
|
|
||||||
* @param dstOffset The offset in the destination VBO.
|
|
||||||
*/
|
|
||||||
public void enqueueCopy(MemoryBlock block, int dstVbo, long dstOffset) {
|
|
||||||
enqueueCopy(block.ptr(), block.size(), dstVbo, dstOffset);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enqueue a copy from the given pointer to the given VBO.
|
* Enqueue a copy from the given pointer to the given VBO.
|
||||||
*
|
*
|
||||||
|
@ -132,21 +129,19 @@ public class StagingBuffer {
|
||||||
|
|
||||||
// Put the first span at the tail of the buffer...
|
// Put the first span at the tail of the buffer...
|
||||||
MemoryUtil.memCopy(ptr, map + pos, remaining);
|
MemoryUtil.memCopy(ptr, map + pos, remaining);
|
||||||
transfers.enqueue(new Transfer(pos, dstVbo, dstOffset, remaining));
|
pushTransfer(dstVbo, pos, dstOffset, remaining);
|
||||||
|
|
||||||
// ... and the rest at the head.
|
// ... and the rest at the head.
|
||||||
MemoryUtil.memCopy(ptr + remaining, map, split);
|
MemoryUtil.memCopy(ptr + remaining, map, split);
|
||||||
transfers.enqueue(new Transfer(0, dstVbo, dstOffset + remaining, split));
|
pushTransfer(dstVbo, 0, dstOffset + remaining, split);
|
||||||
|
|
||||||
pos = split;
|
pos = split;
|
||||||
} else {
|
} else {
|
||||||
MemoryUtil.memCopy(ptr, map + pos, size);
|
MemoryUtil.memCopy(ptr, map + pos, size);
|
||||||
transfers.enqueue(new Transfer(pos, dstVbo, dstOffset, size));
|
pushTransfer(dstVbo, pos, dstOffset, size);
|
||||||
|
|
||||||
pos += size;
|
pos += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
totalAvailable -= size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -163,7 +158,7 @@ public class StagingBuffer {
|
||||||
* @param dstOffset The offset in the destination VBO.
|
* @param dstOffset The offset in the destination VBO.
|
||||||
* @return A pointer to the reserved space, or {@code null} if there is not enough contiguous space.
|
* @return A pointer to the reserved space, or {@code null} if there is not enough contiguous space.
|
||||||
*/
|
*/
|
||||||
public long reserveForTransferTo(long size, int dstVbo, long dstOffset) {
|
public long reserveForCopy(long size, int dstVbo, long dstOffset) {
|
||||||
assertMultipleOf4(size);
|
assertMultipleOf4(size);
|
||||||
// Don't need to check totalAvailable here because that's a looser constraint than the bytes remaining.
|
// Don't need to check totalAvailable here because that's a looser constraint than the bytes remaining.
|
||||||
long remaining = capacity - pos;
|
long remaining = capacity - pos;
|
||||||
|
@ -173,12 +168,10 @@ public class StagingBuffer {
|
||||||
|
|
||||||
long out = map + pos;
|
long out = map + pos;
|
||||||
|
|
||||||
transfers.enqueue(new Transfer(pos, dstVbo, dstOffset, size));
|
pushTransfer(dstVbo, pos, dstOffset, size);
|
||||||
|
|
||||||
pos += size;
|
pos += size;
|
||||||
|
|
||||||
totalAvailable -= size;
|
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -189,120 +182,15 @@ public class StagingBuffer {
|
||||||
|
|
||||||
flushUsedRegion();
|
flushUsedRegion();
|
||||||
|
|
||||||
var usedCapacity = dispatchComputeCopies();
|
dispatchComputeCopies();
|
||||||
|
|
||||||
|
transfers.reset();
|
||||||
fencedRegions.enqueue(new FencedRegion(new GlFence(), usedCapacity));
|
fencedRegions.enqueue(new FencedRegion(new GlFence(), usedCapacity));
|
||||||
|
|
||||||
|
usedCapacity = 0;
|
||||||
start = pos;
|
start = pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
private long dispatchComputeCopies() {
|
|
||||||
long usedCapacity = 0;
|
|
||||||
Int2ObjectMap<List<Transfer>> copiesPerVbo = new Int2ObjectArrayMap<>();
|
|
||||||
|
|
||||||
long bytesPerCopy = 64;
|
|
||||||
|
|
||||||
for (var transfer : consolidateCopies(transfers)) {
|
|
||||||
usedCapacity += transfer.size;
|
|
||||||
|
|
||||||
var forVbo = copiesPerVbo.computeIfAbsent(transfer.dstVbo, k -> new ArrayList<>());
|
|
||||||
|
|
||||||
long offset = 0;
|
|
||||||
long remaining = transfer.size;
|
|
||||||
|
|
||||||
while (offset < transfer.size) {
|
|
||||||
long copySize = Math.min(remaining, bytesPerCopy);
|
|
||||||
forVbo.add(new Transfer(transfer.srcOffset + offset, transfer.dstVbo, transfer.dstOffset + offset, copySize));
|
|
||||||
offset += copySize;
|
|
||||||
remaining -= copySize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
IndirectPrograms.get()
|
|
||||||
.getScatterProgram()
|
|
||||||
.bind();
|
|
||||||
|
|
||||||
for (var entry : copiesPerVbo.int2ObjectEntrySet()) {
|
|
||||||
var dstVbo = entry.getIntKey();
|
|
||||||
var transfers = entry.getValue();
|
|
||||||
var copyCount = transfers.size();
|
|
||||||
|
|
||||||
var size = copyCount * Integer.BYTES * 3L;
|
|
||||||
var scratch = getScratch(size);
|
|
||||||
|
|
||||||
putTransfers(scratch.ptr(), transfers);
|
|
||||||
|
|
||||||
copyBuffer.upload(scratch.ptr(), size);
|
|
||||||
|
|
||||||
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, copyBuffer.handle());
|
|
||||||
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 1, vbo);
|
|
||||||
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
|
|
||||||
|
|
||||||
GL45.glDispatchCompute(GlCompat.getComputeGroupCount(copyCount), 1, 1);
|
|
||||||
}
|
|
||||||
return usedCapacity;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void assertMultipleOf4(long size) {
|
|
||||||
if (size % 4 != 0) {
|
|
||||||
throw new IllegalArgumentException("Size must be a multiple of 4");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private long sendCopyCommands() {
|
|
||||||
long usedCapacity = 0;
|
|
||||||
|
|
||||||
for (Transfer transfer : consolidateCopies(transfers)) {
|
|
||||||
usedCapacity += transfer.size;
|
|
||||||
|
|
||||||
GL45C.glCopyNamedBufferSubData(vbo, transfer.dstVbo, transfer.srcOffset, transfer.dstOffset, transfer.size);
|
|
||||||
}
|
|
||||||
return usedCapacity;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void flushUsedRegion() {
|
|
||||||
if (pos < start) {
|
|
||||||
// we rolled around, need to flush 2 ranges.
|
|
||||||
GL45C.glFlushMappedNamedBufferRange(vbo, start, capacity - start);
|
|
||||||
GL45C.glFlushMappedNamedBufferRange(vbo, 0, pos);
|
|
||||||
} else {
|
|
||||||
GL45C.glFlushMappedNamedBufferRange(vbo, start, pos - start);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<Transfer> consolidateCopies(PriorityQueue<Transfer> queue) {
|
|
||||||
List<Transfer> merged = new ArrayList<>();
|
|
||||||
Transfer last = null;
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
Transfer transfer = queue.dequeue();
|
|
||||||
|
|
||||||
if (last != null) {
|
|
||||||
if (areContiguous(last, transfer)) {
|
|
||||||
last.size += transfer.size;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
merged.add(last = new Transfer(transfer));
|
|
||||||
}
|
|
||||||
|
|
||||||
return merged;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void putTransfers(long ptr, List<Transfer> transfers) {
|
|
||||||
for (Transfer transfer : transfers) {
|
|
||||||
MemoryUtil.memPutInt(ptr, (int) transfer.srcOffset);
|
|
||||||
MemoryUtil.memPutInt(ptr + Integer.BYTES, (int) transfer.dstOffset);
|
|
||||||
MemoryUtil.memPutInt(ptr + Integer.BYTES * 2, (int) transfer.size);
|
|
||||||
ptr += Integer.BYTES * 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean areContiguous(Transfer last, Transfer transfer) {
|
|
||||||
return last.dstVbo == transfer.dstVbo && last.dstOffset + last.size == transfer.dstOffset && last.srcOffset + last.size == transfer.srcOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reclaim() {
|
public void reclaim() {
|
||||||
while (!fencedRegions.isEmpty()) {
|
while (!fencedRegions.isEmpty()) {
|
||||||
var region = fencedRegions.first();
|
var region = fencedRegions.first();
|
||||||
|
@ -322,29 +210,106 @@ public class StagingBuffer {
|
||||||
GL45C.glUnmapNamedBuffer(vbo);
|
GL45C.glUnmapNamedBuffer(vbo);
|
||||||
GL45C.glDeleteBuffers(vbo);
|
GL45C.glDeleteBuffers(vbo);
|
||||||
overflow.delete();
|
overflow.delete();
|
||||||
|
scatterBuffer.delete();
|
||||||
|
|
||||||
if (scratch != null) {
|
if (scratch != null) {
|
||||||
scratch.free();
|
scratch.free();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
transfers.delete();
|
||||||
|
scatterList.delete();
|
||||||
|
|
||||||
FlwMemoryTracker._freeCPUMemory(capacity);
|
FlwMemoryTracker._freeCPUMemory(capacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final class Transfer {
|
@NotNull
|
||||||
private final long srcOffset;
|
private MemoryBlock getScratch(long size) {
|
||||||
private final int dstVbo;
|
if (scratch == null) {
|
||||||
private final long dstOffset;
|
scratch = MemoryBlock.malloc(size);
|
||||||
private long size;
|
} else if (scratch.size() < size) {
|
||||||
|
scratch = scratch.realloc(size);
|
||||||
|
}
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
|
||||||
private Transfer(long srcOffset, int dstVbo, long dstOffset, long size) {
|
private void pushTransfer(int dstVbo, long srcOffset, long dstOffset, long size) {
|
||||||
this.srcOffset = srcOffset;
|
transfers.push(dstVbo, srcOffset, dstOffset, size);
|
||||||
this.dstVbo = dstVbo;
|
usedCapacity += size;
|
||||||
this.dstOffset = dstOffset;
|
totalAvailable -= size;
|
||||||
this.size = size;
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We <em>could</em> just use {@link #sendCopyCommands}, but that has significant
|
||||||
|
* overhead for many small transfers, such as when the object buffer is sparsely updated.
|
||||||
|
* <br>
|
||||||
|
* Instead, we use a compute shader to scatter the data from the staging buffer to the destination VBOs.
|
||||||
|
* This approach is recommended by nvidia in
|
||||||
|
* <a href=https://on-demand.gputechconf.com/gtc/2016/presentation/s6138-christoph-kubisch-pierre-boudier-gpu-driven-rendering.pdf>this presentation</a>
|
||||||
|
*/
|
||||||
|
private void dispatchComputeCopies() {
|
||||||
|
IndirectPrograms.get()
|
||||||
|
.getScatterProgram()
|
||||||
|
.bind();
|
||||||
|
|
||||||
|
// These bindings don't change between dstVbos.
|
||||||
|
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 0, scatterBuffer.handle());
|
||||||
|
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 1, vbo);
|
||||||
|
|
||||||
|
int dstVbo;
|
||||||
|
var transferCount = transfers.length();
|
||||||
|
for (int i = 0; i < transferCount; i++) {
|
||||||
|
dstVbo = transfers.vbo(i);
|
||||||
|
|
||||||
|
scatterList.pushTransfer(transfers, i);
|
||||||
|
|
||||||
|
int nextVbo = i == transferCount - 1 ? -1 : transfers.vbo(i + 1);
|
||||||
|
|
||||||
|
// If we're switching VBOs, dispatch the copies for the previous VBO.
|
||||||
|
// Generally VBOs don't appear in multiple spans of the list,
|
||||||
|
// so submitting duplicates is rare.
|
||||||
|
if (dstVbo != nextVbo) {
|
||||||
|
dispatchScatter(dstVbo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void dispatchScatter(int dstVbo) {
|
||||||
|
scatterBuffer.upload(scatterList.ptr(), scatterList.usedBytes());
|
||||||
|
|
||||||
|
GL45.glBindBufferBase(GL45C.GL_SHADER_STORAGE_BUFFER, 2, dstVbo);
|
||||||
|
|
||||||
|
GL45.glDispatchCompute(GlCompat.getComputeGroupCount(scatterList.copyCount()), 1, 1);
|
||||||
|
|
||||||
|
scatterList.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertMultipleOf4(long size) {
|
||||||
|
if (size % 4 != 0) {
|
||||||
|
throw new IllegalArgumentException("Size must be a multiple of 4");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long sendCopyCommands() {
|
||||||
|
long usedCapacity = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < transfers.length(); i++) {
|
||||||
|
var size = transfers.size(i);
|
||||||
|
|
||||||
|
usedCapacity += size;
|
||||||
|
|
||||||
|
GL45C.glCopyNamedBufferSubData(vbo, transfers.vbo(i), transfers.srcOffset(i), transfers.dstOffset(i), size);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Transfer(Transfer other) {
|
return usedCapacity;
|
||||||
this(other.srcOffset, other.dstVbo, other.dstOffset, other.size);
|
}
|
||||||
|
|
||||||
|
private void flushUsedRegion() {
|
||||||
|
if (pos < start) {
|
||||||
|
// we rolled around, need to flush 2 ranges.
|
||||||
|
GL45C.glFlushMappedNamedBufferRange(vbo, start, capacity - start);
|
||||||
|
GL45C.glFlushMappedNamedBufferRange(vbo, 0, pos);
|
||||||
|
} else {
|
||||||
|
GL45C.glFlushMappedNamedBufferRange(vbo, start, pos - start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,124 @@
|
||||||
|
package com.jozufozu.flywheel.backend.engine.indirect;
|
||||||
|
|
||||||
|
import org.lwjgl.system.MemoryUtil;
|
||||||
|
|
||||||
|
import com.jozufozu.flywheel.lib.memory.MemoryBlock;
|
||||||
|
|
||||||
|
public class TransferList {
|
||||||
|
private static final long STRIDE = Long.BYTES * 4;
|
||||||
|
private MemoryBlock block;
|
||||||
|
private int length;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a transfer to the end of the list, combining with the last transfer if possible.
|
||||||
|
*
|
||||||
|
* @param vbo The VBO to transfer to.
|
||||||
|
* @param srcOffset The offset in the staging buffer.
|
||||||
|
* @param dstOffset The offset in the VBO.
|
||||||
|
* @param size The size of the transfer.
|
||||||
|
*/
|
||||||
|
public void push(int vbo, long srcOffset, long dstOffset, long size) {
|
||||||
|
if (continuesLast(vbo, srcOffset, dstOffset)) {
|
||||||
|
int lastIndex = length - 1;
|
||||||
|
size(lastIndex, size(lastIndex) + size);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
reallocIfNeeded(length);
|
||||||
|
|
||||||
|
vbo(length, vbo);
|
||||||
|
srcOffset(length, srcOffset);
|
||||||
|
dstOffset(length, dstOffset);
|
||||||
|
size(length, size);
|
||||||
|
|
||||||
|
length++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The number of transfers in the list.
|
||||||
|
*/
|
||||||
|
public int length() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return {@code true} if there are no transfers in the list, {@code false} otherwise.
|
||||||
|
*/
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return length == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset the list to be empty.
|
||||||
|
*/
|
||||||
|
public void reset() {
|
||||||
|
length = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int vbo(int index) {
|
||||||
|
return MemoryUtil.memGetInt(ptrForIndex(index));
|
||||||
|
}
|
||||||
|
|
||||||
|
public long srcOffset(int index) {
|
||||||
|
return MemoryUtil.memGetLong(ptrForIndex(index) + Long.BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long dstOffset(int index) {
|
||||||
|
return MemoryUtil.memGetLong(ptrForIndex(index) + Long.BYTES * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long size(int index) {
|
||||||
|
return MemoryUtil.memGetLong(ptrForIndex(index) + Long.BYTES * 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void delete() {
|
||||||
|
if (block != null) {
|
||||||
|
block.free();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean continuesLast(int vbo, long srcOffset, long dstOffset) {
|
||||||
|
if (length == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int lastIndex = length - 1;
|
||||||
|
var lastSize = size(lastIndex);
|
||||||
|
return vbo(lastIndex) == vbo && dstOffset(lastIndex) + lastSize == dstOffset && srcOffset(lastIndex) + lastSize == srcOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void vbo(int index, int vbo) {
|
||||||
|
MemoryUtil.memPutInt(ptrForIndex(index), vbo);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void srcOffset(int index, long srcOffset) {
|
||||||
|
MemoryUtil.memPutLong(ptrForIndex(index) + Long.BYTES, srcOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void dstOffset(int index, long dstOffset) {
|
||||||
|
MemoryUtil.memPutLong(ptrForIndex(index) + Long.BYTES * 2, dstOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void size(int index, long size) {
|
||||||
|
MemoryUtil.memPutLong(ptrForIndex(index) + Long.BYTES * 3, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void reallocIfNeeded(int index) {
|
||||||
|
if (block == null) {
|
||||||
|
block = MemoryBlock.malloc(neededCapacityForIndex(index + 8));
|
||||||
|
} else if (block.size() < neededCapacityForIndex(index)) {
|
||||||
|
block = block.realloc(neededCapacityForIndex(index + 8));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long ptrForIndex(int index) {
|
||||||
|
return block.ptr() + bytePosForIndex(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long bytePosForIndex(int index) {
|
||||||
|
return index * STRIDE;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long neededCapacityForIndex(int index) {
|
||||||
|
return (index + 1) * STRIDE;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,9 +1,14 @@
|
||||||
layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
|
layout(local_size_x = _FLW_SUBGROUP_SIZE) in;
|
||||||
|
|
||||||
|
const uint SRC_OFFSET_MASK = 0xFFFFFF;
|
||||||
|
|
||||||
|
// Since StagingBuffer is 16MB, a source offset *into an array of uints* can be represented with 22 bits.
|
||||||
|
// We use 24 here for some wiggle room.
|
||||||
|
// The lower 24 bits are the offset into the Src buffer.
|
||||||
|
// The upper 8 bits are the size of the copy.
|
||||||
struct Copy {
|
struct Copy {
|
||||||
uint srcOffset;
|
uint sizeAndSrcOffset;
|
||||||
uint dstOffset;
|
uint dstOffset;
|
||||||
uint byteSize;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(std430, binding = 0) restrict readonly buffer Copies {
|
layout(std430, binding = 0) restrict readonly buffer Copies {
|
||||||
|
@ -25,9 +30,11 @@ void main() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint srcOffset = copies[copy].srcOffset >> 2;
|
uint sizeAndSrcOffset = copies[copy].sizeAndSrcOffset;
|
||||||
uint dstOffset = copies[copy].dstOffset >> 2;
|
uint srcOffset = sizeAndSrcOffset & SRC_OFFSET_MASK;
|
||||||
uint size = copies[copy].byteSize >> 2;
|
uint size = sizeAndSrcOffset >> 24;
|
||||||
|
|
||||||
|
uint dstOffset = copies[copy].dstOffset;
|
||||||
|
|
||||||
for (uint i = 0; i < size; i++) {
|
for (uint i = 0; i < size; i++) {
|
||||||
dst[dstOffset + i] = src[srcOffset + i];
|
dst[dstOffset + i] = src[srcOffset + i];
|
||||||
|
|
Loading…
Reference in a new issue