Skip to content

Commit 7defbe9

Browse files
Moult authored and claude committed
ifcviewer: GPU cull fwd/rev reflection bucketing (step 3b)
Extend the GPU-cull indirect buffer from M to 2M commands: the first M are the forward (non-reflected, CCW) bucket, the second M are the reverse (reflected, CW) bucket. The compact shader reads flags bit 0 from the AABB SSBO and routes each survivor to the appropriate bucket via bucket = reflected ? mesh_id + M : mesh_id. uploadGpuCullStaticBuffers() now precomputes exact per-mesh fwd/rev instance counts so each bucket reserves only the slots it needs (total visible_ssbo size unchanged — sum of fwd + rev = total). Draw loop issues two MDIs per model under IFC_GPU_CULL: first M commands CCW, next M commands CW. Sub-draws doubled (172k → 345k) which further regresses FPS due to command-processor overhead from zero-instance sub-draws — the same issue noted in 3a. MDI compaction remains the fix. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4fe32b5 commit 7defbe9

2 files changed

Lines changed: 95 additions & 41 deletions

File tree

src/ifcviewer/ViewportWindow.cpp

Lines changed: 81 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -278,14 +278,16 @@ static const char* CULL_COMPACT_COMPUTE_SHADER = R"(
278278
#version 450 core
279279
layout(local_size_x = 64) in;
280280
// Each instance contributes two vec4 entries: (min.xyz, mesh_id_as_float),
281-
// (max.xyz, flags_as_float). mesh_id is packed via floatBitsToUint.
281+
// (max.xyz, flags_as_float). mesh_id is packed via floatBitsToUint;
282+
// flags bit 0 = reflected (winding-bucket selector).
282283
layout(std430, binding = 0) readonly buffer AabbBuf { vec4 entries[]; };
283284
layout(std430, binding = 1) coherent buffer IndirectBuf { uint ind[]; };
284285
layout(std430, binding = 2) writeonly buffer VisibleBuf { uint visible[]; };
285286
layout(std430, binding = 3) readonly buffer MeshBaseBuf { uint mesh_base[]; };
286287
287288
uniform vec4 u_planes[6];
288289
uniform uint u_count; // num instances
290+
uniform uint u_fwd_mesh_count; // M; reflected bucket is mesh_id + M
289291
uniform vec3 u_camera_eye;
290292
uniform float u_focal_px;
291293
uniform float u_min_pixel_radius;
@@ -322,9 +324,12 @@ void main() {
322324
vec3 mx = hi.xyz;
323325
if (!frustum(mn, mx)) return;
324326
if (!contribution(mn, mx)) return;
325-
uint mesh_id = floatBitsToUint(lo.w);
326-
uint local = atomicAdd(ind[mesh_id * 5u + 1u], 1u);
327-
visible[mesh_base[mesh_id] + local] = gid;
327+
uint mesh_id = floatBitsToUint(lo.w);
328+
uint flags = floatBitsToUint(hi.w);
329+
uint bucket = ((flags & 1u) != 0u) ? (mesh_id + u_fwd_mesh_count)
330+
: mesh_id;
331+
uint local = atomicAdd(ind[bucket * 5u + 1u], 1u);
332+
visible[mesh_base[bucket] + local] = gid;
328333
}
329334
)";
330335

@@ -913,13 +918,27 @@ void ViewportWindow::uploadInstanceAabbs(ModelGpuData& m) {
913918

914919
void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
915920
const uint32_t M = static_cast<uint32_t>(m.meshes.size());
916-
m.gpu_mesh_command_count = M;
921+
m.gpu_mesh_command_count = 2u * M;
922+
m.gpu_forward_command_count = M;
917923

918-
// Prefix-sum instance_count to get per-mesh base offsets. Also build a
919-
// DrawElementsIndirectCommand template per mesh (count / firstIndex /
920-
// baseVertex / baseInstance static; instanceCount starts at 0).
921-
std::vector<uint32_t> mesh_base(M, 0);
922-
std::vector<DrawElementsIndirectCommand> indir(M);
924+
// Count fwd / rev instances per mesh so each bucket gets a tight
925+
// per-mesh slot range. (Sum of fwd + rev = total_instances, so the
926+
// visible buffer is no bigger than the single-bucket version.)
927+
std::vector<uint32_t> fwd_n(M, 0), rev_n(M, 0);
928+
for (size_t i = 0; i < m.instances.size(); ++i) {
929+
const uint32_t mid = m.instances[i].mesh_id;
930+
if (mid >= M) continue;
931+
const bool reflected = i < m.instance_reflected.size()
932+
&& m.instance_reflected[i];
933+
(reflected ? rev_n[mid] : fwd_n[mid]) += 1u;
934+
}
935+
936+
// Prefix sums. mesh_base[0..M) for fwd, mesh_base[M..2M) for rev.
937+
// Same layout for the indirect commands. baseInstance of each
938+
// command points at its visible[] slot so the vertex shader's
939+
// gl_BaseInstanceARB + gl_InstanceID indexes directly into it.
940+
std::vector<uint32_t> mesh_base(2u * M, 0);
941+
std::vector<DrawElementsIndirectCommand> indir(2u * M);
923942
uint32_t running = 0;
924943
for (uint32_t i = 0; i < M; ++i) {
925944
const MeshInfo& mesh = m.meshes[i];
@@ -930,12 +949,23 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
930949
cmd.firstIndex = mesh.ebo_byte_offset / sizeof(uint32_t);
931950
cmd.baseVertex = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
932951
cmd.baseInstance = running;
933-
running += mesh.instance_count;
952+
running += fwd_n[i];
953+
}
954+
for (uint32_t i = 0; i < M; ++i) {
955+
const MeshInfo& mesh = m.meshes[i];
956+
mesh_base[M + i] = running;
957+
DrawElementsIndirectCommand& cmd = indir[M + i];
958+
cmd.count = mesh.index_count;
959+
cmd.instanceCount = 0;
960+
cmd.firstIndex = mesh.ebo_byte_offset / sizeof(uint32_t);
961+
cmd.baseVertex = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
962+
cmd.baseInstance = running;
963+
running += rev_n[i];
934964
}
935965
const uint32_t total_instances = running;
936966

937-
// Indirect buffer.
938-
const size_t ind_bytes = std::max<size_t>(M * sizeof(DrawElementsIndirectCommand),
967+
// Indirect buffer — 2M commands (fwd bucket then rev bucket).
968+
const size_t ind_bytes = std::max<size_t>(2u * M * sizeof(DrawElementsIndirectCommand),
939969
sizeof(DrawElementsIndirectCommand));
940970
if (m.gpu_indirect_buffer && m.gpu_indirect_capacity < ind_bytes) {
941971
gl_->glDeleteBuffers(1, &m.gpu_indirect_buffer);
@@ -950,10 +980,10 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
950980
}
951981
if (M > 0) {
952982
gl_->glNamedBufferSubData(m.gpu_indirect_buffer, 0,
953-
M * sizeof(DrawElementsIndirectCommand), indir.data());
983+
2u * M * sizeof(DrawElementsIndirectCommand), indir.data());
954984
}
955985

956-
// Visible list — sized to worst case (every instance survives).
986+
// Visible list — exact: fwd + rev per-mesh counts sum to total_instances.
957987
const size_t vis_bytes = std::max<size_t>(total_instances * sizeof(uint32_t),
958988
sizeof(uint32_t));
959989
if (m.gpu_visible_ssbo && m.gpu_visible_capacity < vis_bytes) {
@@ -968,8 +998,8 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
968998
m.gpu_visible_capacity = vis_bytes;
969999
}
9701000

971-
// Mesh-base SSBO.
972-
const size_t mb_bytes = std::max<size_t>(M * sizeof(uint32_t), sizeof(uint32_t));
1001+
// Mesh-base SSBO — 2M entries (one per bucket).
1002+
const size_t mb_bytes = std::max<size_t>(2u * M * sizeof(uint32_t), sizeof(uint32_t));
9731003
if (m.gpu_mesh_base_ssbo && m.gpu_mesh_base_capacity < mb_bytes) {
9741004
gl_->glDeleteBuffers(1, &m.gpu_mesh_base_ssbo);
9751005
m.gpu_mesh_base_ssbo = 0;
@@ -983,7 +1013,7 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
9831013
}
9841014
if (M > 0) {
9851015
gl_->glNamedBufferSubData(m.gpu_mesh_base_ssbo, 0,
986-
M * sizeof(uint32_t), mesh_base.data());
1016+
2u * M * sizeof(uint32_t), mesh_base.data());
9871017
}
9881018
}
9891019

@@ -1932,10 +1962,10 @@ void ViewportWindow::render() {
19321962

19331963
// Phase 3E: the GPU-cull path. When IFC_GPU_CULL=1 we dispatch two
19341964
// tiny compute shaders per model (reset + compact), then let the draw
1935-
// loop below issue MDI from gpu_indirect_buffer. Single-bucket-per-
1936-
// mesh for now — LOD selection, reflection winding split, and HiZ
1937-
// still live only on the CPU path. Reflected instances therefore
1938-
// render with wrong winding under this gate; that's the next commit.
1965+
// loop below issue MDI from gpu_indirect_buffer. Commands are laid
1966+
// out as two buckets of M entries each — fwd (CCW) then rev (CW) —
1967+
// so reflected instances render with correct winding. LOD and HiZ
1968+
// still live only on the CPU path.
19391969
if (gpu_cull_enabled && cull_this_frame && cull_compact_program_) {
19401970
QElapsedTimer t; t.start();
19411971
float planes_flat[24];
@@ -1970,6 +2000,8 @@ void ViewportWindow::render() {
19702000
gl_->glUniform4fv(gl_->glGetUniformLocation(cull_compact_program_, "u_planes"),
19712001
6, planes_flat);
19722002
gl_->glUniform1ui(gl_->glGetUniformLocation(cull_compact_program_, "u_count"), n);
2003+
gl_->glUniform1ui(gl_->glGetUniformLocation(cull_compact_program_, "u_fwd_mesh_count"),
2004+
m.gpu_forward_command_count);
19732005
gl_->glUniform3f (gl_->glGetUniformLocation(cull_compact_program_, "u_camera_eye"),
19742006
camera_eye_.x(), camera_eye_.y(), camera_eye_.z());
19752007
gl_->glUniform1f (gl_->glGetUniformLocation(cull_compact_program_, "u_focal_px"),
@@ -2004,10 +2036,10 @@ void ViewportWindow::render() {
20042036
if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
20052037

20062038
if (gpu_cull_enabled) {
2007-
// GPU path: compact shader already wrote visible indices into
2008-
// gpu_visible_ssbo at [mesh_base[i], mesh_base[i]+count) and
2009-
// set each command's instanceCount. One MDI per model, no
2010-
// fwd/rev split yet — reflected winding is wrong; step 3b.
2039+
// GPU path: compact shader routed survivors into fwd/rev
2040+
// buckets (commands [0..M) and [M..2M)). Two MDIs: CCW then
2041+
// CW. LOD and HiZ still CPU-only; reflected winding is now
2042+
// correct.
20112043
if (!m.gpu_indirect_buffer || !m.gpu_visible_ssbo ||
20122044
m.gpu_mesh_command_count == 0) continue;
20132045

@@ -2017,18 +2049,29 @@ void ViewportWindow::render() {
20172049
gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, m.mesh_info_ssbo);
20182050
gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer);
20192051

2020-
uint32_t count = m.gpu_mesh_command_count;
2021-
if (max_subdraws < count) count = max_subdraws;
2022-
if (count > 0 && !skip_mdi) {
2052+
uint32_t fwd = m.gpu_forward_command_count;
2053+
uint32_t rev = m.gpu_mesh_command_count - fwd;
2054+
if (max_subdraws < m.gpu_mesh_command_count) {
2055+
const uint32_t total = m.gpu_mesh_command_count;
2056+
fwd = static_cast<uint32_t>((uint64_t)fwd * max_subdraws / total);
2057+
rev = max_subdraws - fwd;
2058+
}
2059+
if (fwd > 0 && !skip_mdi) {
20232060
gl_->glFrontFace(GL_CCW);
20242061
gl_->glMultiDrawElementsIndirect(
20252062
GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
2026-
static_cast<GLsizei>(count), 0);
2063+
static_cast<GLsizei>(fwd), 0);
20272064
++gl_draw_calls_;
20282065
}
2029-
// Stats: we don't have visible_objects / visible_triangles
2030-
// from the GPU yet (would need a readback). Report command
2031-
// count as a proxy for indirect_sub_draws_.
2066+
if (rev > 0 && !skip_mdi) {
2067+
gl_->glFrontFace(GL_CW);
2068+
gl_->glMultiDrawElementsIndirect(
2069+
GL_TRIANGLES, GL_UNSIGNED_INT,
2070+
reinterpret_cast<const void*>(m.gpu_forward_command_count * sizeof(DrawElementsIndirectCommand)),
2071+
static_cast<GLsizei>(rev), 0);
2072+
++gl_draw_calls_;
2073+
gl_->glFrontFace(GL_CCW);
2074+
}
20322075
indirect_sub_draws_ += m.gpu_mesh_command_count;
20332076
continue;
20342077
}
@@ -2140,11 +2183,15 @@ void ViewportWindow::render() {
21402183
gl_->glGetNamedBufferSubData(mm.gpu_indirect_buffer, 0,
21412184
mm.gpu_mesh_command_count * sizeof(DrawElementsIndirectCommand),
21422185
readback.data());
2186+
// Commands [0..M) are fwd, [M..2M) are rev for the same
2187+
// mesh — index meshes[] modulo forward_command_count.
2188+
const uint32_t M = mm.gpu_forward_command_count;
21432189
for (uint32_t i = 0; i < mm.gpu_mesh_command_count; ++i) {
21442190
const uint32_t ic = readback[i].instanceCount;
2191+
const uint32_t mesh_i = (M > 0) ? (i % M) : 0;
21452192
gpu_surv += ic;
21462193
gpu_obj += ic;
2147-
gpu_tri += ic * (mm.meshes[i].index_count / 3u);
2194+
gpu_tri += ic * (mm.meshes[mesh_i].index_count / 3u);
21482195
}
21492196
}
21502197
gpu_cull_last_survivors_ = gpu_surv;

src/ifcviewer/ViewportWindow.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,20 @@ struct ModelGpuData {
108108
// the instanceCount field of gpu_indirect_buffer is rewritten by the
109109
// cull shader (zeroed by the reset shader, atomically incremented as
110110
// survivors are appended into gpu_visible_ssbo at mesh_base[i] + local).
111-
GLuint gpu_indirect_buffer = 0;
112-
size_t gpu_indirect_capacity = 0;
113-
GLuint gpu_visible_ssbo = 0;
114-
size_t gpu_visible_capacity = 0;
115-
GLuint gpu_mesh_base_ssbo = 0;
116-
size_t gpu_mesh_base_capacity = 0;
117-
uint32_t gpu_mesh_command_count = 0;
111+
// Layout per model:
112+
// commands[0..M) fwd bucket (non-reflected, CCW winding)
113+
// commands[M..2M) rev bucket (reflected, CW winding)
114+
// gpu_mesh_command_count = 2M; gpu_forward_command_count = M.
115+
// Each bucket gets its own mesh_base[] slot and its own visible[]
116+
// range, sized to the exact per-mesh count of fwd / rev instances.
117+
GLuint gpu_indirect_buffer = 0;
118+
size_t gpu_indirect_capacity = 0;
119+
GLuint gpu_visible_ssbo = 0;
120+
size_t gpu_visible_capacity = 0;
121+
GLuint gpu_mesh_base_ssbo = 0;
122+
size_t gpu_mesh_base_capacity = 0;
123+
uint32_t gpu_mesh_command_count = 0;
124+
uint32_t gpu_forward_command_count = 0;
118125

119126
// Dynamic visible-instance index buffer (std430, binding = 1).
120127
// Re-uploaded each frame from visible_flat_.

0 commit comments

Comments (0)