@@ -278,14 +278,16 @@ static const char* CULL_COMPACT_COMPUTE_SHADER = R"(
278278#version 450 core
279279layout(local_size_x = 64) in;
280280// Each instance contributes two vec4 entries: (min.xyz, mesh_id_as_float),
281- // (max.xyz, flags_as_float). mesh_id is packed via floatBitsToUint.
281+ // (max.xyz, flags_as_float). mesh_id is packed via floatBitsToUint;
282+ // flags bit 0 = reflected (winding-bucket selector).
282283layout(std430, binding = 0) readonly buffer AabbBuf { vec4 entries[]; };
283284layout(std430, binding = 1) coherent buffer IndirectBuf { uint ind[]; };
284285layout(std430, binding = 2) writeonly buffer VisibleBuf { uint visible[]; };
285286layout(std430, binding = 3) readonly buffer MeshBaseBuf { uint mesh_base[]; };
286287
287288uniform vec4 u_planes[6];
288289uniform uint u_count; // num instances
290+ uniform uint u_fwd_mesh_count; // M; reflected bucket is mesh_id + M
289291uniform vec3 u_camera_eye;
290292uniform float u_focal_px;
291293uniform float u_min_pixel_radius;
@@ -322,9 +324,12 @@ void main() {
322324 vec3 mx = hi.xyz;
323325 if (!frustum(mn, mx)) return;
324326 if (!contribution(mn, mx)) return;
325- uint mesh_id = floatBitsToUint(lo.w);
326- uint local = atomicAdd(ind[mesh_id * 5u + 1u], 1u);
327- visible[mesh_base[mesh_id] + local] = gid;
327+ uint mesh_id = floatBitsToUint(lo.w);
328+ uint flags = floatBitsToUint(hi.w);
329+ uint bucket = ((flags & 1u) != 0u) ? (mesh_id + u_fwd_mesh_count)
330+ : mesh_id;
331+ uint local = atomicAdd(ind[bucket * 5u + 1u], 1u);
332+ visible[mesh_base[bucket] + local] = gid;
328333}
329334)" ;
330335
@@ -913,13 +918,27 @@ void ViewportWindow::uploadInstanceAabbs(ModelGpuData& m) {
913918
914919void ViewportWindow::uploadGpuCullStaticBuffers (ModelGpuData& m) {
915920 const uint32_t M = static_cast <uint32_t >(m.meshes .size ());
916- m.gpu_mesh_command_count = M;
921+ m.gpu_mesh_command_count = 2u * M;
922+ m.gpu_forward_command_count = M;
917923
918- // Prefix-sum instance_count to get per-mesh base offsets. Also build a
919- // DrawElementsIndirectCommand template per mesh (count / firstIndex /
920- // baseVertex / baseInstance static; instanceCount starts at 0).
921- std::vector<uint32_t > mesh_base (M, 0 );
922- std::vector<DrawElementsIndirectCommand> indir (M);
924+ // Count fwd / rev instances per mesh so each bucket gets a tight
925+ // per-mesh slot range. (Sum of fwd + rev = total_instances, so the
926+ // visible buffer is no bigger than the single-bucket version.)
927+ std::vector<uint32_t > fwd_n (M, 0 ), rev_n (M, 0 );
928+ for (size_t i = 0 ; i < m.instances .size (); ++i) {
929+ const uint32_t mid = m.instances [i].mesh_id ;
930+ if (mid >= M) continue ;
931+ const bool reflected = i < m.instance_reflected .size ()
932+ && m.instance_reflected [i];
933+ (reflected ? rev_n[mid] : fwd_n[mid]) += 1u ;
934+ }
935+
936+ // Prefix sums. mesh_base[0..M) for fwd, mesh_base[M..2M) for rev.
937+ // Same layout for the indirect commands. baseInstance of each
938+ // command points at its visible[] slot so the vertex shader's
939+ // gl_BaseInstanceARB + gl_InstanceID indexes directly into it.
940+ std::vector<uint32_t > mesh_base (2u * M, 0 );
941+ std::vector<DrawElementsIndirectCommand> indir (2u * M);
923942 uint32_t running = 0 ;
924943 for (uint32_t i = 0 ; i < M; ++i) {
925944 const MeshInfo& mesh = m.meshes [i];
@@ -930,12 +949,23 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
930949 cmd.firstIndex = mesh.ebo_byte_offset / sizeof (uint32_t );
931950 cmd.baseVertex = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
932951 cmd.baseInstance = running;
933- running += mesh.instance_count ;
952+ running += fwd_n[i];
953+ }
954+ for (uint32_t i = 0 ; i < M; ++i) {
955+ const MeshInfo& mesh = m.meshes [i];
956+ mesh_base[M + i] = running;
957+ DrawElementsIndirectCommand& cmd = indir[M + i];
958+ cmd.count = mesh.index_count ;
959+ cmd.instanceCount = 0 ;
960+ cmd.firstIndex = mesh.ebo_byte_offset / sizeof (uint32_t );
961+ cmd.baseVertex = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
962+ cmd.baseInstance = running;
963+ running += rev_n[i];
934964 }
935965 const uint32_t total_instances = running;
936966
937- // Indirect buffer.
938- const size_t ind_bytes = std::max<size_t >(M * sizeof (DrawElementsIndirectCommand),
967+ // Indirect buffer — 2M commands (fwd bucket, then rev bucket).
968+ const size_t ind_bytes = std::max<size_t >(2u * M * sizeof (DrawElementsIndirectCommand),
939969 sizeof (DrawElementsIndirectCommand));
940970 if (m.gpu_indirect_buffer && m.gpu_indirect_capacity < ind_bytes) {
941971 gl_->glDeleteBuffers (1 , &m.gpu_indirect_buffer );
@@ -950,10 +980,10 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
950980 }
951981 if (M > 0 ) {
952982 gl_->glNamedBufferSubData (m.gpu_indirect_buffer , 0 ,
953- M * sizeof (DrawElementsIndirectCommand), indir.data ());
983+ 2u * M * sizeof (DrawElementsIndirectCommand), indir.data ());
954984 }
955985
956- // Visible list — sized to worst case (every instance survives) .
986+ // Visible list — exact: fwd + rev per-mesh counts sum to total_instances (instances with mesh_id >= M are not counted).
957987 const size_t vis_bytes = std::max<size_t >(total_instances * sizeof (uint32_t ),
958988 sizeof (uint32_t ));
959989 if (m.gpu_visible_ssbo && m.gpu_visible_capacity < vis_bytes) {
@@ -968,8 +998,8 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
968998 m.gpu_visible_capacity = vis_bytes;
969999 }
9701000
971- // Mesh-base SSBO.
972- const size_t mb_bytes = std::max<size_t >(M * sizeof (uint32_t ), sizeof (uint32_t ));
1001+ // Mesh-base SSBO — 2M entries (one per bucket).
1002+ const size_t mb_bytes = std::max<size_t >(2u * M * sizeof (uint32_t ), sizeof (uint32_t ));
9731003 if (m.gpu_mesh_base_ssbo && m.gpu_mesh_base_capacity < mb_bytes) {
9741004 gl_->glDeleteBuffers (1 , &m.gpu_mesh_base_ssbo );
9751005 m.gpu_mesh_base_ssbo = 0 ;
@@ -983,7 +1013,7 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
9831013 }
9841014 if (M > 0 ) {
9851015 gl_->glNamedBufferSubData (m.gpu_mesh_base_ssbo , 0 ,
986- M * sizeof (uint32_t ), mesh_base.data ());
1016+ 2u * M * sizeof (uint32_t ), mesh_base.data ());
9871017 }
9881018}
9891019
@@ -1932,10 +1962,10 @@ void ViewportWindow::render() {
19321962
19331963 // Phase 3E: the GPU-cull path. When IFC_GPU_CULL=1 we dispatch two
19341964 // tiny compute shaders per model (reset + compact), then let the draw
1935- // loop below issue MDI from gpu_indirect_buffer. Single-bucket-per-
1936- // mesh for now — LOD selection, reflection winding split, and HiZ
1937- // still live only on the CPU path . Reflected instances therefore
1938- // render with wrong winding under this gate; that's the next commit .
1965+ // loop below issue MDI from gpu_indirect_buffer. Commands are laid
1966+ // out as two buckets of M entries each — fwd (CCW) then rev (CW) —
1967+ // so reflected instances render with correct winding. LOD and HiZ
1968+ // still live only on the CPU path.
19391969 if (gpu_cull_enabled && cull_this_frame && cull_compact_program_) {
19401970 QElapsedTimer t; t.start ();
19411971 float planes_flat[24 ];
@@ -1970,6 +2000,8 @@ void ViewportWindow::render() {
19702000 gl_->glUniform4fv (gl_->glGetUniformLocation (cull_compact_program_, " u_planes" ),
19712001 6 , planes_flat);
19722002 gl_->glUniform1ui (gl_->glGetUniformLocation (cull_compact_program_, " u_count" ), n);
2003+ gl_->glUniform1ui (gl_->glGetUniformLocation (cull_compact_program_, " u_fwd_mesh_count" ),
2004+ m.gpu_forward_command_count );
19732005 gl_->glUniform3f (gl_->glGetUniformLocation (cull_compact_program_, " u_camera_eye" ),
19742006 camera_eye_.x (), camera_eye_.y (), camera_eye_.z ());
19752007 gl_->glUniform1f (gl_->glGetUniformLocation (cull_compact_program_, " u_focal_px" ),
@@ -2004,10 +2036,10 @@ void ViewportWindow::render() {
20042036 if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0 ) continue ;
20052037
20062038 if (gpu_cull_enabled) {
2007- // GPU path: compact shader already wrote visible indices into
2008- // gpu_visible_ssbo at [mesh_base[i], mesh_base[i]+count) and
2009- // set each command's instanceCount. One MDI per model, no
2010- // fwd/rev split yet — reflected winding is wrong; step 3b .
2039+ // GPU path: compact shader routed survivors into fwd/rev
2040+ // buckets (commands [0..M) and [M..2M)). Two MDIs: CCW then
2041+ // CW. LOD and HiZ still CPU-only; reflected winding is now
2042+ // correct.
20112043 if (!m.gpu_indirect_buffer || !m.gpu_visible_ssbo ||
20122044 m.gpu_mesh_command_count == 0 ) continue ;
20132045
@@ -2017,18 +2049,29 @@ void ViewportWindow::render() {
20172049 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 2 , m.mesh_info_ssbo );
20182050 gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer );
20192051
2020- uint32_t count = m.gpu_mesh_command_count ;
2021- if (max_subdraws < count) count = max_subdraws;
2022- if (count > 0 && !skip_mdi) {
2052+ uint32_t fwd = m.gpu_forward_command_count ;
2053+ uint32_t rev = m.gpu_mesh_command_count - fwd;
2054+ if (max_subdraws < m.gpu_mesh_command_count ) {
2055+ const uint32_t total = m.gpu_mesh_command_count ;
2056+ fwd = static_cast <uint32_t >((uint64_t )fwd * max_subdraws / total);
2057+ rev = max_subdraws - fwd;
2058+ }
2059+ if (fwd > 0 && !skip_mdi) {
20232060 gl_->glFrontFace (GL_CCW);
20242061 gl_->glMultiDrawElementsIndirect (
20252062 GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2026- static_cast <GLsizei>(count ), 0 );
2063+ static_cast <GLsizei>(fwd ), 0 );
20272064 ++gl_draw_calls_;
20282065 }
2029- // Stats: we don't have visible_objects / visible_triangles
2030- // from the GPU yet (would need a readback). Report command
2031- // count as a proxy for indirect_sub_draws_.
2066+ if (rev > 0 && !skip_mdi) {
2067+ gl_->glFrontFace (GL_CW);
2068+ gl_->glMultiDrawElementsIndirect (
2069+ GL_TRIANGLES, GL_UNSIGNED_INT,
2070+ reinterpret_cast <const void *>(m.gpu_forward_command_count * sizeof (DrawElementsIndirectCommand)),
2071+ static_cast <GLsizei>(rev), 0 );
2072+ ++gl_draw_calls_;
2073+ gl_->glFrontFace (GL_CCW);
2074+ }
20322075 indirect_sub_draws_ += m.gpu_mesh_command_count ;
20332076 continue ;
20342077 }
@@ -2140,11 +2183,15 @@ void ViewportWindow::render() {
21402183 gl_->glGetNamedBufferSubData (mm.gpu_indirect_buffer , 0 ,
21412184 mm.gpu_mesh_command_count * sizeof (DrawElementsIndirectCommand),
21422185 readback.data ());
2186+ // Commands [0..M) are fwd, [M..2M) are rev for the same
2187+ // mesh — index meshes[] modulo forward_command_count.
2188+ const uint32_t M = mm.gpu_forward_command_count ;
21432189 for (uint32_t i = 0 ; i < mm.gpu_mesh_command_count ; ++i) {
21442190 const uint32_t ic = readback[i].instanceCount ;
2191+ const uint32_t mesh_i = (M > 0 ) ? (i % M) : 0 ;
21452192 gpu_surv += ic;
21462193 gpu_obj += ic;
2147- gpu_tri += ic * (mm.meshes [i ].index_count / 3u );
2194+ gpu_tri += ic * (mm.meshes [mesh_i ].index_count / 3u );
21482195 }
21492196 }
21502197 gpu_cull_last_survivors_ = gpu_surv;
0 commit comments