@@ -436,6 +436,42 @@ void main() {
436436}
437437)" ;
438438
// Compaction ("pack") compute shader for the GPU-culled indirect draws.
//
// `src` holds u_total_cmds (4M) five-word indirect draw commands: slots
// [0, 2M) are the forward commands and slots [2M, 4M) the reverse commands,
// where 2M is passed in as u_fwd_cmds. Each invocation inspects one source
// command and, when its word 1 (instanceCount in the standard
// DrawElementsIndirectCommand layout) is non-zero, copies the whole command
// into the next free slot of its half of `dst`:
//   forward commands -> dst[0 ..]      counted in counts[0]
//   reverse commands -> dst[2M ..]     counted in counts[1]
//
// The host must zero counts[0..1] before every dispatch. Slots are handed
// out in atomic arrival order, so the packed order is not stable between
// dispatches; commands past the counted range in `dst` are stale and must
// not be drawn.
static const char* CULL_PACK_COMPUTE_SHADER = R"(
#version 450 core
layout(local_size_x = 64) in;
layout(std430, binding = 0) readonly  buffer SrcBuf   { uint src[]; };
layout(std430, binding = 1) writeonly buffer DstBuf   { uint dst[]; };
layout(std430, binding = 2) coherent  buffer CountBuf { uint counts[]; };
uniform uint u_total_cmds;  // command slots in src (forward + reverse)
uniform uint u_fwd_cmds;    // forward command slots; reverse slots follow
void main() {
    uint i = gl_GlobalInvocationID.x;
    if (i >= u_total_cmds) return;   // padding invocation of the last group
    uint ic = src[i * 5u + 1u];
    if (ic == 0u) return;            // empty command: nothing to draw
    bool is_fwd = (i < u_fwd_cmds);
    // Reserve one output slot in the matching half of dst.
    uint slot = is_fwd ? atomicAdd(counts[0], 1u)
                       : atomicAdd(counts[1], 1u) + u_fwd_cmds;
    dst[slot * 5u + 0u] = src[i * 5u + 0u];
    dst[slot * 5u + 1u] = ic;
    dst[slot * 5u + 2u] = src[i * 5u + 2u];
    dst[slot * 5u + 3u] = src[i * 5u + 3u];
    dst[slot * 5u + 4u] = src[i * 5u + 4u];
}
)";
465+
466+ #ifndef GL_PARAMETER_BUFFER_ARB
467+ #define GL_PARAMETER_BUFFER_ARB 0x80EE
468+ #endif
469+
470+ using PFN_glMultiDrawElementsIndirectCount = void (*)(
471+ GLenum mode, GLenum type, const void * indirect,
472+ GLintptr drawcount, GLsizei maxdrawcount, GLsizei stride);
473+ static PFN_glMultiDrawElementsIndirectCount glMultiDrawElementsIndirectCount_ = nullptr ;
474+
439475static GLuint linkComputeProgram (QOpenGLFunctions_4_5_Core* gl, const char * src) {
440476 GLuint cs = compileShader (gl, GL_COMPUTE_SHADER, src);
441477 GLuint prog = gl->glCreateProgram ();
@@ -618,6 +654,8 @@ ViewportWindow::~ViewportWindow() {
618654 if (m.gpu_visible_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_visible_ssbo );
619655 if (m.gpu_mesh_base_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_mesh_base_ssbo );
620656 if (m.gpu_mesh_flags_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_mesh_flags_ssbo );
657+ if (m.gpu_compacted_buffer ) gl_->glDeleteBuffers (1 , &m.gpu_compacted_buffer );
658+ if (m.gpu_draw_count_buffer ) gl_->glDeleteBuffers (1 , &m.gpu_draw_count_buffer );
621659 }
622660 if (axis_vao_) gl_->glDeleteVertexArrays (1 , &axis_vao_);
623661 if (axis_vbo_) gl_->glDeleteBuffers (1 , &axis_vbo_);
@@ -626,6 +664,7 @@ ViewportWindow::~ViewportWindow() {
626664 if (axis_program_) gl_->glDeleteProgram (axis_program_);
627665 if (cull_reset_program_) gl_->glDeleteProgram (cull_reset_program_);
628666 if (cull_compact_program_) gl_->glDeleteProgram (cull_compact_program_);
667+ if (cull_pack_program_) gl_->glDeleteProgram (cull_pack_program_);
629668 if (hiz_gpu_depth_prog_) gl_->glDeleteProgram (hiz_gpu_depth_prog_);
630669 if (hiz_gpu_copy_prog_) gl_->glDeleteProgram (hiz_gpu_copy_prog_);
631670 if (hiz_gpu_reduce_prog_) gl_->glDeleteProgram (hiz_gpu_reduce_prog_);
@@ -730,6 +769,20 @@ void ViewportWindow::buildShaders() {
730769 }
731770 hiz_gpu_copy_prog_ = linkComputeProgram (gl_, HIZ_COPY_COMPUTE_SHADER);
732771 hiz_gpu_reduce_prog_ = linkComputeProgram (gl_, HIZ_REDUCE_COMPUTE_SHADER);
772+ cull_pack_program_ = linkComputeProgram (gl_, CULL_PACK_COMPUTE_SHADER);
773+
774+ if (!glMultiDrawElementsIndirectCount_) {
775+ glMultiDrawElementsIndirectCount_ =
776+ reinterpret_cast <PFN_glMultiDrawElementsIndirectCount>(
777+ context_->getProcAddress (" glMultiDrawElementsIndirectCount" ));
778+ if (!glMultiDrawElementsIndirectCount_) {
779+ glMultiDrawElementsIndirectCount_ =
780+ reinterpret_cast <PFN_glMultiDrawElementsIndirectCount>(
781+ context_->getProcAddress (" glMultiDrawElementsIndirectCountARB" ));
782+ }
783+ if (!glMultiDrawElementsIndirectCount_)
784+ qWarning (" glMultiDrawElementsIndirectCount not available — MDI compaction disabled" );
785+ }
733786}
734787
735788void ViewportWindow::buildAxisGizmo () {
@@ -1164,6 +1217,26 @@ void ViewportWindow::uploadGpuCullStaticBuffers(ModelGpuData& m) {
11641217 gl_->glNamedBufferSubData (m.gpu_mesh_flags_ssbo , 0 ,
11651218 M * sizeof (uint32_t ), mesh_flags.data ());
11661219 }
1220+
1221+ // Compacted indirect buffer — same capacity as the source buffer.
1222+ if (m.gpu_compacted_buffer && m.gpu_compacted_capacity < ind_bytes) {
1223+ gl_->glDeleteBuffers (1 , &m.gpu_compacted_buffer );
1224+ m.gpu_compacted_buffer = 0 ;
1225+ m.gpu_compacted_capacity = 0 ;
1226+ }
1227+ if (!m.gpu_compacted_buffer ) {
1228+ gl_->glCreateBuffers (1 , &m.gpu_compacted_buffer );
1229+ gl_->glNamedBufferStorage (m.gpu_compacted_buffer , ind_bytes, nullptr ,
1230+ GL_DYNAMIC_STORAGE_BIT);
1231+ m.gpu_compacted_capacity = ind_bytes;
1232+ }
1233+
1234+ // Draw-count buffer — 2 × uint32: [fwd_count, rev_count].
1235+ if (!m.gpu_draw_count_buffer ) {
1236+ gl_->glCreateBuffers (1 , &m.gpu_draw_count_buffer );
1237+ gl_->glNamedBufferStorage (m.gpu_draw_count_buffer , 2 * sizeof (uint32_t ),
1238+ nullptr , GL_DYNAMIC_STORAGE_BIT);
1239+ }
11671240}
11681241
11691242void ViewportWindow::finalizeModel (uint32_t model_id) {
@@ -1244,6 +1317,8 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
12441317 if (existing->second .gpu_visible_ssbo ) gl_->glDeleteBuffers (1 , &existing->second .gpu_visible_ssbo );
12451318 if (existing->second .gpu_mesh_base_ssbo ) gl_->glDeleteBuffers (1 , &existing->second .gpu_mesh_base_ssbo );
12461319 if (existing->second .gpu_mesh_flags_ssbo ) gl_->glDeleteBuffers (1 , &existing->second .gpu_mesh_flags_ssbo );
1320+ if (existing->second .gpu_compacted_buffer ) gl_->glDeleteBuffers (1 , &existing->second .gpu_compacted_buffer );
1321+ if (existing->second .gpu_draw_count_buffer ) gl_->glDeleteBuffers (1 , &existing->second .gpu_draw_count_buffer );
12471322 models_gpu_.erase (existing);
12481323 }
12491324
@@ -1393,6 +1468,8 @@ void ViewportWindow::resetScene() {
13931468 if (m.gpu_visible_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_visible_ssbo );
13941469 if (m.gpu_mesh_base_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_mesh_base_ssbo );
13951470 if (m.gpu_mesh_flags_ssbo ) gl_->glDeleteBuffers (1 , &m.gpu_mesh_flags_ssbo );
1471+ if (m.gpu_compacted_buffer ) gl_->glDeleteBuffers (1 , &m.gpu_compacted_buffer );
1472+ if (m.gpu_draw_count_buffer ) gl_->glDeleteBuffers (1 , &m.gpu_draw_count_buffer );
13961473 }
13971474 models_gpu_.clear ();
13981475 selected_object_id_ = 0 ;
@@ -1435,6 +1512,8 @@ void ViewportWindow::removeModel(uint32_t model_id) {
14351512 if (it->second .gpu_visible_ssbo ) gl_->glDeleteBuffers (1 , &it->second .gpu_visible_ssbo );
14361513 if (it->second .gpu_mesh_base_ssbo ) gl_->glDeleteBuffers (1 , &it->second .gpu_mesh_base_ssbo );
14371514 if (it->second .gpu_mesh_flags_ssbo ) gl_->glDeleteBuffers (1 , &it->second .gpu_mesh_flags_ssbo );
1515+ if (it->second .gpu_compacted_buffer ) gl_->glDeleteBuffers (1 , &it->second .gpu_compacted_buffer );
1516+ if (it->second .gpu_draw_count_buffer ) gl_->glDeleteBuffers (1 , &it->second .gpu_draw_count_buffer );
14381517 models_gpu_.erase (it);
14391518 have_cached_cull_ = false ;
14401519 requestUpdate ();
@@ -2235,8 +2314,35 @@ void ViewportWindow::render() {
22352314 gl_->glMemoryBarrier (GL_COMMAND_BARRIER_BIT | GL_SHADER_STORAGE_BARRIER_BIT);
22362315 };
22372316
2317+ const bool mdi_count_available =
2318+ glMultiDrawElementsIndirectCount_ && cull_pack_program_;
2319+
2320+ // Pack non-empty commands into contiguous fwd / rev ranges.
2321+ auto dispatchPack = [&]() {
2322+ if (!mdi_count_available) return ;
2323+ for (auto & tgt : targets) {
2324+ ModelGpuData& m = *tgt.m ;
2325+ const uint32_t total_cmds = m.gpu_mesh_command_count ;
2326+ const uint32_t fwd_cmds = 2u * m.gpu_forward_command_count ;
2327+ const uint32_t zero[2 ] = {0 , 0 };
2328+ gl_->glNamedBufferSubData (m.gpu_draw_count_buffer , 0 ,
2329+ sizeof (zero), zero);
2330+ gl_->glUseProgram (cull_pack_program_);
2331+ gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0 , m.gpu_indirect_buffer );
2332+ gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1 , m.gpu_compacted_buffer );
2333+ gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 2 , m.gpu_draw_count_buffer );
2334+ gl_->glUniform1ui (gl_->glGetUniformLocation (cull_pack_program_, " u_total_cmds" ),
2335+ total_cmds);
2336+ gl_->glUniform1ui (gl_->glGetUniformLocation (cull_pack_program_, " u_fwd_cmds" ),
2337+ fwd_cmds);
2338+ gl_->glDispatchCompute ((total_cmds + 63u ) / 64u , 1 , 1 );
2339+ }
2340+ gl_->glMemoryBarrier (GL_COMMAND_BARRIER_BIT | GL_SHADER_STORAGE_BARRIER_BIT);
2341+ };
2342+
22382343 // Phase 1: cull without HiZ.
22392344 dispatchCull (false );
2345+ dispatchPack ();
22402346
22412347 // Depth pre-pass: render phase 1 survivors into hiz_gpu_fbo_.
22422348 gl_->glBindFramebuffer (GL_FRAMEBUFFER, hiz_gpu_fbo_);
@@ -2252,17 +2358,31 @@ void ViewportWindow::render() {
22522358 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0 , m.ssbo );
22532359 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1 , m.gpu_visible_ssbo );
22542360 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 2 , m.mesh_info_ssbo );
2255- gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer );
22562361 const uint32_t M = m.gpu_forward_command_count ;
2257- gl_->glFrontFace (GL_CCW);
2258- gl_->glMultiDrawElementsIndirect (
2259- GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2260- static_cast <GLsizei>(2u * M), 0 );
2261- gl_->glFrontFace (GL_CW);
2262- gl_->glMultiDrawElementsIndirect (
2263- GL_TRIANGLES, GL_UNSIGNED_INT,
2264- reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2265- static_cast <GLsizei>(2u * M), 0 );
2362+ if (mdi_count_available && m.gpu_compacted_buffer && m.gpu_draw_count_buffer ) {
2363+ gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_compacted_buffer );
2364+ gl_->glBindBuffer (GL_PARAMETER_BUFFER_ARB, m.gpu_draw_count_buffer );
2365+ gl_->glFrontFace (GL_CCW);
2366+ glMultiDrawElementsIndirectCount_ (
2367+ GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2368+ 0 , static_cast <GLsizei>(2u * M), 0 );
2369+ gl_->glFrontFace (GL_CW);
2370+ glMultiDrawElementsIndirectCount_ (
2371+ GL_TRIANGLES, GL_UNSIGNED_INT,
2372+ reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2373+ sizeof (uint32_t ), static_cast <GLsizei>(2u * M), 0 );
2374+ } else {
2375+ gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer );
2376+ gl_->glFrontFace (GL_CCW);
2377+ gl_->glMultiDrawElementsIndirect (
2378+ GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2379+ static_cast <GLsizei>(2u * M), 0 );
2380+ gl_->glFrontFace (GL_CW);
2381+ gl_->glMultiDrawElementsIndirect (
2382+ GL_TRIANGLES, GL_UNSIGNED_INT,
2383+ reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2384+ static_cast <GLsizei>(2u * M), 0 );
2385+ }
22662386 }
22672387 gl_->glColorMask (GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
22682388
@@ -2304,6 +2424,7 @@ void ViewportWindow::render() {
23042424 // the tighter set.
23052425 gl_->glMemoryBarrier (GL_TEXTURE_FETCH_BARRIER_BIT);
23062426 dispatchCull (true );
2427+ dispatchPack ();
23072428
23082429 gpu_cull_last_input_ = total_in;
23092430 gpu_cull_ns_ += t.nsecsElapsed ();
@@ -2330,41 +2451,58 @@ void ViewportWindow::render() {
23302451 if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0 ) continue ;
23312452
23322453 if (gpu_cull_enabled) {
2333- // GPU path: compact shader routed survivors into 4 buckets
2334- // (fwd_lod0, fwd_lod1, rev_lod0, rev_lod1), each with M
2335- // commands. CCW MDI for [0..2M), CW MDI for [2M..4M).
23362454 if (!m.gpu_indirect_buffer || !m.gpu_visible_ssbo ||
23372455 m.gpu_mesh_command_count == 0 ) continue ;
23382456
23392457 gl_->glBindVertexArray (m.vao );
23402458 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0 , m.ssbo );
23412459 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1 , m.gpu_visible_ssbo );
23422460 gl_->glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 2 , m.mesh_info_ssbo );
2343- gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer );
23442461
23452462 const uint32_t M = m.gpu_forward_command_count ;
2346- uint32_t fwd = 2u * M; // fwd_lod0 + fwd_lod1
2347- uint32_t rev = 2u * M; // rev_lod0 + rev_lod1
2348- if (max_subdraws < m.gpu_mesh_command_count ) {
2349- const uint32_t total = m.gpu_mesh_command_count ;
2350- fwd = static_cast <uint32_t >((uint64_t )fwd * max_subdraws / total);
2351- rev = max_subdraws - fwd;
2352- }
2353- if (fwd > 0 && !skip_mdi) {
2354- gl_->glFrontFace (GL_CCW);
2355- gl_->glMultiDrawElementsIndirect (
2356- GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2357- static_cast <GLsizei>(fwd), 0 );
2358- ++gl_draw_calls_;
2359- }
2360- if (rev > 0 && !skip_mdi) {
2361- gl_->glFrontFace (GL_CW);
2362- gl_->glMultiDrawElementsIndirect (
2363- GL_TRIANGLES, GL_UNSIGNED_INT,
2364- reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2365- static_cast <GLsizei>(rev), 0 );
2366- ++gl_draw_calls_;
2367- gl_->glFrontFace (GL_CCW);
2463+ if (glMultiDrawElementsIndirectCount_ &&
2464+ m.gpu_compacted_buffer && m.gpu_draw_count_buffer ) {
2465+ gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_compacted_buffer );
2466+ gl_->glBindBuffer (GL_PARAMETER_BUFFER_ARB, m.gpu_draw_count_buffer );
2467+ if (!skip_mdi) {
2468+ gl_->glFrontFace (GL_CCW);
2469+ glMultiDrawElementsIndirectCount_ (
2470+ GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2471+ 0 , static_cast <GLsizei>(2u * M), 0 );
2472+ ++gl_draw_calls_;
2473+ gl_->glFrontFace (GL_CW);
2474+ glMultiDrawElementsIndirectCount_ (
2475+ GL_TRIANGLES, GL_UNSIGNED_INT,
2476+ reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2477+ sizeof (uint32_t ), static_cast <GLsizei>(2u * M), 0 );
2478+ ++gl_draw_calls_;
2479+ gl_->glFrontFace (GL_CCW);
2480+ }
2481+ } else {
2482+ gl_->glBindBuffer (GL_DRAW_INDIRECT_BUFFER, m.gpu_indirect_buffer );
2483+ uint32_t fwd = 2u * M;
2484+ uint32_t rev = 2u * M;
2485+ if (max_subdraws < m.gpu_mesh_command_count ) {
2486+ const uint32_t total = m.gpu_mesh_command_count ;
2487+ fwd = static_cast <uint32_t >((uint64_t )fwd * max_subdraws / total);
2488+ rev = max_subdraws - fwd;
2489+ }
2490+ if (fwd > 0 && !skip_mdi) {
2491+ gl_->glFrontFace (GL_CCW);
2492+ gl_->glMultiDrawElementsIndirect (
2493+ GL_TRIANGLES, GL_UNSIGNED_INT, nullptr ,
2494+ static_cast <GLsizei>(fwd), 0 );
2495+ ++gl_draw_calls_;
2496+ }
2497+ if (rev > 0 && !skip_mdi) {
2498+ gl_->glFrontFace (GL_CW);
2499+ gl_->glMultiDrawElementsIndirect (
2500+ GL_TRIANGLES, GL_UNSIGNED_INT,
2501+ reinterpret_cast <const void *>(2u * M * sizeof (DrawElementsIndirectCommand)),
2502+ static_cast <GLsizei>(rev), 0 );
2503+ ++gl_draw_calls_;
2504+ gl_->glFrontFace (GL_CCW);
2505+ }
23682506 }
23692507 indirect_sub_draws_ += m.gpu_mesh_command_count ;
23702508 continue ;
@@ -2497,6 +2635,18 @@ void ViewportWindow::render() {
24972635 gpu_cull_last_survivors_ = gpu_surv;
24982636 visible_objects_ = gpu_obj;
24992637 visible_triangles_ = gpu_tri;
2638+
2639+ if (glMultiDrawElementsIndirectCount_) {
2640+ uint32_t compacted_sub_draws = 0 ;
2641+ uint32_t counts[2 ];
2642+ for (auto & [mid2, mm2] : models_gpu_) {
2643+ if (mm2.hidden || !mm2.gpu_draw_count_buffer ) continue ;
2644+ gl_->glGetNamedBufferSubData (mm2.gpu_draw_count_buffer , 0 ,
2645+ sizeof (counts), counts);
2646+ compacted_sub_draws += counts[0 ] + counts[1 ];
2647+ }
2648+ indirect_sub_draws_ = compacted_sub_draws;
2649+ }
25002650 }
25012651
25022652 FrameStats stats;
0 commit comments