From 907fd036c7af169df7d8b6639ae96e38dff90343 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 16:25:09 +1000
Subject: [PATCH 01/37] Bump util from v0.8.0 branch

---
 .../ifcopenshell/util/alignment.py            |   7 +-
 .../ifcopenshell/util/attribute.py            |   3 +-
 .../ifcopenshell/util/brick.py                |   6 +-
 .../ifcopenshell/util/classification.py       |   3 +-
 .../ifcopenshell/util/constraint.py           |   3 +-
 .../ifcopenshell/util/cost.py                 |  12 +-
 .../ifcopenshell/util/data.py                 |   7 +-
 .../ifcopenshell/util/date.py                 |   8 +-
 .../ifcopenshell/util/doc.py                  |  19 +-
 .../ifcopenshell/util/element.py              |  35 ++-
 .../ifcopenshell/util/file.py                 |   3 +-
 .../ifcopenshell/util/fm.py                   |   6 +-
 .../util/generate_pset_templates.py           |  16 +-
 .../ifcopenshell/util/geolocation.py          |  15 +-
 .../util/ifc4x3dev_scrape_data_for_docs.py    |  15 +-
 .../ifcopenshell/util/mvd_info.py             |   8 +-
 .../ifcopenshell/util/placement.py            |   6 +-
 .../ifcopenshell/util/pset.py                 |   9 +-
 .../ifcopenshell/util/representation.py       |   9 +-
 .../ifcopenshell/util/resource.py             |   6 +-
 .../ifcopenshell/util/schema.py               |  13 +-
 .../util/schema/ifc_classes_suggestions.json  |  20 ++
 .../util/scripts/validate_stub.py             |  27 +-
 .../ifcopenshell/util/selector.py             | 205 ++++++++++----
 .../ifcopenshell/util/sequence.py             |  10 +-
 .../ifcopenshell/util/shape.py                |  30 +-
 .../ifcopenshell/util/shape_builder.py        | 261 +++++++++++++-----
 .../ifcopenshell/util/system.py               |   3 +-
 .../ifcopenshell/util/type.py                 |   3 +-
 .../ifcopenshell/util/unit.py                 |  14 +-
 30 files changed, 555 insertions(+), 227 deletions(-)

diff --git a/src/ifcopenshell-python/ifcopenshell/util/alignment.py b/src/ifcopenshell-python/ifcopenshell/util/alignment.py
index a2e9c08455f..1b90ac7c5f0 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/alignment.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/alignment.py
@@ -17,6 +17,7 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 import math
+
 import ifcopenshell
 import ifcopenshell.util.unit
 
@@ -24,7 +25,7 @@
 def add_linear_placement_fallback_position(file: ifcopenshell.file) -> ifcopenshell.file:
     import ifcopenshell.api.alignment
 
-    patched_file = ifcopenshell.file.from_string(file.to_string())
+    patched_file = ifcopenshell.file.from_string(file.wrapped_data.to_string())
 
     linear_placements = patched_file.by_type("IfcLinearPlacement")
     for lp in linear_placements:
@@ -36,7 +37,7 @@ def add_linear_placement_fallback_position(file: ifcopenshell.file) -> ifcopensh
 def create_alignment_geometry(file: ifcopenshell.file) -> ifcopenshell.file:
     import ifcopenshell.api.alignment
 
-    patched_file = ifcopenshell.file.from_string(file.to_string())
+    patched_file = ifcopenshell.file.from_string(file.wrapped_data.to_string())
 
     alignments = patched_file.by_type("IfcAlignment")
     for alignment in alignments:
@@ -49,7 +50,7 @@ def append_zero_length_segments(file: ifcopenshell.file) -> ifcopenshell.file:
     """Appends zero length segments to all alignment layouts and layout geometry, if missing."""
     import ifcopenshell.api.alignment
 
-    patched_file = ifcopenshell.file.from_string(file.to_string())
+    patched_file = ifcopenshell.file.from_string(file.wrapped_data.to_string())
 
     alignments = patched_file.by_type("IfcAlignment")
     for alignment in alignments:
diff --git a/src/ifcopenshell-python/ifcopenshell/util/attribute.py b/src/ifcopenshell-python/ifcopenshell/util/attribute.py
index a3212aca672..0e34b5c0225 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/attribute.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/attribute.py
@@ -16,8 +16,9 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from typing import Literal, Union
+
 import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
-from typing import Union, Literal
 
 PrimitiveType = Literal["entity", "string", "float", "integer", "boolean", "enum", "binary"]
 ComplexPrimitiveType = Literal["list", "array", "set"]
diff --git a/src/ifcopenshell-python/ifcopenshell/util/brick.py b/src/ifcopenshell-python/ifcopenshell/util/brick.py
index 04e0be5452d..ec358c2b98e 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/brick.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/brick.py
@@ -16,14 +16,14 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import os
 import json
+import os
+from typing import Union
+
 import ifcopenshell
 import ifcopenshell.util.classification
 import ifcopenshell.util.element
 import ifcopenshell.util.system
-from typing import Union
-
 
 cwd = os.path.dirname(os.path.realpath(__file__))
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/classification.py b/src/ifcopenshell-python/ifcopenshell/util/classification.py
index 06f9b0e31ae..df36eb97a48 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/classification.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/classification.py
@@ -16,9 +16,10 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import ifcopenshell.util.element
 from typing import Optional
 
+import ifcopenshell.util.element
+
 
 def get_references(element: ifcopenshell.entity_instance, should_inherit=True) -> set[ifcopenshell.entity_instance]:
     """Gets classification references associated with the element
diff --git a/src/ifcopenshell-python/ifcopenshell/util/constraint.py b/src/ifcopenshell-python/ifcopenshell/util/constraint.py
index 2c370aa997d..6d52d746763 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/constraint.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/constraint.py
@@ -18,9 +18,10 @@
 #
 #
 
-import ifcopenshell
 from typing import Union
 
+import ifcopenshell
+
 
 def get_constraints(product: ifcopenshell.entity_instance) -> list[ifcopenshell.entity_instance]:
     """
diff --git a/src/ifcopenshell-python/ifcopenshell/util/cost.py b/src/ifcopenshell-python/ifcopenshell/util/cost.py
index a7e96221d99..875594f1a5e 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/cost.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/cost.py
@@ -16,10 +16,12 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from collections.abc import Generator
+from typing import Any, Literal, Optional, Union
+
 import lark
+
 import ifcopenshell
-from typing import Optional, Union, Literal, Any
-from collections.abc import Generator
 import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
 import ifcopenshell.util.attribute
 import ifcopenshell.util.element
@@ -350,8 +352,7 @@ def get_cost_rate(
 
 class CostValueUnserialiser:
     def parse(self, formula: str):
-        l = lark.Lark(
-            """start: formula
+        l = lark.Lark("""start: formula
                     formula: operand (operator operand)*
                     operand: value | category "(" formula ")"
                     value: NUMBER?
@@ -388,8 +389,7 @@ def parse(self, formula: str):
                     NEWLINE: (CR? LF)+
 
                     %ignore WS // Disregard spaces in text
-                 """
-        )
+                 """)
         start = l.parse(formula)
         return self.get_formula(start.children[0])
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/data.py b/src/ifcopenshell-python/ifcopenshell/util/data.py
index c65a87ddb0f..4169b1ea4d2 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/data.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/data.py
@@ -17,10 +17,13 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Union
+
 import numpy as np
+
 import ifcopenshell
-from typing import Any, Union
-from dataclasses import dataclass
 from ifcopenshell.util.shape_builder import ShapeBuilder
 
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/date.py b/src/ifcopenshell-python/ifcopenshell/util/date.py
index 3b0761f75cd..dcd1c36ca5b 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/date.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/date.py
@@ -16,12 +16,14 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import ifcopenshell
 import datetime
-import isodate
 from re import findall
+from typing import Any, Literal, Union, overload
+
+import isodate
 from dateutil import parser
-from typing import Literal, Union, Any, overload
+
+import ifcopenshell
 
 
 def timedelta2duration(timedelta):
diff --git a/src/ifcopenshell-python/ifcopenshell/util/doc.py b/src/ifcopenshell-python/ifcopenshell/util/doc.py
index 41d6189fe4e..b0443f6117f 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/doc.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/doc.py
@@ -16,27 +16,30 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+import copy
 import json
 from pathlib import Path
-import copy
+from typing import Optional, TypedDict, Union
+
+from typing_extensions import NotRequired
+
 import ifcopenshell
 import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
 import ifcopenshell.util.attribute
 import ifcopenshell.util.schema
-from typing import Optional, Literal, Any, Union, TypedDict
-from typing_extensions import NotRequired
 
 try:
     import glob
+    import re
+    import shutil
+    import urllib.parse
     import warnings
+    import zipfile
+
     import requests
-    import urllib.parse
-    from markdown import markdown
     from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
-    import zipfile
     from lxml import etree
-    import re
-    import shutil
+    from markdown import markdown
 except:
     pass  # Only necessary if you're using it to generate the docs database
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/element.py b/src/ifcopenshell-python/ifcopenshell/util/element.py
index 7bcce391712..0aeccccea47 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/element.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/element.py
@@ -16,14 +16,14 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from collections import namedtuple
+from collections.abc import Callable, Generator, Sequence
+from typing import Any, Literal, Optional, Union, overload
+
 import ifcopenshell
 import ifcopenshell.guid
 import ifcopenshell.util.element
 import ifcopenshell.util.representation
-from typing import Any, Callable, Optional, Union, Literal, overload
-from collections.abc import Generator, Sequence
-from collections import deque, namedtuple
-
 
 MATERIAL_TYPE = Literal[
     "IfcMaterial",
@@ -737,7 +737,7 @@ def get_material(
                 return relationship.RelatingMaterial
     if should_inherit:
         relating_type = get_type(element)
-        if relating_type != element and (has_associations := getattr(relating_type, "HasAssociations", None)):
+        if relating_type is not None and relating_type != element and (has_associations := getattr(relating_type, "HasAssociations", None)):
             return get_material(relating_type, should_skip_usage)
 
 
@@ -958,7 +958,7 @@ def get_elements_by_profile(profile: ifcopenshell.entity_instance) -> set[ifcope
     :return: The elements using the profile.
     """
     ifc_file = profile.file
-    queue = list(ifc_file.get_inverse(profile))
+    queue = ifc_file.get_inverse(profile)
     processed: set[ifcopenshell.entity_instance] = set()
     representations: set[ifcopenshell.entity_instance] = set()
     while queue:
@@ -1234,7 +1234,9 @@ def get_controls(element: ifcopenshell.entity_instance) -> Generator[ifcopenshel
             yield rel.RelatingControl
 
 
-def get_parent(element: ifcopenshell.entity_instance) -> Union[ifcopenshell.entity_instance, None]:
+def get_parent(
+    element: ifcopenshell.entity_instance, ifc_class: Optional[str] = None
+) -> Union[ifcopenshell.entity_instance, None]:
     """Get the parent in the spatial heirarchy
 
     IFC features a spatial hierarchy tree of all objects. Each spatial element
@@ -1251,6 +1253,8 @@ def get_parent(element: ifcopenshell.entity_instance) -> Union[ifcopenshell.enti
     - Voiding: the opening voids another physical element, such as a hole in a wall
 
     :param element: Any physical or spatial element in the tree
+    :param ifc_class: Optionally filter the type of parent you're after. For
+        example, you may be after the storey, not a space.
     :return: Its parent. This must exist for any valid file, or None if we've reached the IfcProject.
 
     Example:
@@ -1260,7 +1264,7 @@ def get_parent(element: ifcopenshell.entity_instance) -> Union[ifcopenshell.enti
         element = file.by_type("IfcWall")[0]
         parent = ifcopenshell.util.element.get_parent(element)
     """
-    return (
+    parent = (
         get_container(element, should_get_direct=True)
         or get_aggregate(element)
         or get_nest(element)
@@ -1268,6 +1272,16 @@ def get_parent(element: ifcopenshell.entity_instance) -> Union[ifcopenshell.enti
         or get_voided_element(element)
     )
 
+    if not ifc_class:
+        return parent
+
+    while parent:
+        if parent.is_a(ifc_class):
+            return parent
+        parent = get_parent(parent)
+
+    return None
+
 
 def get_filled_void(element: ifcopenshell.entity_instance) -> Union[ifcopenshell.entity_instance, None]:
     """If the element is filling a void, get the void
@@ -1661,14 +1675,14 @@ def are_inverses_contained() -> bool:
     subgraph = list(ifc_file.traverse(element, breadth_first=True))
     subgraph.extend(also_consider)
     subgraph_set = set(subgraph)
-    subelement_queue = deque([element])
+    subelement_queue = [element]
 
     # Cache already processed entities to avoid traversing them multiple time.
     # E.g. lots of IFCINDEXEDPOLYCURVES may reference the same IFCCARTESIANPOINTLIST2D.
     processed_ids: set[int] = set()
 
     while subelement_queue:
-        subelement = subelement_queue.popleft()
+        subelement = subelement_queue.pop(0)
         subelement_id = subelement.id()
         if (
             subelement_id
@@ -1703,7 +1717,6 @@ def are_inverses_contained() -> bool:
 
     # We delete elements from subgraph in reverse order to allow batching to work
     for subelement in filter(lambda e: e in to_delete, subgraph[::-1]):
-        to_delete.remove(subelement)
         ifc_file.remove(subelement)
     # ifc_file.unbatch()
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/file.py b/src/ifcopenshell-python/ifcopenshell/util/file.py
index 4875ddf3b79..a874e1ba4d9 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/file.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/file.py
@@ -17,7 +17,8 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 import zipfile
-from typing import IO, Union, TypedDict
+from typing import IO, TypedDict, Union
+
 from typing_extensions import NotRequired
 
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/fm.py b/src/ifcopenshell-python/ifcopenshell/util/fm.py
index c9b1977e4ec..bfb7391dc3d 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/fm.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/fm.py
@@ -16,11 +16,13 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from typing import Literal
+
+from typing_extensions import assert_never
+
 import ifcopenshell
 import ifcopenshell.ifcopenshell_wrapper as W
 import ifcopenshell.util.attribute
-from typing import Literal
-from typing_extensions import assert_never
 
 # COBie actually uses an exclusion list, but this inclusion list is equivalent.
 cobie_type_classes = [
diff --git a/src/ifcopenshell-python/ifcopenshell/util/generate_pset_templates.py b/src/ifcopenshell-python/ifcopenshell/util/generate_pset_templates.py
index 44d5bc28cf2..681c0740097 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/generate_pset_templates.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/generate_pset_templates.py
@@ -18,22 +18,22 @@
 
 RUN_FROM_DEV_REPO = False
 
-import ifcopenshell.ifcopenshell_wrapper as W
-import ifcopenshell.api.unit
-import ifcopenshell.api.project
-import ifcopenshell.guid
-import ifcopenshell.util.attribute
 import glob
 import sys
-from pathlib import Path
-from lxml import etree
 from itertools import chain
+from pathlib import Path
 from typing import cast
 
+from lxml import etree
+
+import ifcopenshell.api.project
+import ifcopenshell.api.unit
+import ifcopenshell.guid
+import ifcopenshell.ifcopenshell_wrapper as W
 
 if not RUN_FROM_DEV_REPO:
-    import zipfile
     import shutil
+    import zipfile
 
 BASE_MODULE_PATH = Path(__file__).parent
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/geolocation.py b/src/ifcopenshell-python/ifcopenshell/util/geolocation.py
index 748ba8f3c39..47c8886691a 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/geolocation.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/geolocation.py
@@ -17,12 +17,14 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 import math
+from decimal import ROUND_HALF_UP, Decimal
+from typing import Any, NamedTuple, Optional, Union
+
 import numpy as np
+
 import ifcopenshell
 import ifcopenshell.util.element
 import ifcopenshell.util.placement
-from typing import NamedTuple, Optional, Union
-from decimal import Decimal, ROUND_HALF_UP
 
 MatrixType = ifcopenshell.util.placement.MatrixType
 
@@ -266,6 +268,15 @@ def get_helmert_transformation_parameters(ifc_file: ifcopenshell.file) -> Option
     return HelmertTransformation(e, n, h, xaa, xao, scale, factor_x, factor_y, factor_z)
 
 
+def get_crs(ifc_file: ifcopenshell.file) -> dict[str, Any]:
+    """Get CRS information from an IFC file (None if a non-IFC2X3 file has no coordinate operation)."""
+    if ifc_file.schema == "IFC2X3":  # IFC2X3: CRS is read from a pset on the first IfcProject
+        return ifcopenshell.util.element.get_pset(ifc_file.by_type("IfcProject")[0], "ePSet_ProjectedCRS")
+    for context in ifc_file.by_type("IfcGeometricRepresentationContext", include_subtypes=False):
+        if operation := context.HasCoordinateOperation:  # first context with a coordinate operation wins
+            return operation[0].TargetCRS.get_info()  # NOTE(review): no match falls through -> implicit None
+
+
 def auto_z2e(ifc_file: ifcopenshell.file, z: float, should_return_in_map_units: bool = True) -> float:
     """Convert a Z coordinate to an elevation using model georeferencing data
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/ifc4x3dev_scrape_data_for_docs.py b/src/ifcopenshell-python/ifcopenshell/util/ifc4x3dev_scrape_data_for_docs.py
index 02e4f1f4913..86ab81ef4c1 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/ifc4x3dev_scrape_data_for_docs.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/ifc4x3dev_scrape_data_for_docs.py
@@ -17,7 +17,12 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 try:
-    from server import get_resource_path, resource_documentation_builder, process_markdown, R
+    from server import (
+        R,
+        get_resource_path,
+        process_markdown,
+        resource_documentation_builder,
+    )
 except ModuleNotFoundError as e:
     print(
         "ERROR. Failed to import `server.py`.\n"
@@ -26,13 +31,15 @@
     raise e
 
 import itertools
-import operator
 import json
-import ifcopenshell
+import operator
 from collections import Counter
-from bs4 import BeautifulSoup
 from typing import Any, Union
 
+from bs4 import BeautifulSoup
+
+import ifcopenshell
+
 
 # Hacky modified functions from server.py to make parser work
 def get_definition_from_md(resource: str, mdc: str) -> str:
diff --git a/src/ifcopenshell-python/ifcopenshell/util/mvd_info.py b/src/ifcopenshell-python/ifcopenshell/util/mvd_info.py
index a4ca36039dc..a3bdb073375 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/mvd_info.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/mvd_info.py
@@ -26,8 +26,8 @@
 except ImportError:
     LARK_AVAILABLE = False
 
-from typing import Callable, Union
 import re
+from typing import Union
 
 if LARK_AVAILABLE:
     mvd_grammar = r"""
@@ -51,9 +51,9 @@
 
         value: /[A-Za-z0-9 _\.-]+/
 
-        other_keyword: /[^\[\]]+/  
-        
-        dynamic_option_word: /[^\[\]]+/ 
+        other_keyword: /[^\[\]]+/
+
+        dynamic_option_word: /[^\[\]]+/
 
         %import common.WS
         %ignore WS
diff --git a/src/ifcopenshell-python/ifcopenshell/util/placement.py b/src/ifcopenshell-python/ifcopenshell/util/placement.py
index e7e7b99a682..cc37ab99060 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/placement.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/placement.py
@@ -16,11 +16,13 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from collections.abc import Iterable
+from typing import Literal, Optional
+
 import numpy as np
 import numpy.typing as npt
+
 import ifcopenshell
-from typing import Literal, Optional
-from collections.abc import Iterable
 
 MatrixType = npt.NDArray[np.float64]
 """`npt.NDArray[np.float64]`"""
diff --git a/src/ifcopenshell-python/ifcopenshell/util/pset.py b/src/ifcopenshell-python/ifcopenshell/util/pset.py
index db1b4736c9c..68fc84a5b11 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/pset.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/pset.py
@@ -16,15 +16,16 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import re
 import pathlib
+import re
+from functools import lru_cache
+from typing import Literal, NamedTuple, Optional, Union
+
 import ifcopenshell
 import ifcopenshell.ifcopenshell_wrapper as W
 import ifcopenshell.util.schema
 import ifcopenshell.util.type
-from ifcopenshell import entity_instance
-from functools import lru_cache
-from typing import Optional, Literal, NamedTuple, Union
+from ifcopenshell.entity_instance import entity_instance
 
 templates: dict[ifcopenshell.util.schema.IFC_SCHEMA, "PsetQto"] = {}
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/representation.py b/src/ifcopenshell-python/ifcopenshell/util/representation.py
index 10743294b06..f0c2b54fcae 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/representation.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/representation.py
@@ -16,15 +16,16 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from collections.abc import Generator, Sequence
+from typing import Literal, Optional, TypedDict, Union
+
 import numpy as np
 import numpy.typing as npt
+
 import ifcopenshell
-import ifcopenshell.util.representation
 import ifcopenshell.util.placement
+import ifcopenshell.util.representation
 import ifcopenshell.util.shape
-from typing import Optional, Union, TypedDict, Literal
-from collections.abc import Generator, Sequence
-
 
 CONTEXT_TYPE = Literal["Model", "Plan", "NotDefined"]
 REPRESENTATION_IDENTIFIER = Literal[
diff --git a/src/ifcopenshell-python/ifcopenshell/util/resource.py b/src/ifcopenshell-python/ifcopenshell/util/resource.py
index f518a091250..262ddda125c 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/resource.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/resource.py
@@ -16,11 +16,11 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from typing import Any, Union
+
 import ifcopenshell.util.cost
-import ifcopenshell.util.element
 import ifcopenshell.util.date
-from typing import Union, Any
-
+import ifcopenshell.util.element
 
 PRODUCTIVITY_PSET_DATA = Union[dict[str, Any], None]
 # https://ifc43-docs.standards.buildingsmart.org/IFC/RELEASE/IFC4x3/HTML/lexical/IfcConstructionResource.htm#Table-7.3.3.7.1.3.H
diff --git a/src/ifcopenshell-python/ifcopenshell/util/schema.py b/src/ifcopenshell-python/ifcopenshell/util/schema.py
index e3e9ad7540a..bdd6d489d0f 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/schema.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/schema.py
@@ -16,13 +16,14 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import os
 import json
+import os
 import time
+from typing import Any, Literal, Union
+
 import ifcopenshell
-import ifcopenshell.util.attribute
 import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
-from typing import Union, Any, Literal
+import ifcopenshell.util.attribute
 
 # This is highly experimental and incomplete, however, it may work for simple datasets.
 
@@ -70,7 +71,7 @@ def get_declaration(element: ifcopenshell.entity_instance):
         print(declaration.is_abstract()) # False
         print(declaration.supertype().name()) # IfcBuildingElement
     """
-    return element.declaration
+    return element.wrapped_data.declaration().as_entity()
 
 
 def is_a(declaration: ifcopenshell.ifcopenshell_wrapper.declaration, ifc_class: str) -> bool:
@@ -104,7 +105,7 @@ def get_supertypes(
     .. code:: python
 
         wall = model.createIfcWall()
-        results = ifcopenshell.util.schema.get_supertypes(wall.declaration.as_entity())
+        results = ifcopenshell.util.schema.get_supertypes(wall.wrapped_data.declaration().as_entity())
         # [<entity IfcBuildingElement>, <entity IfcElement>, ..., <entity IfcRoot>]
     """
     results = []
@@ -462,7 +463,7 @@ def migrate_attribute(
     ) -> None:
         # NOTE: `attribute` is an attribute in new file schema
         # print("Migrating attribute", element, new_element, attribute.name())
-        old_file = element.file
+        old_file = element.wrapped_data.file
         if hasattr(element, attribute.name()):
             value = getattr(element, attribute.name())
             # print("Attribute names matched", value)
diff --git a/src/ifcopenshell-python/ifcopenshell/util/schema/ifc_classes_suggestions.json b/src/ifcopenshell-python/ifcopenshell/util/schema/ifc_classes_suggestions.json
index ea520e028b7..c3cf248ada8 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/schema/ifc_classes_suggestions.json
+++ b/src/ifcopenshell-python/ifcopenshell/util/schema/ifc_classes_suggestions.json
@@ -5,6 +5,16 @@
             "predefined_type": "NOTDEFINED"
         }
     ],
+    "IfcAirTerminal": [
+        {
+            "name": "Commercial Kitchen Hood"
+        }
+    ],
+    "IfcAirTerminalType": [
+        {
+            "name": "Commercial Kitchen Hood"
+        }
+    ],
     "IfcAirTerminalBox": [
         {
             "name": "VAV Box"
@@ -51,6 +61,16 @@
             "predefined_type": "DISTRIBUTIONBOARD"
         }
     ],
+    "IfcFireSuppressionTerminal": [
+        {
+            "name": "Fire Extinguisher"
+        }
+    ],
+    "IfcFireSuppressionTerminalType": [
+        {
+            "name": "Fire Extinguisher"
+        }
+    ],
     "IfcFurniture": [
         {
             "name": "Casework"
diff --git a/src/ifcopenshell-python/ifcopenshell/util/scripts/validate_stub.py b/src/ifcopenshell-python/ifcopenshell/util/scripts/validate_stub.py
index 14b05ed258f..1c3b6cb0015 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/scripts/validate_stub.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/scripts/validate_stub.py
@@ -25,11 +25,11 @@
 - class hierarchy
 """
 
-
 import ast
 import difflib
 from pathlib import Path
 from typing import Union
+
 from typing_extensions import assert_never
 
 
@@ -57,11 +57,28 @@ def get_function_node_name(node: ast.FunctionDef) -> Union[SubnameType, None]:
     :return: Function node name as ``SubnameType``  or ``None``, if function wasn't processed and can be skipped.
     """
     node_name = node.name
-    if node_name.startswith("_") and node_name not in ("_is",):
+    is_init = node_name == "__init__"
+
+    if node_name.startswith("_") and node_name not in ("_is",) and not is_init:
+        return None
+    arg_nodes = node.args.args
+    defaults = [None] * (len(arg_nodes) - len(node.args.defaults)) + node.args.defaults
+    args: list[str] = []
+    for arg, default in zip(arg_nodes, defaults):
+        if default is None:
+            args.append(arg.arg)
+        else:
+            args.append(f"{arg.arg}={ast.unparse(default)}")
+
+    if arg := node.args.vararg:
+        args.append(f"*{arg.arg}")
+
+    if arg := node.args.kwarg:
+        args.append(f"**{arg.arg}")
+
+    # Skip non-informative constructors.
+    if is_init and args == ["self"]:
         return None
-    args = [a.arg for a in node.args.args]
-    if node.args.vararg:
-        args.append("*args")
 
     node_name = f"def {node.name}"
     node_name = f"{node_name}({', '.join(args)}): ..."
diff --git a/src/ifcopenshell-python/ifcopenshell/util/selector.py b/src/ifcopenshell-python/ifcopenshell/util/selector.py
index 4d05912eb80..bbe8125927a 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/selector.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/selector.py
@@ -17,16 +17,20 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 import re
-import sys
+from collections.abc import Iterable
+from decimal import Decimal
+from types import EllipsisType
+from typing import Any, Optional, Union
+
 import lark
 import numpy as np
-import ifcopenshell.api.pset
+
 import ifcopenshell.api.geometry
+import ifcopenshell.api.pset
 import ifcopenshell.util
 import ifcopenshell.util.attribute
 import ifcopenshell.util.classification
 import ifcopenshell.util.element
-import ifcopenshell.util.fm
 import ifcopenshell.util.geolocation
 import ifcopenshell.util.placement
 import ifcopenshell.util.pset
@@ -34,18 +38,8 @@
 import ifcopenshell.util.shape
 import ifcopenshell.util.system
 import ifcopenshell.util.unit
-from decimal import Decimal
-from typing import Optional, Any, Union
-from collections.abc import Iterable
-
-if sys.version_info >= (3, 10):
-    from types import EllipsisType
-else:
-    EllipsisType = type(...)
 
-
-filter_elements_grammar = lark.Lark(
-    """start: filter_group
+filter_elements_grammar = lark.Lark("""start: filter_group
     filter_group: facet_list ("+" facet_list)*
     facet_list: facet ("," facet)*
 
@@ -116,11 +110,9 @@
     NEWLINE: (CR? LF)+
 
     %ignore WS // Disregard spaces in text
-"""
-)
+""")
 
-get_element_grammar = lark.Lark(
-    """start: keys
+get_element_grammar = lark.Lark("""start: keys
 
     keys: key ("." key)*
     key: quoted_string | regex_string | unquoted_string
@@ -135,25 +127,37 @@
     WS: /[ \\t\\f\\r\\n]/+
 
     %ignore WS // Disregard spaces in text
- """
-)
-
-format_grammar = lark.Lark(
-    """start: function
-
-    function: round | number | int | format_length | lower | upper | title | concat | substr | ESCAPED_STRING | NUMBER
-
-    round: "round(" function "," NUMBER ")"
-    number: "number(" function ["," ESCAPED_STRING ["," ESCAPED_STRING]] ")"
-    int: "int(" function ")"
+ """)
+
+format_grammar = lark.Lark("""start: expression
+
+    ?expression: add_sub
+    ?add_sub: mul_div
+        | add_sub "+" mul_div   -> add
+        | add_sub "-" mul_div   -> subtract
+    ?mul_div: function
+        | mul_div "*" function  -> multiply
+        | mul_div "/" function  -> divide
+    
+    function: round | number | int | format_length | lower | upper | title | concat | substr | sort | reverse | join | variable | ESCAPED_STRING | SIGNED_NUMBER | "(" expression ")"
+
+    variable: "{{" query_path "}}"
+    query_path: /[^}]+/
+
+    round: "round(" expression "," NUMBER ")"
+    number: "number(" expression ["," ESCAPED_STRING ["," ESCAPED_STRING]] ")"
+    int: "int(" expression ")"
     format_length: metric_length | imperial_length
-    metric_length: "metric_length(" function "," NUMBER "," NUMBER ")"
-    imperial_length: "imperial_length(" function "," NUMBER ["," ESCAPED_STRING "," ESCAPED_STRING ["," boolean]] ")"
-    lower: "lower(" function ")"
-    upper: "upper(" function ")"
-    title: "title(" function ")"
-    concat: "concat(" function ("," function)* ")"
-    substr: "substr(" function "," SIGNED_INT ["," SIGNED_INT] ")"
+    metric_length: "metric_length(" expression "," NUMBER "," NUMBER ")"
+    imperial_length: "imperial_length(" expression "," NUMBER ["," ESCAPED_STRING "," ESCAPED_STRING ["," boolean]] ")"
+    lower: "lower(" expression ")"
+    upper: "upper(" expression ")"
+    title: "title(" expression ")"
+    concat: "concat(" expression ("," expression)* ")"
+    substr: "substr(" expression "," SIGNED_INT ["," SIGNED_INT] ")"
+    sort: "sort(" expression ")"
+    reverse: "reverse(" expression ")"
+    join: "join(" ESCAPED_STRING "," expression ")"
     boolean: TRUE | FALSE
 
     TRUE: "true" | "True" | "TRUE"
@@ -184,14 +188,82 @@
     NEWLINE: (CR? LF)+
 
     %ignore WS // Disregard spaces in text
-"""
-)
+""")
 
 
 class FormatTransformer(lark.Transformer):
+    def __init__(self, element: Optional[ifcopenshell.entity_instance] = None) -> None:
+        """Initialize transformer with optional element for variable substitution"""
+        super().__init__()
+        self.element = element  # read by ``variable`` to resolve {{...}} placeholders
+
     def start(self, args):
+        if isinstance(args[0], (list, tuple)):  # list results (e.g. from sort/reverse) are flattened for display
+            return ", ".join(str(item) for item in args[0])  # str() so non-string items don't raise TypeError
+        return args[0]
+
+    def expression(self, args):
         return args[0]
 
+    def variable(self, args):
+        """Handle variable substitution like {{z}} or {{Pset_Wall.FireRating}}; yields None if unresolved."""
+        if self.element:
+            try:
+                return get_element_value(self.element, args[0])
+            except Exception:  # best-effort lookup, but let KeyboardInterrupt / SystemExit propagate
+                pass
+
+    def query_path(self, args):
+        """Extract the query path from variable"""
+        return str(args[0]).strip()
+
+    def add(self, args):
+        """Add two operands as numbers, or concatenate them as strings if float() rejects either one."""
+        left, right = args
+        try:
+            left_val = float(left) if left != "None" and left is not None else 0.0  # None / "None" counts as 0
+            right_val = float(right) if right != "None" and right is not None else 0.0
+            result = left_val + right_val
+            # Whole-number results are rendered without a trailing ".0"
+            if result % 1 == 0:
+                return str(int(result))
+            return str(result)
+        except (ValueError, TypeError):
+            # float() rejected an operand, so treat "+" as string concatenation
+            return str(left) + str(right)
+
+    def subtract(self, args):
+        """Subtract right from left as floats; unlike ``add`` there is no fallback for non-numeric operands."""
+        left, right = args
+        left_val = float(left) if left != "None" and left is not None else 0.0  # None / "None" counts as 0
+        right_val = float(right) if right != "None" and right is not None else 0.0
+        result = left_val - right_val
+        if result % 1 == 0:  # whole-number results are rendered without a trailing ".0"
+            return str(int(result))
+        return str(result)
+
+    def multiply(self, args):
+        """Multiply two operands as floats; unlike ``add`` there is no fallback for non-numeric operands."""
+        left, right = args
+        left_val = float(left) if left != "None" and left is not None else 0.0  # None / "None" counts as 0
+        right_val = float(right) if right != "None" and right is not None else 0.0
+        result = left_val * right_val
+        if result % 1 == 0:  # whole-number results are rendered without a trailing ".0"
+            return str(int(result))
+        return str(result)
+
+    def divide(self, args):
+        """Divide left by right as floats; a missing left counts as 0 and a missing right as 1."""
+        left, right = args
+        left_val = float(left) if left != "None" and left is not None else 0.0  # None / "None" counts as 0
+        right_val = float(right) if right != "None" and right is not None else 1.0  # None / "None" counts as 1
+        if right_val == 0:
+            return "inf"  # division by zero yields the string "inf" (whatever the sign of left) instead of raising
+        result = left_val / right_val
+        if result % 1 == 0:  # whole-number results are rendered without a trailing ".0"
+            return str(int(result))
+        return str(result)
+
     def function(self, args):
         return args[0]
 
@@ -211,7 +283,7 @@ def title(self, args):
         return str(args[0]).title()
 
     def concat(self, args):
-        return "".join(args)
+        return "".join(str(arg) for arg in args)
 
     def substr(self, args):
         if len(args) == 3:
@@ -221,6 +293,15 @@ def substr(self, args):
         elif len(args) == 2:
             return str(args[0])[int(args[1]) :]
 
+    def sort(self, args):
+        return sorted(args[0])  # new list in ascending order; the evaluated argument is not mutated
+
+    def reverse(self, args):
+        return list(reversed(args[0]))  # new list in reverse order; the evaluated argument is not mutated
+
+    def join(self, args):
+        return args[0].join(str(item) for item in args[1])  # args = [separator, items]; str() avoids TypeError
+
     def boolean(self, args):
         if not args:
             return True
@@ -241,13 +322,14 @@ def round(self, args):
         return str(result)
 
     def number(self, args):
-        if isinstance(args[0], str):
-            args[0] = float(args[0]) if "." in args[0] else int(args[0])
+        arg_val = args[0]
+        if isinstance(arg_val, str):
+            arg_val = float(arg_val) if "." in arg_val else int(arg_val)
         if len(args) >= 3 and args[2]:
-            return "{:,}".format(args[0]).replace(".", "*").replace(",", args[2]).replace("*", args[1])
+            return "{:,}".format(arg_val).replace(".", "*").replace(",", args[2]).replace("*", args[1])
         elif len(args) >= 2 and args[1]:
-            return "{}".format(args[0]).replace(".", args[1])
-        return "{:,}".format(args[0])
+            return "{}".format(arg_val).replace(".", args[1])
+        return "{:,}".format(arg_val)
 
     def format_length(self, args):
         return args[0]
@@ -287,7 +369,8 @@ def imperial_length(self, args):
         )
 
     def int(self, args: list[str]) -> str:
-        return str(int(float(args[0])))
+        value = 0.0 if args[0] == "None" else args[0] or 0.0
+        return str(int(float(value)))
 
 
 class GetElementTransformer(lark.Transformer):
@@ -313,8 +396,18 @@ def ESCAPED_STRING(self, args):
         return args[1:-1].replace("\\", "")
 
 
-def format(query: str) -> str:
-    return FormatTransformer().transform(format_grammar.parse(query))
+def format(query: str, element: Optional[ifcopenshell.entity_instance] = None) -> str:
+    """Format a query string with optional element context for variable substitution.
+
+    :param query: Format query string (can include {{variable}} placeholders)
+    :param element: Optional IFC element for variable substitution
+    :return: Formatted string
+
+    Example:
+        format("{{z}} / 2", element)  # Substitutes element's z value
+        format("imperial_length({{z}} / 2, 4)", element)  # Uses z in calculation
+    """
+    return FormatTransformer(element).transform(format_grammar.parse(query))
 
 
 def get_element_value(element: ifcopenshell.entity_instance, query: str) -> Any:
@@ -347,13 +440,13 @@ def _get_element_value(element: ifcopenshell.entity_instance, keys: list[str]) -
         elif key == "container":
             value = ifcopenshell.util.element.get_container(value)
         elif key == "space":
-            value = ifcopenshell.util.element.get_container(value, ifc_class="IfcSpace")
+            value = ifcopenshell.util.element.get_parent(value, ifc_class="IfcSpace")
         elif key == "storey":
-            value = ifcopenshell.util.element.get_container(value, ifc_class="IfcBuildingStorey")
+            value = ifcopenshell.util.element.get_parent(value, ifc_class="IfcBuildingStorey")
         elif key == "building":
-            value = ifcopenshell.util.element.get_container(value, ifc_class="IfcBuilding")
+            value = ifcopenshell.util.element.get_parent(value, ifc_class="IfcBuilding")
         elif key == "site":
-            value = ifcopenshell.util.element.get_container(value, ifc_class="IfcSite")
+            value = ifcopenshell.util.element.get_parent(value, ifc_class="IfcSite")
         elif key == "parent":
             value = ifcopenshell.util.element.get_parent(value)
         elif key in ("types", "occurrences"):
@@ -386,7 +479,7 @@ def _get_element_value(element: ifcopenshell.entity_instance, keys: list[str]) -
                 if key in ("x", "y", "z"):
                     value = xyz["xyz".index(key)]
                 else:
-                    enh = ifcopenshell.util.geolocation.auto_xyz2enh(element.file, *xyz)
+                    enh = ifcopenshell.util.geolocation.auto_xyz2enh(element.wrapped_data.file, *xyz)
                     value = enh[("easting", "northing", "elevation").index(key)]
             else:
                 value = None
@@ -569,8 +662,8 @@ def set_predefined_type(
                 element: ifcopenshell.entity_instance, value: Union[str, None], *, is_type: bool
             ) -> None:
                 predefined_type = element.PredefinedType
-                declaration = element.declaration
-                entity = declaration
+                declaration = element.wrapped_data.declaration()
+                entity = declaration.as_entity()
                 enum_attr = next(attr for attr in entity.attributes() if attr.name() == "PredefinedType")
                 enum_items = ifcopenshell.util.attribute.get_enum_items(enum_attr)
 
@@ -639,7 +732,9 @@ def set_predefined_type(
                     except:
                         # Try to cast
                         data_type = ifcopenshell.util.attribute.get_primitive_type(
-                            element.declaration.attribute_by_index(element.get_argument_index(key))
+                            element.wrapped_data.declaration()
+                            .as_entity()
+                            .attribute_by_index(element.wrapped_data.get_argument_index(key))
                         )
                         if data_type == "string":
                             value = str(value)
diff --git a/src/ifcopenshell-python/ifcopenshell/util/sequence.py b/src/ifcopenshell-python/ifcopenshell/util/sequence.py
index 51f34b7226a..518d581657a 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/sequence.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/sequence.py
@@ -17,13 +17,13 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 import datetime
-import ifcopenshell.util.date
-import ifcopenshell.util.element
-from math import floor
-from functools import cache
-from typing import Union, Literal, Optional
 from collections.abc import Generator
+from functools import cache
+from math import floor
+from typing import Literal, Optional, Union
 
+import ifcopenshell.util.date
+import ifcopenshell.util.element
 
 DURATION_TYPE = Literal["ELAPSEDTIME", "WORKTIME", "NOTDEFINED"]
 RECURRENCE_TYPE = Literal[
diff --git a/src/ifcopenshell-python/ifcopenshell/util/shape.py b/src/ifcopenshell-python/ifcopenshell/util/shape.py
index 75c15b69353..7732412861c 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/shape.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/shape.py
@@ -16,26 +16,36 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import shapely
-import shapely.ops
+from __future__ import annotations
+
+from math import cos, radians
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
 import numpy as np
 import numpy.typing as npt
-import ifcopenshell.ifcopenshell_wrapper as W
+import shapely
+import shapely.ops
+
 import ifcopenshell.util.element
 import ifcopenshell.util.placement
 import ifcopenshell.util.representation
-from ifcopenshell.util.shape_builder import VectorType
-from math import radians, cos
-from ifcopenshell.geom import ShapeElementType
-from typing import Optional, Literal, Union
 
-tol = 1e-6
-AXIS_LITERAL = Literal["X", "Y", "Z"]
-VECTOR_3D = tuple[float, float, float]
+if TYPE_CHECKING:
+
+    import ifcopenshell.ifcopenshell_wrapper as W
+    from ifcopenshell.geom import ShapeElementType
+    from ifcopenshell.util.shape_builder import VectorType
+
+    AXIS_LITERAL = Literal["X", "Y", "Z"]
 
+    VECTOR_3D = tuple[float, float, float]
+
+# Used only for typing, but reused by `shape.py` users.
 MatrixType = npt.NDArray[np.float64]
 """`npt.NDArray[np.float64]`"""
 
+tol = 1e-6
+
 # NOTE: See IfcGeomRepresentation.h for W.Triangulation buffer types.
 
 # NOTE: For functions that return a single scalar ensure to use .item() to
diff --git a/src/ifcopenshell-python/ifcopenshell/util/shape_builder.py b/src/ifcopenshell-python/ifcopenshell/util/shape_builder.py
index 0811b8798cd..e53d069a76b 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/shape_builder.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/shape_builder.py
@@ -17,20 +17,21 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import annotations
+
+import collections.abc
+from collections.abc import Sequence
+from itertools import chain
+from math import atan, cos, degrees, pi, radians, sin, sqrt, tan
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+
 import numpy as np
 import numpy.typing as npt
-import collections
-import collections.abc
+
 import ifcopenshell
-import ifcopenshell.api
 import ifcopenshell.util.element
 import ifcopenshell.util.placement
 import ifcopenshell.util.representation
 import ifcopenshell.util.unit
-from math import cos, sin, pi, tan, radians, degrees, atan, sqrt
-from typing import Union, Optional, Literal, Any, TYPE_CHECKING
-from collections.abc import Sequence
-from itertools import chain
 
 PRECISION = 1.0e-5
 
@@ -39,7 +40,7 @@
     # NOTE: mathutils is never used at runtime in ifcopenshell,
     # only for type checking to ensure methods are compatible with
     # Blender vectors.
-    from mathutils import Vector
+    from mathutils import Vector  # pyright: ignore[reportMissingImports]  # ty:ignore[unresolved-import]
 
     # Support both numpy arrays and python sequences as inputs.
     VectorType = Union[Sequence[float], Vector, np.ndarray]
@@ -312,7 +313,7 @@ def polyline(
         Generate an IfcIndexedPolyCurve based on the provided points.
 
         :param points: List of 2d or 3d points
-        :param closed: Whether polyline should be closed. Default is `False`
+        :param closed: Whether polyline should be closed.
         :param position_offset: offset to be applied to all points
         :param arc_points: Indices of the middle points for arcs. For creating an arc segment,
             provide 3 points: `arc_start`, `arc_middle` and `arc_end` to `points` and add the `arc_middle`
@@ -415,8 +416,9 @@ def get_rectangle_coords(size: VectorType = (1.0, 1.0), position: Optional[Vecto
             3 2
             0 1
 
-        :param size: rectangle size, could be either 2d or 3d, defaults to `(1,1)`
-        :param position: rectangle position, default to `None`.
+        :param size: rectangle size, could be either 2d or 3d.
+            Use 0 for one of 3d dimensions to create 2d rectangle in 3d space.
+        :param position: rectangle position.
             if `position` not specified zero-vector will be used
         :return: list of rectangle coords
         """
@@ -441,9 +443,11 @@ def rectangle(
         """
         Generate a rectangle polyline.
 
-        :param size: rectangle size, could be either 2d or 3d, defaults to `(1,1)`
-        :param position: rectangle position, default to `None`.
-            if `position` not specified zero-vector will be used
+        :param size: rectangle size.
+        :param position: rectangle position.
+
+        See ``get_rectangle_coords`` for more information.
+
         :return: IfcIndexedPolyCurve
         """
         return self.polyline(self.get_rectangle_coords(size, position), closed=True)
@@ -513,11 +517,18 @@ def get_trim_points_from_mask(
         trim_points_mask: Sequence[int],
         position_offset: Optional[VectorType] = None,
     ) -> np.ndarray:
-        """Handy way to get edge points of the ellipse like shape of a given radiuses.
+        """Get cardinal-point coordinates of an ellipse by index mask.
 
-        Mask points are numerated from 0 to 3 ccw starting from (x_axis_radius/2; 0).
+        The four cardinal points are numbered 0–3 counter-clockwise starting from the
+        positive X axis: 0 → ``(x, 0)``, 1 → ``(0, y)``, 2 → ``(-x, 0)``, 3 → ``(0, -y)``.
 
-        Example: mask (0, 1, 2, 3) will return points (x, 0), (0, y), (-x, 0), (0, -y)
+        Example: mask ``(0, 1, 2, 3)`` returns all four points in order.
+
+        :param x_axis_radius: Radius (semi-axis length) along the X axis.
+        :param y_axis_radius: Radius (semi-axis length) along the Y axis.
+        :param trim_points_mask: Sequence of cardinal-point indices (0–3) to select.
+        :param position_offset: Optional 2D offset added to all returned points.
+        :return: Numpy array of the selected 2D points.
         """
         points = np.array(
             (
@@ -542,15 +553,23 @@ def create_ellipse_curve(
         ref_x_direction: VectorType = (1.0, 0.0),
         trim_points_mask: Sequence[int] = (),
     ) -> ifcopenshell.entity_instance:
-        """
-        Ellipse trimming points should be specified in counter clockwise order.
-
-        For example, if you need to get the part of the ellipse ABOVE y-axis, you need to use mask (0,2). Below y-axis - (2,0)
-
-        For more information about trim_points_mask check builder.get_trim_points_from_mask
-
-        Notion: trimmed ellipse also contains polyline between trim points, meaning IfcTrimmedCurve could be used
-        for further extrusion.
+        """Create an IfcEllipse, optionally trimmed to an arc.
+
+        If neither ``trim_points`` nor ``trim_points_mask`` is provided, a full IfcEllipse is returned.
+        Trimming points must be given in counter-clockwise order. For example, to get the arc
+        above the Y-axis use mask ``(0, 2)``; below the Y-axis use ``(2, 0)``.
+
+        A trimmed result (IfcTrimmedCurve) includes a closing segment between the trim points,
+        making it suitable for use as a profile in :meth:`extrude`.
+
+        :param x_axis_radius: Semi-axis length along the local X axis.
+        :param y_axis_radius: Semi-axis length along the local Y axis.
+        :param position: 2D centre of the ellipse.
+        :param trim_points: Explicit pair of 2D trim points. Takes precedence over ``trim_points_mask``.
+        :param ref_x_direction: Direction of the local X axis.
+        :param trim_points_mask: Pair of cardinal-point indices (0–3) used when ``trim_points`` is empty.
+            See :meth:`get_trim_points_from_mask` for index definitions.
+        :return: IfcEllipse (untrimmed) or IfcTrimmedCurve (trimmed).
         """
         ifc_position = self.create_axis2_placement_2d(position, ref_x_direction)
         ifc_ellipse = self.file.createIfcEllipse(
@@ -681,6 +700,14 @@ def rotate_2d_point(
         pivot_point: VectorType = (0.0, 0.0),
         counter_clockwise: bool = False,
     ) -> np.ndarray:
+        """Rotate a single 2D point around a pivot.
+
+        :param point_2d: The 2D point to rotate.
+        :param angle: Rotation angle, in degrees. Defaults to 90.
+        :param pivot_point: The point to rotate around.
+        :param counter_clockwise: If True, rotate counter-clockwise. Defaults to clockwise.
+        :return: Rotated 2D point as a numpy array.
+        """
         angle_rad = radians(angle) * (1 if counter_clockwise else -1)
         relative_point = np.array(point_2d) - pivot_point
         relative_point = np_rotation_matrix(angle_rad, 2) @ relative_point
@@ -748,7 +775,16 @@ def mirror_2d_point(
         mirror_axes: VectorType = (1.0, 1.0),
         mirror_point: VectorType = (0.0, 0.0),
     ) -> np.ndarray:
-        """mirror_axes - along which axes mirror will be applied"""
+        """Mirror a single 2D point across the specified axes.
+
+        :param point_2d: The 2D point to mirror.
+        :param mirror_axes: Indicates which axes to mirror across. A positive value in a
+            component means that axis is mirrored (negated relative to ``mirror_point``).
+            Example: ``(1, 0)`` mirrors across the Y-axis (negates X only),
+            ``(1, 1)`` mirrors across both axes.
+        :param mirror_point: Origin of the mirror operation.
+        :return: Mirrored 2D point as a numpy array.
+        """
         mirror_axes: np.ndarray = np.where(np.array(mirror_axes) > 0, -1, 1)
         mirror_point: np.ndarray = np.array(mirror_point)
         relative_point = point_2d - mirror_point
@@ -784,7 +820,7 @@ def create_axis2_placement_3d_from_matrix(
         """
         Create IfcAxis2Placement3D from numpy matrix.
 
-        :param matrix: 4x4 transformation matrix, defaults to `np.eye(4)`
+        :param matrix: 4x4 transformation matrix, defaults to ``np.eye(4)``
         :return: IfcAxis2Placement3D
         """
         if matrix is None:
@@ -794,7 +830,13 @@ def create_axis2_placement_3d_from_matrix(
     def create_axis2_placement_2d(
         self, position: VectorType = (0.0, 0.0), x_direction: Optional[VectorType] = None
     ) -> ifcopenshell.entity_instance:
-        """Create IfcAxis2Placement2D."""
+        """Create IfcAxis2Placement2D.
+
+        :param position: 2D origin of the placement.
+        :param x_direction: Direction of the local X axis. If not provided, defaults to
+            the global X axis ``(1, 0)``.
+        :return: IfcAxis2Placement2D
+        """
         ref_direction = (
             self.file.create_entity("IfcDirection", ifc_safe_vector_type(x_direction)) if x_direction else None
         )
@@ -965,8 +1007,8 @@ def mirror(
 
     def sphere(self, radius: float = 1.0, center: VectorType = (0.0, 0.0, 0.0)) -> ifcopenshell.entity_instance:
         """
-        :param radius: radius of the sphere, defaults to 1.0
-        :param center: sphere position, defaults to `(0.0, 0.0, 0.0)`
+        :param radius: radius of the sphere.
+        :param center: sphere position.
 
         :return: IfcSphere
         """
@@ -996,7 +1038,7 @@ def half_space_solid(
     ) -> ifcopenshell.entity_instance:
         """
         :param plane: The IfcPlane representing the half space.
-        :param agreement_flag: False if +Z represents the void
+        :param agreement_flag: If False (default), the plane normal points toward the **removed** material (the void). The kept region is on the opposite side from the normal.
         :return: IfcHalfSpaceSolid
         """
         return self.file.createIfcHalfSpaceSolid(plane, AgreementFlag=agreement_flag)
@@ -1049,7 +1091,14 @@ def extrude(
     def create_swept_disk_solid(
         self, path_curve: ifcopenshell.entity_instance, radius: float
     ) -> ifcopenshell.entity_instance:
-        """Create IfcSweptDiskSolid from `path_curve` (must be 3D) and `radius`"""
+        """Create an IfcSweptDiskSolid — a circular cross-section swept along a 3D path.
+
+        Useful for modelling round pipes, conduits, and cables.
+
+        :param path_curve: A 3D curve entity defining the centreline path. Must have ``Dim == 3``.
+        :param radius: Radius of the circular disk cross-section.
+        :return: IfcSweptDiskSolid
+        """
         if path_curve.Dim != 3:
             raise Exception(
                 f"Path curve for IfcSweptDiskSolid should be 3D to be valid, currently it has {path_curve.Dim} dimensions.\n"
@@ -1067,10 +1116,22 @@ def get_representation(
     ) -> ifcopenshell.entity_instance:
         """Create IFC representation for the specified context and items.
 
+        **All items must belong to the same geometry category.** IFC prohibits
+        mixing incompatible item types in one representation (e.g.
+        ``IfcExtrudedAreaSolid`` with ``IfcBlock``, or solids with curves).
+        When ``representation_type`` is omitted the type is inferred via
+        :func:`ifcopenshell.util.representation.guess_type`; if the items are
+        heterogeneous ``guess_type`` returns ``None`` and the representation is
+        written with no ``RepresentationType``, which fails IFC validation.
+        Avoid mixing swept-solid primitives (``IfcExtrudedAreaSolid``,
+        ``IfcRevolvedAreaSolid``) with CSG primitives (``IfcBlock``,
+        ``IfcSphere``, etc.) or any other category in a single call.
+
         :param context: IfcGeometricRepresentationSubContext
-        :param items: could be a list or single curve/IfcExtrudedAreaSolid
-        :param representation_type: Explicitly specified RepresentationType, defaults to `None`.
-            If not provided it will be guessed from the items types
+        :param items: A single item or list of items, all of the same geometry
+            category (e.g. all ``IfcExtrudedAreaSolid``, all ``IfcIndexedPolyCurve``)
+        :param representation_type: Explicitly specified RepresentationType.
+            If not provided it will be guessed from the items types.
         :return: IfcShapeRepresentation
         """
         if not isinstance(items, collections.abc.Iterable):
@@ -1092,18 +1153,26 @@ def get_representation(
         )
 
     def deep_copy(self, element: ifcopenshell.entity_instance) -> ifcopenshell.entity_instance:
+        """Create a deep copy of an IFC element and all its referenced entities.
+
+        :param element: The IFC entity to copy.
+        :return: A new independent copy of the element.
+        """
         return ifcopenshell.util.element.copy_deep(self.file, element)
 
     # UTILITIES
     def extrude_kwargs(self, axis: Literal["Y", "X", "Z"]) -> dict[str, tuple[float, float, float]]:
-        """Shortcut to get kwargs for `ShapeBuilder.extrude` to extrude by some axis.
+        """Shortcut to get kwargs for :meth:`extrude` to extrude along a principal axis.
 
-        It assumes you have 2D profile in:
-            XZ plane for Y axis extrusion, \n
-            YZ plane for X axis extrusion, \n
-            XY plane for Z axis extrusion, \n
+        Assumes the 2D profile lies in the plane perpendicular to the extrusion axis:
+        XZ plane for Y-axis extrusion, YZ plane for X-axis extrusion, XY plane for Z-axis extrusion.
 
-        Extruding by X/Y using other kwargs might break ValidExtrusionDirection."""
+        Extruding along X or Y with other kwargs may violate the IFC ValidExtrusionDirection constraint.
+
+        :param axis: The extrusion axis: ``'X'``, ``'Y'``, or ``'Z'``.
+        :return: A dict with keys ``position_x_axis``, ``position_z_axis``, and ``extrusion_vector``
+            suitable for passing as ``**kwargs`` to :meth:`extrude`.
+        """
 
         if axis == "Y":
             return {
@@ -1127,13 +1196,16 @@ def extrude_kwargs(self, axis: Literal["Y", "X", "Z"]) -> dict[str, tuple[float,
     def rotate_extrusion_kwargs_by_z(
         self, kwargs: dict[str, Any], angle: float, counter_clockwise: bool = False
     ) -> dict[str, VectorType]:
-        """shortcut to rotate extrusion kwargs by z axis
-
-        `kwargs` expected to have `position_x_axis` and `position_z_axis` keys
+        """Rotate extrusion kwargs around the Z axis.
 
-        `angle` is a rotation value in radians
+        A shortcut to rotate the ``position_x_axis`` and ``position_z_axis`` values returned by
+        :meth:`extrude_kwargs` around the Z axis before passing them to :meth:`extrude`.
 
-        by default rotation is clockwise, to make it counter clockwise use `counter_clockwise` flag
+        :param kwargs: A dict with ``position_x_axis`` and ``position_z_axis`` keys,
+            as returned by :meth:`extrude_kwargs`. The original dict is not mutated.
+        :param angle: Rotation angle, in radians.
+        :param counter_clockwise: If True, rotate counter-clockwise. Defaults to clockwise.
+        :return: A new dict with ``position_x_axis`` and ``position_z_axis`` rotated around Z.
         """
         rot = np_rotation_matrix(-angle, 3, "Z")
         kwargs = kwargs.copy()  # prevent mutation of original kwargs
@@ -1142,7 +1214,11 @@ def rotate_extrusion_kwargs_by_z(
         return kwargs
 
     def get_polyline_coords(self, polyline: ifcopenshell.entity_instance) -> np.ndarray:
-        """polyline should be either `IfcIndexedPolyCurve` or `IfcPolyline`"""
+        """Extract the coordinate array from a polyline entity.
+
+        :param polyline: An ``IfcIndexedPolyCurve`` or ``IfcPolyline`` entity.
+        :return: Numpy array of the polyline's point coordinates.
+        """
         coords = None
         if polyline.is_a("IfcIndexedPolyCurve"):
             coords = np.array(polyline.Points.CoordList)
@@ -1153,7 +1229,12 @@ def get_polyline_coords(self, polyline: ifcopenshell.entity_instance) -> np.ndar
         return coords
 
     def set_polyline_coords(self, polyline: ifcopenshell.entity_instance, coords: SequenceOfVectors) -> None:
-        """polyline should be either `IfcIndexedPolyCurve` or `IfcPolyline`"""
+        """Update the coordinates of a polyline entity in-place.
+
+        :param polyline: An ``IfcIndexedPolyCurve`` or ``IfcPolyline`` entity.
+        :param coords: New sequence of point coordinates. Must contain the same number of
+            points as the original polyline.
+        """
         if polyline.is_a("IfcIndexedPolyCurve"):
             polyline.Points.CoordList = ifc_safe_vector_type(coords)
         elif polyline.is_a("IfcPolyline"):
@@ -1180,8 +1261,8 @@ def get_simple_2dcurve_data(
         :param fillets:          list of points from `coords` to base fillet on. Example: (1,)
         :param fillet_radius:    list of fillet radius for each of corresponding point form `fillets`.
             Example: (5.,) Note: `fillet_radius` could be just 1 float value if it's the same for all fillets.
-        :param closed:           boolean whether curve should be closed (whether last point connected to first one). Default: True
-        :param create_ifc_curve: create IfcIndexedPolyCurve or just return the data. Default: False
+        :param closed:           boolean whether curve should be closed (whether last point connected to first one).
+        :param create_ifc_curve: create IfcIndexedPolyCurve or just return the data.
 
         :return: (points, segments, ifc_curve) for the created simple curve
             if both points in e are equally far from pt, then v1 is returned.
@@ -1292,6 +1373,18 @@ def create_z_profile_lips_curve(
         WallThickness: float,
         FilletRadius: float,
     ) -> ifcopenshell.entity_instance:
+        """Create a Z-profile (cold-formed steel section) outline curve with lips and fillets.
+
+        All dimensions are in the IFC project's length units.
+
+        :param FirstFlangeWidth: Width of the first (top) flange, measured from the web centreline.
+        :param SecondFlangeWidth: Width of the second (bottom) flange, measured from the web centreline.
+        :param Depth: Total depth of the section (web height).
+        :param Girth: Length of the return lips on each flange.
+        :param WallThickness: Uniform material thickness.
+        :param FilletRadius: Inner bend radius at each corner.
+        :return: IfcIndexedPolyCurve representing the closed Z-profile outline.
+        """
         x1 = FirstFlangeWidth
         x2 = SecondFlangeWidth
         y = Depth / 2
@@ -1333,10 +1426,17 @@ def create_z_profile_lips_curve(
     def create_transition_arc_ifc(
         self, width: float, height: float, create_ifc_curve: bool = False
     ) -> tuple[SequenceOfVectors, list[list[int]], Union[ifcopenshell.entity_instance, None]]:
-        """Create an arc in the rectangle with specified width and height.
+        """Create an arc fitting inside a rectangle of the given width and height.
+
+        If a single arc cannot span the full width, the longest possible radius is used and
+        a straight segment is inserted in the middle.
 
-        If it's not possible to make a complete arc, create an arc with longest radius possible
-        and straight segment in the middle.
+        :param width: Width of the bounding rectangle.
+        :param height: Height of the bounding rectangle (also the maximum arc radius).
+        :param create_ifc_curve: If True, also create and return an ``IfcIndexedPolyCurve``.
+            If False, only return the raw point and segment data.
+        :return: A tuple ``(points, segments, ifc_curve)`` where ``ifc_curve`` is an
+            ``IfcIndexedPolyCurve`` when ``create_ifc_curve=True``, otherwise ``None``.
         """
         fillet_size = (width / 2) / height
         if fillet_size <= 1:
@@ -1366,6 +1466,14 @@ def create_transition_arc_ifc(
         return points, segments, transition_arc
 
     def mesh(self, points: SequenceOfVectors, faces: Sequence[Sequence[int]]) -> ifcopenshell.entity_instance:
+        """Create a tessellated mesh from points and face indices.
+
+        Delegates to :meth:`faceted_brep` for IFC2X3, or :meth:`polygonal_face_set` for IFC4 and later.
+
+        :param points: List of 3D coordinates.
+        :param faces: List of faces, each face a sequence of zero-based point indices.
+        :return: IfcFacetedBrep (IFC2X3) or IfcPolygonalFaceSet (IFC4+).
+        """
         if self.file.schema == "IFC2X3":
             return self.faceted_brep(points, faces)
         return self.polygonal_face_set(points, faces)
@@ -1460,10 +1568,10 @@ def extrude_face_set(
 
         :param points: list of points, assuming they form consecutive closed polyline.
         :param magnitude: extrusion magnitude
-        :param extrusion_vector: extrusion direction, by default it's extruding by Z+ axis
+        :param extrusion_vector: extrusion direction.
         :param offset: offset from the points
-        :param start_cap: if True, create start cap, by default it's True
-        :param end_cap: if True, create end cap, by default it's True
+        :param start_cap: if True, create start cap.
+        :param end_cap: if True, create end cap.
         :return: IfcPolygonalFaceSet
         """
 
@@ -1719,11 +1827,20 @@ def mep_transition_length(
         angle: float,
         profile_offset: VectorType = (0.0, 0.0),
         verbose: bool = True,
-    ):
-        """get the final transition length for two profiles dimensions, angle and XY offset between them,
-
-        the difference from `calculate_transition` - `get_transition_length` is making sure
-        that length will fit both sides of the transition
+    ) -> Optional[float]:
+        """Get the transition length for two profile half-dimensions, an angle, and an XY offset.
+
+        Unlike :meth:`mep_transition_calculate`, this method checks that the resulting length
+        satisfies the angle constraint from both the start and end profile perspectives.
+
+        :param start_half_dim: Half-dimensions of the start profile as a 3-element array
+            ``[half_x, half_y, depth]``. For circular profiles ``half_x == half_y == radius``.
+        :param end_half_dim: Half-dimensions of the end profile in the same format.
+        :param angle: Maximum allowed transition angle, in degrees.
+        :param profile_offset: 2D XY offset between the centrelines of the start and end profiles.
+        :param verbose: If True, print diagnostic values during calculation.
+        :return: Transition length in project length units, or ``None`` if no valid length exists
+            for the given angle and offset.
         """
         print = lambda *args, **kwargs: __builtins__["print"](*args, **kwargs) if verbose else None
         np_X, np_Y = 0, 1
@@ -1784,9 +1901,23 @@ def mep_transition_calculate(
         angle: Optional[float] = None,
         verbose: bool = True,
     ) -> Union[float, None]:
-        """will return transition length based on the profile dimension differences and offset.
-
-        If `length` is provided will return transition angle"""
+        """Calculate MEP transition length from angle, or transition angle from length.
+
+        Low-level calculation kernel used by :meth:`mep_transition_length`. Provide either
+        ``angle`` or ``length`` (not both); the other value is computed and returned.
+
+        :param start_half_dim: Half-dimensions of the start profile ``[half_x, half_y, depth]``.
+        :param end_half_dim: Half-dimensions of the end profile ``[half_x, half_y, depth]``.
+        :param offset: 2D XY offset between profile centrelines.
+        :param diff: Pre-computed absolute difference of start and end half-dimensions (XY only).
+            Computed from ``start_half_dim`` and ``end_half_dim`` if not provided.
+        :param end_profile: If True, swap X and Y axes to compute from the end-profile perspective.
+        :param length: Known transition length. If provided, the corresponding angle is returned.
+        :param angle: Known transition angle, in degrees. If provided, the corresponding length is returned.
+        :param verbose: If True, print diagnostic values during calculation.
+        :return: Transition length (if ``angle`` was given) or transition angle in degrees
+            (if ``length`` was given), or ``None`` if the geometry is not feasible.
+        """
 
         print = lambda *args, **kwargs: __builtins__["print"](*args, **kwargs) if verbose else None
 
diff --git a/src/ifcopenshell-python/ifcopenshell/util/system.py b/src/ifcopenshell-python/ifcopenshell/util/system.py
index 832daefc0d6..39d92070180 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/system.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/system.py
@@ -17,8 +17,9 @@
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
 
+from typing import Literal, Optional, Union
+
 import ifcopenshell.util.system
-from typing import Optional, Union, Literal
 
 group_types: dict[str, tuple[str, ...]] = {
     "IfcZone": ("IfcZone", "IfcSpace", "IfcSpatialZone"),
diff --git a/src/ifcopenshell-python/ifcopenshell/util/type.py b/src/ifcopenshell-python/ifcopenshell/util/type.py
index ab588142671..86a1ccc7c94 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/type.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/type.py
@@ -16,8 +16,9 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
-import os
 import json
+import os
+
 import ifcopenshell.util.schema
 
 cwd = os.path.dirname(os.path.realpath(__file__))
diff --git a/src/ifcopenshell-python/ifcopenshell/util/unit.py b/src/ifcopenshell-python/ifcopenshell/util/unit.py
index c45d226128b..7e4686d4e4a 100644
--- a/src/ifcopenshell-python/ifcopenshell/util/unit.py
+++ b/src/ifcopenshell-python/ifcopenshell/util/unit.py
@@ -16,14 +16,13 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
 
+from collections.abc import Generator
 from fractions import Fraction
 from math import pi
 from typing import Literal, Optional, Union
-from collections.abc import Generator
 
 import ifcopenshell
 import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
-import ifcopenshell.api.unit
 
 prefixes = {
     "EXA": 1e18,
@@ -209,6 +208,7 @@
     "pound": 0.454,
     "ton UK": 1016.0469088,
     "ton US": 907.18474,
+    "tonne": 1000.0,
     "lbf": 4.4482216153,
     "kip": 4448.2216153,
     "psi": 6894.7572932,
@@ -253,6 +253,7 @@
     "pound": "MASSUNIT",
     "ton UK": "MASSUNIT",
     "ton US": "MASSUNIT",
+    "tonne": "MASSUNIT",
     "lbf": "FORCEUNIT",
     "kip": "FORCEUNIT",
     "psi": "PRESSUREUNIT",
@@ -323,6 +324,7 @@
     "pound": "lb",
     "ton UK": "ton",
     "ton US": "ton",
+    "tonne": "t",
     "lbf": "lbf",
     "kip": "kip",
     "psi": "psi",
@@ -475,7 +477,7 @@ def get_property_unit(
     measure_class = None
 
     if prop.is_a("IfcPhysicalSimpleQuantity"):
-        entity = prop.declaration
+        entity = prop.wrapped_data.declaration().as_entity()
         measure_class = entity.attribute_by_index(3).type_of_attribute().declared_type().name()
     elif prop.is_a("IfcPropertySingleValue"):
         measure_class = prop.NominalValue.is_a()
@@ -866,16 +868,16 @@ def iter_element_and_attributes_per_type(ifc_file: ifcopenshell.file, attr_type_
 
 def convert_file_length_units(ifc_file: ifcopenshell.file, target_units: str = "METER") -> ifcopenshell.file:
     """Converts all units in an IFC file to the specified target units. Returns a new file."""
-    import ifcopenshell.util.element
-    import ifcopenshell.util.geolocation
     import ifcopenshell.api.georeference
     import ifcopenshell.api.unit
+    import ifcopenshell.util.element
+    import ifcopenshell.util.geolocation
 
     prefix = get_prefix(target_units)
     si_unit = get_unit_name(target_units)
 
     # Copy all elements from the original file to the patched file
-    file_patched = ifcopenshell.file.from_string(ifc_file.to_string())
+    file_patched = ifcopenshell.file.from_string(ifc_file.wrapped_data.to_string())
 
     old_length = get_project_unit(file_patched, "LENGTHUNIT")
     if si_unit:

From b13795341919ce2708a5f4f35c6bf5331e524c98 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 16:27:06 +1000
Subject: [PATCH 02/37] Local hacks to compile and monkey patch issues in the
 Python world

All AI generated slop. Do NOT trust these "fixes". It's just to get it
working on my machine.
---
 build.sh                                      |  30 ++
 cmake/CMakeLists.txt                          |   3 +-
 findings.md                                   | 317 ++++++++++++++++++
 .../ifcopenshell/__init__.py                  |   3 +
 .../ifcopenshell/entity_instance.py           |  52 ++-
 src/ifcparse/spf_header.h                     |   2 +-
 src/ifcparse/storage.h                        |   1 +
 7 files changed, 404 insertions(+), 4 deletions(-)
 create mode 100755 build.sh
 create mode 100644 findings.md

diff --git a/build.sh b/build.sh
new file mode 100755
index 00000000000..939c4ae4435
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+set -e
+
+mkdir -p build && cd build
+
+cmake ../cmake \
+  -G Ninja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DPython_EXECUTABLE=/home/dion/Projects/env/bin/python3.11 \
+  -DPython_INCLUDE_DIR=/usr/include/python3.11 \
+  -DBUILD_IFCPYTHON=ON \
+  -DBUILD_IFCGEOM=ON \
+  -DBUILD_CONVERT=ON \
+  -DBUILD_GEOMSERVER=OFF \
+  -DBUILD_EXAMPLES=OFF \
+  -DWITH_OPENCASCADE=ON \
+  -DWITH_CGAL=ON \
+  -DWITH_MANIFOLD=ON \
+  -DHDF5_SUPPORT=OFF \
+  -DGLTF_SUPPORT=ON \
+  -DIFCXML_SUPPORT=OFF \
+  -DCOLLADA_SUPPORT=OFF \
+  -DSCHEMA_VERSIONS="2x3;4;4x3_add2" \
+  -DOCC_INCLUDE_DIR=/usr/include/opencascade \
+  -DOCC_LIBRARY_DIR=/usr/lib64/opencascade
+
+ninja
+
+cp ifcwrap/_ifcopenshell_wrapper*.so ifcwrap/ifcopenshell_wrapper.py \
+   ../src/ifcopenshell-python/ifcopenshell/
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index cbc67041792..8a0518c87f1 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -332,7 +332,8 @@ if(WASM_BUILD)
 else()
     # @todo review this, shouldn't this be all possible header-only now?
     # ... or rewritten using C++17 features?
-    set(BOOST_COMPONENTS system program_options regex thread date_time iostreams)
+    # set(BOOST_COMPONENTS system program_options regex thread date_time iostreams)
+    set(BOOST_COMPONENTS program_options regex thread date_time iostreams)
 endif()
 
 if(USE_MMAP)
diff --git a/findings.md b/findings.md
new file mode 100644
index 00000000000..ea211e59528
--- /dev/null
+++ b/findings.md
@@ -0,0 +1,317 @@
+# Build fix: remove `boost_system` from CMake components
+
+`Boost.System` became header-only in Boost 1.69. Boost 1.90.0 no longer ships a compiled library or CMake config for it, so `find_package(Boost REQUIRED COMPONENTS system ...)` fails.
+
+## Fix
+
+`cmake/CMakeLists.txt`:
+
+```diff
+-    set(BOOST_COMPONENTS system program_options regex thread date_time iostreams)
++    set(BOOST_COMPONENTS program_options regex thread date_time iostreams)
+```
+
+The headers are still available; no linking is needed.
+
+# Build fix: add `template` keyword for dependent template member calls
+
+Calling a template member function through a dependent expression (e.g. `storage->has_attribute_value<T>(...)` where `storage`'s type depends on a template parameter) requires the `template` keyword to disambiguate from a less-than comparison.
+
+## Error
+
+```
+src/ifcparse/IfcParse.cpp:1856:67: error: expected primary-expression before '>' token
+ 1856 |                     if (storage->has_attribute_value<express::Base>(attr_index)) {
+      |                                                                   ^
+```
+
+Six identical errors at lines 1856, 1865, 1896, 1905, 1934, 1943.
+
+## Fix
+
+`src/ifcparse/IfcParse.cpp`:
+
+```diff
+-storage->has_attribute_value<express::Base>(attr_index)
++storage->template has_attribute_value<express::Base>(attr_index)
+
+-storage->has_attribute_value<Blank>(attr_index)
++storage->template has_attribute_value<Blank>(attr_index)
+```
+
+Applied at all six call sites in `in_memory_file_storage::read_from_stream`.
+
+# Linker fix: missing explicit template instantiations for `InstanceStreamer`
+
+`InstanceStreamer` is a class template with methods defined in `IfcParse.cpp`, not the header. Without explicit instantiations, the linker can't find the symbols when the SWIG wrapper loads.
+
+## Error
+
+```
+ImportError: undefined symbol: _ZN8IfcParse16InstanceStreamerINS_10FileReaderINS_14FullBufferImplEEEEC1EPS3_PNS_7IfcFileE
+  (IfcParse::InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(FileReader<FullBufferImpl>*, IfcFile*))
+```
+
+## Fix
+
+Cannot use `template class InstanceStreamer<...>` because some constructors have `static_assert` guards that reject certain reader types. Instead, instantiate each member function individually per reader type, only including the constructors valid for that type.
+
+`src/ifcparse/IfcParse.cpp` (after the last `InstanceStreamer` method definition):
+
+```cpp
+// FullBufferImpl
+template IfcParse::InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(IfcParse::IfcFile*);
+template IfcParse::InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(const std::string&, bool, IfcParse::IfcFile*);
+template IfcParse::InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(void*, int, IfcParse::IfcFile*);
+template IfcParse::InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(FileReader<FullBufferImpl>*, IfcParse::IfcFile*);
+// ... plus ensure_header, initialize_header, hasSemicolon, semicolonCount,
+//     pushPage, bypassTypes, readInstance
+
+// PushedSequentialImpl — same pattern, different valid constructors
+
+// MMapFileReader (ifdef USE_MMAP) — same pattern
+```
+
+# Linker fix: `FullBufferImpl` missing buffer constructor
+
+SWIG's `stream_from_string` calls `InstanceStreamer<FileReader<FullBufferImpl>>(void*, int, IfcFile*)`, but the `(void*, int)` constructor previously hit a `static_assert` for `FullBufferImpl` — it only allowed `PushedSequentialImpl`.
+
+## Error
+
+```
+ImportError: undefined symbol: _ZN8IfcParse16InstanceStreamerINS_10FileReaderINS_14FullBufferImplEEEEC1EPviPNS_7IfcFileE
+  (InstanceStreamer<FileReader<FullBufferImpl>>::InstanceStreamer(void*, int, IfcFile*))
+```
+
+## Fix
+
+Four changes across three files to make `FullBufferImpl` support buffer-based and default construction:
+
+`src/ifcparse/FileReader.h` — add buffer constructor to `FullBufferImpl`:
+
+```diff
+ class IFC_PARSE_API FullBufferImpl {
+ public:
+     explicit FullBufferImpl(const std::string& fn);
++    FullBufferImpl(void* data, size_t length);
+```
+
+`src/ifcparse/FileReader.h` — add `FileReader(void*, size_t)` forwarding constructor:
+
+```diff
++    FileReader(void* data, size_t length)
++        : cursor_(0) {
++        if constexpr (std::is_same_v<Impl, FullBufferImpl>) {
++            impl_ = std::make_shared<Impl>(data, length);
++        } else {
++            static_assert(...);
++        }
++    }
+```
+
+`src/ifcparse/FileReader.cpp` — implement the constructor:
+
+```cpp
+FullBufferImpl::FullBufferImpl(void* data, size_t length)
+    : buf_(static_cast<char*>(data), static_cast<char*>(data) + length)
+    , size_(length) {
+}
+```
+
+`src/ifcparse/IfcParse.cpp` — extend the two `InstanceStreamer` constructors to accept `FullBufferImpl`:
+
+```diff
+ // InstanceStreamer(IfcFile*):
++    } else if constexpr (std::is_same_v<Reader, FileReader<FullBufferImpl>>) {
++        owned_stream_ = std::make_unique<Reader>(nullptr, (size_t)0);
+
+ // InstanceStreamer(void*, int, IfcFile*):
++    } else if constexpr (std::is_same_v<Reader, FileReader<FullBufferImpl>>) {
++        owned_stream_ = std::make_unique<Reader>(data, (size_t)length);
+```
+
+# Runtime fix: segfault in `parse_context::push()` due to vector reallocation
+
+`parse_context_pool` stores nodes in a `std::vector<parse_context>`. During parsing, `load()` takes a `parse_context&` parameter and calls `context.push()`, which calls `pool_->make()`. If the pool's vector reallocates (via `emplace_back`), all existing references into the vector — including the `context` reference held by the caller — become dangling. Subsequent access through the dangling reference causes a segfault.
+
+Triggered by larger IFC files (e.g. `ISSUE_159_kleine_Wohnung_R22.ifc`, 9.5 MB) that cause enough pool growth to trigger reallocation.
+
+## Error
+
+```
+Thread 1 received signal SIGSEGV, Segmentation fault.
+0x... in IfcParse::parse_context::push()
+  #1  in_memory_file_storage::load(...)   // context& is dangling after reallocation
+  #2  in_memory_file_storage::load(...)   // parent call
+  #3  InstanceStreamer::readInstance()
+```
+
+## Fix
+
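+`src/ifcparse/storage.h` — change the pool container from `std::vector` to `std::deque`. Appending to a `std::deque` with `push_back`/`emplace_back` never invalidates references to existing elements (only iterators are invalidated), so the `parse_context&` held by callers stays valid while the pool grows: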
+
+```diff
++#include <deque>
+
+ struct parse_context_pool {
+-    std::vector<parse_context> nodes_;
++    std::deque<parse_context> nodes_;
+```
+
+# Runtime fix: `express::Base` comparison operators throw on null/expired instances
+
+`express::Base::operator<` and `operator==` called `data()`, which throws `std::runtime_error("Trying to access deleted instance reference")` when the internal `weak_ptr` is expired. A default-constructed `express::Base` (the value-type equivalent of a null pointer) always has an expired `weak_ptr`.
+
+## Why this model triggers it
+
+The bug requires two conditions to coincide:
+
+1. A representation is shared by **more than one product** (via `IfcRepresentationMap` / `IfcMappedItem`).
+2. At least one of those products has **no material association**, so `get_single_material_association()` returns `express::Base{}` (the null equivalent).
+
+In `advanced_model.ifc`, Body representations like `#449` (Body/Brep) have a single `IfcRepresentationMap` (`#453`) with 13 `IfcMappedItem` usages, meaning 13 products share the geometry. Some of those products (e.g. `IfcFlowTerminal` instances) have no `IfcRelAssociatesMaterial`, so `get_single_material_association` returns `express::Base{}`.
+
+Smaller or simpler models don't hit this because either:
+- Every representation maps to only 1 product → `reuse_ok_` short-circuits at `products.size() == 1` before reaching the material check.
+- Every product has a material association → no null `express::Base` is ever inserted into the set.
+
+## Exact call sequence
+
+```
+Iterator::initialize()
+  try {
+    mapping::get_representations(reps, filters_)
+      addRepresentationsFromDefaultContexts(representations)
+        → collects reps from subcontexts in order:
+          Axis (#115): 143 reps
+          Body (#117): 7550 reps
+          FootPrint (#119): 12 reps
+
+      for (auto representation : representations):
+
+        ── Axis reps (indices 0–142) ──────────────────────────
+        products_represented_by(rep, rmap)
+          → OfProductRepresentation: 1 product each
+        filter_products(products, filters)   → 1 product
+        reuse_ok_(ifcproducts)
+          → products.size() == 1 → return true  ← SHORT-CIRCUIT, no material check
+        representation_mapped_to(rep)        → null (no MappedItem)
+        → task created.  143 tasks accumulated.
+
+        ── First Body rep #449 (Body/Brep) ────────────────────
+        products_represented_by(#449, rmap)
+          → OfProductRepresentation: empty
+          → RepresentationMap: 1 map (#453)
+          → MapUsage: 13 MappedItems → traces through to 13 IfcProducts
+        filter_products(products, filters)   → 13 products
+        reuse_ok_(ifcproducts)               ← CRASH HERE
+          → products.size() == 1?  NO (13 products)
+          → for each product:
+              find_openings(product)          → OK
+              get_single_material_association(product)
+                → some products have no IfcRelAssociatesMaterial
+                → returns express::Base{}     (expired weak_ptr)
+              associated_single_materials.insert(result)
+                → std::set::insert calls operator<
+                → operator< calls data()
+                → data() calls data_.lock() → expired → THROWS
+                  "Trying to access deleted instance reference"
+
+  } catch (const std::exception& e) {
+    Logger::Error(e)       ← exception caught here, get_representations aborted
+  }
+
+  → reps contains only the 143 Axis tasks created before the throw
+  → all 143 Axis reps have Curve2D geometry → map(representation) returns null
+  → no valid elements produced → initialize() returns false
+```
+
+In the old pointer-based code, `reuse_ok_` used `std::set<const IfcUtil::IfcBaseEntity*>` and `get_single_material_association` returned `nullptr`. Inserting `nullptr` into a `std::set<T*>` is a plain pointer comparison — no dereference, no throw. The refactoring to `std::set<express::Base>` changed the comparison from pointer comparison to `express::Base::operator<`, which unconditionally dereferences through `data()`.
+
+## Error
+
+```
+[Error] Trying to access deleted instance reference
+[Notice] Created 143 tasks for 143 products    ← only Axis reps; all Body reps lost
+initialize() returned: False
+```
+
+## Fix
+
+`src/ifcparse/express.h` — use `weak_ptr::lock().get()` instead of `data()` so that expired pointers compare as `nullptr` (matching old raw-pointer semantics):
+
+```diff
+     bool operator<(const Base& other) const {
+-        return data() < other.data();
++        auto a = data_.lock();
++        auto b = other.data_.lock();
++        return a.get() < b.get();
+     }
+
+     bool operator==(const Base& other) const {
+-        return data() == other.data();
++        auto a = data_.lock();
++        auto b = other.data_.lock();
++        return a.get() == b.get();
+     }
+```
+
+# Runtime fix: `entity_instance` missing `get_inverse` due to SWIG `%rename` collision
+
+Accessing inverse attributes (e.g. `element.IsDecomposedBy`) on any entity raises `AttributeError: entity instance of type 'IFC2X3.IfcProject' has no attribute 'get_inverse'`.
+
+## Why
+
+`entity_instance_mixin.__getattr__` (line 106 of `entity_instance.py`) calls `self.get_inverse(name)` when it detects an inverse attribute. Since the mixin inherits into the SWIG-generated `entity_instance` class (via the `object = custom_base` hack in `IfcParseWrapper.i:936`), `self.get_inverse` must resolve to a method on the SWIG class.
+
+However, `IfcParseWrapper.i:70` has a global rename:
+
+```
+%rename("get_inverses_by_declaration") get_inverse;
+```
+
+This was intended for `ifcopenshell::file::get_inverse` (which takes an entity + declaration and returns instances by reference), but SWIG `%rename` is global — it also renames the `%extend express::Base` method `get_inverse(const std::string& a)` at line 551. So the Python-side `entity_instance` class exposes the method as `get_inverses_by_declaration`, not `get_inverse`.
+
+The old code (`v0.8.0`) didn't hit this because `__getattr__` called `self.wrapped_data.get_inverse(name)` on an inner `ifcopenshell_wrapper.entity_instance` object — but in that old layout, the inner object was constructed differently and the rename didn't apply the same way (or the method had a different path). In the new mixin approach, `self` **is** the SWIG object, so the rename is directly visible.
+
+## Fix
+
+`src/ifcwrap/IfcParseWrapper.i` — override the global rename specifically for `express::Base::get_inverse`, restoring the original name on entity instances:
+
+```diff
++%rename("get_inverse") express::Base::get_inverse;
+ %rename("get_inverses_by_declaration") get_inverse;
+```
+
+Add this line **before** the global rename (or anywhere before the `%extend express::Base` block). This scoped rename takes precedence for `express::Base`, so:
+- `entity_instance.get_inverse(name)` works as the mixin expects
+- `file.get_inverses_by_declaration(...)` keeps its intended name
+
+## Python-side workaround
+
+`entity_instance.py:106` — call the method by its SWIG-renamed name:
+
+```diff
+-            vs = self.get_inverse(name)
++            vs = self.get_inverses_by_declaration(name)
+```
+
+# Runtime fix: `entity_instance` class no longer importable from `entity_instance` module
+
+The class rename from `entity_instance` to `entity_instance_mixin` broke external code that does `from ifcopenshell.entity_instance import entity_instance`.
+
+## Error
+
+```
+ImportError: cannot import name 'entity_instance' from 'ifcopenshell.entity_instance'
+```
+
+Triggered at import time via `ifcopenshell.util.pset` (and likely other modules).
+
+## Fix
+
+`src/ifcopenshell-python/ifcopenshell/entity_instance.py` — add a backwards-compatible alias at the bottom of the module:
+
+```python
+entity_instance = entity_instance_mixin
+```
diff --git a/src/ifcopenshell-python/ifcopenshell/__init__.py b/src/ifcopenshell-python/ifcopenshell/__init__.py
index faafc055605..c9cffd5c6bc 100644
--- a/src/ifcopenshell-python/ifcopenshell/__init__.py
+++ b/src/ifcopenshell-python/ifcopenshell/__init__.py
@@ -94,6 +94,9 @@
 from .file import rocksdb_lazy_instance
 from . import guid
 from .ifcopenshell_wrapper import entity_instance
+from .entity_instance import _patch_swig_comparisons
+_patch_swig_comparisons()
+del _patch_swig_comparisons
 from .sql import sqlite, sqlite_entity
 
 # explicitly specify available imported symbols
diff --git a/src/ifcopenshell-python/ifcopenshell/entity_instance.py b/src/ifcopenshell-python/ifcopenshell/entity_instance.py
index 121d0c68843..b7f5d1ac7c6 100644
--- a/src/ifcopenshell-python/ifcopenshell/entity_instance.py
+++ b/src/ifcopenshell-python/ifcopenshell/entity_instance.py
@@ -103,7 +103,7 @@ def __getattr__(self, name: str) -> Any:
             idx = self.get_argument_index(name)
             return self.get_argument(idx)
         elif attr_cat == INVERSE:
-            vs = self.get_inverse(name)
+            vs = self.get_inverses_by_declaration(name)
             if settings.unpack_non_aggregate_inverses:
                 schema_name = self.is_a(True).split(".")[0]
                 ent: ifcopenshell_wrapper.entity
@@ -213,11 +213,17 @@ def __setitem__(self, idx: int, value: T) -> T:
         return value
 
     def __eq__(self, other: entity_instance_mixin) -> bool:
-        if not isinstance(self, type(other)):
+        if other is None or not isinstance(other, entity_instance_mixin):
             return False
         else:
             raise NotImplementedError
 
+    def __ne__(self, other: entity_instance_mixin) -> bool:
+        if other is None or not isinstance(other, entity_instance_mixin):
+            return True
+        else:
+            raise NotImplementedError
+
     def is_entity(self) -> bool:
         """Tests whether the instance is an entity type as opposed to a simple data type.
 
@@ -395,3 +401,45 @@ def get_info_2(
         assert return_type is dict
         assert len(ignore) == 0
         return ifcopenshell_wrapper.get_info_cpp(self, recursive, include_identifier)
+
+
+# Alias for backwards compatibility — external code imports this name.
+entity_instance = entity_instance_mixin
+
+
+# Monkey-patch SWIG's __eq__, __ne__, __lt__ on the generated entity_instance
+# class to guard against None / non-entity arguments. SWIG generates these
+# directly on the class (overriding the mixin), and they pass arguments straight
+# to C++ which rejects null references.
+# Deferred until after ifcopenshell_wrapper finishes loading to avoid circular import.
+_swig_comparisons_patched = False
+
+
+def _patch_swig_comparisons():
+    global _swig_comparisons_patched
+    if _swig_comparisons_patched:
+        return
+    _swig_cls = ifcopenshell_wrapper.entity_instance
+    _orig_eq = _swig_cls.__eq__
+    _orig_ne = _swig_cls.__ne__
+    _orig_lt = _swig_cls.__lt__
+
+    def _safe_eq(self, other):
+        if other is None or not isinstance(other, _swig_cls):
+            return NotImplemented
+        return _orig_eq(self, other)
+
+    def _safe_ne(self, other):
+        if other is None or not isinstance(other, _swig_cls):
+            return NotImplemented
+        return _orig_ne(self, other)
+
+    def _safe_lt(self, other):
+        if other is None or not isinstance(other, _swig_cls):
+            return NotImplemented
+        return _orig_lt(self, other)
+
+    _swig_cls.__eq__ = _safe_eq
+    _swig_cls.__ne__ = _safe_ne
+    _swig_cls.__lt__ = _safe_lt
+    _swig_comparisons_patched = True
diff --git a/src/ifcparse/spf_header.h b/src/ifcparse/spf_header.h
index 9daf4a25e80..efd6304553a 100644
--- a/src/ifcparse/spf_header.h
+++ b/src/ifcparse/spf_header.h
@@ -30,7 +30,7 @@ class file;
 
 class IFC_PARSE_API spf_header {
   private:
-    file* file_;
+    ifcopenshell::file* file_;
 
     std::array<std::shared_ptr<instance_data>, 3> header_entities_;
 
diff --git a/src/ifcparse/storage.h b/src/ifcparse/storage.h
index 06592b84777..38ada04cd81 100644
--- a/src/ifcparse/storage.h
+++ b/src/ifcparse/storage.h
@@ -31,6 +31,7 @@ namespace rocksdb {
 #include <iterator>
 #include <type_traits>
 #include <iostream>
+#include <deque>
 #include <vector>
 #include <deque>
 #include <list>

From 92372f4324979001a6620217f0447f10d3b58020 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 16:30:10 +1000
Subject: [PATCH 03/37] Dump of hello world ifc viewer code

---
 cmake/CMakeLists.txt               |   4 +
 src/ifcviewer/AppSettings.cpp      |  57 +++
 src/ifcviewer/AppSettings.h        |  48 ++
 src/ifcviewer/CMakeLists.txt       |  61 +++
 src/ifcviewer/GeometryStreamer.cpp | 285 ++++++++++++
 src/ifcviewer/GeometryStreamer.h   |  89 ++++
 src/ifcviewer/MainWindow.cpp       | 270 ++++++++++++
 src/ifcviewer/MainWindow.h         |  80 ++++
 src/ifcviewer/README.md            | 129 ++++++
 src/ifcviewer/SettingsWindow.cpp   |  68 +++
 src/ifcviewer/SettingsWindow.h     |  46 ++
 src/ifcviewer/ViewportWindow.cpp   | 674 +++++++++++++++++++++++++++++
 src/ifcviewer/ViewportWindow.h     | 146 +++++++
 src/ifcviewer/main.cpp             |  55 +++
 14 files changed, 2012 insertions(+)
 create mode 100644 src/ifcviewer/AppSettings.cpp
 create mode 100644 src/ifcviewer/AppSettings.h
 create mode 100644 src/ifcviewer/CMakeLists.txt
 create mode 100644 src/ifcviewer/GeometryStreamer.cpp
 create mode 100644 src/ifcviewer/GeometryStreamer.h
 create mode 100644 src/ifcviewer/MainWindow.cpp
 create mode 100644 src/ifcviewer/MainWindow.h
 create mode 100644 src/ifcviewer/README.md
 create mode 100644 src/ifcviewer/SettingsWindow.cpp
 create mode 100644 src/ifcviewer/SettingsWindow.h
 create mode 100644 src/ifcviewer/ViewportWindow.cpp
 create mode 100644 src/ifcviewer/ViewportWindow.h
 create mode 100644 src/ifcviewer/main.cpp

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8a0518c87f1..e06bdc1b45a 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -70,6 +70,7 @@ option(BUILD_EXAMPLES "Build example applications." ON)
 option(BUILD_GEOMSERVER "Build IfcGeomServer executable (Open CASCADE is required)." ON)
 option(BUILD_IFCMAX "Build IfcMax, a 3ds Max plug-in, Windows-only." OFF)
 option(BUILD_QTVIEWER "Build IfcOpenShell Qt GUI Viewer" OFF) # QtViewer requires Qt6
+option(BUILD_IFCVIEWER "Build IfcViewer, a high-performance IFC viewer" OFF) # Requires Qt6 + OpenGL 4.5
 option(BUILD_PACKAGE "" OFF)
 
 option(WITH_OPENCASCADE "Enable geometry interpretation using Open CASCADE" ON)
@@ -713,6 +714,9 @@ if(BUILD_IFCGEOM)
 
     install(TARGETS ${IFCGEOM_SCHEMA_LIBRARIES} ${kernel_libraries} IfcGeom)
 endif(BUILD_IFCGEOM)
+if(BUILD_IFCVIEWER)  # opt-in, controlled by the BUILD_IFCVIEWER option declared above
+    add_subdirectory(../src/ifcviewer ifcviewer)  # explicit binary dir "ifcviewer": the source dir is not below this directory
+endif()
 
 # Cmake uninstall target
 if(NOT TARGET uninstall)
diff --git a/src/ifcviewer/AppSettings.cpp b/src/ifcviewer/AppSettings.cpp
new file mode 100644
index 00000000000..07c5f8c3bc2
--- /dev/null
+++ b/src/ifcviewer/AppSettings.cpp
@@ -0,0 +1,57 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "AppSettings.h"
+
+#include <QSettings>
+
+namespace {
+constexpr auto kGeometryLibraryKey = "geometry/library";  // QSettings key used by load() and persist()
+constexpr auto kGeometryLibraryDefault = "hybrid-cgal-simple-opencascade";  // value load() falls back to when the key is absent
+}  // namespace
+
+AppSettings& AppSettings::instance() {  // accessor for the single shared AppSettings object
+    static AppSettings shared_settings;  // function-local static: constructed on the first call
+    return shared_settings;
+}
+
+AppSettings::AppSettings() {  // declared private in the header; instance() is the only caller
+    this->load();  // fill the in-memory cache from QSettings
+}
+
+QString AppSettings::geometryLibrary() const {  // returns the cached value; QSettings is not consulted here
+    return this->geometry_library_;
+}
+
+void AppSettings::setGeometryLibrary(const QString& value) {
+    if (value == geometry_library_) return;  // unchanged: nothing is written and no signal is emitted
+    geometry_library_ = value;
+    persist();  // write through to QSettings before listeners are notified
+    emit geometryLibraryChanged(value);
+}
+
+void AppSettings::load() {  // refresh the cached geometry library name from persistent storage
+    QSettings store;
+    geometry_library_ = store.value(kGeometryLibraryKey, kGeometryLibraryDefault).toString();
+}
+
+void AppSettings::persist() {  // store the cached geometry library name under kGeometryLibraryKey
+    QSettings store;
+    store.setValue(kGeometryLibraryKey, geometry_library_);
+}
diff --git a/src/ifcviewer/AppSettings.h b/src/ifcviewer/AppSettings.h
new file mode 100644
index 00000000000..9658c10b955
--- /dev/null
+++ b/src/ifcviewer/AppSettings.h
@@ -0,0 +1,48 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef APPSETTINGS_H
+#define APPSETTINGS_H
+
+#include <QObject>
+#include <QString>
+
+// Application-wide preferences. Cached in memory, persisted via QSettings to
+// the OS-native config location (registry on Windows, plist on macOS, INI on
+// Linux). Access via AppSettings::instance().
+class AppSettings : public QObject {
+    Q_OBJECT
+public:
+    static AppSettings& instance();  // the single shared instance
+
+    QString geometryLibrary() const;  // cached value; does not read QSettings
+    void setGeometryLibrary(const QString& value);  // caches, persists and notifies; does nothing when the value is unchanged
+
+signals:
+    void geometryLibraryChanged(const QString& value);  // emitted by setGeometryLibrary() after an actual change
+
+private:
+    AppSettings();  // private: obtain the object through instance()
+    void load();  // reads geometry_library_ from QSettings (with a built-in default)
+    void persist();  // writes geometry_library_ to QSettings
+
+    QString geometry_library_;  // in-memory copy of the "geometry/library" setting
+};
+
+#endif // APPSETTINGS_H
diff --git a/src/ifcviewer/CMakeLists.txt b/src/ifcviewer/CMakeLists.txt
new file mode 100644
index 00000000000..9f1c4dac502
--- /dev/null
+++ b/src/ifcviewer/CMakeLists.txt
@@ -0,0 +1,61 @@
+################################################################################
+#                                                                              #
+# This file is part of IfcOpenShell.                                           #
+#                                                                              #
+# IfcOpenShell is free software: you can redistribute it and/or modify         #
+# it under the terms of the Lesser GNU General Public License as published by  #
+# the Free Software Foundation, either version 3.0 of the License, or          #
+# (at your option) any later version.                                          #
+#                                                                              #
+# IfcOpenShell is distributed in the hope that it will be useful,              #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 #
+# Lesser GNU General Public License for more details.                          #
+#                                                                              #
+# You should have received a copy of the Lesser GNU General Public License     #
+# along with this program. If not, see <http://www.gnu.org/licenses/>.         #
+#                                                                              #
+################################################################################
+
+message("Running CMakeLists.txt in /src/ifcviewer")
+
+set(QT_VERSION 6 CACHE STRING "Qt version")
+# IfcViewer always needs OpenGL in addition to Core/Gui/Widgets. We don't use
+# the CACHE'd QT_COMPONENTS here because it may have been set by another target
+# (e.g. qtviewer) without the OpenGL component.
+find_package(Qt${QT_VERSION} COMPONENTS Core Gui Widgets OpenGL REQUIRED PATHS ${QT_DIR})
+
+find_package(OpenGL REQUIRED)  # provides the OpenGL::GL target linked below
+
+file(GLOB IFCVIEWER_CPP_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)  # every .cpp in this directory
+file(GLOB IFCVIEWER_H_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.h)  # every .h in this directory
+set(IFCVIEWER_FILES ${IFCVIEWER_CPP_FILES} ${IFCVIEWER_H_FILES})
+
+add_executable(IfcViewer ${IFCVIEWER_FILES})
+
+set_target_properties(IfcViewer PROPERTIES
+    AUTOMOC ON  # let CMake run Qt's moc for the Q_OBJECT classes
+    WIN32_EXECUTABLE ON  # GUI (WinMain) executable on Windows
+    MACOSX_BUNDLE ON  # application bundle on macOS
+)
+
+target_link_libraries(IfcViewer PRIVATE
+    IfcGeom
+    IfcParse
+    ${kernel_libraries}
+    ${OpenCASCADE_LIBRARIES}
+    ${Boost_LIBRARIES}
+    ${CGAL_LIBRARIES}
+    Qt${QT_VERSION}::Core
+    Qt${QT_VERSION}::Gui
+    Qt${QT_VERSION}::Widgets
+    Qt${QT_VERSION}::OpenGL
+    OpenGL::GL
+)
+
+if(UNIX AND NOT APPLE)  # non-Apple Unix only: link the threading library explicitly
+    find_package(Threads REQUIRED)
+    target_link_libraries(IfcViewer PRIVATE Threads::Threads)
+endif()
+
+install(TARGETS IfcViewer EXPORT ${IFCOPENSHELL_EXPORT_TARGETS})
diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
new file mode 100644
index 00000000000..39698c84e62
--- /dev/null
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -0,0 +1,285 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "GeometryStreamer.h"
+#include "AppSettings.h"
+#include "../ifcgeom/hybrid_kernel.h"
+
+#include <thread>
+#include <unordered_map>
+#include <cmath>
+#include <cstring>
+#include <algorithm>
+
+GeometryStreamer::GeometryStreamer(QObject* parent)  // no work here; loading starts with loadFile()
+    : QObject{parent}
+{
+}
+
+GeometryStreamer::~GeometryStreamer() {
+    cancel();  // ask a load that is still running to leave its loop
+    if (QThread* worker = worker_thread_.get(); worker != nullptr && worker->isRunning()) {
+        worker->quit();
+        worker->wait();  // block until the worker thread has exited
+    }
+}
+
+void GeometryStreamer::loadFile(const std::string& path, int num_threads) {
+    // Streams `path` on a fresh worker thread; a load still in flight is cancelled and joined first.
+    if (running_.load()) {
+        cancel();
+        if (worker_thread_ && worker_thread_->isRunning()) {
+            worker_thread_->quit();
+            worker_thread_->wait();
+        }
+    }
+
+    cancel_requested_ = false;
+    running_ = true;
+    progress_ = 0;
+    next_object_id_ = 1;
+    {
+        std::lock_guard<std::mutex> lock(elements_mutex_);
+        pending_elements_.clear();
+    }
+
+    // num_threads <= 0 selects the hardware thread count (at least one).
+    if (num_threads <= 0) num_threads = std::max(1u, std::thread::hardware_concurrency());
+
+    worker_thread_ = std::make_unique<QThread>();
+    QThread* const thread = worker_thread_.get();
+    QObject* context = new QObject();
+    context->moveToThread(thread);
+    // `context` lives on the worker thread, so this slot (and therefore run()) executes there.
+    connect(thread, &QThread::started, context, [this, path, num_threads, context, thread]() {
+        run(path, num_threads);
+        context->deleteLater();
+        thread->quit();
+    });
+    connect(thread, &QThread::finished, this, [this, thread]() {
+        // finished() from a superseded worker may still be queued; it must not end the newer load.
+        if (worker_thread_.get() != thread) return;
+        running_ = false;
+        emit finished();
+    });
+    worker_thread_->start();
+}
+
+void GeometryStreamer::cancel() {  // cooperative: run() checks the flag once per iterated element
+    cancel_requested_.store(true);
+}
+
+std::vector<ElementInfo> GeometryStreamer::drainElements() {
+    std::lock_guard<std::mutex> guard(elements_mutex_);  // run() appends under the same mutex
+    std::vector<ElementInfo> drained(std::move(pending_elements_));
+    pending_elements_.clear();  // make the emptied state of the moved-from vector explicit
+    return drained;
+}
+
+void GeometryStreamer::run(const std::string& path, int num_threads) {  // load body: parse, iterate geometry, emit chunks
+    try {
+        ifc_file_ = std::make_unique<IfcParse::IfcFile>(path);  // NOTE(review): replaces the file exposed by ifcFile() — confirm no reader is using the old one
+    } catch (const std::exception& e) {
+        emit errorOccurred(QString("Failed to parse IFC file: %1").arg(e.what()));
+        return;
+    }
+
+    ifcopenshell::geometry::Settings settings;
+    settings.set("use-world-coords", true);
+    settings.set("weld-vertices", false);
+    settings.set("apply-default-materials", true);
+
+    std::unique_ptr<IfcGeom::Iterator> iterator;
+    try {
+        const std::string geometry_library =
+            AppSettings::instance().geometryLibrary().toStdString();  // kernel name comes from the user preference
+        auto kernel = ifcopenshell::geometry::kernels::construct(
+            ifc_file_.get(), geometry_library, settings);
+        iterator = std::make_unique<IfcGeom::Iterator>(
+            std::move(kernel), settings, ifc_file_.get(), std::vector<IfcGeom::filter_t>(), num_threads);  // empty filter list
+    } catch (const std::exception& e) {
+        emit errorOccurred(QString("Failed to create geometry iterator: %1").arg(e.what()));
+        return;
+    }
+
+    if (!iterator->initialize()) {
+        emit errorOccurred("No geometry found in IFC file");
+        return;
+    }
+
+    int last_progress = 0;  // last value reported through progressChanged()
+
+    do {
+        if (cancel_requested_.load()) break;  // set by cancel()
+
+        const IfcGeom::Element* elem = iterator->get();
+        if (!elem) continue;  // in a do/while, continue still evaluates iterator->next()
+
+        const auto* tri_elem = dynamic_cast<const IfcGeom::TriangulationElement*>(elem);
+        if (!tri_elem) continue;  // only triangulated elements are streamed
+
+        uint32_t object_id = next_object_id_++;
+
+        // Queue the element's metadata; the consumer collects it via drainElements()
+        ElementInfo info;
+        info.object_id = object_id;
+        info.ifc_id = tri_elem->id();
+        info.guid = tri_elem->guid();
+        info.name = tri_elem->name();
+        info.type = tri_elem->type();
+        info.parent_id = tri_elem->parent_id();
+
+        {
+            std::lock_guard<std::mutex> lock(elements_mutex_);
+            pending_elements_.push_back(std::move(info));
+        }
+
+        // Convert geometry to upload chunk; chunks without indices are not emitted
+        UploadChunk chunk = convertElement(tri_elem, object_id);
+        if (!chunk.indices.empty()) {
+            emit elementReady(std::move(chunk));
+        }
+
+        int p = iterator->progress();
+        if (p != last_progress) {  // emit only when the reported value changes
+            last_progress = p;
+            progress_ = p;
+            emit progressChanged(p);
+        }
+    } while (iterator->next());
+
+    progress_ = 100;  // also reached after a cancel (break above)
+    emit progressChanged(100);
+}
+
+static MaterialInfo materialFromStyle(const ifcopenshell::geometry::taxonomy::style::ptr& style) {
+    MaterialInfo material;  // a null style returns MaterialInfo's defaults untouched
+    if (style) {
+        // Copy the RGB channels only when the style reports a colour.
+        const auto& rgb = style->get_color();
+        if (rgb) {
+            material.r = static_cast<float>(rgb.r());
+            material.g = static_cast<float>(rgb.g());
+            material.b = static_cast<float>(rgb.b());
+        }
+        // Alpha is the complement of transparency; a NaN transparency leaves the default alpha.
+        if (!std::isnan(style->transparency)) material.a = 1.0f - static_cast<float>(style->transparency);
+    }
+    return material;
+}
+
+static inline uint32_t packRGBA8(const MaterialInfo& m) {
+    // Quantise one channel: clamp to [0, 1], scale to 0..255, round to nearest.
+    const auto quantise = [](float channel) -> uint32_t {
+        return static_cast<uint32_t>(std::clamp(channel, 0.0f, 1.0f) * 255.0f + 0.5f);
+    };
+    const uint32_t red = quantise(m.r);
+    const uint32_t green = quantise(m.g);
+    const uint32_t blue = quantise(m.b);
+    const uint32_t alpha = quantise(m.a);
+    // Red occupies the lowest byte, so on a little-endian machine the bytes read
+    // [r, g, b, a] — what the GL_UNSIGNED_BYTE * 4 normalized vertex attribute expects.
+    return red | (green << 8) | (blue << 16) | (alpha << 24);
+}
+
+UploadChunk GeometryStreamer::convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id) {  // builds the interleaved vertex/index chunk for one element
+    UploadChunk chunk;
+    chunk.object_id = object_id;
+
+    const auto& geom = elem->geometry();
+    const auto& verts = geom.verts();
+    const auto& faces = geom.faces();
+    const auto& normals = geom.normals();
+    const auto& materials = geom.materials();
+    const auto& material_ids = geom.material_ids();
+
+    if (verts.empty() || faces.empty()) return chunk;  // empty chunk; run() skips chunks without indices
+
+    // Encode object_id as float bits for the vertex attribute
+    float id_as_float;
+    static_assert(sizeof(float) == sizeof(uint32_t));
+    std::memcpy(&id_as_float, &object_id, sizeof(float));  // bit copy, not a numeric conversion
+
+    const size_t num_verts = verts.size() / 3;  // verts is read as x, y, z triples
+    const size_t num_tris = faces.size() / 3;  // faces is read as three indices per triangle
+    const bool have_per_tri_material = (material_ids.size() == num_tris);  // otherwise every triangle uses mat_id -1
+
+    // Per-vertex color requires that any vertex shared between triangles with
+    // *different* materials be split. We dedupe (orig_vert_idx, mat_id) pairs
+    // so vertices that are only ever used by one material stay shared.
+    auto make_key = [](uint32_t orig_idx, int mat_id) -> uint64_t {
+        return (static_cast<uint64_t>(orig_idx) << 32) |
+               static_cast<uint32_t>(mat_id);  // high 32 bits: vertex index, low 32 bits: material id
+    };
+
+    std::unordered_map<uint64_t, uint32_t> remap;  // (vertex, material) key -> index in chunk.vertices
+    remap.reserve(num_verts);
+
+    chunk.vertices.reserve(num_verts * 8);  // 8 floats per vertex: position(3) normal(3) object id(1) colour(1)
+    chunk.indices.reserve(faces.size());
+
+    auto emit_vertex = [&](uint32_t orig_idx, int mat_id) -> uint32_t {  // returns the output index, appending the vertex on first use
+        const uint64_t key = make_key(orig_idx, mat_id);
+        auto it = remap.find(key);
+        if (it != remap.end()) return it->second;
+
+        const uint32_t new_idx = static_cast<uint32_t>(chunk.vertices.size() / 8);
+
+        // pos
+        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 0]));
+        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 1]));
+        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 2]));
+
+        // normal
+        if (orig_idx * 3 + 2 < normals.size()) {
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 0]));
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 1]));
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 2]));
+        } else {  // no normal stored for this vertex: fall back to (0, 1, 0)
+            chunk.vertices.push_back(0.0f);
+            chunk.vertices.push_back(1.0f);
+            chunk.vertices.push_back(0.0f);
+        }
+
+        // object_id (float bits)
+        chunk.vertices.push_back(id_as_float);
+
+        // color (packed RGBA8 reinterpreted as float)
+        MaterialInfo m;
+        if (mat_id >= 0 && mat_id < static_cast<int>(materials.size())) {  // out-of-range ids (including -1) keep MaterialInfo's defaults
+            m = materialFromStyle(materials[mat_id]);
+        }
+        uint32_t packed = packRGBA8(m);
+        float packed_as_float;
+        std::memcpy(&packed_as_float, &packed, sizeof(float));
+        chunk.vertices.push_back(packed_as_float);
+
+        remap.emplace(key, new_idx);
+        return new_idx;
+    };
+
+    for (size_t t = 0; t < num_tris; ++t) {
+        const int mat_id = have_per_tri_material ? material_ids[t] : -1;
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 0]), mat_id));
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 1]), mat_id));
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 2]), mat_id));
+    }
+
+    return chunk;
+}
diff --git a/src/ifcviewer/GeometryStreamer.h b/src/ifcviewer/GeometryStreamer.h
new file mode 100644
index 00000000000..06b6364a244
--- /dev/null
+++ b/src/ifcviewer/GeometryStreamer.h
@@ -0,0 +1,89 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef GEOMETRYSTREAMER_H
+#define GEOMETRYSTREAMER_H
+
+#include <QObject>
+#include <QThread>
+
+#include <string>
+#include <vector>
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <deque>
+
+#include "../ifcparse/IfcFile.h"
+#include "../ifcgeom/Iterator.h"
+
+#include "ViewportWindow.h"
+
+struct ElementInfo {
+    uint32_t object_id;  // compact id assigned by GeometryStreamer::run(), counting up from 1
+    int ifc_id;  // the geometry element's id()
+    std::string guid;  // the geometry element's guid()
+    std::string name;  // the geometry element's name(); may be empty
+    std::string type;  // the geometry element's type()
+    int parent_id;  // the geometry element's parent_id(); MainWindow matches it against other elements' ifc_id
+};
+
+class GeometryStreamer : public QObject {
+    Q_OBJECT
+public:
+    explicit GeometryStreamer(QObject* parent = nullptr);
+    ~GeometryStreamer();  // cancels and joins a running worker thread
+
+    void loadFile(const std::string& path, int num_threads = 0);  // num_threads <= 0: use the hardware thread count
+    void cancel();  // only sets a flag; run() polls it once per element
+
+    bool isRunning() const { return running_.load(); }
+    int progress() const { return progress_.load(); }  // last value published by run()
+
+    IfcParse::IfcFile* ifcFile() const { return ifc_file_.get(); }  // null until run() has parsed a file
+
+    // Thread-safe access to discovered elements
+    std::vector<ElementInfo> drainElements();  // returns and clears everything queued since the previous call
+
+signals:
+    void progressChanged(int percent);
+    void elementReady(UploadChunk chunk);  // one chunk per element that produced indices
+    void finished();  // emitted from the worker thread's finished() handler
+    void errorOccurred(const QString& message);
+
+private:
+    void run(const std::string& path, int num_threads);  // load body, invoked from the worker thread's started() handler
+
+    UploadChunk convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id);
+
+    std::unique_ptr<IfcParse::IfcFile> ifc_file_;  // assigned in run()
+    std::unique_ptr<QThread> worker_thread_;  // recreated by every loadFile() call
+    std::atomic<bool> running_{false};
+    std::atomic<bool> cancel_requested_{false};
+    std::atomic<int> progress_{0};
+
+    std::mutex elements_mutex_;  // guards pending_elements_
+    std::vector<ElementInfo> pending_elements_;  // filled by run(), emptied by drainElements()
+
+    // Next compact object_id handed out by run(); loadFile() resets it to 1
+    uint32_t next_object_id_ = 1; // 0 = no object
+};
+
+#endif // GEOMETRYSTREAMER_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
new file mode 100644
index 00000000000..1f32ce0877c
--- /dev/null
+++ b/src/ifcviewer/MainWindow.cpp
@@ -0,0 +1,270 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "MainWindow.h"
+#include "SettingsWindow.h"
+
+#include <QApplication>
+#include <QMenuBar>
+#include <QFileDialog>
+#include <QMessageBox>
+#include <QStatusBar>
+#include <QHeaderView>
+#include <QVBoxLayout>
+#include <QDockWidget>
+
+MainWindow::MainWindow(QWidget* parent)
+    : QMainWindow(parent)
+{
+    setupUi();
+    setupMenus();
+    // Every streamer connection is explicitly queued; run() emits from the worker thread.
+    constexpr Qt::ConnectionType queued = Qt::QueuedConnection;
+    streamer_ = new GeometryStreamer(this);
+    connect(streamer_, &GeometryStreamer::progressChanged, this, &MainWindow::onProgressChanged, queued);
+    connect(streamer_, &GeometryStreamer::elementReady, this, &MainWindow::onElementReady, queued);
+    connect(streamer_, &GeometryStreamer::finished, this, &MainWindow::onStreamingFinished, queued);
+    const auto show_error = [this](const QString& text) { QMessageBox::warning(this, "Error", text); };
+    connect(streamer_, &GeometryStreamer::errorOccurred, this, show_error, queued);
+    // Element metadata is not signalled; it is polled every 100 ms while the timer runs.
+    connect(&element_poll_timer_, &QTimer::timeout, this, &MainWindow::pollNewElements);
+    element_poll_timer_.setInterval(100);
+
+    setWindowTitle("IfcViewer");
+    resize(1400, 900);
+}
+
+MainWindow::~MainWindow() = default;  // nothing to release by hand here
+
+void MainWindow::setupUi() {  // builds the central viewport, both docks and the status bar
+    // 3D Viewport as central widget
+    viewport_ = new ViewportWindow();
+    viewport_container_ = QWidget::createWindowContainer(viewport_, this);  // wraps the viewport in a QWidget so it can be the central widget
+    viewport_container_->setMinimumSize(400, 300);
+    viewport_container_->setFocusPolicy(Qt::StrongFocus);
+    setCentralWidget(viewport_container_);
+
+    connect(viewport_, &ViewportWindow::objectPicked, this, &MainWindow::onObjectPicked);
+
+    // Element tree dock
+    auto* tree_dock = new QDockWidget("Elements", this);
+    tree_dock->setAllowedAreas(Qt::LeftDockWidgetArea | Qt::RightDockWidgetArea);
+    element_tree_ = new QTreeWidget();
+    element_tree_->setHeaderLabels({"Name", "Type", "GUID"});  // same column order pollNewElements() fills
+    element_tree_->setColumnWidth(0, 200);
+    element_tree_->setColumnWidth(1, 120);
+    element_tree_->setSelectionMode(QAbstractItemView::SingleSelection);
+    connect(element_tree_, &QTreeWidget::itemSelectionChanged, this, &MainWindow::onTreeSelectionChanged);
+    tree_dock->setWidget(element_tree_);
+    addDockWidget(Qt::LeftDockWidgetArea, tree_dock);  // starts docked on the left
+
+    // Properties dock
+    auto* prop_dock = new QDockWidget("Properties", this);
+    prop_dock->setAllowedAreas(Qt::LeftDockWidgetArea | Qt::RightDockWidgetArea);
+    property_table_ = new QTableWidget();
+    property_table_->setColumnCount(2);
+    property_table_->setHorizontalHeaderLabels({"Property", "Value"});
+    property_table_->horizontalHeader()->setStretchLastSection(true);
+    property_table_->setEditTriggers(QAbstractItemView::NoEditTriggers);  // read-only table
+    property_table_->setSelectionBehavior(QAbstractItemView::SelectRows);
+    prop_dock->setWidget(property_table_);
+    addDockWidget(Qt::RightDockWidgetArea, prop_dock);  // starts docked on the right
+
+    // Status bar with progress
+    progress_bar_ = new QProgressBar();
+    progress_bar_->setMaximumWidth(200);
+    progress_bar_->setVisible(false);  // shown by openFile(), hidden again by onStreamingFinished()
+    status_label_ = new QLabel("Ready");
+    statusBar()->addWidget(status_label_, 1);
+    statusBar()->addPermanentWidget(progress_bar_);
+}
+
+void MainWindow::setupMenus() {  // builds the File menu: Open, Settings, separator, Quit
+    QMenu* const menu = menuBar()->addMenu("&File");
+    QAction* const open_entry = menu->addAction("&Open...", this, &MainWindow::onFileOpen);
+    open_entry->setShortcut(QKeySequence::Open);
+    menu->addAction("&Settings...", this, &MainWindow::onFileSettings);
+    menu->addSeparator();
+    menu->addAction("&Quit", QKeySequence::Quit, qApp, &QApplication::quit);
+}
+
+void MainWindow::onFileOpen() {
+    const QString chosen = QFileDialog::getOpenFileName(
+        this, "Open IFC File", QString(), "IFC Files (*.ifc *.ifcxml *.ifczip);;All Files (*)");
+    if (chosen.isEmpty()) return;  // empty path: nothing to open
+    openFile(chosen);
+}
+
+void MainWindow::onFileSettings() {
+    // The settings window is created on first use and reused afterwards.
+    if (!settings_) settings_ = new SettingsWindow(this);
+
+    settings_->open();
+    settings_->activateWindow();
+    settings_->raise();
+}
+
+void MainWindow::openFile(const QString& path) {  // resets all per-file UI state, then starts streaming `path`
+    viewport_->resetScene();
+    element_tree_->clear();
+    property_table_->setRowCount(0);
+    element_map_.clear();
+    tree_items_.clear();  // pointed at the items just removed by element_tree_->clear()
+    ifc_id_to_object_id_.clear();
+
+    progress_bar_->setValue(0);
+    progress_bar_->setVisible(true);
+    status_label_->setText("Loading: " + path);
+
+    load_timer_.restart();  // read back in onStreamingFinished() for the elapsed time
+    element_poll_timer_.start();  // pollNewElements() now runs periodically until streaming finishes
+    streamer_->loadFile(path.toStdString());  // num_threads left at its default
+}
+
+void MainWindow::onProgressChanged(int percent) {  // slot for GeometryStreamer::progressChanged
+    progress_bar_->setValue(percent);
+}
+
+void MainWindow::onElementReady(UploadChunk chunk) {  // slot for GeometryStreamer::elementReady (queued connection)
+    viewport_->uploadChunk(chunk);
+}
+
+void MainWindow::onStreamingFinished() {
+    // The streamer is done: stop polling, then pick up whatever is still queued.
+    element_poll_timer_.stop();
+    pollNewElements();
+    progress_bar_->setVisible(false);
+
+    // Loads under one second are reported in milliseconds, longer ones in seconds (2 decimals).
+    const qint64 elapsed_ms = load_timer_.elapsed();
+    QString duration = QString::number(elapsed_ms) + " ms";
+    if (elapsed_ms >= 1000) duration = QString::number(elapsed_ms / 1000.0, 'f', 2) + " s";
+
+    const QString summary = QString("Loaded %1 elements in %2").arg(element_map_.size()).arg(duration);
+    status_label_->setText(summary);
+}
+
+void MainWindow::onObjectPicked(uint32_t object_id) {
+    viewport_->setSelectedObjectId(object_id);
+
+    // Mirror the pick in the tree, with the tree's signals blocked while the current item changes.
+    const auto found = tree_items_.find(object_id);
+    const bool in_tree = (found != tree_items_.end());
+    if (in_tree) {
+        element_tree_->blockSignals(true);
+        element_tree_->setCurrentItem(found->second);
+        element_tree_->blockSignals(false);
+    }
+    populateProperties(object_id);
+}
+
+void MainWindow::onTreeSelectionChanged() {
+    const auto selected = element_tree_->selectedItems();
+    if (selected.isEmpty()) return;  // empty selection: viewport and properties stay as they are
+    // pollNewElements() stored the object id on column 0 under Qt::UserRole.
+    const uint32_t picked_id = selected.first()->data(0, Qt::UserRole).toUInt();
+    viewport_->setSelectedObjectId(picked_id);
+    populateProperties(picked_id);
+}
+
+void MainWindow::pollNewElements() {
+    // Takes everything the streamer queued since the previous poll and adds it to the
+    // lookup maps and to the element tree.
+    auto drained = streamer_->drainElements();
+    for (auto& element : drained) {
+        element_map_[element.object_id] = element;
+        ifc_id_to_object_id_[element.ifc_id] = element.object_id;
+
+        // Resolve the parent row: IFC parent id -> object id -> tree item. When either
+        // lookup misses, the new row becomes a top-level item of the tree.
+        QTreeWidgetItem* parent_row = nullptr;
+        const auto parent_object = ifc_id_to_object_id_.find(element.parent_id);
+        if (parent_object != ifc_id_to_object_id_.end()) {
+            const auto parent_entry = tree_items_.find(parent_object->second);
+            if (parent_entry != tree_items_.end()) parent_row = parent_entry->second;
+        }
+
+        // Elements without a name are labelled "<type> #<ifc id>".
+        QString label = QString::fromStdString(element.name);
+        if (label.isEmpty()) label = QString::fromStdString(element.type) + " #" + QString::number(element.ifc_id);
+
+        // The item is constructed with its parent: another item, or the tree itself.
+        QTreeWidgetItem* row = parent_row
+            ? new QTreeWidgetItem(parent_row)
+            : new QTreeWidgetItem(element_tree_);
+        row->setText(0, label);
+        row->setText(1, QString::fromStdString(element.type));
+        row->setText(2, QString::fromStdString(element.guid));
+
+        // Column 0 / Qt::UserRole carries the object id read back in onTreeSelectionChanged().
+        row->setData(0, Qt::UserRole, element.object_id);
+
+        tree_items_[element.object_id] = row;
+    }
+}
+
+void MainWindow::populateProperties(uint32_t object_id) {  // refills the property table for one element
+    property_table_->setRowCount(0);  // always start from an empty table
+    if (object_id == 0) return;  // id 0 is handled as "nothing selected": the table stays empty
+
+    auto it = element_map_.find(object_id);
+    if (it == element_map_.end()) return;  // metadata not known here: leave the table empty
+
+    const auto& info = it->second;
+
+    auto addRow = [this](const QString& key, const QString& value) {  // appends one key/value row
+        int row = property_table_->rowCount();
+        property_table_->insertRow(row);
+        property_table_->setItem(row, 0, new QTableWidgetItem(key));
+        property_table_->setItem(row, 1, new QTableWidgetItem(value));
+    };
+
+    addRow("IFC ID", QString::number(info.ifc_id));
+    addRow("GUID", QString::fromStdString(info.guid));
+    addRow("Name", QString::fromStdString(info.name));
+    addRow("Type", QString::fromStdString(info.type));
+
+    // Without a parsed file only the four cached rows above are shown
+    auto* file = streamer_->ifcFile();
+    if (!file) return;
+
+    auto* product = file->instance_by_id(info.ifc_id);  // NOTE(review): not inside a try block — confirm it cannot throw for an unknown id
+    if (!product) return;
+
+    // Show all direct attributes
+    auto& decl = product->declaration();
+    if (auto* entity = decl.as_entity()) {
+        for (size_t i = 0; i < entity->attribute_count(); ++i) {
+            auto* attr = entity->attribute_by_index(i);
+            try {
+                auto val = product->get_attribute_value(i);
+                if (!val.isNull()) {  // attributes without a value get no row
+                    std::string str_val;
+                    try {
+                        str_val = static_cast<std::string>(val);
+                    } catch (...) {
+                        // Not a string-convertible attribute (entity ref, aggregate, etc.)
+                        str_val = "<" + std::string(IfcUtil::ArgumentTypeToString(val.type())) + ">";  // show the type name as a placeholder
+                    }
+                    addRow(QString::fromStdString(attr->name()), QString::fromStdString(str_val));
+                }
+            } catch (...) {}  // best effort: an attribute that cannot be read is skipped
+        }
+    }
+}
diff --git a/src/ifcviewer/MainWindow.h b/src/ifcviewer/MainWindow.h
new file mode 100644
index 00000000000..d5f4c18a395
--- /dev/null
+++ b/src/ifcviewer/MainWindow.h
@@ -0,0 +1,80 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef MAINWINDOW_H
+#define MAINWINDOW_H
+
+#include <QMainWindow>
+#include <QTreeWidget>
+#include <QTableWidget>
+#include <QProgressBar>
+#include <QLabel>
+#include <QSplitter>
+#include <QTimer>
+#include <QElapsedTimer>
+
+#include <unordered_map>
+
+#include "ViewportWindow.h"
+#include "GeometryStreamer.h"
+
+class SettingsWindow;
+
+// Qt main window of the viewer. Declares the slots that react to file
+// actions, streaming progress and selection changes, and holds pointers to
+// the viewport, element tree, property table and status widgets.
+class MainWindow : public QMainWindow {
+    Q_OBJECT
+public:
+    explicit MainWindow(QWidget* parent = nullptr);
+    ~MainWindow();
+
+    // Open the file at `path` (defined in MainWindow.cpp).
+    void openFile(const QString& path);
+
+private slots:
+    // Handlers for the File menu entries (judging by the names: Open / Settings).
+    void onFileOpen();
+    void onFileSettings();
+    // Streaming callbacks; `percent` and `chunk` are supplied by the emitter.
+    void onProgressChanged(int percent);
+    void onElementReady(UploadChunk chunk);
+    void onStreamingFinished();
+    // Selection handlers: one driven by an object id, one by the tree widget.
+    void onObjectPicked(uint32_t object_id);
+    void onTreeSelectionChanged();
+    // NOTE(review): presumably driven by element_poll_timer_ - confirm in the .cpp.
+    void pollNewElements();
+
+private:
+    void setupUi();
+    void setupMenus();
+    // Refresh the property view for `object_id` (defined in MainWindow.cpp).
+    void populateProperties(uint32_t object_id);
+
+    // Widget/window pointers; all start out null.
+    ViewportWindow* viewport_ = nullptr;
+    SettingsWindow* settings_ = nullptr;
+    QWidget* viewport_container_ = nullptr;
+    QTreeWidget* element_tree_ = nullptr;
+    QTableWidget* property_table_ = nullptr;
+    QProgressBar* progress_bar_ = nullptr;
+    QLabel* status_label_ = nullptr;
+    QTimer element_poll_timer_;
+    QElapsedTimer load_timer_;
+
+    GeometryStreamer* streamer_ = nullptr;
+
+    // Lookup tables keyed by the viewer's object id:
+    //   element_map_          object_id -> ElementInfo
+    //   tree_items_           object_id -> tree item
+    // and the reverse index:
+    //   ifc_id_to_object_id_  int key (an IFC id, going by the name) -> object_id
+    std::unordered_map<uint32_t, ElementInfo> element_map_;
+    std::unordered_map<uint32_t, QTreeWidgetItem*> tree_items_;
+    std::unordered_map<int, uint32_t> ifc_id_to_object_id_;
+};
+
+#endif // MAINWINDOW_H
diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
new file mode 100644
index 00000000000..b9194cefd1a
--- /dev/null
+++ b/src/ifcviewer/README.md
@@ -0,0 +1,129 @@
+# IfcViewer
+
+A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine with a Qt6 interface and OpenGL 4.5 rendering.
+
+## Architecture
+
+```
++-------------------------------------------+
+|  Qt6 Application (MainWindow)             |
+|  +----------+ +--------------------------+|
+|  | Element  | | 3D Viewport              ||
+|  | Tree     | | (QWindow + OpenGL 4.5)   ||
+|  |          | |                          ||
+|  +----------+ | Single VBO/EBO           ||
+|  | Property | | DrawElementsBaseVertex   ||
+|  | Table    | | GPU pick pass            ||
+|  +----------+ +--------------------------+|
+|  | Status / Progress                      |
++-------------------------------------------+
+        ^                    ^
+        |                    |
+  element metadata     UploadChunks
+        |                    |
++-------------------------------------------+
+|  GeometryStreamer (background QThread)     |
+|  IfcGeom::Iterator with N threads         |
+|  (one per CPU core by default)            |
++-------------------------------------------+
+```
+
+### Key design decisions
+
+- **QWindow viewport** embedded via `QWidget::createWindowContainer()`. This gives us a raw native surface for OpenGL, bypassing `QOpenGLWidget`'s compositor overhead.
+- **One big vertex buffer + index buffer** (64 MB + 32 MB initial). Geometry is appended as it streams in. No per-object VBOs, no rebinding.
+- **Interleaved vertex format**: position (3 floats) + normal (3 floats) + object ID (1 float, bitcast uint32) + color (RGBA8, 4 normalized bytes) = 32 bytes per vertex.
+- **GPU object picking**: a second render pass writes object IDs to an R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
+- **Multi-threaded tessellation**: `IfcGeom::Iterator` runs on a background thread and internally parallelizes geometry conversion across all CPU cores.
+- **Non-blocking streaming**: the iterator emits `UploadChunk` signals via Qt's queued connection. The main thread uploads to the GPU without blocking iteration.
+- **World coordinates**: geometry is emitted in world space (`use-world-coords=true`) so no per-object transform matrices are needed on the GPU.
+
+### Files
+
+| File | Purpose |
+|------|---------|
+| `main.cpp` | Application entry point, GL 4.5 surface format, CLI argument parsing |
+| `MainWindow.h/cpp` | Qt main window: dockable element tree, property table, status bar, menus |
+| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, picking |
+| `GeometryStreamer.h/cpp` | Background geometry processing: loads IFC, runs iterator, emits chunks |
+| `CMakeLists.txt` | Build configuration |
+
+## Dependencies
+
+- **Qt6** (Core, Gui, Widgets)
+- **OpenGL 4.5** (GL_ARB_direct_state_access) - available on Windows and Linux; macOS will need a Vulkan/MoltenVK backend (not yet implemented)
+- **IfcOpenShell C++ libraries** (IfcParse, IfcGeom, and their dependencies: Open CASCADE, Boost, Eigen3, optionally CGAL)
+
+## Building
+
+IfcViewer is built as part of the IfcOpenShell CMake project. You do not need to build everything - disable the targets you don't need.
+
+### Minimal build (IfcViewer only)
+
+From the repository root:
+
+```sh
+mkdir build && cd build
+
+cmake ../cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_IFCVIEWER=ON \
+    -DBUILD_CONVERT=OFF \
+    -DBUILD_IFCPYTHON=OFF \
+    -DBUILD_GEOMSERVER=OFF \
+    -DBUILD_DOCUMENTATION=OFF \
+    -DBUILD_EXAMPLES=OFF \
+    -DCOLLADA_SUPPORT=OFF \
+    -DGLTF_SUPPORT=OFF \
+    -DHDF5_SUPPORT=OFF
+
+make -j$(nproc) IfcViewer
+```
+
+This builds only IfcParse, IfcGeom (with geometry kernels), and IfcViewer itself. All other targets (IfcConvert, Python bindings, serializers, etc.) are skipped.
+
+If Qt6 is not in a standard location, pass `-DQT_DIR=/path/to/qt6`.
+
+### Full build with IfcViewer enabled
+
+```sh
+cmake ../cmake -DBUILD_IFCVIEWER=ON
+make -j$(nproc)
+```
+
+## Usage
+
+```sh
+# Open a file directly
+./IfcViewer model.ifc
+
+# Or use File -> Open from the menu
+./IfcViewer
+```
+
+### Controls
+
+| Input | Action |
+|-------|--------|
+| Middle mouse drag | Orbit camera |
+| Shift + middle mouse drag | Pan camera |
+| Scroll wheel | Zoom |
+| Left click | Select object (highlights in viewport and tree) |
+
+### Keyboard shortcuts
+
+| Key | Action |
+|-----|--------|
+| Ctrl+O | Open file |
+| Ctrl+Q | Quit |
+
+## Roadmap
+
+- [ ] Material color support (currently renders default grey per batch)
+- [x] Buffer growth (the VBO/EBO are reallocated at double the capacity when they fill up, capped at 4 GB)
+- [ ] `glMultiDrawElementsIndirect` for fewer draw calls
+- [ ] Vulkan/MoltenVK backend for macOS
+- [ ] Spatial tree (BVH) for frustum culling
+- [ ] LOD: coarse tessellation during streaming, refine in background
+- [ ] Embedded Python scripting console
+- [ ] CJK text input support (Qt6 handles this natively)
diff --git a/src/ifcviewer/SettingsWindow.cpp b/src/ifcviewer/SettingsWindow.cpp
new file mode 100644
index 00000000000..a24f9bc9763
--- /dev/null
+++ b/src/ifcviewer/SettingsWindow.cpp
@@ -0,0 +1,68 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "SettingsWindow.h"
+#include "AppSettings.h"
+
+#include <QDialogButtonBox>
+#include <QFormLayout>
+#include <QLineEdit>
+#include <QShowEvent>
+#include <QVBoxLayout>
+
+// Build the dialog's widgets and give it its title.
+SettingsWindow::SettingsWindow(QWidget *parent)
+    : QDialog(parent)
+{
+    setupUi();
+    setWindowTitle("Settings");
+}
+
+// Lay out the dialog: a form with the "Geometry Library" line edit on top and
+// an Ok/Cancel button box underneath. Ok routes through onAccepted(), Cancel
+// rejects the dialog directly.
+void SettingsWindow::setupUi() {
+    geometry_library_edit_ = new QLineEdit(this);
+    geometry_library_edit_->setMinimumWidth(280);
+
+    auto* buttons = new QDialogButtonBox(
+        QDialogButtonBox::Ok | QDialogButtonBox::Cancel, this);
+    connect(buttons, &QDialogButtonBox::accepted, this, &SettingsWindow::onAccepted);
+    connect(buttons, &QDialogButtonBox::rejected, this, &SettingsWindow::reject);
+
+    auto* fields = new QFormLayout();
+    fields->addRow("Geometry Library", geometry_library_edit_);
+
+    auto* outer = new QVBoxLayout(this);
+    outer->addLayout(fields);
+    outer->addWidget(buttons);
+}
+
+// Reload the widgets from the stored settings on every show, then let
+// QDialog handle the event. Without the reload, text typed before a Cancel
+// would still be in the field the next time the dialog opens.
+void SettingsWindow::showEvent(QShowEvent* event) {
+    syncFromSettings();
+
+    QDialog::showEvent(event);
+}
+
+// Copy the current geometry-library setting into the line edit.
+void SettingsWindow::syncFromSettings() {
+    const auto library = AppSettings::instance().geometryLibrary();
+    geometry_library_edit_->setText(library);
+}
+
+// Ok pressed: store the edited value in AppSettings, then accept the dialog.
+void SettingsWindow::onAccepted() {
+    const QString library = geometry_library_edit_->text();
+    AppSettings::instance().setGeometryLibrary(library);
+    accept();
+}
diff --git a/src/ifcviewer/SettingsWindow.h b/src/ifcviewer/SettingsWindow.h
new file mode 100644
index 00000000000..77affe77578
--- /dev/null
+++ b/src/ifcviewer/SettingsWindow.h
@@ -0,0 +1,46 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef SETTINGSWINDOW_H
+#define SETTINGSWINDOW_H
+
+#include <QDialog>
+
+class QLineEdit;
+class QShowEvent;
+
+// Modal-style settings dialog with a single editable field, the geometry
+// library, backed by AppSettings in the implementation file.
+class SettingsWindow : public QDialog {
+    Q_OBJECT
+public:
+    explicit SettingsWindow(QWidget *parent = nullptr);
+
+protected:
+    // Overridden so the field is refreshed from the stored settings each time
+    // the dialog is shown.
+    void showEvent(QShowEvent* event) override;
+
+private slots:
+    // Connected to the button box's accepted() signal.
+    void onAccepted();
+
+private:
+    void setupUi();
+    void syncFromSettings();
+
+    // Line edit holding the geometry-library value; created in setupUi().
+    QLineEdit* geometry_library_edit_ = nullptr;
+};
+
+#endif
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
new file mode 100644
index 00000000000..99624cb9f54
--- /dev/null
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -0,0 +1,674 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "ViewportWindow.h"
+
+#include <QMouseEvent>
+#include <QWheelEvent>
+#include <QSurfaceFormat>
+#include <QtMath>
+#include <QtOpenGL/QOpenGLVersionFunctionsFactory>
+
+#include <cstring>
+#include <algorithm>
+
+// Initial byte capacities of the shared vertex and index buffers.
+static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;  // 64 MB
+static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;  // 32 MB
+// Cap buffer growth so a runaway upload can't try to allocate the world.
+// NOTE(review): 4 GB does not fit a 32-bit size_t - this assumes a 64-bit build; confirm.
+static const size_t MAX_BUFFER_SIZE = 4ull * 1024 * 1024 * 1024;  // 4 GB
+// Number of float-sized slots per interleaved vertex (8 * 4 = 32 bytes).
+static const int VERTEX_STRIDE = 8;  // pos(3) + normal(3) + object_id(1) + color(1 packed)
+
+// Main-pass vertex shader. Transforms a_position by u_view_projection,
+// forwards normal, position and color unchanged, recovers the object id from
+// the raw bits of the float attribute, and flags vertices whose id equals
+// u_selected_id.
+static const char* MAIN_VERTEX_SHADER = R"(
+#version 450 core
+layout(location = 0) in vec3 a_position;
+layout(location = 1) in vec3 a_normal;
+layout(location = 2) in float a_object_id;
+layout(location = 3) in vec4 a_color;
+
+uniform mat4 u_view_projection;
+uniform uint u_selected_id;
+
+out vec3 v_normal;
+out vec3 v_position;
+out vec4 v_color;
+flat out uint v_object_id;
+flat out uint v_selected;
+
+void main() {
+    gl_Position = u_view_projection * vec4(a_position, 1.0);
+    v_normal = a_normal;
+    v_position = a_position;
+    v_color = a_color;
+    v_object_id = floatBitsToUint(a_object_id);
+    v_selected = (v_object_id == u_selected_id) ? 1u : 0u;
+}
+)";
+
+// Main-pass fragment shader. Shades the vertex color with 0.25 ambient plus
+// 0.75 * max(N.L, 0) diffuse; u_light_dir is used as supplied (it is not
+// normalized here). Selected fragments are blended 50/50 with a blue tint.
+// Alpha is taken from the vertex color.
+static const char* MAIN_FRAGMENT_SHADER = R"(
+#version 450 core
+in vec3 v_normal;
+in vec3 v_position;
+in vec4 v_color;
+flat in uint v_object_id;
+flat in uint v_selected;
+
+uniform vec3 u_light_dir;
+
+out vec4 frag_color;
+
+void main() {
+    vec3 n = normalize(v_normal);
+    float ndotl = max(dot(n, u_light_dir), 0.0);
+    float ambient = 0.25;
+    float diffuse = 0.75 * ndotl;
+    vec3 color = v_color.rgb * (ambient + diffuse);
+
+    if (v_selected == 1u) {
+        color = mix(color, vec3(0.2, 0.6, 1.0), 0.5);
+    }
+
+    frag_color = vec4(color, v_color.a);
+}
+)";
+
+// Pick-pass vertex shader. Same transform as the main pass, but only the
+// decoded object id is forwarded; a_normal is declared and left unused.
+static const char* PICK_VERTEX_SHADER = R"(
+#version 450 core
+layout(location = 0) in vec3 a_position;
+layout(location = 1) in vec3 a_normal;
+layout(location = 2) in float a_object_id;
+
+uniform mat4 u_view_projection;
+
+flat out uint v_object_id;
+
+void main() {
+    gl_Position = u_view_projection * vec4(a_position, 1.0);
+    v_object_id = floatBitsToUint(a_object_id);
+}
+)";
+
+// Pick-pass fragment shader. Writes the object id to an unsigned-integer
+// output so it can be read back from the pick texture.
+static const char* PICK_FRAGMENT_SHADER = R"(
+#version 450 core
+flat in uint v_object_id;
+
+out uint frag_id;
+
+void main() {
+    frag_id = v_object_id;
+}
+)";
+
+// Axis-gizmo vertex shader. Transforms the position by u_mvp and passes the
+// per-vertex RGB color through.
+static const char* AXIS_VERTEX_SHADER = R"(
+#version 450 core
+layout(location = 0) in vec3 a_position;
+layout(location = 1) in vec3 a_color;
+
+uniform mat4 u_mvp;
+
+out vec3 v_color;
+
+void main() {
+    gl_Position = u_mvp * vec4(a_position, 1.0);
+    v_color = a_color;
+}
+)";
+
+// Axis-gizmo fragment shader. Outputs the interpolated color, fully opaque.
+static const char* AXIS_FRAGMENT_SHADER = R"(
+#version 450 core
+in vec3 v_color;
+out vec4 frag_color;
+
+void main() {
+    frag_color = vec4(v_color, 1.0);
+}
+)";
+
+// Compile one shader stage of the given `type` from GLSL `source`.
+// A compile failure is only reported through qWarning; the shader handle is
+// returned either way, so the caller sees the problem at link time.
+static GLuint compileShader(QOpenGLFunctions_4_5_Core* gl, GLenum type, const char* source) {
+    const GLuint handle = gl->glCreateShader(type);
+    gl->glShaderSource(handle, 1, &source, nullptr);
+    gl->glCompileShader(handle);
+
+    GLint compiled = 0;
+    gl->glGetShaderiv(handle, GL_COMPILE_STATUS, &compiled);
+    if (compiled) {
+        return handle;
+    }
+
+    char info_log[1024];
+    gl->glGetShaderInfoLog(handle, sizeof(info_log), nullptr, info_log);
+    qWarning("Shader compile error: %s", info_log);
+    return handle;
+}
+
+// Link `vert` and `frag` into a new program object and return it.
+// Link errors are logged with qWarning but the program handle is still
+// returned. Both shader objects are flagged for deletion before returning,
+// whether or not the link succeeded.
+static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint frag) {
+    const GLuint program = gl->glCreateProgram();
+    gl->glAttachShader(program, vert);
+    gl->glAttachShader(program, frag);
+    gl->glLinkProgram(program);
+
+    GLint linked = 0;
+    gl->glGetProgramiv(program, GL_LINK_STATUS, &linked);
+    const bool failed = (linked == 0);
+    if (failed) {
+        char info_log[1024];
+        gl->glGetProgramInfoLog(program, sizeof(info_log), nullptr, info_log);
+        qWarning("Program link error: %s", info_log);
+    }
+
+    gl->glDeleteShader(vert);
+    gl->glDeleteShader(frag);
+    return program;
+}
+
+// Configure the window as an OpenGL surface requesting a 4.5 core profile
+// with a 24-bit depth buffer, double buffering and 4x multisampling, and wire
+// the render timer (16 ms interval) to render() while the window is exposed.
+// The timer is only configured here, not started.
+ViewportWindow::ViewportWindow(QWindow* parent)
+    : QWindow(parent)
+{
+    setSurfaceType(QWindow::OpenGLSurface);
+
+    QSurfaceFormat surface_format;
+    surface_format.setVersion(4, 5);
+    surface_format.setProfile(QSurfaceFormat::CoreProfile);
+    surface_format.setDepthBufferSize(24);
+    surface_format.setSwapBehavior(QSurfaceFormat::DoubleBuffer);
+    surface_format.setSamples(4);
+    setFormat(surface_format);
+
+    render_timer_.setInterval(16); // ~60 fps
+    connect(&render_timer_, &QTimer::timeout, this, [this]() {
+        if (!isExposed()) return;
+        render();
+    });
+}
+
+// Release every GL object this window created. The context has to be current
+// for the delete calls; nothing is done when no context was ever created.
+ViewportWindow::~ViewportWindow() {
+    if (!context_) return;
+
+    context_->makeCurrent(this);
+    if (gl_) {
+        // Scene geometry.
+        if (vao_) gl_->glDeleteVertexArrays(1, &vao_);
+        if (vbo_) gl_->glDeleteBuffers(1, &vbo_);
+        if (ebo_) gl_->glDeleteBuffers(1, &ebo_);
+        // Axis gizmo.
+        if (axis_vao_) gl_->glDeleteVertexArrays(1, &axis_vao_);
+        if (axis_vbo_) gl_->glDeleteBuffers(1, &axis_vbo_);
+        // Shader programs.
+        if (main_program_) gl_->glDeleteProgram(main_program_);
+        if (pick_program_) gl_->glDeleteProgram(pick_program_);
+        if (axis_program_) gl_->glDeleteProgram(axis_program_);
+        // Picking framebuffer and its attachments.
+        if (pick_fbo_) gl_->glDeleteFramebuffers(1, &pick_fbo_);
+        if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
+        if (pick_depth_rbo_) gl_->glDeleteRenderbuffers(1, &pick_depth_rbo_);
+    }
+    context_->doneCurrent();
+}
+
+// One-time OpenGL setup. Creates the context, resolves the 4.5 core function
+// table, builds the shaders and the axis gizmo, allocates the shared
+// vertex/index buffers and describes the interleaved vertex layout on the
+// VAO. On success it starts the frame clock and render timer and emits
+// initialized(). Calling it again after a successful run is a no-op.
+void ViewportWindow::initGL() {
+    if (gl_initialized_) return;
+
+    context_ = new QOpenGLContext(this);
+    context_->setFormat(requestedFormat());
+    const bool context_ok = context_->create();
+    if (!context_ok) {
+        qFatal("Failed to create OpenGL context");
+        return;
+    }
+    context_->makeCurrent(this);
+
+    gl_ = QOpenGLVersionFunctionsFactory::get<QOpenGLFunctions_4_5_Core>(context_);
+    if (gl_ == nullptr) {
+        qWarning("OpenGL 4.5 not available, falling back");
+        return;
+    }
+
+    buildShaders();
+    buildAxisGizmo();
+
+    gl_->glCreateVertexArrays(1, &vao_);
+
+    // Vertex buffer, allocated at its initial capacity.
+    vbo_capacity_ = INITIAL_VBO_SIZE;
+    gl_->glCreateBuffers(1, &vbo_);
+    gl_->glNamedBufferStorage(vbo_, vbo_capacity_, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
+    // Index buffer, allocated at its initial capacity.
+    ebo_capacity_ = INITIAL_EBO_SIZE;
+    gl_->glCreateBuffers(1, &ebo_);
+    gl_->glNamedBufferStorage(ebo_, ebo_capacity_, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
+    // Interleaved layout, 8 float-sized slots = 32 bytes per vertex:
+    //   slots 0-2 position, 3-5 normal, 6 object id, 7 packed RGBA8 color.
+    gl_->glVertexArrayVertexBuffer(vao_, 0, vbo_, 0, VERTEX_STRIDE * sizeof(float));
+    gl_->glVertexArrayElementBuffer(vao_, ebo_);
+
+    // Enable attribute `index`, describe its format starting at float slot
+    // `first_slot`, and attach it to buffer binding 0.
+    const auto describe = [this](GLuint index, GLint components, GLenum type,
+                                 GLboolean normalized, GLuint first_slot) {
+        gl_->glEnableVertexArrayAttrib(vao_, index);
+        gl_->glVertexArrayAttribFormat(vao_, index, components, type, normalized,
+            first_slot * sizeof(float));
+        gl_->glVertexArrayAttribBinding(vao_, index, 0);
+    };
+    describe(0, 3, GL_FLOAT, GL_FALSE, 0);          // position
+    describe(1, 3, GL_FLOAT, GL_FALSE, 3);          // normal
+    describe(2, 1, GL_FLOAT, GL_FALSE, 6);          // object id, decoded via floatBitsToUint
+    describe(3, 4, GL_UNSIGNED_BYTE, GL_TRUE, 7);   // RGBA8 color, normalized to vec4
+
+    gl_->glEnable(GL_DEPTH_TEST);
+    gl_->glEnable(GL_MULTISAMPLE);
+    gl_->glClearColor(0.18f, 0.20f, 0.22f, 1.0f);
+
+    gl_initialized_ = true;
+    frame_clock_.start();
+    render_timer_.start();
+
+    emit initialized();
+}
+
+// Compile and link the three shader programs (main, pick, axis gizmo).
+void ViewportWindow::buildShaders() {
+    // Compile a vertex/fragment pair and link them into one program.
+    const auto make_program = [this](const char* vertex_src, const char* fragment_src) {
+        const GLuint vertex_stage = compileShader(gl_, GL_VERTEX_SHADER, vertex_src);
+        const GLuint fragment_stage = compileShader(gl_, GL_FRAGMENT_SHADER, fragment_src);
+        return linkProgram(gl_, vertex_stage, fragment_stage);
+    };
+
+    main_program_ = make_program(MAIN_VERTEX_SHADER, MAIN_FRAGMENT_SHADER);
+    pick_program_ = make_program(PICK_VERTEX_SHADER, PICK_FRAGMENT_SHADER);
+    axis_program_ = make_program(AXIS_VERTEX_SHADER, AXIS_FRAGMENT_SHADER);
+}
+
+// Create the static VAO/VBO for the axis gizmo: three unit-length line
+// segments from the origin, each vertex carrying position (3 floats) followed
+// by color (3 floats).
+void ViewportWindow::buildAxisGizmo() {
+    static const float gizmo_vertices[] = {
+        // X axis - red
+        0.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
+        1.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
+        // Y axis - green
+        0.0f, 0.0f, 0.0f,   0.30f, 0.95f, 0.30f,
+        0.0f, 1.0f, 0.0f,   0.30f, 0.95f, 0.30f,
+        // Z axis - blue
+        0.0f, 0.0f, 0.0f,   0.30f, 0.55f, 1.0f,
+        0.0f, 0.0f, 1.0f,   0.30f, 0.55f, 1.0f,
+    };
+
+    gl_->glCreateVertexArrays(1, &axis_vao_);
+    gl_->glCreateBuffers(1, &axis_vbo_);
+    // Flags = 0: the contents are fixed at creation and never updated.
+    gl_->glNamedBufferStorage(axis_vbo_, sizeof(gizmo_vertices), gizmo_vertices, 0);
+
+    gl_->glVertexArrayVertexBuffer(axis_vao_, 0, axis_vbo_, 0, 6 * sizeof(float));
+
+    // Attribute 0 = position at float 0, attribute 1 = color at float 3.
+    for (GLuint attrib = 0; attrib < 2; ++attrib) {
+        gl_->glEnableVertexArrayAttrib(axis_vao_, attrib);
+        gl_->glVertexArrayAttribFormat(axis_vao_, attrib, 3, GL_FLOAT, GL_FALSE,
+            attrib * 3 * sizeof(float));
+        gl_->glVertexArrayAttribBinding(axis_vao_, attrib, 0);
+    }
+}
+
+// Reallocate the vertex buffer so it can hold at least `needed_total` bytes.
+//
+// The capacity is doubled until the request fits, the bytes already in use
+// are copied into the new buffer, the old buffer is deleted and the VAO is
+// re-pointed at the new one.
+//
+// Returns true on success. Returns false, leaving the current buffer
+// untouched, when `needed_total` exceeds MAX_BUFFER_SIZE.
+bool ViewportWindow::growVbo(size_t needed_total) {
+    // Check the cap before doubling: chasing an oversized target could
+    // otherwise wrap size_t and never terminate.
+    if (needed_total > MAX_BUFFER_SIZE) {
+        qWarning("VBO grow request (%zu MB) exceeds cap (%zu MB)",
+            needed_total / (1024 * 1024), MAX_BUFFER_SIZE / (1024 * 1024));
+        return false;
+    }
+
+    // Doubling zero never grows, so fall back to the initial size in that case.
+    size_t new_capacity = vbo_capacity_ > 0 ? vbo_capacity_ : INITIAL_VBO_SIZE;
+    while (new_capacity < needed_total) {
+        new_capacity *= 2;
+    }
+    // The request itself fits under the cap, so an overshoot can be clamped.
+    new_capacity = std::min(new_capacity, MAX_BUFFER_SIZE);
+
+    GLuint new_vbo = 0;
+    gl_->glCreateBuffers(1, &new_vbo);
+    gl_->glNamedBufferStorage(new_vbo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
+    if (vbo_used_ > 0) {
+        gl_->glCopyNamedBufferSubData(vbo_, new_vbo, 0, 0, vbo_used_);
+    }
+
+    gl_->glDeleteBuffers(1, &vbo_);
+    vbo_ = new_vbo;
+    vbo_capacity_ = new_capacity;
+
+    // Rebind on the VAO so subsequent draws see the new buffer.
+    gl_->glVertexArrayVertexBuffer(vao_, 0, vbo_, 0, VERTEX_STRIDE * sizeof(float));
+
+    qInfo("VBO grew to %zu MB", vbo_capacity_ / (1024 * 1024));
+    return true;
+}
+
+// Reallocate the index buffer so it can hold at least `needed_total` bytes.
+//
+// The capacity is doubled until the request fits, the bytes already in use
+// are copied into the new buffer, the old buffer is deleted and the VAO's
+// element buffer is re-pointed at the new one.
+//
+// Returns true on success. Returns false, leaving the current buffer
+// untouched, when `needed_total` exceeds MAX_BUFFER_SIZE.
+bool ViewportWindow::growEbo(size_t needed_total) {
+    // Check the cap before doubling: chasing an oversized target could
+    // otherwise wrap size_t and never terminate.
+    if (needed_total > MAX_BUFFER_SIZE) {
+        qWarning("EBO grow request (%zu MB) exceeds cap (%zu MB)",
+            needed_total / (1024 * 1024), MAX_BUFFER_SIZE / (1024 * 1024));
+        return false;
+    }
+
+    // Doubling zero never grows, so fall back to the initial size in that case.
+    size_t new_capacity = ebo_capacity_ > 0 ? ebo_capacity_ : INITIAL_EBO_SIZE;
+    while (new_capacity < needed_total) {
+        new_capacity *= 2;
+    }
+    // The request itself fits under the cap, so an overshoot can be clamped.
+    new_capacity = std::min(new_capacity, MAX_BUFFER_SIZE);
+
+    GLuint new_ebo = 0;
+    gl_->glCreateBuffers(1, &new_ebo);
+    gl_->glNamedBufferStorage(new_ebo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
+    if (ebo_used_ > 0) {
+        gl_->glCopyNamedBufferSubData(ebo_, new_ebo, 0, 0, ebo_used_);
+    }
+
+    gl_->glDeleteBuffers(1, &ebo_);
+    ebo_ = new_ebo;
+    ebo_capacity_ = new_capacity;
+
+    // Re-attach on the VAO so subsequent draws see the new buffer.
+    gl_->glVertexArrayElementBuffer(vao_, ebo_);
+
+    qInfo("EBO grew to %zu MB", ebo_capacity_ / (1024 * 1024));
+    return true;
+}
+
+// Append one chunk of geometry to the shared vertex/index buffers.
+//
+// `chunk.vertices` holds interleaved floats (VERTEX_STRIDE per vertex) and
+// `chunk.indices` holds chunk-local triangle indices. The buffers are grown
+// on demand; if either is already at its cap the chunk is dropped with a
+// warning. Nothing happens before GL is initialized or for an empty chunk.
+void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
+    if (!gl_initialized_) return;
+    if (chunk.vertices.empty() || chunk.indices.empty()) return;
+
+    // A partial vertex or triangle would put vbo_used_ and vertex_count_ out
+    // of step and corrupt the index remapping of every later chunk, so such a
+    // chunk is rejected as a whole.
+    if (chunk.vertices.size() % VERTEX_STRIDE != 0 || chunk.indices.size() % 3 != 0) {
+        qWarning("Malformed chunk (%zu floats, %zu indices), skipping",
+            chunk.vertices.size(), chunk.indices.size());
+        return;
+    }
+
+    context_->makeCurrent(this);
+
+    const size_t vb_size = chunk.vertices.size() * sizeof(float);
+    const size_t ib_size = chunk.indices.size() * sizeof(uint32_t);
+
+    if (vbo_used_ + vb_size > vbo_capacity_) {
+        if (!growVbo(vbo_used_ + vb_size)) {
+            qWarning("VBO at cap, skipping chunk");
+            return;
+        }
+    }
+    if (ebo_used_ + ib_size > ebo_capacity_) {
+        if (!growEbo(ebo_used_ + ib_size)) {
+            qWarning("EBO at cap, skipping chunk");
+            return;
+        }
+    }
+
+    // Index of this chunk's first vertex within the shared vertex buffer.
+    const uint32_t base_vertex = vertex_count_;
+
+    gl_->glNamedBufferSubData(vbo_, vbo_used_, vb_size, chunk.vertices.data());
+
+    // Remap chunk-local indices into global indices so the whole EBO can be
+    // drawn with a single glDrawElements call.
+    std::vector<uint32_t> global_indices(chunk.indices.size());
+    for (size_t i = 0; i < chunk.indices.size(); ++i) {
+        global_indices[i] = chunk.indices[i] + base_vertex;
+    }
+    gl_->glNamedBufferSubData(ebo_, ebo_used_, ib_size, global_indices.data());
+
+    {
+        // The draw calls read total_index_count_ under the same mutex.
+        std::lock_guard<std::mutex> lock(upload_mutex_);
+        total_index_count_ += static_cast<uint32_t>(chunk.indices.size());
+    }
+
+    vbo_used_ += vb_size;
+    ebo_used_ += ib_size;
+    vertex_count_ += static_cast<uint32_t>(chunk.vertices.size() / VERTEX_STRIDE);
+    total_triangles_ += static_cast<uint32_t>(chunk.indices.size() / 3);
+}
+
+// Forget all uploaded geometry and clear the selection. Only the bookkeeping
+// counters are rewound, under the upload mutex; no GL call is made here, so
+// the buffers keep their current capacity and are simply overwritten by later
+// uploads. Does nothing before GL is initialized.
+void ViewportWindow::resetScene() {
+    if (!gl_initialized_) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(upload_mutex_);
+
+    // Write cursors into the vertex/index buffers.
+    vbo_used_ = 0;
+    ebo_used_ = 0;
+
+    // Scene statistics and draw count.
+    vertex_count_ = 0;
+    total_index_count_ = 0;
+    total_triangles_ = 0;
+
+    // 0 means "nothing selected".
+    selected_object_id_ = 0;
+}
+
+// Remember `id` as the selected object; render() passes it to the main
+// shader as u_selected_id so matching geometry is tinted.
+void ViewportWindow::setSelectedObjectId(uint32_t id) {
+    selected_object_id_ = id;
+}
+
+// Return the object id under the window-space point (x, y), where y grows
+// downwards from the top edge.
+//
+// The scene is rendered into an R32UI pick framebuffer, (re)created whenever
+// the window's pixel size changes, and a single texel is read back.
+// Returns 0, the value the pick buffer is cleared to, when GL is not
+// initialized, the point lies outside the window, or the surface has no
+// pixels.
+uint32_t ViewportWindow::pickObjectAt(int x, int y) {
+    if (!gl_initialized_) return 0;
+
+    // A point outside the window has no texel to read; asking GL for one
+    // would only raise GL_INVALID_VALUE.
+    if (x < 0 || y < 0 || x >= width() || y >= height()) return 0;
+
+    const qreal dpr = devicePixelRatio();
+    const int w = width() * dpr;
+    const int h = height() * dpr;
+    if (w <= 0 || h <= 0) return 0;
+
+    context_->makeCurrent(this);
+
+    // Create/resize pick FBO if needed
+    if (pick_width_ != w || pick_height_ != h) {
+        if (pick_fbo_) gl_->glDeleteFramebuffers(1, &pick_fbo_);
+        if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
+        if (pick_depth_rbo_) gl_->glDeleteRenderbuffers(1, &pick_depth_rbo_);
+
+        gl_->glCreateFramebuffers(1, &pick_fbo_);
+
+        gl_->glCreateTextures(GL_TEXTURE_2D, 1, &pick_color_tex_);
+        gl_->glTextureStorage2D(pick_color_tex_, 1, GL_R32UI, w, h);
+        gl_->glNamedFramebufferTexture(pick_fbo_, GL_COLOR_ATTACHMENT0, pick_color_tex_, 0);
+
+        gl_->glCreateRenderbuffers(1, &pick_depth_rbo_);
+        gl_->glNamedRenderbufferStorage(pick_depth_rbo_, GL_DEPTH_COMPONENT24, w, h);
+        gl_->glNamedFramebufferRenderbuffer(pick_fbo_, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, pick_depth_rbo_);
+
+        pick_width_ = w;
+        pick_height_ = h;
+    }
+
+    renderPickPass();
+
+    // Window rows count down from the top, texture rows count up from the
+    // bottom: window row 0 is texture row h - 1. Clamp so rounding at
+    // fractional scale factors can never step outside the texture.
+    const int px = std::min(static_cast<int>(x * dpr), w - 1);
+    const int py = std::max(0, h - 1 - static_cast<int>(y * dpr));
+    uint32_t pixel = 0;
+    gl_->glGetTextureSubImage(pick_color_tex_, 0, px, py, 0, 1, 1, 1, GL_RED_INTEGER, GL_UNSIGNED_INT, sizeof(pixel), &pixel);
+
+    return pixel;
+}
+
+// Recompute view_matrix_ and proj_matrix_ from the orbit parameters
+// (camera_yaw_ and camera_pitch_ in degrees, camera_distance_, camera_target_).
+// The projection is a 45-degree perspective with the near plane at 0.1 and
+// the far plane at ten times the orbit distance.
+void ViewportWindow::updateCamera() {
+    float yaw_rad = qDegreesToRadians(camera_yaw_);
+    float pitch_rad = qDegreesToRadians(camera_pitch_);
+
+    // IFC / Blender convention: X right, Y forward, Z up.
+    QVector3D eye;
+    eye.setX(camera_target_.x() + camera_distance_ * cosf(pitch_rad) * cosf(yaw_rad));
+    eye.setY(camera_target_.y() + camera_distance_ * cosf(pitch_rad) * sinf(yaw_rad));
+    eye.setZ(camera_target_.z() + camera_distance_ * sinf(pitch_rad));
+
+    view_matrix_.setToIdentity();
+    view_matrix_.lookAt(eye, camera_target_, QVector3D(0, 0, 1));
+
+    proj_matrix_.setToIdentity();
+    // Guard the divisor: a window collapsed to zero height would otherwise
+    // yield an infinite aspect ratio. Degenerate sizes use a square aspect.
+    const bool has_area = width() > 0 && height() > 0;
+    float aspect = has_area ? float(width()) / float(height()) : 1.0f;
+    proj_matrix_.perspective(45.0f, aspect, 0.1f, camera_distance_ * 10.0f);
+}
+
+// Draw one frame: refresh the camera matrices, clear the framebuffer, draw
+// all uploaded geometry with the main program in a single indexed call,
+// overlay the axis gizmo and swap buffers. Skipped while GL is uninitialized
+// or the window is not exposed.
+void ViewportWindow::render() {
+    const bool drawable = gl_initialized_ && isExposed();
+    if (!drawable) return;
+
+    context_->makeCurrent(this);
+    updateCamera();
+
+    // Viewport in device pixels.
+    const int fb_width = width() * devicePixelRatio();
+    const int fb_height = height() * devicePixelRatio();
+    gl_->glViewport(0, 0, fb_width, fb_height);
+    gl_->glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+    const QMatrix4x4 view_projection = proj_matrix_ * view_matrix_;
+
+    gl_->glUseProgram(main_program_);
+    const GLint vp_loc = gl_->glGetUniformLocation(main_program_, "u_view_projection");
+    const GLint light_loc = gl_->glGetUniformLocation(main_program_, "u_light_dir");
+    const GLint selected_loc = gl_->glGetUniformLocation(main_program_, "u_selected_id");
+    gl_->glUniformMatrix4fv(vp_loc, 1, GL_FALSE, view_projection.constData());
+    gl_->glUniform3f(light_loc, 0.3f, 0.5f, 0.8f);
+    gl_->glUniform1ui(selected_loc, selected_object_id_);
+
+    gl_->glBindVertexArray(vao_);
+
+    {
+        // total_index_count_ is updated under this mutex by uploadChunk().
+        std::lock_guard<std::mutex> guard(upload_mutex_);
+        const bool has_geometry = total_index_count_ > 0;
+        if (has_geometry) {
+            gl_->glDrawElements(GL_TRIANGLES, total_index_count_, GL_UNSIGNED_INT, nullptr);
+        }
+    }
+
+    renderAxisGizmo();
+
+    context_->swapBuffers(this);
+}
+
+void ViewportWindow::renderAxisGizmo() {
+    if (!axis_program_ || !axis_vao_) return;
+    // Draw the axis lines into a small square viewport; sizes are logical pixels scaled by the device pixel ratio.
+    const qreal dpr = devicePixelRatio();  // keep fractional ratios (e.g. 1.5) instead of truncating to int
+    const int gizmo_size = static_cast<int>(110 * dpr);
+    const int margin = static_cast<int>(10 * dpr);
+
+    gl_->glViewport(margin, margin, gizmo_size, gizmo_size);
+    gl_->glDisable(GL_DEPTH_TEST);
+
+    // Build a view matrix from the same camera orientation but with a fixed
+    // close-up distance, so the gizmo rotates with the scene camera. Z-up.
+    float yaw_rad = qDegreesToRadians(camera_yaw_);
+    float pitch_rad = qDegreesToRadians(camera_pitch_);
+
+    QVector3D eye_dir;
+    eye_dir.setX(cosf(pitch_rad) * cosf(yaw_rad));
+    eye_dir.setY(cosf(pitch_rad) * sinf(yaw_rad));
+    eye_dir.setZ(sinf(pitch_rad));
+
+    QMatrix4x4 gizmo_view;
+    gizmo_view.lookAt(eye_dir * 3.0f, QVector3D(0, 0, 0), QVector3D(0, 0, 1));
+
+    QMatrix4x4 gizmo_proj;
+    gizmo_proj.ortho(-1.4f, 1.4f, -1.4f, 1.4f, 0.1f, 10.0f);
+
+    QMatrix4x4 mvp = gizmo_proj * gizmo_view;
+
+    gl_->glUseProgram(axis_program_);
+    gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(axis_program_, "u_mvp"), 1, GL_FALSE, mvp.constData());
+
+    gl_->glLineWidth(2.5f);  // ignored on some core-profile drivers, that's OK
+    gl_->glBindVertexArray(axis_vao_);
+    gl_->glDrawArrays(GL_LINES, 0, 6);
+
+    gl_->glEnable(GL_DEPTH_TEST);  // restore the state disabled above
+}
+
+void ViewportWindow::renderPickPass() {
+    gl_->glBindFramebuffer(GL_FRAMEBUFFER, pick_fbo_);
+    gl_->glViewport(0, 0, pick_width_, pick_height_);
+
+    GLuint clear_val = 0;
+    gl_->glClearBufferuiv(GL_COLOR, 0, &clear_val);
+    gl_->glClear(GL_DEPTH_BUFFER_BIT);
+
+    QMatrix4x4 vp = proj_matrix_ * view_matrix_;
+    gl_->glUseProgram(pick_program_);
+    gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(pick_program_, "u_view_projection"), 1, GL_FALSE, vp.constData());
+
+    gl_->glBindVertexArray(vao_);
+
+    {
+        std::lock_guard<std::mutex> lock(upload_mutex_);
+        if (total_index_count_ > 0) {
+            gl_->glDrawElements(GL_TRIANGLES, total_index_count_, GL_UNSIGNED_INT, nullptr);
+        }
+    }
+
+    gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
+}
+
+void ViewportWindow::exposeEvent(QExposeEvent*) {
+    if (isExposed() && !gl_initialized_) {
+        initGL();
+    }
+}
+
+void ViewportWindow::resizeEvent(QResizeEvent*) {
+    if (gl_initialized_) render();
+}
+
+bool ViewportWindow::event(QEvent* e) {
+    switch (e->type()) {
+    case QEvent::MouseButtonPress:
+        handleMousePress(static_cast<QMouseEvent*>(e));
+        return true;
+    case QEvent::MouseButtonRelease:
+        handleMouseRelease(static_cast<QMouseEvent*>(e));
+        return true;
+    case QEvent::MouseMove:
+        handleMouseMove(static_cast<QMouseEvent*>(e));
+        return true;
+    case QEvent::Wheel:
+        handleWheel(static_cast<QWheelEvent*>(e));
+        return true;
+    default:
+        return QWindow::event(e);
+    }
+}
+
+void ViewportWindow::handleMousePress(QMouseEvent* e) {
+    active_button_ = e->button();
+    last_mouse_pos_ = e->pos();
+}
+
+void ViewportWindow::handleMouseRelease(QMouseEvent* e) {
+    if (active_button_ == Qt::LeftButton && (e->pos() - last_mouse_pos_).manhattanLength() < 5) {
+        uint32_t id = pickObjectAt(e->pos().x(), e->pos().y());
+        selected_object_id_ = id;
+        emit objectPicked(id);
+    }
+    active_button_ = Qt::NoButton;
+}
+
+void ViewportWindow::handleMouseMove(QMouseEvent* e) {
+    QPoint delta = e->pos() - last_mouse_pos_;
+    last_mouse_pos_ = e->pos();
+
+    if (active_button_ == Qt::MiddleButton) {
+        if (e->modifiers() & Qt::ShiftModifier) {
+            // Pan in screen space, derived from the Z-up camera basis.
+            float pan_speed = camera_distance_ * 0.002f;
+            float yaw_rad = qDegreesToRadians(camera_yaw_);
+            float pitch_rad = qDegreesToRadians(camera_pitch_);
+            QVector3D right(-sinf(yaw_rad), cosf(yaw_rad), 0.0f);
+            QVector3D up(
+                -sinf(pitch_rad) * cosf(yaw_rad),
+                -sinf(pitch_rad) * sinf(yaw_rad),
+                 cosf(pitch_rad));
+            camera_target_ -= right * delta.x() * pan_speed;
+            camera_target_ += up * delta.y() * pan_speed;
+        } else {
+            // Orbit
+            camera_yaw_ -= delta.x() * 0.3f;
+            camera_pitch_ += delta.y() * 0.3f;
+            camera_pitch_ = qBound(-89.0f, camera_pitch_, 89.0f);
+        }
+    }
+}
+
+void ViewportWindow::handleWheel(QWheelEvent* e) {
+    const int dy = e->angleDelta().y();  // vertical wheel rotation; 0 when the event is horizontal-only
+    if (dy != 0) camera_distance_ *= dy > 0 ? 0.9f : 1.1f;  // positive delta moves the eye closer to the target
+    camera_distance_ = qMax(0.1f, camera_distance_);  // keep a minimum orbit distance
+}
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
new file mode 100644
index 00000000000..cb718050c8f
--- /dev/null
+++ b/src/ifcviewer/ViewportWindow.h
@@ -0,0 +1,146 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef VIEWPORTWINDOW_H
+#define VIEWPORTWINDOW_H
+
+#include <QWindow>
+#include <QOpenGLContext>
+#include <QtOpenGL/QOpenGLFunctions_4_5_Core>
+#include <QTimer>
+#include <QElapsedTimer>
+#include <QMatrix4x4>
+#include <QVector3D>
+
+#include <vector>
+#include <cstdint>
+#include <mutex>
+
+struct MaterialInfo {
+    float r = 0.75f, g = 0.75f, b = 0.78f, a = 1.0f;
+};
+
+struct UploadChunk {
+    // Interleaved per-vertex layout (8 floats / 32 bytes per vertex):
+    //   pos(3 float) + normal(3 float) + object_id(1 float bitcast from uint)
+    //   + color(1 float holding RGBA8 packed bytes, read on the GPU as
+    //   GL_UNSIGNED_BYTE * 4 normalized).
+    std::vector<float> vertices;
+    std::vector<uint32_t> indices; // local to this chunk's vertices
+    uint32_t object_id = 0;
+};
+
+class ViewportWindow : public QWindow {
+    Q_OBJECT
+public:
+    explicit ViewportWindow(QWindow* parent = nullptr);
+    ~ViewportWindow();
+
+    void uploadChunk(const UploadChunk& chunk);
+    void resetScene();
+
+    void setSelectedObjectId(uint32_t id);
+    uint32_t pickObjectAt(int x, int y);
+
+signals:
+    void objectPicked(uint32_t object_id);
+    void initialized();
+
+protected:
+    void exposeEvent(QExposeEvent* event) override;
+    void resizeEvent(QResizeEvent* event) override;
+    bool event(QEvent* event) override;
+
+private:
+    void initGL();
+    void render();
+    void renderPickPass();
+    void renderAxisGizmo();
+    void updateCamera();
+    void buildShaders();
+    void buildAxisGizmo();
+    bool growVbo(size_t needed_total);
+    bool growEbo(size_t needed_total);
+
+    // Mouse interaction
+    void handleMousePress(QMouseEvent* event);
+    void handleMouseRelease(QMouseEvent* event);
+    void handleMouseMove(QMouseEvent* event);
+    void handleWheel(QWheelEvent* event);
+
+    QOpenGLContext* context_ = nullptr;
+    QOpenGLFunctions_4_5_Core* gl_ = nullptr;
+    QTimer render_timer_;
+    QElapsedTimer frame_clock_;
+    bool gl_initialized_ = false;
+
+    // Shaders
+    GLuint main_program_ = 0;
+    GLuint pick_program_ = 0;
+    GLuint axis_program_ = 0;
+
+    // Axis gizmo (separate VAO/VBO since vertex layout differs from scene)
+    GLuint axis_vao_ = 0;
+    GLuint axis_vbo_ = 0;
+
+    // Geometry buffers - one big buffer pair
+    GLuint vao_ = 0;
+    GLuint vbo_ = 0;
+    GLuint ebo_ = 0;
+    size_t vbo_capacity_ = 0;
+    size_t ebo_capacity_ = 0;
+    size_t vbo_used_ = 0;  // in bytes
+    size_t ebo_used_ = 0;  // in bytes
+    uint32_t vertex_count_ = 0;
+
+    // Pick framebuffer
+    GLuint pick_fbo_ = 0;
+    GLuint pick_color_tex_ = 0;
+    GLuint pick_depth_rbo_ = 0;
+    int pick_width_ = 0;
+    int pick_height_ = 0;
+
+    // The entire scene is a single mega-batch: per-vertex color removes the
+    // need to switch materials between draw calls. Indices are written into
+    // the EBO already offset by base_vertex so one glDrawElements covers all.
+    uint32_t total_index_count_ = 0;
+    std::mutex upload_mutex_;
+
+    // Camera
+    QVector3D camera_target_{0, 0, 0};
+    float camera_distance_ = 50.0f;
+    float camera_yaw_ = 45.0f;
+    float camera_pitch_ = 30.0f;
+    QMatrix4x4 view_matrix_;
+    QMatrix4x4 proj_matrix_;
+
+    // Mouse state
+    Qt::MouseButton active_button_ = Qt::NoButton;
+    QPoint last_mouse_pos_;
+
+    // Selection
+    uint32_t selected_object_id_ = 0;
+    bool pick_requested_ = false;
+    int pick_x_ = 0, pick_y_ = 0;
+
+    // Stats
+    uint32_t total_triangles_ = 0;
+};
+
+#endif // VIEWPORTWINDOW_H
diff --git a/src/ifcviewer/main.cpp b/src/ifcviewer/main.cpp
new file mode 100644
index 00000000000..3bca693a371
--- /dev/null
+++ b/src/ifcviewer/main.cpp
@@ -0,0 +1,55 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include <QApplication>
+#include <QSurfaceFormat>
+#include <QCommandLineParser>
+
+#include "MainWindow.h"
+
+int main(int argc, char* argv[]) {
+    QApplication app(argc, argv);
+    app.setApplicationName("IfcViewer");
+    app.setOrganizationName("IfcOpenShell");
+
+    // Request OpenGL 4.5 Core globally
+    QSurfaceFormat fmt;
+    fmt.setVersion(4, 5);
+    fmt.setProfile(QSurfaceFormat::CoreProfile);
+    fmt.setDepthBufferSize(24);
+    fmt.setSwapBehavior(QSurfaceFormat::DoubleBuffer);
+    fmt.setSamples(4);
+    QSurfaceFormat::setDefaultFormat(fmt);
+
+    QCommandLineParser parser;
+    parser.setApplicationDescription("IfcOpenShell IFC Viewer");
+    parser.addHelpOption();
+    parser.addPositionalArgument("file", "IFC file to open");
+    parser.process(app);
+
+    MainWindow window;
+    window.show();
+
+    auto args = parser.positionalArguments();
+    if (!args.isEmpty()) {
+        window.openFile(args.first());
+    }
+
+    return app.exec();
+}

From 3c2a5035ac7430fa7be5d7cf8ef9b02828897634 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 19:22:56 +1000
Subject: [PATCH 04/37] More hacks to compile

---
 src/ifcgeom/mapping/IfcOffsetCurveByDistance.cpp | 2 +-
 src/ifcparse/hierarchy_helper.cpp                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ifcgeom/mapping/IfcOffsetCurveByDistance.cpp b/src/ifcgeom/mapping/IfcOffsetCurveByDistance.cpp
index 1a0dc34e388..666f4b61189 100644
--- a/src/ifcgeom/mapping/IfcOffsetCurveByDistance.cpp
+++ b/src/ifcgeom/mapping/IfcOffsetCurveByDistance.cpp
@@ -158,7 +158,7 @@ taxonomy::ptr mapping::map_impl(const IfcSchema::IfcOffsetCurveByDistances& inst
 
 	 // at this point, next == end and prev == end-1
     #if defined SCHEMA_HAS_IfcDistanceExpression
-    double last_distance = (*prev)->DistanceAlong() * length_unit_;
+    double last_distance = (*prev).DistanceAlong() * length_unit_;
     #else
     double last_distance = (double) prev->DistanceAlong().as<IfcSchema::IfcLengthMeasure>() * length_unit_;
     #endif
diff --git a/src/ifcparse/hierarchy_helper.cpp b/src/ifcparse/hierarchy_helper.cpp
index 9d7db7a2d01..1f98c0a9e6d 100644
--- a/src/ifcparse/hierarchy_helper.cpp
+++ b/src/ifcparse/hierarchy_helper.cpp
@@ -625,7 +625,7 @@ Ifc4x3::IfcStyledItem create_styled_item(ifcopenshell::file* file, const Ifc4x3:
 
 #ifdef HAS_SCHEMA_4x3_tc1
 Ifc4x3_tc1::IfcStyledItem create_styled_item(ifcopenshell::file* file, const Ifc4x3_tc1::IfcRepresentationItem& item, const Ifc4x3_tc1::IfcPresentationStyle& style) {
-    auto sitem = file.crefile->createate<Ifc4x3_tc1::IfcStyledItem>();
+    auto sitem = file->create<Ifc4x3_tc1::IfcStyledItem>();
     sitem.setItem(item);
     sitem.setStyles(std::vector<Ifc4x3_tc1::IfcPresentationStyle>{style});
     return sitem;

From fe498ed332e0ad4119d0a6e004785e8991c7c01a Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 19:23:10 +1000
Subject: [PATCH 05/37] Update ifcviewer to compile with datamodel refactor

---
 src/ifcviewer/GeometryStreamer.cpp | 4 ++--
 src/ifcviewer/GeometryStreamer.h   | 6 +++---
 src/ifcviewer/MainWindow.cpp       | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
index 39698c84e62..437209c9098 100644
--- a/src/ifcviewer/GeometryStreamer.cpp
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -94,7 +94,7 @@ std::vector<ElementInfo> GeometryStreamer::drainElements() {
 
 void GeometryStreamer::run(const std::string& path, int num_threads) {
     try {
-        ifc_file_ = std::make_unique<IfcParse::IfcFile>(path);
+        ifc_file_ = std::make_unique<ifcopenshell::file>(path);
     } catch (const std::exception& e) {
         emit errorOccurred(QString("Failed to parse IFC file: %1").arg(e.what()));
         return;
@@ -112,7 +112,7 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
         auto kernel = ifcopenshell::geometry::kernels::construct(
             ifc_file_.get(), geometry_library, settings);
         iterator = std::make_unique<IfcGeom::Iterator>(
-            std::move(kernel), settings, ifc_file_.get(), std::vector<IfcGeom::filter_t>(), num_threads);
+            std::move(kernel), settings, ifc_file_.get(), std::vector<ifcopenshell::geometry::filter_t>(), num_threads);
     } catch (const std::exception& e) {
         emit errorOccurred(QString("Failed to create geometry iterator: %1").arg(e.what()));
         return;
diff --git a/src/ifcviewer/GeometryStreamer.h b/src/ifcviewer/GeometryStreamer.h
index 06b6364a244..abd087463cc 100644
--- a/src/ifcviewer/GeometryStreamer.h
+++ b/src/ifcviewer/GeometryStreamer.h
@@ -31,7 +31,7 @@
 #include <mutex>
 #include <deque>
 
-#include "../ifcparse/IfcFile.h"
+#include "../ifcparse/file.h"
 #include "../ifcgeom/Iterator.h"
 
 #include "ViewportWindow.h"
@@ -57,7 +57,7 @@ class GeometryStreamer : public QObject {
     bool isRunning() const { return running_.load(); }
     int progress() const { return progress_.load(); }
 
-    IfcParse::IfcFile* ifcFile() const { return ifc_file_.get(); }
+    ifcopenshell::file* ifcFile() const { return ifc_file_.get(); }
 
     // Thread-safe access to discovered elements
     std::vector<ElementInfo> drainElements();
@@ -73,7 +73,7 @@ class GeometryStreamer : public QObject {
 
     UploadChunk convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id);
 
-    std::unique_ptr<IfcParse::IfcFile> ifc_file_;
+    std::unique_ptr<ifcopenshell::file> ifc_file_;
     std::unique_ptr<QThread> worker_thread_;
     std::atomic<bool> running_{false};
     std::atomic<bool> cancel_requested_{false};
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 1f32ce0877c..6eede353532 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -244,23 +244,23 @@ void MainWindow::populateProperties(uint32_t object_id) {
     auto* file = streamer_->ifcFile();
     if (!file) return;
 
-    auto* product = file->instance_by_id(info.ifc_id);
+    auto product = file->instance_by_id(info.ifc_id);
     if (!product) return;
 
     // Show all direct attributes
-    auto& decl = product->declaration();
+    auto& decl = product.declaration();
     if (auto* entity = decl.as_entity()) {
         for (size_t i = 0; i < entity->attribute_count(); ++i) {
             auto* attr = entity->attribute_by_index(i);
             try {
-                auto val = product->get_attribute_value(i);
+                auto val = product.get_attribute_value(i);
                 if (!val.isNull()) {
                     std::string str_val;
                     try {
                         str_val = static_cast<std::string>(val);
                     } catch (...) {
                         // Not a string-convertible attribute (entity ref, aggregate, etc.)
-                        str_val = "<" + std::string(IfcUtil::ArgumentTypeToString(val.type())) + ">";
+                        str_val = "<" + std::string(ifcopenshell::argument_type_to_string(val.type())) + ">";
                     }
                     addRow(QString::fromStdString(attr->name()), QString::fromStdString(str_val));
                 }

From 750cc89964cf3829b3ee31184d3a3f05c6f33338 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 19:50:44 +1000
Subject: [PATCH 06/37] Plan out performance strategy

---
 src/ifcviewer/README.md | 335 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 329 insertions(+), 6 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index b9194cefd1a..9c6c52560ce 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -117,13 +117,336 @@ make -j$(nproc)
 | Ctrl+O | Open file |
 | Ctrl+Q | Quit |
 
+## Performance Strategy
+
+The viewer targets smooth orbiting at 60 fps on models up to 1 million IFC objects.
+Rendering performance is addressed in three phases. Each phase builds on the
+previous one, and the system is designed so that smaller models never pay for
+optimizations they don't need.
+
+### Phase 1: Per-Object Frustum Culling (CPU)
+
+**Status:** Implemented.
+
+The simplest win: don't draw what's off screen.
+
+#### Data model
+
+During `uploadChunk()`, the viewport records a small metadata struct for every
+object that enters the GPU buffers:
+
+```cpp
+struct ObjectDrawInfo {
+    uint32_t index_offset;   // byte offset into the shared EBO
+    uint32_t index_count;    // number of indices (triangles * 3)
+    float    aabb_min[3];    // world-space axis-aligned bounding box
+    float    aabb_max[3];    // (computed from vertex positions at upload time)
+};
+```
+
+This costs 32 bytes per object. For 1M objects that's ~32 MB of CPU-side
+metadata — negligible next to the vertex data.
+
+#### Frustum extraction
+
+Each frame, before drawing, six clip planes are extracted from the
+view-projection matrix (`VP = proj * view`). The standard Gribb-Hartmann
+method pulls them directly from the matrix rows:
+
+```
+left   = VP[3] + VP[0]
+right  = VP[3] - VP[0]
+bottom = VP[3] + VP[1]
+top    = VP[3] - VP[1]
+near   = VP[3] + VP[2]
+far    = VP[3] - VP[2]
+```
+
+Each plane is stored as (a, b, c, d) and normalized so that
+`a*x + b*y + c*z + d` gives the signed distance from the plane.
+
+#### AABB-frustum test
+
+For each object, the AABB is tested against all six planes using the
+"p-vertex / n-vertex" method:
+
+- For each plane, find the AABB corner most in the direction of the plane
+  normal (the p-vertex).
+- If the p-vertex is on the negative side of the plane, the entire AABB is
+  outside the frustum → cull.
+- If any plane culls the object, skip it.
+
+This test is conservative: it never culls a visible object, but may
+occasionally keep an invisible one (when the AABB straddles a frustum corner).
+That's fine — false positives just cost a few extra triangles.
+
+#### Drawing visible objects
+
+The surviving objects' `(index_count, index_offset)` pairs are passed to
+`glMultiDrawElements()` in a single call. This replaces the previous single
+`glDrawElements()` that drew everything. The GPU processes only the index
+ranges that survived the frustum test.
+
+Likewise, for the pick pass (which runs less frequently), the same
+visibility list is reused — objects culled from the main pass are also culled
+from picking.
+
+#### Performance characteristics
+
+| Metric | Value |
+|--------|-------|
+| Per-object cost | ~6 dot products + 6 comparisons per frame |
+| 50k objects | ~0.3 ms on a modern CPU core |
+| 500k objects | ~3 ms (starts to matter at 60 fps) |
+| 1M objects | ~6 ms (too expensive — need phase 3) |
+| Memory overhead | 32 bytes/object |
+| Load-time overhead | Near zero (AABB computed during existing upload) |
+
+Phase 1 is sufficient for models up to ~100k objects. Beyond that, the CPU-side
+frustum test becomes a measurable fraction of the frame budget, motivating
+phases 2 and 3.
+
+### Phase 2: Spatial Tiling (optional, for large models)
+
+For models exceeding ~10k objects, spatial tiling groups nearby objects into
+tiles and culls at the tile level rather than per-object. This reduces the
+number of frustum tests from N_objects to N_tiles (typically hundreds to low
+thousands).
+
+#### When tiling activates
+
+Tiling is **optional and non-disruptive**. The system treats a non-tiled model
+as the degenerate case of "one tile containing everything" — the rendering loop
+always iterates tiles, so no separate code path is needed.
+
+Tiling activates in one of three ways:
+
+1. **Preprocessed cache exists**: If a `.ifcview` sidecar file is found next to
+   the `.ifc` file, the tile structure is loaded from it instantly. The model
+   uploads geometry in tile order.
+2. **Automatic by size**: If the model has more than a configurable threshold of
+   objects (default 10k), a background task builds the spatial tree after
+   initial loading completes. Until it finishes, phase 1 culling handles
+   visibility.
+3. **Explicit user action**: A "preprocess for performance" option builds the
+   spatial tree and saves the sidecar for future loads.
+
+#### Spatial subdivision
+
+The world-space bounding box of the entire model is subdivided using a
+**loose octree**:
+
+- The root node covers the scene AABB.
+- Each node is split when it contains more than a threshold number of objects
+  (e.g. 256).
+- Objects are assigned to the smallest node that fully contains their AABB.
+- "Loose" bounds (inflated by 1.5x) reduce the number of objects that span
+  multiple nodes.
+- Leaf nodes become tiles.
+
+An octree adapts to non-uniform object density (common in buildings — lots of
+detail in MEP risers, sparse in open atriums) better than a uniform grid.
+
+#### EBO re-sorting
+
+For tile-level culling to translate into contiguous index ranges, the EBO must
+be sorted so that all indices for objects in the same tile are adjacent.
+
+This happens via **deferred compaction**:
+
+1. During initial load, geometry uploads in iterator order (fast first frame,
+   phase 1 culling active).
+2. After loading completes, a background thread:
+   a. Builds the octree from the per-object AABBs (already computed in phase 1).
+   b. Determines the tile for each object.
+   c. Computes the new index order (sorted by tile, then by object within tile).
+   d. Builds a new EBO on the CPU.
+3. The main thread uploads the new EBO in one `glNamedBufferSubData` call and
+   swaps in the tile metadata. One frame of stutter, bounded by EBO upload
+   time.
+
+The per-tile metadata:
+
+```cpp
+struct TileInfo {
+    float    aabb_min[3];    // tile bounding box (union of contained AABBs)
+    float    aabb_max[3];
+    uint32_t index_offset;   // into the re-sorted EBO
+    uint32_t index_count;    // sum of all contained objects' indices
+    uint32_t object_count;   // for stats / debugging
+};
+```
+
+#### Preprocessed sidecar format
+
+The `.ifcview` file stores:
+
+- Octree structure (node hierarchy, split planes).
+- Per-object tile assignment (object_id → tile_id mapping).
+- Per-tile index order (so the EBO can be built in tile order directly during
+  upload, skipping the compaction pass entirely).
+- File hash of the source `.ifc` (invalidation check).
+
+This makes second-and-subsequent loads of the same model significantly faster:
+the spatial tree doesn't need to be rebuilt, and geometry uploads in tile order
+from the start.
+
+#### Performance characteristics
+
+| Metric | Value |
+|--------|-------|
+| Tile count (typical) | 500–5,000 for a large building |
+| Per-frame frustum tests | N_tiles instead of N_objects |
+| 500k objects, ~2k tiles | ~0.01 ms frustum testing |
+| Memory overhead | ~64 bytes/tile + 32 bytes/object (phase 1 metadata retained) |
+| Background compaction | 1–5 seconds for 1M objects (single-threaded) |
+| Sidecar file size | ~10–50 KB (indices + tree, no geometry) |
+
+#### Spatial coherence bonus
+
+Beyond culling, tile-sorted EBOs improve GPU cache performance. When the GPU
+rasterizes a tile's triangles, the vertices are contiguous in the VBO, so the
+post-transform vertex cache hits more often. This can yield 10–20% rasterization
+speedup even when nothing is culled (e.g. zoomed out to see the whole model).
+
+### Phase 3: GPU-Driven Indirect Draw
+
+For models with 500k+ objects, tile-level CPU culling is already fast; the
+remaining bottleneck is draw call submission. Phase 3 moves all per-frame
+visibility decisions to the GPU via compute shaders and indirect draw commands.
+
+#### How it works
+
+Phase 3 is **GPU-driven culling layered on top of the phase 2 tiles**. It does
+not replace tiling — it accelerates it.
+
+1. **Upload phase** (once, at load time):
+   - Per-tile AABBs are uploaded to a GPU SSBO (`tile_aabbs`).
+   - One `DrawElementsIndirectCommand` per tile is written to an indirect draw
+     buffer:
+     ```c
+     struct DrawElementsIndirectCommand {
+         uint count;          // tile's total index count
+         uint instanceCount;  // 1
+         uint firstIndex;     // offset into EBO, in indices (not bytes)
+         int  baseVertex;     // 0 (indices are global)
+         uint baseInstance;   // tile_id (gl_BaseInstance in the shader; same as gl_DrawID here)
+     };
+     ```
+   - A "template" copy of the indirect buffer is kept so the compute shader
+     can reset culled commands each frame without re-uploading from CPU.
+
+2. **Cull phase** (every frame, on the GPU):
+   - The CPU uploads 6 frustum plane vec4s as a uniform or small UBO.
+   - A compute shader dispatches `ceil(N_tiles / 64)` workgroups:
+     ```glsl
+     layout(local_size_x = 64) in;
+
+     void main() {
+         uint tile_id = gl_GlobalInvocationID.x;
+         if (tile_id >= tile_count) return;
+
+         // Copy from template (resets any previously zeroed commands)
+         commands[tile_id] = template_commands[tile_id];
+
+         // Frustum test
+         if (!aabb_vs_frustum(tile_aabbs[tile_id], frustum_planes)) {
+             commands[tile_id].count = 0;  // culled: GPU skips zero-count draws
+         }
+     }
+     ```
+   - A memory barrier ensures the indirect buffer is visible to the draw stage.
+
+3. **Draw phase** (every frame):
+   - One call: `glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT,
+     nullptr, N_tiles, 0)`.
+   - The GPU reads the indirect buffer, skips tiles with `count == 0`, and
+     draws the rest. Zero CPU-side per-object or per-tile work.
+
+#### What the CPU does per frame
+
+1. Upload 6 vec4 frustum planes (96 bytes).
+2. Dispatch one compute shader.
+3. Issue one `glMultiDrawElementsIndirect`.
+4. Swap buffers.
+
+That's it. The CPU frame time is essentially constant regardless of model size.
+
+#### Future extensions (enabled by this architecture)
+
+Once the compute-based cull pass exists, it's straightforward to add:
+
+- **Hierarchical-Z occlusion culling**: render a coarse depth buffer from the
+  previous frame, then test tile AABBs against it in the compute shader. Tiles
+  fully behind closer geometry get culled. This handles interior-heavy BIM
+  models well (most rooms are occluded from any given viewpoint).
+- **Distance-based LOD**: the compute shader can select different index ranges
+  (coarse vs. fine tessellation) per tile based on distance to camera.
+- **Contribution culling**: tiles whose screen-space projection is below a
+  pixel threshold get `count = 0`. Removes distant small objects.
+
+#### Performance characteristics
+
+| Metric | Value |
+|--------|-------|
+| CPU per-frame work | ~0.01 ms (constant, independent of model size) |
+| GPU compute dispatch | ~0.02 ms for 2k tiles |
+| Draw call overhead | 1 indirect multi-draw call |
+| GPU memory overhead | ~48 bytes/tile (AABB SSBO) + 20 bytes/tile (indirect commands) × 2 (template + live) |
+| Total for 2k tiles | ~176 KB GPU memory |
+| Implementation complexity | High (compute shaders, SSBOs, memory barriers, indirect draw) |
+
+#### When to use
+
+Phase 3 is worthwhile when:
+
+- The model has 500k+ objects (CPU frustum testing > 3 ms).
+- Smooth 60 fps orbiting is required during interaction.
+- The GPU has compute shader support (OpenGL 4.3+, which is guaranteed since
+  the viewer requires 4.5).
+
+For models under 100k objects, phase 1 alone is sufficient. For 100k–500k,
+phase 2 (tiling) keeps CPU culling under 1 ms. Phase 3 is the final step that
+makes the CPU frame time constant.
+
+### Summary
+
+```
+Model size       Active phases     CPU cull cost     Draw calls
+─────────────    ──────────────    ──────────────    ──────────
+< 10k objects    Phase 1           ~0.06 ms          1 multi-draw
+10k–100k         Phase 1           ~0.6 ms           1 multi-draw
+100k–500k        Phase 1 + 2       ~0.01 ms          1 multi-draw
+500k–1M+         Phase 1 + 2 + 3   ~0 (GPU)          1 indirect multi-draw
+```
+
+The load path:
+
+```
+open(model.ifc):
+  ├─ sidecar exists?
+  │   ├─ yes: load tile tree from .ifcview
+  │   │       upload geometry in tile order
+  │   │       (skip background compaction)
+  │   └─ no:  upload geometry in iterator order (fast first frame)
+  │           phase 1 culling active immediately
+  │           if object_count > threshold:
+  │               background: build octree, re-sort EBO, save .ifcview
+  │               on completion: swap in tile structure
+  └─ rendering:
+      ├─ phase 3 available? → compute cull + indirect multi-draw
+      └─ else               → CPU frustum test + glMultiDrawElements
+```
+
 ## Roadmap
 
-- [ ] Material color support (currently renders default grey per batch)
-- [ ] Buffer growth (reallocate when 64 MB VBO fills up)
-- [ ] `glMultiDrawElementsIndirect` for fewer draw calls
+- [x] Material color support (per-vertex RGBA8)
+- [x] Buffer growth (dynamic VBO/EBO resizing up to 4 GB)
+- [x] Per-object frustum culling (phase 1)
+- [ ] Spatial tiling with octree (phase 2)
+- [ ] GPU-driven indirect draw (phase 3)
+- [ ] Preprocessed `.ifcview` sidecar for fast re-loads
+- [ ] Hierarchical-Z occlusion culling
+- [ ] Distance-based LOD selection
 - [ ] Vulkan/MoltenVK backend for macOS
-- [ ] Spatial tree (BVH) for frustum culling
-- [ ] LOD: coarse tessellation during streaming, refine in background
 - [ ] Embedded Python scripting console
-- [ ] CJK text input support (Qt6 handles this natively)

From 8bd1dee43be71a2d949631089cdd9e28e7c5630a Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 19:50:46 +1000
Subject: [PATCH 07/37] Per-object frustum culling with glMultiDrawElements

Track per-object AABB and index range during upload. Each frame,
extract frustum planes from the view-projection matrix and cull
objects whose AABB is entirely outside any plane. Draw only visible
objects via glMultiDrawElements. Document the three-phase rendering
performance strategy in README.md.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 104 ++++++++++++++++++++++++++++---
 src/ifcviewer/ViewportWindow.h   |  17 ++++-
 2 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 99624cb9f54..414b9889fa5 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -26,7 +26,9 @@
 #include <QtOpenGL/QOpenGLVersionFunctionsFactory>
 
 #include <cstring>
+#include <cmath>
 #include <algorithm>
+#include <limits>
 
 static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;  // 64 MB
 static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;  // 32 MB
@@ -421,9 +423,31 @@ void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
     }
     gl_->glNamedBufferSubData(ebo_, ebo_used_, ib_size, global_indices.data());
 
+    // Compute AABB from vertex positions in this chunk.
+    ObjectDrawInfo info;
+    info.index_offset = static_cast<uint32_t>(ebo_used_);
+    info.index_count = static_cast<uint32_t>(chunk.indices.size());
+
+    const size_t num_verts = chunk.vertices.size() / VERTEX_STRIDE;
+    if (num_verts > 0) {
+        info.aabb_min[0] = info.aabb_min[1] = info.aabb_min[2] =  std::numeric_limits<float>::max();
+        info.aabb_max[0] = info.aabb_max[1] = info.aabb_max[2] = -std::numeric_limits<float>::max();
+        for (size_t v = 0; v < num_verts; ++v) {
+            const float* pos = &chunk.vertices[v * VERTEX_STRIDE];
+            for (int a = 0; a < 3; ++a) {
+                if (pos[a] < info.aabb_min[a]) info.aabb_min[a] = pos[a];
+                if (pos[a] > info.aabb_max[a]) info.aabb_max[a] = pos[a];
+            }
+        }
+    } else {
+        info.aabb_min[0] = info.aabb_min[1] = info.aabb_min[2] = 0.0f;
+        info.aabb_max[0] = info.aabb_max[1] = info.aabb_max[2] = 0.0f;
+    }
+
     {
         std::lock_guard<std::mutex> lock(upload_mutex_);
         total_index_count_ += static_cast<uint32_t>(chunk.indices.size());
+        object_draw_info_.push_back(info);
     }
 
     vbo_used_ += vb_size;
@@ -442,6 +466,7 @@ void ViewportWindow::resetScene() {
     vertex_count_ = 0;
     total_triangles_ = 0;
     selected_object_id_ = 0;
+    object_draw_info_.clear();
 }
 
 void ViewportWindow::setSelectedObjectId(uint32_t id) {
@@ -504,6 +529,63 @@ void ViewportWindow::updateCamera() {
     proj_matrix_.perspective(45.0f, aspect, 0.1f, camera_distance_ * 10.0f);
 }
 
+void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
+    visible_counts_.clear();
+    visible_offsets_.clear();
+
+    std::lock_guard<std::mutex> lock(upload_mutex_);
+    if (object_draw_info_.empty()) return;
+
+    // Extract 6 frustum planes from the view-projection matrix.
+    // Each plane is (a, b, c, d) where ax + by + cz + d >= 0 is inside.
+    // QMatrix4x4 is stored column-major; operator(row, col) gives element.
+    float planes[6][4];
+    for (int i = 0; i < 4; ++i) {
+        planes[0][i] = vp(3, i) + vp(0, i);  // left
+        planes[1][i] = vp(3, i) - vp(0, i);  // right
+        planes[2][i] = vp(3, i) + vp(1, i);  // bottom
+        planes[3][i] = vp(3, i) - vp(1, i);  // top
+        planes[4][i] = vp(3, i) + vp(2, i);  // near
+        planes[5][i] = vp(3, i) - vp(2, i);  // far
+    }
+    // Normalize planes.
+    for (int p = 0; p < 6; ++p) {
+        float len = std::sqrt(planes[p][0] * planes[p][0] +
+                              planes[p][1] * planes[p][1] +
+                              planes[p][2] * planes[p][2]);
+        if (len > 0.0f) {
+            float inv = 1.0f / len;
+            planes[p][0] *= inv;
+            planes[p][1] *= inv;
+            planes[p][2] *= inv;
+            planes[p][3] *= inv;
+        }
+    }
+
+    visible_counts_.reserve(object_draw_info_.size());
+    visible_offsets_.reserve(object_draw_info_.size());
+
+    for (const auto& obj : object_draw_info_) {
+        bool visible = true;
+        for (int p = 0; p < 6; ++p) {
+            // p-vertex: the AABB corner most in the direction of the plane normal.
+            float px = planes[p][0] >= 0.0f ? obj.aabb_max[0] : obj.aabb_min[0];
+            float py = planes[p][1] >= 0.0f ? obj.aabb_max[1] : obj.aabb_min[1];
+            float pz = planes[p][2] >= 0.0f ? obj.aabb_max[2] : obj.aabb_min[2];
+            float dist = planes[p][0] * px + planes[p][1] * py + planes[p][2] * pz + planes[p][3];
+            if (dist < 0.0f) {
+                visible = false;
+                break;
+            }
+        }
+        if (visible) {
+            visible_counts_.push_back(static_cast<GLsizei>(obj.index_count));
+            visible_offsets_.push_back(reinterpret_cast<const void*>(
+                static_cast<uintptr_t>(obj.index_offset)));
+        }
+    }
+}
+
 void ViewportWindow::render() {
     if (!gl_initialized_ || !isExposed()) return;
 
@@ -524,11 +606,12 @@ void ViewportWindow::render() {
 
     gl_->glBindVertexArray(vao_);
 
-    {
-        std::lock_guard<std::mutex> lock(upload_mutex_);
-        if (total_index_count_ > 0) {
-            gl_->glDrawElements(GL_TRIANGLES, total_index_count_, GL_UNSIGNED_INT, nullptr);
-        }
+    buildVisibleList(vp);
+    if (!visible_counts_.empty()) {
+        gl_->glMultiDrawElements(GL_TRIANGLES,
+            visible_counts_.data(), GL_UNSIGNED_INT,
+            visible_offsets_.data(),
+            static_cast<GLsizei>(visible_counts_.size()));
     }
 
     renderAxisGizmo();
@@ -588,11 +671,12 @@ void ViewportWindow::renderPickPass() {
 
     gl_->glBindVertexArray(vao_);
 
-    {
-        std::lock_guard<std::mutex> lock(upload_mutex_);
-        if (total_index_count_ > 0) {
-            gl_->glDrawElements(GL_TRIANGLES, total_index_count_, GL_UNSIGNED_INT, nullptr);
-        }
+    // Reuse the visible list from the most recent render() call.
+    if (!visible_counts_.empty()) {
+        gl_->glMultiDrawElements(GL_TRIANGLES,
+            visible_counts_.data(), GL_UNSIGNED_INT,
+            visible_offsets_.data(),
+            static_cast<GLsizei>(visible_counts_.size()));
     }
 
     gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index cb718050c8f..363158b16f7 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -36,6 +36,13 @@ struct MaterialInfo {
     float r = 0.75f, g = 0.75f, b = 0.78f, a = 1.0f;
 };
 
+struct ObjectDrawInfo {
+    uint32_t index_offset;  // byte offset into EBO
+    uint32_t index_count;   // number of indices
+    float aabb_min[3];      // world-space AABB
+    float aabb_max[3];
+};
+
 struct UploadChunk {
     // Interleaved per-vertex layout (8 floats / 32 bytes per vertex):
     //   pos(3 float) + normal(3 float) + object_id(1 float bitcast from uint)
@@ -77,6 +84,7 @@ class ViewportWindow : public QWindow {
     void buildAxisGizmo();
     bool growVbo(size_t needed_total);
     bool growEbo(size_t needed_total);
+    void buildVisibleList(const QMatrix4x4& vp);
 
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
@@ -116,12 +124,15 @@ class ViewportWindow : public QWindow {
     int pick_width_ = 0;
     int pick_height_ = 0;
 
-    // The entire scene is a single mega-batch: per-vertex color removes the
-    // need to switch materials between draw calls. Indices are written into
-    // the EBO already offset by base_vertex so one glDrawElements covers all.
+    // Per-object draw metadata for frustum culling.
+    std::vector<ObjectDrawInfo> object_draw_info_;
     uint32_t total_index_count_ = 0;
     std::mutex upload_mutex_;
 
+    // Scratch buffers reused each frame to avoid allocation.
+    std::vector<GLsizei> visible_counts_;
+    std::vector<const void*> visible_offsets_;
+
     // Camera
     QVector3D camera_target_{0, 0, 0};
     float camera_distance_ = 50.0f;

From 6a001840bcb5a4e68f58ca1f1b2dad485a9845ac Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 20:05:50 +1000
Subject: [PATCH 08/37] Add performance stats overlay in status bar

Show FPS, frame time, visible/total objects, and visible/total
triangles in the status bar. Toggled via the "Show Performance Stats"
checkbox in the Settings dialog (File > Settings...) and persisted in
app settings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/AppSettings.cpp    | 14 ++++++++++++++
 src/ifcviewer/AppSettings.h      |  5 +++++
 src/ifcviewer/MainWindow.cpp     | 21 +++++++++++++++++++++
 src/ifcviewer/MainWindow.h       |  1 +
 src/ifcviewer/SettingsWindow.cpp |  6 ++++++
 src/ifcviewer/SettingsWindow.h   |  2 ++
 src/ifcviewer/ViewportWindow.cpp | 21 +++++++++++++++++++++
 src/ifcviewer/ViewportWindow.h   | 14 ++++++++++++++
 8 files changed, 84 insertions(+)

diff --git a/src/ifcviewer/AppSettings.cpp b/src/ifcviewer/AppSettings.cpp
index 07c5f8c3bc2..af1edfa36f6 100644
--- a/src/ifcviewer/AppSettings.cpp
+++ b/src/ifcviewer/AppSettings.cpp
@@ -24,6 +24,7 @@
 namespace {
 constexpr const char* kGeometryLibraryKey = "geometry/library";
 constexpr const char* kGeometryLibraryDefault = "hybrid-cgal-simple-opencascade";
+constexpr const char* kShowStatsKey = "viewport/show_stats";
 }
 
 AppSettings& AppSettings::instance() {
@@ -46,12 +47,25 @@ void AppSettings::setGeometryLibrary(const QString& value) {
     emit geometryLibraryChanged(value);
 }
 
+bool AppSettings::showStats() const {
+    return show_stats_;
+}
+
+void AppSettings::setShowStats(bool value) {
+    if (show_stats_ == value) return;
+    show_stats_ = value;
+    persist();
+    emit showStatsChanged(value);
+}
+
 void AppSettings::load() {
     QSettings settings;
     geometry_library_ = settings.value(kGeometryLibraryKey, kGeometryLibraryDefault).toString();
+    show_stats_ = settings.value(kShowStatsKey, false).toBool();
 }
 
 void AppSettings::persist() {
     QSettings settings;
     settings.setValue(kGeometryLibraryKey, geometry_library_);
+    settings.setValue(kShowStatsKey, show_stats_);
 }
diff --git a/src/ifcviewer/AppSettings.h b/src/ifcviewer/AppSettings.h
index 9658c10b955..f70062475c6 100644
--- a/src/ifcviewer/AppSettings.h
+++ b/src/ifcviewer/AppSettings.h
@@ -34,8 +34,12 @@ class AppSettings : public QObject {
     QString geometryLibrary() const;
     void setGeometryLibrary(const QString& value);
 
+    bool showStats() const;
+    void setShowStats(bool value);
+
 signals:
     void geometryLibraryChanged(const QString& value);
+    void showStatsChanged(bool value);
 
 private:
     AppSettings();
@@ -43,6 +47,7 @@ class AppSettings : public QObject {
     void persist();
 
     QString geometry_library_;
+    bool show_stats_ = false;
 };
 
 #endif // APPSETTINGS_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 6eede353532..4abd929b0b8 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -18,6 +18,7 @@
  ********************************************************************************/
 
 #include "MainWindow.h"
+#include "AppSettings.h"
 #include "SettingsWindow.h"
 
 #include <QApplication>
@@ -43,6 +44,23 @@ MainWindow::MainWindow(QWidget* parent)
         QMessageBox::warning(this, "Error", msg);
     }, Qt::QueuedConnection);
 
+    connect(viewport_, &ViewportWindow::frameStatsUpdated, this, [this](const ViewportWindow::FrameStats& s) {
+        if (!stats_label_->isVisible()) return;
+        stats_label_->setText(
+            QString("%1 fps | %2 ms | %3/%4 obj | %5/%6 tri")
+                .arg(s.fps, 0, 'f', 1)
+                .arg(s.frame_time_ms, 0, 'f', 1)
+                .arg(s.visible_objects)
+                .arg(s.total_objects)
+                .arg(s.visible_triangles)
+                .arg(s.total_triangles));
+    });
+
+    connect(&AppSettings::instance(), &AppSettings::showStatsChanged, this, [this](bool show) {
+        stats_label_->setVisible(show);
+        if (!show) stats_label_->clear();
+    });
+
     connect(&element_poll_timer_, &QTimer::timeout, this, &MainWindow::pollNewElements);
     element_poll_timer_.setInterval(100);
 
@@ -91,7 +109,10 @@ void MainWindow::setupUi() {
     progress_bar_->setMaximumWidth(200);
     progress_bar_->setVisible(false);
     status_label_ = new QLabel("Ready");
+    stats_label_ = new QLabel();
+    stats_label_->setVisible(AppSettings::instance().showStats());
     statusBar()->addWidget(status_label_, 1);
+    statusBar()->addPermanentWidget(stats_label_);
     statusBar()->addPermanentWidget(progress_bar_);
 }
 
diff --git a/src/ifcviewer/MainWindow.h b/src/ifcviewer/MainWindow.h
index d5f4c18a395..bbec6ce83de 100644
--- a/src/ifcviewer/MainWindow.h
+++ b/src/ifcviewer/MainWindow.h
@@ -66,6 +66,7 @@ private slots:
     QTableWidget* property_table_ = nullptr;
     QProgressBar* progress_bar_ = nullptr;
     QLabel* status_label_ = nullptr;
+    QLabel* stats_label_ = nullptr;
     QTimer element_poll_timer_;
     QElapsedTimer load_timer_;
 
diff --git a/src/ifcviewer/SettingsWindow.cpp b/src/ifcviewer/SettingsWindow.cpp
index a24f9bc9763..c4ebddc650e 100644
--- a/src/ifcviewer/SettingsWindow.cpp
+++ b/src/ifcviewer/SettingsWindow.cpp
@@ -20,6 +20,7 @@
 #include "SettingsWindow.h"
 #include "AppSettings.h"
 
+#include <QCheckBox>
 #include <QDialogButtonBox>
 #include <QFormLayout>
 #include <QLineEdit>
@@ -40,6 +41,9 @@ void SettingsWindow::setupUi() {
     geometry_library_edit_->setMinimumWidth(280);
     form->addRow("Geometry Library", geometry_library_edit_);
 
+    show_stats_check_ = new QCheckBox(this);
+    form->addRow("Show Performance Stats", show_stats_check_);
+
     auto* button_box = new QDialogButtonBox(
         QDialogButtonBox::Ok | QDialogButtonBox::Cancel, this);
 
@@ -60,9 +64,11 @@ void SettingsWindow::showEvent(QShowEvent* event) {
 
 void SettingsWindow::syncFromSettings() {
     geometry_library_edit_->setText(AppSettings::instance().geometryLibrary());
+    show_stats_check_->setChecked(AppSettings::instance().showStats());
 }
 
 void SettingsWindow::onAccepted() {
     AppSettings::instance().setGeometryLibrary(geometry_library_edit_->text());
+    AppSettings::instance().setShowStats(show_stats_check_->isChecked());
     accept();
 }
diff --git a/src/ifcviewer/SettingsWindow.h b/src/ifcviewer/SettingsWindow.h
index 77affe77578..ea55252682e 100644
--- a/src/ifcviewer/SettingsWindow.h
+++ b/src/ifcviewer/SettingsWindow.h
@@ -22,6 +22,7 @@
 
 #include <QDialog>
 
+class QCheckBox;
 class QLineEdit;
 class QShowEvent;
 
@@ -41,6 +42,7 @@ private slots:
     void syncFromSettings();
 
     QLineEdit* geometry_library_edit_ = nullptr;
+    QCheckBox* show_stats_check_ = nullptr;
 };
 
 #endif
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 414b9889fa5..1ebe988554a 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -532,6 +532,7 @@ void ViewportWindow::updateCamera() {
 void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
     visible_counts_.clear();
     visible_offsets_.clear();
+    visible_triangles_ = 0;
 
     std::lock_guard<std::mutex> lock(upload_mutex_);
     if (object_draw_info_.empty()) return;
@@ -582,6 +583,7 @@ void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
             visible_counts_.push_back(static_cast<GLsizei>(obj.index_count));
             visible_offsets_.push_back(reinterpret_cast<const void*>(
                 static_cast<uintptr_t>(obj.index_offset)));
+            visible_triangles_ += obj.index_count / 3;
         }
     }
 }
@@ -617,6 +619,25 @@ void ViewportWindow::render() {
     renderAxisGizmo();
 
     context_->swapBuffers(this);
+
+    // Compute FPS (updated once per second to avoid flicker).
+    float dt = frame_clock_.restart() / 1000.0f;
+    accumulated_time_ += dt;
+    frame_count_++;
+    if (accumulated_time_ >= 1.0f) {
+        last_fps_ = static_cast<float>(frame_count_) / accumulated_time_;
+        frame_count_ = 0;
+        accumulated_time_ = 0.0f;
+
+        FrameStats stats;
+        stats.fps = last_fps_;
+        stats.frame_time_ms = 1000.0f / last_fps_;
+        stats.total_objects = static_cast<uint32_t>(object_draw_info_.size());
+        stats.visible_objects = static_cast<uint32_t>(visible_counts_.size());
+        stats.total_triangles = total_triangles_;
+        stats.visible_triangles = visible_triangles_;
+        emit frameStatsUpdated(stats);
+    }
 }
 
 void ViewportWindow::renderAxisGizmo() {
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 363158b16f7..58a63343212 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -65,9 +65,19 @@ class ViewportWindow : public QWindow {
     void setSelectedObjectId(uint32_t id);
     uint32_t pickObjectAt(int x, int y);
 
+    struct FrameStats {
+        float fps;
+        float frame_time_ms;
+        uint32_t total_objects;
+        uint32_t visible_objects;
+        uint32_t total_triangles;
+        uint32_t visible_triangles;
+    };
+
 signals:
     void objectPicked(uint32_t object_id);
     void initialized();
+    void frameStatsUpdated(const ViewportWindow::FrameStats& stats);
 
 protected:
     void exposeEvent(QExposeEvent* event) override;
@@ -152,6 +162,10 @@ class ViewportWindow : public QWindow {
 
     // Stats
     uint32_t total_triangles_ = 0;
+    uint32_t visible_triangles_ = 0;
+    int frame_count_ = 0;
+    float accumulated_time_ = 0.0f;
+    float last_fps_ = 0.0f;
 };
 
 #endif // VIEWPORTWINDOW_H

From 2ffd2da540f22828dd7ccde07c8b08ada4ac0ffe Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 20:49:39 +1000
Subject: [PATCH 09/37] Multi-model project support with sequential loading

Introduce ModelHandle and per-model GeometryStreamers so multiple IFC
files can be open at the same time. Files are queued and streamed one
at a time. Object IDs are globally unique (monotonically increasing
across models). The file picker is now multi-select. Each model gets a
top-level tree node. Property lookup uses the correct model's
ifcopenshell::file. ViewportWindow supports hide/show/remove per model
via model_id filtering in the frustum cull pass; removal currently only
excludes the model's objects from drawing and does not reclaim buffer
space.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/GeometryStreamer.cpp |   7 +-
 src/ifcviewer/GeometryStreamer.h   |   7 +-
 src/ifcviewer/MainWindow.cpp       | 134 +++++++++++++++++++++--------
 src/ifcviewer/MainWindow.h         |  31 ++++++-
 src/ifcviewer/ViewportWindow.cpp   |  22 +++++
 src/ifcviewer/ViewportWindow.h     |   9 ++
 src/ifcviewer/main.cpp             |   4 +-
 7 files changed, 167 insertions(+), 47 deletions(-)

diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
index 437209c9098..7235bced9f8 100644
--- a/src/ifcviewer/GeometryStreamer.cpp
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -40,7 +40,7 @@ GeometryStreamer::~GeometryStreamer() {
     }
 }
 
-void GeometryStreamer::loadFile(const std::string& path, int num_threads) {
+void GeometryStreamer::loadFile(const std::string& path, uint32_t start_object_id, uint32_t model_id, int num_threads) {
     if (running_.load()) {
         cancel();
         if (worker_thread_ && worker_thread_->isRunning()) {
@@ -52,7 +52,8 @@ void GeometryStreamer::loadFile(const std::string& path, int num_threads) {
     cancel_requested_ = false;
     running_ = true;
     progress_ = 0;
-    next_object_id_ = 1;
+    next_object_id_ = start_object_id;
+    model_id_ = model_id;
 
     {
         std::lock_guard<std::mutex> lock(elements_mutex_);
@@ -139,6 +140,7 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
         // Record element metadata
         ElementInfo info;
         info.object_id = object_id;
+        info.model_id = model_id_;
         info.ifc_id = tri_elem->id();
         info.guid = tri_elem->guid();
         info.name = tri_elem->name();
@@ -201,6 +203,7 @@ static inline uint32_t packRGBA8(const MaterialInfo& m) {
 UploadChunk GeometryStreamer::convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id) {
     UploadChunk chunk;
     chunk.object_id = object_id;
+    chunk.model_id = model_id_;
 
     const auto& geom = elem->geometry();
     const auto& verts = geom.verts();
diff --git a/src/ifcviewer/GeometryStreamer.h b/src/ifcviewer/GeometryStreamer.h
index abd087463cc..0d49a12ca70 100644
--- a/src/ifcviewer/GeometryStreamer.h
+++ b/src/ifcviewer/GeometryStreamer.h
@@ -38,6 +38,7 @@
 
 struct ElementInfo {
     uint32_t object_id;
+    uint32_t model_id;
     int ifc_id;
     std::string guid;
     std::string name;
@@ -51,11 +52,13 @@ class GeometryStreamer : public QObject {
     explicit GeometryStreamer(QObject* parent = nullptr);
     ~GeometryStreamer();
 
-    void loadFile(const std::string& path, int num_threads = 0);
+    void loadFile(const std::string& path, uint32_t start_object_id, uint32_t model_id, int num_threads = 0);
     void cancel();
 
     bool isRunning() const { return running_.load(); }
     int progress() const { return progress_.load(); }
+    uint32_t lastObjectId() const { return next_object_id_; }
+    uint32_t modelId() const { return model_id_; }
 
     ifcopenshell::file* ifcFile() const { return ifc_file_.get(); }
 
@@ -82,8 +85,8 @@ class GeometryStreamer : public QObject {
     std::mutex elements_mutex_;
     std::vector<ElementInfo> pending_elements_;
 
-    // Map from IFC product id to our compact object_id
     uint32_t next_object_id_ = 1; // 0 = no object
+    uint32_t model_id_ = 0;
 };
 
 #endif // GEOMETRYSTREAMER_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 4abd929b0b8..3b4e58fbacc 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -24,6 +24,7 @@
 #include <QApplication>
 #include <QMenuBar>
 #include <QFileDialog>
+#include <QFileInfo>
 #include <QMessageBox>
 #include <QStatusBar>
 #include <QHeaderView>
@@ -36,14 +37,6 @@ MainWindow::MainWindow(QWidget* parent)
     setupUi();
     setupMenus();
 
-    streamer_ = new GeometryStreamer(this);
-    connect(streamer_, &GeometryStreamer::progressChanged, this, &MainWindow::onProgressChanged, Qt::QueuedConnection);
-    connect(streamer_, &GeometryStreamer::elementReady, this, &MainWindow::onElementReady, Qt::QueuedConnection);
-    connect(streamer_, &GeometryStreamer::finished, this, &MainWindow::onStreamingFinished, Qt::QueuedConnection);
-    connect(streamer_, &GeometryStreamer::errorOccurred, this, [this](const QString& msg) {
-        QMessageBox::warning(this, "Error", msg);
-    }, Qt::QueuedConnection);
-
     connect(viewport_, &ViewportWindow::frameStatsUpdated, this, [this](const ViewportWindow::FrameStats& s) {
         if (!stats_label_->isVisible()) return;
         stats_label_->setText(
@@ -118,7 +111,7 @@ void MainWindow::setupUi() {
 
 void MainWindow::setupMenus() {
     auto* file_menu = menuBar()->addMenu("&File");
-    auto* open_action = file_menu->addAction("&Open...", this, &MainWindow::onFileOpen);
+    auto* open_action = file_menu->addAction("&Add Files...", this, &MainWindow::onFileOpen);
     open_action->setShortcut(QKeySequence::Open);
     file_menu->addAction("&Settings...", this, &MainWindow::onFileSettings);
     file_menu->addSeparator();
@@ -126,9 +119,11 @@ void MainWindow::setupMenus() {
 }
 
 void MainWindow::onFileOpen() {
-    QString path = QFileDialog::getOpenFileName(this, "Open IFC File", QString(), "IFC Files (*.ifc *.ifcxml *.ifczip);;All Files (*)");
-    if (!path.isEmpty()) {
-        openFile(path);
+    QStringList paths = QFileDialog::getOpenFileNames(
+        this, "Add IFC Files", QString(),
+        "IFC Files (*.ifc *.ifcxml *.ifczip);;All Files (*)");
+    if (!paths.isEmpty()) {
+        addFiles(paths);
     }
 }
 
@@ -141,21 +136,64 @@ void MainWindow::onFileSettings() {
     settings_->raise();
 }
 
-void MainWindow::openFile(const QString& path) {
-    viewport_->resetScene();
-    element_tree_->clear();
-    property_table_->setRowCount(0);
-    element_map_.clear();
-    tree_items_.clear();
-    ifc_id_to_object_id_.clear();
+void MainWindow::addFiles(const QStringList& paths) {
+    for (const auto& path : paths) {
+        ModelId id = next_model_id_++;
+
+        ModelHandle handle;
+        handle.id = id;
+        handle.file_path = path;
+        handle.display_name = QFileInfo(path).fileName();
+        handle.streamer = new GeometryStreamer(this);
+
+        // Create top-level tree item for this model
+        auto* root = new QTreeWidgetItem(element_tree_);
+        root->setText(0, handle.display_name);
+        root->setText(1, "IFC Model");
+        root->setData(0, Qt::UserRole, static_cast<uint32_t>(0)); // 0 = not a pickable object
+        handle.tree_root = root;
+
+        models_[id] = handle;
+        load_queue_.push_back(id);
+    }
+
+    if (loading_model_id_ == 0) {
+        startNextLoad();
+    }
+}
+
+void MainWindow::connectStreamer(GeometryStreamer* streamer) {
+    connect(streamer, &GeometryStreamer::progressChanged,
+            this, &MainWindow::onProgressChanged, Qt::QueuedConnection);
+    connect(streamer, &GeometryStreamer::elementReady,
+            this, &MainWindow::onElementReady, Qt::QueuedConnection);
+    connect(streamer, &GeometryStreamer::finished,
+            this, &MainWindow::onStreamingFinished, Qt::QueuedConnection);
+    connect(streamer, &GeometryStreamer::errorOccurred, this, [this](const QString& msg) {
+        QMessageBox::warning(this, "Error", msg);
+    }, Qt::QueuedConnection);
+}
+
+void MainWindow::startNextLoad() {
+    if (load_queue_.empty()) {
+        loading_model_id_ = 0;
+        return;
+    }
+
+    loading_model_id_ = load_queue_.front();
+    load_queue_.pop_front();
+
+    auto& model = models_[loading_model_id_];
+    connectStreamer(model.streamer);
 
     progress_bar_->setValue(0);
     progress_bar_->setVisible(true);
-    status_label_->setText("Loading: " + path);
+    status_label_->setText("Loading: " + model.display_name);
 
     load_timer_.restart();
     element_poll_timer_.start();
-    streamer_->loadFile(path.toStdString());
+    model.streamer->loadFile(
+        model.file_path.toStdString(), next_object_id_, loading_model_id_);
 }
 
 void MainWindow::onProgressChanged(int percent) {
@@ -170,15 +208,30 @@ void MainWindow::onStreamingFinished() {
     element_poll_timer_.stop();
     pollNewElements(); // drain remaining
 
+    // Update next_object_id_ from the streamer that just finished.
+    if (loading_model_id_ != 0) {
+        auto it = models_.find(loading_model_id_);
+        if (it != models_.end()) {
+            next_object_id_ = it->second.streamer->lastObjectId();
+        }
+    }
+
     progress_bar_->setVisible(false);
 
     qint64 ms = load_timer_.elapsed();
     QString elapsed = (ms >= 1000)
         ? QString::number(ms / 1000.0, 'f', 2) + " s"
         : QString::number(ms) + " ms";
-    status_label_->setText(QString("Loaded %1 elements in %2")
-        .arg(element_map_.size())
+
+    size_t total_elements = element_map_.size();
+    size_t num_models = models_.size();
+    status_label_->setText(QString("%1 elements across %2 model(s) — last loaded in %3")
+        .arg(total_elements)
+        .arg(num_models)
         .arg(elapsed));
+
+    // Start next model if queued.
+    startNextLoad();
 }
 
 void MainWindow::onObjectPicked(uint32_t object_id) {
@@ -205,15 +258,23 @@ void MainWindow::onTreeSelectionChanged() {
 }
 
 void MainWindow::pollNewElements() {
-    auto elements = streamer_->drainElements();
+    if (loading_model_id_ == 0) return;
+
+    auto it = models_.find(loading_model_id_);
+    if (it == models_.end()) return;
+
+    auto& model = it->second;
+    auto elements = model.streamer->drainElements();
+
     for (auto& info : elements) {
         element_map_[info.object_id] = info;
-        ifc_id_to_object_id_[info.ifc_id] = info.object_id;
+        scoped_ifc_id_to_object_id_[scopedKey(info.model_id, info.ifc_id)] = info.object_id;
 
-        // Find parent tree item
-        QTreeWidgetItem* parent_item = nullptr;
-        auto parent_obj_it = ifc_id_to_object_id_.find(info.parent_id);
-        if (parent_obj_it != ifc_id_to_object_id_.end()) {
+        // Find parent tree item (scoped to this model)
+        QTreeWidgetItem* parent_item = model.tree_root;
+        auto parent_obj_it = scoped_ifc_id_to_object_id_.find(
+            scopedKey(info.model_id, info.parent_id));
+        if (parent_obj_it != scoped_ifc_id_to_object_id_.end()) {
             auto tree_it = tree_items_.find(parent_obj_it->second);
             if (tree_it != tree_items_.end()) {
                 parent_item = tree_it->second;
@@ -225,12 +286,7 @@ void MainWindow::pollNewElements() {
             display_name = QString::fromStdString(info.type) + " #" + QString::number(info.ifc_id);
         }
 
-        QTreeWidgetItem* item;
-        if (parent_item) {
-            item = new QTreeWidgetItem(parent_item);
-        } else {
-            item = new QTreeWidgetItem(element_tree_);
-        }
+        auto* item = new QTreeWidgetItem(parent_item);
         item->setText(0, display_name);
         item->setText(1, QString::fromStdString(info.type));
         item->setText(2, QString::fromStdString(info.guid));
@@ -261,8 +317,11 @@ void MainWindow::populateProperties(uint32_t object_id) {
     addRow("Name", QString::fromStdString(info.name));
     addRow("Type", QString::fromStdString(info.type));
 
-    // If the file is loaded, try to get property sets
-    auto* file = streamer_->ifcFile();
+    // Find the correct model's file for property lookup
+    auto model_it = models_.find(info.model_id);
+    if (model_it == models_.end()) return;
+
+    auto* file = model_it->second.streamer->ifcFile();
     if (!file) return;
 
     auto product = file->instance_by_id(info.ifc_id);
@@ -280,7 +339,6 @@ void MainWindow::populateProperties(uint32_t object_id) {
                     try {
                         str_val = static_cast<std::string>(val);
                     } catch (...) {
-                        // Not a string-convertible attribute (entity ref, aggregate, etc.)
                         str_val = "<" + std::string(ifcopenshell::argument_type_to_string(val.type())) + ">";
                     }
                     addRow(QString::fromStdString(attr->name()), QString::fromStdString(str_val));
diff --git a/src/ifcviewer/MainWindow.h b/src/ifcviewer/MainWindow.h
index bbec6ce83de..e9bcc37cb6b 100644
--- a/src/ifcviewer/MainWindow.h
+++ b/src/ifcviewer/MainWindow.h
@@ -29,6 +29,8 @@
 #include <QTimer>
 #include <QElapsedTimer>
 
+#include <map>
+#include <deque>
 #include <unordered_map>
 
 #include "ViewportWindow.h"
@@ -36,13 +38,24 @@
 
 class SettingsWindow;
 
+using ModelId = uint32_t;
+
+struct ModelHandle {
+    ModelId id = 0;
+    QString file_path;
+    QString display_name;
+    GeometryStreamer* streamer = nullptr;
+    QTreeWidgetItem* tree_root = nullptr;
+    bool visible = true;
+};
+
 class MainWindow : public QMainWindow {
     Q_OBJECT
 public:
     explicit MainWindow(QWidget* parent = nullptr);
     ~MainWindow();
 
-    void openFile(const QString& path);
+    void addFiles(const QStringList& paths);
 
 private slots:
     void onFileOpen();
@@ -58,6 +71,8 @@ private slots:
     void setupUi();
     void setupMenus();
     void populateProperties(uint32_t object_id);
+    void startNextLoad();
+    void connectStreamer(GeometryStreamer* streamer);
 
     ViewportWindow* viewport_ = nullptr;
     SettingsWindow* settings_ = nullptr;
@@ -70,12 +85,22 @@ private slots:
     QTimer element_poll_timer_;
     QElapsedTimer load_timer_;
 
-    GeometryStreamer* streamer_ = nullptr;
+    // Multi-model state
+    std::map<ModelId, ModelHandle> models_;
+    ModelId next_model_id_ = 1;
+    uint32_t next_object_id_ = 1; // monotonically increasing across all models
+    std::deque<ModelId> load_queue_;
+    ModelId loading_model_id_ = 0;
 
     // Map object_id -> tree item and element info
     std::unordered_map<uint32_t, ElementInfo> element_map_;
     std::unordered_map<uint32_t, QTreeWidgetItem*> tree_items_;
-    std::unordered_map<int, uint32_t> ifc_id_to_object_id_;
+    // Scoped (model_id, ifc_id) -> object_id
+    std::unordered_map<uint64_t, uint32_t> scoped_ifc_id_to_object_id_;
+
+    static uint64_t scopedKey(uint32_t model_id, int ifc_id) {
+        return (static_cast<uint64_t>(model_id) << 32) | static_cast<uint32_t>(ifc_id);
+    }
 };
 
 #endif // MAINWINDOW_H
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 1ebe988554a..4217c997423 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -427,6 +427,7 @@ void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
     ObjectDrawInfo info;
     info.index_offset = static_cast<uint32_t>(ebo_used_);
     info.index_count = static_cast<uint32_t>(chunk.indices.size());
+    info.model_id = chunk.model_id;
 
     const size_t num_verts = chunk.vertices.size() / VERTEX_STRIDE;
     if (num_verts > 0) {
@@ -467,6 +468,23 @@ void ViewportWindow::resetScene() {
     total_triangles_ = 0;
     selected_object_id_ = 0;
     object_draw_info_.clear();
+    hidden_models_.clear();
+    removed_models_.clear();
+}
+
+void ViewportWindow::hideModel(uint32_t model_id) {  // flag the model as hidden; buildVisibleList() skips its objects
+    std::lock_guard<std::mutex> lock(upload_mutex_);
+    hidden_models_.insert(model_id);
+}
+
+void ViewportWindow::showModel(uint32_t model_id) {  // clear the hidden flag; no-op if the model was not hidden
+    std::lock_guard<std::mutex> lock(upload_mutex_);
+    hidden_models_.erase(model_id);
+}
+
+void ViewportWindow::removeModel(uint32_t model_id) {  // only records the id; this function releases nothing itself
+    std::lock_guard<std::mutex> lock(upload_mutex_);
+    removed_models_.insert(model_id);  // buildVisibleList() skips objects of removed models
 }
 
 void ViewportWindow::setSelectedObjectId(uint32_t id) {
@@ -567,6 +585,10 @@ void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
     visible_offsets_.reserve(object_draw_info_.size());
 
     for (const auto& obj : object_draw_info_) {
+        // Skip hidden or removed models.
+        if (hidden_models_.count(obj.model_id) || removed_models_.count(obj.model_id))
+            continue;
+
         bool visible = true;
         for (int p = 0; p < 6; ++p) {
             // p-vertex: the AABB corner most in the direction of the plane normal.
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 58a63343212..fda82a1db5e 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -29,6 +29,7 @@
 #include <QVector3D>
 
 #include <vector>
+#include <unordered_set>
 #include <cstdint>
 #include <mutex>
 
@@ -39,6 +40,7 @@ struct MaterialInfo {
 struct ObjectDrawInfo {
     uint32_t index_offset;  // byte offset into EBO
     uint32_t index_count;   // number of indices
+    uint32_t model_id;      // which model this object belongs to
     float aabb_min[3];      // world-space AABB
     float aabb_max[3];
 };
@@ -51,6 +53,7 @@ struct UploadChunk {
     std::vector<float> vertices;
     std::vector<uint32_t> indices; // local to this chunk's vertices
     uint32_t object_id = 0;
+    uint32_t model_id = 0;
 };
 
 class ViewportWindow : public QWindow {
@@ -62,6 +65,10 @@ class ViewportWindow : public QWindow {
     void uploadChunk(const UploadChunk& chunk);
     void resetScene();
 
+    void hideModel(uint32_t model_id);
+    void showModel(uint32_t model_id);
+    void removeModel(uint32_t model_id);
+
     void setSelectedObjectId(uint32_t id);
     uint32_t pickObjectAt(int x, int y);
 
@@ -136,6 +143,8 @@ class ViewportWindow : public QWindow {
 
     // Per-object draw metadata for frustum culling.
     std::vector<ObjectDrawInfo> object_draw_info_;
+    std::unordered_set<uint32_t> hidden_models_;
+    std::unordered_set<uint32_t> removed_models_;
     uint32_t total_index_count_ = 0;
     std::mutex upload_mutex_;
 
diff --git a/src/ifcviewer/main.cpp b/src/ifcviewer/main.cpp
index 3bca693a371..a5bb487db80 100644
--- a/src/ifcviewer/main.cpp
+++ b/src/ifcviewer/main.cpp
@@ -40,7 +40,7 @@ int main(int argc, char* argv[]) {
     QCommandLineParser parser;
     parser.setApplicationDescription("IfcOpenShell IFC Viewer");
     parser.addHelpOption();
-    parser.addPositionalArgument("file", "IFC file to open");
+    parser.addPositionalArgument("files", "IFC file(s) to open", "[files...]");
     parser.process(app);
 
     MainWindow window;
@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
 
     auto args = parser.positionalArguments();
     if (!args.isEmpty()) {
-        window.openFile(args.first());
+        window.addFiles(args);
     }
 
     return app.exec();

From 49b70e8276d0244d619452f4fce58cabb3c24c92 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sat, 11 Apr 2026 20:55:24 +1000
Subject: [PATCH 10/37] Update README for multi-model support and frustum
 culling

Reflect current architecture: per-model streamers, glMultiDrawElements
with frustum culling, 32-byte vertex format with color, multiselect
file picker, settings/stats files.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 9c6c52560ce..d2fb084a416 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -10,21 +10,22 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 |  +----------+ +--------------------------+|
 |  | Element  | | 3D Viewport              ||
 |  | Tree     | | (QWindow + OpenGL 4.5)   ||
-|  |          | |                          ||
-|  +----------+ | Single VBO/EBO           ||
-|  | Property | | DrawElementsBaseVertex   ||
+|  | (per-    | |                          ||
+|  |  model)  | | Single VBO/EBO           ||
+|  +----------+ | glMultiDrawElements      ||
+|  | Property | | frustum culling          ||
 |  | Table    | | GPU pick pass            ||
 |  +----------+ +--------------------------+|
-|  | Status / Progress                      |
+|  | Status / Progress / Stats              |
 +-------------------------------------------+
         ^                    ^
         |                    |
   element metadata     UploadChunks
         |                    |
 +-------------------------------------------+
-|  GeometryStreamer (background QThread)     |
+|  GeometryStreamer (one per loaded model)   |
 |  IfcGeom::Iterator with N threads         |
-|  (one per CPU core by default)            |
+|  (models loaded sequentially)             |
 +-------------------------------------------+
 ```
 
@@ -32,8 +33,10 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 
 - **QWindow viewport** embedded via `QWidget::createWindowContainer()`. This gives us a raw native surface for OpenGL, bypassing `QOpenGLWidget`'s compositor overhead.
 - **One big vertex buffer + index buffer** (64 MB + 32 MB initial). Geometry is appended as it streams in. No per-object VBOs, no rebinding.
-- **Interleaved vertex format**: position (3 floats) + normal (3 floats) + object ID (1 float, bitcast uint32) = 28 bytes per vertex.
+- **Interleaved vertex format**: position (3 floats) + normal (3 floats) + object ID (1 float, bitcast uint32) + color (RGBA8 packed into 1 float) = 32 bytes per vertex.
+- **Per-object frustum culling**: each object's AABB is tested against 6 frustum planes each frame. Only visible objects are drawn via `glMultiDrawElements`.
 - **GPU object picking**: a second render pass writes object IDs to an R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
+- **Multi-model support**: multiple IFC files can be loaded simultaneously. Each model gets its own `GeometryStreamer` (owning the `ifcopenshell::file` for property lookup). Models are loaded sequentially; geometry from all models coexists in the shared VBO/EBO. Per-model visibility toggle and removal are supported.
 - **Multi-threaded tessellation**: `IfcGeom::Iterator` runs on a background thread and internally parallelizes geometry conversion across all CPU cores.
 - **Non-blocking streaming**: the iterator emits `UploadChunk` signals via Qt's queued connection. The main thread uploads to the GPU without blocking iteration.
 - **World coordinates**: geometry is emitted in world space (`use-world-coords=true`) so no per-object transform matrices are needed on the GPU.
@@ -43,9 +46,11 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 | File | Purpose |
 |------|---------|
 | `main.cpp` | Application entry point, GL 4.5 surface format, CLI argument parsing |
-| `MainWindow.h/cpp` | Qt main window: dockable element tree, property table, status bar, menus |
-| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, picking |
-| `GeometryStreamer.h/cpp` | Background geometry processing: loads IFC, runs iterator, emits chunks |
+| `MainWindow.h/cpp` | Qt main window: multi-model project management, element tree, property table, status bar |
+| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, frustum culling, picking |
+| `GeometryStreamer.h/cpp` | Background geometry processing: loads IFC, runs iterator, emits chunks (one per model) |
+| `AppSettings.h/cpp` | Persisted application preferences (geometry library, show stats) |
+| `SettingsWindow.h/cpp` | Settings dialog UI |
 | `CMakeLists.txt` | Build configuration |
 
 ## Dependencies
@@ -94,10 +99,10 @@ make -j$(nproc)
 ## Usage
 
 ```sh
-# Open a file directly
-./IfcViewer model.ifc
+# Open one or more files from the command line
+./IfcViewer arch.ifc struct.ifc mep.ifc
 
-# Or use File -> Open from the menu
+# Or use File -> Add Files from the menu (supports multiselect)
 ./IfcViewer
 ```
 
@@ -114,7 +119,7 @@ make -j$(nproc)
 
 | Key | Action |
 |-----|--------|
-| Ctrl+O | Open file |
+| Ctrl+O | Add files |
 | Ctrl+Q | Quit |
 
 ## Performance Strategy

From 4d5256b427593dc232aaa8ad962fa836d46ce524 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 09:09:32 +1000
Subject: [PATCH 11/37] BVH frustum culling, sidecar cache, per-model buffers,
 progressive upload

Phase 2 performance: BVH acceleration with median-split build, per-model
trees, and EBO re-sorting for GPU cache coherence. Raw binary .ifcview
sidecar stores full geometry + BVH for instant subsequent loads (skip
tessellation entirely).

Per-model GPU buffers (VAO/VBO/EBO per model) eliminate cross-model buffer
copies on growth. Sidecar reads happen on a background thread. Bulk GPU
uploads are progressive (48 MB/frame chunks) so the viewport stays
interactive while multi-GB models stream in.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/BvhAccel.cpp       | 226 +++++++++++
 src/ifcviewer/BvhAccel.h         |  75 ++++
 src/ifcviewer/MainWindow.cpp     | 178 ++++++++-
 src/ifcviewer/MainWindow.h       |   7 +
 src/ifcviewer/README.md          | 349 +++++++++++------
 src/ifcviewer/SidecarCache.cpp   | 196 ++++++++++
 src/ifcviewer/SidecarCache.h     |  76 ++++
 src/ifcviewer/ViewportWindow.cpp | 631 +++++++++++++++++++++++--------
 src/ifcviewer/ViewportWindow.h   | 116 ++++--
 9 files changed, 1534 insertions(+), 320 deletions(-)
 create mode 100644 src/ifcviewer/BvhAccel.cpp
 create mode 100644 src/ifcviewer/BvhAccel.h
 create mode 100644 src/ifcviewer/SidecarCache.cpp
 create mode 100644 src/ifcviewer/SidecarCache.h

diff --git a/src/ifcviewer/BvhAccel.cpp b/src/ifcviewer/BvhAccel.cpp
new file mode 100644
index 00000000000..e0b232a283c
--- /dev/null
+++ b/src/ifcviewer/BvhAccel.cpp
@@ -0,0 +1,226 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "BvhAccel.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <numeric>
+
+namespace {
+
+struct Centroid {
+    float x, y, z;
+};
+
+Centroid computeCentroid(const ObjectDrawInfo& obj) {  // midpoint of the object's AABB
+    return {
+        (obj.aabb_min[0] + obj.aabb_max[0]) * 0.5f,
+        (obj.aabb_min[1] + obj.aabb_max[1]) * 0.5f,
+        (obj.aabb_min[2] + obj.aabb_max[2]) * 0.5f
+    };
+}
+
+void computeAABB(const std::vector<ObjectDrawInfo>& draw_info,  // union of the AABBs of draw_info[indices[0..count)]
+                 const uint32_t* indices, uint32_t count,
+                 float out_min[3], float out_max[3]) {
+    out_min[0] = out_min[1] = out_min[2] = std::numeric_limits<float>::max();   // start from an inverted (empty) box;
+    out_max[0] = out_max[1] = out_max[2] = -std::numeric_limits<float>::max();  // it stays inverted when count == 0
+    for (uint32_t i = 0; i < count; ++i) {
+        const auto& obj = draw_info[indices[i]];
+        for (int a = 0; a < 3; ++a) {  // grow the box per axis
+            if (obj.aabb_min[a] < out_min[a]) out_min[a] = obj.aabb_min[a];
+            if (obj.aabb_max[a] > out_max[a]) out_max[a] = obj.aabb_max[a];
+        }
+    }
+}
+
+// Recursive median-split BVH builder. Appends nodes to mbvh.nodes in pre-order
+// DFS and partitions mbvh.object_indices[start..start+count) in place.
+void buildRecursive(ModelBvh& mbvh,
+                    const std::vector<ObjectDrawInfo>& draw_info,
+                    uint32_t start, uint32_t count) {
+    uint32_t node_idx = static_cast<uint32_t>(mbvh.nodes.size());
+    mbvh.nodes.emplace_back();
+    BvhNode& node = mbvh.nodes[node_idx];
+
+    computeAABB(draw_info, &mbvh.object_indices[start], count,
+                node.aabb_min, node.aabb_max);
+
+    if (count <= BVH_MAX_LEAF_SIZE) {
+        node.right_or_first = start;
+        node.count = static_cast<uint16_t>(count);
+        node.axis = 0;
+        return;
+    }
+
+    // Find longest axis of node AABB.
+    float extent[3] = {
+        node.aabb_max[0] - node.aabb_min[0],
+        node.aabb_max[1] - node.aabb_min[1],
+        node.aabb_max[2] - node.aabb_min[2]
+    };
+    int axis = 0;
+    if (extent[1] > extent[axis]) axis = 1;
+    if (extent[2] > extent[axis]) axis = 2;
+
+    // Partition at median centroid on the chosen axis. The coordinate is picked
+    // by name: indexing past &c.x is not valid pointer arithmetic on a struct.
+    auto centroid_on_axis = [&](uint32_t i) {
+        const Centroid c = computeCentroid(draw_info[i]);
+        return axis == 0 ? c.x : (axis == 1 ? c.y : c.z);
+    };
+    uint32_t mid = count / 2;
+    std::nth_element(
+        mbvh.object_indices.begin() + start,
+        mbvh.object_indices.begin() + start + mid,
+        mbvh.object_indices.begin() + start + count,
+        [&](uint32_t a, uint32_t b) { return centroid_on_axis(a) < centroid_on_axis(b); });
+
+    node.count = 0;  // interior
+    node.axis = static_cast<uint16_t>(axis);
+
+    // Left child is always node_idx + 1 (implicit in pre-order DFS). `node` may
+    // dangle after this call because mbvh.nodes can reallocate; use node_idx.
+    buildRecursive(mbvh, draw_info, start, mid);
+
+    // Right child is the next node written after the entire left subtree.
+    uint32_t right_child_idx = static_cast<uint32_t>(mbvh.nodes.size());
+    buildRecursive(mbvh, draw_info, start + mid, count - mid);
+
+    // Patch the right child index (left is implicit = node_idx + 1).
+    mbvh.nodes[node_idx].right_or_first = right_child_idx;
+}
+
+} // anonymous namespace
+
+ModelBvh buildModelBvh(const std::vector<ObjectDrawInfo>& draw_info,
+                       const std::vector<uint32_t>& model_object_indices,
+                       uint32_t model_id) {
+    ModelBvh mbvh;
+    mbvh.model_id = model_id;
+    mbvh.object_indices = model_object_indices;  // working copy; buildRecursive() reorders it in place
+
+    uint32_t count = static_cast<uint32_t>(model_object_indices.size());
+    if (count == 0) return mbvh;  // no objects: return a BVH without nodes
+
+    // Reserve an upper bound: a binary tree with at most n leaves has fewer than 2*n nodes.
+    mbvh.nodes.reserve(count * 2);
+
+    buildRecursive(mbvh, draw_info, 0, count);
+
+    // Sanity check only: the build produced a root node (leaf coverage is not verified).
+    assert(!mbvh.nodes.empty());
+
+    return mbvh;
+}
+
+std::shared_ptr<BvhSet> buildBvhSet(const std::vector<ObjectDrawInfo>& draw_info) {
+    auto bvh_set = std::make_shared<BvhSet>();
+
+    // Group object indices (positions in draw_info) by model_id.
+    std::unordered_map<uint32_t, std::vector<uint32_t>> model_objects;
+    for (uint32_t i = 0; i < static_cast<uint32_t>(draw_info.size()); ++i) {
+        model_objects[draw_info[i].model_id].push_back(i);
+    }
+
+    // Build one BVH per model; models below BVH_MIN_OBJECTS get no tree.
+    for (auto& [model_id, obj_indices] : model_objects) {
+        if (obj_indices.size() < BVH_MIN_OBJECTS) continue;
+
+        ModelBvh mbvh = buildModelBvh(draw_info, obj_indices, model_id);
+        bvh_set->bvh_model_ids.insert(model_id);  // ids of models that own a tree in `models`
+        bvh_set->models[model_id] = std::move(mbvh);
+    }
+
+    return bvh_set;
+}
+
+EboReorderResult reorderEbo(const BvhSet& bvh_set,
+                            const std::vector<ObjectDrawInfo>& draw_info,
+                            const std::vector<uint32_t>& original_ebo) {
+    // Returns a copy of the EBO whose index ranges follow BVH leaf order
+    // (models in bvh_set.models iteration order, then every remaining object
+    // in draw_info order) plus a copy of draw_info with index_offset updated.
+    // Entries keep their position in draw_info; only byte offsets change.
+    EboReorderResult result;
+    result.reordered_draw_info = draw_info;
+    result.reordered_ebo.reserve(original_ebo.size());
+
+    // Track which draw_info entries have been placed.
+    std::vector<bool> placed(draw_info.size(), false);
+
+    // Move one object's index range to the end of the new EBO (at most once).
+    auto place = [&](uint32_t oi) {
+        if (placed[oi]) return;
+        placed[oi] = true;
+
+        const auto& old_info = draw_info[oi];
+        const uint32_t new_offset = static_cast<uint32_t>(
+            result.reordered_ebo.size() * sizeof(uint32_t));
+        if (old_info.index_count > 0) {
+            // index_offset is in bytes; convert to an element index into the EBO.
+            const auto first = original_ebo.begin() + old_info.index_offset / sizeof(uint32_t);
+            result.reordered_ebo.insert(result.reordered_ebo.end(),
+                                        first, first + old_info.index_count);
+        }
+        result.reordered_draw_info[oi].index_offset = new_offset;
+    };
+
+    // Heap-backed DFS stack: unlike a fixed-size array it cannot overflow if a
+    // tree is deeper than expected.
+    std::vector<uint32_t> stack;
+
+    for (const auto& [model_id, mbvh] : bvh_set.models) {
+        // An empty tree has no root node; indexing nodes[0] would be out of
+        // bounds, so skip it (its objects are picked up by the final loop).
+        if (mbvh.nodes.empty()) continue;
+
+        stack.clear();
+        stack.push_back(0);
+
+        while (!stack.empty()) {
+            const uint32_t ni = stack.back();
+            stack.pop_back();
+            const BvhNode& node = mbvh.nodes[ni];
+
+            if (node.count > 0) {
+                // Leaf: emit objects in order.
+                for (uint32_t i = 0; i < node.count; ++i) {
+                    place(mbvh.object_indices[node.right_or_first + i]);
+                }
+            } else {
+                // Interior: push left (=ni+1) last so it's processed first.
+                stack.push_back(node.right_or_first);  // right child
+                stack.push_back(ni + 1);               // left child
+            }
+        }
+    }
+
+    // Append non-BVH objects (models too small for BVH).
+    for (uint32_t oi = 0; oi < static_cast<uint32_t>(draw_info.size()); ++oi) {
+        place(oi);
+    }
+
+    assert(result.reordered_ebo.size() == original_ebo.size());
+
+    return result;
+}
diff --git a/src/ifcviewer/BvhAccel.h b/src/ifcviewer/BvhAccel.h
new file mode 100644
index 00000000000..21c57c2712a
--- /dev/null
+++ b/src/ifcviewer/BvhAccel.h
@@ -0,0 +1,75 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef BVHACCEL_H
+#define BVHACCEL_H
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <memory>
+
+struct ObjectDrawInfo {
+    uint32_t index_offset;  // byte offset into EBO (32-bit: cannot address past 4 GiB)
+    uint32_t index_count;   // number of indices
+    uint32_t model_id;      // which model this object belongs to
+    float aabb_min[3];      // world-space AABB, minimum corner
+    float aabb_max[3];      // world-space AABB, maximum corner
+};
+
+static constexpr uint32_t BVH_MAX_LEAF_SIZE = 8;
+static constexpr uint32_t BVH_MIN_OBJECTS = 32;
+
+struct BvhNode {
+    float aabb_min[3];       // bounds enclosing every object below this node
+    float aabb_max[3];
+    uint32_t right_or_first; // interior: right child index (left is always this_index+1); leaf: first slot in ModelBvh::object_indices
+    uint16_t count;           // 0 = interior; >0 = leaf with this many objects
+    uint16_t axis;            // split axis (0/1/2) for interior; unused for leaf
+};
+static_assert(sizeof(BvhNode) == 32, "BvhNode must be 32 bytes for cache alignment and sidecar format");
+
+struct ModelBvh {
+    uint32_t model_id = 0;
+    std::vector<BvhNode> nodes;            // pre-order DFS layout; nodes[0] is the root when non-empty
+    std::vector<uint32_t> object_indices;  // indices into object_draw_info_; each leaf owns a contiguous run
+};
+
+struct BvhSet {
+    std::unordered_map<uint32_t, ModelBvh> models;  // keyed by model_id
+    std::unordered_set<uint32_t> bvh_model_ids;     // ids of the models that have an entry in `models`
+};
+
+struct EboReorderResult {
+    std::vector<uint32_t> reordered_ebo;              // index data regrouped in BVH leaf order
+    std::vector<ObjectDrawInfo> reordered_draw_info;  // same entry order as the input; only index_offset differs
+};
+
+// Build BVH trees for all models in the given draw info snapshot.
+// Only builds the tree structure; does not touch EBO data.
+std::shared_ptr<BvhSet> buildBvhSet(const std::vector<ObjectDrawInfo>& draw_info);
+
+// Reorder the EBO so objects within each BVH leaf are contiguous.
+// Must be called with the CURRENT run's EBO and draw_info (not cached).
+EboReorderResult reorderEbo(const BvhSet& bvh_set,
+                            const std::vector<ObjectDrawInfo>& draw_info,
+                            const std::vector<uint32_t>& original_ebo);
+
+#endif // BVHACCEL_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 3b4e58fbacc..b5ee3581c44 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -20,6 +20,7 @@
 #include "MainWindow.h"
 #include "AppSettings.h"
 #include "SettingsWindow.h"
+#include "SidecarCache.h"
 
 #include <QApplication>
 #include <QMenuBar>
@@ -61,7 +62,14 @@ MainWindow::MainWindow(QWidget* parent)
     resize(1400, 900);
 }
 
-MainWindow::~MainWindow() {}
+MainWindow::~MainWindow() {
+    joinSidecarThread();  // a std::thread that is still joinable when destroyed calls std::terminate
+}
+
+void MainWindow::joinSidecarThread() {  // blocks until the sidecar read thread, if any, has finished
+    if (sidecar_read_thread_.joinable())
+        sidecar_read_thread_.join();
+}
 
 void MainWindow::setupUi() {
     // 3D Viewport as central widget
@@ -158,7 +166,7 @@ void MainWindow::addFiles(const QStringList& paths) {
     }
 
     if (loading_model_id_ == 0) {
-        startNextLoad();
+        QTimer::singleShot(0, this, &MainWindow::startNextLoad);
     }
 }
 
@@ -184,16 +192,133 @@ void MainWindow::startNextLoad() {
     load_queue_.pop_front();
 
     auto& model = models_[loading_model_id_];
-    connectStreamer(model.streamer);
 
-    progress_bar_->setValue(0);
-    progress_bar_->setVisible(true);
+    load_timer_.restart();
     status_label_->setText("Loading: " + model.display_name);
 
-    load_timer_.restart();
-    element_poll_timer_.start();
-    model.streamer->loadFile(
-        model.file_path.toStdString(), next_object_id_, loading_model_id_);
+    // Try sidecar on a background thread so the UI stays responsive.
+    std::string ifc_path = model.file_path.toStdString();
+    uint64_t file_size = static_cast<uint64_t>(QFileInfo(model.file_path).size());
+    ModelId mid = loading_model_id_;
+
+    joinSidecarThread();
+    sidecar_read_thread_ = std::thread([this, ifc_path, file_size, mid]() {
+        QElapsedTimer rt; rt.start();
+        auto cached = readSidecar(ifc_path, file_size);
+        qDebug("  Sidecar read: %lld ms (%s)", rt.elapsed(), ifc_path.c_str());
+        auto result = std::make_shared<std::optional<SidecarData>>(std::move(cached));
+        QMetaObject::invokeMethod(this, [this, mid, result]() {
+            if (*result && !(*result)->draw_info.empty()) {
+                applySidecarData(mid, std::move(**result));
+            } else {
+                // No sidecar — fall back to streaming from IFC.
+                auto it = models_.find(mid);
+                if (it == models_.end()) return;
+                auto& m = it->second;
+                connectStreamer(m.streamer);
+                progress_bar_->setValue(0);
+                progress_bar_->setVisible(true);
+                status_label_->setText("Loading: " + m.display_name);
+                element_poll_timer_.start();
+                m.streamer->loadFile(
+                    m.file_path.toStdString(), next_object_id_, loading_model_id_);
+            }
+        }, Qt::QueuedConnection);
+    });
+}
+
+void MainWindow::applySidecarData(ModelId mid, SidecarData data) {
+    // Install a model from cached sidecar data, then continue with the load queue.
+    auto it = models_.find(mid);
+    if (it == models_.end()) {
+        // Model is gone: release the loader slot so queued files still start.
+        if (loading_model_id_ == mid) {
+            loading_model_id_ = 0;
+            QTimer::singleShot(0, this, &MainWindow::startNextLoad);
+        }
+        return;
+    }
+    auto& model = it->second;
+    QElapsedTimer t;
+
+    qDebug("Sidecar hit: %s (%zu objects, %zu verts, %zu indices, %.1f MB)",
+           model.file_path.toStdString().c_str(), data.draw_info.size(),
+           data.vertices.size() / 8, data.indices.size(),
+           (data.vertices.size() * 4 + data.indices.size() * 4) / (1024.0 * 1024.0));
+
+    // Hand geometry, draw info and BVH to the viewport.
+    t.start();
+    viewport_->uploadBulk(mid, data.vertices, data.indices,
+                          data.draw_info, std::move(data.bvh_set));
+    qDebug("  GL upload: %lld ms", t.elapsed());
+
+    // Update next_object_id_ past all objects in this model.
+    for (const auto& elem : data.elements) {
+        if (elem.object_id >= next_object_id_) next_object_id_ = elem.object_id + 1;
+    }
+
+    // Suppress per-item layout recalcs while building the tree.
+    t.restart();
+    element_tree_->setUpdatesEnabled(false);
+    populateTreeFromSidecar(model, data.elements, data.string_table);
+    element_tree_->setUpdatesEnabled(true);
+    qDebug("  Tree build: %lld ms (%zu elements)", t.elapsed(), data.elements.size());
+    progress_bar_->setVisible(false);
+    qint64 ms = load_timer_.elapsed();
+    QString elapsed = (ms >= 1000) ? QString::number(ms / 1000.0, 'f', 2) + " s"
+                                   : QString::number(ms) + " ms";
+    status_label_->setText(QString("%1 elements across %2 model(s) — loaded from cache in %3")
+        .arg(element_map_.size()).arg(models_.size()).arg(elapsed));
+    // Release the loader slot and move on to the next queued file.
+    loading_model_id_ = 0;
+    QTimer::singleShot(0, this, &MainWindow::startNextLoad);
+}
+
+void MainWindow::populateTreeFromSidecar(ModelHandle& model,
+                                          const std::vector<PackedElementInfo>& elements,
+                                          const std::string& stbl) {
+    // Slice of the string table; empty for zero-length or out-of-range records.
+    // Written without `offset + length` so 32-bit wraparound cannot pass the check.
+    auto str = [&](uint32_t offset, uint32_t length) -> std::string {
+        if (length == 0 || offset > stbl.size() || length > stbl.size() - offset) return {};
+        return stbl.substr(offset, length);
+    };
+
+    for (const auto& pe : elements) {
+        ElementInfo info;
+        info.object_id = pe.object_id;
+        info.model_id = pe.model_id;
+        info.ifc_id = pe.ifc_id;
+        info.parent_id = pe.parent_id;
+        info.guid = str(pe.guid_offset, pe.guid_length);
+        info.name = str(pe.name_offset, pe.name_length);
+        info.type = str(pe.type_offset, pe.type_length);
+
+        element_map_[info.object_id] = info;
+        scoped_ifc_id_to_object_id_[scopedKey(info.model_id, info.ifc_id)] = info.object_id;
+
+        // Find parent tree item; fall back to the model root when the parent has
+        // no item yet (records are resolved in the order they are stored).
+        QTreeWidgetItem* parent_item = model.tree_root;
+        auto parent_obj_it = scoped_ifc_id_to_object_id_.find(scopedKey(info.model_id, info.parent_id));
+        if (parent_obj_it != scoped_ifc_id_to_object_id_.end()) {
+            auto tree_it = tree_items_.find(parent_obj_it->second);
+            if (tree_it != tree_items_.end()) parent_item = tree_it->second;
+        }
+
+        QString display_name = QString::fromStdString(info.name);
+        if (display_name.isEmpty()) {
+            display_name = QString::fromStdString(info.type) + " #" + QString::number(info.ifc_id);
+        }
+
+        auto* item = new QTreeWidgetItem(parent_item);
+        item->setText(0, display_name);
+        item->setText(1, QString::fromStdString(info.type));
+        item->setText(2, QString::fromStdString(info.guid));
+        item->setData(0, Qt::UserRole, info.object_id);
+
+        tree_items_[info.object_id] = item;
+    }
+}
 
 void MainWindow::onProgressChanged(int percent) {
@@ -230,6 +355,41 @@ void MainWindow::onStreamingFinished() {
         .arg(num_models)
         .arg(elapsed));
 
+    // Build BVH and write sidecar (geometry + metadata + BVH).
+    if (loading_model_id_ != 0) {
+        auto it = models_.find(loading_model_id_);
+        if (it != models_.end()) {
+            std::string ifc_path = it->second.file_path.toStdString();
+            QFileInfo fi(it->second.file_path);
+            uint64_t file_size = static_cast<uint64_t>(fi.size());
+
+            // Pack element info for the sidecar (only this model's elements).
+            std::vector<PackedElementInfo> packed;
+            std::string stbl;
+            for (const auto& [oid, info] : element_map_) {
+                if (info.model_id != loading_model_id_) continue;
+                PackedElementInfo pe;
+                pe.object_id = info.object_id;
+                pe.model_id = info.model_id;
+                pe.ifc_id = info.ifc_id;
+                pe.parent_id = info.parent_id;
+                pe.guid_offset = static_cast<uint32_t>(stbl.size());
+                pe.guid_length = static_cast<uint32_t>(info.guid.size());
+                stbl += info.guid;
+                pe.name_offset = static_cast<uint32_t>(stbl.size());
+                pe.name_length = static_cast<uint32_t>(info.name.size());
+                stbl += info.name;
+                pe.type_offset = static_cast<uint32_t>(stbl.size());
+                pe.type_length = static_cast<uint32_t>(info.type.size());
+                stbl += info.type;
+                packed.push_back(pe);
+            }
+
+            viewport_->buildBvhAsync(loading_model_id_, ifc_path, file_size,
+                                     std::move(packed), std::move(stbl));
+        }
+    }
+
     // Start next model if queued.
     startNextLoad();
 }
diff --git a/src/ifcviewer/MainWindow.h b/src/ifcviewer/MainWindow.h
index e9bcc37cb6b..f60da70b75d 100644
--- a/src/ifcviewer/MainWindow.h
+++ b/src/ifcviewer/MainWindow.h
@@ -31,6 +31,7 @@
 
 #include <map>
 #include <deque>
+#include <thread>
 #include <unordered_map>
 
 #include "ViewportWindow.h"
@@ -72,6 +73,11 @@ private slots:
     void setupMenus();
     void populateProperties(uint32_t object_id);
     void startNextLoad();
+    void applySidecarData(ModelId mid, SidecarData data);
+    void joinSidecarThread();
+    void populateTreeFromSidecar(ModelHandle& model,
+                                 const std::vector<PackedElementInfo>& elements,
+                                 const std::string& string_table);
     void connectStreamer(GeometryStreamer* streamer);
 
     ViewportWindow* viewport_ = nullptr;
@@ -91,6 +97,7 @@ private slots:
     uint32_t next_object_id_ = 1; // monotonically increasing across all models
     std::deque<ModelId> load_queue_;
     ModelId loading_model_id_ = 0;
+    std::thread sidecar_read_thread_;
 
     // Map object_id -> tree item and element info
     std::unordered_map<uint32_t, ElementInfo> element_map_;
diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index d2fb084a416..d0122d63c83 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -11,16 +11,16 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 |  | Element  | | 3D Viewport              ||
 |  | Tree     | | (QWindow + OpenGL 4.5)   ||
 |  | (per-    | |                          ||
-|  |  model)  | | Single VBO/EBO           ||
+|  |  model)  | | Per-model VAO/VBO/EBO    ||
 |  +----------+ | glMultiDrawElements      ||
-|  | Property | | frustum culling          ||
+|  | Property | | BVH frustum culling      ||
 |  | Table    | | GPU pick pass            ||
 |  +----------+ +--------------------------+|
 |  | Status / Progress / Stats              |
 +-------------------------------------------+
         ^                    ^
         |                    |
-  element metadata     UploadChunks
+  element metadata     UploadChunks / Sidecar
         |                    |
 +-------------------------------------------+
 |  GeometryStreamer (one per loaded model)   |
@@ -32,11 +32,13 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 ### Key design decisions
 
 - **QWindow viewport** embedded via `QWidget::createWindowContainer()`. This gives us a raw native surface for OpenGL, bypassing `QOpenGLWidget`'s compositor overhead.
-- **One big vertex buffer + index buffer** (64 MB + 32 MB initial). Geometry is appended as it streams in. No per-object VBOs, no rebinding.
+- **Per-model GPU buffers**: each loaded model gets its own VAO/VBO/EBO. No shared buffer, no cross-model copies on growth. Removing a model frees its GPU memory immediately.
 - **Interleaved vertex format**: position (3 floats) + normal (3 floats) + object ID (1 float, bitcast uint32) + color (RGBA8 packed into 1 float) = 32 bytes per vertex.
-- **Per-object frustum culling**: each object's AABB is tested against 6 frustum planes each frame. Only visible objects are drawn via `glMultiDrawElements`.
+- **Progressive GPU upload**: bulk sidecar loads allocate empty GPU buffers, then stream data in 48 MB chunks per frame. VBO uploads first (no objects visible), then EBO (objects appear progressively as their index range lands). The viewport stays interactive throughout — you can orbit already-loaded models while new ones stream in.
+- **Non-blocking sidecar loading**: sidecar files are read on a background thread. The heavy disk I/O (potentially gigabytes) never blocks the render loop. Only the final GPU upload and tree population happen on the main thread.
+- **BVH frustum culling**: per-model BVH trees cull entire subtrees of objects in one frustum test, reducing per-frame culling cost from O(N) to O(log N) in the best case. Falls back to a linear scan during progressive upload; the BVH activates once the model is fully loaded.
 - **GPU object picking**: a second render pass writes object IDs to an R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
-- **Multi-model support**: multiple IFC files can be loaded simultaneously. Each model gets its own `GeometryStreamer` (owning the `ifcopenshell::file` for property lookup). Models are loaded sequentially; geometry from all models coexists in the shared VBO/EBO. Per-model visibility toggle and removal are supported.
+- **Multi-model support**: multiple IFC files can be loaded simultaneously. Each model gets its own `GeometryStreamer` (owning the `ifcopenshell::file` for property lookup). Models are loaded sequentially. Per-model visibility toggle and removal are supported.
 - **Multi-threaded tessellation**: `IfcGeom::Iterator` runs on a background thread and internally parallelizes geometry conversion across all CPU cores.
 - **Non-blocking streaming**: the iterator emits `UploadChunk` signals via Qt's queued connection. The main thread uploads to the GPU without blocking iteration.
 - **World coordinates**: geometry is emitted in world space (`use-world-coords=true`) so no per-object transform matrices are needed on the GPU.
@@ -47,8 +49,10 @@ A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine
 |------|---------|
 | `main.cpp` | Application entry point, GL 4.5 surface format, CLI argument parsing |
 | `MainWindow.h/cpp` | Qt main window: multi-model project management, element tree, property table, status bar |
-| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, frustum culling, picking |
+| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, frustum culling, BVH traversal, picking |
 | `GeometryStreamer.h/cpp` | Background geometry processing: loads IFC, runs iterator, emits chunks (one per model) |
+| `BvhAccel.h/cpp` | BVH construction (median-split), per-model trees, EBO reordering |
+| `SidecarCache.h/cpp` | Raw binary `.ifcview` sidecar read/write |
 | `AppSettings.h/cpp` | Persisted application preferences (geometry library, show stats) |
 | `SettingsWindow.h/cpp` | Settings dialog UI |
 | `CMakeLists.txt` | Build configuration |
@@ -142,8 +146,9 @@ object that enters the GPU buffers:
 
 ```cpp
 struct ObjectDrawInfo {
-    uint32_t index_offset;   // byte offset into the shared EBO
+    uint32_t index_offset;   // byte offset into the model's EBO
     uint32_t index_count;    // number of indices (triangles * 3)
+    uint32_t model_id;       // which model this object belongs to
     float    aabb_min[3];    // world-space axis-aligned bounding box
     float    aabb_max[3];    // (computed from vertex positions at upload time)
 };
@@ -211,108 +216,207 @@ Phase 1 is sufficient for models up to ~100k objects. Beyond that, the CPU-side
 frustum test becomes a measurable fraction of the frame budget, motivating
 phase 3.
 
-### Phase 2: Spatial Tiling (optional, for large models)
+### Phase 2: BVH Acceleration (optional, for large models)
 
-For models exceeding ~10k objects, spatial tiling groups nearby objects into
-tiles and culls at the tile level rather than per-object. This reduces the
-number of frustum tests from N_objects to N_tiles (typically hundreds to low
-thousands).
+**Status:** Implemented.
 
-#### When tiling activates
+For models exceeding ~100k objects, a bounding volume hierarchy (BVH) groups
+nearby objects into a binary tree and culls entire subtrees in one frustum
+test. This reduces the number of AABB-frustum tests from O(N_objects) to
+O(log N) in the best case (camera zoomed into a corner) and adds only a small
+overhead in the common case where most of the model is on screen.
 
-Tiling is **optional and non-disruptive**. The system treats a non-tiled model
-as the degenerate case of "one tile containing everything" — the rendering loop
-always iterates tiles, so no separate code path is needed.
+A BVH was chosen over an octree because BIM data is spatially non-uniform —
+dense MEP risers in one zone, sparse open atriums in another. An octree
+subdivides space uniformly, wasting nodes on empty regions and creating deep
+chains in dense ones. A BVH adapts its splits to the actual object
+distribution, producing balanced trees regardless of density variation.
 
-Tiling activates in one of three ways:
+#### When the BVH activates
 
-1. **Preprocessed cache exists**: If a `.ifcview` sidecar file is found next to
-   the `.ifc` file, the tile structure is loaded from it instantly. The model
-   uploads geometry in tile order.
-2. **Automatic by size**: If the model has more than a configurable threshold of
-   objects (default 10k), a background task builds the spatial tree after
-   initial loading completes. Until it finishes, phase 1 culling handles
-   visibility.
-3. **Explicit user action**: A "preprocess for performance" option builds the
-   spatial tree and saves the sidecar for future loads.
+The BVH is **optional and non-disruptive**. Until it is built, phase 1's
+linear scan handles all culling. The rendering loop checks for an active BVH
+and falls back to the linear scan for any model that doesn't have one.
 
-#### Spatial subdivision
+The BVH activates in one of two ways:
 
-The world-space bounding box of the entire model is subdivided using a
-**loose octree**:
+1. **Sidecar cache exists**: If a `.ifcview` file is found next to the `.ifc`
+   file, the BVH is loaded from it instantly (raw memory read, no parsing).
+   The model uses BVH culling from the first frame after loading.
+2. **Automatic build**: After streaming finishes, a background thread builds
+   the BVH from the per-object AABBs already computed in phase 1. Until it
+   completes, phase 1 culling handles visibility. On completion, the render
+   thread picks up the BVH on the next frame. The sidecar is written for
+   future loads.
 
-- The root node covers the scene AABB.
-- Each node is split when it contains more than a threshold number of objects
-  (e.g. 256).
-- Objects are assigned to the smallest node that fully contains their AABB.
-- "Loose" bounds (inflated by 1.5x) reduce the number of objects that span
-  multiple nodes.
-- Leaf nodes become tiles.
+Models with fewer than 32 objects skip the BVH entirely — the overhead of tree
+traversal is worse than a linear scan at that scale.
 
-An octree adapts to non-uniform object density (common in buildings — lots of
-detail in MEP risers, sparse in open atriums) better than a uniform grid.
+#### BVH node layout
 
-#### EBO re-sorting
+Each node is 32 bytes, so two nodes fit in one 64-byte cache line:
 
-For tile-level culling to translate into contiguous index ranges, the EBO must
-be sorted so that all indices for objects in the same tile are adjacent.
+```cpp
+struct BvhNode {
+    float    aabb_min[3];     // world-space bounding box (12 bytes)
+    float    aabb_max[3];     // (12 bytes)
+    uint32_t right_or_first;  // interior: right child index; leaf: first object index (4 bytes)
+    uint16_t count;           // 0 = interior node; >0 = leaf with this many objects (2 bytes)
+    uint16_t axis;            // split axis for interior (0=x, 1=y, 2=z); unused for leaf (2 bytes)
+};
+```
 
-This happens via **deferred compaction**:
+Interior nodes store the right child index; the left child is always the
+immediately next node in the array (implicit in pre-order DFS layout, no
+pointer needed). Leaf nodes reference a contiguous range in a sorted
+object-index array.
 
-1. During initial load, geometry uploads in iterator order (fast first frame,
-   phase 1 culling active).
-2. After loading completes, a background thread:
-   a. Builds the octree from the per-object AABBs (already computed in phase 1).
-   b. Determines the tile for each object.
-   c. Computes the new index order (sorted by tile, then by object within tile).
-   d. Builds a new EBO on the CPU.
-3. The main thread uploads the new EBO in one `glNamedBufferSubData` call and
-   swaps in the tile metadata. One frame of stutter, bounded by EBO upload
-   time.
+The BVH is stored as a flat `std::vector<BvhNode>` in pre-order DFS layout.
+This means a depth-first traversal (which is what frustum culling does) reads
+memory sequentially, maximizing prefetch and cache-line utilization.
 
-The per-tile metadata:
+#### Build algorithm: object-median split
+
+1. Compute the centroid of each object's AABB.
+2. Find the longest axis of the current node's bounding box.
+3. Use `std::nth_element` to partition objects at the median centroid on that
+   axis. This is O(n) — no full sort needed.
+4. Recurse on each half. Terminate when the node contains ≤ 8 objects (leaf).
+5. Write nodes into the flat array in pre-order DFS.
+
+Total build time is O(n log n). For 100k objects this is well under 100 ms on
+a single core.
+
+SAH (Surface Area Heuristic) is the gold standard for ray-tracing BVHs, but
+for frustum culling — where we test 6 planes and early-out entire subtrees —
+the quality difference vs. median split is negligible. Median split is simpler
+and produces reliably balanced trees.
+
+#### Frustum traversal
+
+The traversal keeps an explicit node stack in a fixed-size local array (no heap allocation,
+no recursion):
+
+```
+stack[64] = {0}   // start at root; depth 64 handles billions of objects
+while stack not empty:
+    node = nodes[stack.pop()]
+    if node AABB outside frustum: continue   // cull entire subtree
+    if leaf:
+        for each object in node:
+            if object AABB in frustum: emit to visible list
+    else:
+        push right child, push left child    // left processed first (DFS)
+```
+
+When the camera is zoomed into a corner of the model, the traversal skips
+large portions of the tree after testing only a handful of interior nodes.
+When zoomed out to see everything, the traversal visits all leaves but the
+overhead of the interior-node tests is small relative to the leaf work.
+
+#### Per-model BVH
+
+Each loaded model gets its own BVH. During frustum culling, the outer loop
+iterates over models (skipping hidden/removed ones); the inner loop traverses
+that model's BVH. This means hiding or removing a model is free — just skip
+its BVH, no tree modification needed.
 
 ```cpp
-struct TileInfo {
-    float    aabb_min[3];    // tile bounding box (union of contained AABBs)
-    float    aabb_max[3];
-    uint32_t index_offset;   // into the re-sorted EBO
-    uint32_t index_count;    // sum of all contained objects' indices
-    uint32_t object_count;   // for stats / debugging
+struct ModelBvh {
+    uint32_t model_id;
+    std::vector<BvhNode> nodes;            // flat BVH node array
+    std::vector<uint32_t> object_indices;  // indices into object_draw_info_
 };
 ```
 
-#### Preprocessed sidecar format
+#### EBO re-sorting
+
+For BVH culling to maximise GPU cache performance, the EBO is re-sorted so
+that objects in the same BVH leaf are contiguous. This happens via **deferred
+compaction**:
+
+1. During initial load, geometry uploads in iterator order (fast first frame,
+   phase 1 culling active).
+2. After the BVH build completes on the background thread:
+   a. Walk the BVH leaves in DFS order.
+   b. For each object in each leaf, copy its index data to a new EBO buffer,
+      updating `ObjectDrawInfo::index_offset` accordingly.
+   c. Package the reordered EBO + updated draw info as a `BvhBuildResult`.
+3. The render thread picks up the result on the next frame: one
+   `glNamedBufferSubData` call to re-upload the EBO, then swap in the new
+   draw info and activate the BVH. One frame of stutter, bounded by EBO
+   upload time (~5 ms for 32 MB).
+
+#### Async build and render-thread handoff
+
+The BVH build must not stall the render loop:
+
+1. `buildBvhAsync()` snapshots `object_draw_info_` under the upload mutex,
+   then launches a `std::thread`.
+2. The thread builds the BVH and reordered EBO, then stores the result in a
+   `pending_bvh_result_` pointer under a separate mutex.
+3. At the top of each `render()` call, `applyBvhResult()` checks for a
+   pending result. If found, it re-uploads the EBO (requires GL context),
+   swaps the draw info, and activates the BVH.
+4. Until the BVH is ready, phase 1's linear scan runs every frame as before.
+
+#### Preprocessed sidecar format (`.ifcview`)
+
+The sidecar is a raw memory dump (Blender `.blend`-style) — no serialization
+format, no parsing. It stores everything needed to display the model without
+re-tessellating: vertex data, index data, per-object metadata, element tree
+info, and the BVH. Loading is just `fread` into vectors → GPU upload →
+render. The expensive `IfcGeom::Iterator` tessellation is skipped entirely.
+
+The IFC file is still parsed on demand (in background) for detailed property
+lookup; the sidecar provides the basic properties (name, type, GUID)
+immediately.
 
-The `.ifcview` file stores:
+```
+SidecarHeader            (16 bytes: magic, version, endian, reserved)
+uint64_t                 source_file_size
+
+uint32_t + float[]       vertex data    (interleaved, 8 floats/vertex)
+uint32_t + uint32_t[]    index data     (global indices, ready for EBO)
+uint32_t + ObjectDrawInfo[]   per-object draw metadata
+uint32_t + PackedElementInfo[]  element tree records (fixed-size)
+uint32_t + char[]        string table   (concatenated UTF-8: guid, name, type)
+
+uint32_t                 num_bvh_models
+per model:
+  uint32_t model_id
+  uint32_t + BvhNode[]        BVH node array
+  uint32_t + uint32_t[]       object indices
+```
 
-- Octree structure (node hierarchy, split planes).
-- Per-object tile assignment (object_id → tile_id mapping).
-- Per-tile index order (so the EBO can be built in tile order directly during
-  upload, skipping the compaction pass entirely).
-- File hash of the source `.ifc` (invalidation check).
+Staleness check: `source_file_size` is compared against the actual IFC file
+size. If mismatched, the sidecar is stale and is rebuilt. This is cheap and
+sufficient for a local cache (no hash computation on multi-GB files), although an edit that leaves the file size unchanged is not detected.
 
-This makes second-and-subsequent loads of the same model significantly faster:
-the spatial tree doesn't need to be rebuilt, and geometry uploads in tile order
-from the start.
+Endianness: if the marker reads back as `0x01020304`, the file was written with
+the same byte order — just `fread` the structs directly. Otherwise, reject
+the sidecar and rebuild.
 
 #### Performance characteristics
 
 | Metric | Value |
 |--------|-------|
-| Tile count (typical) | 500–5,000 for a large building |
-| Per-frame frustum tests | N_tiles instead of N_objects |
-| 500k objects, ~2k tiles | ~0.01 ms frustum testing |
-| Memory overhead | ~64 bytes/tile + 32 bytes/object (phase 1 metadata retained) |
-| Background compaction | 1–5 seconds for 1M objects (single-threaded) |
-| Sidecar file size | ~10–50 KB (indices + tree, no geometry) |
+| BVH build time (100k objects) | < 100 ms (single-threaded, background) |
+| Per-frame traversal (100k objects, 50% visible) | ~0.1 ms |
+| Per-frame traversal (100k objects, 5% visible) | ~0.02 ms |
+| Memory overhead | 32 bytes/node + 4 bytes/object index (~1.5× object count) |
+| EBO reorder (one-time) | 1–5 ms upload for 32 MB EBO |
+| Sidecar file size | ~same as geometry data (vertices + indices + metadata) |
+| Sidecar read time | bounded by disk I/O (~500 ms for 640 MB, ~2 s for 2.8 GB from NVMe) |
+| GPU upload time | progressive: ~48 MB/frame (~1 s for 2.8 GB at 60 fps, non-blocking) |
 
 #### Spatial coherence bonus
 
-Beyond culling, tile-sorted EBOs improve GPU cache performance. When the GPU
-rasterizes a tile's triangles, the vertices are contiguous in the VBO, so the
-post-transform vertex cache hits more often. This can yield 10–20% rasterization
-speedup even when nothing is culled (e.g. zoomed out to see the whole model).
+Beyond culling, BVH-leaf-sorted EBOs improve GPU cache performance. When the
+GPU rasterizes a leaf's triangles, the vertices are close together in the VBO,
+so the post-transform vertex cache hits more often. This can yield 10–20%
+rasterization speedup even when nothing is culled (e.g. zoomed out to see the
+whole model).
 
 ### Phase 3: GPU-Driven Indirect Draw
 
@@ -322,20 +426,20 @@ visibility decisions to the GPU via compute shaders and indirect draw commands.
 
 #### How it works
 
-Phase 3 is **approach 2 layered on top of approach 3**. It does not replace
-tiling — it accelerates it.
+Phase 3 builds on the BVH from phase 2. It does not replace the BVH — it
+moves the per-frame traversal to the GPU.
 
 1. **Upload phase** (once, at load time):
-   - Per-tile AABBs are uploaded to a GPU SSBO (`tile_aabbs`).
-   - One `DrawElementsIndirectCommand` per tile is written to an indirect draw
-     buffer:
+   - Per-leaf AABBs from the BVH are uploaded to a GPU SSBO (`leaf_aabbs`).
+   - One `DrawElementsIndirectCommand` per BVH leaf is written to an indirect
+     draw buffer:
      ```c
      struct DrawElementsIndirectCommand {
-         uint count;          // tile's total index count
+         uint count;          // leaf's total index count
          uint instanceCount;  // 1
-         uint firstIndex;     // offset into EBO
+         uint firstIndex;     // offset into EBO (from BVH leaf order)
          uint baseVertex;     // 0 (indices are global)
-         uint baseInstance;   // tile_id (available in shader via gl_DrawID)
+         uint baseInstance;   // leaf_id (available in shader via gl_DrawID)
      };
      ```
    - A "template" copy of the indirect buffer is kept so the compute shader
@@ -343,20 +447,20 @@ tiling — it accelerates it.
 
 2. **Cull phase** (every frame, on the GPU):
    - The CPU uploads 6 frustum plane vec4s as a uniform or small UBO.
-   - A compute shader dispatches `ceil(N_tiles / 64)` workgroups:
+   - A compute shader dispatches `ceil(N_leaves / 64)` workgroups:
      ```glsl
      layout(local_size_x = 64) in;
 
      void main() {
-         uint tile_id = gl_GlobalInvocationID.x;
-         if (tile_id >= tile_count) return;
+         uint leaf_id = gl_GlobalInvocationID.x;
+         if (leaf_id >= leaf_count) return;
 
          // Copy from template (resets any previously zeroed commands)
-         commands[tile_id] = template_commands[tile_id];
+         commands[leaf_id] = template_commands[leaf_id];
 
          // Frustum test
-         if (!aabb_vs_frustum(tile_aabbs[tile_id], frustum_planes)) {
-             commands[tile_id].count = 0;  // culled: GPU skips zero-count draws
+         if (!aabb_vs_frustum(leaf_aabbs[leaf_id], frustum_planes)) {
+             commands[leaf_id].count = 0;  // culled: GPU skips zero-count draws
          }
      }
      ```
@@ -364,7 +468,7 @@ tiling — it accelerates it.
 
 3. **Draw phase** (every frame):
    - One call: `glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT,
-     nullptr, N_tiles, 0)`.
+     nullptr, N_leaves, 0)`.
    - The GPU reads the indirect buffer, skips tiles with `count == 0`, and
      draws the rest. Zero CPU-side per-object or per-tile work.
 
@@ -382,12 +486,12 @@ That's it. The CPU frame time is essentially constant regardless of model size.
 Once the compute-based cull pass exists, it's straightforward to add:
 
 - **Hierarchical-Z occlusion culling**: render a coarse depth buffer from the
-  previous frame, then test tile AABBs against it in the compute shader. Tiles
-  fully behind closer geometry get culled. This handles interior-heavy BIM
-  models well (most rooms are occluded from any given viewpoint).
+  previous frame, then test BVH leaf AABBs against it in the compute shader.
+  Leaves fully behind closer geometry get culled. This handles interior-heavy
+  BIM models well (most rooms are occluded from any given viewpoint).
 - **Distance-based LOD**: the compute shader can select different index ranges
-  (coarse vs. fine tessellation) per tile based on distance to camera.
-- **Contribution culling**: tiles whose screen-space projection is below a
+  (coarse vs. fine tessellation) per leaf based on distance to camera.
+- **Contribution culling**: leaves whose screen-space projection is below a
   pixel threshold get `count = 0`. Removes distant small objects.
 
 #### Performance characteristics
@@ -395,10 +499,10 @@ Once the compute-based cull pass exists, it's straightforward to add:
 | Metric | Value |
 |--------|-------|
 | CPU per-frame work | ~0.01 ms (constant, independent of model size) |
-| GPU compute dispatch | ~0.02 ms for 2k tiles |
+| GPU compute dispatch | ~0.02 ms for 2k leaves |
 | Draw call overhead | 1 indirect multi-draw call |
-| GPU memory overhead | ~48 bytes/tile (AABB SSBO) + 20 bytes/tile (indirect commands) × 2 (template + live) |
-| Total for 2k tiles | ~176 KB GPU memory |
+| GPU memory overhead | ~48 bytes/leaf (AABB SSBO) + 20 bytes/leaf (indirect commands) × 2 (template + live) |
+| Total for 2k leaves | ~176 KB GPU memory |
 | Implementation complexity | High (compute shaders, SSBOs, memory barriers, indirect draw) |
 
 #### When to use
@@ -411,8 +515,8 @@ Phase 3 is worthwhile when:
   the viewer requires 4.5).
 
 For models under 100k objects, phase 1 alone is sufficient. For 100k–500k,
-phase 2 (tiling) keeps CPU culling under 1 ms. Phase 3 is the final step that
-makes the CPU frame time constant.
+phase 2 (BVH) keeps CPU culling well under 1 ms. Phase 3 is the final step
+that makes the CPU frame time constant.
 
 ### Summary
 
@@ -429,28 +533,33 @@ The load path:
 
 ```
 open(model.ifc):
-  ├─ sidecar exists?
-  │   ├─ yes: load tile tree from .ifcview
-  │   │       upload geometry in tile order
-  │   │       (skip background compaction)
-  │   └─ no:  upload geometry in iterator order (fast first frame)
-  │           phase 1 culling active immediately
-  │           if object_count > threshold:
-  │               background: build octree, re-sort EBO, save .ifcview
-  │               on completion: swap in tile structure
-  └─ rendering:
-      ├─ phase 3 available? → compute cull + indirect multi-draw
-      └─ else               → CPU frustum test + glMultiDrawElements
+  ├─ sidecar exists (.ifcview)?
+  │   ├─ yes: background thread reads sidecar file (non-blocking I/O)
+  │   │       → allocate per-model VAO/VBO/EBO (empty, exact size)
+  │   │       → progressive GPU upload: 48 MB/frame VBO, then EBO
+  │   │       → objects appear as EBO chunks land
+  │   │       → BVH activates once fully loaded
+  │   │       → viewport interactive throughout
+  │   └─ no:  stream from IFC via GeometryStreamer
+  │           → uploadChunk() appends to per-model buffers (immediately drawable)
+  │           → phase 1 linear-scan culling active from first chunk
+  │           → on completion: background BVH build, re-sort EBO, save .ifcview
+  └─ rendering (per model, per frame):
+      ├─ phase 3 available?  → compute cull + indirect multi-draw
+      ├─ BVH available?      → BVH traversal + glMultiDrawElements
+      └─ else / progressive  → linear scan of active objects + glMultiDrawElements
 ```
 
 ## Roadmap
 
 - [x] Material color support (per-vertex RGBA8)
-- [x] Buffer growth (dynamic VBO/EBO resizing up to 4 GB)
+- [x] Per-model GPU buffers (VAO/VBO/EBO per model, no cross-model copies)
 - [x] Per-object frustum culling (phase 1)
-- [ ] Spatial tiling with octree (phase 2)
+- [x] BVH acceleration with per-model trees (phase 2)
+- [x] Raw binary `.ifcview` sidecar cache (full geometry + BVH, Blender-style)
+- [x] Non-blocking sidecar loading (background thread I/O)
+- [x] Progressive GPU upload (48 MB/frame chunked VBO/EBO transfer)
 - [ ] GPU-driven indirect draw (phase 3)
-- [ ] Preprocessed `.ifcview` sidecar for fast re-loads
 - [ ] Hierarchical-Z occlusion culling
 - [ ] Distance-based LOD selection
 - [ ] Vulkan/MoltenVK backend for macOS
diff --git a/src/ifcviewer/SidecarCache.cpp b/src/ifcviewer/SidecarCache.cpp
new file mode 100644
index 00000000000..d77095c9223
--- /dev/null
+++ b/src/ifcviewer/SidecarCache.cpp
@@ -0,0 +1,196 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "SidecarCache.h"
+
+#include <cstdio>
+#include <cstring>
+
+// Binary layout (all multi-byte fields native-endian):
+//
+//   SidecarHeader          (16 bytes)
+//   uint64_t               source_file_size
+//
+//   uint32_t               num_vertices  (count of floats)
+//   float[num_vertices]    vertex data
+//
+//   uint32_t               num_indices
+//   uint32_t[num_indices]  index data
+//
+//   uint32_t               num_draw_infos
+//   ObjectDrawInfo[N]      draw info array
+//
+//   uint32_t               num_elements
+//   PackedElementInfo[N]   element records
+//   uint32_t               string_table_bytes
+//   char[string_table_bytes]
+//
+//   uint32_t               num_bvh_models
+//   for each model:
+//     uint32_t model_id
+//     uint32_t num_nodes
+//     BvhNode[num_nodes]
+//     uint32_t num_object_indices
+//     uint32_t[num_object_indices]
+
+struct SidecarHeader {  // fixed 16-byte header at the very start of a sidecar file
+    uint32_t magic;     // file signature; readSidecar requires SIDECAR_MAGIC
+    uint32_t version;   // format version; readSidecar requires SIDECAR_VERSION
+    uint32_t endian;    // byte-order marker; readSidecar requires SIDECAR_ENDIAN
+    uint32_t reserved;  // written as 0 by writeSidecar; not inspected by readSidecar
+};
+
+static std::string sidecarPath(const std::string& ifc_path) {
+    return std::string(ifc_path).append(".ifcview");  // sidecar name = IFC path + ".ifcview"
+}
+
+template<typename T>
+static bool writeVec(FILE* f, const std::vector<T>& v) {
+    // Layout: uint32 element count, then the raw elements. Fail rather than truncate the count.
+    if (v.size() > UINT32_MAX) return false;
+    uint32_t n = static_cast<uint32_t>(v.size());
+    return fwrite(&n, 4, 1, f) == 1 && (n == 0 || fwrite(v.data(), sizeof(T), n, f) == n);
+}
+
+template<typename T>
+static bool readVec(FILE* f, std::vector<T>& v) {
+    // Mirror of writeVec: a uint32 element count followed by the raw elements.
+    uint32_t count;
+    if (fread(&count, sizeof(count), 1, f) != 1) return false;
+    v.resize(count);
+    return count == 0 || fread(v.data(), sizeof(T), count, f) == count;
+}
+
+// Writes "<ifc_path>.ifcview"; returns false and removes the partial file on any write error.
+bool writeSidecar(const std::string& ifc_path,
+                  const SidecarData& data,
+                  uint64_t ifc_file_size) {
+    std::string path = sidecarPath(ifc_path);
+    FILE* f = fopen(path.c_str(), "wb");
+    if (!f) return false;
+
+    bool ok = true;  // cleared by the first failed write; later writes are then skipped
+    auto put = [&](const void* p, size_t size, size_t n) {
+        if (ok && n > 0 && fwrite(p, size, n, f) != n) ok = false;
+    };
+
+    // Header
+    SidecarHeader hdr = { SIDECAR_MAGIC, SIDECAR_VERSION, SIDECAR_ENDIAN, 0 };
+    put(&hdr, sizeof(hdr), 1);
+    put(&ifc_file_size, 8, 1);
+
+    // Geometry, draw info, element records
+    ok = ok && writeVec(f, data.vertices) && writeVec(f, data.indices);
+    ok = ok && writeVec(f, data.draw_info) && writeVec(f, data.elements);
+
+    // String table
+    uint32_t stbl_len = static_cast<uint32_t>(data.string_table.size());
+    put(&stbl_len, 4, 1);
+    put(data.string_table.data(), 1, stbl_len);
+
+    // BVH
+    uint32_t num_bvh_models = data.bvh_set
+        ? static_cast<uint32_t>(data.bvh_set->models.size()) : 0;
+    put(&num_bvh_models, 4, 1);
+    if (data.bvh_set) {
+        for (const auto& [model_id, mbvh] : data.bvh_set->models) {
+            uint32_t nn = static_cast<uint32_t>(mbvh.nodes.size());
+            uint32_t no = static_cast<uint32_t>(mbvh.object_indices.size());
+            put(&model_id, 4, 1);
+            put(&nn, 4, 1);
+            put(mbvh.nodes.data(), sizeof(BvhNode), nn);
+            put(&no, 4, 1);
+            put(mbvh.object_indices.data(), 4, no);
+        }
+    }
+
+    if (fclose(f) != 0) ok = false;      // fclose flushes buffered data and can itself fail
+    if (!ok) std::remove(path.c_str());  // do not leave a truncated sidecar behind
+    return ok;
+}
+
+std::optional<SidecarData> readSidecar(const std::string& ifc_path,  // IFC path; ".ifcview" is appended to locate the sidecar
+                                       uint64_t ifc_file_size) {      // current IFC size, compared with the size stored in the sidecar
+    std::string path = sidecarPath(ifc_path);
+    FILE* f = fopen(path.c_str(), "rb");
+    if (!f) return std::nullopt;  // sidecar could not be opened
+    // Shared failure exit: close the file and return nullopt.
+    auto fail = [&]() -> std::optional<SidecarData> { fclose(f); return std::nullopt; };
+
+    // Header: reject the file unless magic, format version and byte-order marker all match.
+    SidecarHeader hdr;
+    if (fread(&hdr, sizeof(hdr), 1, f) != 1) return fail();
+    if (hdr.magic != SIDECAR_MAGIC ||
+        hdr.version != SIDECAR_VERSION ||
+        hdr.endian != SIDECAR_ENDIAN) return fail();
+
+    uint64_t stored_size;
+    if (fread(&stored_size, 8, 1, f) != 1) return fail();
+    if (stored_size != ifc_file_size) return fail();  // staleness check: IFC size differs from the recorded one
+
+    SidecarData data;
+
+    // Geometry (each array is a uint32 count followed by raw elements, see readVec)
+    if (!readVec(f, data.vertices)) return fail();
+    if (!readVec(f, data.indices))  return fail();
+
+    // Draw info
+    if (!readVec(f, data.draw_info)) return fail();
+
+    // Elements + string table (element records refer to the table by offset/length)
+    if (!readVec(f, data.elements)) return fail();
+    uint32_t stbl_len;
+    if (fread(&stbl_len, 4, 1, f) != 1) return fail();
+    data.string_table.resize(stbl_len);
+    if (stbl_len > 0 && fread(data.string_table.data(), 1, stbl_len, f) != stbl_len)
+        return fail();
+
+    // BVH: a model count, then per model its id, node array and object-index array
+    uint32_t num_bvh_models;
+    if (fread(&num_bvh_models, 4, 1, f) != 1) return fail();
+
+    if (num_bvh_models > 0) {  // with zero models, data.bvh_set is not assigned here
+        data.bvh_set = std::make_shared<BvhSet>();
+        for (uint32_t m = 0; m < num_bvh_models; ++m) {
+            uint32_t model_id;
+            if (fread(&model_id, 4, 1, f) != 1) return fail();
+
+            ModelBvh mbvh;
+            mbvh.model_id = model_id;
+
+            uint32_t nn;  // number of BVH nodes
+            if (fread(&nn, 4, 1, f) != 1) return fail();
+            mbvh.nodes.resize(nn);
+            if (nn > 0 && fread(mbvh.nodes.data(), sizeof(BvhNode), nn, f) != nn)
+                return fail();
+
+            uint32_t no;  // number of object indices
+            if (fread(&no, 4, 1, f) != 1) return fail();
+            mbvh.object_indices.resize(no);
+            if (no > 0 && fread(mbvh.object_indices.data(), 4, no, f) != no)
+                return fail();
+
+            data.bvh_set->bvh_model_ids.insert(model_id);  // record the id alongside the tree itself
+            data.bvh_set->models[model_id] = std::move(mbvh);
+        }
+    }
+
+    fclose(f);
+    return data;
+}
diff --git a/src/ifcviewer/SidecarCache.h b/src/ifcviewer/SidecarCache.h
new file mode 100644
index 00000000000..49c36dba15a
--- /dev/null
+++ b/src/ifcviewer/SidecarCache.h
@@ -0,0 +1,76 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef SIDECARCACHE_H
+#define SIDECARCACHE_H
+
+#include "BvhAccel.h"
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <vector>
+
+static constexpr uint32_t SIDECAR_MAGIC   = 0x49465657;  // "IFVW"
+static constexpr uint32_t SIDECAR_VERSION = 3;
+static constexpr uint32_t SIDECAR_ENDIAN  = 0x01020304;
+
+// Fixed-size element record for the sidecar.  Strings are stored as
+// (offset, length) pairs into a separate string table.
+struct PackedElementInfo {  // ten 32-bit fields; the strings themselves live in the string table
+    uint32_t object_id;
+    uint32_t model_id;
+    int32_t  ifc_id;
+    int32_t  parent_id;
+    uint32_t guid_offset;    // where the GUID starts in the string table
+    uint32_t guid_length;    // length of the GUID
+    uint32_t name_offset;    // where the name starts in the string table
+    uint32_t name_length;    // length of the name
+    uint32_t type_offset;    // where the type string starts in the string table
+    uint32_t type_length;    // length of the type string
+};
+
+// Everything the viewer needs to display a model without tessellating.
+struct SidecarData {
+    // GPU geometry (ready to upload as-is)
+    std::vector<float>    vertices;      // interleaved, 8 floats per vertex
+    std::vector<uint32_t> indices;       // global (already remapped)
+
+    // Per-object metadata (one entry per drawable object)
+    std::vector<ObjectDrawInfo> draw_info;
+
+    // Element tree metadata; each record points into string_table
+    std::vector<PackedElementInfo> elements;
+    std::string string_table;            // concatenated UTF-8
+
+    // BVH acceleration; readSidecar leaves this null when the file stores zero BVH models
+    std::shared_ptr<BvhSet> bvh_set;
+};
+
+// Write a full sidecar next to the IFC file.
+// Returns true on success.
+bool writeSidecar(const std::string& ifc_path,
+                  const SidecarData& data,
+                  uint64_t ifc_file_size);
+
+// Read a sidecar.  Returns nullopt on any failure (missing, stale, corrupt).
+std::optional<SidecarData> readSidecar(const std::string& ifc_path,
+                                       uint64_t ifc_file_size);
+
+#endif // SIDECARCACHE_H
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 4217c997423..ae50f6dc44a 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -18,6 +18,7 @@
  ********************************************************************************/
 
 #include "ViewportWindow.h"
+#include "SidecarCache.h"
 
 #include <QMouseEvent>
 #include <QWheelEvent>
@@ -32,7 +33,6 @@
 
 static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;  // 64 MB
 static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;  // 32 MB
-// Cap buffer growth so a runaway upload can't try to allocate the world.
 static const size_t MAX_BUFFER_SIZE = 4ull * 1024 * 1024 * 1024;  // 4 GB
 static const int VERTEX_STRIDE = 8;  // pos(3) + normal(3) + object_id(1) + color(1 packed)
 
@@ -192,12 +192,16 @@ ViewportWindow::ViewportWindow(QWindow* parent)
 }
 
 ViewportWindow::~ViewportWindow() {
+    if (bvh_build_thread_.joinable())
+        bvh_build_thread_.join();
     if (context_) {
         context_->makeCurrent(this);
         if (gl_) {
-            if (vao_) gl_->glDeleteVertexArrays(1, &vao_);
-            if (vbo_) gl_->glDeleteBuffers(1, &vbo_);
-            if (ebo_) gl_->glDeleteBuffers(1, &ebo_);
+            for (auto& [mid, m] : models_gpu_) {
+                if (m.vao) gl_->glDeleteVertexArrays(1, &m.vao);
+                if (m.vbo) gl_->glDeleteBuffers(1, &m.vbo);
+                if (m.ebo) gl_->glDeleteBuffers(1, &m.ebo);
+            }
             if (axis_vao_) gl_->glDeleteVertexArrays(1, &axis_vao_);
             if (axis_vbo_) gl_->glDeleteBuffers(1, &axis_vbo_);
             if (main_program_) gl_->glDeleteProgram(main_program_);
@@ -231,55 +235,40 @@ void ViewportWindow::initGL() {
     buildShaders();
     buildAxisGizmo();
 
-    // Create VAO
-    gl_->glCreateVertexArrays(1, &vao_);
+    gl_->glEnable(GL_DEPTH_TEST);
+    gl_->glEnable(GL_MULTISAMPLE);
+    gl_->glClearColor(0.18f, 0.20f, 0.22f, 1.0f);
 
-    // Create VBO with initial capacity
-    vbo_capacity_ = INITIAL_VBO_SIZE;
-    gl_->glCreateBuffers(1, &vbo_);
-    gl_->glNamedBufferStorage(vbo_, vbo_capacity_, nullptr,
-        GL_DYNAMIC_STORAGE_BIT);
+    gl_initialized_ = true;
+    frame_clock_.start();
+    render_timer_.start();
 
-    // Create EBO with initial capacity
-    ebo_capacity_ = INITIAL_EBO_SIZE;
-    gl_->glCreateBuffers(1, &ebo_);
-    gl_->glNamedBufferStorage(ebo_, ebo_capacity_, nullptr,
-        GL_DYNAMIC_STORAGE_BIT);
+    emit initialized();
+}
 
-    // Vertex layout: pos(3f) + normal(3f) + object_id(1f) + color(4 unorm bytes)
-    // = 8 floats = 32 bytes per vertex.
-    gl_->glVertexArrayVertexBuffer(vao_, 0, vbo_, 0, VERTEX_STRIDE * sizeof(float));
-    gl_->glVertexArrayElementBuffer(vao_, ebo_);
+void ViewportWindow::setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo) {
+    gl_->glVertexArrayVertexBuffer(vao, 0, vbo, 0, VERTEX_STRIDE * sizeof(float));
+    gl_->glVertexArrayElementBuffer(vao, ebo);
 
     // position
-    gl_->glEnableVertexArrayAttrib(vao_, 0);
-    gl_->glVertexArrayAttribFormat(vao_, 0, 3, GL_FLOAT, GL_FALSE, 0);
-    gl_->glVertexArrayAttribBinding(vao_, 0, 0);
+    gl_->glEnableVertexArrayAttrib(vao, 0);
+    gl_->glVertexArrayAttribFormat(vao, 0, 3, GL_FLOAT, GL_FALSE, 0);
+    gl_->glVertexArrayAttribBinding(vao, 0, 0);
 
     // normal
-    gl_->glEnableVertexArrayAttrib(vao_, 1);
-    gl_->glVertexArrayAttribFormat(vao_, 1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float));
-    gl_->glVertexArrayAttribBinding(vao_, 1, 0);
+    gl_->glEnableVertexArrayAttrib(vao, 1);
+    gl_->glVertexArrayAttribFormat(vao, 1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float));
+    gl_->glVertexArrayAttribBinding(vao, 1, 0);
 
     // object_id (passed as float, decoded in shader via floatBitsToUint)
-    gl_->glEnableVertexArrayAttrib(vao_, 2);
-    gl_->glVertexArrayAttribFormat(vao_, 2, 1, GL_FLOAT, GL_FALSE, 6 * sizeof(float));
-    gl_->glVertexArrayAttribBinding(vao_, 2, 0);
+    gl_->glEnableVertexArrayAttrib(vao, 2);
+    gl_->glVertexArrayAttribFormat(vao, 2, 1, GL_FLOAT, GL_FALSE, 6 * sizeof(float));
+    gl_->glVertexArrayAttribBinding(vao, 2, 0);
 
     // color (RGBA8 packed into the 4 bytes at offset 28; normalized to vec4)
-    gl_->glEnableVertexArrayAttrib(vao_, 3);
-    gl_->glVertexArrayAttribFormat(vao_, 3, 4, GL_UNSIGNED_BYTE, GL_TRUE, 7 * sizeof(float));
-    gl_->glVertexArrayAttribBinding(vao_, 3, 0);
-
-    gl_->glEnable(GL_DEPTH_TEST);
-    gl_->glEnable(GL_MULTISAMPLE);
-    gl_->glClearColor(0.18f, 0.20f, 0.22f, 1.0f);
-
-    gl_initialized_ = true;
-    frame_clock_.start();
-    render_timer_.start();
-
-    emit initialized();
+    gl_->glEnableVertexArrayAttrib(vao, 3);
+    gl_->glVertexArrayAttribFormat(vao, 3, 4, GL_UNSIGNED_BYTE, GL_TRUE, 7 * sizeof(float));
+    gl_->glVertexArrayAttribBinding(vao, 3, 0);
 }
 
 void ViewportWindow::buildShaders() {
@@ -301,15 +290,11 @@ void ViewportWindow::buildShaders() {
 }
 
 void ViewportWindow::buildAxisGizmo() {
-    // 3 line segments (X red, Y green, Z blue), 6 vertices, pos(3) + color(3).
     static const float axis_data[] = {
-        // X axis - red
         0.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
         1.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
-        // Y axis - green
         0.0f, 0.0f, 0.0f,   0.30f, 0.95f, 0.30f,
         0.0f, 1.0f, 0.0f,   0.30f, 0.95f, 0.30f,
-        // Z axis - blue
         0.0f, 0.0f, 0.0f,   0.30f, 0.55f, 1.0f,
         0.0f, 0.0f, 1.0f,   0.30f, 0.55f, 1.0f,
     };
@@ -329,15 +314,11 @@ void ViewportWindow::buildAxisGizmo() {
     gl_->glVertexArrayAttribBinding(axis_vao_, 1, 0);
 }
 
-bool ViewportWindow::growVbo(size_t needed_total) {
-    // Double until it fits, but don't blow past the cap.
-    size_t new_capacity = vbo_capacity_;
-    while (new_capacity < needed_total) {
-        new_capacity *= 2;
-    }
+bool ViewportWindow::growModelVbo(ModelGpuData& m, size_t needed_total) {
+    size_t new_capacity = m.vbo_capacity;
+    while (new_capacity < needed_total) new_capacity *= 2;
     if (new_capacity > MAX_BUFFER_SIZE) {
-        qWarning("VBO grow request (%zu MB) exceeds cap (%zu MB)",
-            new_capacity / (1024 * 1024), MAX_BUFFER_SIZE / (1024 * 1024));
+        qWarning("VBO grow request (%zu MB) exceeds cap", new_capacity / (1024 * 1024));
         return false;
     }
 
@@ -345,29 +326,25 @@ bool ViewportWindow::growVbo(size_t needed_total) {
     gl_->glCreateBuffers(1, &new_vbo);
     gl_->glNamedBufferStorage(new_vbo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
 
-    if (vbo_used_ > 0) {
-        gl_->glCopyNamedBufferSubData(vbo_, new_vbo, 0, 0, vbo_used_);
+    if (m.vbo_used > 0) {
+        gl_->glCopyNamedBufferSubData(m.vbo, new_vbo, 0, 0, m.vbo_used);
     }
 
-    gl_->glDeleteBuffers(1, &vbo_);
-    vbo_ = new_vbo;
-    vbo_capacity_ = new_capacity;
+    gl_->glDeleteBuffers(1, &m.vbo);
+    m.vbo = new_vbo;
+    m.vbo_capacity = new_capacity;
 
-    // Rebind on the VAO so subsequent draws see the new buffer.
-    gl_->glVertexArrayVertexBuffer(vao_, 0, vbo_, 0, VERTEX_STRIDE * sizeof(float));
+    gl_->glVertexArrayVertexBuffer(m.vao, 0, m.vbo, 0, VERTEX_STRIDE * sizeof(float));
 
-    qInfo("VBO grew to %zu MB", vbo_capacity_ / (1024 * 1024));
+    qInfo("Model VBO grew to %zu MB", m.vbo_capacity / (1024 * 1024));
     return true;
 }
 
-bool ViewportWindow::growEbo(size_t needed_total) {
-    size_t new_capacity = ebo_capacity_;
-    while (new_capacity < needed_total) {
-        new_capacity *= 2;
-    }
+bool ViewportWindow::growModelEbo(ModelGpuData& m, size_t needed_total) {
+    size_t new_capacity = m.ebo_capacity;
+    while (new_capacity < needed_total) new_capacity *= 2;
     if (new_capacity > MAX_BUFFER_SIZE) {
-        qWarning("EBO grow request (%zu MB) exceeds cap (%zu MB)",
-            new_capacity / (1024 * 1024), MAX_BUFFER_SIZE / (1024 * 1024));
+        qWarning("EBO grow request (%zu MB) exceeds cap", new_capacity / (1024 * 1024));
         return false;
     }
 
@@ -375,17 +352,17 @@ bool ViewportWindow::growEbo(size_t needed_total) {
     gl_->glCreateBuffers(1, &new_ebo);
     gl_->glNamedBufferStorage(new_ebo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
 
-    if (ebo_used_ > 0) {
-        gl_->glCopyNamedBufferSubData(ebo_, new_ebo, 0, 0, ebo_used_);
+    if (m.ebo_used > 0) {
+        gl_->glCopyNamedBufferSubData(m.ebo, new_ebo, 0, 0, m.ebo_used);
     }
 
-    gl_->glDeleteBuffers(1, &ebo_);
-    ebo_ = new_ebo;
-    ebo_capacity_ = new_capacity;
+    gl_->glDeleteBuffers(1, &m.ebo);
+    m.ebo = new_ebo;
+    m.ebo_capacity = new_capacity;
 
-    gl_->glVertexArrayElementBuffer(vao_, ebo_);
+    gl_->glVertexArrayElementBuffer(m.vao, m.ebo);
 
-    qInfo("EBO grew to %zu MB", ebo_capacity_ / (1024 * 1024));
+    qInfo("Model EBO grew to %zu MB", m.ebo_capacity / (1024 * 1024));
     return true;
 }
 
@@ -395,37 +372,55 @@ void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
 
     context_->makeCurrent(this);
 
+    // Get or create per-model GPU data.
+    auto it = models_gpu_.find(chunk.model_id);
+    if (it == models_gpu_.end()) {
+        ModelGpuData m;
+        gl_->glCreateVertexArrays(1, &m.vao);
+        gl_->glCreateBuffers(1, &m.vbo);
+        gl_->glCreateBuffers(1, &m.ebo);
+
+        m.vbo_capacity = INITIAL_VBO_SIZE;
+        m.ebo_capacity = INITIAL_EBO_SIZE;
+        gl_->glNamedBufferStorage(m.vbo, m.vbo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+        gl_->glNamedBufferStorage(m.ebo, m.ebo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
+        setupVaoLayout(m.vao, m.vbo, m.ebo);
+        it = models_gpu_.emplace(chunk.model_id, std::move(m)).first;
+    }
+
+    auto& mgpu = it->second;
+
     size_t vb_size = chunk.vertices.size() * sizeof(float);
     size_t ib_size = chunk.indices.size() * sizeof(uint32_t);
 
-    if (vbo_used_ + vb_size > vbo_capacity_) {
-        if (!growVbo(vbo_used_ + vb_size)) {
+    if (mgpu.vbo_used + vb_size > mgpu.vbo_capacity) {
+        if (!growModelVbo(mgpu, mgpu.vbo_used + vb_size)) {
             qWarning("VBO at cap, skipping chunk");
             return;
         }
     }
-    if (ebo_used_ + ib_size > ebo_capacity_) {
-        if (!growEbo(ebo_used_ + ib_size)) {
+    if (mgpu.ebo_used + ib_size > mgpu.ebo_capacity) {
+        if (!growModelEbo(mgpu, mgpu.ebo_used + ib_size)) {
             qWarning("EBO at cap, skipping chunk");
             return;
         }
     }
 
-    uint32_t base_vertex = vertex_count_;
+    uint32_t base_vertex = mgpu.vertex_count;
 
-    gl_->glNamedBufferSubData(vbo_, vbo_used_, vb_size, chunk.vertices.data());
+    gl_->glNamedBufferSubData(mgpu.vbo, mgpu.vbo_used, vb_size, chunk.vertices.data());
 
-    // Remap chunk-local indices into global indices so the whole EBO can be
-    // drawn with a single glDrawElements call.
+    // Remap chunk-local indices into model-local global indices.
     std::vector<uint32_t> global_indices(chunk.indices.size());
     for (size_t i = 0; i < chunk.indices.size(); ++i) {
         global_indices[i] = chunk.indices[i] + base_vertex;
     }
-    gl_->glNamedBufferSubData(ebo_, ebo_used_, ib_size, global_indices.data());
+    gl_->glNamedBufferSubData(mgpu.ebo, mgpu.ebo_used, ib_size, global_indices.data());
 
     // Compute AABB from vertex positions in this chunk.
     ObjectDrawInfo info;
-    info.index_offset = static_cast<uint32_t>(ebo_used_);
+    info.index_offset = static_cast<uint32_t>(mgpu.ebo_used);
     info.index_count = static_cast<uint32_t>(chunk.indices.size());
     info.model_id = chunk.model_id;
 
@@ -445,46 +440,301 @@ void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
         info.aabb_max[0] = info.aabb_max[1] = info.aabb_max[2] = 0.0f;
     }
 
-    {
-        std::lock_guard<std::mutex> lock(upload_mutex_);
-        total_index_count_ += static_cast<uint32_t>(chunk.indices.size());
-        object_draw_info_.push_back(info);
-    }
+    mgpu.draw_info.push_back(info);
+    mgpu.active_draw_count = static_cast<uint32_t>(mgpu.draw_info.size()); // immediately drawable
+    mgpu.vbo_used += vb_size;
+    mgpu.ebo_used += ib_size;
+    mgpu.vertex_count += static_cast<uint32_t>(num_verts);
+    mgpu.total_triangles += static_cast<uint32_t>(chunk.indices.size() / 3);
+}
 
-    vbo_used_ += vb_size;
-    ebo_used_ += ib_size;
-    vertex_count_ += static_cast<uint32_t>(chunk.vertices.size() / VERTEX_STRIDE);
-    total_triangles_ += static_cast<uint32_t>(chunk.indices.size() / 3);
+void ViewportWindow::uploadBulk(uint32_t model_id,
+                                std::vector<float> vertices,
+                                std::vector<uint32_t> indices,
+                                const std::vector<ObjectDrawInfo>& draw_info,
+                                std::shared_ptr<BvhSet> bvh_set) {
+    // Replace model_id with pre-built geometry that is streamed to the GPU later.
+    if (!gl_initialized_ || vertices.empty() || indices.empty()) return;
+
+    context_->makeCurrent(this);
+    const size_t vb_size = vertices.size() * sizeof(float);
+    const size_t ib_size = indices.size() * sizeof(uint32_t);
+
+    // Allocate empty buffers at exact size — no data uploaded yet.
+    ModelGpuData m;
+    gl_->glCreateVertexArrays(1, &m.vao);
+    gl_->glCreateBuffers(1, &m.vbo);
+    gl_->glCreateBuffers(1, &m.ebo);
+    m.vbo_capacity = vb_size;
+    m.ebo_capacity = ib_size;
+    gl_->glNamedBufferStorage(m.vbo, vb_size, nullptr, GL_DYNAMIC_STORAGE_BIT);
+    gl_->glNamedBufferStorage(m.ebo, ib_size, nullptr, GL_DYNAMIC_STORAGE_BIT);
+    setupVaoLayout(m.vao, m.vbo, m.ebo);
+
+    m.vbo_used = vb_size;
+    m.ebo_used = ib_size;
+    m.vertex_count = static_cast<uint32_t>(vertices.size() / VERTEX_STRIDE);
+    m.draw_info = draw_info;
+    m.active_draw_count = 0;  // nothing drawable yet
+    uint32_t total_tri = 0;
+    for (const auto& di : draw_info) total_tri += di.index_count / 3;
+    m.total_triangles = total_tri;
+
+    // Re-upload: also drop the stale BVH and any upload still queued for this id.
+    for (auto q = pending_uploads_.begin(); q != pending_uploads_.end();)
+        if (q->model_id == model_id) q = pending_uploads_.erase(q); else ++q;
+    model_bvhs_.erase(model_id);
+    auto it = models_gpu_.find(model_id);
+    if (it != models_gpu_.end()) {
+        gl_->glDeleteVertexArrays(1, &it->second.vao);
+        gl_->glDeleteBuffers(1, &it->second.vbo);
+        gl_->glDeleteBuffers(1, &it->second.ebo);
+    }
+    models_gpu_[model_id] = std::move(m);
+
+    // Queue progressive upload — data will stream in over subsequent frames.
+    PendingUpload pu;
+    pu.model_id = model_id;
+    pu.vertices = std::move(vertices);
+    pu.indices = std::move(indices);
+    pu.bvh_set = std::move(bvh_set);
+    pending_uploads_.push_back(std::move(pu));
+
+    qDebug("Bulk upload queued: model %u, %zu vertices, %zu indices, %zu objects",
+           model_id, vb_size / (VERTEX_STRIDE * sizeof(float)),  // vectors are moved-from here
+           ib_size / sizeof(uint32_t), draw_info.size());
+}
 
 void ViewportWindow::resetScene() {
     if (!gl_initialized_) return;
 
-    std::lock_guard<std::mutex> lock(upload_mutex_);
-    total_index_count_ = 0;
-    vbo_used_ = 0;
-    ebo_used_ = 0;
-    vertex_count_ = 0;
-    total_triangles_ = 0;
+    if (bvh_build_thread_.joinable())
+        bvh_build_thread_.join();
+
+    context_->makeCurrent(this);
+    for (auto& [mid, m] : models_gpu_) {
+        if (m.vao) gl_->glDeleteVertexArrays(1, &m.vao);
+        if (m.vbo) gl_->glDeleteBuffers(1, &m.vbo);
+        if (m.ebo) gl_->glDeleteBuffers(1, &m.ebo);
+    }
+    models_gpu_.clear();
+    model_bvhs_.clear();
+    pending_uploads_.clear();
     selected_object_id_ = 0;
-    object_draw_info_.clear();
-    hidden_models_.clear();
-    removed_models_.clear();
+    {
+        std::lock_guard<std::mutex> bvh_lock(bvh_result_mutex_);
+        pending_bvh_.reset();
+    }
+}
+
+static const size_t UPLOAD_CHUNK_BYTES = 48 * 1024 * 1024;  // 48 MB per frame
+
+void ViewportWindow::processPendingUploads() {  // advances the front queued upload by one step
+    if (pending_uploads_.empty()) return;
+
+    auto& pu = pending_uploads_.front();
+    auto it = models_gpu_.find(pu.model_id);
+    if (it == models_gpu_.end()) {
+        pending_uploads_.pop_front();  // target model no longer exists: drop the upload
+        return;
+    }
+    auto& mgpu = it->second;
+
+    size_t vbo_total = pu.vertices.size() * sizeof(float);  // 0 once the vertices were freed
+    size_t ebo_total = pu.indices.size() * sizeof(uint32_t);
+
+    // Phase 1: Upload VBO in chunks of at most UPLOAD_CHUNK_BYTES, one chunk per call.
+    if (pu.vbo_uploaded < vbo_total) {
+        size_t remaining = vbo_total - pu.vbo_uploaded;
+        size_t chunk = std::min(remaining, UPLOAD_CHUNK_BYTES);
+        gl_->glNamedBufferSubData(mgpu.vbo, pu.vbo_uploaded, chunk,
+                                  reinterpret_cast<const char*>(pu.vertices.data()) + pu.vbo_uploaded);
+        pu.vbo_uploaded += chunk;
+
+        if (pu.vbo_uploaded >= vbo_total) {
+            // VBO done — free CPU memory.
+            pu.vertices.clear();
+            pu.vertices.shrink_to_fit();
+        }
+        return;  // yield to render loop
+    }
+
+    // Phase 2: Upload EBO in chunks. Objects become drawable as their range lands.
+    if (pu.ebo_uploaded < ebo_total) {
+        size_t remaining = ebo_total - pu.ebo_uploaded;
+        size_t chunk = std::min(remaining, UPLOAD_CHUNK_BYTES);
+        gl_->glNamedBufferSubData(mgpu.ebo, pu.ebo_uploaded, chunk,
+                                  reinterpret_cast<const char*>(pu.indices.data()) + pu.ebo_uploaded);
+        pu.ebo_uploaded += chunk;
+
+        // Advance active_draw_count: activate objects whose EBO range is fully uploaded.
+        while (mgpu.active_draw_count < mgpu.draw_info.size()) {
+            const auto& obj = mgpu.draw_info[mgpu.active_draw_count];
+            size_t obj_end = obj.index_offset + obj.index_count * sizeof(uint32_t);  // end, in bytes
+            if (obj_end <= pu.ebo_uploaded)
+                mgpu.active_draw_count++;
+            else
+                break;  // objects activate strictly in draw_info order
+        }
+
+        if (pu.ebo_uploaded >= ebo_total) {
+            // EBO done — free CPU memory.
+            pu.indices.clear();
+            pu.indices.shrink_to_fit();
+        } else {
+            return;  // yield to render loop
+        }
+    }
+
+    // Fully uploaded — activate BVH if present.
+    mgpu.active_draw_count = static_cast<uint32_t>(mgpu.draw_info.size());
+    if (pu.bvh_set) {
+        model_bvhs_[pu.model_id] = std::move(pu.bvh_set);
+    }
+
+    qDebug("Progressive upload complete: model %u", pu.model_id);
+    pending_uploads_.pop_front();
 }
 
 void ViewportWindow::hideModel(uint32_t model_id) {
-    std::lock_guard<std::mutex> lock(upload_mutex_);
-    hidden_models_.insert(model_id);
+    auto it = models_gpu_.find(model_id);
+    if (it != models_gpu_.end()) it->second.hidden = true;
 }
 
 void ViewportWindow::showModel(uint32_t model_id) {
-    std::lock_guard<std::mutex> lock(upload_mutex_);
-    hidden_models_.erase(model_id);
+    auto it = models_gpu_.find(model_id);
+    if (it != models_gpu_.end()) it->second.hidden = false;
 }
 
 void ViewportWindow::removeModel(uint32_t model_id) {
-    std::lock_guard<std::mutex> lock(upload_mutex_);
-    removed_models_.insert(model_id);
+    if (!gl_initialized_) return;
+    context_->makeCurrent(this);
+
+    // Cancel any pending upload for this model.
+    pending_uploads_.erase(
+        std::remove_if(pending_uploads_.begin(), pending_uploads_.end(),
+                        [model_id](const PendingUpload& pu) { return pu.model_id == model_id; }),
+        pending_uploads_.end());
+
+    auto it = models_gpu_.find(model_id);
+    if (it != models_gpu_.end()) {
+        gl_->glDeleteVertexArrays(1, &it->second.vao);
+        gl_->glDeleteBuffers(1, &it->second.vbo);
+        gl_->glDeleteBuffers(1, &it->second.ebo);
+        models_gpu_.erase(it);
+    }
+    model_bvhs_.erase(model_id);
+}
+
+std::vector<uint32_t> ViewportWindow::readbackEbo(uint32_t model_id) const {  // GPU -> CPU copy
+    std::vector<uint32_t> ebo_data;
+    auto it = models_gpu_.find(model_id);
+    if (!gl_ || it == models_gpu_.end() || it->second.ebo_used == 0) return ebo_data;  // empty result
+
+    const auto& m = it->second;
+    size_t num_indices = m.ebo_used / sizeof(uint32_t);  // ebo_used is a byte count
+    ebo_data.resize(num_indices);
+    gl_->glGetNamedBufferSubData(m.ebo, 0, m.ebo_used, ebo_data.data());  // NOTE(review): no makeCurrent here — presumably the caller's job
+    return ebo_data;
+}
+
+std::vector<float> ViewportWindow::readbackVbo(uint32_t model_id) const {  // GPU -> CPU copy
+    std::vector<float> vbo_data;
+    auto it = models_gpu_.find(model_id);
+    if (!gl_ || it == models_gpu_.end() || it->second.vbo_used == 0) return vbo_data;  // empty result
+
+    const auto& m = it->second;
+    size_t num_floats = m.vbo_used / sizeof(float);  // vbo_used is a byte count
+    vbo_data.resize(num_floats);
+    gl_->glGetNamedBufferSubData(m.vbo, 0, m.vbo_used, vbo_data.data());  // NOTE(review): no makeCurrent here — presumably the caller's job
+    return vbo_data;
+}
+
+void ViewportWindow::buildBvhAsync(uint32_t model_id,
+                                   const std::string& ifc_path,
+                                   uint64_t ifc_file_size,
+                                   std::vector<PackedElementInfo> sidecar_elements,
+                                   std::string sidecar_string_table) {
+    // Build the model's BVH (and, if requested, its sidecar) on a worker thread.
+    if (bvh_build_thread_.joinable())
+        bvh_build_thread_.join();
+
+    auto it = models_gpu_.find(model_id);
+    if (it == models_gpu_.end()) return;
+
+    // Snapshot draw info; read back EBO + VBO on GL thread.  Readback is a GL
+    // call, so make the context current like the other public entry points do.
+    context_->makeCurrent(this);
+    std::vector<ObjectDrawInfo> draw_snapshot = it->second.draw_info;
+    std::vector<uint32_t> ebo_snapshot = readbackEbo(model_id);
+    if (draw_snapshot.empty() || ebo_snapshot.empty()) return;
+
+    std::vector<float> vbo_snapshot;  // only needed when a sidecar is written
+    if (!ifc_path.empty() && !sidecar_elements.empty())
+        vbo_snapshot = readbackVbo(model_id);
+
+    bvh_build_thread_ = std::thread([this, model_id, ifc_path, ifc_file_size,
+                                     draw_info = std::move(draw_snapshot),
+                                     ebo_data = std::move(ebo_snapshot),
+                                     vbo_data = std::move(vbo_snapshot),
+                                     elements = std::move(sidecar_elements),
+                                     string_table = std::move(sidecar_string_table)]() mutable {
+        auto bvh_set = buildBvhSet(draw_info);
+        EboReorderResult ebo_result = reorderEbo(*bvh_set, draw_info, ebo_data);
+
+        // Write full sidecar if requested.
+        if (!ifc_path.empty() && !elements.empty() && !vbo_data.empty()) {
+            SidecarData sd;
+            sd.vertices = std::move(vbo_data);  // real moves: the lambda is mutable
+            sd.indices = ebo_result.reordered_ebo;
+            sd.draw_info = ebo_result.reordered_draw_info;
+            sd.elements = std::move(elements);
+            sd.string_table = std::move(string_table);
+            sd.bvh_set = bvh_set;
+            if (!writeSidecar(ifc_path, sd, ifc_file_size))
+                qWarning("Failed to write sidecar for %s", ifc_path.c_str());
+        }
+
+        {
+            std::lock_guard<std::mutex> lock(bvh_result_mutex_);
+            pending_bvh_ = std::make_unique<PendingBvh>();
+            pending_bvh_->model_id = model_id;
+            pending_bvh_->bvh_set = std::move(bvh_set);
+            pending_bvh_->ebo_reorder = std::move(ebo_result);
+        }
+    });
+}
+
+void ViewportWindow::applyBvhResult() {
+    // Installs a finished background BVH build, if any, into its model.
+    std::unique_ptr<PendingBvh> result;
+    {
+        std::lock_guard<std::mutex> lock(bvh_result_mutex_);
+        result = std::move(pending_bvh_);
+    }
+    if (!result) return;
+
+    auto it = models_gpu_.find(result->model_id);
+    if (it == models_gpu_.end()) return;
+    auto& mgpu = it->second;
+    auto& reorder = result->ebo_reorder;
+
+    // The reordered EBO, its draw info and the BVH describe one layout: apply
+    // them together, or drop the whole result if the model changed meanwhile.
+    const size_t ebo_bytes = reorder.reordered_ebo.size() * sizeof(uint32_t);
+    const bool info_matches = reorder.reordered_draw_info.size() == mgpu.draw_info.size();
+    if (ebo_bytes > mgpu.ebo_capacity || (ebo_bytes > 0 && !info_matches)) {
+        qWarning("Discarding stale BVH result for model %u", result->model_id);
+        return;
+    }
+
+    if (ebo_bytes > 0)
+        gl_->glNamedBufferSubData(mgpu.ebo, 0, ebo_bytes, reorder.reordered_ebo.data());
+    if (info_matches) mgpu.draw_info = std::move(reorder.reordered_draw_info);
+
+    model_bvhs_[result->model_id] = std::move(result->bvh_set);
+
+    qDebug("BVH activated for model %u", result->model_id);
 }
 
 void ViewportWindow::setSelectedObjectId(uint32_t id) {
@@ -499,7 +749,6 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     int w = width() * devicePixelRatio();
     int h = height() * devicePixelRatio();
 
-    // Create/resize pick FBO if needed
     if (pick_width_ != w || pick_height_ != h) {
         if (pick_fbo_) gl_->glDeleteFramebuffers(1, &pick_fbo_);
         if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
@@ -533,7 +782,6 @@ void ViewportWindow::updateCamera() {
     float yaw_rad = qDegreesToRadians(camera_yaw_);
     float pitch_rad = qDegreesToRadians(camera_pitch_);
 
-    // IFC / Blender convention: X right, Y forward, Z up.
     QVector3D eye;
     eye.setX(camera_target_.x() + camera_distance_ * cosf(pitch_rad) * cosf(yaw_rad));
     eye.setY(camera_target_.y() + camera_distance_ * cosf(pitch_rad) * sinf(yaw_rad));
@@ -547,17 +795,61 @@ void ViewportWindow::updateCamera() {
     proj_matrix_.perspective(45.0f, aspect, 0.1f, camera_distance_ * 10.0f);
 }
 
+bool ViewportWindow::aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
+                                   const float planes[6][4]) {  // each plane is (a, b, c, d)
+    for (int p = 0; p < 6; ++p) {
+        float px = planes[p][0] >= 0.0f ? aabb_max[0] : aabb_min[0];  // box corner lying furthest
+        float py = planes[p][1] >= 0.0f ? aabb_max[1] : aabb_min[1];  // along the plane normal,
+        float pz = planes[p][2] >= 0.0f ? aabb_max[2] : aabb_min[2];  // picked one axis at a time
+        float dist = planes[p][0] * px + planes[p][1] * py + planes[p][2] * pz + planes[p][3];
+        if (dist < 0.0f) return false;  // even that corner is behind the plane: box is outside
+    }
+    return true;  // the box is not entirely behind any of the six planes
+}
+
+void ViewportWindow::traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
+                                 const float planes[6][4]) {
+    // Cull one model through its BVH into the newest entry of frame_draw_cmds_.
+    if (mbvh.nodes.empty() || frame_draw_cmds_.empty()) return;
+    auto& cmd = frame_draw_cmds_.back();
+
+    uint32_t stack[64];
+    int sp = 0;
+    stack[sp++] = 0;  // root
+
+    while (sp > 0) {
+        uint32_t ni = stack[--sp];
+        if (ni >= mbvh.nodes.size()) continue;  // BVH may come from an unvalidated sidecar
+        const BvhNode& node = mbvh.nodes[ni];
+        if (!aabbInFrustum(node.aabb_min, node.aabb_max, planes)) continue;
+
+        // Leaf: node.count objects starting at object_indices[right_or_first].
+        if (node.count > 0) {
+            for (uint32_t i = 0; i < node.count; ++i) {
+                size_t slot = static_cast<size_t>(node.right_or_first) + i;
+                if (slot >= mbvh.object_indices.size()) break;
+                uint32_t oi = mbvh.object_indices[slot];
+                if (oi >= mgpu.draw_info.size()) continue;
+                const auto& obj = mgpu.draw_info[oi];
+                if (aabbInFrustum(obj.aabb_min, obj.aabb_max, planes)) {
+                    cmd.counts.push_back(static_cast<GLsizei>(obj.index_count));
+                    cmd.offsets.push_back(reinterpret_cast<const void*>(
+                        static_cast<uintptr_t>(obj.index_offset)));
+                    visible_triangles_ += obj.index_count / 3;
+                }
+            }
+        } else if (sp < 63) {  // inner node; a full stack drops the subtree instead of overflowing
+            stack[sp++] = node.right_or_first;
+            stack[sp++] = ni + 1;
+        }
+    }
+}
+
 void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
-    visible_counts_.clear();
-    visible_offsets_.clear();
+    frame_draw_cmds_.clear();
     visible_triangles_ = 0;
 
-    std::lock_guard<std::mutex> lock(upload_mutex_);
-    if (object_draw_info_.empty()) return;
-
     // Extract 6 frustum planes from the view-projection matrix.
-    // Each plane is (a, b, c, d) where ax + by + cz + d >= 0 is inside.
-    // QMatrix4x4 is stored column-major; operator(row, col) gives element.
     float planes[6][4];
     for (int i = 0; i < 4; ++i) {
         planes[0][i] = vp(3, i) + vp(0, i);  // left
@@ -567,7 +859,6 @@ void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
         planes[4][i] = vp(3, i) + vp(2, i);  // near
         planes[5][i] = vp(3, i) - vp(2, i);  // far
     }
-    // Normalize planes.
     for (int p = 0; p < 6; ++p) {
         float len = std::sqrt(planes[p][0] * planes[p][0] +
                               planes[p][1] * planes[p][1] +
@@ -581,31 +872,40 @@ void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
         }
     }
 
-    visible_counts_.reserve(object_draw_info_.size());
-    visible_offsets_.reserve(object_draw_info_.size());
+    for (auto& [model_id, mgpu] : models_gpu_) {
+        if (mgpu.hidden || mgpu.active_draw_count == 0) continue;
 
-    for (const auto& obj : object_draw_info_) {
-        // Skip hidden or removed models.
-        if (hidden_models_.count(obj.model_id) || removed_models_.count(obj.model_id))
-            continue;
+        frame_draw_cmds_.push_back({mgpu.vao, {}, {}});
+        auto& cmd = frame_draw_cmds_.back();
+        cmd.counts.reserve(mgpu.active_draw_count);
+        cmd.offsets.reserve(mgpu.active_draw_count);
 
-        bool visible = true;
-        for (int p = 0; p < 6; ++p) {
-            // p-vertex: the AABB corner most in the direction of the plane normal.
-            float px = planes[p][0] >= 0.0f ? obj.aabb_max[0] : obj.aabb_min[0];
-            float py = planes[p][1] >= 0.0f ? obj.aabb_max[1] : obj.aabb_min[1];
-            float pz = planes[p][2] >= 0.0f ? obj.aabb_max[2] : obj.aabb_min[2];
-            float dist = planes[p][0] * px + planes[p][1] * py + planes[p][2] * pz + planes[p][3];
-            if (dist < 0.0f) {
-                visible = false;
-                break;
+        bool fully_loaded = (mgpu.active_draw_count == mgpu.draw_info.size());
+        auto bvh_it = model_bvhs_.find(model_id);
+
+        // Only use BVH if model is fully uploaded; during progressive upload,
+        // fall back to linear scan of active objects.
+        if (fully_loaded && bvh_it != model_bvhs_.end() && bvh_it->second) {
+            const auto& bvh_set = *bvh_it->second;
+            auto mbvh_it = bvh_set.models.find(model_id);
+            if (mbvh_it != bvh_set.models.end()) {
+                traverseBvh(mbvh_it->second, mgpu, planes);
+            }
+        } else {
+            // Linear scan of active objects only.
+            for (uint32_t i = 0; i < mgpu.active_draw_count; ++i) {
+                const auto& obj = mgpu.draw_info[i];
+                if (aabbInFrustum(obj.aabb_min, obj.aabb_max, planes)) {
+                    cmd.counts.push_back(static_cast<GLsizei>(obj.index_count));
+                    cmd.offsets.push_back(reinterpret_cast<const void*>(
+                        static_cast<uintptr_t>(obj.index_offset)));
+                    visible_triangles_ += obj.index_count / 3;
+                }
             }
         }
-        if (visible) {
-            visible_counts_.push_back(static_cast<GLsizei>(obj.index_count));
-            visible_offsets_.push_back(reinterpret_cast<const void*>(
-                static_cast<uintptr_t>(obj.index_offset)));
-            visible_triangles_ += obj.index_count / 3;
+
+        if (cmd.counts.empty()) {
+            frame_draw_cmds_.pop_back();
         }
     }
 }
@@ -614,6 +914,8 @@ void ViewportWindow::render() {
     if (!gl_initialized_ || !isExposed()) return;
 
     context_->makeCurrent(this);
+    applyBvhResult();
+    processPendingUploads();
     updateCamera();
 
     int w = width() * devicePixelRatio();
@@ -628,21 +930,20 @@ void ViewportWindow::render() {
     gl_->glUniform3f(gl_->glGetUniformLocation(main_program_, "u_light_dir"), 0.3f, 0.5f, 0.8f);
     gl_->glUniform1ui(gl_->glGetUniformLocation(main_program_, "u_selected_id"), selected_object_id_);
 
-    gl_->glBindVertexArray(vao_);
-
     buildVisibleList(vp);
-    if (!visible_counts_.empty()) {
+    for (const auto& cmd : frame_draw_cmds_) {
+        gl_->glBindVertexArray(cmd.vao);
         gl_->glMultiDrawElements(GL_TRIANGLES,
-            visible_counts_.data(), GL_UNSIGNED_INT,
-            visible_offsets_.data(),
-            static_cast<GLsizei>(visible_counts_.size()));
+            cmd.counts.data(), GL_UNSIGNED_INT,
+            cmd.offsets.data(),
+            static_cast<GLsizei>(cmd.counts.size()));
     }
 
     renderAxisGizmo();
 
     context_->swapBuffers(this);
 
-    // Compute FPS (updated once per second to avoid flicker).
+    // Compute FPS.
     float dt = frame_clock_.restart() / 1000.0f;
     accumulated_time_ += dt;
     frame_count_++;
@@ -651,12 +952,23 @@ void ViewportWindow::render() {
         frame_count_ = 0;
         accumulated_time_ = 0.0f;
 
+        uint32_t total_obj = 0, total_tri = 0, vis_obj = 0;
+        for (const auto& [mid, m] : models_gpu_) {
+            if (!m.hidden) {
+                total_obj += static_cast<uint32_t>(m.draw_info.size());
+                total_tri += m.total_triangles;
+            }
+        }
+        for (const auto& cmd : frame_draw_cmds_) {
+            vis_obj += static_cast<uint32_t>(cmd.counts.size());
+        }
+
         FrameStats stats;
         stats.fps = last_fps_;
         stats.frame_time_ms = 1000.0f / last_fps_;
-        stats.total_objects = static_cast<uint32_t>(object_draw_info_.size());
-        stats.visible_objects = static_cast<uint32_t>(visible_counts_.size());
-        stats.total_triangles = total_triangles_;
+        stats.total_objects = total_obj;
+        stats.visible_objects = vis_obj;
+        stats.total_triangles = total_tri;
         stats.visible_triangles = visible_triangles_;
         emit frameStatsUpdated(stats);
     }
@@ -672,8 +984,6 @@ void ViewportWindow::renderAxisGizmo() {
     gl_->glViewport(margin, margin, gizmo_size, gizmo_size);
     gl_->glDisable(GL_DEPTH_TEST);
 
-    // Build a view matrix from the same camera orientation but with a fixed
-    // close-up distance, so the gizmo rotates with the scene camera. Z-up.
     float yaw_rad = qDegreesToRadians(camera_yaw_);
     float pitch_rad = qDegreesToRadians(camera_pitch_);
 
@@ -693,7 +1003,7 @@ void ViewportWindow::renderAxisGizmo() {
     gl_->glUseProgram(axis_program_);
     gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(axis_program_, "u_mvp"), 1, GL_FALSE, mvp.constData());
 
-    gl_->glLineWidth(2.5f);  // ignored on some core-profile drivers, that's OK
+    gl_->glLineWidth(2.5f);
     gl_->glBindVertexArray(axis_vao_);
     gl_->glDrawArrays(GL_LINES, 0, 6);
 
@@ -712,14 +1022,13 @@ void ViewportWindow::renderPickPass() {
     gl_->glUseProgram(pick_program_);
     gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(pick_program_, "u_view_projection"), 1, GL_FALSE, vp.constData());
 
-    gl_->glBindVertexArray(vao_);
-
     // Reuse the visible list from the most recent render() call.
-    if (!visible_counts_.empty()) {
+    for (const auto& cmd : frame_draw_cmds_) {
+        gl_->glBindVertexArray(cmd.vao);
         gl_->glMultiDrawElements(GL_TRIANGLES,
-            visible_counts_.data(), GL_UNSIGNED_INT,
-            visible_offsets_.data(),
-            static_cast<GLsizei>(visible_counts_.size()));
+            cmd.counts.data(), GL_UNSIGNED_INT,
+            cmd.offsets.data(),
+            static_cast<GLsizei>(cmd.counts.size()));
     }
 
     gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
@@ -774,7 +1083,6 @@ void ViewportWindow::handleMouseMove(QMouseEvent* e) {
 
     if (active_button_ == Qt::MiddleButton) {
         if (e->modifiers() & Qt::ShiftModifier) {
-            // Pan in screen space, derived from the Z-up camera basis.
             float pan_speed = camera_distance_ * 0.002f;
             float yaw_rad = qDegreesToRadians(camera_yaw_);
             float pitch_rad = qDegreesToRadians(camera_pitch_);
@@ -786,7 +1094,6 @@ void ViewportWindow::handleMouseMove(QMouseEvent* e) {
             camera_target_ -= right * delta.x() * pan_speed;
             camera_target_ += up * delta.y() * pan_speed;
         } else {
-            // Orbit
             camera_yaw_ -= delta.x() * 0.3f;
             camera_pitch_ += delta.y() * 0.3f;
             camera_pitch_ = qBound(-89.0f, camera_pitch_, 89.0f);
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index fda82a1db5e..62abc480022 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -28,23 +28,23 @@
 #include <QMatrix4x4>
 #include <QVector3D>
 
+#include <deque>
 #include <vector>
 #include <unordered_set>
+#include <unordered_map>
 #include <cstdint>
 #include <mutex>
+#include <thread>
+#include <memory>
+#include <atomic>
+
+#include "BvhAccel.h"
+#include "SidecarCache.h"
 
 struct MaterialInfo {
     float r = 0.75f, g = 0.75f, b = 0.78f, a = 1.0f;
 };
 
-struct ObjectDrawInfo {
-    uint32_t index_offset;  // byte offset into EBO
-    uint32_t index_count;   // number of indices
-    uint32_t model_id;      // which model this object belongs to
-    float aabb_min[3];      // world-space AABB
-    float aabb_max[3];
-};
-
 struct UploadChunk {
     // Interleaved per-vertex layout (8 floats / 32 bytes per vertex):
     //   pos(3 float) + normal(3 float) + object_id(1 float bitcast from uint)
@@ -56,6 +56,32 @@ struct UploadChunk {
     uint32_t model_id = 0;
 };
 
+// Per-model GPU state: own VAO, VBO, EBO, draw info, BVH.
+struct ModelGpuData {
+    GLuint vao = 0;
+    GLuint vbo = 0;
+    GLuint ebo = 0;
+    size_t vbo_capacity = 0;
+    size_t ebo_capacity = 0;
+    size_t vbo_used = 0;   // bytes
+    size_t ebo_used = 0;   // bytes
+    uint32_t vertex_count = 0;
+    uint32_t total_triangles = 0;
+    std::vector<ObjectDrawInfo> draw_info;
+    uint32_t active_draw_count = 0; // how many objects are drawable (progressive upload)
+    bool hidden = false;
+};
+
+// Pending progressive upload — VBO first, then EBO.
+struct PendingUpload {
+    uint32_t model_id = 0;
+    std::vector<float> vertices;
+    std::vector<uint32_t> indices;
+    std::shared_ptr<BvhSet> bvh_set;
+    size_t vbo_uploaded = 0;  // bytes
+    size_t ebo_uploaded = 0;  // bytes
+};
+
 class ViewportWindow : public QWindow {
     Q_OBJECT
 public:
@@ -65,10 +91,29 @@ class ViewportWindow : public QWindow {
     void uploadChunk(const UploadChunk& chunk);
     void resetScene();
 
+    // Bulk upload pre-built geometry from a sidecar cache.
+    // Creates a perfectly-sized per-model buffer set. No copy.
+    void uploadBulk(uint32_t model_id,
+                    std::vector<float> vertices,
+                    std::vector<uint32_t> indices,
+                    const std::vector<ObjectDrawInfo>& draw_info,
+                    std::shared_ptr<BvhSet> bvh_set);
+
     void hideModel(uint32_t model_id);
     void showModel(uint32_t model_id);
     void removeModel(uint32_t model_id);
 
+    // Build BVH and optionally write a sidecar cache.
+    void buildBvhAsync(uint32_t model_id,
+                       const std::string& ifc_path = "",
+                       uint64_t ifc_file_size = 0,
+                       std::vector<PackedElementInfo> sidecar_elements = {},
+                       std::string sidecar_string_table = {});
+
+    // Read snapshots of a model's GPU buffers into CPU vectors.
+    std::vector<uint32_t> readbackEbo(uint32_t model_id) const;
+    std::vector<float> readbackVbo(uint32_t model_id) const;
+
     void setSelectedObjectId(uint32_t id);
     uint32_t pickObjectAt(int x, int y);
 
@@ -99,9 +144,16 @@ class ViewportWindow : public QWindow {
     void updateCamera();
     void buildShaders();
     void buildAxisGizmo();
-    bool growVbo(size_t needed_total);
-    bool growEbo(size_t needed_total);
+    void setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo);
+    bool growModelVbo(ModelGpuData& m, size_t needed_total);
+    bool growModelEbo(ModelGpuData& m, size_t needed_total);
     void buildVisibleList(const QMatrix4x4& vp);
+    void traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
+                     const float planes[6][4]);
+    static bool aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
+                              const float planes[6][4]);
+    void applyBvhResult();
+    void processPendingUploads();
 
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
@@ -124,15 +176,9 @@ class ViewportWindow : public QWindow {
     GLuint axis_vao_ = 0;
     GLuint axis_vbo_ = 0;
 
-    // Geometry buffers - one big buffer pair
-    GLuint vao_ = 0;
-    GLuint vbo_ = 0;
-    GLuint ebo_ = 0;
-    size_t vbo_capacity_ = 0;
-    size_t ebo_capacity_ = 0;
-    size_t vbo_used_ = 0;  // in bytes
-    size_t ebo_used_ = 0;  // in bytes
-    uint32_t vertex_count_ = 0;
+    // Per-model GPU data
+    std::unordered_map<uint32_t, ModelGpuData> models_gpu_;
+    std::mutex models_mutex_;
 
     // Pick framebuffer
     GLuint pick_fbo_ = 0;
@@ -141,16 +187,20 @@ class ViewportWindow : public QWindow {
     int pick_width_ = 0;
     int pick_height_ = 0;
 
-    // Per-object draw metadata for frustum culling.
-    std::vector<ObjectDrawInfo> object_draw_info_;
-    std::unordered_set<uint32_t> hidden_models_;
-    std::unordered_set<uint32_t> removed_models_;
-    uint32_t total_index_count_ = 0;
-    std::mutex upload_mutex_;
+    // Per-model BVH
+    std::unordered_map<uint32_t, std::shared_ptr<const BvhSet>> model_bvhs_;
+
+    // Progressive upload queue
+    std::deque<PendingUpload> pending_uploads_;
 
     // Scratch buffers reused each frame to avoid allocation.
-    std::vector<GLsizei> visible_counts_;
-    std::vector<const void*> visible_offsets_;
+    struct ModelDrawCmd {
+        GLuint vao;
+        std::vector<GLsizei> counts;
+        std::vector<const void*> offsets;
+    };
+    std::vector<ModelDrawCmd> frame_draw_cmds_;
+    uint32_t visible_triangles_ = 0;
 
     // Camera
     QVector3D camera_target_{0, 0, 0};
@@ -169,9 +219,17 @@ class ViewportWindow : public QWindow {
     bool pick_requested_ = false;
     int pick_x_ = 0, pick_y_ = 0;
 
+    // BVH build (phase 2)
+    struct PendingBvh {
+        uint32_t model_id;
+        std::shared_ptr<BvhSet> bvh_set;
+        EboReorderResult ebo_reorder;
+    };
+    std::unique_ptr<PendingBvh> pending_bvh_;
+    std::mutex bvh_result_mutex_;
+    std::thread bvh_build_thread_;
+
     // Stats
-    uint32_t total_triangles_ = 0;
-    uint32_t visible_triangles_ = 0;
     int frame_count_ = 0;
     float accumulated_time_ = 0.0f;
     float last_fps_ = 0.0f;

From 2d9f3fba7a3208f994ffa3284a5907dde183405e Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 19:01:43 +1000
Subject: [PATCH 12/37] Add profiling for VRAM, FPS ratios, and instancing
 analysis

Per-second frame log reports fps/ms, visible/total object & triangle
ratios, VRAM breakdown (VBO+EBO buffer capacity, non-hidden models
only), model and hidden-model counts, draw-command count, and pending
uploads.

Upload-complete log includes per-model VBO/EBO MB and scene total VRAM.

Streamer runs an instancing analysis keyed on geom.id(): total shapes,
unique representations, dedup ratio, theoretical VBO/EBO/SSBO sizes if
instanced, potential savings, and top-5 most-duplicated representations.
Used to validate whether GPU instancing is worth the architectural
rewrite for a given dataset.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/GeometryStreamer.cpp | 103 +++++++++++++++++++++++++++++
 src/ifcviewer/ViewportWindow.cpp   |  41 ++++++++++--
 2 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
index 7235bced9f8..54b37df70ca 100644
--- a/src/ifcviewer/GeometryStreamer.cpp
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -27,6 +27,9 @@
 #include <cstring>
 #include <algorithm>
 
+#include <QDebug>
+#include <QElapsedTimer>
+
 GeometryStreamer::GeometryStreamer(QObject* parent)
     : QObject(parent)
 {
@@ -126,6 +129,20 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
 
     int last_progress = 0;
 
+    // Instancing analysis: count shapes grouped by representation id.
+    struct GeomStat {
+        uint32_t count = 0;
+        size_t vertex_count = 0;
+        size_t index_count = 0;
+        std::string example_type;
+    };
+    std::unordered_map<std::string, GeomStat> geom_stats;
+    uint32_t total_shapes = 0;
+    size_t total_vertices = 0;
+    size_t total_indices = 0;
+    QElapsedTimer stream_timer;
+    stream_timer.start();
+
     do {
         if (cancel_requested_.load()) break;
 
@@ -147,6 +164,24 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
         info.type = tri_elem->type();
         info.parent_id = tri_elem->parent_id();
 
+        // Instancing stats: key by representation id, count unique vs repeated.
+        const auto& geom = tri_elem->geometry();
+        const std::string& geom_id = geom.id();
+        size_t nv = geom.verts().size() / 3;
+        size_t ni = geom.faces().size();
+        if (!geom_id.empty()) {
+            auto& gs = geom_stats[geom_id];
+            gs.count++;
+            if (gs.count == 1) {
+                gs.vertex_count = nv;
+                gs.index_count = ni;
+                gs.example_type = info.type;
+            }
+        }
+        total_shapes++;
+        total_vertices += nv;
+        total_indices += ni;
+
         {
             std::lock_guard<std::mutex> lock(elements_mutex_);
             pending_elements_.push_back(std::move(info));
@@ -168,6 +203,74 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
 
     progress_ = 100;
     emit progressChanged(100);
+
+    // === Instancing report ===
+    {
+        size_t unique_geoms = geom_stats.size();
+        size_t unique_vertices = 0;
+        size_t unique_indices = 0;
+        size_t repeated_shapes = 0; // total shapes that share a repr with another
+        for (const auto& [gid, gs] : geom_stats) {
+            unique_vertices += gs.vertex_count;
+            unique_indices += gs.index_count;
+            if (gs.count > 1) repeated_shapes += gs.count;
+        }
+
+        // Bytes assuming current layout (32 B/vertex, 4 B/index).
+        size_t baked_vbo_bytes = total_vertices * 32;
+        size_t baked_ebo_bytes = total_indices * 4;
+        size_t instanced_vbo_bytes = unique_vertices * 32;
+        size_t instanced_ebo_bytes = unique_indices * 4;
+        // Per-instance data: 64 B transform + 8 B (object_id + color).
+        size_t per_instance_bytes = 72;
+        size_t instance_ssbo_bytes = total_shapes * per_instance_bytes;
+
+        double dedup_ratio = unique_geoms > 0
+            ? static_cast<double>(total_shapes) / static_cast<double>(unique_geoms)
+            : 1.0;
+
+        qDebug("=== Instancing analysis: %s ===", path.c_str());
+        qDebug("  Stream time: %.2f s", stream_timer.elapsed() / 1000.0);
+        qDebug("  Total shapes:      %u", total_shapes);
+        qDebug("  Unique geometries: %zu  (dedup ratio %.2fx)",
+               unique_geoms, dedup_ratio);
+        qDebug("  Repeated shapes:   %zu  (%.1f%% of total)",
+               repeated_shapes,
+               total_shapes > 0 ? 100.0 * repeated_shapes / total_shapes : 0.0);
+        qDebug("  Baked geometry:    VBO %.1f MB + EBO %.1f MB = %.1f MB",
+               baked_vbo_bytes / (1024.0*1024.0),
+               baked_ebo_bytes / (1024.0*1024.0),
+               (baked_vbo_bytes + baked_ebo_bytes) / (1024.0*1024.0));
+        qDebug("  If instanced:      VBO %.1f MB + EBO %.1f MB + SSBO %.1f MB = %.1f MB",
+               instanced_vbo_bytes / (1024.0*1024.0),
+               instanced_ebo_bytes / (1024.0*1024.0),
+               instance_ssbo_bytes / (1024.0*1024.0),
+               (instanced_vbo_bytes + instanced_ebo_bytes + instance_ssbo_bytes)
+                   / (1024.0*1024.0));
+        size_t baked_total = baked_vbo_bytes + baked_ebo_bytes;
+        size_t inst_total = instanced_vbo_bytes + instanced_ebo_bytes + instance_ssbo_bytes;
+        if (inst_total > 0 && baked_total > inst_total) {
+            qDebug("  Potential savings: %.1f MB (%.1f%%)",
+                   (baked_total - inst_total) / (1024.0*1024.0),
+                   100.0 * (baked_total - inst_total) / baked_total);
+        } else {
+            qDebug("  Potential savings: none (instance overhead exceeds dedup win)");
+        }
+
+        // Top-5 most duplicated representations.
+        std::vector<std::pair<std::string, GeomStat>> sorted(geom_stats.begin(), geom_stats.end());
+        std::partial_sort(sorted.begin(),
+                          sorted.begin() + std::min<size_t>(5, sorted.size()),
+                          sorted.end(),
+                          [](const auto& a, const auto& b) { return a.second.count > b.second.count; });
+        qDebug("  Top duplicated representations:");
+        for (size_t i = 0; i < std::min<size_t>(5, sorted.size()); ++i) {
+            const auto& [gid, gs] = sorted[i];
+            qDebug("    [%zu] count=%u  verts=%zu  type=%s  repr_id=%s",
+                   i + 1, gs.count, gs.vertex_count,
+                   gs.example_type.c_str(), gid.c_str());
+        }
+    }
 }
 
 static MaterialInfo materialFromStyle(const ifcopenshell::geometry::taxonomy::style::ptr& style) {
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index ae50f6dc44a..c872f799b67 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -592,7 +592,19 @@ void ViewportWindow::processPendingUploads() {
         model_bvhs_[pu.model_id] = std::move(pu.bvh_set);
     }
 
-    qDebug("Progressive upload complete: model %u", pu.model_id);
+    size_t total_vbo = 0, total_ebo = 0;
+    for (const auto& [mid, mg] : models_gpu_) {
+        total_vbo += mg.vbo_capacity;
+        total_ebo += mg.ebo_capacity;
+    }
+    qDebug("Progressive upload complete: model %u  (this: vbo %.1f MB + ebo %.1f MB, "
+           "%u objects, %u triangles)  scene total vram %.1f MB",
+           pu.model_id,
+           mgpu.vbo_capacity / (1024.0 * 1024.0),
+           mgpu.ebo_capacity / (1024.0 * 1024.0),
+           static_cast<uint32_t>(mgpu.draw_info.size()),
+           mgpu.total_triangles,
+           (total_vbo + total_ebo) / (1024.0 * 1024.0));
     pending_uploads_.pop_front();
 }
 
@@ -953,12 +965,17 @@ void ViewportWindow::render() {
         accumulated_time_ = 0.0f;
 
         uint32_t total_obj = 0, total_tri = 0, vis_obj = 0;
+        size_t total_vram = 0, total_vbo = 0, total_ebo = 0;
+        size_t num_models = 0, num_hidden = 0;
         for (const auto& [mid, m] : models_gpu_) {
-            if (!m.hidden) {
-                total_obj += static_cast<uint32_t>(m.draw_info.size());
-                total_tri += m.total_triangles;
-            }
+            num_models++;
+            if (m.hidden) { num_hidden++; continue; }
+            total_obj += static_cast<uint32_t>(m.draw_info.size());
+            total_tri += m.total_triangles;
+            total_vbo += m.vbo_capacity;
+            total_ebo += m.ebo_capacity;
         }
+        total_vram = total_vbo + total_ebo;
         for (const auto& cmd : frame_draw_cmds_) {
             vis_obj += static_cast<uint32_t>(cmd.counts.size());
         }
@@ -971,6 +988,20 @@ void ViewportWindow::render() {
         stats.total_triangles = total_tri;
         stats.visible_triangles = visible_triangles_;
         emit frameStatsUpdated(stats);
+
+        double vis_obj_pct = total_obj > 0 ? 100.0 * vis_obj / total_obj : 0.0;
+        double vis_tri_pct = total_tri > 0 ? 100.0 * visible_triangles_ / total_tri : 0.0;
+        qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u (%.1f%%)  tri %u/%u (%.1f%%)  "
+               "vram %.1f MB (vbo %.1f + ebo %.1f)  models %zu (%zu hidden)  draws %zu  pending_uploads %zu",
+               last_fps_, 1000.0f / last_fps_,
+               vis_obj, total_obj, vis_obj_pct,
+               visible_triangles_, total_tri, vis_tri_pct,
+               total_vram / (1024.0 * 1024.0),
+               total_vbo / (1024.0 * 1024.0),
+               total_ebo / (1024.0 * 1024.0),
+               num_models, num_hidden,
+               frame_draw_cmds_.size(),
+               pending_uploads_.size());
     }
 }
 

From d8362e243703f6dcb9ae457213c91a70c5452db2 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 19:17:44 +1000
Subject: [PATCH 13/37] Leaf-batched BVH draw commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a BVH leaf passes the frustum test, emit a single glMultiDrawElements
record covering the leaf's entire index range instead of one per object.
Leaves are contiguous in the EBO after reorderEbo, so the record is just
offset = first_object.index_offset, count = sum(index_count) over the
leaf's objects. Cuts draw records by up to ~8x (BVH_MAX_LEAF_SIZE) and
shifts the bottleneck from CPU/driver per-draw overhead toward GPU vertex
throughput.

Per-object features (selection highlight, per-vertex color, object_id
picking) are unchanged — they operate on vertex attributes, not draw
state. Future per-object hide/override will use SSBO lookups sampled
by object_id in the fragment shader.

Slight overdraw from skipping per-object frustum tests within a leaf is
negligible given median-split BVH tightness and spare tri throughput.

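Also adds visible_objects_ counter so stats still report object counts
(not leaf counts); on the BVH path every object in a visible leaf is
counted, since per-object frustum tests are skipped. Plus a
leaf_draws/model_draws breakdown in the per-second frame log.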

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 40 +++++++++++++++++++++-----------
 src/ifcviewer/ViewportWindow.h   |  1 +
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index c872f799b67..1c6ab786254 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -838,16 +838,25 @@ void ViewportWindow::traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
             continue;
 
         if (node.count > 0) {
+            // Leaf-batched draw: after reorderEbo, a leaf's objects occupy a
+            // contiguous EBO range. Emit one draw command covering all of them
+            // instead of N per-object tests/draws. The leaf AABB test above is
+            // already a conservative cull; any overdraw (up to BVH_MAX_LEAF_SIZE
+            // objects that may be fully outside the frustum but inside the leaf
+            // AABB) costs far less than the per-draw CPU/driver overhead we save.
+            uint32_t first_oi = mbvh.object_indices[node.right_or_first];
+            const auto& first_obj = mgpu.draw_info[first_oi];
+            uint32_t leaf_offset = first_obj.index_offset;
+            uint32_t leaf_count = 0;
             for (uint32_t i = 0; i < node.count; ++i) {
                 uint32_t oi = mbvh.object_indices[node.right_or_first + i];
-                const auto& obj = mgpu.draw_info[oi];
-                if (aabbInFrustum(obj.aabb_min, obj.aabb_max, planes)) {
-                    cmd.counts.push_back(static_cast<GLsizei>(obj.index_count));
-                    cmd.offsets.push_back(reinterpret_cast<const void*>(
-                        static_cast<uintptr_t>(obj.index_offset)));
-                    visible_triangles_ += obj.index_count / 3;
-                }
+                leaf_count += mgpu.draw_info[oi].index_count;
             }
+            cmd.counts.push_back(static_cast<GLsizei>(leaf_count));
+            cmd.offsets.push_back(reinterpret_cast<const void*>(
+                static_cast<uintptr_t>(leaf_offset)));
+            visible_triangles_ += leaf_count / 3;
+            visible_objects_ += node.count;
         } else {
             if (sp < 63) {
                 stack[sp++] = node.right_or_first;
@@ -860,6 +869,7 @@ void ViewportWindow::traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
 void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
     frame_draw_cmds_.clear();
     visible_triangles_ = 0;
+    visible_objects_ = 0;
 
     // Extract 6 frustum planes from the view-projection matrix.
     float planes[6][4];
@@ -912,6 +922,7 @@ void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
                     cmd.offsets.push_back(reinterpret_cast<const void*>(
                         static_cast<uintptr_t>(obj.index_offset)));
                     visible_triangles_ += obj.index_count / 3;
+                    visible_objects_++;
                 }
             }
         }
@@ -964,9 +975,10 @@ void ViewportWindow::render() {
         frame_count_ = 0;
         accumulated_time_ = 0.0f;
 
-        uint32_t total_obj = 0, total_tri = 0, vis_obj = 0;
+        uint32_t total_obj = 0, total_tri = 0;
         size_t total_vram = 0, total_vbo = 0, total_ebo = 0;
         size_t num_models = 0, num_hidden = 0;
+        size_t total_leaf_draws = 0;
         for (const auto& [mid, m] : models_gpu_) {
             num_models++;
             if (m.hidden) { num_hidden++; continue; }
@@ -977,29 +989,31 @@ void ViewportWindow::render() {
         }
         total_vram = total_vbo + total_ebo;
         for (const auto& cmd : frame_draw_cmds_) {
-            vis_obj += static_cast<uint32_t>(cmd.counts.size());
+            total_leaf_draws += cmd.counts.size();
         }
 
         FrameStats stats;
         stats.fps = last_fps_;
         stats.frame_time_ms = 1000.0f / last_fps_;
         stats.total_objects = total_obj;
-        stats.visible_objects = vis_obj;
+        stats.visible_objects = visible_objects_;
         stats.total_triangles = total_tri;
         stats.visible_triangles = visible_triangles_;
         emit frameStatsUpdated(stats);
 
-        double vis_obj_pct = total_obj > 0 ? 100.0 * vis_obj / total_obj : 0.0;
+        double vis_obj_pct = total_obj > 0 ? 100.0 * visible_objects_ / total_obj : 0.0;
         double vis_tri_pct = total_tri > 0 ? 100.0 * visible_triangles_ / total_tri : 0.0;
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u (%.1f%%)  tri %u/%u (%.1f%%)  "
-               "vram %.1f MB (vbo %.1f + ebo %.1f)  models %zu (%zu hidden)  draws %zu  pending_uploads %zu",
+               "vram %.1f MB (vbo %.1f + ebo %.1f)  models %zu (%zu hidden)  "
+               "leaf_draws %zu  model_draws %zu  pending_uploads %zu",
                last_fps_, 1000.0f / last_fps_,
-               vis_obj, total_obj, vis_obj_pct,
+               visible_objects_, total_obj, vis_obj_pct,
                visible_triangles_, total_tri, vis_tri_pct,
                total_vram / (1024.0 * 1024.0),
                total_vbo / (1024.0 * 1024.0),
                total_ebo / (1024.0 * 1024.0),
                num_models, num_hidden,
+               total_leaf_draws,
                frame_draw_cmds_.size(),
                pending_uploads_.size());
     }
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 62abc480022..97925e6e2e3 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -201,6 +201,7 @@ class ViewportWindow : public QWindow {
     };
     std::vector<ModelDrawCmd> frame_draw_cmds_;
     uint32_t visible_triangles_ = 0;
+    uint32_t visible_objects_ = 0;
 
     // Camera
     QVector3D camera_target_{0, 0, 0};

From 1097fa3bced668aec3196fb620f7cb3db1e67dd9 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 19:53:06 +1000
Subject: [PATCH 14/37] GPU instancing: streamer, viewport, shaders rewritten

Commit A of the instancing migration (Phase 3a).  The streamer now runs
the iterator with use-world-coords=false and dedupes by the geometry's
representation id, emitting a MeshChunk once per unique geometry and an
InstanceChunk per placement.  The viewport keeps geometry in local
coordinates (28 B/vertex, down from 32) and applies the per-instance
transform in the vertex shader via an std430 SSBO indexed by
gl_InstanceID + a per-draw uniform offset.  After streaming finishes
finalizeModel() stable-sorts instances by mesh_id, assigns each mesh a
contiguous range, and uploads the SSBO; render then issues one
glDrawElementsInstancedBaseVertex per mesh.

BvhAccel is reshaped to operate on a generic BvhItem (world AABB +
model_id) so it can drive instance-level culling, but the path is not
wired in yet -- every instance is drawn every frame in this commit.
Progressive-during-streaming rendering is likewise disabled: a model
appears when its SSBO is uploaded, not incrementally.  Sidecar cache
is stubbed (reads miss, writes are no-ops); the v4 on-disk format with
MeshInfo + InstanceGpu sections lands in Commit B.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/BvhAccel.cpp         |  151 +----
 src/ifcviewer/BvhAccel.h           |   43 +-
 src/ifcviewer/GeometryStreamer.cpp |  443 ++++++------
 src/ifcviewer/GeometryStreamer.h   |   11 +-
 src/ifcviewer/InstancedGeometry.h  |  103 +++
 src/ifcviewer/MainWindow.cpp       |  100 +--
 src/ifcviewer/MainWindow.h         |    3 +-
 src/ifcviewer/SidecarCache.cpp     |  182 +----
 src/ifcviewer/SidecarCache.h       |   39 +-
 src/ifcviewer/ViewportWindow.cpp   | 1013 ++++++++++------------------
 src/ifcviewer/ViewportWindow.h     |  130 ++--
 11 files changed, 811 insertions(+), 1407 deletions(-)
 create mode 100644 src/ifcviewer/InstancedGeometry.h

diff --git a/src/ifcviewer/BvhAccel.cpp b/src/ifcviewer/BvhAccel.cpp
index e0b232a283c..c285f1fbfe0 100644
--- a/src/ifcviewer/BvhAccel.cpp
+++ b/src/ifcviewer/BvhAccel.cpp
@@ -23,7 +23,6 @@
 #include <cassert>
 #include <cmath>
 #include <limits>
-#include <numeric>
 
 namespace {
 
@@ -31,38 +30,36 @@ struct Centroid {
     float x, y, z;
 };
 
-Centroid computeCentroid(const ObjectDrawInfo& obj) {
+Centroid computeCentroid(const BvhItem& it) {
     return {
-        (obj.aabb_min[0] + obj.aabb_max[0]) * 0.5f,
-        (obj.aabb_min[1] + obj.aabb_max[1]) * 0.5f,
-        (obj.aabb_min[2] + obj.aabb_max[2]) * 0.5f
+        (it.aabb_min[0] + it.aabb_max[0]) * 0.5f,
+        (it.aabb_min[1] + it.aabb_max[1]) * 0.5f,
+        (it.aabb_min[2] + it.aabb_max[2]) * 0.5f
     };
 }
 
-void computeAABB(const std::vector<ObjectDrawInfo>& draw_info,
+void computeAABB(const std::vector<BvhItem>& items,
                  const uint32_t* indices, uint32_t count,
                  float out_min[3], float out_max[3]) {
     out_min[0] = out_min[1] = out_min[2] = std::numeric_limits<float>::max();
     out_max[0] = out_max[1] = out_max[2] = -std::numeric_limits<float>::max();
     for (uint32_t i = 0; i < count; ++i) {
-        const auto& obj = draw_info[indices[i]];
+        const auto& it = items[indices[i]];
         for (int a = 0; a < 3; ++a) {
-            if (obj.aabb_min[a] < out_min[a]) out_min[a] = obj.aabb_min[a];
-            if (obj.aabb_max[a] > out_max[a]) out_max[a] = obj.aabb_max[a];
+            if (it.aabb_min[a] < out_min[a]) out_min[a] = it.aabb_min[a];
+            if (it.aabb_max[a] > out_max[a]) out_max[a] = it.aabb_max[a];
         }
     }
 }
 
-// Recursive BVH builder. Writes nodes in pre-order DFS into mbvh.nodes.
-// object_indices[start..start+count) are the indices to partition.
 void buildRecursive(ModelBvh& mbvh,
-                    const std::vector<ObjectDrawInfo>& draw_info,
+                    const std::vector<BvhItem>& items,
                     uint32_t start, uint32_t count) {
     uint32_t node_idx = static_cast<uint32_t>(mbvh.nodes.size());
     mbvh.nodes.emplace_back();
     BvhNode& node = mbvh.nodes[node_idx];
 
-    computeAABB(draw_info, &mbvh.object_indices[start], count,
+    computeAABB(items, &mbvh.item_indices[start], count,
                 node.aabb_min, node.aabb_max);
 
     if (count <= BVH_MAX_LEAF_SIZE) {
@@ -72,7 +69,6 @@ void buildRecursive(ModelBvh& mbvh,
         return;
     }
 
-    // Find longest axis of node AABB.
     float extent[3] = {
         node.aabb_max[0] - node.aabb_min[0],
         node.aabb_max[1] - node.aabb_min[1],
@@ -82,145 +78,62 @@ void buildRecursive(ModelBvh& mbvh,
     if (extent[1] > extent[axis]) axis = 1;
     if (extent[2] > extent[axis]) axis = 2;
 
-    // Partition at median centroid on the chosen axis.
     uint32_t mid = count / 2;
     std::nth_element(
-        mbvh.object_indices.begin() + start,
-        mbvh.object_indices.begin() + start + mid,
-        mbvh.object_indices.begin() + start + count,
+        mbvh.item_indices.begin() + start,
+        mbvh.item_indices.begin() + start + mid,
+        mbvh.item_indices.begin() + start + count,
         [&](uint32_t a, uint32_t b) {
-            Centroid ca = computeCentroid(draw_info[a]);
-            Centroid cb = computeCentroid(draw_info[b]);
+            Centroid ca = computeCentroid(items[a]);
+            Centroid cb = computeCentroid(items[b]);
             return (&ca.x)[axis] < (&cb.x)[axis];
         });
 
-    node.count = 0;  // interior
+    node.count = 0;
     node.axis = static_cast<uint16_t>(axis);
 
-    // Left child is always node_idx + 1 (implicit in pre-order DFS).
-    // Build left subtree first. Note: &node is invalidated after this call
-    // because the vector may reallocate.
-    buildRecursive(mbvh, draw_info, start, mid);
+    buildRecursive(mbvh, items, start, mid);
 
-    // Right child is the next node written after the entire left subtree.
     uint32_t right_child_idx = static_cast<uint32_t>(mbvh.nodes.size());
-    buildRecursive(mbvh, draw_info, start + mid, count - mid);
+    buildRecursive(mbvh, items, start + mid, count - mid);
 
-    // Patch the right child index (left is implicit = node_idx + 1).
     mbvh.nodes[node_idx].right_or_first = right_child_idx;
 }
 
-} // anonymous namespace
-
-ModelBvh buildModelBvh(const std::vector<ObjectDrawInfo>& draw_info,
-                       const std::vector<uint32_t>& model_object_indices,
+ModelBvh buildModelBvh(const std::vector<BvhItem>& items,
+                       const std::vector<uint32_t>& model_item_indices,
                        uint32_t model_id) {
     ModelBvh mbvh;
     mbvh.model_id = model_id;
-    mbvh.object_indices = model_object_indices;
+    mbvh.item_indices = model_item_indices;
 
-    uint32_t count = static_cast<uint32_t>(model_object_indices.size());
+    uint32_t count = static_cast<uint32_t>(model_item_indices.size());
     if (count == 0) return mbvh;
 
-    // Reserve a rough estimate: ~2*n nodes for a balanced binary tree.
     mbvh.nodes.reserve(count * 2);
+    buildRecursive(mbvh, items, 0, count);
 
-    buildRecursive(mbvh, draw_info, 0, count);
-
-    // Verify: every object appears exactly once in the leaves.
     assert(!mbvh.nodes.empty());
-
     return mbvh;
 }
 
-std::shared_ptr<BvhSet> buildBvhSet(const std::vector<ObjectDrawInfo>& draw_info) {
+} // anonymous namespace
+
+std::shared_ptr<BvhSet> buildBvhSet(const std::vector<BvhItem>& items) {
     auto bvh_set = std::make_shared<BvhSet>();
 
-    // Group object indices by model_id.
-    std::unordered_map<uint32_t, std::vector<uint32_t>> model_objects;
-    for (uint32_t i = 0; i < static_cast<uint32_t>(draw_info.size()); ++i) {
-        model_objects[draw_info[i].model_id].push_back(i);
+    std::unordered_map<uint32_t, std::vector<uint32_t>> model_items;
+    for (uint32_t i = 0; i < static_cast<uint32_t>(items.size()); ++i) {
+        model_items[items[i].model_id].push_back(i);
     }
 
-    // Build per-model BVHs.
-    for (auto& [model_id, obj_indices] : model_objects) {
-        if (obj_indices.size() < BVH_MIN_OBJECTS) continue;
+    for (auto& [model_id, idxs] : model_items) {
+        if (idxs.size() < BVH_MIN_OBJECTS) continue;
 
-        ModelBvh mbvh = buildModelBvh(draw_info, obj_indices, model_id);
+        ModelBvh mbvh = buildModelBvh(items, idxs, model_id);
         bvh_set->bvh_model_ids.insert(model_id);
         bvh_set->models[model_id] = std::move(mbvh);
     }
 
     return bvh_set;
 }
-
-EboReorderResult reorderEbo(const BvhSet& bvh_set,
-                            const std::vector<ObjectDrawInfo>& draw_info,
-                            const std::vector<uint32_t>& original_ebo) {
-    EboReorderResult result;
-    result.reordered_draw_info = draw_info;  // copy; we'll update offsets
-    result.reordered_ebo.reserve(original_ebo.size());
-
-    // Track which draw_info entries have been placed.
-    std::vector<bool> placed(draw_info.size(), false);
-
-    for (const auto& [model_id, mbvh] : bvh_set.models) {
-        // DFS traversal of BVH to visit leaves in order.
-        uint32_t stack[64];
-        int sp = 0;
-        stack[sp++] = 0;
-
-        while (sp > 0) {
-            uint32_t ni = stack[--sp];
-            const BvhNode& node = mbvh.nodes[ni];
-
-            if (node.count > 0) {
-                // Leaf: emit objects in order.
-                for (uint32_t i = 0; i < node.count; ++i) {
-                    uint32_t oi = mbvh.object_indices[node.right_or_first + i];
-                    if (placed[oi]) continue;
-                    placed[oi] = true;
-
-                    const auto& old_info = draw_info[oi];
-                    uint32_t new_offset = static_cast<uint32_t>(
-                        result.reordered_ebo.size() * sizeof(uint32_t));
-
-                    // Copy indices from original EBO.
-                    uint32_t idx_start = old_info.index_offset / sizeof(uint32_t);
-                    uint32_t idx_count = old_info.index_count;
-                    for (uint32_t j = 0; j < idx_count; ++j) {
-                        result.reordered_ebo.push_back(original_ebo[idx_start + j]);
-                    }
-
-                    result.reordered_draw_info[oi].index_offset = new_offset;
-                }
-            } else {
-                // Interior: push left (=ni+1) last so it's processed first.
-                stack[sp++] = node.right_or_first;  // right child
-                stack[sp++] = ni + 1;                // left child
-            }
-        }
-    }
-
-    // Append non-BVH objects (models too small for BVH).
-    for (uint32_t oi = 0; oi < static_cast<uint32_t>(draw_info.size()); ++oi) {
-        if (placed[oi]) continue;
-        placed[oi] = true;
-
-        const auto& old_info = draw_info[oi];
-        uint32_t new_offset = static_cast<uint32_t>(
-            result.reordered_ebo.size() * sizeof(uint32_t));
-
-        uint32_t idx_start = old_info.index_offset / sizeof(uint32_t);
-        uint32_t idx_count = old_info.index_count;
-        for (uint32_t j = 0; j < idx_count; ++j) {
-            result.reordered_ebo.push_back(original_ebo[idx_start + j]);
-        }
-
-        result.reordered_draw_info[oi].index_offset = new_offset;
-    }
-
-    assert(result.reordered_ebo.size() == original_ebo.size());
-
-    return result;
-}
diff --git a/src/ifcviewer/BvhAccel.h b/src/ifcviewer/BvhAccel.h
index 21c57c2712a..a2cb6a13163 100644
--- a/src/ifcviewer/BvhAccel.h
+++ b/src/ifcviewer/BvhAccel.h
@@ -26,22 +26,22 @@
 #include <unordered_set>
 #include <memory>
 
-struct ObjectDrawInfo {
-    uint32_t index_offset;  // byte offset into EBO
-    uint32_t index_count;   // number of indices
-    uint32_t model_id;      // which model this object belongs to
-    float aabb_min[3];      // world-space AABB
-    float aabb_max[3];
+// Generic BVH item — anything with a world AABB and a model_id.
+// For the instanced renderer each item represents one InstanceCpu.
+struct BvhItem {
+    float    aabb_min[3];
+    float    aabb_max[3];
+    uint32_t model_id;
 };
 
 static constexpr uint32_t BVH_MAX_LEAF_SIZE = 8;
-static constexpr uint32_t BVH_MIN_OBJECTS = 32;
+static constexpr uint32_t BVH_MIN_OBJECTS   = 32;
 
 struct BvhNode {
-    float aabb_min[3];
-    float aabb_max[3];
-    uint32_t right_or_first; // interior: right child index (left is always this_index+1); leaf: first object index
-    uint16_t count;           // 0 = interior; >0 = leaf with this many objects
+    float    aabb_min[3];
+    float    aabb_max[3];
+    uint32_t right_or_first; // interior: right child index (left is always this_index+1); leaf: first item index
+    uint16_t count;           // 0 = interior; >0 = leaf with this many items
     uint16_t axis;            // split axis (0/1/2) for interior; unused for leaf
 };
 static_assert(sizeof(BvhNode) == 32, "BvhNode must be 32 bytes for cache alignment and sidecar format");
@@ -49,7 +49,7 @@ static_assert(sizeof(BvhNode) == 32, "BvhNode must be 32 bytes for cache alignme
 struct ModelBvh {
     uint32_t model_id = 0;
     std::vector<BvhNode> nodes;
-    std::vector<uint32_t> object_indices;  // indices into object_draw_info_
+    std::vector<uint32_t> item_indices;  // indices into the model's InstanceCpu array
 };
 
 struct BvhSet {
@@ -57,19 +57,10 @@ struct BvhSet {
     std::unordered_set<uint32_t> bvh_model_ids;
 };
 
-struct EboReorderResult {
-    std::vector<uint32_t> reordered_ebo;
-    std::vector<ObjectDrawInfo> reordered_draw_info;
-};
-
-// Build BVH trees for all models in the given draw info snapshot.
-// Only builds the tree structure; does not touch EBO data.
-std::shared_ptr<BvhSet> buildBvhSet(const std::vector<ObjectDrawInfo>& draw_info);
-
-// Reorder the EBO so objects within each BVH leaf are contiguous.
-// Must be called with the CURRENT run's EBO and draw_info (not cached).
-EboReorderResult reorderEbo(const BvhSet& bvh_set,
-                            const std::vector<ObjectDrawInfo>& draw_info,
-                            const std::vector<uint32_t>& original_ebo);
+// Build BVH trees for all models in the given item snapshot.
+// Items are expected to already be grouped/filtered by caller if needed.
+// item_indices in the result reference positions within the full `items`
+// vector — callers providing a single model's items will see 0..N-1.
+std::shared_ptr<BvhSet> buildBvhSet(const std::vector<BvhItem>& items);
 
 #endif // BVHACCEL_H
diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
index 54b37df70ca..226fb0808ca 100644
--- a/src/ifcviewer/GeometryStreamer.cpp
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -20,16 +20,52 @@
 #include "GeometryStreamer.h"
 #include "AppSettings.h"
 #include "../ifcgeom/hybrid_kernel.h"
+#include "../ifcgeom/taxonomy.h"
+
+#include <Eigen/Dense>
 
 #include <thread>
 #include <unordered_map>
 #include <cmath>
 #include <cstring>
 #include <algorithm>
+#include <limits>
 
 #include <QDebug>
 #include <QElapsedTimer>
 
+struct MaterialInfo {
+    float r = 0.75f, g = 0.75f, b = 0.78f, a = 1.0f;
+};
+
+static MaterialInfo materialFromStyle(const ifcopenshell::geometry::taxonomy::style::ptr& style) {
+    MaterialInfo m;
+    if (!style) return m;
+    const auto& color = style->get_color();
+    if (color) {
+        m.r = static_cast<float>(color.r());
+        m.g = static_cast<float>(color.g());
+        m.b = static_cast<float>(color.b());
+    }
+    if (!std::isnan(style->transparency)) {
+        m.a = 1.0f - static_cast<float>(style->transparency);
+    }
+    return m;
+}
+
+static inline uint32_t packRGBA8(const MaterialInfo& m) {
+    auto to_byte = [](float v) -> uint32_t {
+        float c = std::clamp(v, 0.0f, 1.0f);
+        return static_cast<uint32_t>(c * 255.0f + 0.5f);
+    };
+    uint32_t r = to_byte(m.r);
+    uint32_t g = to_byte(m.g);
+    uint32_t b = to_byte(m.b);
+    uint32_t a = to_byte(m.a);
+    // Little-endian byte layout [r,g,b,a] for GL_UNSIGNED_BYTE * 4 normalized.
+    return r | (g << 8) | (b << 16) | (a << 24);
+}
+
 GeometryStreamer::GeometryStreamer(QObject* parent)
     : QObject(parent)
 {
@@ -96,6 +132,130 @@ std::vector<ElementInfo> GeometryStreamer::drainElements() {
     return result;
 }
 
+// Build a mesh chunk (local coords, 28-byte interleaved vertices) from a
+// TriangulationElement. Per-vertex color is baked from material_ids so that
+// triangulations with per-face materials still render correctly.
+static MeshChunk buildMeshChunk(uint32_t model_id,
+                                uint32_t local_mesh_id,
+                                const IfcGeom::TriangulationElement* elem) {
+    MeshChunk chunk;
+    chunk.model_id = model_id;
+    chunk.local_mesh_id = local_mesh_id;
+
+    const auto& geom = elem->geometry();
+    const auto& verts = geom.verts();
+    const auto& faces = geom.faces();
+    const auto& normals = geom.normals();
+    const auto& materials = geom.materials();
+    const auto& material_ids = geom.material_ids();
+
+    if (verts.empty() || faces.empty()) return chunk;
+
+    const size_t num_verts_src = verts.size() / 3;
+    const size_t num_tris = faces.size() / 3;
+    const bool have_per_tri_material = (material_ids.size() == num_tris);
+
+    // Dedupe (original vertex index, material id) so vertices shared across
+    // triangles of the same material stay shared; vertices spanning multiple
+    // materials are split (per-face color demands it).
+    auto make_key = [](uint32_t orig_idx, int mat_id) -> uint64_t {
+        return (static_cast<uint64_t>(orig_idx) << 32) | static_cast<uint32_t>(mat_id);
+    };
+
+    std::unordered_map<uint64_t, uint32_t> remap;
+    remap.reserve(num_verts_src);
+
+    chunk.vertices.reserve(num_verts_src * INSTANCED_VERTEX_STRIDE_FLOATS);
+    chunk.indices.reserve(faces.size());
+
+    // Track local AABB as we emit vertices.
+    float amin[3] = { std::numeric_limits<float>::max(),
+                      std::numeric_limits<float>::max(),
+                      std::numeric_limits<float>::max() };
+    float amax[3] = { -std::numeric_limits<float>::max(),
+                      -std::numeric_limits<float>::max(),
+                      -std::numeric_limits<float>::max() };
+
+    auto emit_vertex = [&](uint32_t orig_idx, int mat_id) -> uint32_t {
+        const uint64_t key = make_key(orig_idx, mat_id);
+        auto it = remap.find(key);
+        if (it != remap.end()) return it->second;
+
+        const uint32_t new_idx = static_cast<uint32_t>(
+            chunk.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS);
+
+        float px = static_cast<float>(verts[orig_idx * 3 + 0]);
+        float py = static_cast<float>(verts[orig_idx * 3 + 1]);
+        float pz = static_cast<float>(verts[orig_idx * 3 + 2]);
+        chunk.vertices.push_back(px);
+        chunk.vertices.push_back(py);
+        chunk.vertices.push_back(pz);
+        if (px < amin[0]) amin[0] = px; if (px > amax[0]) amax[0] = px;
+        if (py < amin[1]) amin[1] = py; if (py > amax[1]) amax[1] = py;
+        if (pz < amin[2]) amin[2] = pz; if (pz > amax[2]) amax[2] = pz;
+
+        if (orig_idx * 3 + 2 < normals.size()) {
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 0]));
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 1]));
+            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 2]));
+        } else {
+            chunk.vertices.push_back(0.0f);
+            chunk.vertices.push_back(1.0f);
+            chunk.vertices.push_back(0.0f);
+        }
+
+        MaterialInfo m;
+        if (mat_id >= 0 && mat_id < static_cast<int>(materials.size())) {
+            m = materialFromStyle(materials[mat_id]);
+        }
+        uint32_t packed = packRGBA8(m);
+        float packed_as_float;
+        std::memcpy(&packed_as_float, &packed, sizeof(float));
+        chunk.vertices.push_back(packed_as_float);
+
+        remap.emplace(key, new_idx);
+        return new_idx;
+    };
+
+    for (size_t t = 0; t < num_tris; ++t) {
+        const int mat_id = have_per_tri_material ? material_ids[t] : -1;
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 0]), mat_id));
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 1]), mat_id));
+        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 2]), mat_id));
+    }
+
+    if (chunk.vertices.empty()) {
+        for (int a = 0; a < 3; ++a) amin[a] = amax[a] = 0.0f;
+    }
+    for (int a = 0; a < 3; ++a) {
+        chunk.local_aabb_min[a] = amin[a];
+        chunk.local_aabb_max[a] = amax[a];
+    }
+    return chunk;
+}
+
+// Compute the world-space AABB by transforming the 8 corners of the local
+// AABB through the column-major 4x4 transform.
+static void worldAabbFromLocal(const float local_min[3],
+                               const float local_max[3],
+                               const float M[16],
+                               float out_min[3], float out_max[3]) {
+    out_min[0] = out_min[1] = out_min[2] =  std::numeric_limits<float>::max();
+    out_max[0] = out_max[1] = out_max[2] = -std::numeric_limits<float>::max();
+    for (int c = 0; c < 8; ++c) {
+        float x = (c & 1) ? local_max[0] : local_min[0];
+        float y = (c & 2) ? local_max[1] : local_min[1];
+        float z = (c & 4) ? local_max[2] : local_min[2];
+        // Column-major: world = M * [x,y,z,1].
+        float wx = M[0]*x + M[4]*y + M[8]*z  + M[12];
+        float wy = M[1]*x + M[5]*y + M[9]*z  + M[13];
+        float wz = M[2]*x + M[6]*y + M[10]*z + M[14];
+        if (wx < out_min[0]) out_min[0] = wx; if (wx > out_max[0]) out_max[0] = wx;
+        if (wy < out_min[1]) out_min[1] = wy; if (wy > out_max[1]) out_max[1] = wy;
+        if (wz < out_min[2]) out_min[2] = wz; if (wz > out_max[2]) out_max[2] = wz;
+    }
+}
+
 void GeometryStreamer::run(const std::string& path, int num_threads) {
     try {
         ifc_file_ = std::make_unique<ifcopenshell::file>(path);
@@ -105,7 +265,9 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
     }
 
     ifcopenshell::geometry::Settings settings;
-    settings.set("use-world-coords", true);
+    // Instancing path: geometry stays in local coords; the transform is
+    // applied on the GPU per instance.
+    settings.set("use-world-coords", false);
     settings.set("weld-vertices", false);
     settings.set("apply-default-materials", true);
 
@@ -129,17 +291,14 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
 
     int last_progress = 0;
 
-    // Instancing analysis: count shapes grouped by representation id.
-    struct GeomStat {
-        uint32_t count = 0;
-        size_t vertex_count = 0;
-        size_t index_count = 0;
-        std::string example_type;
-    };
-    std::unordered_map<std::string, GeomStat> geom_stats;
+    // geom.id() → local_mesh_id within this model.
+    std::unordered_map<std::string, uint32_t> geom_to_local_mesh_id;
+    // local_mesh_id → (local AABB) so we can derive world AABBs for later instances.
+    struct MeshAabb { float lmin[3], lmax[3]; };
+    std::vector<MeshAabb> mesh_aabbs;
+
     uint32_t total_shapes = 0;
-    size_t total_vertices = 0;
-    size_t total_indices = 0;
+    uint32_t total_meshes = 0;
     QElapsedTimer stream_timer;
     stream_timer.start();
 
@@ -152,9 +311,12 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
         const auto* tri_elem = dynamic_cast<const IfcGeom::TriangulationElement*>(elem);
         if (!tri_elem) continue;
 
+        const auto& geom = tri_elem->geometry();
+        if (geom.verts().empty() || geom.faces().empty()) continue;
+
         uint32_t object_id = next_object_id_++;
 
-        // Record element metadata
+        // Element metadata.
         ElementInfo info;
         info.object_id = object_id;
         info.model_id = model_id_;
@@ -163,36 +325,62 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
         info.name = tri_elem->name();
         info.type = tri_elem->type();
         info.parent_id = tri_elem->parent_id();
+        {
+            std::lock_guard<std::mutex> lock(elements_mutex_);
+            pending_elements_.push_back(std::move(info));
+        }
 
-        // Instancing stats: key by representation id, count unique vs repeated.
-        const auto& geom = tri_elem->geometry();
+        // Representation dedup.
         const std::string& geom_id = geom.id();
-        size_t nv = geom.verts().size() / 3;
-        size_t ni = geom.faces().size();
-        if (!geom_id.empty()) {
-            auto& gs = geom_stats[geom_id];
-            gs.count++;
-            if (gs.count == 1) {
-                gs.vertex_count = nv;
-                gs.index_count = ni;
-                gs.example_type = info.type;
+        uint32_t local_mesh_id;
+        bool first_sight = false;
+        if (geom_id.empty()) {
+            // No representation key — treat as unique.
+            local_mesh_id = total_meshes++;
+            first_sight = true;
+        } else {
+            auto it = geom_to_local_mesh_id.find(geom_id);
+            if (it == geom_to_local_mesh_id.end()) {
+                local_mesh_id = total_meshes++;
+                geom_to_local_mesh_id.emplace(geom_id, local_mesh_id);
+                first_sight = true;
+            } else {
+                local_mesh_id = it->second;
             }
         }
-        total_shapes++;
-        total_vertices += nv;
-        total_indices += ni;
 
-        {
-            std::lock_guard<std::mutex> lock(elements_mutex_);
-            pending_elements_.push_back(std::move(info));
+        if (first_sight) {
+            MeshChunk mesh_chunk = buildMeshChunk(model_id_, local_mesh_id, tri_elem);
+            MeshAabb ma;
+            for (int a = 0; a < 3; ++a) {
+                ma.lmin[a] = mesh_chunk.local_aabb_min[a];
+                ma.lmax[a] = mesh_chunk.local_aabb_max[a];
+            }
+            if (mesh_aabbs.size() <= local_mesh_id) mesh_aabbs.resize(local_mesh_id + 1);
+            mesh_aabbs[local_mesh_id] = ma;
+            if (!mesh_chunk.indices.empty()) {
+                emit meshReady(std::move(mesh_chunk));
+            }
         }
 
-        // Convert geometry to upload chunk
-        UploadChunk chunk = convertElement(tri_elem, object_id);
-        if (!chunk.indices.empty()) {
-            emit elementReady(std::move(chunk));
+        // Transform (column-major 4x4, cast to float).
+        const Eigen::Matrix4d& mat_d = tri_elem->transformation().data()->ccomponents();
+        InstanceChunk inst;
+        inst.model_id = model_id_;
+        inst.local_mesh_id = local_mesh_id;
+        inst.object_id = object_id;
+        inst.color_override_rgba8 = 0; // 0 = use baked vertex color
+        for (int i = 0; i < 16; ++i) {
+            inst.transform[i] = static_cast<float>(mat_d.data()[i]);
         }
 
+        const MeshAabb& ma = mesh_aabbs[local_mesh_id];
+        worldAabbFromLocal(ma.lmin, ma.lmax, inst.transform,
+                           inst.world_aabb_min, inst.world_aabb_max);
+
+        emit instanceReady(std::move(inst));
+        total_shapes++;
+
         int p = iterator->progress();
         if (p != last_progress) {
             last_progress = p;
@@ -204,188 +392,9 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
     progress_ = 100;
     emit progressChanged(100);
 
-    // === Instancing report ===
-    {
-        size_t unique_geoms = geom_stats.size();
-        size_t unique_vertices = 0;
-        size_t unique_indices = 0;
-        size_t repeated_shapes = 0; // total shapes that share a repr with another
-        for (const auto& [gid, gs] : geom_stats) {
-            unique_vertices += gs.vertex_count;
-            unique_indices += gs.index_count;
-            if (gs.count > 1) repeated_shapes += gs.count;
-        }
-
-        // Bytes assuming current layout (32 B/vertex, 4 B/index).
-        size_t baked_vbo_bytes = total_vertices * 32;
-        size_t baked_ebo_bytes = total_indices * 4;
-        size_t instanced_vbo_bytes = unique_vertices * 32;
-        size_t instanced_ebo_bytes = unique_indices * 4;
-        // Per-instance data: 64 B transform + 8 B (object_id + color).
-        size_t per_instance_bytes = 72;
-        size_t instance_ssbo_bytes = total_shapes * per_instance_bytes;
-
-        double dedup_ratio = unique_geoms > 0
-            ? static_cast<double>(total_shapes) / static_cast<double>(unique_geoms)
-            : 1.0;
-
-        qDebug("=== Instancing analysis: %s ===", path.c_str());
-        qDebug("  Stream time: %.2f s", stream_timer.elapsed() / 1000.0);
-        qDebug("  Total shapes:      %u", total_shapes);
-        qDebug("  Unique geometries: %zu  (dedup ratio %.2fx)",
-               unique_geoms, dedup_ratio);
-        qDebug("  Repeated shapes:   %zu  (%.1f%% of total)",
-               repeated_shapes,
-               total_shapes > 0 ? 100.0 * repeated_shapes / total_shapes : 0.0);
-        qDebug("  Baked geometry:    VBO %.1f MB + EBO %.1f MB = %.1f MB",
-               baked_vbo_bytes / (1024.0*1024.0),
-               baked_ebo_bytes / (1024.0*1024.0),
-               (baked_vbo_bytes + baked_ebo_bytes) / (1024.0*1024.0));
-        qDebug("  If instanced:      VBO %.1f MB + EBO %.1f MB + SSBO %.1f MB = %.1f MB",
-               instanced_vbo_bytes / (1024.0*1024.0),
-               instanced_ebo_bytes / (1024.0*1024.0),
-               instance_ssbo_bytes / (1024.0*1024.0),
-               (instanced_vbo_bytes + instanced_ebo_bytes + instance_ssbo_bytes)
-                   / (1024.0*1024.0));
-        size_t baked_total = baked_vbo_bytes + baked_ebo_bytes;
-        size_t inst_total = instanced_vbo_bytes + instanced_ebo_bytes + instance_ssbo_bytes;
-        if (inst_total > 0 && baked_total > inst_total) {
-            qDebug("  Potential savings: %.1f MB (%.1f%%)",
-                   (baked_total - inst_total) / (1024.0*1024.0),
-                   100.0 * (baked_total - inst_total) / baked_total);
-        } else {
-            qDebug("  Potential savings: none (instance overhead exceeds dedup win)");
-        }
-
-        // Top-5 most duplicated representations.
-        std::vector<std::pair<std::string, GeomStat>> sorted(geom_stats.begin(), geom_stats.end());
-        std::partial_sort(sorted.begin(),
-                          sorted.begin() + std::min<size_t>(5, sorted.size()),
-                          sorted.end(),
-                          [](const auto& a, const auto& b) { return a.second.count > b.second.count; });
-        qDebug("  Top duplicated representations:");
-        for (size_t i = 0; i < std::min<size_t>(5, sorted.size()); ++i) {
-            const auto& [gid, gs] = sorted[i];
-            qDebug("    [%zu] count=%u  verts=%zu  type=%s  repr_id=%s",
-                   i + 1, gs.count, gs.vertex_count,
-                   gs.example_type.c_str(), gid.c_str());
-        }
-    }
-}
-
-static MaterialInfo materialFromStyle(const ifcopenshell::geometry::taxonomy::style::ptr& style) {
-    MaterialInfo m;
-    if (!style) return m;
-
-    const auto& color = style->get_color();
-    if (color) {
-        m.r = static_cast<float>(color.r());
-        m.g = static_cast<float>(color.g());
-        m.b = static_cast<float>(color.b());
-    }
-    if (!std::isnan(style->transparency)) {
-        m.a = 1.0f - static_cast<float>(style->transparency);
-    }
-    return m;
-}
-
-static inline uint32_t packRGBA8(const MaterialInfo& m) {
-    auto to_byte = [](float v) -> uint32_t {
-        float c = std::clamp(v, 0.0f, 1.0f);
-        return static_cast<uint32_t>(c * 255.0f + 0.5f);
-    };
-    uint32_t r = to_byte(m.r);
-    uint32_t g = to_byte(m.g);
-    uint32_t b = to_byte(m.b);
-    uint32_t a = to_byte(m.a);
-    // Layout in memory (little-endian) reads as bytes [r, g, b, a] which is
-    // what the GL_UNSIGNED_BYTE * 4 normalized vertex attribute expects.
-    return r | (g << 8) | (b << 16) | (a << 24);
-}
-
-UploadChunk GeometryStreamer::convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id) {
-    UploadChunk chunk;
-    chunk.object_id = object_id;
-    chunk.model_id = model_id_;
-
-    const auto& geom = elem->geometry();
-    const auto& verts = geom.verts();
-    const auto& faces = geom.faces();
-    const auto& normals = geom.normals();
-    const auto& materials = geom.materials();
-    const auto& material_ids = geom.material_ids();
-
-    if (verts.empty() || faces.empty()) return chunk;
-
-    // Encode object_id as float bits for the vertex attribute
-    float id_as_float;
-    static_assert(sizeof(float) == sizeof(uint32_t));
-    std::memcpy(&id_as_float, &object_id, sizeof(float));
-
-    const size_t num_verts = verts.size() / 3;
-    const size_t num_tris = faces.size() / 3;
-    const bool have_per_tri_material = (material_ids.size() == num_tris);
-
-    // Per-vertex color requires that any vertex shared between triangles with
-    // *different* materials be split. We dedupe (orig_vert_idx, mat_id) pairs
-    // so vertices that are only ever used by one material stay shared.
-    auto make_key = [](uint32_t orig_idx, int mat_id) -> uint64_t {
-        return (static_cast<uint64_t>(orig_idx) << 32) |
-               static_cast<uint32_t>(mat_id);
-    };
-
-    std::unordered_map<uint64_t, uint32_t> remap;
-    remap.reserve(num_verts);
-
-    chunk.vertices.reserve(num_verts * 8);
-    chunk.indices.reserve(faces.size());
-
-    auto emit_vertex = [&](uint32_t orig_idx, int mat_id) -> uint32_t {
-        const uint64_t key = make_key(orig_idx, mat_id);
-        auto it = remap.find(key);
-        if (it != remap.end()) return it->second;
-
-        const uint32_t new_idx = static_cast<uint32_t>(chunk.vertices.size() / 8);
-
-        // pos
-        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 0]));
-        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 1]));
-        chunk.vertices.push_back(static_cast<float>(verts[orig_idx * 3 + 2]));
-
-        // normal
-        if (orig_idx * 3 + 2 < normals.size()) {
-            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 0]));
-            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 1]));
-            chunk.vertices.push_back(static_cast<float>(normals[orig_idx * 3 + 2]));
-        } else {
-            chunk.vertices.push_back(0.0f);
-            chunk.vertices.push_back(1.0f);
-            chunk.vertices.push_back(0.0f);
-        }
-
-        // object_id (float bits)
-        chunk.vertices.push_back(id_as_float);
-
-        // color (packed RGBA8 reinterpreted as float)
-        MaterialInfo m;
-        if (mat_id >= 0 && mat_id < static_cast<int>(materials.size())) {
-            m = materialFromStyle(materials[mat_id]);
-        }
-        uint32_t packed = packRGBA8(m);
-        float packed_as_float;
-        std::memcpy(&packed_as_float, &packed, sizeof(float));
-        chunk.vertices.push_back(packed_as_float);
-
-        remap.emplace(key, new_idx);
-        return new_idx;
-    };
-
-    for (size_t t = 0; t < num_tris; ++t) {
-        const int mat_id = have_per_tri_material ? material_ids[t] : -1;
-        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 0]), mat_id));
-        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 1]), mat_id));
-        chunk.indices.push_back(emit_vertex(static_cast<uint32_t>(faces[t * 3 + 2]), mat_id));
-    }
-
-    return chunk;
+    double dedup_ratio = total_meshes > 0
+        ? static_cast<double>(total_shapes) / static_cast<double>(total_meshes) : 1.0;
+    qDebug("Streamer done: %s  %.2fs  shapes=%u  unique_meshes=%u  dedup=%.2fx",
+           path.c_str(), stream_timer.elapsed() / 1000.0,
+           total_shapes, total_meshes, dedup_ratio);
 }
diff --git a/src/ifcviewer/GeometryStreamer.h b/src/ifcviewer/GeometryStreamer.h
index 0d49a12ca70..f6201517ad1 100644
--- a/src/ifcviewer/GeometryStreamer.h
+++ b/src/ifcviewer/GeometryStreamer.h
@@ -26,15 +26,13 @@
 #include <string>
 #include <vector>
 #include <atomic>
-#include <functional>
 #include <memory>
 #include <mutex>
-#include <deque>
 
 #include "../ifcparse/file.h"
 #include "../ifcgeom/Iterator.h"
 
-#include "ViewportWindow.h"
+#include "InstancedGeometry.h"
 
 struct ElementInfo {
     uint32_t object_id;
@@ -67,15 +65,14 @@ class GeometryStreamer : public QObject {
 
 signals:
     void progressChanged(int percent);
-    void elementReady(UploadChunk chunk);
+    void meshReady(MeshChunk chunk);
+    void instanceReady(InstanceChunk chunk);
     void finished();
     void errorOccurred(const QString& message);
 
 private:
     void run(const std::string& path, int num_threads);
 
-    UploadChunk convertElement(const IfcGeom::TriangulationElement* elem, uint32_t object_id);
-
     std::unique_ptr<ifcopenshell::file> ifc_file_;
     std::unique_ptr<QThread> worker_thread_;
     std::atomic<bool> running_{false};
@@ -85,7 +82,7 @@ class GeometryStreamer : public QObject {
     std::mutex elements_mutex_;
     std::vector<ElementInfo> pending_elements_;
 
-    uint32_t next_object_id_ = 1; // 0 = no object
+    uint32_t next_object_id_ = 1;
     uint32_t model_id_ = 0;
 };
 
diff --git a/src/ifcviewer/InstancedGeometry.h b/src/ifcviewer/InstancedGeometry.h
new file mode 100644
index 00000000000..1c027976ef1
--- /dev/null
+++ b/src/ifcviewer/InstancedGeometry.h
@@ -0,0 +1,103 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef INSTANCEDGEOMETRY_H
+#define INSTANCEDGEOMETRY_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+// Per-vertex layout for instanced meshes, stored in mesh-local coordinates.
+// 28 bytes per vertex, interleaved in this order:
+//   pos(3 float)    -- 12 B, byte offset 0
+//   normal(3 float) -- 12 B, byte offset 12
+//   color(4 bytes RGBA8, read as GL_UNSIGNED_BYTE*4 normalized) -- 4 B, byte offset 24
+static constexpr int INSTANCED_VERTEX_STRIDE_BYTES = 28;  // 12 + 12 + 4
+static constexpr int INSTANCED_VERTEX_STRIDE_FLOATS = 7;  // 28 B / 4 B per float; the packed color fills one float slot
+
+// Per-mesh metadata on the CPU side.  Each mesh owns a slice of the model's
+// VBO (vertices in local coordinates) and EBO (mesh-local indices).
+struct MeshInfo {
+    uint32_t vbo_byte_offset = 0;    // where this mesh's vertices start
+    uint32_t vertex_count    = 0;    // number of vertices in this mesh's VBO slice
+    uint32_t ebo_byte_offset = 0;    // where this mesh's indices start
+    uint32_t index_count     = 0;    // number of indices in this mesh's EBO slice
+    float    local_aabb_min[3]{};    // bounding-box min corner in local coords
+    float    local_aabb_max[3]{};    // bounding-box max corner in local coords
+    uint32_t first_instance  = 0;    // index into per-model instances array
+    uint32_t instance_count  = 0;    // presumably the length of the run starting at first_instance -- confirm
+};
+static_assert(sizeof(MeshInfo) == 48, "MeshInfo must be 48 bytes");  // 6 x uint32_t + 6 x float
+
+// Per-instance record uploaded to an SSBO and read by the vertex shader.
+// Layout deliberately matches std430 expectations (80 B in total):
+//   mat4 transform (64 B column-major)
+//   uint object_id
+//   uint color_override_rgba8   -- 0 = use baked vertex color, else override
+//   uint _pad0, _pad1           -- align to 16 for std430
+struct alignas(16) InstanceGpu {
+    float    transform[16]{};            // zero-initialised like InstanceCpu/InstanceChunk so no byte is ever indeterminate
+    uint32_t object_id            = 0;
+    uint32_t color_override_rgba8 = 0;
+    uint32_t _pad0                = 0;
+    uint32_t _pad1                = 0;
+};
+static_assert(sizeof(InstanceGpu) == 80, "InstanceGpu must be 80 bytes");
+
+// CPU-side per-instance data.  The InstanceGpu record is derived from this;
+// we also retain the world AABB for BVH construction and the mesh_id.
+struct InstanceCpu {
+    uint32_t mesh_id              = 0;  // index into meshes array
+    uint32_t object_id            = 0;  // same field as InstanceGpu::object_id
+    uint32_t color_override_rgba8 = 0;  // 0 = use baked vertex color, else override
+    uint32_t model_id             = 0;  // presumably the owning model's id -- confirm against the streamer
+    float    transform[16]{};           // presumably column-major like InstanceGpu::transform -- confirm
+    float    world_aabb_min[3]{};       // world-space bounding-box min corner
+    float    world_aabb_max[3]{};       // world-space bounding-box max corner
+};
+
+// Chunks emitted by the streamer to the viewport (main thread).
+
+// Emitted the first time a representation id is seen.  Carries the mesh
+// geometry in local coords.  `local_mesh_id` is the streamer-assigned id
+// within this model.
+struct MeshChunk {
+    uint32_t model_id      = 0;      // presumably the model this mesh belongs to -- confirm against the streamer
+    uint32_t local_mesh_id = 0;      // streamer-assigned, scoped to model_id
+    std::vector<float>    vertices;  // 7 floats * N_verts (pos3+norm3+color1_packed)
+    std::vector<uint32_t> indices;   // presumably relative to this chunk's first vertex -- confirm
+    float    local_aabb_min[3]{};    // bounding-box min corner in local coords
+    float    local_aabb_max[3]{};    // bounding-box max corner in local coords
+};
+
+// Emitted for every placement (every triangulation element from the
+// iterator).  For the first instance of a mesh, the MeshChunk is emitted
+// just before this.
+struct InstanceChunk {
+    uint32_t model_id             = 0;  // presumably the model this placement belongs to -- confirm
+    uint32_t local_mesh_id        = 0;  // presumably matches MeshChunk::local_mesh_id of the mesh being placed
+    uint32_t object_id            = 0;  // same field as InstanceGpu::object_id
+    uint32_t color_override_rgba8 = 0;  // 0 = use baked vertex color, else override
+    float    transform[16]{};           // presumably column-major like InstanceGpu::transform -- confirm
+    float    world_aabb_min[3]{};       // world-space bounding-box min corner
+    float    world_aabb_max[3]{};       // world-space bounding-box max corner
+};
+
+#endif // INSTANCEDGEOMETRY_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index b5ee3581c44..86a787a0e26 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -173,8 +173,10 @@ void MainWindow::addFiles(const QStringList& paths) {
 void MainWindow::connectStreamer(GeometryStreamer* streamer) {
     connect(streamer, &GeometryStreamer::progressChanged,
             this, &MainWindow::onProgressChanged, Qt::QueuedConnection);
-    connect(streamer, &GeometryStreamer::elementReady,
-            this, &MainWindow::onElementReady, Qt::QueuedConnection);
+    connect(streamer, &GeometryStreamer::meshReady,
+            this, &MainWindow::onMeshReady, Qt::QueuedConnection);
+    connect(streamer, &GeometryStreamer::instanceReady,
+            this, &MainWindow::onInstanceReady, Qt::QueuedConnection);
     connect(streamer, &GeometryStreamer::finished,
             this, &MainWindow::onStreamingFinished, Qt::QueuedConnection);
     connect(streamer, &GeometryStreamer::errorOccurred, this, [this](const QString& msg) {
@@ -208,7 +210,7 @@ void MainWindow::startNextLoad() {
         qDebug("  Sidecar read: %lld ms (%s)", rt.elapsed(), ifc_path.c_str());
         auto result = std::make_shared<std::optional<SidecarData>>(std::move(cached));
         QMetaObject::invokeMethod(this, [this, mid, result]() {
-            if (*result && !(*result)->draw_info.empty()) {
+            if (*result && !(*result)->meshes.empty()) {
                 applySidecarData(mid, std::move(**result));
             } else {
                 // No sidecar — fall back to streaming from IFC.
@@ -227,51 +229,10 @@ void MainWindow::startNextLoad() {
     });
 }
 
-void MainWindow::applySidecarData(ModelId mid, SidecarData data) {
-    auto it = models_.find(mid);
-    if (it == models_.end()) return;
-    auto& model = it->second;
-
-    QElapsedTimer t;
-
-    qDebug("Sidecar hit: %s (%zu objects, %zu verts, %zu indices, %.1f MB)",
-           model.file_path.toStdString().c_str(), data.draw_info.size(),
-           data.vertices.size() / 8, data.indices.size(),
-           (data.vertices.size() * 4 + data.indices.size() * 4) / (1024.0 * 1024.0));
-
-    // GL upload — fast, single buffer copy.
-    t.start();
-    viewport_->uploadBulk(mid, data.vertices, data.indices,
-                          data.draw_info, std::move(data.bvh_set));
-    qDebug("  GL upload: %lld ms", t.elapsed());
-
-    // Update next_object_id_ past all objects in this model.
-    for (const auto& elem : data.elements) {
-        if (elem.object_id >= next_object_id_)
-            next_object_id_ = elem.object_id + 1;
-    }
-
-    // Suppress per-item layout recalcs while building the tree.
-    t.restart();
-    element_tree_->setUpdatesEnabled(false);
-    populateTreeFromSidecar(model, data.elements, data.string_table);
-    element_tree_->setUpdatesEnabled(true);
-    qDebug("  Tree build: %lld ms (%zu elements)", t.elapsed(), data.elements.size());
-
-    progress_bar_->setVisible(false);
-
-    qint64 ms = load_timer_.elapsed();
-    QString elapsed = (ms >= 1000)
-        ? QString::number(ms / 1000.0, 'f', 2) + " s"
-        : QString::number(ms) + " ms";
-
-    status_label_->setText(QString("%1 elements across %2 model(s) — loaded from cache in %3")
-        .arg(element_map_.size())
-        .arg(models_.size())
-        .arg(elapsed));
-
-    loading_model_id_ = 0;
-    QTimer::singleShot(0, this, &MainWindow::startNextLoad);
+void MainWindow::applySidecarData(ModelId /*mid*/, SidecarData /*data*/) {
+    // Commit A stub: readSidecar() always returns nullopt, so this is unreachable.
+    // The real body is restored in Commit B along with the v4 on-disk format.
+    qWarning("applySidecarData called but sidecar is disabled in Commit A");  // NOTE(review): if ever reached, nothing here clears loading_model_id_ or re-queues startNextLoad()
 }
 
 void MainWindow::populateTreeFromSidecar(ModelHandle& model,
@@ -325,8 +286,12 @@ void MainWindow::onProgressChanged(int percent) {
     progress_bar_->setValue(percent);
 }
 
-void MainWindow::onElementReady(UploadChunk chunk) {
-    viewport_->uploadChunk(chunk);
+void MainWindow::onMeshReady(MeshChunk chunk) {  // slot for GeometryStreamer::meshReady (queued connection)
+    viewport_->uploadMeshChunk(chunk);            // forward the mesh chunk to the viewport unchanged
+}
+
+void MainWindow::onInstanceReady(InstanceChunk chunk) {  // slot for GeometryStreamer::instanceReady (queued connection)
+    viewport_->uploadInstanceChunk(chunk);                // forward the placement record to the viewport unchanged
 }
 
 void MainWindow::onStreamingFinished() {
@@ -355,39 +320,10 @@ void MainWindow::onStreamingFinished() {
         .arg(num_models)
         .arg(elapsed));
 
-    // Build BVH and write sidecar (geometry + metadata + BVH).
+    // Sort instances by mesh and upload the per-model instance SSBO.
+    // Sidecar write is stubbed in Commit A.
     if (loading_model_id_ != 0) {
-        auto it = models_.find(loading_model_id_);
-        if (it != models_.end()) {
-            std::string ifc_path = it->second.file_path.toStdString();
-            QFileInfo fi(it->second.file_path);
-            uint64_t file_size = static_cast<uint64_t>(fi.size());
-
-            // Pack element info for the sidecar (only this model's elements).
-            std::vector<PackedElementInfo> packed;
-            std::string stbl;
-            for (const auto& [oid, info] : element_map_) {
-                if (info.model_id != loading_model_id_) continue;
-                PackedElementInfo pe;
-                pe.object_id = info.object_id;
-                pe.model_id = info.model_id;
-                pe.ifc_id = info.ifc_id;
-                pe.parent_id = info.parent_id;
-                pe.guid_offset = static_cast<uint32_t>(stbl.size());
-                pe.guid_length = static_cast<uint32_t>(info.guid.size());
-                stbl += info.guid;
-                pe.name_offset = static_cast<uint32_t>(stbl.size());
-                pe.name_length = static_cast<uint32_t>(info.name.size());
-                stbl += info.name;
-                pe.type_offset = static_cast<uint32_t>(stbl.size());
-                pe.type_length = static_cast<uint32_t>(info.type.size());
-                stbl += info.type;
-                packed.push_back(pe);
-            }
-
-            viewport_->buildBvhAsync(loading_model_id_, ifc_path, file_size,
-                                     std::move(packed), std::move(stbl));
-        }
+        viewport_->finalizeModel(loading_model_id_);
     }
 
     // Start next model if queued.
diff --git a/src/ifcviewer/MainWindow.h b/src/ifcviewer/MainWindow.h
index f60da70b75d..5270676af53 100644
--- a/src/ifcviewer/MainWindow.h
+++ b/src/ifcviewer/MainWindow.h
@@ -62,7 +62,8 @@ private slots:
     void onFileOpen();
     void onFileSettings();
     void onProgressChanged(int percent);
-    void onElementReady(UploadChunk chunk);
+    void onMeshReady(MeshChunk chunk);
+    void onInstanceReady(InstanceChunk chunk);
     void onStreamingFinished();
     void onObjectPicked(uint32_t object_id);
     void onTreeSelectionChanged();
diff --git a/src/ifcviewer/SidecarCache.cpp b/src/ifcviewer/SidecarCache.cpp
index d77095c9223..be19c8698f4 100644
--- a/src/ifcviewer/SidecarCache.cpp
+++ b/src/ifcviewer/SidecarCache.cpp
@@ -17,180 +17,20 @@
  *                                                                              *
  ********************************************************************************/
 
-#include "SidecarCache.h"
-
-#include <cstdio>
-#include <cstring>
-
-// Binary layout (all multi-byte fields native-endian):
-//
-//   SidecarHeader          (16 bytes)
-//   uint64_t               source_file_size
-//
-//   uint32_t               num_vertices  (count of floats)
-//   float[num_vertices]    vertex data
-//
-//   uint32_t               num_indices
-//   uint32_t[num_indices]  index data
-//
-//   uint32_t               num_draw_infos
-//   ObjectDrawInfo[N]      draw info array
-//
-//   uint32_t               num_elements
-//   PackedElementInfo[N]   element records
-//   uint32_t               string_table_bytes
-//   char[string_table_bytes]
-//
-//   uint32_t               num_bvh_models
-//   for each model:
-//     uint32_t model_id
-//     uint32_t num_nodes
-//     BvhNode[num_nodes]
-//     uint32_t num_object_indices
-//     uint32_t[num_object_indices]
-
-struct SidecarHeader {
-    uint32_t magic;
-    uint32_t version;
-    uint32_t endian;
-    uint32_t reserved;
-};
-
-static std::string sidecarPath(const std::string& ifc_path) {
-    return ifc_path + ".ifcview";
-}
-
-template<typename T>
-static bool writeVec(FILE* f, const std::vector<T>& v) {
-    uint32_t n = static_cast<uint32_t>(v.size());
-    if (fwrite(&n, 4, 1, f) != 1) return false;
-    if (n > 0 && fwrite(v.data(), sizeof(T), n, f) != n) return false;
-    return true;
-}
-
-template<typename T>
-static bool readVec(FILE* f, std::vector<T>& v) {
-    uint32_t n;
-    if (fread(&n, 4, 1, f) != 1) return false;
-    v.resize(n);
-    if (n > 0 && fread(v.data(), sizeof(T), n, f) != n) return false;
-    return true;
-}
-
-bool writeSidecar(const std::string& ifc_path,
-                  const SidecarData& data,
-                  uint64_t ifc_file_size) {
-    std::string path = sidecarPath(ifc_path);
-    FILE* f = fopen(path.c_str(), "wb");
-    if (!f) return false;
-
-    // Header
-    SidecarHeader hdr = { SIDECAR_MAGIC, SIDECAR_VERSION, SIDECAR_ENDIAN, 0 };
-    fwrite(&hdr, sizeof(hdr), 1, f);
-    fwrite(&ifc_file_size, 8, 1, f);
-
-    // Geometry
-    if (!writeVec(f, data.vertices)) { fclose(f); return false; }
-    if (!writeVec(f, data.indices))  { fclose(f); return false; }
-
-    // Draw info
-    if (!writeVec(f, data.draw_info)) { fclose(f); return false; }
-
-    // Elements + string table
-    if (!writeVec(f, data.elements)) { fclose(f); return false; }
-    uint32_t stbl_len = static_cast<uint32_t>(data.string_table.size());
-    fwrite(&stbl_len, 4, 1, f);
-    if (stbl_len > 0) fwrite(data.string_table.data(), 1, stbl_len, f);
-
-    // BVH
-    uint32_t num_bvh_models = data.bvh_set
-        ? static_cast<uint32_t>(data.bvh_set->models.size()) : 0;
-    fwrite(&num_bvh_models, 4, 1, f);
+// Commit A: sidecar cache is temporarily disabled.  The on-disk format is
+// being rewritten from v3 (monolithic world-coord geometry) to v4 (instanced
+// meshes + per-instance records).  Until v4 is finalised, loads always go
+// through the streaming path and writes are no-ops.
 
-    if (data.bvh_set) {
-        for (const auto& [model_id, mbvh] : data.bvh_set->models) {
-            fwrite(&model_id, 4, 1, f);
-
-            uint32_t nn = static_cast<uint32_t>(mbvh.nodes.size());
-            fwrite(&nn, 4, 1, f);
-            if (nn > 0) fwrite(mbvh.nodes.data(), sizeof(BvhNode), nn, f);
-
-            uint32_t no = static_cast<uint32_t>(mbvh.object_indices.size());
-            fwrite(&no, 4, 1, f);
-            if (no > 0) fwrite(mbvh.object_indices.data(), 4, no, f);
-        }
-    }
+#include "SidecarCache.h"
 
-    fclose(f);
+bool writeSidecar(const std::string& /*ifc_path*/,
+                  const SidecarData& /*data*/,
+                  uint64_t /*ifc_file_size*/) {  // Commit A stub: all arguments ignored, no file is produced, success is reported
     return true;
 }
 
-std::optional<SidecarData> readSidecar(const std::string& ifc_path,
-                                       uint64_t ifc_file_size) {
-    std::string path = sidecarPath(ifc_path);
-    FILE* f = fopen(path.c_str(), "rb");
-    if (!f) return std::nullopt;
-
-    auto fail = [&]() -> std::optional<SidecarData> { fclose(f); return std::nullopt; };
-
-    // Header
-    SidecarHeader hdr;
-    if (fread(&hdr, sizeof(hdr), 1, f) != 1) return fail();
-    if (hdr.magic != SIDECAR_MAGIC ||
-        hdr.version != SIDECAR_VERSION ||
-        hdr.endian != SIDECAR_ENDIAN) return fail();
-
-    uint64_t stored_size;
-    if (fread(&stored_size, 8, 1, f) != 1) return fail();
-    if (stored_size != ifc_file_size) return fail();
-
-    SidecarData data;
-
-    // Geometry
-    if (!readVec(f, data.vertices)) return fail();
-    if (!readVec(f, data.indices))  return fail();
-
-    // Draw info
-    if (!readVec(f, data.draw_info)) return fail();
-
-    // Elements + string table
-    if (!readVec(f, data.elements)) return fail();
-    uint32_t stbl_len;
-    if (fread(&stbl_len, 4, 1, f) != 1) return fail();
-    data.string_table.resize(stbl_len);
-    if (stbl_len > 0 && fread(data.string_table.data(), 1, stbl_len, f) != stbl_len)
-        return fail();
-
-    // BVH
-    uint32_t num_bvh_models;
-    if (fread(&num_bvh_models, 4, 1, f) != 1) return fail();
-
-    if (num_bvh_models > 0) {
-        data.bvh_set = std::make_shared<BvhSet>();
-        for (uint32_t m = 0; m < num_bvh_models; ++m) {
-            uint32_t model_id;
-            if (fread(&model_id, 4, 1, f) != 1) return fail();
-
-            ModelBvh mbvh;
-            mbvh.model_id = model_id;
-
-            uint32_t nn;
-            if (fread(&nn, 4, 1, f) != 1) return fail();
-            mbvh.nodes.resize(nn);
-            if (nn > 0 && fread(mbvh.nodes.data(), sizeof(BvhNode), nn, f) != nn)
-                return fail();
-
-            uint32_t no;
-            if (fread(&no, 4, 1, f) != 1) return fail();
-            mbvh.object_indices.resize(no);
-            if (no > 0 && fread(mbvh.object_indices.data(), 4, no, f) != no)
-                return fail();
-
-            data.bvh_set->bvh_model_ids.insert(model_id);
-            data.bvh_set->models[model_id] = std::move(mbvh);
-        }
-    }
-
-    fclose(f);
-    return data;
+std::optional<SidecarData> readSidecar(const std::string& /*ifc_path*/,
+                                       uint64_t /*ifc_file_size*/) {
+    return std::nullopt;  // Commit A stub: every lookup is a cache miss and no file is opened
 }
diff --git a/src/ifcviewer/SidecarCache.h b/src/ifcviewer/SidecarCache.h
index 49c36dba15a..e14eb9d2561 100644
--- a/src/ifcviewer/SidecarCache.h
+++ b/src/ifcviewer/SidecarCache.h
@@ -17,22 +17,28 @@
  *                                                                              *
  ********************************************************************************/
 
+// NOTE: Sidecar format v3 is being rewritten to v4 (instanced geometry layout).
+// During the instancing rewrite (Commit A) the cache is a no-op: reads always
+// miss and writes always succeed without producing a file. Commit B will
+// re-introduce the on-disk format with MeshInfo + InstanceGpu sections.
+
 #ifndef SIDECARCACHE_H
 #define SIDECARCACHE_H
 
-#include "BvhAccel.h"
+#include "InstancedGeometry.h"
 
 #include <cstdint>
 #include <optional>
 #include <string>
 #include <vector>
+#include <memory>
 
 static constexpr uint32_t SIDECAR_MAGIC   = 0x49465657;  // "IFVW"
-static constexpr uint32_t SIDECAR_VERSION = 3;
+static constexpr uint32_t SIDECAR_VERSION = 4;
 static constexpr uint32_t SIDECAR_ENDIAN  = 0x01020304;
 
-// Fixed-size element record for the sidecar.  Strings are stored as
-// (offset, length) pairs into a separate string table.
+// Fixed-size element record.  Strings are stored as (offset, length) pairs
+// into a separate string table.
 struct PackedElementInfo {
     uint32_t object_id;
     uint32_t model_id;
@@ -46,30 +52,27 @@ struct PackedElementInfo {
     uint32_t type_length;
 };
 
-// Everything the viewer needs to display a model without tessellating.
+// Everything needed to display an already-tessellated model without
+// re-running the iterator.  v4 schema: instanced geometry.
 struct SidecarData {
-    // GPU geometry (ready to upload as-is)
-    std::vector<float>    vertices;      // interleaved, 8 floats per vertex
-    std::vector<uint32_t> indices;       // global (already remapped)
+    // Per-model GPU geometry (local coords).  28 bytes/vertex.
+    std::vector<float>        vertices;
+    std::vector<uint32_t>     indices;
 
-    // Per-object metadata
-    std::vector<ObjectDrawInfo> draw_info;
+    // Mesh dictionary and per-instance data.
+    std::vector<MeshInfo>     meshes;        // indexed by local_mesh_id
+    std::vector<InstanceCpu>  instances;     // sorted by mesh_id
 
-    // Element tree metadata
+    // Element tree metadata.
     std::vector<PackedElementInfo> elements;
-    std::string string_table;            // concatenated UTF-8
-
-    // BVH acceleration
-    std::shared_ptr<BvhSet> bvh_set;
+    std::string               string_table;
 };
 
-// Write a full sidecar next to the IFC file.
-// Returns true on success.
+// v4 writer/reader are stubbed for Commit A — no disk I/O happens.
 bool writeSidecar(const std::string& ifc_path,
                   const SidecarData& data,
                   uint64_t ifc_file_size);
 
-// Read a sidecar.  Returns nullopt on any failure (missing, stale, corrupt).
 std::optional<SidecarData> readSidecar(const std::string& ifc_path,
                                        uint64_t ifc_file_size);
 
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 1c6ab786254..e264f990e45 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -18,7 +18,6 @@
  ********************************************************************************/
 
 #include "ViewportWindow.h"
-#include "SidecarCache.h"
 
 #include <QMouseEvent>
 #include <QWheelEvent>
@@ -31,33 +30,75 @@
 #include <algorithm>
 #include <limits>
 
-static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;  // 64 MB
-static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;  // 32 MB
-static const size_t MAX_BUFFER_SIZE = 4ull * 1024 * 1024 * 1024;  // 4 GB
-static const int VERTEX_STRIDE = 8;  // pos(3) + normal(3) + object_id(1) + color(1 packed)
+static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;   // 64 MB
+static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;   // 32 MB
+static const size_t MAX_BUFFER_SIZE  = 4ull * 1024 * 1024 * 1024;  // 4 GB
+
+// -----------------------------------------------------------------------------
+// Shaders
+// -----------------------------------------------------------------------------
+//
+// Vertex layout (GL side, 28 bytes):
+//   location 0: vec3 a_position     (local coords)
+//   location 1: vec3 a_normal       (local)
+//   location 2: vec4 a_color        (GL_UNSIGNED_BYTE * 4 normalized)
+//
+// Per-instance record in SSBO std430 (80 bytes):
+//   mat4 transform
+//   uint object_id
+//   uint color_override_rgba8     -- 0 => use baked a_color
+//   uint _pad0, _pad1
+//
+// The draw calls pass `u_instance_offset = mesh.first_instance`; the shader
+// reads `instances[u_instance_offset + gl_InstanceID]`.
 
 static const char* MAIN_VERTEX_SHADER = R"(
 #version 450 core
 layout(location = 0) in vec3 a_position;
 layout(location = 1) in vec3 a_normal;
-layout(location = 2) in float a_object_id;
-layout(location = 3) in vec4 a_color;
+layout(location = 2) in vec4 a_color;
+
+struct InstanceRecord {
+    mat4 transform;
+    uint object_id;
+    uint color_override;
+    uint _pad0;
+    uint _pad1;
+};
+layout(std430, binding = 0) readonly buffer Instances {
+    InstanceRecord instances[];
+};
 
 uniform mat4 u_view_projection;
+uniform uint u_instance_offset;
 uniform uint u_selected_id;
 
 out vec3 v_normal;
-out vec3 v_position;
 out vec4 v_color;
 flat out uint v_object_id;
 flat out uint v_selected;
 
 void main() {
-    gl_Position = u_view_projection * vec4(a_position, 1.0);
-    v_normal = a_normal;
-    v_position = a_position;
-    v_color = a_color;
-    v_object_id = floatBitsToUint(a_object_id);
+    InstanceRecord inst = instances[u_instance_offset + uint(gl_InstanceID)];
+    vec4 world = inst.transform * vec4(a_position, 1.0);
+    gl_Position = u_view_projection * world;
+
+    // Rotate the normal by the upper-3x3 of the transform. For the vast
+    // majority of BIM placements this is a rigid rotation (+ uniform scale),
+    // so we skip the inverse-transpose.
+    v_normal = normalize(mat3(inst.transform) * a_normal);
+
+    vec4 baked = a_color;
+    if (inst.color_override != 0u) {
+        float r = float((inst.color_override      ) & 0xFFu) / 255.0;
+        float g = float((inst.color_override >>  8) & 0xFFu) / 255.0;
+        float b = float((inst.color_override >> 16) & 0xFFu) / 255.0;
+        float a = float((inst.color_override >> 24) & 0xFFu) / 255.0;
+        if (a > 0.0) baked = vec4(r, g, b, a);
+    }
+    v_color = baked;
+
+    v_object_id = inst.object_id;
     v_selected = (v_object_id == u_selected_id) ? 1u : 0u;
 }
 )";
@@ -65,7 +106,6 @@ void main() {
 static const char* MAIN_FRAGMENT_SHADER = R"(
 #version 450 core
 in vec3 v_normal;
-in vec3 v_position;
 in vec4 v_color;
 flat in uint v_object_id;
 flat in uint v_selected;
@@ -80,11 +120,7 @@ void main() {
     float ambient = 0.25;
     float diffuse = 0.75 * ndotl;
     vec3 color = v_color.rgb * (ambient + diffuse);
-
-    if (v_selected == 1u) {
-        color = mix(color, vec3(0.2, 0.6, 1.0), 0.5);
-    }
-
+    if (v_selected == 1u) color = mix(color, vec3(0.2, 0.6, 1.0), 0.5);
     frag_color = vec4(color, v_color.a);
 }
 )";
@@ -92,39 +128,43 @@ void main() {
 static const char* PICK_VERTEX_SHADER = R"(
 #version 450 core
 layout(location = 0) in vec3 a_position;
-layout(location = 1) in vec3 a_normal;
-layout(location = 2) in float a_object_id;
+
+struct InstanceRecord {
+    mat4 transform;
+    uint object_id;
+    uint color_override;
+    uint _pad0;
+    uint _pad1;
+};
+layout(std430, binding = 0) readonly buffer Instances {
+    InstanceRecord instances[];
+};
 
 uniform mat4 u_view_projection;
+uniform uint u_instance_offset;
 
 flat out uint v_object_id;
 
 void main() {
-    gl_Position = u_view_projection * vec4(a_position, 1.0);
-    v_object_id = floatBitsToUint(a_object_id);
+    InstanceRecord inst = instances[u_instance_offset + uint(gl_InstanceID)];
+    gl_Position = u_view_projection * inst.transform * vec4(a_position, 1.0);
+    v_object_id = inst.object_id;
 }
 )";
 
 static const char* PICK_FRAGMENT_SHADER = R"(
 #version 450 core
 flat in uint v_object_id;
-
 out uint frag_id;
-
-void main() {
-    frag_id = v_object_id;
-}
+void main() { frag_id = v_object_id; }
 )";
 
 static const char* AXIS_VERTEX_SHADER = R"(
 #version 450 core
 layout(location = 0) in vec3 a_position;
 layout(location = 1) in vec3 a_color;
-
 uniform mat4 u_mvp;
-
 out vec3 v_color;
-
 void main() {
     gl_Position = u_mvp * vec4(a_position, 1.0);
     v_color = a_color;
@@ -135,10 +175,7 @@ static const char* AXIS_FRAGMENT_SHADER = R"(
 #version 450 core
 in vec3 v_color;
 out vec4 frag_color;
-
-void main() {
-    frag_color = vec4(v_color, 1.0);
-}
+void main() { frag_color = vec4(v_color, 1.0); }
 )";
 
 static GLuint compileShader(QOpenGLFunctions_4_5_Core* gl, GLenum type, const char* source) {
@@ -148,7 +185,7 @@ static GLuint compileShader(QOpenGLFunctions_4_5_Core* gl, GLenum type, const ch
     GLint ok = 0;
     gl->glGetShaderiv(shader, GL_COMPILE_STATUS, &ok);
     if (!ok) {
-        char log[1024];
+        char log[2048];
         gl->glGetShaderInfoLog(shader, sizeof(log), nullptr, log);
         qWarning("Shader compile error: %s", log);
     }
@@ -163,7 +200,7 @@ static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint fra
     GLint ok = 0;
     gl->glGetProgramiv(prog, GL_LINK_STATUS, &ok);
     if (!ok) {
-        char log[1024];
+        char log[2048];
         gl->glGetProgramInfoLog(prog, sizeof(log), nullptr, log);
         qWarning("Program link error: %s", log);
     }
@@ -172,6 +209,8 @@ static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint fra
     return prog;
 }
 
+// -----------------------------------------------------------------------------
+
 ViewportWindow::ViewportWindow(QWindow* parent)
     : QWindow(parent)
 {
@@ -188,26 +227,25 @@ ViewportWindow::ViewportWindow(QWindow* parent)
     connect(&render_timer_, &QTimer::timeout, this, [this]() {
         if (isExposed()) render();
     });
-    render_timer_.setInterval(16); // ~60 fps
+    render_timer_.setInterval(16);
 }
 
 ViewportWindow::~ViewportWindow() {
-    if (bvh_build_thread_.joinable())
-        bvh_build_thread_.join();
     if (context_) {
         context_->makeCurrent(this);
         if (gl_) {
             for (auto& [mid, m] : models_gpu_) {
-                if (m.vao) gl_->glDeleteVertexArrays(1, &m.vao);
-                if (m.vbo) gl_->glDeleteBuffers(1, &m.vbo);
-                if (m.ebo) gl_->glDeleteBuffers(1, &m.ebo);
+                if (m.vao)  gl_->glDeleteVertexArrays(1, &m.vao);
+                if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
+                if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
+                if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
             }
-            if (axis_vao_) gl_->glDeleteVertexArrays(1, &axis_vao_);
-            if (axis_vbo_) gl_->glDeleteBuffers(1, &axis_vbo_);
-            if (main_program_) gl_->glDeleteProgram(main_program_);
-            if (pick_program_) gl_->glDeleteProgram(pick_program_);
-            if (axis_program_) gl_->glDeleteProgram(axis_program_);
-            if (pick_fbo_) gl_->glDeleteFramebuffers(1, &pick_fbo_);
+            if (axis_vao_)      gl_->glDeleteVertexArrays(1, &axis_vao_);
+            if (axis_vbo_)      gl_->glDeleteBuffers(1, &axis_vbo_);
+            if (main_program_)  gl_->glDeleteProgram(main_program_);
+            if (pick_program_)  gl_->glDeleteProgram(pick_program_);
+            if (axis_program_)  gl_->glDeleteProgram(axis_program_);
+            if (pick_fbo_)      gl_->glDeleteFramebuffers(1, &pick_fbo_);
             if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
             if (pick_depth_rbo_) gl_->glDeleteRenderbuffers(1, &pick_depth_rbo_);
         }
@@ -220,17 +258,11 @@ void ViewportWindow::initGL() {
 
     context_ = new QOpenGLContext(this);
     context_->setFormat(requestedFormat());
-    if (!context_->create()) {
-        qFatal("Failed to create OpenGL context");
-        return;
-    }
+    if (!context_->create()) { qFatal("Failed to create OpenGL context"); return; }
     context_->makeCurrent(this);
 
     gl_ = QOpenGLVersionFunctionsFactory::get<QOpenGLFunctions_4_5_Core>(context_);
-    if (!gl_) {
-        qWarning("OpenGL 4.5 not available, falling back");
-        return;
-    }
+    if (!gl_) { qWarning("OpenGL 4.5 not available"); return; }
 
     buildShaders();
     buildAxisGizmo();
@@ -247,28 +279,23 @@ void ViewportWindow::initGL() {
 }
 
 void ViewportWindow::setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo) {
-    gl_->glVertexArrayVertexBuffer(vao, 0, vbo, 0, VERTEX_STRIDE * sizeof(float));
+    gl_->glVertexArrayVertexBuffer(vao, 0, vbo, 0, INSTANCED_VERTEX_STRIDE_BYTES);
     gl_->glVertexArrayElementBuffer(vao, ebo);
 
-    // position
+    // position (3 float @ 0)
     gl_->glEnableVertexArrayAttrib(vao, 0);
     gl_->glVertexArrayAttribFormat(vao, 0, 3, GL_FLOAT, GL_FALSE, 0);
     gl_->glVertexArrayAttribBinding(vao, 0, 0);
 
-    // normal
+    // normal (3 float @ 12)
     gl_->glEnableVertexArrayAttrib(vao, 1);
-    gl_->glVertexArrayAttribFormat(vao, 1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float));
+    gl_->glVertexArrayAttribFormat(vao, 1, 3, GL_FLOAT, GL_FALSE, 12);
     gl_->glVertexArrayAttribBinding(vao, 1, 0);
 
-    // object_id (passed as float, decoded in shader via floatBitsToUint)
+    // color (4 ubyte @ 24, normalized)
     gl_->glEnableVertexArrayAttrib(vao, 2);
-    gl_->glVertexArrayAttribFormat(vao, 2, 1, GL_FLOAT, GL_FALSE, 6 * sizeof(float));
+    gl_->glVertexArrayAttribFormat(vao, 2, 4, GL_UNSIGNED_BYTE, GL_TRUE, 24);
     gl_->glVertexArrayAttribBinding(vao, 2, 0);
-
-    // color (RGBA8 packed into the 4 bytes at offset 28; normalized to vec4)
-    gl_->glEnableVertexArrayAttrib(vao, 3);
-    gl_->glVertexArrayAttribFormat(vao, 3, 4, GL_UNSIGNED_BYTE, GL_TRUE, 7 * sizeof(float));
-    gl_->glVertexArrayAttribBinding(vao, 3, 0);
 }
 
 void ViewportWindow::buildShaders() {
@@ -291,24 +318,20 @@ void ViewportWindow::buildShaders() {
 
 void ViewportWindow::buildAxisGizmo() {
     static const float axis_data[] = {
-        0.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
-        1.0f, 0.0f, 0.0f,   1.0f, 0.25f, 0.25f,
-        0.0f, 0.0f, 0.0f,   0.30f, 0.95f, 0.30f,
-        0.0f, 1.0f, 0.0f,   0.30f, 0.95f, 0.30f,
-        0.0f, 0.0f, 0.0f,   0.30f, 0.55f, 1.0f,
-        0.0f, 0.0f, 1.0f,   0.30f, 0.55f, 1.0f,
+        0,0,0,  1.0f,0.25f,0.25f,
+        1,0,0,  1.0f,0.25f,0.25f,
+        0,0,0,  0.30f,0.95f,0.30f,
+        0,1,0,  0.30f,0.95f,0.30f,
+        0,0,0,  0.30f,0.55f,1.0f,
+        0,0,1,  0.30f,0.55f,1.0f,
     };
-
     gl_->glCreateVertexArrays(1, &axis_vao_);
     gl_->glCreateBuffers(1, &axis_vbo_);
     gl_->glNamedBufferStorage(axis_vbo_, sizeof(axis_data), axis_data, 0);
-
     gl_->glVertexArrayVertexBuffer(axis_vao_, 0, axis_vbo_, 0, 6 * sizeof(float));
-
     gl_->glEnableVertexArrayAttrib(axis_vao_, 0);
     gl_->glVertexArrayAttribFormat(axis_vao_, 0, 3, GL_FLOAT, GL_FALSE, 0);
     gl_->glVertexArrayAttribBinding(axis_vao_, 0, 0);
-
     gl_->glEnableVertexArrayAttrib(axis_vao_, 1);
     gl_->glVertexArrayAttribFormat(axis_vao_, 1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float));
     gl_->glVertexArrayAttribBinding(axis_vao_, 1, 0);
@@ -318,25 +341,20 @@ bool ViewportWindow::growModelVbo(ModelGpuData& m, size_t needed_total) {
     size_t new_capacity = m.vbo_capacity;
     while (new_capacity < needed_total) new_capacity *= 2;
     if (new_capacity > MAX_BUFFER_SIZE) {
-        qWarning("VBO grow request (%zu MB) exceeds cap", new_capacity / (1024 * 1024));
+        qWarning("VBO grow request (%zu MB) exceeds cap", new_capacity / (1024*1024));
         return false;
     }
-
     GLuint new_vbo = 0;
     gl_->glCreateBuffers(1, &new_vbo);
     gl_->glNamedBufferStorage(new_vbo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
-
     if (m.vbo_used > 0) {
         gl_->glCopyNamedBufferSubData(m.vbo, new_vbo, 0, 0, m.vbo_used);
     }
-
     gl_->glDeleteBuffers(1, &m.vbo);
     m.vbo = new_vbo;
     m.vbo_capacity = new_capacity;
-
-    gl_->glVertexArrayVertexBuffer(m.vao, 0, m.vbo, 0, VERTEX_STRIDE * sizeof(float));
-
-    qInfo("Model VBO grew to %zu MB", m.vbo_capacity / (1024 * 1024));
+    gl_->glVertexArrayVertexBuffer(m.vao, 0, m.vbo, 0, INSTANCED_VERTEX_STRIDE_BYTES);
+    qInfo("Model VBO grew to %zu MB", m.vbo_capacity / (1024*1024));
     return true;
 }
 
@@ -344,268 +362,178 @@ bool ViewportWindow::growModelEbo(ModelGpuData& m, size_t needed_total) {
     size_t new_capacity = m.ebo_capacity;
     while (new_capacity < needed_total) new_capacity *= 2;
     if (new_capacity > MAX_BUFFER_SIZE) {
-        qWarning("EBO grow request (%zu MB) exceeds cap", new_capacity / (1024 * 1024));
+        qWarning("EBO grow request (%zu MB) exceeds cap", new_capacity / (1024*1024));
         return false;
     }
-
     GLuint new_ebo = 0;
     gl_->glCreateBuffers(1, &new_ebo);
     gl_->glNamedBufferStorage(new_ebo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
-
     if (m.ebo_used > 0) {
         gl_->glCopyNamedBufferSubData(m.ebo, new_ebo, 0, 0, m.ebo_used);
     }
-
     gl_->glDeleteBuffers(1, &m.ebo);
     m.ebo = new_ebo;
     m.ebo_capacity = new_capacity;
-
     gl_->glVertexArrayElementBuffer(m.vao, m.ebo);
-
-    qInfo("Model EBO grew to %zu MB", m.ebo_capacity / (1024 * 1024));
+    qInfo("Model EBO grew to %zu MB", m.ebo_capacity / (1024*1024));
     return true;
 }
 
-void ViewportWindow::uploadChunk(const UploadChunk& chunk) {
+ModelGpuData& ViewportWindow::getOrCreateModel(uint32_t model_id) {
+    auto it = models_gpu_.find(model_id);
+    if (it != models_gpu_.end()) return it->second;
+
+    ModelGpuData m;
+    gl_->glCreateVertexArrays(1, &m.vao);
+    gl_->glCreateBuffers(1, &m.vbo);
+    gl_->glCreateBuffers(1, &m.ebo);
+
+    m.vbo_capacity = INITIAL_VBO_SIZE;
+    m.ebo_capacity = INITIAL_EBO_SIZE;
+    gl_->glNamedBufferStorage(m.vbo, m.vbo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+    gl_->glNamedBufferStorage(m.ebo, m.ebo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+    setupVaoLayout(m.vao, m.vbo, m.ebo);
+
+    return models_gpu_.emplace(model_id, std::move(m)).first->second;
+}
+
+void ViewportWindow::uploadMeshChunk(const MeshChunk& chunk) {
     if (!gl_initialized_) return;
     if (chunk.vertices.empty() || chunk.indices.empty()) return;
-
     context_->makeCurrent(this);
 
-    // Get or create per-model GPU data.
-    auto it = models_gpu_.find(chunk.model_id);
-    if (it == models_gpu_.end()) {
-        ModelGpuData m;
-        gl_->glCreateVertexArrays(1, &m.vao);
-        gl_->glCreateBuffers(1, &m.vbo);
-        gl_->glCreateBuffers(1, &m.ebo);
-
-        m.vbo_capacity = INITIAL_VBO_SIZE;
-        m.ebo_capacity = INITIAL_EBO_SIZE;
-        gl_->glNamedBufferStorage(m.vbo, m.vbo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
-        gl_->glNamedBufferStorage(m.ebo, m.ebo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
-
-        setupVaoLayout(m.vao, m.vbo, m.ebo);
-        it = models_gpu_.emplace(chunk.model_id, std::move(m)).first;
-    }
-
-    auto& mgpu = it->second;
+    ModelGpuData& m = getOrCreateModel(chunk.model_id);
 
-    size_t vb_size = chunk.vertices.size() * sizeof(float);
-    size_t ib_size = chunk.indices.size() * sizeof(uint32_t);
+    const size_t vb_size = chunk.vertices.size() * sizeof(float);
+    const size_t ib_size = chunk.indices.size()  * sizeof(uint32_t);
 
-    if (mgpu.vbo_used + vb_size > mgpu.vbo_capacity) {
-        if (!growModelVbo(mgpu, mgpu.vbo_used + vb_size)) {
-            qWarning("VBO at cap, skipping chunk");
-            return;
-        }
+    if (m.vbo_used + vb_size > m.vbo_capacity) {
+        if (!growModelVbo(m, m.vbo_used + vb_size)) return;
     }
-    if (mgpu.ebo_used + ib_size > mgpu.ebo_capacity) {
-        if (!growModelEbo(mgpu, mgpu.ebo_used + ib_size)) {
-            qWarning("EBO at cap, skipping chunk");
-            return;
-        }
+    if (m.ebo_used + ib_size > m.ebo_capacity) {
+        if (!growModelEbo(m, m.ebo_used + ib_size)) return;
     }
 
-    uint32_t base_vertex = mgpu.vertex_count;
-
-    gl_->glNamedBufferSubData(mgpu.vbo, mgpu.vbo_used, vb_size, chunk.vertices.data());
-
-    // Remap chunk-local indices into model-local global indices.
-    std::vector<uint32_t> global_indices(chunk.indices.size());
-    for (size_t i = 0; i < chunk.indices.size(); ++i) {
-        global_indices[i] = chunk.indices[i] + base_vertex;
-    }
-    gl_->glNamedBufferSubData(mgpu.ebo, mgpu.ebo_used, ib_size, global_indices.data());
-
-    // Compute AABB from vertex positions in this chunk.
-    ObjectDrawInfo info;
-    info.index_offset = static_cast<uint32_t>(mgpu.ebo_used);
-    info.index_count = static_cast<uint32_t>(chunk.indices.size());
-    info.model_id = chunk.model_id;
-
-    const size_t num_verts = chunk.vertices.size() / VERTEX_STRIDE;
-    if (num_verts > 0) {
-        info.aabb_min[0] = info.aabb_min[1] = info.aabb_min[2] =  std::numeric_limits<float>::max();
-        info.aabb_max[0] = info.aabb_max[1] = info.aabb_max[2] = -std::numeric_limits<float>::max();
-        for (size_t v = 0; v < num_verts; ++v) {
-            const float* pos = &chunk.vertices[v * VERTEX_STRIDE];
-            for (int a = 0; a < 3; ++a) {
-                if (pos[a] < info.aabb_min[a]) info.aabb_min[a] = pos[a];
-                if (pos[a] > info.aabb_max[a]) info.aabb_max[a] = pos[a];
-            }
-        }
-    } else {
-        info.aabb_min[0] = info.aabb_min[1] = info.aabb_min[2] = 0.0f;
-        info.aabb_max[0] = info.aabb_max[1] = info.aabb_max[2] = 0.0f;
+    MeshInfo info;
+    info.vbo_byte_offset = static_cast<uint32_t>(m.vbo_used);
+    info.vertex_count    = static_cast<uint32_t>(
+        chunk.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS);
+    info.ebo_byte_offset = static_cast<uint32_t>(m.ebo_used);
+    info.index_count     = static_cast<uint32_t>(chunk.indices.size());
+    for (int a = 0; a < 3; ++a) {
+        info.local_aabb_min[a] = chunk.local_aabb_min[a];
+        info.local_aabb_max[a] = chunk.local_aabb_max[a];
     }
+    info.first_instance = 0;
+    info.instance_count = 0;
+
+    gl_->glNamedBufferSubData(m.vbo, m.vbo_used, vb_size, chunk.vertices.data());
+    gl_->glNamedBufferSubData(m.ebo, m.ebo_used, ib_size, chunk.indices.data());
+    m.vbo_used += vb_size;
+    m.ebo_used += ib_size;
+    m.vertex_count += info.vertex_count;
 
-    mgpu.draw_info.push_back(info);
-    mgpu.active_draw_count = static_cast<uint32_t>(mgpu.draw_info.size()); // immediately drawable
-    mgpu.vbo_used += vb_size;
-    mgpu.ebo_used += ib_size;
-    mgpu.vertex_count += static_cast<uint32_t>(num_verts);
-    mgpu.total_triangles += static_cast<uint32_t>(chunk.indices.size() / 3);
+    if (m.meshes.size() <= chunk.local_mesh_id) m.meshes.resize(chunk.local_mesh_id + 1);
+    m.meshes[chunk.local_mesh_id] = info;
 }
 
-void ViewportWindow::uploadBulk(uint32_t model_id,
-                                std::vector<float> vertices,
-                                std::vector<uint32_t> indices,
-                                const std::vector<ObjectDrawInfo>& draw_info,
-                                std::shared_ptr<BvhSet> bvh_set) {
+void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     if (!gl_initialized_) return;
-    if (vertices.empty() || indices.empty()) return;
+    // We don't need a GL context here since we're only touching CPU state,
+    // but the signal may fire on the render thread so keep it simple.
+    ModelGpuData& m = getOrCreateModel(chunk.model_id);
 
-    context_->makeCurrent(this);
+    InstanceCpu inst;
+    inst.mesh_id  = chunk.local_mesh_id;
+    inst.object_id = chunk.object_id;
+    inst.color_override_rgba8 = chunk.color_override_rgba8;
+    inst.model_id = chunk.model_id;
+    std::memcpy(inst.transform,      chunk.transform,      sizeof(inst.transform));
+    std::memcpy(inst.world_aabb_min, chunk.world_aabb_min, sizeof(inst.world_aabb_min));
+    std::memcpy(inst.world_aabb_max, chunk.world_aabb_max, sizeof(inst.world_aabb_max));
+    m.instances.push_back(inst);
 
-    size_t vb_size = vertices.size() * sizeof(float);
-    size_t ib_size = indices.size() * sizeof(uint32_t);
+    if (chunk.local_mesh_id < m.meshes.size()) {
+        m.total_triangles += m.meshes[chunk.local_mesh_id].index_count / 3;
+    }
+}
 
-    // Allocate empty buffers at exact size — no data uploaded yet.
-    ModelGpuData m;
-    gl_->glCreateVertexArrays(1, &m.vao);
-    gl_->glCreateBuffers(1, &m.vbo);
-    gl_->glCreateBuffers(1, &m.ebo);
+void ViewportWindow::finalizeModel(uint32_t model_id) {
+    if (!gl_initialized_) return;
+    context_->makeCurrent(this);
 
-    m.vbo_capacity = vb_size;
-    m.ebo_capacity = ib_size;
-    gl_->glNamedBufferStorage(m.vbo, vb_size, nullptr, GL_DYNAMIC_STORAGE_BIT);
-    gl_->glNamedBufferStorage(m.ebo, ib_size, nullptr, GL_DYNAMIC_STORAGE_BIT);
+    auto it = models_gpu_.find(model_id);
+    if (it == models_gpu_.end()) return;
+    ModelGpuData& m = it->second;
+    if (m.instances.empty()) { m.finalized = true; return; }
+
+    // Sort instances by mesh_id (stable for deterministic ordering).
+    std::stable_sort(m.instances.begin(), m.instances.end(),
+        [](const InstanceCpu& a, const InstanceCpu& b) {
+            return a.mesh_id < b.mesh_id;
+        });
+
+    // Assign per-mesh contiguous range.
+    for (auto& mesh : m.meshes) { mesh.first_instance = 0; mesh.instance_count = 0; }
+    uint32_t current = UINT32_MAX;
+    uint32_t run_start = 0;
+    for (uint32_t i = 0; i < m.instances.size(); ++i) {
+        uint32_t mid = m.instances[i].mesh_id;
+        if (mid != current) {
+            if (current != UINT32_MAX && current < m.meshes.size()) {
+                m.meshes[current].first_instance = run_start;
+                m.meshes[current].instance_count = i - run_start;
+            }
+            current = mid;
+            run_start = i;
+        }
+    }
+    if (current != UINT32_MAX && current < m.meshes.size()) {
+        m.meshes[current].first_instance = run_start;
+        m.meshes[current].instance_count = static_cast<uint32_t>(m.instances.size()) - run_start;
+    }
 
-    setupVaoLayout(m.vao, m.vbo, m.ebo);
+    // Build GPU-layout array.
+    std::vector<InstanceGpu> gpu(m.instances.size());
+    for (size_t i = 0; i < m.instances.size(); ++i) {
+        const InstanceCpu& src = m.instances[i];
+        InstanceGpu& dst = gpu[i];
+        std::memcpy(dst.transform, src.transform, sizeof(dst.transform));
+        dst.object_id = src.object_id;
+        dst.color_override_rgba8 = src.color_override_rgba8;
+        dst._pad0 = 0;
+        dst._pad1 = 0;
+    }
 
-    m.vbo_used = vb_size;
-    m.ebo_used = ib_size;
-    m.vertex_count = static_cast<uint32_t>(vertices.size() / VERTEX_STRIDE);
-    m.draw_info = draw_info;
-    m.active_draw_count = 0;  // nothing drawable yet
+    // Allocate and upload SSBO.
+    if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+    gl_->glCreateBuffers(1, &m.ssbo);
+    const size_t ssbo_bytes = gpu.size() * sizeof(InstanceGpu);
+    gl_->glNamedBufferStorage(m.ssbo, ssbo_bytes, gpu.data(), 0);
+    m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
 
-    uint32_t total_tri = 0;
-    for (const auto& di : draw_info) total_tri += di.index_count / 3;
-    m.total_triangles = total_tri;
+    m.finalized = true;
 
-    // Delete old model data if re-uploading.
-    auto it = models_gpu_.find(model_id);
-    if (it != models_gpu_.end()) {
-        gl_->glDeleteVertexArrays(1, &it->second.vao);
-        gl_->glDeleteBuffers(1, &it->second.vbo);
-        gl_->glDeleteBuffers(1, &it->second.ebo);
-    }
-    models_gpu_[model_id] = std::move(m);
-
-    // Queue progressive upload — data will stream in over subsequent frames.
-    PendingUpload pu;
-    pu.model_id = model_id;
-    pu.vertices = std::move(vertices);
-    pu.indices = std::move(indices);
-    pu.bvh_set = std::move(bvh_set);
-    pending_uploads_.push_back(std::move(pu));
-
-    qDebug("Bulk upload queued: model %u, %zu vertices, %zu indices, %zu objects",
-           model_id, vertices.size() / VERTEX_STRIDE, indices.size(), draw_info.size());
+    qDebug("Model %u finalized: %zu verts, %zu meshes, %zu instances, %.1f MB vram "
+           "(vbo %.1f + ebo %.1f + ssbo %.1f)",
+           model_id, size_t(m.vertex_count), m.meshes.size(), m.instances.size(),
+           (m.vbo_capacity + m.ebo_capacity + ssbo_bytes) / (1024.0*1024.0),
+           m.vbo_capacity / (1024.0*1024.0),
+           m.ebo_capacity / (1024.0*1024.0),
+           ssbo_bytes / (1024.0*1024.0));
 }
 
 void ViewportWindow::resetScene() {
     if (!gl_initialized_) return;
-
-    if (bvh_build_thread_.joinable())
-        bvh_build_thread_.join();
-
     context_->makeCurrent(this);
     for (auto& [mid, m] : models_gpu_) {
-        if (m.vao) gl_->glDeleteVertexArrays(1, &m.vao);
-        if (m.vbo) gl_->glDeleteBuffers(1, &m.vbo);
-        if (m.ebo) gl_->glDeleteBuffers(1, &m.ebo);
+        if (m.vao)  gl_->glDeleteVertexArrays(1, &m.vao);
+        if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
+        if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
+        if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
     }
     models_gpu_.clear();
-    model_bvhs_.clear();
-    pending_uploads_.clear();
     selected_object_id_ = 0;
-    {
-        std::lock_guard<std::mutex> bvh_lock(bvh_result_mutex_);
-        pending_bvh_.reset();
-    }
-}
-
-static const size_t UPLOAD_CHUNK_BYTES = 48 * 1024 * 1024;  // 48 MB per frame
-
-void ViewportWindow::processPendingUploads() {
-    if (pending_uploads_.empty()) return;
-
-    auto& pu = pending_uploads_.front();
-    auto it = models_gpu_.find(pu.model_id);
-    if (it == models_gpu_.end()) {
-        pending_uploads_.pop_front();
-        return;
-    }
-    auto& mgpu = it->second;
-
-    size_t vbo_total = pu.vertices.size() * sizeof(float);
-    size_t ebo_total = pu.indices.size() * sizeof(uint32_t);
-
-    // Phase 1: Upload VBO in chunks.
-    if (pu.vbo_uploaded < vbo_total) {
-        size_t remaining = vbo_total - pu.vbo_uploaded;
-        size_t chunk = std::min(remaining, UPLOAD_CHUNK_BYTES);
-        gl_->glNamedBufferSubData(mgpu.vbo, pu.vbo_uploaded, chunk,
-                                  reinterpret_cast<const char*>(pu.vertices.data()) + pu.vbo_uploaded);
-        pu.vbo_uploaded += chunk;
-
-        if (pu.vbo_uploaded >= vbo_total) {
-            // VBO done — free CPU memory.
-            pu.vertices.clear();
-            pu.vertices.shrink_to_fit();
-        }
-        return;  // yield to render loop
-    }
-
-    // Phase 2: Upload EBO in chunks. Objects become drawable as their range lands.
-    if (pu.ebo_uploaded < ebo_total) {
-        size_t remaining = ebo_total - pu.ebo_uploaded;
-        size_t chunk = std::min(remaining, UPLOAD_CHUNK_BYTES);
-        gl_->glNamedBufferSubData(mgpu.ebo, pu.ebo_uploaded, chunk,
-                                  reinterpret_cast<const char*>(pu.indices.data()) + pu.ebo_uploaded);
-        pu.ebo_uploaded += chunk;
-
-        // Advance active_draw_count: activate objects whose EBO range is fully uploaded.
-        while (mgpu.active_draw_count < mgpu.draw_info.size()) {
-            const auto& obj = mgpu.draw_info[mgpu.active_draw_count];
-            size_t obj_end = obj.index_offset + obj.index_count * sizeof(uint32_t);
-            if (obj_end <= pu.ebo_uploaded)
-                mgpu.active_draw_count++;
-            else
-                break;
-        }
-
-        if (pu.ebo_uploaded >= ebo_total) {
-            // EBO done — free CPU memory.
-            pu.indices.clear();
-            pu.indices.shrink_to_fit();
-        } else {
-            return;  // yield to render loop
-        }
-    }
-
-    // Fully uploaded — activate BVH if present.
-    mgpu.active_draw_count = static_cast<uint32_t>(mgpu.draw_info.size());
-    if (pu.bvh_set) {
-        model_bvhs_[pu.model_id] = std::move(pu.bvh_set);
-    }
-
-    size_t total_vbo = 0, total_ebo = 0;
-    for (const auto& [mid, mg] : models_gpu_) {
-        total_vbo += mg.vbo_capacity;
-        total_ebo += mg.ebo_capacity;
-    }
-    qDebug("Progressive upload complete: model %u  (this: vbo %.1f MB + ebo %.1f MB, "
-           "%u objects, %u triangles)  scene total vram %.1f MB",
-           pu.model_id,
-           mgpu.vbo_capacity / (1024.0 * 1024.0),
-           mgpu.ebo_capacity / (1024.0 * 1024.0),
-           static_cast<uint32_t>(mgpu.draw_info.size()),
-           mgpu.total_triangles,
-           (total_vbo + total_ebo) / (1024.0 * 1024.0));
-    pending_uploads_.pop_front();
 }
 
 void ViewportWindow::hideModel(uint32_t model_id) {
@@ -621,161 +549,35 @@ void ViewportWindow::showModel(uint32_t model_id) {
 void ViewportWindow::removeModel(uint32_t model_id) {
     if (!gl_initialized_) return;
     context_->makeCurrent(this);
-
-    // Cancel any pending upload for this model.
-    pending_uploads_.erase(
-        std::remove_if(pending_uploads_.begin(), pending_uploads_.end(),
-                        [model_id](const PendingUpload& pu) { return pu.model_id == model_id; }),
-        pending_uploads_.end());
-
     auto it = models_gpu_.find(model_id);
     if (it != models_gpu_.end()) {
-        gl_->glDeleteVertexArrays(1, &it->second.vao);
-        gl_->glDeleteBuffers(1, &it->second.vbo);
-        gl_->glDeleteBuffers(1, &it->second.ebo);
+        if (it->second.vao)  gl_->glDeleteVertexArrays(1, &it->second.vao);
+        if (it->second.vbo)  gl_->glDeleteBuffers(1, &it->second.vbo);
+        if (it->second.ebo)  gl_->glDeleteBuffers(1, &it->second.ebo);
+        if (it->second.ssbo) gl_->glDeleteBuffers(1, &it->second.ssbo);
         models_gpu_.erase(it);
     }
-    model_bvhs_.erase(model_id);
-}
-
-std::vector<uint32_t> ViewportWindow::readbackEbo(uint32_t model_id) const {
-    std::vector<uint32_t> ebo_data;
-    auto it = models_gpu_.find(model_id);
-    if (!gl_ || it == models_gpu_.end() || it->second.ebo_used == 0) return ebo_data;
-
-    const auto& m = it->second;
-    size_t num_indices = m.ebo_used / sizeof(uint32_t);
-    ebo_data.resize(num_indices);
-    gl_->glGetNamedBufferSubData(m.ebo, 0, m.ebo_used, ebo_data.data());
-    return ebo_data;
-}
-
-std::vector<float> ViewportWindow::readbackVbo(uint32_t model_id) const {
-    std::vector<float> vbo_data;
-    auto it = models_gpu_.find(model_id);
-    if (!gl_ || it == models_gpu_.end() || it->second.vbo_used == 0) return vbo_data;
-
-    const auto& m = it->second;
-    size_t num_floats = m.vbo_used / sizeof(float);
-    vbo_data.resize(num_floats);
-    gl_->glGetNamedBufferSubData(m.vbo, 0, m.vbo_used, vbo_data.data());
-    return vbo_data;
-}
-
-void ViewportWindow::buildBvhAsync(uint32_t model_id,
-                                   const std::string& ifc_path,
-                                   uint64_t ifc_file_size,
-                                   std::vector<PackedElementInfo> sidecar_elements,
-                                   std::string sidecar_string_table) {
-    if (bvh_build_thread_.joinable())
-        bvh_build_thread_.join();
-
-    auto it = models_gpu_.find(model_id);
-    if (it == models_gpu_.end()) return;
-
-    // Snapshot draw info; read back EBO + VBO on GL thread.
-    std::vector<ObjectDrawInfo> draw_snapshot = it->second.draw_info;
-    std::vector<uint32_t> ebo_snapshot = readbackEbo(model_id);
-    std::vector<float> vbo_snapshot;
-    if (!ifc_path.empty() && !sidecar_elements.empty()) {
-        vbo_snapshot = readbackVbo(model_id);
-    }
-
-    if (draw_snapshot.empty() || ebo_snapshot.empty()) return;
-
-    bvh_build_thread_ = std::thread([this,
-                                     model_id,
-                                     draw_info = std::move(draw_snapshot),
-                                     ebo_data = std::move(ebo_snapshot),
-                                     vbo_data = std::move(vbo_snapshot),
-                                     elements = std::move(sidecar_elements),
-                                     string_table = std::move(sidecar_string_table),
-                                     ifc_path, ifc_file_size]() {
-        auto bvh_set = buildBvhSet(draw_info);
-
-        EboReorderResult ebo_result = reorderEbo(*bvh_set, draw_info, ebo_data);
-
-        // Write full sidecar if requested.
-        if (!ifc_path.empty() && !elements.empty() && !vbo_data.empty()) {
-            SidecarData sd;
-            sd.vertices = vbo_data;
-            sd.indices = ebo_result.reordered_ebo;
-            sd.draw_info = ebo_result.reordered_draw_info;
-            sd.elements = std::move(elements);
-            sd.string_table = std::move(string_table);
-            sd.bvh_set = bvh_set;
-            writeSidecar(ifc_path, sd, ifc_file_size);
-        }
-
-        {
-            std::lock_guard<std::mutex> lock(bvh_result_mutex_);
-            pending_bvh_ = std::make_unique<PendingBvh>();
-            pending_bvh_->model_id = model_id;
-            pending_bvh_->bvh_set = std::move(bvh_set);
-            pending_bvh_->ebo_reorder = std::move(ebo_result);
-        }
-    });
-}
-
-void ViewportWindow::applyBvhResult() {
-    std::unique_ptr<PendingBvh> result;
-    {
-        std::lock_guard<std::mutex> lock(bvh_result_mutex_);
-        result = std::move(pending_bvh_);
-    }
-    if (!result) return;
-
-    auto it = models_gpu_.find(result->model_id);
-    if (it == models_gpu_.end()) return;
-
-    auto& mgpu = it->second;
-
-    // Re-upload the reordered EBO into this model's buffer.
-    if (!result->ebo_reorder.reordered_ebo.empty()) {
-        size_t ebo_bytes = result->ebo_reorder.reordered_ebo.size() * sizeof(uint32_t);
-        if (ebo_bytes <= mgpu.ebo_capacity) {
-            gl_->glNamedBufferSubData(mgpu.ebo, 0, ebo_bytes,
-                                      result->ebo_reorder.reordered_ebo.data());
-        }
-    }
-
-    // Swap draw info.
-    if (result->ebo_reorder.reordered_draw_info.size() == mgpu.draw_info.size()) {
-        mgpu.draw_info = std::move(result->ebo_reorder.reordered_draw_info);
-    }
-
-    model_bvhs_[result->model_id] = std::move(result->bvh_set);
-
-    qDebug("BVH activated for model %u", result->model_id);
 }
 
-void ViewportWindow::setSelectedObjectId(uint32_t id) {
-    selected_object_id_ = id;
-}
+void ViewportWindow::setSelectedObjectId(uint32_t id) { selected_object_id_ = id; }
 
 uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     if (!gl_initialized_) return 0;
-
     context_->makeCurrent(this);
 
     int w = width() * devicePixelRatio();
     int h = height() * devicePixelRatio();
-
     if (pick_width_ != w || pick_height_ != h) {
         if (pick_fbo_) gl_->glDeleteFramebuffers(1, &pick_fbo_);
         if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
         if (pick_depth_rbo_) gl_->glDeleteRenderbuffers(1, &pick_depth_rbo_);
-
         gl_->glCreateFramebuffers(1, &pick_fbo_);
-
         gl_->glCreateTextures(GL_TEXTURE_2D, 1, &pick_color_tex_);
         gl_->glTextureStorage2D(pick_color_tex_, 1, GL_R32UI, w, h);
         gl_->glNamedFramebufferTexture(pick_fbo_, GL_COLOR_ATTACHMENT0, pick_color_tex_, 0);
-
         gl_->glCreateRenderbuffers(1, &pick_depth_rbo_);
         gl_->glNamedRenderbufferStorage(pick_depth_rbo_, GL_DEPTH_COMPONENT24, w, h);
         gl_->glNamedFramebufferRenderbuffer(pick_fbo_, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, pick_depth_rbo_);
-
         pick_width_ = w;
         pick_height_ = h;
     }
@@ -785,163 +587,32 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     int px = x * devicePixelRatio();
     int py = (height() - y) * devicePixelRatio();
     uint32_t pixel = 0;
-    gl_->glGetTextureSubImage(pick_color_tex_, 0, px, py, 0, 1, 1, 1, GL_RED_INTEGER, GL_UNSIGNED_INT, sizeof(pixel), &pixel);
-
+    gl_->glGetTextureSubImage(pick_color_tex_, 0, px, py, 0, 1, 1, 1,
+                              GL_RED_INTEGER, GL_UNSIGNED_INT, sizeof(pixel), &pixel);
     return pixel;
 }
 
 void ViewportWindow::updateCamera() {
     float yaw_rad = qDegreesToRadians(camera_yaw_);
     float pitch_rad = qDegreesToRadians(camera_pitch_);
-
     QVector3D eye;
     eye.setX(camera_target_.x() + camera_distance_ * cosf(pitch_rad) * cosf(yaw_rad));
     eye.setY(camera_target_.y() + camera_distance_ * cosf(pitch_rad) * sinf(yaw_rad));
     eye.setZ(camera_target_.z() + camera_distance_ * sinf(pitch_rad));
-
     view_matrix_.setToIdentity();
     view_matrix_.lookAt(eye, camera_target_, QVector3D(0, 0, 1));
-
     proj_matrix_.setToIdentity();
     float aspect = width() > 0 ? float(width()) / float(height()) : 1.0f;
     proj_matrix_.perspective(45.0f, aspect, 0.1f, camera_distance_ * 10.0f);
 }
 
-bool ViewportWindow::aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
-                                   const float planes[6][4]) {
-    for (int p = 0; p < 6; ++p) {
-        float px = planes[p][0] >= 0.0f ? aabb_max[0] : aabb_min[0];
-        float py = planes[p][1] >= 0.0f ? aabb_max[1] : aabb_min[1];
-        float pz = planes[p][2] >= 0.0f ? aabb_max[2] : aabb_min[2];
-        float dist = planes[p][0] * px + planes[p][1] * py + planes[p][2] * pz + planes[p][3];
-        if (dist < 0.0f) return false;
-    }
-    return true;
-}
-
-void ViewportWindow::traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
-                                 const float planes[6][4]) {
-    if (mbvh.nodes.empty()) return;
-
-    uint32_t stack[64];
-    int sp = 0;
-    stack[sp++] = 0;  // root
-
-    // Get the current model's draw command being built.
-    auto& cmd = frame_draw_cmds_.back();
-
-    while (sp > 0) {
-        uint32_t ni = stack[--sp];
-        const BvhNode& node = mbvh.nodes[ni];
-
-        if (!aabbInFrustum(node.aabb_min, node.aabb_max, planes))
-            continue;
-
-        if (node.count > 0) {
-            // Leaf-batched draw: after reorderEbo, a leaf's objects occupy a
-            // contiguous EBO range. Emit one draw command covering all of them
-            // instead of N per-object tests/draws. The leaf AABB test above is
-            // already a conservative cull; any overdraw (up to BVH_MAX_LEAF_SIZE
-            // objects that may be fully outside the frustum but inside the leaf
-            // AABB) costs far less than the per-draw CPU/driver overhead we save.
-            uint32_t first_oi = mbvh.object_indices[node.right_or_first];
-            const auto& first_obj = mgpu.draw_info[first_oi];
-            uint32_t leaf_offset = first_obj.index_offset;
-            uint32_t leaf_count = 0;
-            for (uint32_t i = 0; i < node.count; ++i) {
-                uint32_t oi = mbvh.object_indices[node.right_or_first + i];
-                leaf_count += mgpu.draw_info[oi].index_count;
-            }
-            cmd.counts.push_back(static_cast<GLsizei>(leaf_count));
-            cmd.offsets.push_back(reinterpret_cast<const void*>(
-                static_cast<uintptr_t>(leaf_offset)));
-            visible_triangles_ += leaf_count / 3;
-            visible_objects_ += node.count;
-        } else {
-            if (sp < 63) {
-                stack[sp++] = node.right_or_first;
-                stack[sp++] = ni + 1;
-            }
-        }
-    }
-}
-
-void ViewportWindow::buildVisibleList(const QMatrix4x4& vp) {
-    frame_draw_cmds_.clear();
-    visible_triangles_ = 0;
-    visible_objects_ = 0;
-
-    // Extract 6 frustum planes from the view-projection matrix.
-    float planes[6][4];
-    for (int i = 0; i < 4; ++i) {
-        planes[0][i] = vp(3, i) + vp(0, i);  // left
-        planes[1][i] = vp(3, i) - vp(0, i);  // right
-        planes[2][i] = vp(3, i) + vp(1, i);  // bottom
-        planes[3][i] = vp(3, i) - vp(1, i);  // top
-        planes[4][i] = vp(3, i) + vp(2, i);  // near
-        planes[5][i] = vp(3, i) - vp(2, i);  // far
-    }
-    for (int p = 0; p < 6; ++p) {
-        float len = std::sqrt(planes[p][0] * planes[p][0] +
-                              planes[p][1] * planes[p][1] +
-                              planes[p][2] * planes[p][2]);
-        if (len > 0.0f) {
-            float inv = 1.0f / len;
-            planes[p][0] *= inv;
-            planes[p][1] *= inv;
-            planes[p][2] *= inv;
-            planes[p][3] *= inv;
-        }
-    }
-
-    for (auto& [model_id, mgpu] : models_gpu_) {
-        if (mgpu.hidden || mgpu.active_draw_count == 0) continue;
-
-        frame_draw_cmds_.push_back({mgpu.vao, {}, {}});
-        auto& cmd = frame_draw_cmds_.back();
-        cmd.counts.reserve(mgpu.active_draw_count);
-        cmd.offsets.reserve(mgpu.active_draw_count);
-
-        bool fully_loaded = (mgpu.active_draw_count == mgpu.draw_info.size());
-        auto bvh_it = model_bvhs_.find(model_id);
-
-        // Only use BVH if model is fully uploaded; during progressive upload,
-        // fall back to linear scan of active objects.
-        if (fully_loaded && bvh_it != model_bvhs_.end() && bvh_it->second) {
-            const auto& bvh_set = *bvh_it->second;
-            auto mbvh_it = bvh_set.models.find(model_id);
-            if (mbvh_it != bvh_set.models.end()) {
-                traverseBvh(mbvh_it->second, mgpu, planes);
-            }
-        } else {
-            // Linear scan of active objects only.
-            for (uint32_t i = 0; i < mgpu.active_draw_count; ++i) {
-                const auto& obj = mgpu.draw_info[i];
-                if (aabbInFrustum(obj.aabb_min, obj.aabb_max, planes)) {
-                    cmd.counts.push_back(static_cast<GLsizei>(obj.index_count));
-                    cmd.offsets.push_back(reinterpret_cast<const void*>(
-                        static_cast<uintptr_t>(obj.index_offset)));
-                    visible_triangles_ += obj.index_count / 3;
-                    visible_objects_++;
-                }
-            }
-        }
-
-        if (cmd.counts.empty()) {
-            frame_draw_cmds_.pop_back();
-        }
-    }
-}
-
 void ViewportWindow::render() {
     if (!gl_initialized_ || !isExposed()) return;
 
     context_->makeCurrent(this);
-    applyBvhResult();
-    processPendingUploads();
     updateCamera();
 
-    int w = width() * devicePixelRatio();
+    int w = width()  * devicePixelRatio();
     int h = height() * devicePixelRatio();
     gl_->glViewport(0, 0, w, h);
     gl_->glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
@@ -949,24 +620,43 @@ void ViewportWindow::render() {
     QMatrix4x4 vp = proj_matrix_ * view_matrix_;
 
     gl_->glUseProgram(main_program_);
-    gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(main_program_, "u_view_projection"), 1, GL_FALSE, vp.constData());
-    gl_->glUniform3f(gl_->glGetUniformLocation(main_program_, "u_light_dir"), 0.3f, 0.5f, 0.8f);
-    gl_->glUniform1ui(gl_->glGetUniformLocation(main_program_, "u_selected_id"), selected_object_id_);
-
-    buildVisibleList(vp);
-    for (const auto& cmd : frame_draw_cmds_) {
-        gl_->glBindVertexArray(cmd.vao);
-        gl_->glMultiDrawElements(GL_TRIANGLES,
-            cmd.counts.data(), GL_UNSIGNED_INT,
-            cmd.offsets.data(),
-            static_cast<GLsizei>(cmd.counts.size()));
+    GLint u_vp        = gl_->glGetUniformLocation(main_program_, "u_view_projection");
+    GLint u_light     = gl_->glGetUniformLocation(main_program_, "u_light_dir");
+    GLint u_sel       = gl_->glGetUniformLocation(main_program_, "u_selected_id");
+    GLint u_inst_off  = gl_->glGetUniformLocation(main_program_, "u_instance_offset");
+    gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
+    gl_->glUniform3f(u_light, 0.3f, 0.5f, 0.8f);
+    gl_->glUniform1ui(u_sel, selected_object_id_);
+
+    visible_triangles_ = 0;
+    visible_objects_ = 0;
+    instanced_draws_ = 0;
+
+    for (auto& [model_id, m] : models_gpu_) {
+        if (m.hidden || !m.finalized || !m.ssbo) continue;
+        gl_->glBindVertexArray(m.vao);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
+
+        for (const auto& mesh : m.meshes) {
+            if (mesh.instance_count == 0 || mesh.index_count == 0) continue;
+            gl_->glUniform1ui(u_inst_off, mesh.first_instance);
+            gl_->glDrawElementsInstancedBaseVertex(
+                GL_TRIANGLES,
+                static_cast<GLsizei>(mesh.index_count),
+                GL_UNSIGNED_INT,
+                reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
+                static_cast<GLsizei>(mesh.instance_count),
+                static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
+            visible_triangles_ += (mesh.index_count / 3) * mesh.instance_count;
+            visible_objects_   += mesh.instance_count;
+            ++instanced_draws_;
+        }
     }
 
     renderAxisGizmo();
 
     context_->swapBuffers(this);
 
-    // Compute FPS.
     float dt = frame_clock_.restart() / 1000.0f;
     accumulated_time_ += dt;
     frame_count_++;
@@ -975,21 +665,18 @@ void ViewportWindow::render() {
         frame_count_ = 0;
         accumulated_time_ = 0.0f;
 
-        uint32_t total_obj = 0, total_tri = 0;
-        size_t total_vram = 0, total_vbo = 0, total_ebo = 0;
+        uint32_t total_obj = 0, total_tri = 0, total_meshes = 0;
+        size_t total_vbo = 0, total_ebo = 0, total_ssbo = 0;
         size_t num_models = 0, num_hidden = 0;
-        size_t total_leaf_draws = 0;
-        for (const auto& [mid, m] : models_gpu_) {
+        for (const auto& [mid, mm] : models_gpu_) {
             num_models++;
-            if (m.hidden) { num_hidden++; continue; }
-            total_obj += static_cast<uint32_t>(m.draw_info.size());
-            total_tri += m.total_triangles;
-            total_vbo += m.vbo_capacity;
-            total_ebo += m.ebo_capacity;
-        }
-        total_vram = total_vbo + total_ebo;
-        for (const auto& cmd : frame_draw_cmds_) {
-            total_leaf_draws += cmd.counts.size();
+            if (mm.hidden || !mm.finalized) { num_hidden++; continue; }
+            total_obj += static_cast<uint32_t>(mm.instances.size());
+            total_tri += mm.total_triangles;
+            total_meshes += static_cast<uint32_t>(mm.meshes.size());
+            total_vbo += mm.vbo_capacity;
+            total_ebo += mm.ebo_capacity;
+            total_ssbo += mm.ssbo_instance_count * sizeof(InstanceGpu);
         }
 
         FrameStats stats;
@@ -999,112 +686,95 @@ void ViewportWindow::render() {
         stats.visible_objects = visible_objects_;
         stats.total_triangles = total_tri;
         stats.visible_triangles = visible_triangles_;
+        stats.unique_meshes = total_meshes;
+        stats.instanced_draws = instanced_draws_;
         emit frameStatsUpdated(stats);
 
-        double vis_obj_pct = total_obj > 0 ? 100.0 * visible_objects_ / total_obj : 0.0;
-        double vis_tri_pct = total_tri > 0 ? 100.0 * visible_triangles_ / total_tri : 0.0;
-        qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u (%.1f%%)  tri %u/%u (%.1f%%)  "
-               "vram %.1f MB (vbo %.1f + ebo %.1f)  models %zu (%zu hidden)  "
-               "leaf_draws %zu  model_draws %zu  pending_uploads %zu",
+        qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
+               "meshes %u  inst_draws %u  "
+               "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
-               visible_objects_, total_obj, vis_obj_pct,
-               visible_triangles_, total_tri, vis_tri_pct,
-               total_vram / (1024.0 * 1024.0),
-               total_vbo / (1024.0 * 1024.0),
-               total_ebo / (1024.0 * 1024.0),
-               num_models, num_hidden,
-               total_leaf_draws,
-               frame_draw_cmds_.size(),
-               pending_uploads_.size());
+               visible_objects_, total_obj,
+               visible_triangles_, total_tri,
+               total_meshes, instanced_draws_,
+               (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
+               total_vbo / (1024.0*1024.0),
+               total_ebo / (1024.0*1024.0),
+               total_ssbo / (1024.0*1024.0),
+               num_models, num_hidden);
+    }
+}
+
+void ViewportWindow::renderPickPass() {
+    gl_->glBindFramebuffer(GL_FRAMEBUFFER, pick_fbo_);
+    gl_->glViewport(0, 0, pick_width_, pick_height_);
+    GLuint clear_val = 0;
+    gl_->glClearBufferuiv(GL_COLOR, 0, &clear_val);
+    gl_->glClear(GL_DEPTH_BUFFER_BIT);
+
+    QMatrix4x4 vp = proj_matrix_ * view_matrix_;
+    gl_->glUseProgram(pick_program_);
+    GLint u_vp       = gl_->glGetUniformLocation(pick_program_, "u_view_projection");
+    GLint u_inst_off = gl_->glGetUniformLocation(pick_program_, "u_instance_offset");
+    gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
+
+    for (auto& [model_id, m] : models_gpu_) {
+        if (m.hidden || !m.finalized || !m.ssbo) continue;
+        gl_->glBindVertexArray(m.vao);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
+        for (const auto& mesh : m.meshes) {
+            if (mesh.instance_count == 0 || mesh.index_count == 0) continue;
+            gl_->glUniform1ui(u_inst_off, mesh.first_instance);
+            gl_->glDrawElementsInstancedBaseVertex(
+                GL_TRIANGLES,
+                static_cast<GLsizei>(mesh.index_count),
+                GL_UNSIGNED_INT,
+                reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
+                static_cast<GLsizei>(mesh.instance_count),
+                static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
+        }
     }
+    gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
 }
 
 void ViewportWindow::renderAxisGizmo() {
     if (!axis_program_ || !axis_vao_) return;
-
     const int dpr = devicePixelRatio();
     const int gizmo_size = 110 * dpr;
     const int margin = 10 * dpr;
-
     gl_->glViewport(margin, margin, gizmo_size, gizmo_size);
     gl_->glDisable(GL_DEPTH_TEST);
 
     float yaw_rad = qDegreesToRadians(camera_yaw_);
     float pitch_rad = qDegreesToRadians(camera_pitch_);
-
-    QVector3D eye_dir;
-    eye_dir.setX(cosf(pitch_rad) * cosf(yaw_rad));
-    eye_dir.setY(cosf(pitch_rad) * sinf(yaw_rad));
-    eye_dir.setZ(sinf(pitch_rad));
-
-    QMatrix4x4 gizmo_view;
-    gizmo_view.lookAt(eye_dir * 3.0f, QVector3D(0, 0, 0), QVector3D(0, 0, 1));
-
-    QMatrix4x4 gizmo_proj;
-    gizmo_proj.ortho(-1.4f, 1.4f, -1.4f, 1.4f, 0.1f, 10.0f);
-
-    QMatrix4x4 mvp = gizmo_proj * gizmo_view;
+    QVector3D eye_dir(cosf(pitch_rad) * cosf(yaw_rad),
+                      cosf(pitch_rad) * sinf(yaw_rad),
+                      sinf(pitch_rad));
+    QMatrix4x4 gv; gv.lookAt(eye_dir * 3.0f, QVector3D(0,0,0), QVector3D(0,0,1));
+    QMatrix4x4 gp; gp.ortho(-1.4f, 1.4f, -1.4f, 1.4f, 0.1f, 10.0f);
+    QMatrix4x4 mvp = gp * gv;
 
     gl_->glUseProgram(axis_program_);
     gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(axis_program_, "u_mvp"), 1, GL_FALSE, mvp.constData());
-
     gl_->glLineWidth(2.5f);
     gl_->glBindVertexArray(axis_vao_);
     gl_->glDrawArrays(GL_LINES, 0, 6);
-
     gl_->glEnable(GL_DEPTH_TEST);
 }
 
-void ViewportWindow::renderPickPass() {
-    gl_->glBindFramebuffer(GL_FRAMEBUFFER, pick_fbo_);
-    gl_->glViewport(0, 0, pick_width_, pick_height_);
-
-    GLuint clear_val = 0;
-    gl_->glClearBufferuiv(GL_COLOR, 0, &clear_val);
-    gl_->glClear(GL_DEPTH_BUFFER_BIT);
-
-    QMatrix4x4 vp = proj_matrix_ * view_matrix_;
-    gl_->glUseProgram(pick_program_);
-    gl_->glUniformMatrix4fv(gl_->glGetUniformLocation(pick_program_, "u_view_projection"), 1, GL_FALSE, vp.constData());
-
-    // Reuse the visible list from the most recent render() call.
-    for (const auto& cmd : frame_draw_cmds_) {
-        gl_->glBindVertexArray(cmd.vao);
-        gl_->glMultiDrawElements(GL_TRIANGLES,
-            cmd.counts.data(), GL_UNSIGNED_INT,
-            cmd.offsets.data(),
-            static_cast<GLsizei>(cmd.counts.size()));
-    }
-
-    gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
-}
-
 void ViewportWindow::exposeEvent(QExposeEvent*) {
-    if (isExposed() && !gl_initialized_) {
-        initGL();
-    }
+    if (isExposed() && !gl_initialized_) initGL();
 }
-
 void ViewportWindow::resizeEvent(QResizeEvent*) {
     if (gl_initialized_) render();
 }
-
 bool ViewportWindow::event(QEvent* e) {
     switch (e->type()) {
-    case QEvent::MouseButtonPress:
-        handleMousePress(static_cast<QMouseEvent*>(e));
-        return true;
-    case QEvent::MouseButtonRelease:
-        handleMouseRelease(static_cast<QMouseEvent*>(e));
-        return true;
-    case QEvent::MouseMove:
-        handleMouseMove(static_cast<QMouseEvent*>(e));
-        return true;
-    case QEvent::Wheel:
-        handleWheel(static_cast<QWheelEvent*>(e));
-        return true;
-    default:
-        return QWindow::event(e);
+    case QEvent::MouseButtonPress:   handleMousePress(static_cast<QMouseEvent*>(e));   return true;
+    case QEvent::MouseButtonRelease: handleMouseRelease(static_cast<QMouseEvent*>(e)); return true;
+    case QEvent::MouseMove:          handleMouseMove(static_cast<QMouseEvent*>(e));    return true;
+    case QEvent::Wheel:              handleWheel(static_cast<QWheelEvent*>(e));        return true;
+    default: return QWindow::event(e);
     }
 }
 
@@ -1112,7 +782,6 @@ void ViewportWindow::handleMousePress(QMouseEvent* e) {
     active_button_ = e->button();
     last_mouse_pos_ = e->pos();
 }
-
 void ViewportWindow::handleMouseRelease(QMouseEvent* e) {
     if (active_button_ == Qt::LeftButton && (e->pos() - last_mouse_pos_).manhattanLength() < 5) {
         uint32_t id = pickObjectAt(e->pos().x(), e->pos().y());
@@ -1121,21 +790,18 @@ void ViewportWindow::handleMouseRelease(QMouseEvent* e) {
     }
     active_button_ = Qt::NoButton;
 }
-
 void ViewportWindow::handleMouseMove(QMouseEvent* e) {
     QPoint delta = e->pos() - last_mouse_pos_;
     last_mouse_pos_ = e->pos();
-
     if (active_button_ == Qt::MiddleButton) {
         if (e->modifiers() & Qt::ShiftModifier) {
             float pan_speed = camera_distance_ * 0.002f;
             float yaw_rad = qDegreesToRadians(camera_yaw_);
             float pitch_rad = qDegreesToRadians(camera_pitch_);
             QVector3D right(-sinf(yaw_rad), cosf(yaw_rad), 0.0f);
-            QVector3D up(
-                -sinf(pitch_rad) * cosf(yaw_rad),
-                -sinf(pitch_rad) * sinf(yaw_rad),
-                 cosf(pitch_rad));
+            QVector3D up(-sinf(pitch_rad) * cosf(yaw_rad),
+                         -sinf(pitch_rad) * sinf(yaw_rad),
+                          cosf(pitch_rad));
             camera_target_ -= right * delta.x() * pan_speed;
             camera_target_ += up * delta.y() * pan_speed;
         } else {
@@ -1145,7 +811,6 @@ void ViewportWindow::handleMouseMove(QMouseEvent* e) {
         }
     }
 }
-
 void ViewportWindow::handleWheel(QWheelEvent* e) {
     float factor = e->angleDelta().y() > 0 ? 0.9f : 1.1f;
     camera_distance_ *= factor;
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 97925e6e2e3..9fbdcf054b0 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -28,58 +28,43 @@
 #include <QMatrix4x4>
 #include <QVector3D>
 
-#include <deque>
 #include <vector>
-#include <unordered_set>
 #include <unordered_map>
 #include <cstdint>
 #include <mutex>
-#include <thread>
 #include <memory>
-#include <atomic>
 
-#include "BvhAccel.h"
+#include "InstancedGeometry.h"
 #include "SidecarCache.h"
 
-struct MaterialInfo {
-    float r = 0.75f, g = 0.75f, b = 0.78f, a = 1.0f;
-};
-
-struct UploadChunk {
-    // Interleaved per-vertex layout (8 floats / 32 bytes per vertex):
-    //   pos(3 float) + normal(3 float) + object_id(1 float bitcast from uint)
-    //   + color(1 float holding RGBA8 packed bytes, read on the GPU as
-    //   GL_UNSIGNED_BYTE * 4 normalized).
-    std::vector<float> vertices;
-    std::vector<uint32_t> indices; // local to this chunk's vertices
-    uint32_t object_id = 0;
-    uint32_t model_id = 0;
-};
-
-// Per-model GPU state: own VAO, VBO, EBO, draw info, BVH.
+// Per-model GPU state for the instanced render path.
+//
+//   VBO: local-coord interleaved verts (pos3 + normal3 + color1_packed) — 28 B.
+//   EBO: mesh-local indices (uint32).
+//   meshes[]: per-unique-representation metadata; indexed by local_mesh_id.
+//   instances[]: CPU-side per-instance records; sorted by mesh_id at finalize.
+//   ssbo: InstanceGpu[]; populated at finalize.
+//
+// A model is drawable once `finalized == true`.
 struct ModelGpuData {
     GLuint vao = 0;
     GLuint vbo = 0;
     GLuint ebo = 0;
+    GLuint ssbo = 0;
+
     size_t vbo_capacity = 0;
     size_t ebo_capacity = 0;
-    size_t vbo_used = 0;   // bytes
-    size_t ebo_used = 0;   // bytes
-    uint32_t vertex_count = 0;
+    size_t vbo_used = 0;
+    size_t ebo_used = 0;
+    uint32_t vertex_count = 0;      // total (across all meshes)
     uint32_t total_triangles = 0;
-    std::vector<ObjectDrawInfo> draw_info;
-    uint32_t active_draw_count = 0; // how many objects are drawable (progressive upload)
-    bool hidden = false;
-};
 
-// Pending progressive upload — VBO first, then EBO.
-struct PendingUpload {
-    uint32_t model_id = 0;
-    std::vector<float> vertices;
-    std::vector<uint32_t> indices;
-    std::shared_ptr<BvhSet> bvh_set;
-    size_t vbo_uploaded = 0;  // bytes
-    size_t ebo_uploaded = 0;  // bytes
+    std::vector<MeshInfo>    meshes;
+    std::vector<InstanceCpu> instances;    // unsorted until finalize
+    uint32_t                 ssbo_instance_count = 0;
+
+    bool finalized = false;
+    bool hidden    = false;
 };
 
 class ViewportWindow : public QWindow {
@@ -88,32 +73,21 @@ class ViewportWindow : public QWindow {
     explicit ViewportWindow(QWindow* parent = nullptr);
     ~ViewportWindow();
 
-    void uploadChunk(const UploadChunk& chunk);
-    void resetScene();
+    // Streaming ingress.
+    void uploadMeshChunk(const MeshChunk& chunk);
+    void uploadInstanceChunk(const InstanceChunk& chunk);
 
-    // Bulk upload pre-built geometry from a sidecar cache.
-    // Creates a perfectly-sized per-model buffer set. No copy.
-    void uploadBulk(uint32_t model_id,
-                    std::vector<float> vertices,
-                    std::vector<uint32_t> indices,
-                    const std::vector<ObjectDrawInfo>& draw_info,
-                    std::shared_ptr<BvhSet> bvh_set);
+    // Called once all chunks for a model have arrived: sorts instances by
+    // mesh_id, assigns each mesh its contiguous range, and uploads the
+    // instance SSBO. The model becomes drawable.
+    void finalizeModel(uint32_t model_id);
+
+    void resetScene();
 
     void hideModel(uint32_t model_id);
     void showModel(uint32_t model_id);
     void removeModel(uint32_t model_id);
 
-    // Build BVH and optionally write a sidecar cache.
-    void buildBvhAsync(uint32_t model_id,
-                       const std::string& ifc_path = "",
-                       uint64_t ifc_file_size = 0,
-                       std::vector<PackedElementInfo> sidecar_elements = {},
-                       std::string sidecar_string_table = {});
-
-    // Read snapshots of a model's GPU buffers into CPU vectors.
-    std::vector<uint32_t> readbackEbo(uint32_t model_id) const;
-    std::vector<float> readbackVbo(uint32_t model_id) const;
-
     void setSelectedObjectId(uint32_t id);
     uint32_t pickObjectAt(int x, int y);
 
@@ -124,6 +98,8 @@ class ViewportWindow : public QWindow {
         uint32_t visible_objects;
         uint32_t total_triangles;
         uint32_t visible_triangles;
+        uint32_t unique_meshes;
+        uint32_t instanced_draws;
     };
 
 signals:
@@ -147,13 +123,7 @@ class ViewportWindow : public QWindow {
     void setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo);
     bool growModelVbo(ModelGpuData& m, size_t needed_total);
     bool growModelEbo(ModelGpuData& m, size_t needed_total);
-    void buildVisibleList(const QMatrix4x4& vp);
-    void traverseBvh(const ModelBvh& mbvh, const ModelGpuData& mgpu,
-                     const float planes[6][4]);
-    static bool aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
-                              const float planes[6][4]);
-    void applyBvhResult();
-    void processPendingUploads();
+    ModelGpuData& getOrCreateModel(uint32_t model_id);
 
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
@@ -172,13 +142,12 @@ class ViewportWindow : public QWindow {
     GLuint pick_program_ = 0;
     GLuint axis_program_ = 0;
 
-    // Axis gizmo (separate VAO/VBO since vertex layout differs from scene)
+    // Axis gizmo
     GLuint axis_vao_ = 0;
     GLuint axis_vbo_ = 0;
 
     // Per-model GPU data
     std::unordered_map<uint32_t, ModelGpuData> models_gpu_;
-    std::mutex models_mutex_;
 
     // Pick framebuffer
     GLuint pick_fbo_ = 0;
@@ -187,21 +156,10 @@ class ViewportWindow : public QWindow {
     int pick_width_ = 0;
     int pick_height_ = 0;
 
-    // Per-model BVH
-    std::unordered_map<uint32_t, std::shared_ptr<const BvhSet>> model_bvhs_;
-
-    // Progressive upload queue
-    std::deque<PendingUpload> pending_uploads_;
-
-    // Scratch buffers reused each frame to avoid allocation.
-    struct ModelDrawCmd {
-        GLuint vao;
-        std::vector<GLsizei> counts;
-        std::vector<const void*> offsets;
-    };
-    std::vector<ModelDrawCmd> frame_draw_cmds_;
+    // Per-frame stats
     uint32_t visible_triangles_ = 0;
     uint32_t visible_objects_ = 0;
+    uint32_t instanced_draws_ = 0;
 
     // Camera
     QVector3D camera_target_{0, 0, 0};
@@ -211,26 +169,14 @@ class ViewportWindow : public QWindow {
     QMatrix4x4 view_matrix_;
     QMatrix4x4 proj_matrix_;
 
-    // Mouse state
+    // Mouse
     Qt::MouseButton active_button_ = Qt::NoButton;
     QPoint last_mouse_pos_;
 
     // Selection
     uint32_t selected_object_id_ = 0;
-    bool pick_requested_ = false;
-    int pick_x_ = 0, pick_y_ = 0;
-
-    // BVH build (phase 2)
-    struct PendingBvh {
-        uint32_t model_id;
-        std::shared_ptr<BvhSet> bvh_set;
-        EboReorderResult ebo_reorder;
-    };
-    std::unique_ptr<PendingBvh> pending_bvh_;
-    std::mutex bvh_result_mutex_;
-    std::thread bvh_build_thread_;
 
-    // Stats
+    // FPS smoothing
     int frame_count_ = 0;
     float accumulated_time_ = 0.0f;
     float last_fps_ = 0.0f;

From ea640e2f691bb16a14799fb25a902791440ac97c Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 20:10:20 +1000
Subject: [PATCH 15/37] Sidecar v4: persist instanced geometry + metadata

Commit B of the instancing migration.  The sidecar on-disk format is
reintroduced at version 4 with MeshInfo + InstanceCpu sections in place
of v3's flat per-object draw-info array.

After streaming finishes, MainWindow asks the viewport for a
post-finalise snapshot (VBO + EBO are read back from the GPU; meshes
and instances come from the CPU-side arrays) and writes it alongside
the PackedElementInfo array and the string table.  On a subsequent
load, readSidecar rehydrates the whole struct and
ViewportWindow::applyCachedModel uploads VBO/EBO/SSBO in a single
step, bypassing the iterator entirely.

The staleness check still compares only the source file's size.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/MainWindow.cpp     |  92 ++++++++++++++++++++++--
 src/ifcviewer/SidecarCache.cpp   | 118 ++++++++++++++++++++++++++++---
 src/ifcviewer/ViewportWindow.cpp |  97 +++++++++++++++++++++++++
 src/ifcviewer/ViewportWindow.h   |  10 +++
 4 files changed, 300 insertions(+), 17 deletions(-)

diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 86a787a0e26..ceeedc8cbd4 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -210,7 +210,7 @@ void MainWindow::startNextLoad() {
         qDebug("  Sidecar read: %lld ms (%s)", rt.elapsed(), ifc_path.c_str());
         auto result = std::make_shared<std::optional<SidecarData>>(std::move(cached));
         QMetaObject::invokeMethod(this, [this, mid, result]() {
-            if (*result && !(*result)->meshes.empty()) {
+            if (*result && !(*result)->instances.empty()) {
                 applySidecarData(mid, std::move(**result));
             } else {
                 // No sidecar — fall back to streaming from IFC.
@@ -229,10 +229,54 @@ void MainWindow::startNextLoad() {
     });
 }
 
-void MainWindow::applySidecarData(ModelId /*mid*/, SidecarData /*data*/) {
-    // Commit A: readSidecar() always returns nullopt, so this is unreachable.
-    // Restored in Commit B along with the v4 on-disk format.
-    qWarning("applySidecarData called but sidecar is disabled in Commit A");
+void MainWindow::applySidecarData(ModelId mid, SidecarData data) {
+    auto it = models_.find(mid);
+    if (it == models_.end()) return;
+    auto& model = it->second;
+
+    qDebug("Sidecar hit: %s (%zu verts, %zu indices, %zu meshes, %zu instances, %zu elements)",
+           model.file_path.toStdString().c_str(),
+           data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS,
+           data.indices.size(),
+           data.meshes.size(),
+           data.instances.size(),
+           data.elements.size());
+
+    QElapsedTimer t;
+    t.start();
+
+    // Update next_object_id_ past all objects in this model before the
+    // extracted `elements` is moved out of `data`.
+    for (const auto& elem : data.elements) {
+        if (elem.object_id >= next_object_id_)
+            next_object_id_ = elem.object_id + 1;
+    }
+
+    // Hand off geometry to GPU in a single call.
+    std::vector<PackedElementInfo> elements = std::move(data.elements);
+    std::string stbl                        = std::move(data.string_table);
+    viewport_->applyCachedModel(mid, std::move(data));
+    qDebug("  GL upload: %lld ms", t.elapsed());
+
+    t.restart();
+    element_tree_->setUpdatesEnabled(false);
+    populateTreeFromSidecar(model, elements, stbl);
+    element_tree_->setUpdatesEnabled(true);
+    qDebug("  Tree build: %lld ms (%zu elements)", t.elapsed(), elements.size());
+
+    progress_bar_->setVisible(false);
+
+    qint64 ms = load_timer_.elapsed();
+    QString elapsed = (ms >= 1000)
+        ? QString::number(ms / 1000.0, 'f', 2) + " s"
+        : QString::number(ms) + " ms";
+    status_label_->setText(QString("%1 elements across %2 model(s) — loaded from cache in %3")
+        .arg(element_map_.size())
+        .arg(models_.size())
+        .arg(elapsed));
+
+    loading_model_id_ = 0;
+    QTimer::singleShot(0, this, &MainWindow::startNextLoad);
 }
 
 void MainWindow::populateTreeFromSidecar(ModelHandle& model,
@@ -320,10 +364,44 @@ void MainWindow::onStreamingFinished() {
         .arg(num_models)
         .arg(elapsed));
 
-    // Sort instances by mesh and upload the per-model instance SSBO.
-    // Sidecar write is stubbed in Commit A.
+    // Sort instances by mesh, upload the per-model instance SSBO, and
+    // persist a v4 sidecar for next load.
     if (loading_model_id_ != 0) {
         viewport_->finalizeModel(loading_model_id_);
+
+        auto it = models_.find(loading_model_id_);
+        if (it != models_.end()) {
+            SidecarData sd;
+            if (viewport_->snapshotModel(loading_model_id_, sd)) {
+                // Pack this model's element metadata + string table.
+                for (const auto& [oid, info] : element_map_) {
+                    if (info.model_id != loading_model_id_) continue;
+                    PackedElementInfo pe;
+                    pe.object_id = info.object_id;
+                    pe.model_id  = info.model_id;
+                    pe.ifc_id    = info.ifc_id;
+                    pe.parent_id = info.parent_id;
+                    pe.guid_offset = static_cast<uint32_t>(sd.string_table.size());
+                    pe.guid_length = static_cast<uint32_t>(info.guid.size());
+                    sd.string_table += info.guid;
+                    pe.name_offset = static_cast<uint32_t>(sd.string_table.size());
+                    pe.name_length = static_cast<uint32_t>(info.name.size());
+                    sd.string_table += info.name;
+                    pe.type_offset = static_cast<uint32_t>(sd.string_table.size());
+                    pe.type_length = static_cast<uint32_t>(info.type.size());
+                    sd.string_table += info.type;
+                    sd.elements.push_back(pe);
+                }
+
+                std::string ifc_path = it->second.file_path.toStdString();
+                uint64_t file_size = static_cast<uint64_t>(
+                    QFileInfo(it->second.file_path).size());
+                QElapsedTimer t; t.start();
+                bool ok = writeSidecar(ifc_path, sd, file_size);
+                qDebug("  Sidecar write: %lld ms (%s)",
+                       t.elapsed(), ok ? "ok" : "FAILED");
+            }
+        }
     }
 
     // Start next model if queued.
diff --git a/src/ifcviewer/SidecarCache.cpp b/src/ifcviewer/SidecarCache.cpp
index be19c8698f4..3c5ca9cd8d5 100644
--- a/src/ifcviewer/SidecarCache.cpp
+++ b/src/ifcviewer/SidecarCache.cpp
@@ -17,20 +17,118 @@
  *                                                                              *
  ********************************************************************************/
 
-// Commit A: sidecar cache is temporarily disabled.  The on-disk format is
-// being rewritten from v3 (monolithic world-coord geometry) to v4 (instanced
-// meshes + per-instance records).  Until v4 is finalised, loads always go
-// through the streaming path and writes are no-ops.
+// v4 layout (all multi-byte fields native-endian; endianness marker in header):
+//
+//   SidecarHeader (16 bytes)
+//   uint64_t  source_file_size
+//
+//   uint32_t  num_vertices_floats
+//   float[]   vertex data (28 B/vertex: pos3 + normal3 + color1_packed)
+//   uint32_t  num_indices
+//   uint32_t[] index data (mesh-local indices; base_vertex applied at draw time)
+//
+//   uint32_t  num_meshes
+//   MeshInfo[num_meshes]
+//
+//   uint32_t  num_instances
+//   InstanceCpu[num_instances]   (already sorted by mesh_id)
+//
+//   uint32_t  num_elements
+//   PackedElementInfo[num_elements]
+//   uint32_t  string_table_bytes
+//   char[string_table_bytes]
 
 #include "SidecarCache.h"
 
-bool writeSidecar(const std::string& /*ifc_path*/,
-                  const SidecarData& /*data*/,
-                  uint64_t /*ifc_file_size*/) {
+#include <cstdio>
+#include <cstring>
+
+struct SidecarHeader {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t endian;
+    uint32_t reserved;
+};
+
+static std::string sidecarPath(const std::string& ifc_path) {
+    return ifc_path + ".ifcview";
+}
+
+template<typename T>
+static bool writeVec(FILE* f, const std::vector<T>& v) {
+    uint32_t n = static_cast<uint32_t>(v.size());
+    if (fwrite(&n, 4, 1, f) != 1) return false;
+    if (n > 0 && fwrite(v.data(), sizeof(T), n, f) != n) return false;
+    return true;
+}
+
+template<typename T>
+static bool readVec(FILE* f, std::vector<T>& v) {
+    uint32_t n;
+    if (fread(&n, 4, 1, f) != 1) return false;
+    v.resize(n);
+    if (n > 0 && fread(v.data(), sizeof(T), n, f) != n) return false;
+    return true;
+}
+
+bool writeSidecar(const std::string& ifc_path,
+                  const SidecarData& data,
+                  uint64_t ifc_file_size) {
+    std::string path = sidecarPath(ifc_path);
+    FILE* f = fopen(path.c_str(), "wb");
+    if (!f) return false;
+
+    SidecarHeader hdr = { SIDECAR_MAGIC, SIDECAR_VERSION, SIDECAR_ENDIAN, 0 };
+    if (fwrite(&hdr, sizeof(hdr), 1, f) != 1)    { fclose(f); return false; }
+    if (fwrite(&ifc_file_size, 8, 1, f) != 1)    { fclose(f); return false; }
+
+    if (!writeVec(f, data.vertices))  { fclose(f); return false; }
+    if (!writeVec(f, data.indices))   { fclose(f); return false; }
+    if (!writeVec(f, data.meshes))    { fclose(f); return false; }
+    if (!writeVec(f, data.instances)) { fclose(f); return false; }
+    if (!writeVec(f, data.elements))  { fclose(f); return false; }
+
+    uint32_t stbl_len = static_cast<uint32_t>(data.string_table.size());
+    if (fwrite(&stbl_len, 4, 1, f) != 1) { fclose(f); return false; }
+    if (stbl_len > 0 && fwrite(data.string_table.data(), 1, stbl_len, f) != stbl_len) {
+        fclose(f); return false;
+    }
+
+    fclose(f);
     return true;
 }
 
-std::optional<SidecarData> readSidecar(const std::string& /*ifc_path*/,
-                                       uint64_t /*ifc_file_size*/) {
-    return std::nullopt;
+std::optional<SidecarData> readSidecar(const std::string& ifc_path,
+                                       uint64_t ifc_file_size) {
+    std::string path = sidecarPath(ifc_path);
+    FILE* f = fopen(path.c_str(), "rb");
+    if (!f) return std::nullopt;
+
+    auto fail = [&]() -> std::optional<SidecarData> { fclose(f); return std::nullopt; };
+
+    SidecarHeader hdr;
+    if (fread(&hdr, sizeof(hdr), 1, f) != 1) return fail();
+    if (hdr.magic  != SIDECAR_MAGIC   ||
+        hdr.version != SIDECAR_VERSION ||
+        hdr.endian != SIDECAR_ENDIAN) return fail();
+
+    uint64_t stored_size;
+    if (fread(&stored_size, 8, 1, f) != 1)  return fail();
+    if (stored_size != ifc_file_size)       return fail();
+
+    SidecarData data;
+    if (!readVec(f, data.vertices))  return fail();
+    if (!readVec(f, data.indices))   return fail();
+    if (!readVec(f, data.meshes))    return fail();
+    if (!readVec(f, data.instances)) return fail();
+    if (!readVec(f, data.elements))  return fail();
+
+    uint32_t stbl_len;
+    if (fread(&stbl_len, 4, 1, f) != 1) return fail();
+    data.string_table.resize(stbl_len);
+    if (stbl_len > 0 && fread(data.string_table.data(), 1, stbl_len, f) != stbl_len)
+        return fail();
+
+    fclose(f);
+    return data;
 }
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index e264f990e45..48558fc64f4 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -523,6 +523,103 @@ void ViewportWindow::finalizeModel(uint32_t model_id) {
            ssbo_bytes / (1024.0*1024.0));
 }
 
+bool ViewportWindow::snapshotModel(uint32_t model_id, SidecarData& out) const {
+    auto it = models_gpu_.find(model_id);
+    if (!gl_ || it == models_gpu_.end()) return false;  // GL functions missing or unknown model
+    const auto& m = it->second;
+    if (!m.finalized) return false;  // only finalized models are snapshotted
+    // NOTE(review): no context_->makeCurrent() here, unlike applyCachedModel; presumably the caller has it current - confirm.
+    // GPU readback of the packed VBO/EBO ranges actually in use.
+    if (m.vbo_used > 0) {  // when 0, out.vertices is left exactly as passed in
+        out.vertices.resize(m.vbo_used / sizeof(float));
+        gl_->glGetNamedBufferSubData(m.vbo, 0, m.vbo_used, out.vertices.data());
+    }
+    if (m.ebo_used > 0) {  // when 0, out.indices is left exactly as passed in
+        out.indices.resize(m.ebo_used / sizeof(uint32_t));
+        gl_->glGetNamedBufferSubData(m.ebo, 0, m.ebo_used, out.indices.data());
+    }
+    // Mesh/instance tables are copied from the CPU-side vectors; out.elements and out.string_table are not touched.
+    out.meshes    = m.meshes;
+    out.instances = m.instances;
+    return true;
+}
+
+void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
+    if (!gl_initialized_) return;  // silently ignored until GL has been initialised
+    context_->makeCurrent(this);  // the GL calls below need this window's context current
+
+    // Drop any existing state for this model_id.
+    auto existing = models_gpu_.find(model_id);
+    if (existing != models_gpu_.end()) {
+        if (existing->second.vao)  gl_->glDeleteVertexArrays(1, &existing->second.vao);
+        if (existing->second.vbo)  gl_->glDeleteBuffers(1, &existing->second.vbo);
+        if (existing->second.ebo)  gl_->glDeleteBuffers(1, &existing->second.ebo);
+        if (existing->second.ssbo) gl_->glDeleteBuffers(1, &existing->second.ssbo);
+        models_gpu_.erase(existing);
+    }
+
+    ModelGpuData m;
+    gl_->glCreateVertexArrays(1, &m.vao);
+    gl_->glCreateBuffers(1, &m.vbo);
+    gl_->glCreateBuffers(1, &m.ebo);
+    // Buffers are sized to the cached data exactly; the 1-byte floor keeps the storage size non-zero.
+    const size_t vb_bytes = data.vertices.size() * sizeof(float);
+    const size_t ib_bytes = data.indices.size()  * sizeof(uint32_t);
+    m.vbo_capacity = std::max<size_t>(vb_bytes, 1);
+    m.ebo_capacity = std::max<size_t>(ib_bytes, 1);
+    gl_->glNamedBufferStorage(m.vbo, m.vbo_capacity,
+                              vb_bytes ? data.vertices.data() : nullptr,
+                              GL_DYNAMIC_STORAGE_BIT);
+    gl_->glNamedBufferStorage(m.ebo, m.ebo_capacity,
+                              ib_bytes ? data.indices.data() : nullptr,
+                              GL_DYNAMIC_STORAGE_BIT);
+    setupVaoLayout(m.vao, m.vbo, m.ebo);
+    // Record usage counters and take ownership of the cached mesh/instance tables.
+    m.vbo_used = vb_bytes;
+    m.ebo_used = ib_bytes;
+    m.vertex_count = static_cast<uint32_t>(
+        data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS);
+    m.meshes = std::move(data.meshes);
+    m.instances = std::move(data.instances);
+    // Triangle total counts each mesh once per instance, using the cached per-mesh instance_count.
+    uint32_t total_tri = 0;
+    for (const auto& mesh : m.meshes) {
+        total_tri += (mesh.index_count / 3) * mesh.instance_count;
+    }
+    m.total_triangles = total_tri;
+
+    // Build and upload the instance SSBO.
+    std::vector<InstanceGpu> gpu(m.instances.size());
+    for (size_t i = 0; i < m.instances.size(); ++i) {
+        const InstanceCpu& src = m.instances[i];
+        InstanceGpu& dst = gpu[i];
+        std::memcpy(dst.transform, src.transform, sizeof(dst.transform));
+        dst.object_id = src.object_id;
+        dst.color_override_rgba8 = src.color_override_rgba8;
+        dst._pad0 = 0;
+        dst._pad1 = 0;
+    }
+    gl_->glCreateBuffers(1, &m.ssbo);
+    const size_t ssbo_bytes = gpu.size() * sizeof(InstanceGpu);
+    if (ssbo_bytes > 0) {  // with no instances the SSBO name is created but gets no storage
+        gl_->glNamedBufferStorage(m.ssbo, ssbo_bytes, gpu.data(), 0);
+    }
+    m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
+
+    m.finalized = true;
+    models_gpu_.emplace(model_id, std::move(m));
+
+    qDebug("Sidecar apply: model %u  %zu verts, %zu meshes, %zu instances  "
+           "%.1f MB vram (vbo %.1f + ebo %.1f + ssbo %.1f)",
+           model_id, data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS,  // data.vertices was not moved from
+           models_gpu_[model_id].meshes.size(),
+           models_gpu_[model_id].instances.size(),
+           (vb_bytes + ib_bytes + ssbo_bytes) / (1024.0*1024.0),
+           vb_bytes / (1024.0*1024.0),
+           ib_bytes / (1024.0*1024.0),
+           ssbo_bytes / (1024.0*1024.0));
+}
+
 void ViewportWindow::resetScene() {
     if (!gl_initialized_) return;
     context_->makeCurrent(this);
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 9fbdcf054b0..65b15412e9a 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -84,6 +84,16 @@ class ViewportWindow : public QWindow {
 
     void resetScene();
 
+    // Snapshot the finalised model into a SidecarData struct for caching.
+    // Vertices + indices are read back from the GPU; meshes/instances come
+    // from the CPU-side vectors.  Leaves `elements` and `string_table` empty
+    // for the caller to fill in.
+    bool snapshotModel(uint32_t model_id, SidecarData& out) const;
+
+    // Restore a finalised model from a cached SidecarData struct.  Replaces
+    // any existing state for model_id and marks it drawable.
+    void applyCachedModel(uint32_t model_id, SidecarData data);
+
     void hideModel(uint32_t model_id);
     void showModel(uint32_t model_id);
     void removeModel(uint32_t model_id);

From f4b3e6c515da25ffed33ffcba69357bf235819e6 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 20:23:50 +1000
Subject: [PATCH 16/37] BVH frustum culling over instances

Re-wires the BVH acceleration structure on top of the new instanced
renderer.  Per model, build a BVH over per-instance world AABBs at
finalize (and on sidecar apply).  Each frame, traverse the BVH against
the camera frustum to produce a visible-instance index list, bucket by
mesh_id, and upload to a per-model SSBO at binding=1.  The main and
pick vertex shaders do a double-indirection
`instances[visible[u_offset + gl_InstanceID]]` so draws only touch
instances that passed the frustum test.

Models with fewer than BVH_MIN_OBJECTS instances skip the BVH build
and fall back to a linear per-instance frustum test.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/BvhAccel.cpp       |   6 +
 src/ifcviewer/BvhAccel.h         |   4 +
 src/ifcviewer/ViewportWindow.cpp | 181 +++++++++++++++++++++++++++++--
 src/ifcviewer/ViewportWindow.h   |  25 +++++
 4 files changed, 204 insertions(+), 12 deletions(-)

diff --git a/src/ifcviewer/BvhAccel.cpp b/src/ifcviewer/BvhAccel.cpp
index c285f1fbfe0..4b115bfa4ce 100644
--- a/src/ifcviewer/BvhAccel.cpp
+++ b/src/ifcviewer/BvhAccel.cpp
@@ -119,6 +119,12 @@ ModelBvh buildModelBvh(const std::vector<BvhItem>& items,
 
 } // anonymous namespace
 
+ModelBvh buildModelBvhOne(const std::vector<BvhItem>& items, uint32_t model_id) {
+    std::vector<uint32_t> idxs(items.size());  // identity index list: every item takes part
+    for (uint32_t i = 0; i < items.size(); ++i) idxs[i] = i;
+    return buildModelBvh(items, idxs, model_id);  // delegate to the file-local builder
+}
+
 std::shared_ptr<BvhSet> buildBvhSet(const std::vector<BvhItem>& items) {
     auto bvh_set = std::make_shared<BvhSet>();
 
diff --git a/src/ifcviewer/BvhAccel.h b/src/ifcviewer/BvhAccel.h
index a2cb6a13163..7281dff511d 100644
--- a/src/ifcviewer/BvhAccel.h
+++ b/src/ifcviewer/BvhAccel.h
@@ -63,4 +63,8 @@ struct BvhSet {
 // vector — callers providing a single model's items will see 0..N-1.
 std::shared_ptr<BvhSet> buildBvhSet(const std::vector<BvhItem>& items);
 
+// Build a single-model BVH over `items`.  model_id is stored on the result
+// for identification; item_indices will be 0..items.size()-1.
+ModelBvh buildModelBvhOne(const std::vector<BvhItem>& items, uint32_t model_id);
+
 #endif // BVHACCEL_H
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 48558fc64f4..7011ec9d38f 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -68,6 +68,9 @@ struct InstanceRecord {
 layout(std430, binding = 0) readonly buffer Instances {
     InstanceRecord instances[];
 };
+layout(std430, binding = 1) readonly buffer VisibleIndices {
+    uint visible[];
+};
 
 uniform mat4 u_view_projection;
 uniform uint u_instance_offset;
@@ -79,7 +82,8 @@ flat out uint v_object_id;
 flat out uint v_selected;
 
 void main() {
-    InstanceRecord inst = instances[u_instance_offset + uint(gl_InstanceID)];
+    uint iid = visible[u_instance_offset + uint(gl_InstanceID)];
+    InstanceRecord inst = instances[iid];
     vec4 world = inst.transform * vec4(a_position, 1.0);
     gl_Position = u_view_projection * world;
 
@@ -139,6 +143,9 @@ struct InstanceRecord {
 layout(std430, binding = 0) readonly buffer Instances {
     InstanceRecord instances[];
 };
+layout(std430, binding = 1) readonly buffer VisibleIndices {
+    uint visible[];
+};
 
 uniform mat4 u_view_projection;
 uniform uint u_instance_offset;
@@ -146,7 +153,8 @@ uniform uint u_instance_offset;
 flat out uint v_object_id;
 
 void main() {
-    InstanceRecord inst = instances[u_instance_offset + uint(gl_InstanceID)];
+    uint iid = visible[u_instance_offset + uint(gl_InstanceID)];
+    InstanceRecord inst = instances[iid];
     gl_Position = u_view_projection * inst.transform * vec4(a_position, 1.0);
     v_object_id = inst.object_id;
 }
@@ -211,6 +219,59 @@ static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint fra
 
 // -----------------------------------------------------------------------------
 
+static bool aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
+                          const float planes[6][4]) {  // false once the box is fully behind any one plane
+    for (int p = 0; p < 6; ++p) {
+        float px = planes[p][0] >= 0.0f ? aabb_max[0] : aabb_min[0];  // per axis, take the box corner
+        float py = planes[p][1] >= 0.0f ? aabb_max[1] : aabb_min[1];  // that lies farthest along the
+        float pz = planes[p][2] >= 0.0f ? aabb_max[2] : aabb_min[2];  // plane normal (a, b, c)
+        float dist = planes[p][0] * px + planes[p][1] * py + planes[p][2] * pz + planes[p][3];
+        if (dist < 0.0f) return false;  // even that corner is on the negative side
+    }
+    return true;  // boxes straddling a plane count as inside
+}
+
+static void extractFrustumPlanes(const QMatrix4x4& vp, float planes[6][4]) {  // planes[p] = (a, b, c, d)
+    for (int i = 0; i < 4; ++i) {  // vp(row, column): each plane is row 3 plus/minus another row
+        planes[0][i] = vp(3, i) + vp(0, i);  // clip x >= -w
+        planes[1][i] = vp(3, i) - vp(0, i);  // clip x <=  w
+        planes[2][i] = vp(3, i) + vp(1, i);  // clip y >= -w
+        planes[3][i] = vp(3, i) - vp(1, i);  // clip y <=  w
+        planes[4][i] = vp(3, i) + vp(2, i);  // clip z >= -w
+        planes[5][i] = vp(3, i) - vp(2, i);  // clip z <=  w
+    }
+    for (int p = 0; p < 6; ++p) {  // scale each plane so its normal (a, b, c) has unit length
+        float len = std::sqrt(planes[p][0]*planes[p][0] +
+                              planes[p][1]*planes[p][1] +
+                              planes[p][2]*planes[p][2]);
+        if (len > 0.0f) {  // a zero-length normal is left unscaled
+            float inv = 1.0f / len;
+            planes[p][0] *= inv; planes[p][1] *= inv;
+            planes[p][2] *= inv; planes[p][3] *= inv;
+        }
+    }
+}
+
+// Rebuild m.bvh_items (one per instance, same order as m.instances) and m.bvh.
+// Models with fewer than BVH_MIN_OBJECTS instances get an empty BVH, which
+// cullAndUploadVisible() handles with a linear per-instance frustum test.
+static void buildBvhForModel(ModelGpuData& m, uint32_t model_id) {
+    m.bvh_items.clear();
+    m.bvh_items.reserve(m.instances.size());
+    for (const auto& inst : m.instances) {
+        BvhItem it;
+        std::memcpy(it.aabb_min, inst.world_aabb_min, sizeof(it.aabb_min));
+        std::memcpy(it.aabb_max, inst.world_aabb_max, sizeof(it.aabb_max));
+        it.model_id = inst.model_id;  // taken from the instance, not from the model_id parameter
+        m.bvh_items.push_back(it);
+    }
+    if (m.bvh_items.size() >= BVH_MIN_OBJECTS) {
+        m.bvh = buildModelBvhOne(m.bvh_items, model_id);
+    } else {
+        m.bvh = ModelBvh{};  // discard any previously built tree
+    }
+}
+
 ViewportWindow::ViewportWindow(QWindow* parent)
     : QWindow(parent)
 {
@@ -239,6 +300,7 @@ ViewportWindow::~ViewportWindow() {
                 if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
                 if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
                 if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+                if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
             }
             if (axis_vao_)      gl_->glDeleteVertexArrays(1, &axis_vao_);
             if (axis_vbo_)      gl_->glDeleteBuffers(1, &axis_vbo_);
@@ -512,6 +574,8 @@ void ViewportWindow::finalizeModel(uint32_t model_id) {
     gl_->glNamedBufferStorage(m.ssbo, ssbo_bytes, gpu.data(), 0);
     m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
 
+    buildBvhForModel(m, model_id);
+
     m.finalized = true;
 
     qDebug("Model %u finalized: %zu verts, %zu meshes, %zu instances, %.1f MB vram "
@@ -555,6 +619,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
         if (existing->second.vbo)  gl_->glDeleteBuffers(1, &existing->second.vbo);
         if (existing->second.ebo)  gl_->glDeleteBuffers(1, &existing->second.ebo);
         if (existing->second.ssbo) gl_->glDeleteBuffers(1, &existing->second.ssbo);
+        if (existing->second.visible_ssbo) gl_->glDeleteBuffers(1, &existing->second.visible_ssbo);
         models_gpu_.erase(existing);
     }
 
@@ -606,6 +671,8 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
     }
     m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
 
+    buildBvhForModel(m, model_id);
+
     m.finalized = true;
     models_gpu_.emplace(model_id, std::move(m));
 
@@ -628,6 +695,7 @@ void ViewportWindow::resetScene() {
         if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
         if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
         if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+        if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
     }
     models_gpu_.clear();
     selected_object_id_ = 0;
@@ -652,6 +720,7 @@ void ViewportWindow::removeModel(uint32_t model_id) {
         if (it->second.vbo)  gl_->glDeleteBuffers(1, &it->second.vbo);
         if (it->second.ebo)  gl_->glDeleteBuffers(1, &it->second.ebo);
         if (it->second.ssbo) gl_->glDeleteBuffers(1, &it->second.ssbo);
+        if (it->second.visible_ssbo) gl_->glDeleteBuffers(1, &it->second.visible_ssbo);
         models_gpu_.erase(it);
     }
 }
@@ -689,6 +758,74 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     return pixel;
 }
 
+void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]) {  // result also left in visible_flat_
+    // Ensure per-mesh scratch sized.
+    if (visible_by_mesh_.size() < m.meshes.size()) visible_by_mesh_.resize(m.meshes.size());
+    for (size_t i = 0; i < m.meshes.size(); ++i) visible_by_mesh_[i].clear();
+    // Frustum-test one instance and, if it passes, append its index to its mesh's bucket.
+    auto test_and_push = [&](uint32_t inst_idx) {
+        const InstanceCpu& inst = m.instances[inst_idx];
+        if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
+        if (inst.mesh_id < m.meshes.size())  // bound by this model's meshes: the scratch is shared and may be larger
+            visible_by_mesh_[inst.mesh_id].push_back(inst_idx);
+    };
+    // With a BVH: iterative depth-first traversal that skips any subtree whose box fails the test.
+    if (!m.bvh.nodes.empty()) {
+        uint32_t stack[64];
+        int sp = 0;
+        stack[sp++] = 0;
+        while (sp > 0) {
+            uint32_t ni = stack[--sp];
+            const BvhNode& n = m.bvh.nodes[ni];
+            if (!aabbInFrustum(n.aabb_min, n.aabb_max, planes)) continue;
+            if (n.count > 0) {  // leaf: n.count entries of item_indices starting at right_or_first
+                for (uint32_t k = 0; k < n.count; ++k) {
+                    uint32_t item_idx = m.bvh.item_indices[n.right_or_first + k];
+                    test_and_push(item_idx);
+                }
+            } else {
+                // Left child = ni + 1, right child = n.right_or_first.
+                // Push right first so left is popped next (DFS order).
+                if (sp + 2 <= 64) {  // NOTE(review): if the stack were full, this subtree would be silently culled
+                    stack[sp++] = n.right_or_first;
+                    stack[sp++] = ni + 1;
+                }
+            }
+        }
+    } else {
+        for (uint32_t i = 0; i < m.instances.size(); ++i) test_and_push(i);
+    }
+
+    // Flatten into visible_flat_ and record per-mesh ranges.
+    visible_flat_.clear();
+    m.mesh_vis_first.assign(m.meshes.size(), 0);
+    m.mesh_vis_count.assign(m.meshes.size(), 0);
+    for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
+        m.mesh_vis_first[mi] = static_cast<uint32_t>(visible_flat_.size());
+        m.mesh_vis_count[mi] = static_cast<uint32_t>(visible_by_mesh_[mi].size());
+        visible_flat_.insert(visible_flat_.end(),
+                             visible_by_mesh_[mi].begin(),
+                             visible_by_mesh_[mi].end());
+    }
+
+    // Grow/create visible SSBO as needed. Keep at least 4 bytes so the binding
+    // is always valid even when nothing is visible.
+    size_t bytes = std::max<size_t>(visible_flat_.size() * sizeof(uint32_t),
+                                    sizeof(uint32_t));
+    if (m.visible_ssbo == 0 || m.visible_ssbo_capacity < bytes) {
+        if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
+        size_t new_cap = m.visible_ssbo_capacity ? m.visible_ssbo_capacity : 4096;  // first allocation starts at 4096 bytes
+        while (new_cap < bytes) new_cap *= 2;
+        gl_->glCreateBuffers(1, &m.visible_ssbo);
+        gl_->glNamedBufferStorage(m.visible_ssbo, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);  // old contents are not carried over
+        m.visible_ssbo_capacity = new_cap;
+    }
+    if (!visible_flat_.empty()) {
+        gl_->glNamedBufferSubData(m.visible_ssbo, 0,
+            visible_flat_.size() * sizeof(uint32_t), visible_flat_.data());
+    }
+}
+
 void ViewportWindow::updateCamera() {
     float yaw_rad = qDegreesToRadians(camera_yaw_);
     float pitch_rad = qDegreesToRadians(camera_pitch_);
@@ -715,6 +852,8 @@ void ViewportWindow::render() {
     gl_->glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
     QMatrix4x4 vp = proj_matrix_ * view_matrix_;
+    float planes[6][4];
+    extractFrustumPlanes(vp, planes);
 
     gl_->glUseProgram(main_program_);
     GLint u_vp        = gl_->glGetUniformLocation(main_program_, "u_view_projection");
@@ -731,21 +870,28 @@ void ViewportWindow::render() {
 
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.finalized || !m.ssbo) continue;
+
+        cullAndUploadVisible(m, planes);
+        if (visible_flat_.empty()) continue;
+
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
 
-        for (const auto& mesh : m.meshes) {
-            if (mesh.instance_count == 0 || mesh.index_count == 0) continue;
-            gl_->glUniform1ui(u_inst_off, mesh.first_instance);
+        for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
+            const auto& mesh = m.meshes[mi];
+            uint32_t vis_count = m.mesh_vis_count[mi];
+            if (vis_count == 0 || mesh.index_count == 0) continue;
+            gl_->glUniform1ui(u_inst_off, m.mesh_vis_first[mi]);
             gl_->glDrawElementsInstancedBaseVertex(
                 GL_TRIANGLES,
                 static_cast<GLsizei>(mesh.index_count),
                 GL_UNSIGNED_INT,
                 reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
-                static_cast<GLsizei>(mesh.instance_count),
+                static_cast<GLsizei>(vis_count),
                 static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
-            visible_triangles_ += (mesh.index_count / 3) * mesh.instance_count;
-            visible_objects_   += mesh.instance_count;
+            visible_triangles_ += (mesh.index_count / 3) * vis_count;
+            visible_objects_   += vis_count;
             ++instanced_draws_;
         }
     }
@@ -810,6 +956,9 @@ void ViewportWindow::renderPickPass() {
     gl_->glClear(GL_DEPTH_BUFFER_BIT);
 
     QMatrix4x4 vp = proj_matrix_ * view_matrix_;
+    float planes[6][4];
+    extractFrustumPlanes(vp, planes);
+
     gl_->glUseProgram(pick_program_);
     GLint u_vp       = gl_->glGetUniformLocation(pick_program_, "u_view_projection");
     GLint u_inst_off = gl_->glGetUniformLocation(pick_program_, "u_instance_offset");
@@ -817,17 +966,25 @@ void ViewportWindow::renderPickPass() {
 
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.finalized || !m.ssbo) continue;
+
+        cullAndUploadVisible(m, planes);
+        if (visible_flat_.empty()) continue;
+
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
-        for (const auto& mesh : m.meshes) {
-            if (mesh.instance_count == 0 || mesh.index_count == 0) continue;
-            gl_->glUniform1ui(u_inst_off, mesh.first_instance);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
+
+        for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
+            const auto& mesh = m.meshes[mi];
+            uint32_t vis_count = m.mesh_vis_count[mi];
+            if (vis_count == 0 || mesh.index_count == 0) continue;
+            gl_->glUniform1ui(u_inst_off, m.mesh_vis_first[mi]);
             gl_->glDrawElementsInstancedBaseVertex(
                 GL_TRIANGLES,
                 static_cast<GLsizei>(mesh.index_count),
                 GL_UNSIGNED_INT,
                 reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
-                static_cast<GLsizei>(mesh.instance_count),
+                static_cast<GLsizei>(vis_count),
                 static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
         }
     }
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 65b15412e9a..5a086fd774f 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -34,6 +34,7 @@
 #include <mutex>
 #include <memory>
 
+#include "BvhAccel.h"
 #include "InstancedGeometry.h"
 #include "SidecarCache.h"
 
@@ -63,6 +64,20 @@ struct ModelGpuData {
     std::vector<InstanceCpu> instances;    // unsorted until finalize
     uint32_t                 ssbo_instance_count = 0;
 
+    // Per-instance world AABBs + BVH (built at finalize or on sidecar apply).
+    // bvh_items is in the same order as `instances`: bvh_items[i] <-> instances[i].
+    std::vector<BvhItem> bvh_items;
+    ModelBvh             bvh;
+
+    // Dynamic visible-instance index buffer (std430, binding = 1).
+    // Re-uploaded on every cull from ViewportWindow::visible_flat_.
+    GLuint  visible_ssbo = 0;
+    size_t  visible_ssbo_capacity = 0;  // bytes
+
+    // Per-mesh visible-list offset/count, rebuilt each frame.
+    std::vector<uint32_t> mesh_vis_first;
+    std::vector<uint32_t> mesh_vis_count;
+
     bool finalized = false;
     bool hidden    = false;
 };
@@ -135,6 +150,10 @@ class ViewportWindow : public QWindow {
     bool growModelEbo(ModelGpuData& m, size_t needed_total);
     ModelGpuData& getOrCreateModel(uint32_t model_id);
 
+    // Populate m.mesh_vis_first / mesh_vis_count and upload visible indices
+    // to m.visible_ssbo.  Uses BVH when available, else linear scan.
+    void cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]);
+
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
     void handleMouseRelease(QMouseEvent* event);
@@ -171,6 +190,12 @@ class ViewportWindow : public QWindow {
     uint32_t visible_objects_ = 0;
     uint32_t instanced_draws_ = 0;
 
+    // Reused scratch: visible-instance index lists per mesh, flattened into
+    // `visible_flat_` for upload.  Both live in the parent object to avoid
+    // per-frame allocation.
+    std::vector<std::vector<uint32_t>> visible_by_mesh_;
+    std::vector<uint32_t>              visible_flat_;
+
     // Camera
     QVector3D camera_target_{0, 0, 0};
     float camera_distance_ = 50.0f;

From e2d51300c83d086e33eabed8a188de3ef581ca91 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 20:29:16 +1000
Subject: [PATCH 17/37] Progressive rendering during streaming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-allocate the instance SSBO on model creation (4 MB, grow-on-demand)
and append each arriving InstanceChunk directly to the GPU-side
InstanceGpu array in uploadInstanceChunk.  This makes a model drawable
as soon as its first mesh + first instance chunk land, rather than
waiting for finalizeModel.

The visible-list architecture already decouples SSBO order from the
draw path, so appending in insertion order is correct — no sorting
required.  finalizeModel collapses to:
  - compute per-mesh instance counts (for stats + sidecar round-trip)
  - build the per-model BVH over instance world AABBs

Render / pick loops now gate on ssbo_instance_count > 0 rather than
the finalized flag.  Stats include in-progress models in totals
(excluding only hidden).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 118 +++++++++++++++++--------------
 src/ifcviewer/ViewportWindow.h   |   2 +
 2 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 7011ec9d38f..b70e2bf8324 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -30,9 +30,10 @@
 #include <algorithm>
 #include <limits>
 
-static const size_t INITIAL_VBO_SIZE = 64 * 1024 * 1024;   // 64 MB
-static const size_t INITIAL_EBO_SIZE = 32 * 1024 * 1024;   // 32 MB
-static const size_t MAX_BUFFER_SIZE  = 4ull * 1024 * 1024 * 1024;  // 4 GB
+static const size_t INITIAL_VBO_SIZE  = 64 * 1024 * 1024;   // 64 MB
+static const size_t INITIAL_EBO_SIZE  = 32 * 1024 * 1024;   // 32 MB
+static const size_t INITIAL_SSBO_SIZE = 4  * 1024 * 1024;   // 4 MB (~52k instances)
+static const size_t MAX_BUFFER_SIZE   = 4ull * 1024 * 1024 * 1024;  // 4 GB
 
 // -----------------------------------------------------------------------------
 // Shaders
@@ -420,6 +421,27 @@ bool ViewportWindow::growModelVbo(ModelGpuData& m, size_t needed_total) {
     return true;
 }
 
+bool ViewportWindow::growModelSsbo(ModelGpuData& m, size_t needed_total) {  // false, with m untouched, if over the cap
+    size_t new_capacity = m.ssbo_capacity ? m.ssbo_capacity : INITIAL_SSBO_SIZE;  // a zero capacity starts at the initial size
+    while (new_capacity < needed_total) new_capacity *= 2;  // double until the request fits
+    if (new_capacity > MAX_BUFFER_SIZE) {  // the cap is checked against the rounded-up capacity, not needed_total
+        qWarning("Instance SSBO grow request (%zu MB) exceeds cap", new_capacity / (1024*1024));
+        return false;
+    }
+    GLuint new_ssbo = 0;
+    gl_->glCreateBuffers(1, &new_ssbo);
+    gl_->glNamedBufferStorage(new_ssbo, new_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);  // no initial data
+    const size_t used = m.ssbo_instance_count * sizeof(InstanceGpu);  // bytes of records already uploaded
+    if (m.ssbo && used > 0) {
+        gl_->glCopyNamedBufferSubData(m.ssbo, new_ssbo, 0, 0, used);  // carry existing records over
+    }
+    if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+    m.ssbo = new_ssbo;
+    m.ssbo_capacity = new_capacity;
+    qInfo("Model instance SSBO grew to %zu MB", m.ssbo_capacity / (1024*1024));
+    return true;
+}
+
 bool ViewportWindow::growModelEbo(ModelGpuData& m, size_t needed_total) {
     size_t new_capacity = m.ebo_capacity;
     while (new_capacity < needed_total) new_capacity *= 2;
@@ -456,6 +478,11 @@ ModelGpuData& ViewportWindow::getOrCreateModel(uint32_t model_id) {
     gl_->glNamedBufferStorage(m.ebo, m.ebo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
     setupVaoLayout(m.vao, m.vbo, m.ebo);
 
+    // Pre-allocate instance SSBO so we can append during streaming.
+    gl_->glCreateBuffers(1, &m.ssbo);
+    m.ssbo_capacity = INITIAL_SSBO_SIZE;
+    gl_->glNamedBufferStorage(m.ssbo, m.ssbo_capacity, nullptr, GL_DYNAMIC_STORAGE_BIT);
+
     return models_gpu_.emplace(model_id, std::move(m)).first->second;
 }
 
@@ -501,8 +528,8 @@ void ViewportWindow::uploadMeshChunk(const MeshChunk& chunk) {
 
 void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     if (!gl_initialized_) return;
-    // We don't need a GL context here since we're only touching CPU state,
-    // but the signal may fire on the render thread so keep it simple.
+    context_->makeCurrent(this);
+
     ModelGpuData& m = getOrCreateModel(chunk.model_id);
 
     InstanceCpu inst;
@@ -515,6 +542,23 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     std::memcpy(inst.world_aabb_max, chunk.world_aabb_max, sizeof(inst.world_aabb_max));
     m.instances.push_back(inst);
 
+    // Append the GPU record to the instance SSBO so the model is drawable
+    // immediately, without waiting for finalizeModel.  The visible-list
+    // architecture means SSBO order is irrelevant to correctness.
+    InstanceGpu gpu;
+    std::memcpy(gpu.transform, inst.transform, sizeof(gpu.transform));
+    gpu.object_id = inst.object_id;
+    gpu.color_override_rgba8 = inst.color_override_rgba8;
+    gpu._pad0 = 0;
+    gpu._pad1 = 0;
+
+    const size_t offset = m.ssbo_instance_count * sizeof(InstanceGpu);
+    if (offset + sizeof(InstanceGpu) > m.ssbo_capacity) {
+        if (!growModelSsbo(m, offset + sizeof(InstanceGpu))) return;
+    }
+    gl_->glNamedBufferSubData(m.ssbo, offset, sizeof(InstanceGpu), &gpu);
+    m.ssbo_instance_count++;
+
     if (chunk.local_mesh_id < m.meshes.size()) {
         m.total_triangles += m.meshes[chunk.local_mesh_id].index_count / 3;
     }
@@ -527,64 +571,30 @@ void ViewportWindow::finalizeModel(uint32_t model_id) {
     auto it = models_gpu_.find(model_id);
     if (it == models_gpu_.end()) return;
     ModelGpuData& m = it->second;
-    if (m.instances.empty()) { m.finalized = true; return; }
 
-    // Sort instances by mesh_id (stable for deterministic ordering).
-    std::stable_sort(m.instances.begin(), m.instances.end(),
-        [](const InstanceCpu& a, const InstanceCpu& b) {
-            return a.mesh_id < b.mesh_id;
-        });
-
-    // Assign per-mesh contiguous range.
+    // Instance SSBO has been populated incrementally during streaming, so
+    // we don't re-upload here.  What finalize still does:
+    //   (1) compute per-mesh instance counts — used by stats and the sidecar
+    //       round-trip (first_instance is unused by the visible-list renderer),
+    //   (2) build the per-model BVH over instance world AABBs.
     for (auto& mesh : m.meshes) { mesh.first_instance = 0; mesh.instance_count = 0; }
-    uint32_t current = UINT32_MAX;
-    uint32_t run_start = 0;
-    for (uint32_t i = 0; i < m.instances.size(); ++i) {
-        uint32_t mid = m.instances[i].mesh_id;
-        if (mid != current) {
-            if (current != UINT32_MAX && current < m.meshes.size()) {
-                m.meshes[current].first_instance = run_start;
-                m.meshes[current].instance_count = i - run_start;
-            }
-            current = mid;
-            run_start = i;
-        }
-    }
-    if (current != UINT32_MAX && current < m.meshes.size()) {
-        m.meshes[current].first_instance = run_start;
-        m.meshes[current].instance_count = static_cast<uint32_t>(m.instances.size()) - run_start;
-    }
-
-    // Build GPU-layout array.
-    std::vector<InstanceGpu> gpu(m.instances.size());
-    for (size_t i = 0; i < m.instances.size(); ++i) {
-        const InstanceCpu& src = m.instances[i];
-        InstanceGpu& dst = gpu[i];
-        std::memcpy(dst.transform, src.transform, sizeof(dst.transform));
-        dst.object_id = src.object_id;
-        dst.color_override_rgba8 = src.color_override_rgba8;
-        dst._pad0 = 0;
-        dst._pad1 = 0;
+    for (const auto& inst : m.instances) {
+        if (inst.mesh_id < m.meshes.size()) ++m.meshes[inst.mesh_id].instance_count;
     }
 
-    // Allocate and upload SSBO.
-    if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
-    gl_->glCreateBuffers(1, &m.ssbo);
-    const size_t ssbo_bytes = gpu.size() * sizeof(InstanceGpu);
-    gl_->glNamedBufferStorage(m.ssbo, ssbo_bytes, gpu.data(), 0);
-    m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
-
     buildBvhForModel(m, model_id);
 
     m.finalized = true;
 
+    const size_t ssbo_bytes = m.ssbo_instance_count * sizeof(InstanceGpu);
     qDebug("Model %u finalized: %zu verts, %zu meshes, %zu instances, %.1f MB vram "
-           "(vbo %.1f + ebo %.1f + ssbo %.1f)",
+           "(vbo %.1f + ebo %.1f + ssbo-used %.1f / %.1f cap)",
            model_id, size_t(m.vertex_count), m.meshes.size(), m.instances.size(),
-           (m.vbo_capacity + m.ebo_capacity + ssbo_bytes) / (1024.0*1024.0),
+           (m.vbo_capacity + m.ebo_capacity + m.ssbo_capacity) / (1024.0*1024.0),
            m.vbo_capacity / (1024.0*1024.0),
            m.ebo_capacity / (1024.0*1024.0),
-           ssbo_bytes / (1024.0*1024.0));
+           ssbo_bytes / (1024.0*1024.0),
+           m.ssbo_capacity / (1024.0*1024.0));
 }
 
 bool ViewportWindow::snapshotModel(uint32_t model_id, SidecarData& out) const {
@@ -869,7 +879,7 @@ void ViewportWindow::render() {
     instanced_draws_ = 0;
 
     for (auto& [model_id, m] : models_gpu_) {
-        if (m.hidden || !m.finalized || !m.ssbo) continue;
+        if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
         cullAndUploadVisible(m, planes);
         if (visible_flat_.empty()) continue;
@@ -913,7 +923,7 @@ void ViewportWindow::render() {
         size_t num_models = 0, num_hidden = 0;
         for (const auto& [mid, mm] : models_gpu_) {
             num_models++;
-            if (mm.hidden || !mm.finalized) { num_hidden++; continue; }
+            if (mm.hidden) { num_hidden++; continue; }
             total_obj += static_cast<uint32_t>(mm.instances.size());
             total_tri += mm.total_triangles;
             total_meshes += static_cast<uint32_t>(mm.meshes.size());
@@ -965,7 +975,7 @@ void ViewportWindow::renderPickPass() {
     gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
 
     for (auto& [model_id, m] : models_gpu_) {
-        if (m.hidden || !m.finalized || !m.ssbo) continue;
+        if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
         cullAndUploadVisible(m, planes);
         if (visible_flat_.empty()) continue;
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 5a086fd774f..fd21bb76416 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -55,6 +55,7 @@ struct ModelGpuData {
 
     size_t vbo_capacity = 0;
     size_t ebo_capacity = 0;
+    size_t ssbo_capacity = 0;       // bytes
     size_t vbo_used = 0;
     size_t ebo_used = 0;
     uint32_t vertex_count = 0;      // total (across all meshes)
@@ -148,6 +149,7 @@ class ViewportWindow : public QWindow {
     void setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo);
     bool growModelVbo(ModelGpuData& m, size_t needed_total);
     bool growModelEbo(ModelGpuData& m, size_t needed_total);
+    bool growModelSsbo(ModelGpuData& m, size_t needed_total);
     ModelGpuData& getOrCreateModel(uint32_t model_id);
 
     // Populate m.mesh_vis_first / mesh_vis_count and upload visible indices

From 99ca61b133624d63c2f81432e0ed60f76706fcdf Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 20:34:47 +1000
Subject: [PATCH 18/37] Collapse per-mesh draws into
 glMultiDrawElementsIndirect

Each visible model now issues a single glMultiDrawElementsIndirect
call instead of one glDrawElementsInstancedBaseVertex per mesh.  The
CPU BVH cull populates an array of DrawElementsIndirectCommand
records plus the flat visible-instance list, uploads both, and draws
the whole model in one GL call.

Vertex shaders switch from a uniform u_instance_offset to
gl_BaseInstanceARB (ARB_shader_draw_parameters), so per-draw offset
comes from the indirect command's baseInstance field.

Draw-call counts for BIM scenes with hundreds of unique meshes drop
from hundreds-per-frame to one-per-model, cutting driver overhead.
This also sets up the plumbing for the follow-up compute-shader cull
that will populate the indirect buffer entirely on-GPU.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 112 +++++++++++++++++--------------
 src/ifcviewer/ViewportWindow.h   |  33 ++++++---
 2 files changed, 87 insertions(+), 58 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index b70e2bf8324..b24ff7e3b3d 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -35,6 +35,8 @@ static const size_t INITIAL_EBO_SIZE  = 32 * 1024 * 1024;   // 32 MB
 static const size_t INITIAL_SSBO_SIZE = 4  * 1024 * 1024;   // 4 MB (~52k instances)
 static const size_t MAX_BUFFER_SIZE   = 4ull * 1024 * 1024 * 1024;  // 4 GB
 
+static_assert(sizeof(DrawElementsIndirectCommand) == 20, "indirect cmd must be 20 bytes");
+
 // -----------------------------------------------------------------------------
 // Shaders
 // -----------------------------------------------------------------------------
@@ -55,6 +57,7 @@ static const size_t MAX_BUFFER_SIZE   = 4ull * 1024 * 1024 * 1024;  // 4 GB
 
 static const char* MAIN_VERTEX_SHADER = R"(
 #version 450 core
+#extension GL_ARB_shader_draw_parameters : require
 layout(location = 0) in vec3 a_position;
 layout(location = 1) in vec3 a_normal;
 layout(location = 2) in vec4 a_color;
@@ -74,7 +77,6 @@ layout(std430, binding = 1) readonly buffer VisibleIndices {
 };
 
 uniform mat4 u_view_projection;
-uniform uint u_instance_offset;
 uniform uint u_selected_id;
 
 out vec3 v_normal;
@@ -83,7 +85,8 @@ flat out uint v_object_id;
 flat out uint v_selected;
 
 void main() {
-    uint iid = visible[u_instance_offset + uint(gl_InstanceID)];
+    uint slot = uint(gl_BaseInstanceARB) + uint(gl_InstanceID);
+    uint iid = visible[slot];
     InstanceRecord inst = instances[iid];
     vec4 world = inst.transform * vec4(a_position, 1.0);
     gl_Position = u_view_projection * world;
@@ -132,6 +135,7 @@ void main() {
 
 static const char* PICK_VERTEX_SHADER = R"(
 #version 450 core
+#extension GL_ARB_shader_draw_parameters : require
 layout(location = 0) in vec3 a_position;
 
 struct InstanceRecord {
@@ -149,12 +153,12 @@ layout(std430, binding = 1) readonly buffer VisibleIndices {
 };
 
 uniform mat4 u_view_projection;
-uniform uint u_instance_offset;
 
 flat out uint v_object_id;
 
 void main() {
-    uint iid = visible[u_instance_offset + uint(gl_InstanceID)];
+    uint slot = uint(gl_BaseInstanceARB) + uint(gl_InstanceID);
+    uint iid = visible[slot];
     InstanceRecord inst = instances[iid];
     gl_Position = u_view_projection * inst.transform * vec4(a_position, 1.0);
     v_object_id = inst.object_id;
@@ -302,6 +306,7 @@ ViewportWindow::~ViewportWindow() {
                 if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
                 if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
                 if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
+                if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
             }
             if (axis_vao_)      gl_->glDeleteVertexArrays(1, &axis_vao_);
             if (axis_vbo_)      gl_->glDeleteBuffers(1, &axis_vbo_);
@@ -630,6 +635,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
         if (existing->second.ebo)  gl_->glDeleteBuffers(1, &existing->second.ebo);
         if (existing->second.ssbo) gl_->glDeleteBuffers(1, &existing->second.ssbo);
         if (existing->second.visible_ssbo) gl_->glDeleteBuffers(1, &existing->second.visible_ssbo);
+        if (existing->second.indirect_buffer) gl_->glDeleteBuffers(1, &existing->second.indirect_buffer);
         models_gpu_.erase(existing);
     }
 
@@ -706,6 +712,7 @@ void ViewportWindow::resetScene() {
         if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
         if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
         if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
+        if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
     }
     models_gpu_.clear();
     selected_object_id_ = 0;
@@ -731,6 +738,7 @@ void ViewportWindow::removeModel(uint32_t model_id) {
         if (it->second.ebo)  gl_->glDeleteBuffers(1, &it->second.ebo);
         if (it->second.ssbo) gl_->glDeleteBuffers(1, &it->second.ssbo);
         if (it->second.visible_ssbo) gl_->glDeleteBuffers(1, &it->second.visible_ssbo);
+        if (it->second.indirect_buffer) gl_->glDeleteBuffers(1, &it->second.indirect_buffer);
         models_gpu_.erase(it);
     }
 }
@@ -806,26 +814,36 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         for (uint32_t i = 0; i < m.instances.size(); ++i) test_and_push(i);
     }
 
-    // Flatten into visible_flat_ and record per-mesh ranges.
+    // Flatten into visible_flat_ and build one DrawElementsIndirectCommand
+    // per non-empty mesh.
     visible_flat_.clear();
-    m.mesh_vis_first.assign(m.meshes.size(), 0);
-    m.mesh_vis_count.assign(m.meshes.size(), 0);
+    indirect_scratch_.clear();
     for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
-        m.mesh_vis_first[mi] = static_cast<uint32_t>(visible_flat_.size());
-        m.mesh_vis_count[mi] = static_cast<uint32_t>(visible_by_mesh_[mi].size());
+        const auto& mesh = m.meshes[mi];
+        const uint32_t vis_count = static_cast<uint32_t>(visible_by_mesh_[mi].size());
+        if (vis_count == 0 || mesh.index_count == 0) continue;
+
+        DrawElementsIndirectCommand cmd;
+        cmd.count         = mesh.index_count;
+        cmd.instanceCount = vis_count;
+        cmd.firstIndex    = mesh.ebo_byte_offset / sizeof(uint32_t);
+        cmd.baseVertex    = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
+        cmd.baseInstance  = static_cast<uint32_t>(visible_flat_.size());
+        indirect_scratch_.push_back(cmd);
+
         visible_flat_.insert(visible_flat_.end(),
                              visible_by_mesh_[mi].begin(),
                              visible_by_mesh_[mi].end());
     }
+    m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
 
-    // Grow/create visible SSBO as needed. Keep at least 4 bytes so the binding
-    // is always valid even when nothing is visible.
-    size_t bytes = std::max<size_t>(visible_flat_.size() * sizeof(uint32_t),
-                                    sizeof(uint32_t));
-    if (m.visible_ssbo == 0 || m.visible_ssbo_capacity < bytes) {
+    // Upload visible list (keep binding alive even when empty).
+    size_t vis_bytes = std::max<size_t>(visible_flat_.size() * sizeof(uint32_t),
+                                        sizeof(uint32_t));
+    if (m.visible_ssbo == 0 || m.visible_ssbo_capacity < vis_bytes) {
         if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
         size_t new_cap = m.visible_ssbo_capacity ? m.visible_ssbo_capacity : 4096;
-        while (new_cap < bytes) new_cap *= 2;
+        while (new_cap < vis_bytes) new_cap *= 2;
         gl_->glCreateBuffers(1, &m.visible_ssbo);
         gl_->glNamedBufferStorage(m.visible_ssbo, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);
         m.visible_ssbo_capacity = new_cap;
@@ -834,6 +852,19 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         gl_->glNamedBufferSubData(m.visible_ssbo, 0,
             visible_flat_.size() * sizeof(uint32_t), visible_flat_.data());
     }
+
+    // Upload indirect command buffer.
+    size_t ind_bytes = indirect_scratch_.size() * sizeof(DrawElementsIndirectCommand);
+    if (ind_bytes == 0) return;
+    if (m.indirect_buffer == 0 || m.indirect_capacity < ind_bytes) {
+        if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
+        size_t new_cap = m.indirect_capacity ? m.indirect_capacity : 4096;
+        while (new_cap < ind_bytes) new_cap *= 2;
+        gl_->glCreateBuffers(1, &m.indirect_buffer);
+        gl_->glNamedBufferStorage(m.indirect_buffer, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);
+        m.indirect_capacity = new_cap;
+    }
+    gl_->glNamedBufferSubData(m.indirect_buffer, 0, ind_bytes, indirect_scratch_.data());
 }
 
 void ViewportWindow::updateCamera() {
@@ -869,7 +900,6 @@ void ViewportWindow::render() {
     GLint u_vp        = gl_->glGetUniformLocation(main_program_, "u_view_projection");
     GLint u_light     = gl_->glGetUniformLocation(main_program_, "u_light_dir");
     GLint u_sel       = gl_->glGetUniformLocation(main_program_, "u_selected_id");
-    GLint u_inst_off  = gl_->glGetUniformLocation(main_program_, "u_instance_offset");
     gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
     gl_->glUniform3f(u_light, 0.3f, 0.5f, 0.8f);
     gl_->glUniform1ui(u_sel, selected_object_id_);
@@ -882,29 +912,23 @@ void ViewportWindow::render() {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
         cullAndUploadVisible(m, planes);
-        if (visible_flat_.empty()) continue;
+        if (m.indirect_command_count == 0) continue;
 
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
-
-        for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
-            const auto& mesh = m.meshes[mi];
-            uint32_t vis_count = m.mesh_vis_count[mi];
-            if (vis_count == 0 || mesh.index_count == 0) continue;
-            gl_->glUniform1ui(u_inst_off, m.mesh_vis_first[mi]);
-            gl_->glDrawElementsInstancedBaseVertex(
-                GL_TRIANGLES,
-                static_cast<GLsizei>(mesh.index_count),
-                GL_UNSIGNED_INT,
-                reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
-                static_cast<GLsizei>(vis_count),
-                static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
-            visible_triangles_ += (mesh.index_count / 3) * vis_count;
-            visible_objects_   += vis_count;
-            ++instanced_draws_;
+        gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
+        gl_->glMultiDrawElementsIndirect(
+            GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
+            static_cast<GLsizei>(m.indirect_command_count), 0);
+
+        for (const auto& cmd : indirect_scratch_) {
+            visible_triangles_ += (cmd.count / 3) * cmd.instanceCount;
+            visible_objects_   += cmd.instanceCount;
         }
+        instanced_draws_ += m.indirect_command_count;
     }
+    gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
 
     renderAxisGizmo();
 
@@ -971,33 +995,23 @@ void ViewportWindow::renderPickPass() {
 
     gl_->glUseProgram(pick_program_);
     GLint u_vp       = gl_->glGetUniformLocation(pick_program_, "u_view_projection");
-    GLint u_inst_off = gl_->glGetUniformLocation(pick_program_, "u_instance_offset");
     gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
 
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
         cullAndUploadVisible(m, planes);
-        if (visible_flat_.empty()) continue;
+        if (m.indirect_command_count == 0) continue;
 
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
-
-        for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
-            const auto& mesh = m.meshes[mi];
-            uint32_t vis_count = m.mesh_vis_count[mi];
-            if (vis_count == 0 || mesh.index_count == 0) continue;
-            gl_->glUniform1ui(u_inst_off, m.mesh_vis_first[mi]);
-            gl_->glDrawElementsInstancedBaseVertex(
-                GL_TRIANGLES,
-                static_cast<GLsizei>(mesh.index_count),
-                GL_UNSIGNED_INT,
-                reinterpret_cast<const void*>(static_cast<uintptr_t>(mesh.ebo_byte_offset)),
-                static_cast<GLsizei>(vis_count),
-                static_cast<GLint>(mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES));
-        }
+        gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
+        gl_->glMultiDrawElementsIndirect(
+            GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
+            static_cast<GLsizei>(m.indirect_command_count), 0);
     }
+    gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
     gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
 }
 
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index fd21bb76416..966761eeaf1 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -38,6 +38,15 @@
 #include "InstancedGeometry.h"
 #include "SidecarCache.h"
 
+// Matches GL_DRAW_INDIRECT_BUFFER layout for glMultiDrawElementsIndirect.
+struct DrawElementsIndirectCommand {
+    uint32_t count;
+    uint32_t instanceCount;
+    uint32_t firstIndex;
+    uint32_t baseVertex;
+    uint32_t baseInstance;
+};
+
 // Per-model GPU state for the instanced render path.
 //
 //   VBO: local-coord interleaved verts (pos3 + normal3 + color1_packed) — 28 B.
@@ -71,13 +80,15 @@ struct ModelGpuData {
     ModelBvh             bvh;
 
     // Dynamic visible-instance index buffer (std430, binding = 1).
-    // Re-uploaded each frame from frame_visible_scratch_.
+    // Re-uploaded each frame from visible_flat_.
     GLuint  visible_ssbo = 0;
     size_t  visible_ssbo_capacity = 0;  // bytes
 
-    // Per-mesh visible-list offset/count, rebuilt each frame.
-    std::vector<uint32_t> mesh_vis_first;
-    std::vector<uint32_t> mesh_vis_count;
+    // GL_DRAW_INDIRECT_BUFFER of DrawElementsIndirectCommand[], one per
+    // non-empty mesh.  Re-uploaded each frame.
+    GLuint  indirect_buffer = 0;
+    size_t  indirect_capacity = 0;        // bytes
+    uint32_t indirect_command_count = 0;  // valid commands this frame
 
     bool finalized = false;
     bool hidden    = false;
@@ -152,8 +163,9 @@ class ViewportWindow : public QWindow {
     bool growModelSsbo(ModelGpuData& m, size_t needed_total);
     ModelGpuData& getOrCreateModel(uint32_t model_id);
 
-    // Populate m.mesh_vis_first / mesh_vis_count and upload visible indices
-    // to m.visible_ssbo.  Uses BVH when available, else linear scan.
+    // Frustum-cull m's instances (BVH if available, else linear scan),
+    // build the per-mesh DrawElementsIndirectCommand array + flat visible
+    // list, and upload both to m.indirect_buffer / m.visible_ssbo.
     void cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]);
 
     // Mouse interaction
@@ -194,9 +206,12 @@ class ViewportWindow : public QWindow {
 
     // Reused scratch: visible-instance index lists per mesh, flattened into
     // `visible_flat_` for upload.  Both live in the parent object to avoid
-    // per-frame allocation.
-    std::vector<std::vector<uint32_t>> visible_by_mesh_;
-    std::vector<uint32_t>              visible_flat_;
+    // per-frame allocation.  indirect_scratch_ is the matching array of
+    // DrawElementsIndirectCommand records — forward-declared as bytes so
+    // the header doesn't need the struct definition.
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_;
+    std::vector<uint32_t>                  visible_flat_;
+    std::vector<DrawElementsIndirectCommand> indirect_scratch_;
 
     // Camera
     QVector3D camera_target_{0, 0, 0};

From 04733487347504b6cd609857d7fe36047961ef87 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 21:10:27 +1000
Subject: [PATCH 19/37] Two-sided lighting, rename misleading draw-count stat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs conflated as "weird colors":

1. Two-sided lighting.  IFC placements often embed reflection
   matrices (mirrored families).  Transforming a_normal by
   mat3(inst.transform) produces a normal pointing the wrong way
   on those instances, and max(n·L, 0) then clamps the surface to
   pure ambient — reads as dark / washed out.  Use gl_FrontFacing
   to flip n in the fragment shader so both winding orientations
   shade correctly.  The proper fix (ship an inverse-transpose
   normal matrix or a det-sign bit per instance) is still owed;
   that would unlock re-enabling GL_CULL_FACE for a big fragment-
   work win on closed solids.

2. Stats label "inst_draws" was counting indirect sub-draws, not
   actual GL draw calls — misleading since multi-draw indirect (MDI)
   collapses N sub-draws into one glMultiDrawElementsIndirect call.
   Split into gl_draw_calls (real GL calls, = drawn-model count) and
   indirect_sub_draws (packed sub-commands).  For a BIM model
   with 47k unique meshes at full view this now correctly reads
   "1 gl_draws (47092 sub)" rather than suggesting 47k driver
   dispatches.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/MainWindow.cpp     |  6 ++++--
 src/ifcviewer/ViewportWindow.cpp | 19 ++++++++++++++-----
 src/ifcviewer/ViewportWindow.h   |  6 ++++--
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index ceeedc8cbd4..8b63f3bdf68 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -41,13 +41,15 @@ MainWindow::MainWindow(QWidget* parent)
     connect(viewport_, &ViewportWindow::frameStatsUpdated, this, [this](const ViewportWindow::FrameStats& s) {
         if (!stats_label_->isVisible()) return;
         stats_label_->setText(
-            QString("%1 fps | %2 ms | %3/%4 obj | %5/%6 tri")
+            QString("%1 fps | %2 ms | %3/%4 obj | %5/%6 tri | %7 gl_draws (%8 sub)")
                 .arg(s.fps, 0, 'f', 1)
                 .arg(s.frame_time_ms, 0, 'f', 1)
                 .arg(s.visible_objects)
                 .arg(s.total_objects)
                 .arg(s.visible_triangles)
-                .arg(s.total_triangles));
+                .arg(s.total_triangles)
+                .arg(s.gl_draw_calls)
+                .arg(s.indirect_sub_draws));
     });
 
     connect(&AppSettings::instance(), &AppSettings::showStatsChanged, this, [this](bool show) {
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index b24ff7e3b3d..d58f192733a 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -123,7 +123,13 @@ uniform vec3 u_light_dir;
 out vec4 frag_color;
 
 void main() {
+    // Two-sided lighting: IFC placements frequently embed reflections
+    // (mirrored families), which flip triangle winding and invert the
+    // transformed normal.  Taking abs(dot) — or equivalently flipping n
+    // based on gl_FrontFacing — makes both sides shade correctly
+    // regardless of winding / reflection state.
     vec3 n = normalize(v_normal);
+    if (!gl_FrontFacing) n = -n;
     float ndotl = max(dot(n, u_light_dir), 0.0);
     float ambient = 0.25;
     float diffuse = 0.75 * ndotl;
@@ -906,7 +912,8 @@ void ViewportWindow::render() {
 
     visible_triangles_ = 0;
     visible_objects_ = 0;
-    instanced_draws_ = 0;
+    gl_draw_calls_ = 0;
+    indirect_sub_draws_ = 0;
 
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
@@ -926,7 +933,8 @@ void ViewportWindow::render() {
             visible_triangles_ += (cmd.count / 3) * cmd.instanceCount;
             visible_objects_   += cmd.instanceCount;
         }
-        instanced_draws_ += m.indirect_command_count;
+        indirect_sub_draws_ += m.indirect_command_count;
+        ++gl_draw_calls_;
     }
     gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
 
@@ -964,16 +972,17 @@ void ViewportWindow::render() {
         stats.total_triangles = total_tri;
         stats.visible_triangles = visible_triangles_;
         stats.unique_meshes = total_meshes;
-        stats.instanced_draws = instanced_draws_;
+        stats.gl_draw_calls = gl_draw_calls_;
+        stats.indirect_sub_draws = indirect_sub_draws_;
         emit frameStatsUpdated(stats);
 
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
-               "meshes %u  inst_draws %u  "
+               "meshes %u  gl_draws %u  sub_draws %u  "
                "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
                visible_objects_, total_obj,
                visible_triangles_, total_tri,
-               total_meshes, instanced_draws_,
+               total_meshes, gl_draw_calls_, indirect_sub_draws_,
                (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
                total_vbo / (1024.0*1024.0),
                total_ebo / (1024.0*1024.0),
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 966761eeaf1..2c3019eb15d 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -136,7 +136,8 @@ class ViewportWindow : public QWindow {
         uint32_t total_triangles;
         uint32_t visible_triangles;
         uint32_t unique_meshes;
-        uint32_t instanced_draws;
+        uint32_t gl_draw_calls;        // actual glMultiDrawElementsIndirect issues per frame
+        uint32_t indirect_sub_draws;   // total commands packed into those indirect buffers
     };
 
 signals:
@@ -202,7 +203,8 @@ class ViewportWindow : public QWindow {
     // Per-frame stats
     uint32_t visible_triangles_ = 0;
     uint32_t visible_objects_ = 0;
-    uint32_t instanced_draws_ = 0;
+    uint32_t gl_draw_calls_ = 0;
+    uint32_t indirect_sub_draws_ = 0;
 
     // Reused scratch: visible-instance index lists per mesh, flattened into
     // `visible_flat_` for upload.  Both live in the parent object to avoid

From d1cdec2c54829f147e4c96f8b5d7376621948e19 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 22:06:58 +1000
Subject: [PATCH 20/37] Enable reorient-shells in geometry iterator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IFC files routinely have IfcConnectedFaceSets whose faces point
inconsistently within the same shell — the result under per-vertex
normals is dark inside-out patches, and under GL_CULL_FACE it's
swiss-cheese.  reorient-shells fixes the face winding at geometry
generation time, which is the only place it can be fixed correctly;
no shader trick can recover from a mesh whose triangles disagree
among themselves.

Off by default in IfcOpenShell because it adds iterator time, but
we cache the result in the sidecar so it's a one-shot cost per file.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/GeometryStreamer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/ifcviewer/GeometryStreamer.cpp b/src/ifcviewer/GeometryStreamer.cpp
index 226fb0808ca..d3edcce19f8 100644
--- a/src/ifcviewer/GeometryStreamer.cpp
+++ b/src/ifcviewer/GeometryStreamer.cpp
@@ -270,6 +270,11 @@ void GeometryStreamer::run(const std::string& path, int num_threads) {
     settings.set("use-world-coords", false);
     settings.set("weld-vertices", false);
     settings.set("apply-default-materials", true);
+    // Off by default in IfcOpenShell — makes face winding consistent within
+    // each shell, which we need for GL_CULL_FACE and for per-vertex normals
+    // to shade a solid without dark inside-out patches.  Costs some iterator
+    // time, but results are cached in the sidecar so it's a one-shot hit.
+    settings.set("reorient-shells", true);
 
     std::unique_ptr<IfcGeom::Iterator> iterator;
     try {

From 4729a094fcdcc8ca6cf51bb32cc9018a12193642 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 22:07:19 +1000
Subject: [PATCH 21/37] Backface culling with reflection-aware two-pass MDI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enables GL_CULL_FACE by default (user-toggleable in Settings) so
closed solids skip shading their back halves.  The catch is that
IFC placements can contain reflections (mat4 with det<0 — mirrored
families, symmetric instances).  Naively culling would make every
mirrored instance vanish because the rasterizer sees its screen-space
winding as backwards.

Fix: detect reflections at upload time via determinant sign, bucket
visible instances into forward (det>=0) and reverse (det<0) per mesh
during culling, and issue two glMultiDrawElementsIndirect calls per
model with glFrontFace toggled CCW/CW between them.  The indirect
buffer is still one buffer — just split into a forward slice followed
by a reverse slice, with m.indirect_forward_count recording the split.

Vertex shader flips the normal when the transform has negative
determinant, keeping lighting correct on mirrored instances.  The
fragment shader keeps the gl_FrontFacing fallback as a safety net
when culling is disabled (e.g. for files with open shells).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/AppSettings.cpp    |  14 +++
 src/ifcviewer/AppSettings.h      |   5 +
 src/ifcviewer/SettingsWindow.cpp |   8 ++
 src/ifcviewer/SettingsWindow.h   |   1 +
 src/ifcviewer/ViewportWindow.cpp | 176 ++++++++++++++++++++++++-------
 src/ifcviewer/ViewportWindow.h   |  13 ++-
 6 files changed, 174 insertions(+), 43 deletions(-)

diff --git a/src/ifcviewer/AppSettings.cpp b/src/ifcviewer/AppSettings.cpp
index af1edfa36f6..ff8d3bb3f1b 100644
--- a/src/ifcviewer/AppSettings.cpp
+++ b/src/ifcviewer/AppSettings.cpp
@@ -25,6 +25,7 @@ namespace {
 constexpr const char* kGeometryLibraryKey = "geometry/library";
 constexpr const char* kGeometryLibraryDefault = "hybrid-cgal-simple-opencascade";
 constexpr const char* kShowStatsKey = "viewport/show_stats";
+constexpr const char* kBackfaceCullingKey = "viewport/backface_culling";
 }
 
 AppSettings& AppSettings::instance() {
@@ -58,14 +59,27 @@ void AppSettings::setShowStats(bool value) {
     emit showStatsChanged(value);
 }
 
+bool AppSettings::backfaceCulling() const {
+    return backface_culling_;
+}
+
+void AppSettings::setBackfaceCulling(bool value) {
+    if (backface_culling_ == value) return;
+    backface_culling_ = value;
+    persist();
+    emit backfaceCullingChanged(value);
+}
+
 void AppSettings::load() {
     QSettings settings;
     geometry_library_ = settings.value(kGeometryLibraryKey, kGeometryLibraryDefault).toString();
     show_stats_ = settings.value(kShowStatsKey, false).toBool();
+    backface_culling_ = settings.value(kBackfaceCullingKey, true).toBool();
 }
 
 void AppSettings::persist() {
     QSettings settings;
     settings.setValue(kGeometryLibraryKey, geometry_library_);
     settings.setValue(kShowStatsKey, show_stats_);
+    settings.setValue(kBackfaceCullingKey, backface_culling_);
 }
diff --git a/src/ifcviewer/AppSettings.h b/src/ifcviewer/AppSettings.h
index f70062475c6..8b38c61a338 100644
--- a/src/ifcviewer/AppSettings.h
+++ b/src/ifcviewer/AppSettings.h
@@ -37,9 +37,13 @@ class AppSettings : public QObject {
     bool showStats() const;
     void setShowStats(bool value);
 
+    bool backfaceCulling() const;
+    void setBackfaceCulling(bool value);
+
 signals:
     void geometryLibraryChanged(const QString& value);
     void showStatsChanged(bool value);
+    void backfaceCullingChanged(bool value);
 
 private:
     AppSettings();
@@ -48,6 +52,7 @@ class AppSettings : public QObject {
 
     QString geometry_library_;
     bool show_stats_ = false;
+    bool backface_culling_ = true;
 };
 
 #endif // APPSETTINGS_H
diff --git a/src/ifcviewer/SettingsWindow.cpp b/src/ifcviewer/SettingsWindow.cpp
index c4ebddc650e..69e1f025b80 100644
--- a/src/ifcviewer/SettingsWindow.cpp
+++ b/src/ifcviewer/SettingsWindow.cpp
@@ -44,6 +44,12 @@ void SettingsWindow::setupUi() {
     show_stats_check_ = new QCheckBox(this);
     form->addRow("Show Performance Stats", show_stats_check_);
 
+    backface_culling_check_ = new QCheckBox(this);
+    backface_culling_check_->setToolTip(
+        "Skip triangles facing away from the camera.  Big FPS win on "
+        "closed solids; disable if you see holes in open geometry.");
+    form->addRow("Backface Culling", backface_culling_check_);
+
     auto* button_box = new QDialogButtonBox(
         QDialogButtonBox::Ok | QDialogButtonBox::Cancel, this);
 
@@ -65,10 +71,12 @@ void SettingsWindow::showEvent(QShowEvent* event) {
 void SettingsWindow::syncFromSettings() {
     geometry_library_edit_->setText(AppSettings::instance().geometryLibrary());
     show_stats_check_->setChecked(AppSettings::instance().showStats());
+    backface_culling_check_->setChecked(AppSettings::instance().backfaceCulling());
 }
 
 void SettingsWindow::onAccepted() {
     AppSettings::instance().setGeometryLibrary(geometry_library_edit_->text());
     AppSettings::instance().setShowStats(show_stats_check_->isChecked());
+    AppSettings::instance().setBackfaceCulling(backface_culling_check_->isChecked());
     accept();
 }
diff --git a/src/ifcviewer/SettingsWindow.h b/src/ifcviewer/SettingsWindow.h
index ea55252682e..967938b4a23 100644
--- a/src/ifcviewer/SettingsWindow.h
+++ b/src/ifcviewer/SettingsWindow.h
@@ -43,6 +43,7 @@ private slots:
 
     QLineEdit* geometry_library_edit_ = nullptr;
     QCheckBox* show_stats_check_ = nullptr;
+    QCheckBox* backface_culling_check_ = nullptr;
 };
 
 #endif
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index d58f192733a..84778f3f2f7 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -19,6 +19,8 @@
 
 #include "ViewportWindow.h"
 
+#include "AppSettings.h"
+
 #include <QMouseEvent>
 #include <QWheelEvent>
 #include <QSurfaceFormat>
@@ -91,10 +93,17 @@ void main() {
     vec4 world = inst.transform * vec4(a_position, 1.0);
     gl_Position = u_view_projection * world;
 
-    // Rotate the normal by the upper-3x3 of the transform. For the vast
-    // majority of BIM placements this is a rigid rotation (+ uniform scale),
-    // so we skip the inverse-transpose.
-    v_normal = normalize(mat3(inst.transform) * a_normal);
+    // Rotate the normal by the upper-3x3 of the transform.  BIM placements
+    // are overwhelmingly rigid rotations (+ optional uniform scale +
+    // optional reflection), so we skip the full inverse-transpose but do
+    // need to flip the normal when the transform contains a reflection,
+    // otherwise mirrored instances shade as if inside-out.  The CPU uses
+    // the same determinant sign to pick glFrontFace per instance, so
+    // keeping them in agreement means backface culling is safe to enable.
+    mat3 rot = mat3(inst.transform);
+    vec3 n = rot * a_normal;
+    if (determinant(rot) < 0.0) n = -n;
+    v_normal = normalize(n);
 
     vec4 baked = a_color;
     if (inst.color_override != 0u) {
@@ -123,11 +132,11 @@ uniform vec3 u_light_dir;
 out vec4 frag_color;
 
 void main() {
-    // Two-sided lighting: IFC placements frequently embed reflections
-    // (mirrored families), which flip triangle winding and invert the
-    // transformed normal.  Taking abs(dot) — or equivalently flipping n
-    // based on gl_FrontFacing — makes both sides shade correctly
-    // regardless of winding / reflection state.
+    // v_normal already has the reflection flip applied in the vertex
+    // shader.  When backface culling is off, open shells let us see the
+    // "wrong" side of a face — flip based on gl_FrontFacing so both
+    // sides light correctly.  When culling is on, gl_FrontFacing is
+    // always true, so the flip below never fires.
     vec3 n = normalize(v_normal);
     if (!gl_FrontFacing) n = -n;
     float ndotl = max(dot(n, u_light_dir), 0.0);
@@ -230,6 +239,17 @@ static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint fra
 
 // -----------------------------------------------------------------------------
 
+// Determinant of the upper-left 3x3 of a column-major mat4 stored as 16 floats.
+// Sign tells us whether the transform contains a reflection, which is what
+// decides which glFrontFace winding to draw the instance with.
+static bool transformIsReflected(const float t[16]) {
+    const float det =
+        t[0] * (t[5] * t[10] - t[9] * t[6])
+      - t[4] * (t[1] * t[10] - t[9] * t[2])
+      + t[8] * (t[1] * t[6]  - t[5] * t[2]);
+    return det < 0.0f;
+}
+
 static bool aabbInFrustum(const float aabb_min[3], const float aabb_max[3],
                           const float planes[6][4]) {
     for (int p = 0; p < 6; ++p) {
@@ -344,6 +364,19 @@ void ViewportWindow::initGL() {
     gl_->glEnable(GL_DEPTH_TEST);
     gl_->glEnable(GL_MULTISAMPLE);
     gl_->glClearColor(0.18f, 0.20f, 0.22f, 1.0f);
+    gl_->glCullFace(GL_BACK);
+    if (AppSettings::instance().backfaceCulling()) gl_->glEnable(GL_CULL_FACE);
+    else                                            gl_->glDisable(GL_CULL_FACE);
+
+    // Hot-toggle cull state when the setting changes.  Default (auto)
+    // connection: the slot makes the context current before touching GL.
+    connect(&AppSettings::instance(), &AppSettings::backfaceCullingChanged,
+            this, [this](bool on) {
+                if (!gl_initialized_ || !gl_) return;
+                context_->makeCurrent(this);
+                if (on) gl_->glEnable(GL_CULL_FACE);
+                else    gl_->glDisable(GL_CULL_FACE);
+            });
 
     gl_initialized_ = true;
     frame_clock_.start();
@@ -552,6 +585,7 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     std::memcpy(inst.world_aabb_min, chunk.world_aabb_min, sizeof(inst.world_aabb_min));
     std::memcpy(inst.world_aabb_max, chunk.world_aabb_max, sizeof(inst.world_aabb_max));
     m.instances.push_back(inst);
+    m.instance_reflected.push_back(transformIsReflected(inst.transform) ? 1 : 0);
 
     // Append the GPU record to the instance SSBO so the model is drawable
     // immediately, without waiting for finalizeModel.  The visible-list
@@ -693,6 +727,13 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
     }
     m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
 
+    // Recompute the reflection flag from each instance's transform — the
+    // sidecar only caches InstanceCpu, not the parallel reflection flags.
+    m.instance_reflected.resize(m.instances.size());
+    for (size_t i = 0; i < m.instances.size(); ++i) {
+        m.instance_reflected[i] = transformIsReflected(m.instances[i].transform) ? 1 : 0;
+    }
+
     buildBvhForModel(m, model_id);
 
     m.finalized = true;
@@ -783,15 +824,25 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
 }
 
 void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]) {
-    // Ensure per-mesh scratch sized.
-    if (visible_by_mesh_.size() < m.meshes.size()) visible_by_mesh_.resize(m.meshes.size());
-    for (size_t i = 0; i < m.meshes.size(); ++i) visible_by_mesh_[i].clear();
+    // Per-mesh scratch, split by winding: fwd = non-reflected (CCW in screen
+    // space), rev = reflected (CW in screen space).  Splitting lets the draw
+    // pass toggle glFrontFace once between two MDI calls so GL_CULL_FACE does
+    // the right thing for both.
+    if (visible_by_mesh_fwd_.size() < m.meshes.size()) visible_by_mesh_fwd_.resize(m.meshes.size());
+    if (visible_by_mesh_rev_.size() < m.meshes.size()) visible_by_mesh_rev_.resize(m.meshes.size());
+    for (size_t i = 0; i < m.meshes.size(); ++i) {
+        visible_by_mesh_fwd_[i].clear();
+        visible_by_mesh_rev_[i].clear();
+    }
 
     auto test_and_push = [&](uint32_t inst_idx) {
         const InstanceCpu& inst = m.instances[inst_idx];
         if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
-        if (inst.mesh_id < visible_by_mesh_.size())
-            visible_by_mesh_[inst.mesh_id].push_back(inst_idx);
+        if (inst.mesh_id >= m.meshes.size()) return;
+        const bool reflected = inst_idx < m.instance_reflected.size()
+            && m.instance_reflected[inst_idx] != 0;
+        if (reflected) visible_by_mesh_rev_[inst.mesh_id].push_back(inst_idx);
+        else           visible_by_mesh_fwd_[inst.mesh_id].push_back(inst_idx);
     };
 
     if (!m.bvh.nodes.empty()) {
@@ -820,27 +871,34 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         for (uint32_t i = 0; i < m.instances.size(); ++i) test_and_push(i);
     }
 
-    // Flatten into visible_flat_ and build one DrawElementsIndirectCommand
-    // per non-empty mesh.
+    // Flatten fwd-slice first, then rev-slice, into visible_flat_.  Build
+    // matching DrawElementsIndirectCommands; commands for the fwd slice fill
+    // [0, indirect_forward_count), rev fills [indirect_forward_count, end).
     visible_flat_.clear();
     indirect_scratch_.clear();
-    for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
-        const auto& mesh = m.meshes[mi];
-        const uint32_t vis_count = static_cast<uint32_t>(visible_by_mesh_[mi].size());
-        if (vis_count == 0 || mesh.index_count == 0) continue;
-
-        DrawElementsIndirectCommand cmd;
-        cmd.count         = mesh.index_count;
-        cmd.instanceCount = vis_count;
-        cmd.firstIndex    = mesh.ebo_byte_offset / sizeof(uint32_t);
-        cmd.baseVertex    = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
-        cmd.baseInstance  = static_cast<uint32_t>(visible_flat_.size());
-        indirect_scratch_.push_back(cmd);
-
-        visible_flat_.insert(visible_flat_.end(),
-                             visible_by_mesh_[mi].begin(),
-                             visible_by_mesh_[mi].end());
-    }
+
+    auto emit_slice = [&](std::vector<std::vector<uint32_t>>& by_mesh) {
+        for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
+            const auto& mesh = m.meshes[mi];
+            const uint32_t vis_count = static_cast<uint32_t>(by_mesh[mi].size());
+            if (vis_count == 0 || mesh.index_count == 0) continue;
+
+            DrawElementsIndirectCommand cmd;
+            cmd.count         = mesh.index_count;
+            cmd.instanceCount = vis_count;
+            cmd.firstIndex    = mesh.ebo_byte_offset / sizeof(uint32_t);
+            cmd.baseVertex    = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
+            cmd.baseInstance  = static_cast<uint32_t>(visible_flat_.size());
+            indirect_scratch_.push_back(cmd);
+
+            visible_flat_.insert(visible_flat_.end(),
+                                 by_mesh[mi].begin(), by_mesh[mi].end());
+        }
+    };
+
+    emit_slice(visible_by_mesh_fwd_);
+    m.indirect_forward_count = static_cast<uint32_t>(indirect_scratch_.size());
+    emit_slice(visible_by_mesh_rev_);
     m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
 
     // Upload visible list (keep binding alive even when empty).
@@ -915,6 +973,10 @@ void ViewportWindow::render() {
     gl_draw_calls_ = 0;
     indirect_sub_draws_ = 0;
 
+    // Start each frame with CCW-is-front; the two-pass draw below flips
+    // back and forth.  Harmless when culling is off.
+    gl_->glFrontFace(GL_CCW);
+
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
@@ -925,16 +987,34 @@ void ViewportWindow::render() {
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
         gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
-        gl_->glMultiDrawElementsIndirect(
-            GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
-            static_cast<GLsizei>(m.indirect_command_count), 0);
+
+        const uint32_t fwd = m.indirect_forward_count;
+        const uint32_t rev = m.indirect_command_count - fwd;
+        // Forward pass: non-reflected instances, standard CCW winding.
+        if (fwd > 0) {
+            gl_->glFrontFace(GL_CCW);
+            gl_->glMultiDrawElementsIndirect(
+                GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
+                static_cast<GLsizei>(fwd), 0);
+            ++gl_draw_calls_;
+        }
+        // Reverse pass: reflected instances — their world-space winding is
+        // flipped, so telling GL the front is CW keeps cull-back working.
+        if (rev > 0) {
+            gl_->glFrontFace(GL_CW);
+            gl_->glMultiDrawElementsIndirect(
+                GL_TRIANGLES, GL_UNSIGNED_INT,
+                reinterpret_cast<const void*>(fwd * sizeof(DrawElementsIndirectCommand)),
+                static_cast<GLsizei>(rev), 0);
+            ++gl_draw_calls_;
+            gl_->glFrontFace(GL_CCW);
+        }
 
         for (const auto& cmd : indirect_scratch_) {
             visible_triangles_ += (cmd.count / 3) * cmd.instanceCount;
             visible_objects_   += cmd.instanceCount;
         }
         indirect_sub_draws_ += m.indirect_command_count;
-        ++gl_draw_calls_;
     }
     gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
 
@@ -1006,6 +1086,8 @@ void ViewportWindow::renderPickPass() {
     GLint u_vp       = gl_->glGetUniformLocation(pick_program_, "u_view_projection");
     gl_->glUniformMatrix4fv(u_vp, 1, GL_FALSE, vp.constData());
 
+    gl_->glFrontFace(GL_CCW);
+
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
@@ -1016,9 +1098,23 @@ void ViewportWindow::renderPickPass() {
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
         gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
-        gl_->glMultiDrawElementsIndirect(
-            GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
-            static_cast<GLsizei>(m.indirect_command_count), 0);
+
+        const uint32_t fwd = m.indirect_forward_count;
+        const uint32_t rev = m.indirect_command_count - fwd;
+        if (fwd > 0) {
+            gl_->glFrontFace(GL_CCW);
+            gl_->glMultiDrawElementsIndirect(
+                GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
+                static_cast<GLsizei>(fwd), 0);
+        }
+        if (rev > 0) {
+            gl_->glFrontFace(GL_CW);
+            gl_->glMultiDrawElementsIndirect(
+                GL_TRIANGLES, GL_UNSIGNED_INT,
+                reinterpret_cast<const void*>(fwd * sizeof(DrawElementsIndirectCommand)),
+                static_cast<GLsizei>(rev), 0);
+            gl_->glFrontFace(GL_CCW);
+        }
     }
     gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
     gl_->glBindFramebuffer(GL_FRAMEBUFFER, 0);
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 2c3019eb15d..1bbc44c97c4 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -71,7 +71,12 @@ struct ModelGpuData {
     uint32_t total_triangles = 0;
 
     std::vector<MeshInfo>    meshes;
-    std::vector<InstanceCpu> instances;    // unsorted until finalize
+    std::vector<InstanceCpu> instances;    // unsorted
+    // 1:1 with instances[] — true when the instance transform has
+    // det < 0 (a reflection).  Reflected instances need their
+    // triangle winding treated as reversed so GL_CULL_FACE culls
+    // the correct side.
+    std::vector<uint8_t>     instance_reflected;
     uint32_t                 ssbo_instance_count = 0;
 
     // Per-instance world AABB + BVH (built at finalize).  The BVH is the
@@ -88,7 +93,8 @@ struct ModelGpuData {
     // non-empty mesh.  Re-uploaded each frame.
     GLuint  indirect_buffer = 0;
     size_t  indirect_capacity = 0;        // bytes
-    uint32_t indirect_command_count = 0;  // valid commands this frame
+    uint32_t indirect_command_count = 0;  // total valid commands this frame
+    uint32_t indirect_forward_count = 0;  // first N are CCW-winding draws
 
     bool finalized = false;
     bool hidden    = false;
@@ -211,7 +217,8 @@ class ViewportWindow : public QWindow {
     // per-frame allocation.  indirect_scratch_ is the matching array of
     // DrawElementsIndirectCommand records — forward-declared as bytes so
     // the header doesn't need the struct definition.
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_;
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_;
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_;
     std::vector<uint32_t>                  visible_flat_;
     std::vector<DrawElementsIndirectCommand> indirect_scratch_;
 

From f32f471af37ddcb7ef6998d5addc3cfa1244476f Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Sun, 12 Apr 2026 23:16:38 +1000
Subject: [PATCH 22/37] Rewrite README for instancing pipeline and refocus
 Phase 3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous README described a pre-instancing world (32-byte world-
coord vertices with per-vertex object_id, ObjectDrawInfo structs, EBO
reordering after BVH build, and a Phase 3 plan built around moving
draw submission to the GPU).  Most of that is either gone or already
solved:

  - Vertices are now 28 B local-coord; per-instance transforms live
    in an SSBO read through a visible-index SSBO and gl_BaseInstanceARB.
  - ObjectDrawInfo is replaced by MeshInfo + InstanceCpu + InstanceGpu.
  - No EBO reorder on BVH build — the BVH is over instance AABBs and
    the mesh/EBO layout is orthogonal.
  - Draw-call submission is already one glMultiDrawElementsIndirect
    per model; the old Phase 3 goal is met.

New content worth keeping:

  - GPU instancing section documents the mesh/instance/visible/indirect
    buffer contract the whole renderer hangs off of.
  - Reflection-aware two-pass draw is documented (det<0 placements,
    forward/reverse slice split, glFrontFace toggle).
  - reorient-shells and backface culling are called out as correctness
    + perf levers with their tradeoffs.
  - Phase 3 is rewritten around the actual bottleneck surfaced by
    profiling: per-frame glNamedBufferSubData stalls on the visible
    and indirect buffers.  Includes the diagnostic methodology (empty-
    screen jump to 60 fps, window/MSAA invariance, upload-comment-out
    experiment) so future-me remembers why this is the next step.
  - 3A (persistent mapped ring buffers, near-term) and 3B (GPU-side
    compute cull, longer-term) split out with scope estimates.
  - Roadmap updated: instancing / MDI / reflections / reorient-shells
    / backface cull all ticked; 3A surfaced as the next open item.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md | 757 +++++++++++++++++-----------------------
 1 file changed, 322 insertions(+), 435 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index d0122d63c83..4966d27d5ef 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -1,75 +1,115 @@
 # IfcViewer
 
-A high-performance native IFC viewer built on IfcOpenShell's C++ geometry engine with a Qt6 interface and OpenGL 4.5 rendering.
+A high-performance native IFC viewer built on IfcOpenShell's C++ geometry
+engine with a Qt6 interface and OpenGL 4.5 rendering.
 
 ## Architecture
 
 ```
-+-------------------------------------------+
-|  Qt6 Application (MainWindow)             |
-|  +----------+ +--------------------------+|
-|  | Element  | | 3D Viewport              ||
-|  | Tree     | | (QWindow + OpenGL 4.5)   ||
-|  | (per-    | |                          ||
-|  |  model)  | | Per-model VAO/VBO/EBO    ||
-|  +----------+ | glMultiDrawElements      ||
-|  | Property | | BVH frustum culling      ||
-|  | Table    | | GPU pick pass            ||
-|  +----------+ +--------------------------+|
-|  | Status / Progress / Stats              |
-+-------------------------------------------+
-        ^                    ^
-        |                    |
-  element metadata     UploadChunks / Sidecar
-        |                    |
-+-------------------------------------------+
-|  GeometryStreamer (one per loaded model)   |
-|  IfcGeom::Iterator with N threads         |
-|  (models loaded sequentially)             |
-+-------------------------------------------+
++---------------------------------------------------+
+|  Qt6 Application (MainWindow)                     |
+|  +----------+ +----------------------------------+|
+|  | Element  | | 3D Viewport                      ||
+|  | Tree     | | (QWindow + OpenGL 4.5 Core)      ||
+|  | (per-    | |                                  ||
+|  |  model)  | | Per-model: VAO/VBO/EBO           ||
+|  +----------+ |            instance SSBO         ||
+|  | Property | |            visible SSBO          ||
+|  | Table    | |            indirect buffer       ||
+|  +----------+ | glMultiDrawElementsIndirect      ||
+|  | Status / Progress / Stats                      |
++---------------------------------------------------+
+        ^                        ^
+        |                        |
+  element metadata      MeshChunk / InstanceChunk / Sidecar
+        |                        |
++---------------------------------------------------+
+|  GeometryStreamer (one per loaded model)          |
+|  IfcGeom::Iterator with N threads                 |
+|  Dedups representations -> MeshChunk              |
+|  Emits one InstanceChunk per placement            |
++---------------------------------------------------+
 ```
 
 ### Key design decisions
 
-- **QWindow viewport** embedded via `QWidget::createWindowContainer()`. This gives us a raw native surface for OpenGL, bypassing `QOpenGLWidget`'s compositor overhead.
-- **Per-model GPU buffers**: each loaded model gets its own VAO/VBO/EBO. No shared buffer, no cross-model copies on growth. Removing a model frees its GPU memory immediately.
-- **Interleaved vertex format**: position (3 floats) + normal (3 floats) + object ID (1 float, bitcast uint32) + color (RGBA8 packed into 1 float) = 32 bytes per vertex.
-- **Progressive GPU upload**: bulk sidecar loads allocate empty GPU buffers, then stream data in 48 MB chunks per frame. VBO uploads first (no objects visible), then EBO (objects appear progressively as their index range lands). The viewport stays interactive throughout — you can orbit already-loaded models while new ones stream in.
-- **Non-blocking sidecar loading**: sidecar files are read on a background thread. The heavy disk I/O (potentially gigabytes) never blocks the render loop. Only the final GPU upload and tree population happen on the main thread.
-- **BVH frustum culling**: per-model BVH trees cull entire subtrees of objects in one frustum test, reducing per-frame cost from O(N) to O(log N). Falls back to linear scan during progressive upload; BVH activates once the model is fully loaded.
-- **GPU object picking**: a second render pass writes object IDs to an R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
-- **Multi-model support**: multiple IFC files can be loaded simultaneously. Each model gets its own `GeometryStreamer` (owning the `ifcopenshell::file` for property lookup). Models are loaded sequentially. Per-model visibility toggle and removal are supported.
-- **Multi-threaded tessellation**: `IfcGeom::Iterator` runs on a background thread and internally parallelizes geometry conversion across all CPU cores.
-- **Non-blocking streaming**: the iterator emits `UploadChunk` signals via Qt's queued connection. The main thread uploads to the GPU without blocking iteration.
-- **World coordinates**: geometry is emitted in world space (`use-world-coords=true`) so no per-object transform matrices are needed on the GPU.
+- **QWindow viewport** embedded via `QWidget::createWindowContainer()`. Gives
+  us a raw native surface for OpenGL, bypassing `QOpenGLWidget`'s compositor
+  overhead.
+- **GPU instancing as the central pillar.** IFC models are dominated by
+  repeated geometry — identical doors, windows, studs, pipes placed at
+  different transforms. IfcOpenShell's iterator surfaces representation
+  identity, so we upload each unique mesh exactly once and keep per-placement
+  data (transform, object id, optional colour override) in a separate SSBO.
+  For real projects this collapses tens of millions of triangles of duplicate
+  vertex data into a few hundred MB of unique meshes.
+- **Per-model GPU buffers**: each loaded model gets its own
+  VAO/VBO/EBO/instance-SSBO/visible-SSBO/indirect-buffer. No cross-model
+  growth copies. Removing a model frees its GPU memory immediately.
+- **Local-coordinate vertex format (28 B):** position (3 floats) + normal
+  (3 floats) + packed RGBA8 colour (1 uint). The per-instance transform is
+  applied in the vertex shader via an SSBO lookup. No world-baked vertex data.
+- **Multi-draw indirect:** every frame the CPU builds a flat list of visible
+  instance indices and one `DrawElementsIndirectCommand` per non-empty mesh
+  per winding slice, then issues at most two `glMultiDrawElementsIndirect`
+  calls per model. 50k visible instances across 8k unique meshes collapse
+  to one or two driver-side command submissions per model.
+- **BVH frustum culling over instances**: per-model BVH trees cull whole
+  subtrees of placements with one frustum test. Falls back to a linear scan
+  during progressive upload and for very small models (< 32 instances).
+- **Reflection-aware two-pass draw:** IFC placements can have negative-
+  determinant transforms (mirrored families). These flip the screen-space
+  winding of their triangles, which would make them vanish under
+  `GL_CULL_FACE`. The cull pass buckets visible instances into forward
+  (det ≥ 0) and reverse (det < 0) slices and the renderer issues two MDI
+  calls per model with `glFrontFace` toggled between them.
+- **`reorient-shells` enabled in the iterator:** makes face winding
+  consistent within a shell at geometry-gen time — the only place this can
+  actually be fixed. Without it, files with inside-out faces produce dark
+  patches and swiss-cheese under backface culling. Costs iterator time but
+  is cached in the sidecar.
+- **Progressive rendering during streaming:** the viewport is drawable
+  before `finalizeModel()`. Instances are pushed to the SSBO one at a time
+  via `glNamedBufferSubData` as they arrive, and the linear-scan cull path
+  handles them until the BVH is built. Orbit and pan remain interactive
+  through load.
+- **Non-blocking sidecar loading**: sidecars are read on a background
+  thread; only the final GPU upload touches the main thread.
+- **GPU object picking**: a second render pass writes object IDs into an
+  R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
+- **Multi-model support**: multiple IFCs can be loaded simultaneously.
+  Each gets its own `GeometryStreamer` (which owns the `ifcopenshell::file`
+  for property lookup). Models load sequentially. Per-model
+  hide/show/remove.
 
 ### Files
 
 | File | Purpose |
 |------|---------|
-| `main.cpp` | Application entry point, GL 4.5 surface format, CLI argument parsing |
-| `MainWindow.h/cpp` | Qt main window: multi-model project management, element tree, property table, status bar |
-| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffer management, camera, frustum culling, BVH traversal, picking |
-| `GeometryStreamer.h/cpp` | Background geometry processing: loads IFC, runs iterator, emits chunks (one per model) |
-| `BvhAccel.h/cpp` | BVH construction (median-split), per-model trees, EBO reordering |
-| `SidecarCache.h/cpp` | Raw binary `.ifcview` sidecar read/write |
-| `AppSettings.h/cpp` | Persisted application preferences (geometry library, show stats) |
-| `SettingsWindow.h/cpp` | Settings dialog UI |
+| `main.cpp` | Application entry, GL 4.5 surface format, CLI argument parsing |
+| `MainWindow.h/cpp` | Qt main window: multi-model project, element tree, properties, status |
+| `ViewportWindow.h/cpp` | OpenGL 4.5 Core renderer: shaders, buffers, camera, culling, MDI draw, picking |
+| `GeometryStreamer.h/cpp` | Background iterator runner; emits `MeshChunk` + `InstanceChunk` |
+| `InstancedGeometry.h` | Shared structs: `MeshInfo`, `InstanceCpu`, `InstanceGpu`, chunk records |
+| `BvhAccel.h/cpp` | Median-split BVH builder; operates on instance world-AABBs |
+| `SidecarCache.h/cpp` | Raw binary `.ifcview` (v4) sidecar read/write |
+| `AppSettings.h/cpp` | Persisted preferences (geometry library, stats overlay, backface culling) |
+| `SettingsWindow.h/cpp` | Settings dialog |
 | `CMakeLists.txt` | Build configuration |
 
 ## Dependencies
 
-- **Qt6** (Core, Gui, Widgets)
-- **OpenGL 4.5** (GL_ARB_direct_state_access) - available on Windows and Linux; macOS will need a Vulkan/MoltenVK backend (not yet implemented)
-- **IfcOpenShell C++ libraries** (IfcParse, IfcGeom, and their dependencies: Open CASCADE, Boost, Eigen3, optionally CGAL)
+- **Qt6** (Core, Gui, Widgets, OpenGL)
+- **OpenGL 4.5** with `GL_ARB_direct_state_access` and
+  `GL_ARB_shader_draw_parameters` — available on Windows and Linux. macOS
+  will need a Vulkan/MoltenVK backend (not yet implemented; macOS caps out
+  at GL 4.1).
+- **IfcOpenShell C++ libraries** (IfcParse, IfcGeom, and their
+  dependencies: Open CASCADE, Boost, Eigen3, optionally CGAL).
 
 ## Building
 
-IfcViewer is built as part of the IfcOpenShell CMake project. You do not need to build everything - disable the targets you don't need.
-
-### Minimal build (IfcViewer only)
-
-From the repository root:
+IfcViewer is part of the IfcOpenShell CMake project. From the repo root:
 
 ```sh
 mkdir build && cd build
@@ -89,25 +129,13 @@ cmake ../cmake \
 make -j$(nproc) IfcViewer
 ```
 
-This builds only IfcParse, IfcGeom (with geometry kernels), and IfcViewer itself. All other targets (IfcConvert, Python bindings, serializers, etc.) are skipped.
-
 If Qt6 is not in a standard location, pass `-DQT_DIR=/path/to/qt6`.
 
-### Full build with IfcViewer enabled
-
-```sh
-cmake ../cmake -DBUILD_IFCVIEWER=ON
-make -j$(nproc)
-```
-
 ## Usage
 
 ```sh
-# Open one or more files from the command line
 ./IfcViewer arch.ifc struct.ifc mep.ifc
-
-# Or use File -> Add Files from the menu (supports multiselect)
-./IfcViewer
+./IfcViewer                   # then File -> Add Files
 ```
 
 ### Controls
@@ -117,449 +145,308 @@ make -j$(nproc)
 | Middle mouse drag | Orbit camera |
 | Shift + middle mouse drag | Pan camera |
 | Scroll wheel | Zoom |
-| Left click | Select object (highlights in viewport and tree) |
+| Left click | Select object |
 
-### Keyboard shortcuts
+### Keyboard
 
 | Key | Action |
 |-----|--------|
 | Ctrl+O | Add files |
 | Ctrl+Q | Quit |
 
-## Performance Strategy
-
-The viewer targets smooth orbiting at 60 fps on models up to 1 million IFC objects.
-Rendering performance is addressed in three phases. Each phase builds on the
-previous one, and the system is designed so that smaller models never pay for
-optimizations they don't need.
-
-### Phase 1: Per-Object Frustum Culling (CPU)
-
-**Status:** Implemented.
-
-The simplest win: don't draw what's off screen.
-
-#### Data model
-
-During `uploadChunk()`, the viewport records a small metadata struct for every
-object that enters the GPU buffers:
-
-```cpp
-struct ObjectDrawInfo {
-    uint32_t index_offset;   // byte offset into the model's EBO
-    uint32_t index_count;    // number of indices (triangles * 3)
-    uint32_t model_id;       // which model this object belongs to
-    float    aabb_min[3];    // world-space axis-aligned bounding box
-    float    aabb_max[3];    // (computed from vertex positions at upload time)
-};
-```
-
-This costs 32 bytes per object. For 1M objects that's ~32 MB of CPU-side
-metadata — negligible next to the vertex data.
-
-#### Frustum extraction
-
-Each frame, before drawing, six clip planes are extracted from the
-view-projection matrix (`VP = proj * view`). The standard Griess-Hartmann
-method pulls them directly from the matrix rows:
-
-```
-left   = VP[3] + VP[0]
-right  = VP[3] - VP[0]
-bottom = VP[3] + VP[1]
-top    = VP[3] - VP[1]
-near   = VP[3] + VP[2]
-far    = VP[3] - VP[2]
-```
-
-Each plane is stored as (a, b, c, d) and normalized so that
-`a*x + b*y + c*z + d` gives the signed distance from the plane.
-
-#### AABB-frustum test
+### Settings
 
-For each object, the AABB is tested against all six planes using the
-"p-vertex / n-vertex" method:
+- **Geometry Library** — kernel string passed to IfcOpenShell (default
+  `hybrid-cgal-simple-opencascade`).
+- **Show Performance Stats** — overlay FPS / object / triangle / draw
+  counts in the status bar.
+- **Backface Culling** — `GL_CULL_FACE` on closed solids. Default on.
+  Disable if a model uses open shells and you see missing faces.
 
-- For each plane, find the AABB corner most in the direction of the plane
-  normal (the p-vertex).
-- If the p-vertex is on the negative side of the plane, the entire AABB is
-  outside the frustum → cull.
-- If any plane culls the object, skip it.
-
-This test is conservative: it never culls a visible object, but may
-occasionally keep an invisible one (when the AABB straddles a frustum corner).
-That's fine — false positives just cost a few extra triangles.
+## Performance Strategy
 
-#### Drawing visible objects
+The viewer targets smooth orbiting at 60 fps on real-world multi-discipline
+BIM projects (a "real job" being ~50 models, several million placements,
+hundreds of millions of rasterised triangles when everything is in view).
 
-The surviving objects' `(index_count, index_offset)` pairs are passed to
-`glMultiDrawElements()` in a single call. This replaces the previous single
-`glDrawElements()` that drew everything. The GPU processes only the index
-ranges that survived the frustum test.
+Rendering performance has evolved in phases. Each builds on the previous,
+and smaller models never pay for optimisations they don't need.
 
-Alternatively, for the pick pass (which runs less frequently), the same
-visibility list is reused — objects culled from the main pass are also culled
-from picking.
+### Phase 1 — Per-object Frustum Culling
 
-#### Performance characteristics
+**Status:** implemented (and still the fallback for small models / during
+streaming).
 
-| Metric | Value |
-|--------|-------|
-| Per-object cost | ~6 dot products + 6 comparisons per frame |
-| 50k objects | ~0.3 ms on a modern CPU core |
-| 500k objects | ~3 ms (starts to matter at 60 fps) |
-| 1M objects | ~6 ms (too expensive — need phase 3) |
-| Memory overhead | 32 bytes/object |
-| Load-time overhead | Near zero (AABB computed during existing upload) |
+Six view-frustum planes are extracted from the view-projection matrix each
+frame. Each instance's world AABB is tested with the p-vertex / n-vertex
+method (one dot product + one compare per plane, 6 planes).
 
-Phase 1 is sufficient for models up to ~100k objects. Beyond that, the CPU-side
-frustum test becomes a measurable fraction of the frame budget, motivating
-phase 3.
+Surviving instance indices are written into a per-mesh bucket, then
+flattened into a single `uint[]` (the "visible SSBO", binding = 1) and
+accompanied by one `DrawElementsIndirectCommand` per non-empty mesh.
+One `glMultiDrawElementsIndirect` call per model draws everything.
 
-### Phase 2: BVH Acceleration (optional, for large models)
+Cost: ~6 dot products per instance per frame. Fine up to ~100 k instances
+per frame; above that the linear scan shows up in profiles, motivating
+Phase 2.
 
-**Status:** Implemented.
+### Phase 2 — BVH Acceleration + Sidecar Cache
 
-For models exceeding ~100 objects, a bounding volume hierarchy (BVH) groups
-nearby objects into a binary tree and culls entire subtrees in one frustum
-test. This reduces the number of AABB-frustum tests from O(N_objects) to
-O(log N) in the best case (camera zoomed into a corner) and gives a constant
-overhead for the common case where most of the model is on screen.
+**Status:** implemented.
 
-A BVH was chosen over an octree because BIM data is spatially non-uniform —
-dense MEP risers in one zone, sparse open atriums in another. An octree
-subdivides space uniformly, wasting nodes on empty regions and creating deep
-chains in dense ones. A BVH adapts its splits to the actual object
-distribution, producing balanced trees regardless of density variation.
+For models exceeding ~32 instances, a bounding volume hierarchy groups
+nearby placements into a binary tree and culls entire subtrees with a
+single frustum test. This reduces per-frame work from O(N) to O(log N) in
+the best case (camera zoomed to a corner) and remains well under 1 ms for
+100 k instances in the worst case (everything on screen).
 
-#### When the BVH activates
+A BVH was chosen over an octree because BIM data is spatially non-uniform
+— dense MEP risers in one zone, sparse open atria in another. An octree
+subdivides space uniformly, wasting nodes on empty regions and creating
+deep chains in dense ones. A BVH adapts its splits to the actual
+placement distribution.
 
-The BVH is **optional and non-disruptive**. Until it is built, phase 1's
-linear scan handles all culling. The rendering loop checks for an active BVH
-and falls back to the linear scan for any model that doesn't have one.
+#### Activation
 
-The BVH activates in one of two ways:
+The BVH is optional and non-disruptive. Until it is built, the Phase 1
+linear scan handles culling. The renderer checks for a BVH per model and
+falls back to the scan for any model that doesn't have one.
 
-1. **Sidecar cache exists**: If a `.ifcview` file is found next to the `.ifc`
-   file, the BVH is loaded from it instantly (raw memory read, no parsing).
-   The model uses BVH culling from the first frame after loading.
-2. **Automatic build**: After streaming finishes, a background thread builds
-   the BVH from the per-object AABBs already computed in phase 1. Until it
-   completes, phase 1 culling handles visibility. On completion, the render
-   thread picks up the BVH on the next frame. The sidecar is written for
-   future loads.
+It activates in one of two ways:
 
-Models with fewer than 32 objects skip the BVH entirely — the overhead of tree
-traversal is worse than a linear scan at that scale.
+1. **Sidecar hit** — the `.ifcview` file next to the `.ifc` is found and
+   valid; its instance data is uploaded and the BVH rebuilt on the fly
+   from the restored AABBs (cheap — `< 100 ms` for 100 k placements).
+2. **After streaming** — `finalizeModel()` builds the BVH synchronously
+   once all chunks are in (instances already live on the GPU, so there's
+   no EBO re-sort to do). The sidecar is written afterwards.
 
-#### BVH node layout
+Models under 32 instances skip the BVH.
 
-Each node is 32 bytes, so two nodes fit in one 64-byte cache line:
+#### BVH node layout (32 B, two per cache line)
 
 ```cpp
 struct BvhNode {
-    float    aabb_min[3];     // world-space bounding box (12 bytes)
-    float    aabb_max[3];     // (12 bytes)
-    uint32_t right_or_first;  // interior: right child index; leaf: first object index (4 bytes)
-    uint16_t count;           // 0 = interior node; >0 = leaf with this many objects (2 bytes)
-    uint16_t axis;            // split axis for interior (0=x, 1=y, 2=z); unused for leaf (2 bytes)
+    float    aabb_min[3];      // 12 B
+    float    aabb_max[3];      // 12 B
+    uint32_t right_or_first;   // interior: right child index; leaf: first item index
+    uint16_t count;            // 0 = interior, >0 = leaf
+    uint16_t axis;             // 0/1/2 for interior; unused for leaf
 };
 ```
 
-Interior nodes store the right child index; the left child is always the
-immediately next node in the array (implicit in pre-order DFS layout, no
-pointer needed). Leaf nodes reference a contiguous range in a sorted
-object-index array.
-
-The BVH is stored as a flat `std::vector<BvhNode>` in pre-order DFS layout.
-This means a depth-first traversal (which is what frustum culling does) reads
-memory sequentially, maximizing prefetch and cache-line utilization.
-
-#### Build algorithm: object-median split
-
-1. Compute the centroid of each object's AABB.
-2. Find the longest axis of the current node's bounding box.
-3. Use `std::nth_element` to partition objects at the median centroid on that
-   axis. This is O(n) — no full sort needed.
-4. Recurse on each half. Terminate when the node contains ≤ 8 objects (leaf).
-5. Write nodes into the flat array in pre-order DFS.
+Left child is always the next node (pre-order DFS). Leaf items are
+indices into the per-model `instances` array; the parallel `bvh_items[]`
+array carries the world AABBs.
 
-Total build time is O(n log n). For 100k objects this is well under 100 ms on
-a single core.
+#### Build: object-median split
 
-SAH (Surface Area Heuristic) is the gold standard for ray-tracing BVHs, but
-for frustum culling — where we test 6 planes and early-out entire subtrees —
-the quality difference vs. median split is negligible. Median split is simpler
-and produces reliably balanced trees.
+1. Compute centroid of each item's AABB.
+2. Pick the longest axis of the node's AABB.
+3. `std::nth_element` partitions at the median on that axis — O(n).
+4. Recurse until a leaf holds ≤ 8 items.
 
-#### Frustum traversal
+O(n log n) total. No SAH — for frustum culling (6-plane tests, early
+subtree reject) the quality difference vs median is negligible.
 
-The traversal uses an explicit stack on the C++ stack (no heap allocation,
-no recursion):
+#### Traversal: stack-based, no recursion
 
 ```
-stack[64] = {0}   // start at root; depth 64 handles billions of objects
+stack[64] = { 0 }                     // root
 while stack not empty:
     node = nodes[stack.pop()]
-    if node AABB outside frustum: continue   // cull entire subtree
+    if node.aabb outside frustum: continue
     if leaf:
-        for each object in node:
-            if object AABB in frustum: emit to visible list
+        for each item in node:
+            if item.aabb in frustum: emit to visible list
     else:
-        push right child, push left child    // left processed first (DFS)
+        push right child, push left child   // left processed first (DFS)
 ```
 
-When the camera is zoomed into a corner of the model, the traversal skips
-large portions of the tree after testing only a handful of interior nodes.
-When zoomed out to see everything, the traversal visits all leaves but the
-overhead of the interior-node tests is small relative to the leaf work.
+Depth 64 is enough for billions of items on any balanced tree. The stack
+is on the C++ stack, zero per-frame allocation.
 
-#### Per-model BVH
+#### Sidecar format (`.ifcview`, v4)
 
-Each loaded model gets its own BVH. During frustum culling, the outer loop
-iterates over models (skipping hidden/removed ones); the inner loop traverses
-that model's BVH. This means hiding or removing a model is free — just skip
-its BVH, no tree modification needed.
+Raw memory dump, Blender-`.blend`-style — no serialisation, no parsing.
+Stores everything needed to skip the `IfcGeom::Iterator` pass:
 
-```cpp
-struct ModelBvh {
-    uint32_t model_id;
-    std::vector<BvhNode> nodes;            // flat BVH node array
-    std::vector<uint32_t> object_indices;  // indices into object_draw_info_
-};
+```
+SidecarHeader            (magic "IFVW", version, endian, ...)
+uint64_t                 source_file_size
+uint32_t + float[]       vertex data    (7 floats × N_verts, local coords)
+uint32_t + uint32_t[]    index data     (mesh-local)
+uint32_t + MeshInfo[]    per-unique-mesh metadata (48 B each)
+uint32_t + InstanceCpu[] per-placement records (transform + AABB + ids)
+uint32_t + PackedElementInfo[]   element tree records
+uint32_t + char[]        string table
 ```
 
-#### EBO re-sorting
-
-For BVH culling to maximise GPU cache performance, the EBO is re-sorted so
-that objects in the same BVH leaf are contiguous. This happens via **deferred
-compaction**:
-
-1. During initial load, geometry uploads in iterator order (fast first frame,
-   phase 1 culling active).
-2. After the BVH build completes on the background thread:
-   a. Walk the BVH leaves in DFS order.
-   b. For each object in each leaf, copy its index data to a new EBO buffer,
-      updating `ObjectDrawInfo::index_offset` accordingly.
-   c. Package the reordered EBO + updated draw info as a `BvhBuildResult`.
-3. The render thread picks up the result on the next frame: one
-   `glNamedBufferSubData` call to re-upload the EBO, then swap in the new
-   draw info and activate the BVH. One frame of stutter, bounded by EBO
-   upload time (~5 ms for 32 MB).
-
-#### Async build and render-thread handoff
-
-The BVH build must not stall the render loop:
-
-1. `buildBvhAsync()` snapshots `object_draw_info_` under the upload mutex,
-   then launches a `std::thread`.
-2. The thread builds the BVH and reordered EBO, then stores the result in a
-   `pending_bvh_result_` pointer under a separate mutex.
-3. At the top of each `render()` call, `applyBvhResult()` checks for a
-   pending result. If found, it re-uploads the EBO (requires GL context),
-   swaps the draw info, and activates the BVH.
-4. Until the BVH is ready, phase 1's linear scan runs every frame as before.
-
-#### Preprocessed sidecar format (`.ifcview`)
-
-The sidecar is a raw memory dump (Blender `.blend`-style) — no serialization
-format, no parsing. It stores everything needed to display the model without
-re-tessellating: vertex data, index data, per-object metadata, element tree
-info, and the BVH. Loading is just `fread` into vectors → GPU upload →
-render. The expensive `IfcGeom::Iterator` tessellation is skipped entirely.
-
-The IFC file is still parsed on demand (in background) for detailed property
-lookup; the sidecar provides the basic properties (name, type, GUID)
-immediately.
+Staleness check: `source_file_size` vs actual file size. Mismatched →
+reject and rebuild. Endianness marker rejects cross-arch caches.
 
-```
-SidecarHeader            (16 bytes: magic, version, endian, reserved)
-uint64_t                 source_file_size
+### GPU Instancing pipeline (the central pillar)
 
-uint32_t + float[]       vertex data    (interleaved, 8 floats/vertex)
-uint32_t + uint32_t[]    index data     (global indices, ready for EBO)
-uint32_t + ObjectDrawInfo[]   per-object draw metadata
-uint32_t + PackedElementInfo[]  element tree records (fixed-size)
-uint32_t + char[]        string table   (concatenated UTF-8: guid, name, type)
-
-uint32_t                 num_bvh_models
-per model:
-  uint32_t model_id
-  uint32_t + BvhNode[]        BVH node array
-  uint32_t + uint32_t[]       object indices
-```
+Everything above plugs into a single data-flow, worth documenting on its
+own because it's what makes the whole thing fast.
 
-Staleness check: `source_file_size` is compared against the actual IFC file
-size. If mismatched, the sidecar is stale and is rebuilt. This is cheap and
-sufficient for a local cache (no hash computation on multi-GB files).
+Per-model state on the GPU:
 
-Endianness: if the marker reads back as `0x01020304`, the file was written on
-the same architecture — just `fread` the structs directly. Otherwise, reject
-the sidecar and rebuild.
+| Buffer | Contents | Lifetime |
+|--------|----------|----------|
+| `VBO` | Interleaved local-coord vertex data (28 B/vert). One range per unique representation. | Grow-on-demand during streaming; static after finalize. |
+| `EBO` | Mesh-local uint32 indices. One range per unique representation. | Same. |
+| `SSBO` (binding 0) | `InstanceGpu[]` (80 B each: mat4 transform, object_id, color_override, pad). | Appended during streaming, static after finalize. |
+| `visible SSBO` (binding 1) | `uint32[]` — flat list of visible instance indices, ordered by mesh, uploaded each frame. | Rewritten every frame. |
+| Draw-indirect buffer | `DrawElementsIndirectCommand[]` — one per non-empty mesh, uploaded each frame. | Rewritten every frame. |
 
-#### Performance characteristics
+Draw command:
 
-| Metric | Value |
-|--------|-------|
-| BVH build time (100k objects) | < 100 ms (single-threaded, background) |
-| Per-frame traversal (100k objects, 50% visible) | ~0.1 ms |
-| Per-frame traversal (100k objects, 5% visible) | ~0.02 ms |
-| Memory overhead | 32 bytes/node + 4 bytes/object index (~1.5× object count) |
-| EBO reorder (one-time) | 1–5 ms upload for 32 MB EBO |
-| Sidecar file size | ~same as geometry data (vertices + indices + metadata) |
-| Sidecar read time | bounded by disk I/O (~500 ms for 640 MB, ~2 s for 2.8 GB from NVMe) |
-| GPU upload time | progressive: ~48 MB/frame (~1 s for 2.8 GB at 60 fps, non-blocking) |
-
-#### Spatial coherence bonus
-
-Beyond culling, BVH-leaf-sorted EBOs improve GPU cache performance. When the
-GPU rasterizes a leaf's triangles, the vertices are close together in the VBO,
-so the post-transform vertex cache hits more often. This can yield 10–20%
-rasterization speedup even when nothing is culled (e.g. zoomed out to see the
-whole model).
-
-### Phase 3: GPU-Driven Indirect Draw
-
-For models with 500k+ objects, even tile-level CPU culling is fast, but the
-real bottleneck shifts to draw call submission. Phase 3 moves all per-frame
-visibility decisions to the GPU via compute shaders and indirect draw commands.
-
-#### How it works
-
-Phase 3 builds on the BVH from phase 2. It does not replace the BVH — it
-moves the per-frame traversal to the GPU.
-
-1. **Upload phase** (once, at load time):
-   - Per-leaf AABBs from the BVH are uploaded to a GPU SSBO (`leaf_aabbs`).
-   - One `DrawElementsIndirectCommand` per BVH leaf is written to an indirect
-     draw buffer:
-     ```c
-     struct DrawElementsIndirectCommand {
-         uint count;          // leaf's total index count
-         uint instanceCount;  // 1
-         uint firstIndex;     // offset into EBO (from BVH leaf order)
-         uint baseVertex;     // 0 (indices are global)
-         uint baseInstance;   // leaf_id (available in shader via gl_DrawID)
-     };
-     ```
-   - A "template" copy of the indirect buffer is kept so the compute shader
-     can reset culled commands each frame without re-uploading from CPU.
-
-2. **Cull phase** (every frame, on the GPU):
-   - The CPU uploads 6 frustum plane vec4s as a uniform or small UBO.
-   - A compute shader dispatches `ceil(N_leaves / 64)` workgroups:
-     ```glsl
-     layout(local_size_x = 64) in;
-
-     void main() {
-         uint leaf_id = gl_GlobalInvocationID.x;
-         if (leaf_id >= leaf_count) return;
-
-         // Copy from template (resets any previously zeroed commands)
-         commands[leaf_id] = template_commands[leaf_id];
-
-         // Frustum test
-         if (!aabb_vs_frustum(leaf_aabbs[leaf_id], frustum_planes)) {
-             commands[leaf_id].count = 0;  // culled: GPU skips zero-count draws
-         }
-     }
-     ```
-   - A memory barrier ensures the indirect buffer is visible to the draw stage.
-
-3. **Draw phase** (every frame):
-   - One call: `glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT,
-     nullptr, N_leaves, 0)`.
-   - The GPU reads the indirect buffer, skips tiles with `count == 0`, and
-     draws the rest. Zero CPU-side per-object or per-tile work.
-
-#### What the CPU does per frame
-
-1. Upload 6 vec4 frustum planes (96 bytes).
-2. Dispatch one compute shader.
-3. Issue one `glMultiDrawElementsIndirect`.
-4. Swap buffers.
-
-That's it. The CPU frame time is essentially constant regardless of model size.
-
-#### Future extensions (enabled by this architecture)
-
-Once the compute-based cull pass exists, it's straightforward to add:
-
-- **Hierarchical-Z occlusion culling**: render a coarse depth buffer from the
-  previous frame, then test BVH leaf AABBs against it in the compute shader.
-  Leaves fully behind closer geometry get culled. This handles interior-heavy
-  BIM models well (most rooms are occluded from any given viewpoint).
-- **Distance-based LOD**: the compute shader can select different index ranges
-  (coarse vs. fine tessellation) per leaf based on distance to camera.
-- **Contribution culling**: leaves whose screen-space projection is below a
-  pixel threshold get `count = 0`. Removes distant small objects.
-
-#### Performance characteristics
+```c
+struct DrawElementsIndirectCommand {
+    uint32_t count;         // mesh.index_count
+    uint32_t instanceCount; // visible-list length for this mesh
+    uint32_t firstIndex;    // mesh.ebo_byte_offset / 4
+    uint32_t baseVertex;    // mesh.vbo_byte_offset / 28
+    uint32_t baseInstance;  // offset into the flat visible-index array
+};
+```
 
-| Metric | Value |
-|--------|-------|
-| CPU per-frame work | ~0.01 ms (constant, independent of model size) |
-| GPU compute dispatch | ~0.02 ms for 2k leaves |
-| Draw call overhead | 1 indirect multi-draw call |
-| GPU memory overhead | ~48 bytes/leaf (AABB SSBO) + 20 bytes/leaf (indirect commands) × 2 (template + live) |
-| Total for 2k leaves | ~176 KB GPU memory |
-| Implementation complexity | High (compute shaders, SSBOs, memory barriers, indirect draw) |
+The vertex shader reads `visible[gl_BaseInstanceARB + gl_InstanceID]` to
+get the real instance id, then indexes into the instance SSBO:
 
-#### When to use
+```glsl
+uint slot = uint(gl_BaseInstanceARB) + uint(gl_InstanceID);
+uint iid  = visible[slot];
+InstanceRecord inst = instances[iid];
+gl_Position = u_view_projection * inst.transform * vec4(a_position, 1.0);
+```
 
-Phase 3 is worthwhile when:
+`gl_BaseInstanceARB` requires `GL_ARB_shader_draw_parameters`, which is
+core in OpenGL 4.6 and must be present as an extension on 4.5 contexts.
 
-- The model has 500k+ objects (CPU frustum testing > 3 ms).
-- Smooth 60 fps orbiting is required during interaction.
-- The GPU has compute shader support (OpenGL 4.3+, which is guaranteed since
-  the viewer requires 4.5).
+Reflection handling: at upload time we store a parallel
+`instance_reflected[]` byte array (1 if the transform's upper-3×3 has
+det < 0). The cull pass produces two flat visible-list slices — fwd
+(non-reflected) first, rev (reflected) after — concatenated into one
+buffer. The renderer issues MDI twice: fwd with `glFrontFace(GL_CCW)`,
+rev with `glFrontFace(GL_CW)`. `GL_CULL_FACE` stays on and does the
+right thing in both passes.
 
-For models under 100k objects, phase 1 alone is sufficient. For 100k–500k,
-phase 2 (BVH) keeps CPU culling well under 1 ms. Phase 3 is the final step
-that makes the CPU frame time constant.
+### Current bottleneck — Phase 3 as designed is already obsolete
 
-### Summary
+The original README's Phase 3 ("GPU-driven indirect draw") described
+moving draw submission to the GPU via compute. In the meantime, GPU
+instancing and MDI made the CPU-side draw cost essentially free (10
+`glMultiDrawElementsIndirect` calls per frame for 10 models). **That
+goal is met.** The real Phase 3 problem is different.
 
-```
-Model size       Active phases     CPU cull cost     Draw calls
-─────────────    ──────────────    ──────────────    ──────────
-< 10k objects    Phase 1           ~0.06 ms          1 multi-draw
-10k–100k         Phase 1           ~0.6 ms           1 multi-draw
-100k–500k        Phase 1 + 2       ~0.01 ms          1 multi-draw
-500k–1M+         Phase 1 + 2 + 3   ~0 (GPU)          1 indirect multi-draw
-```
+#### Diagnosed on a 10-model / 379 k-instance / 128 M-triangle scene
 
-The load path:
+Observed numbers (everything in view, no movement):
+
+| Metric | Value |
+|--------|-------|
+| FPS | 10 |
+| Frame time | ~100 ms |
+| gl_draws | 10 |
+| Sub-draws packed in indirect buffers | 67 037 |
+
+Elimination experiments:
+
+| Probe | Result | Interpretation |
+|-------|--------|----------------|
+| Camera off-screen (nothing visible) | → 60 fps | GPU is idle; CPU path is cheap |
+| Resize window to 1/4 area | no change | Not fragment/raster bound |
+| `setSamples(4)` → `setSamples(1)` | no change | Not MSAA/resolve bound |
+| Comment out the two `glNamedBufferSubData` in `cullAndUploadVisible` | → 60 fps (screen blank) | **The per-frame uploads are the bottleneck.** |
+
+So the bottleneck is two `glNamedBufferSubData` calls per model per
+frame uploading ~1.5 MB (visible list) + ~1.3 MB (indirect buffer).
+3 MB/frame / 60 fps = 180 MB/s — trivial for the bus, but `glNamedBufferSubData`
+against a buffer the GPU is still reading forces the driver to stall
+the CPU or orphan/reallocate the backing store, and we're hitting that
+on 20 buffers per frame.
+
+### Phase 3 (proposed) — Eliminate per-frame upload stalls
+
+Two ways to attack it, in ascending order of effort:
+
+#### 3A. Persistent mapped ring buffers (near-term)
+
+Allocate each of the per-frame-written buffers with
+`glBufferStorage(GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_MAP_WRITE_BIT)`
+at 3× the needed size. Keep one `void*` from `glMapBufferRange` forever.
+Each frame, write the CPU-side data into slice `frame % 3` and bind
+that slice via `glBindBufferRange`. The GPU reads slice N−1 while the
+CPU writes slice N — no driver sync, no orphan, no stall.
+
+Scope: ~80 lines across `ModelGpuData` + `cullAndUploadVisible` +
+binding in `render()` / `renderPickPass()`. No algorithmic change, no
+shader change. Expected result on the stats scene: 10 fps → ~60 fps
+(the measured ceiling once uploads are removed).
+
+#### 3B. GPU-side culling (longer-term)
+
+Push culling itself to the GPU. A compute shader reads the
+`InstanceCpu`-equivalent SSBO + frustum planes, builds the visible list
+and indirect commands in-place via atomics. Zero CPU→GPU per-frame
+bytes. Also lays the foundation for occlusion and contribution culling
+(both want to run on the GPU anyway, with access to the depth buffer
+or screen-space projection).
+
+Scope: compute shader + atomic counter + BVH-traversal-on-GPU (or a
+linear compute scan — simpler and still gains most of the win since
+traversal isn't the bottleneck once upload is gone). Bigger change;
+worth doing after 3A is measured, because 3A may be enough for a long
+while.
+
+### Planned follow-ups (post-Phase-3)
+
+- **Screen-space contribution cull.** Reject instances whose projected
+  screen-space AABB is below a pixel threshold. Cheap CPU-side filter
+  that eliminates distant MEP detail. Big win on unfiltered plant-room
+  scenes.
+- **Hierarchical-Z occlusion culling.** Render large occluders, build a
+  depth pyramid, test BVH / instance AABBs against it. In dense BIM,
+  most geometry is behind other geometry from any given viewpoint; this
+  is historically a 3–10× reduction in drawn instances.
+- **Distance / contribution LOD.** Unique meshes pre-simplified at load
+  time; compute shader selects an LOD per instance per frame based on
+  screen-space size. Same visible-SSBO plumbing, different `firstIndex`.
+- **Mesh shaders / meshlets.** Ceiling-raising but overkill until the
+  above are exhausted.
+
+## Summary table
 
 ```
-open(model.ifc):
-  ├─ sidecar exists (.ifcview)?
-  │   ├─ yes: background thread reads sidecar file (non-blocking I/O)
-  │   │       → allocate per-model VAO/VBO/EBO (empty, exact size)
-  │   │       → progressive GPU upload: 48 MB/frame VBO, then EBO
-  │   │       → objects appear as EBO chunks land
-  │   │       → BVH activates once fully loaded
-  │   │       → viewport interactive throughout
-  │   └─ no:  stream from IFC via GeometryStreamer
-  │           → uploadChunk() appends to per-model buffers (immediately drawable)
-  │           → phase 1 linear-scan culling active from first chunk
-  │           → on completion: background BVH build, re-sort EBO, save .ifcview
-  └─ rendering (per model, per frame):
-      ├─ phase 3 available?  → compute cull + indirect multi-draw
-      ├─ BVH available?      → BVH traversal + glMultiDrawElements
-      └─ else / progressive  → linear scan of active objects + glMultiDrawElements
+Scene size                      Bottleneck           Fix
+-----------                     ----------           ---
+< 100k instances                CPU cull scan        Phase 1 only (current)
+100k–500k                       CPU cull scan        BVH (Phase 2) — done
+500k+ across many models        visible/indirect     Phase 3A mapped rings
+                                buffer uploads       (next)
+---                             ---                  ---
+multi-million + occlusion-heavy fragment / overdraw  HiZ occlusion + LOD
 ```
 
 ## Roadmap
 
-- [x] Material color support (per-vertex RGBA8)
+- [x] Material colour support (per-vertex RGBA8)
 - [x] Per-model GPU buffers (VAO/VBO/EBO per model, no cross-model copies)
-- [x] Per-object frustum culling (phase 1)
-- [x] BVH acceleration with per-model trees (phase 2)
-- [x] Raw binary `.ifcview` sidecar cache (full geometry + BVH, Blender-style)
+- [x] Per-object frustum culling (Phase 1)
+- [x] BVH acceleration with per-model trees (Phase 2)
+- [x] Raw binary `.ifcview` sidecar cache
 - [x] Non-blocking sidecar loading (background thread I/O)
-- [x] Progressive GPU upload (48 MB/frame chunked VBO/EBO transfer)
-- [ ] GPU-driven indirect draw (phase 3)
+- [x] Progressive GPU upload (VBO/EBO growth + streaming-time instance appends)
+- [x] GPU instancing (unique meshes + per-placement SSBO)
+- [x] `glMultiDrawElementsIndirect` draw path
+- [x] Reflection-aware two-pass draw for mirrored placements
+- [x] Backface culling (user-toggleable, default on)
+- [x] `reorient-shells` enabled in iterator
+- [ ] **Phase 3A — persistent-mapped ring buffers for visible + indirect** (next)
+- [ ] Phase 3B — GPU-side compute-shader culling
+- [ ] Screen-space contribution culling
 - [ ] Hierarchical-Z occlusion culling
 - [ ] Distance-based LOD selection
 - [ ] Vulkan/MoltenVK backend for macOS

From 8360cd3ca8d1990dd502eaf9c15f225a52b92916 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Mon, 13 Apr 2026 09:33:21 +1000
Subject: [PATCH 23/37] Pivot Phase 3: diagnose as draw-bound, not upload-bound
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Earlier probes pointed at per-frame glNamedBufferSubData uploads as the
bottleneck (60 fps when those two calls were commented out).  That was a
false reading — with the uploads removed, the indirect buffer was never
filled, so MDI drew nothing (the screen was blank).  In that probe,
"no upload" and "no draw" were indistinguishable.

Two new diagnostic env vars in render() isolate the real costs:

  IFC_SKIP_MDI=1       keep cull + upload + binds, skip only the MDI
                       draws.  Gives 62 fps with everything else running,
                       confirming the non-draw path fits in ~16 ms.
  IFC_MAX_SUBDRAWS=N   cap each MDI's drawcount.  67k -> 30k sub-draws
                       saves 0 ms, confirming sub-draw count itself is
                       not the bottleneck; the long tail of sub-draws
                       carries ~no triangles.

On a GTX 1650 with 128 M triangles in view, nvidia-smi sits at 95 %
GPU util and FPS scales with triangle work, not sub-draw count.  The
card is simply rasterising at ~850 M tri/s.  No CPU-side or upload
trick recovers it.

Revised Phase 3 is therefore shedding triangles, not bytes:
  3A screen-space contribution culling (next)
  3B LOD
  3C HiZ occlusion
  3D GPU-side compute culling

README Phase 3 section rewritten around the diagnosis, including the
false lead, so future work doesn't re-tread the upload path.  The
aborted staging+resident ring-buffer implementation was never committed
and has been discarded; plain glNamedBufferSubData is retained for the
visible + indirect buffers, which we now know is fine.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md          | 187 +++++++++++++++++++------------
 src/ifcviewer/ViewportWindow.cpp |  33 +++++-
 2 files changed, 145 insertions(+), 75 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 4966d27d5ef..4af736bfad8 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -333,101 +333,148 @@ buffer. The renderer issues MDI twice: fwd with `glFrontFace(GL_CCW)`,
 rev with `glFrontFace(GL_CW)`. `GL_CULL_FACE` stays on and does the
 right thing in both passes.
 
-### Current bottleneck — Phase 3 as designed is already obsolete
+### Current bottleneck — draw-bound, not upload-bound
 
 The original README's Phase 3 ("GPU-driven indirect draw") described
 moving draw submission to the GPU via compute. In the meantime, GPU
 instancing and MDI made the CPU-side draw cost essentially free (10
 `glMultiDrawElementsIndirect` calls per frame for 10 models). **That
-goal is met.** The real Phase 3 problem is different.
+goal is met.** The real ceiling lies elsewhere, and it took a couple of
+bad hypotheses to pin down.
 
-#### Diagnosed on a 10-model / 379 k-instance / 128 M-triangle scene
+#### Profiled scene
 
-Observed numbers (everything in view, no movement):
+10 models / 379 k instances / 128 M triangles, everything in view, no
+camera motion, GTX 1650 (PCIe dGPU, 4 GB VRAM):
 
 | Metric | Value |
 |--------|-------|
-| FPS | 10 |
-| Frame time | ~100 ms |
+| FPS | 6.7 |
+| Frame time | 149 ms |
 | gl_draws | 10 |
 | Sub-draws packed in indirect buffers | 67 037 |
 
-Elimination experiments:
+`nvidia-smi` reports 95 % GPU utilisation during render — the GPU is
+the thing that's pinned.
+
+#### False lead: "the per-frame uploads are the bottleneck"
 
-| Probe | Result | Interpretation |
-|-------|--------|----------------|
-| Camera off-screen (nothing visible) | → 60 fps | GPU is idle; CPU path is cheap |
-| Resize window to 1/4 area | no change | Not fragment/raster bound |
-| `setSamples(4)` → `setSamples(1)` | no change | Not MSAA/resolve bound |
-| Comment out the two `glNamedBufferSubData` in `cullAndUploadVisible` | → 60 fps (screen blank) | **The per-frame uploads are the bottleneck.** |
+The first round of probes pointed at the two `glNamedBufferSubData`
+calls per model per frame (visible list ~1.5 MB + indirect buffer
+~1.3 MB):
 
-So the bottleneck is two `glNamedBufferSubData` calls per model per
-frame uploading ~1.5 MB (visible list) + ~1.3 MB (indirect buffer).
-3 MB/frame / 60 fps = 180 MB/s — trivial for the bus, but `glNamedBufferSubData`
-against a buffer the GPU is still reading forces the driver to stall
-the CPU or orphan/reallocate the backing store, and we're hitting that
-on 20 buffers per frame.
+| Probe | Result | Initial interpretation |
+|-------|--------|------------------------|
+| Camera off-screen (nothing visible) | 60 fps | GPU idle → CPU path cheap |
+| Comment out the two `glNamedBufferSubData` | 60 fps, blank screen | Uploads are the bottleneck |
 
-### Phase 3 (proposed) — Eliminate per-frame upload stalls
+This led to an aborted Phase 3A implementation of persistent-mapped
+triple-buffered rings (and then staging + VRAM-resident with
+`glCopyNamedBufferSubData`). Neither moved the FPS needle — both still
+sat at 6.7 fps.
 
-Two ways to attack it, in ascending order of effort:
+The probe was wrong: **commenting out the uploads emptied the indirect
+buffer, so MDI drew zero triangles. "No upload" and "no draw" were
+indistinguishable in the test.**
 
-#### 3A. Persistent mapped ring buffers (near-term)
+#### What actually isolates the draw cost
 
-Allocate each of the per-frame-written buffers with
-`glBufferStorage(GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_MAP_WRITE_BIT)`
-at 3× the needed size. Keep one `void*` from `glMapBufferRange` forever.
-Each frame, write the CPU-side data into slice `frame % 3` and bind
-that slice via `glBindBufferRange`. The GPU reads slice N−1 while the
-CPU writes slice N — no driver sync, no orphan, no stall.
+Two diagnostic env vars now live in `render()`:
 
-Scope: ~80 lines across `ModelGpuData` + `cullAndUploadVisible` +
-binding in `render()` / `renderPickPass()`. No algorithmic change, no
-shader change. Expected result on the stats scene: 10 fps → ~60 fps
-(the measured ceiling once uploads are removed).
+- `IFC_SKIP_MDI=1` — keep everything (cull, upload, binds) but skip the
+  actual `glMultiDrawElementsIndirect` calls.
+- `IFC_MAX_SUBDRAWS=N` — truncate each MDI's drawcount to N while still
+  running the rest of the frame.
 
-#### 3B. GPU-side culling (longer-term)
+Results on the profiled scene:
 
-Push culling itself to the GPU. A compute shader reads the
-`InstanceCpu`-equivalent SSBO + frustum planes, builds the visible list
-and indirect commands in-place via atomics. Zero CPU→GPU per-frame
-bytes. Also lays the foundation for occlusion and contribution culling
-(both want to run on the GPU anyway, with access to the depth buffer
-or screen-space projection).
+| Probe | FPS | Frame time |
+|-------|-----|-----------|
+| baseline | 6.7 | 149 ms |
+| `IFC_SKIP_MDI=1` | 62.5 | 16 ms |
+| `IFC_MAX_SUBDRAWS=30000` | 6.7 | 149 ms |
+| `IFC_MAX_SUBDRAWS=10000` | 7.5 | 133 ms |
+| `IFC_MAX_SUBDRAWS=1000` | 20.2 | 49 ms |
 
-Scope: compute shader + atomic counter + BVH-traversal-on-GPU (or a
-linear compute scan — simpler and still gains most of the win since
-traversal isn't the bottleneck once upload is gone). Bigger change;
-worth doing after 3A is measured, because 3A may be enough for a long
-while.
+Readings:
+
+1. `SKIP_MDI` gives 62 fps with all upload/bind machinery still running
+   — the non-draw path fits in ~16 ms easily. **Not upload-bound.**
+2. Halving the sub-draw count (67 k → 30 k) saves 0 ms. If per-sub-draw
+   command-processor overhead were material, dropping 37 k sub-draws
+   would save measurable time no matter which sub-draws were dropped.
+   It doesn't. **67 k sub-draws is not the bottleneck** — the long tail
+   carries almost no triangles, and the heavyweights dominate.
+3. Time only starts coming down once the cap is low enough to shed bulk
+   triangle work (1000 sub-draws → 49 ms). The curve is consistent with
+   a long-tailed distribution: a handful of very big meshes × instance
+   counts do most of the rasterisation.
+
+**Conclusion: the GTX 1650 is rasterising 128 M triangles at ~850 M
+tri/s, and that eats ~133 ms of the 149 ms frame.** No CPU-side or
+upload-side work will recover it. The only way forward is to draw
+fewer triangles.
+
+### Phase 3 (revised) — Shed triangles, not bytes
+
+In order of effort/payoff for BIM workloads:
+
+#### 3A. Screen-space contribution culling (near-term)
+
+Project each visible-instance AABB to screen space during BVH
+traversal. Reject instances whose projected size is below a threshold
+(~4 px). In BIM this is the single biggest win: at viewer zoom levels
+that encompass a whole building, most MEP fittings, fixings, furniture
+legs, door hardware etc. occupy < 1 px and contribute nothing.
+
+Scope: a projection + pixel-area test inside
+`ViewportWindow::cullAndUploadVisible`. Zero new GPU state. Expect
+10–30× reduction in drawn triangles on plant/MEP-dense scenes; full
+buildings viewed in overview should approach 60 fps.
+
+#### 3B. Distance / contribution LOD (medium-term)
+
+Pre-simplify unique representations at ingress time (store LOD 0 / 1 /
+2 meshes in the VBO/EBO with offsets), select LOD per instance per
+frame by the same projected-size metric as 3A. The visible-SSBO
+plumbing and MDI structure don't change — only `firstIndex`/`count` in
+the indirect command does. Ingress side needs a decimation pass
+(`meshoptimizer` or similar); GPU side is nearly free.
+
+#### 3C. Hierarchical-Z occlusion culling (longer-term)
+
+Render large occluders first, build a depth pyramid, test instance
+AABBs against it. In dense BIM most geometry is behind other geometry
+from any given interior viewpoint; historically a 3–10× reduction in
+drawn instances. Most valuable *after* 3A+3B, which together handle
+the far-away and small-detail cases. Pairs naturally with GPU-side
+culling (a compute shader doing the HiZ test and writing the visible
+list + indirect buffer in place).
+
+#### 3D. GPU-side culling via compute (longer-term)
+
+Push the cull loop to a compute shader reading the per-instance SSBO +
+frustum planes + HiZ pyramid, emitting the visible list and indirect
+commands with atomic counters. Eliminates all CPU→GPU per-frame bytes
+and lets 3C scale to millions of instances. Worth doing once 3A–3C
+have stabilised the CPU-side algorithm we'd be porting.
 
 ### Planned follow-ups (post-Phase-3)
 
-- **Screen-space contribution cull.** Reject instances whose projected
-  screen-space AABB is below a pixel threshold. Cheap CPU-side filter
-  that eliminates distant MEP detail. Big win on unfiltered plant-room
-  scenes.
-- **Hierarchical-Z occlusion culling.** Render large occluders, build a
-  depth pyramid, test BVH / instance AABBs against it. In dense BIM,
-  most geometry is behind other geometry from any given viewpoint; this
-  is historically a 3–10× reduction in drawn instances.
-- **Distance / contribution LOD.** Unique meshes pre-simplified at load
-  time; compute shader selects an LOD per instance per frame based on
-  screen-space size. Same visible-SSBO plumbing, different `firstIndex`.
-- **Mesh shaders / meshlets.** Ceiling-raising but overkill until the
-  above are exhausted.
+- **Mesh shaders / meshlets.** Ceiling-raising, but overkill until the
+  above are exhausted and we've hit silicon limits on vertex/raster
+  throughput.
 
 ## Summary table
 
 ```
-Scene size                      Bottleneck           Fix
------------                     ----------           ---
-< 100k instances                CPU cull scan        Phase 1 only (current)
-100k–500k                       CPU cull scan        BVH (Phase 2) — done
-500k+ across many models        visible/indirect     Phase 3A mapped rings
-                                buffer uploads       (next)
----                             ---                  ---
-multi-million + occlusion-heavy fragment / overdraw  HiZ occlusion + LOD
+Scene size                      Bottleneck              Fix
+-----------                     ----------              ---
+< 100k instances                CPU cull scan           Phase 1 only
+100k–500k                       CPU cull scan           BVH (Phase 2) — done
+500k+ tris / overview shot      GPU vertex + raster     Phase 3A contribution cull
+                                                        (+ 3B LOD for close-ups)
+multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
 ```
 
 ## Roadmap
@@ -444,10 +491,10 @@ multi-million + occlusion-heavy fragment / overdraw  HiZ occlusion + LOD
 - [x] Reflection-aware two-pass draw for mirrored placements
 - [x] Backface culling (user-toggleable, default on)
 - [x] `reorient-shells` enabled in iterator
-- [ ] **Phase 3A — persistent-mapped ring buffers for visible + indirect** (next)
-- [ ] Phase 3B — GPU-side compute-shader culling
-- [ ] Screen-space contribution culling
-- [ ] Hierarchical-Z occlusion culling
-- [ ] Distance-based LOD selection
+- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`)
+- [ ] **Phase 3A — screen-space contribution culling** (next)
+- [ ] Phase 3B — distance / contribution LOD
+- [ ] Phase 3C — Hierarchical-Z occlusion culling
+- [ ] Phase 3D — GPU-side compute-shader culling
 - [ ] Vulkan/MoltenVK backend for macOS
 - [ ] Embedded Python scripting console
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 84778f3f2f7..4680e188bb1 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -28,6 +28,7 @@
 #include <QtOpenGL/QOpenGLVersionFunctionsFactory>
 
 #include <cstring>
+#include <cstdlib>
 #include <cmath>
 #include <algorithm>
 #include <limits>
@@ -988,10 +989,32 @@ void ViewportWindow::render() {
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
         gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
 
-        const uint32_t fwd = m.indirect_forward_count;
-        const uint32_t rev = m.indirect_command_count - fwd;
+        uint32_t fwd = m.indirect_forward_count;
+        uint32_t rev = m.indirect_command_count - fwd;
+        // Perf diagnostics (confirmed 2026-04 on GTX 1650 @ 128M tris:
+        // draw-bound, not upload-bound — see README Phase 3):
+        //   IFC_SKIP_MDI=1         skip the actual MDI draws (keeps cull +
+        //                          upload + binds).  FPS jump == draw-bound.
+        //   IFC_MAX_SUBDRAWS=N     truncate drawcount to N per MDI.  Lets
+        //                          you distinguish per-subdraw command-
+        //                          processor overhead from raw tri work.
+        static const bool skip_mdi = []{
+            const char* e = std::getenv("IFC_SKIP_MDI");
+            return e && e[0] == '1';
+        }();
+        static const uint32_t max_subdraws = []{
+            const char* e = std::getenv("IFC_MAX_SUBDRAWS");
+            return (e && *e) ? static_cast<uint32_t>(std::atoi(e))
+                             : std::numeric_limits<uint32_t>::max();
+        }();
+        if (max_subdraws < m.indirect_command_count) {
+            // Keep the fwd/rev ratio so the workload mix is preserved.
+            const uint32_t total = m.indirect_command_count;
+            fwd = static_cast<uint32_t>((uint64_t)fwd * max_subdraws / total);
+            rev = max_subdraws - fwd;
+        }
         // Forward pass: non-reflected instances, standard CCW winding.
-        if (fwd > 0) {
+        if (fwd > 0 && !skip_mdi) {
             gl_->glFrontFace(GL_CCW);
             gl_->glMultiDrawElementsIndirect(
                 GL_TRIANGLES, GL_UNSIGNED_INT, nullptr,
@@ -1000,11 +1023,11 @@ void ViewportWindow::render() {
         }
         // Reverse pass: reflected instances — their world-space winding is
         // flipped, so telling GL the front is CW keeps cull-back working.
-        if (rev > 0) {
+        if (rev > 0 && !skip_mdi) {
             gl_->glFrontFace(GL_CW);
             gl_->glMultiDrawElementsIndirect(
                 GL_TRIANGLES, GL_UNSIGNED_INT,
-                reinterpret_cast<const void*>(fwd * sizeof(DrawElementsIndirectCommand)),
+                reinterpret_cast<const void*>(m.indirect_forward_count * sizeof(DrawElementsIndirectCommand)),
                 static_cast<GLsizei>(rev), 0);
             ++gl_draw_calls_;
             gl_->glFrontFace(GL_CCW);

From 09c1eefa9408714d53ba56ee117fac7c06ddc08d Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Mon, 13 Apr 2026 09:51:09 +1000
Subject: [PATCH 24/37] Phase 3A: screen-space contribution culling

Reject frustum-visible objects whose bounding sphere projects below a
pixel-radius threshold.  Applied at both BVH-node level (whole subtrees
pruned) and per-instance level; short-circuits when the camera is
inside the AABB so nothing-you're-standing-next-to is ever lost.
Pick pass passes threshold 0 so sub-pixel objects stay clickable.

Threshold defaults to 2 px (radius), overridable via IFC_MIN_PX env
var.  Measured on the 128 M-tri test scene (GTX 1650):

  0 px (off):   6.7 fps, 128 M tris
  2 px:        20.2 fps,  40 M tris (31%)
  4 px:        30.3 fps,  15 M tris (12%)

The metric is sphere-based (cheap: two sqrts per test) rather than
AABB-corner projection; loses a little precision on very elongated
bounds but costs ~5x less per test and the BVH-node pre-cull means
the long-tail-of-small-things case is already handled by subtree
pruning before we touch individual instances.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 64 ++++++++++++++++++++++++++++++--
 src/ifcviewer/ViewportWindow.h   | 10 ++++-
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 4680e188bb1..db002c1870d 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -824,7 +824,8 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     return pixel;
 }
 
-void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]) {
+void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4],
+                                          float focal_px, float min_pixel_radius) {
     // Per-mesh scratch, split by winding: fwd = non-reflected (CCW in screen
     // space), rev = reflected (CW in screen space).  Splitting lets the draw
     // pass toggle glFrontFace once between two MDI calls so GL_CULL_FACE does
@@ -836,9 +837,44 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         visible_by_mesh_rev_[i].clear();
     }
 
+    // Bounding-sphere contribution test: approximate an AABB by its enclosing
+    // sphere (centre = midpoint, radius = half-diagonal).  Project radius to
+    // pixels as r_px = focal_px * r / distance (perspective).  Reject if
+    // smaller than the threshold.  Returns true when the node/instance
+    // should be kept.
+    //
+    // If the camera is inside the AABB the centre distance can approach
+    // zero and the projected radius stops being meaningful, so the test is
+    // skipped whenever the camera lies within the AABB (exact bounds, not
+    // inflated).  Conservative: never drops what you're standing inside.
+    const float cx = camera_eye_.x();
+    const float cy = camera_eye_.y();
+    const float cz = camera_eye_.z();
+    auto contributionPasses = [&](const float mn[3], const float mx[3]) -> bool {
+        if (min_pixel_radius <= 0.0f) return true;
+        // Camera inside AABB? Always keep.
+        if (cx >= mn[0] && cx <= mx[0] &&
+            cy >= mn[1] && cy <= mx[1] &&
+            cz >= mn[2] && cz <= mx[2]) {
+            return true;
+        }
+        float ex = 0.5f * (mx[0] - mn[0]);
+        float ey = 0.5f * (mx[1] - mn[1]);
+        float ez = 0.5f * (mx[2] - mn[2]);
+        float radius = std::sqrt(ex*ex + ey*ey + ez*ez);
+        float dx = 0.5f * (mx[0] + mn[0]) - cx;
+        float dy = 0.5f * (mx[1] + mn[1]) - cy;
+        float dz = 0.5f * (mx[2] + mn[2]) - cz;
+        float dist = std::sqrt(dx*dx + dy*dy + dz*dz);
+        // r_px = focal_px * radius / dist; compare r_px >= min_pixel_radius,
+        // rearranged to avoid the divide.
+        return focal_px * radius >= min_pixel_radius * dist;
+    };
+
     auto test_and_push = [&](uint32_t inst_idx) {
         const InstanceCpu& inst = m.instances[inst_idx];
         if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
+        if (!contributionPasses(inst.world_aabb_min, inst.world_aabb_max)) return;
         if (inst.mesh_id >= m.meshes.size()) return;
         const bool reflected = inst_idx < m.instance_reflected.size()
             && m.instance_reflected[inst_idx] != 0;
@@ -854,6 +890,9 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
             uint32_t ni = stack[--sp];
             const BvhNode& n = m.bvh.nodes[ni];
             if (!aabbInFrustum(n.aabb_min, n.aabb_max, planes)) continue;
+            // Contribution cull the whole subtree: if the node's enclosing
+            // sphere is below threshold, every child is too.
+            if (!contributionPasses(n.aabb_min, n.aabb_max)) continue;
             if (n.count > 0) {
                 for (uint32_t k = 0; k < n.count; ++k) {
                     uint32_t item_idx = m.bvh.item_indices[n.right_or_first + k];
@@ -939,11 +978,12 @@ void ViewportWindow::updateCamera() {
     eye.setX(camera_target_.x() + camera_distance_ * cosf(pitch_rad) * cosf(yaw_rad));
     eye.setY(camera_target_.y() + camera_distance_ * cosf(pitch_rad) * sinf(yaw_rad));
     eye.setZ(camera_target_.z() + camera_distance_ * sinf(pitch_rad));
+    camera_eye_ = eye;
     view_matrix_.setToIdentity();
     view_matrix_.lookAt(eye, camera_target_, QVector3D(0, 0, 1));
     proj_matrix_.setToIdentity();
     float aspect = width() > 0 ? float(width()) / float(height()) : 1.0f;
-    proj_matrix_.perspective(45.0f, aspect, 0.1f, camera_distance_ * 10.0f);
+    proj_matrix_.perspective(camera_fov_y_deg_, aspect, 0.1f, camera_distance_ * 10.0f);
 }
 
 void ViewportWindow::render() {
@@ -961,6 +1001,20 @@ void ViewportWindow::render() {
     float planes[6][4];
     extractFrustumPlanes(vp, planes);
 
+    // Pixels-per-radian vertical focal length.  Combined with per-instance
+    // world-space radius this gives screen-space pixel size for contribution
+    // culling below.
+    const float focal_px = 0.5f * static_cast<float>(h) /
+        std::tan(qDegreesToRadians(0.5f * camera_fov_y_deg_));
+    // Drop frustum-visible objects smaller than this many pixels.  Override
+    // with IFC_MIN_PX (0 = disabled).  2 px radius = ~4x4 pixels, well below
+    // what's meaningful at normal viewing distances and eliminates the long
+    // tail of distant MEP/fixings that dominate BIM triangle counts.
+    static const float min_pixel_radius = []{
+        const char* e = std::getenv("IFC_MIN_PX");
+        return (e && *e) ? static_cast<float>(std::atof(e)) : 2.0f;
+    }();
+
     gl_->glUseProgram(main_program_);
     GLint u_vp        = gl_->glGetUniformLocation(main_program_, "u_view_projection");
     GLint u_light     = gl_->glGetUniformLocation(main_program_, "u_light_dir");
@@ -981,7 +1035,7 @@ void ViewportWindow::render() {
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
-        cullAndUploadVisible(m, planes);
+        cullAndUploadVisible(m, planes, focal_px, min_pixel_radius);
         if (m.indirect_command_count == 0) continue;
 
         gl_->glBindVertexArray(m.vao);
@@ -1114,7 +1168,9 @@ void ViewportWindow::renderPickPass() {
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
-        cullAndUploadVisible(m, planes);
+        // Pick pass: contribution-cull disabled (0.0 threshold) so every
+        // frustum-visible object is clickable, even sub-pixel ones.
+        cullAndUploadVisible(m, planes, 1.0f, 0.0f);
         if (m.indirect_command_count == 0) continue;
 
         gl_->glBindVertexArray(m.vao);
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 1bbc44c97c4..a8d696121a2 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -173,7 +173,13 @@ class ViewportWindow : public QWindow {
     // Frustum-cull m's instances (BVH if available, else linear scan),
     // build the per-mesh DrawElementsIndirectCommand array + flat visible
     // list, and upload both to m.indirect_buffer / m.visible_ssbo.
-    void cullAndUploadVisible(ModelGpuData& m, const float planes[6][4]);
+    //
+    // `min_pixel_radius` controls contribution culling: instances (and BVH
+    // subtrees) whose projected bounding-sphere radius would be below this
+    // many pixels are dropped.  0 = disabled (all frustum-visible kept),
+    // which is what the pick pass uses so clickable targets aren't filtered.
+    void cullAndUploadVisible(ModelGpuData& m, const float planes[6][4],
+                              float focal_px, float min_pixel_radius);
 
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
@@ -224,9 +230,11 @@ class ViewportWindow : public QWindow {
 
     // Camera
     QVector3D camera_target_{0, 0, 0};
+    QVector3D camera_eye_{0, 0, 0};      // world-space eye, set in updateCamera
     float camera_distance_ = 50.0f;
     float camera_yaw_ = 45.0f;
     float camera_pitch_ = 30.0f;
+    float camera_fov_y_deg_ = 45.0f;
     QMatrix4x4 view_matrix_;
     QMatrix4x4 proj_matrix_;
 

From 95b1b976b005b01b28ed4b9e022bc19b067f74f5 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Mon, 13 Apr 2026 10:06:37 +1000
Subject: [PATCH 25/37] README: mark Phase 3A done with measured numbers

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md | 47 ++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 4af736bfad8..7bb6972b839 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -419,18 +419,35 @@ fewer triangles.
 
 In order of effort/payoff for BIM workloads:
 
-#### 3A. Screen-space contribution culling (near-term)
-
-Project each visible-instance AABB to screen space during BVH
-traversal. Reject instances whose projected size is below a threshold
-(~4 px). In BIM this is the single biggest win: at viewer zoom levels
-that encompass a whole building, most MEP fittings, fixings, furniture
-legs, door hardware etc. occupy < 1 px and contribute nothing.
-
-Scope: a projection + pixel-area test inside
-`ViewportWindow::cullAndUploadVisible`. Zero new GPU state. Expect
-10–30× reduction in drawn triangles on plant/MEP-dense scenes; full
-buildings viewed in overview should approach 60 fps.
+#### 3A. Screen-space contribution culling — ✅ done
+
+Reject frustum-visible objects whose bounding-sphere projects below a
+pixel-radius threshold. Applied both at BVH-node level (whole subtrees
+pruned, so distant parts of the model never touch per-instance tests)
+and per-instance level. Short-circuits when the camera is inside the
+AABB so nothing-you're-standing-next-to is ever lost. Pick pass uses
+threshold 0 so sub-pixel objects remain clickable.
+
+Sphere-based (centre = AABB midpoint, radius = half-diagonal,
+r_px = focal_px · radius / distance). Loses a little precision on
+very elongated bounds vs. 8-corner projection, but costs ~5× less per
+test, and because BVH-node pre-cull handles the long tail in one shot
+it doesn't matter.
+
+Threshold defaults to 2 px radius, overridable via `IFC_MIN_PX` env
+var. Measured on the 10-model / 128 M-tri test scene (GTX 1650):
+
+| Threshold | FPS | Triangles drawn | Objects drawn |
+|-----------|-----|-----------------|---------------|
+| 0 px (off) | 6.7 | 128 M | 379 k |
+| 2 px | 20.2 | 40 M (31 %) | 89 k (24 %) |
+| 4 px | 30.3 | 15 M (12 %) | 29 k (8 %) |
+
+At 4 px, frame time breakdown matches: ~16 ms non-draw baseline (from
+`IFC_SKIP_MDI=1`) + ~18 ms of raster (15 M tris / 850 M tri/s) ≈ 34 ms
+vs. an observed 33 ms. The ceiling is now genuinely vertex/raster
+throughput on the post-cull geometry — next steps (LOD, HiZ) attack
+that directly.
 
 #### 3B. Distance / contribution LOD (medium-term)
 
@@ -491,9 +508,9 @@ multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
 - [x] Reflection-aware two-pass draw for mirrored placements
 - [x] Backface culling (user-toggleable, default on)
 - [x] `reorient-shells` enabled in iterator
-- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`)
-- [ ] **Phase 3A — screen-space contribution culling** (next)
-- [ ] Phase 3B — distance / contribution LOD
+- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`)
+- [x] Phase 3A — screen-space contribution culling
+- [ ] **Phase 3B — distance / contribution LOD** (next)
 - [ ] Phase 3C — Hierarchical-Z occlusion culling
 - [ ] Phase 3D — GPU-side compute-shader culling
 - [ ] Vulkan/MoltenVK backend for macOS

From 3fe183e1582669b30c30e82398b814767907f18b Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Mon, 13 Apr 2026 18:31:43 +1000
Subject: [PATCH 26/37] Phase 3B: per-instance LOD via meshoptimizer
 simplifySloppy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Decimate each unique mesh once at sidecar-build time and swap to the
reduced index slice per-instance per-frame when projected sphere radius
drops below IFC_LOD1_PX (default 30).  Same VBO, same SSBO, just a
different firstIndex/count in the indirect command.

Extends MeshInfo (48→56 B) with lod1_ebo_byte_offset + lod1_index_count
and bumps the sidecar to v5.  buildLods() runs inside
onStreamingFinished, appends decimated indices to sd.indices,
applyLodExtension pushes the EBO suffix to the live GPU state, and the
sidecar is written with LOD1 baked in.

simplifySloppy (voxel clustering) is used instead of the default
edge-collapse meshopt_simplify because BIM brep output is per-triangle-
unwelded and non-manifold after welding — simplify returned the input
unchanged for every mesh tested.  Sloppy ignores topology.  Knobs
(IFC_LOD_SLOPPY, IFC_LOD_ERROR, IFC_LOD_RATIO, IFC_LOD_MIN_SAVINGS,
IFC_LOD_LOCK_BORDER, IFC_LOD_DEBUG) are available for A/B tuning.

Result on the 128M-tri 10-model test scene (GTX 1650, 2px contribution
cull): 20.2 → 43.2 fps, 40M → 14M visible triangles, no change in
object count.  LOD build adds 100–600 ms per model on first open,
cached thereafter.

README Phase 3B section is now a full writeup of pipeline, selection,
decimator-choice rationale, env vars, and measured numbers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/CMakeLists.txt      |   2 +
 src/ifcviewer/InstancedGeometry.h |  18 ++-
 src/ifcviewer/LodBuilder.cpp      | 203 ++++++++++++++++++++++++++++++
 src/ifcviewer/LodBuilder.h        |  56 +++++++++
 src/ifcviewer/MainWindow.cpp      |  14 +++
 src/ifcviewer/README.md           | 137 +++++++++++++++++---
 src/ifcviewer/SidecarCache.cpp    |   6 +-
 src/ifcviewer/SidecarCache.h      |   5 +-
 src/ifcviewer/ViewportWindow.cpp  | 117 ++++++++++++++---
 src/ifcviewer/ViewportWindow.h    |  16 ++-
 10 files changed, 533 insertions(+), 41 deletions(-)
 create mode 100644 src/ifcviewer/LodBuilder.cpp
 create mode 100644 src/ifcviewer/LodBuilder.h

diff --git a/src/ifcviewer/CMakeLists.txt b/src/ifcviewer/CMakeLists.txt
index 9f1c4dac502..70642acabf2 100644
--- a/src/ifcviewer/CMakeLists.txt
+++ b/src/ifcviewer/CMakeLists.txt
@@ -26,6 +26,7 @@ set(QT_VERSION 6 CACHE STRING "Qt version")
 find_package(Qt${QT_VERSION} COMPONENTS Core Gui Widgets OpenGL REQUIRED PATHS ${QT_DIR})
 
 find_package(OpenGL REQUIRED)
+find_package(meshoptimizer REQUIRED)
 
 file(GLOB IFCVIEWER_CPP_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 file(GLOB IFCVIEWER_H_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
@@ -51,6 +52,7 @@ target_link_libraries(IfcViewer PRIVATE
     Qt${QT_VERSION}::Widgets
     Qt${QT_VERSION}::OpenGL
     OpenGL::GL
+    meshoptimizer::meshoptimizer
 )
 
 if(UNIX AND NOT APPLE)
diff --git a/src/ifcviewer/InstancedGeometry.h b/src/ifcviewer/InstancedGeometry.h
index 1c027976ef1..ef79751806a 100644
--- a/src/ifcviewer/InstancedGeometry.h
+++ b/src/ifcviewer/InstancedGeometry.h
@@ -33,18 +33,28 @@ static constexpr int INSTANCED_VERTEX_STRIDE_BYTES = 28;
 static constexpr int INSTANCED_VERTEX_STRIDE_FLOATS = 7;
 
 // Per-mesh metadata on the CPU side.  Meshes own a slice of the model's
-// VBO and EBO (both local-coords/mesh-local indices).
+// VBO (shared across LODs) and one or more slices of the EBO, one per LOD.
+//
+// LOD0 is the original, full-resolution tessellation — the fields
+// `ebo_byte_offset` / `index_count` describe it.
+//
+// LOD1 is an optional decimated copy of the same triangles referencing the
+// same vertex buffer.  Built at sidecar time via meshoptimizer for meshes
+// whose triangle count crosses a threshold.  `lod1_index_count == 0`
+// means no LOD1 was built; the renderer must use LOD0 at every distance.
 struct MeshInfo {
     uint32_t vbo_byte_offset = 0;    // where this mesh's vertices start
     uint32_t vertex_count    = 0;
-    uint32_t ebo_byte_offset = 0;    // where this mesh's indices start
-    uint32_t index_count     = 0;
+    uint32_t ebo_byte_offset = 0;    // LOD0 indices
+    uint32_t index_count     = 0;    // LOD0 index count
     float    local_aabb_min[3]{};
     float    local_aabb_max[3]{};
     uint32_t first_instance  = 0;    // index into per-model instances array
     uint32_t instance_count  = 0;
+    uint32_t lod1_ebo_byte_offset = 0;
+    uint32_t lod1_index_count     = 0;   // 0 = no LOD1 available
 };
-static_assert(sizeof(MeshInfo) == 48, "MeshInfo must be 48 bytes");
+static_assert(sizeof(MeshInfo) == 56, "MeshInfo must be 56 bytes");
 
 // Per-instance record uploaded to an SSBO and read by the vertex shader.
 // Layout deliberately matches std430 expectations:
diff --git a/src/ifcviewer/LodBuilder.cpp b/src/ifcviewer/LodBuilder.cpp
new file mode 100644
index 00000000000..88b8c9f0468
--- /dev/null
+++ b/src/ifcviewer/LodBuilder.cpp
@@ -0,0 +1,203 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#include "LodBuilder.h"
+
+#include <meshoptimizer.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+void buildLods(SidecarData& sd,
+               int min_triangles,
+               float target_ratio,
+               float target_error) {
+    // Appends one decimated (LOD1) index slice per eligible mesh to sd.indices.
+    if (sd.meshes.empty() || sd.vertices.empty() || sd.indices.empty()) return;
+    const size_t vtx_stride_bytes  = INSTANCED_VERTEX_STRIDE_BYTES;
+    const size_t vtx_stride_floats = INSTANCED_VERTEX_STRIDE_FLOATS;
+    const size_t total_vertex_count = sd.vertices.size() / vtx_stride_floats;
+
+    // Env var knobs so we can tune without rebuilding.
+    //   IFC_LOD_LOCK_BORDER=1      re-enable LockBorder (off by default: BIM
+    //                              geometry is often non-manifold so locking
+    //                              borders prevents any collapse).
+    //   IFC_LOD_ERROR=<float>      override target_error; a positive value is
+    //                              used as-is, otherwise floored at 0.2.
+    //   IFC_LOD_RATIO=<float>      override target_ratio (clamped to [0, 1]).
+    //   IFC_LOD_MIN_SAVINGS=<0..1> minimum fraction of tris saved to accept
+    //                              (default 0.25).
+    //   IFC_LOD_DEBUG=1            print per-mesh diagnostics for the first
+    //                              few meshes of each call.
+    //   IFC_LOD_SLOPPY=0           disable sloppy (clustering) decimator.
+    //                              Default ON: BIM brep output is usually non-
+    //                              manifold, so edge-collapse returns it as-is.
+    const char* env_lock    = std::getenv("IFC_LOD_LOCK_BORDER");
+    const char* env_err     = std::getenv("IFC_LOD_ERROR");
+    const char* env_ratio   = std::getenv("IFC_LOD_RATIO");
+    const char* env_savings = std::getenv("IFC_LOD_MIN_SAVINGS");
+    const char* env_debug   = std::getenv("IFC_LOD_DEBUG");
+    const char* env_sloppy  = std::getenv("IFC_LOD_SLOPPY");
+
+    const bool lock_border = env_lock && env_lock[0] == '1';
+    const bool use_sloppy  = !(env_sloppy && env_sloppy[0] == '0');
+    if (env_err)   target_error = static_cast<float>(std::atof(env_err));
+    if (env_ratio) target_ratio = static_cast<float>(std::atof(env_ratio));
+    target_ratio = std::min(1.0f, std::max(0.0f, target_ratio));  // NaN/negative → 0
+    const float min_savings = env_savings ? static_cast<float>(std::atof(env_savings)) : 0.25f;
+    const bool debug = env_debug && env_debug[0] == '1';
+
+    // Loosened default: BIM meshes are non-manifold, and 0.2 still looks fine
+    // at sub-4px.  An explicit positive IFC_LOD_ERROR bypasses this floor.
+    if (!(env_err && target_error > 0.0f) && target_error < 0.2f) target_error = 0.2f;
+
+    // Scratch buffers reused across meshes so we only allocate once.
+    std::vector<uint32_t> simplified;
+    std::vector<uint32_t> shadow;
+    simplified.reserve(1024);
+    shadow.reserve(1024);
+
+    int dbg_printed = 0;
+    int dbg_rejected_savings = 0;
+    int dbg_rejected_noreduce = 0;
+    int dbg_accepted = 0;
+
+    for (auto& mesh : sd.meshes) {
+        mesh.lod1_ebo_byte_offset = 0;
+        mesh.lod1_index_count     = 0;
+
+        const uint32_t tri_count = mesh.index_count / 3;
+        if (static_cast<int>(tri_count) < min_triangles) continue;
+        if (mesh.vertex_count == 0 || tri_count == 0 || mesh.index_count % 3 != 0) continue;
+
+        // meshopt wants a pointer to the *first position* and a vertex_count
+        // equal to the number of referenced vertices (i.e. the absolute upper
+        // bound on indices we might see).  Indices in `sd.indices` for this
+        // mesh are mesh-local (0..mesh.vertex_count).  Pass the base-vertex
+        // as an offset into sd.vertices so meshopt reads positions at the
+        // right place.  (size_t arithmetic so the range checks cannot wrap.)
+        const size_t base_vertex = mesh.vbo_byte_offset / vtx_stride_bytes;
+        if (base_vertex + mesh.vertex_count > total_vertex_count) continue;
+
+        const size_t first_index = mesh.ebo_byte_offset / sizeof(uint32_t);
+        if (first_index + mesh.index_count > sd.indices.size()) continue;
+
+        const float* positions =
+            sd.vertices.data() + base_vertex * vtx_stride_floats;
+        const uint32_t* indices = sd.indices.data() + first_index;
+
+        const size_t target_index_count = std::max<size_t>(
+            3, static_cast<size_t>(mesh.index_count * target_ratio) / 3 * 3);
+
+        simplified.resize(mesh.index_count);
+        float result_error = 0.0f;
+        size_t new_index_count = 0;
+
+        if (use_sloppy) {
+            // Cluster-based decimator.  Ignores topology entirely; great for
+            // BIM brep output which is usually non-manifold / has T-junctions.
+            // Operates directly on the original indices — welding isn't
+            // needed since it quantises positions into voxel cells.
+            new_index_count = meshopt_simplifySloppy(
+                simplified.data(),
+                indices, mesh.index_count,
+                positions, mesh.vertex_count, vtx_stride_bytes,
+                target_index_count, target_error,
+                &result_error);
+        } else {
+            // Edge-collapse path.  The instanced VBO stores each triangle's
+            // vertices separately, so the index buffer is topologically
+            // disconnected — every edge is boundary, every vertex unique —
+            // and meshopt_simplify can't collapse anything.  Weld by
+            // position via a shadow index buffer so shared-position
+            // vertices share an ID, then simplify on that.  Output indices
+            // are still valid mesh-local IDs (canonical representatives),
+            // usable directly as LOD1 indices against the same VBO.
+            shadow.resize(mesh.index_count);
+            meshopt_generateShadowIndexBuffer(
+                shadow.data(),
+                indices, mesh.index_count,
+                positions, mesh.vertex_count,
+                sizeof(float) * 3,       // compare only xyz
+                vtx_stride_bytes);
+            const unsigned int options =
+                lock_border ? static_cast<unsigned int>(meshopt_SimplifyLockBorder) : 0u;
+            new_index_count = meshopt_simplify(
+                simplified.data(),
+                shadow.data(), mesh.index_count,
+                positions, mesh.vertex_count, vtx_stride_bytes,
+                target_index_count, target_error,
+                options, &result_error);
+        }
+
+        if (debug && dbg_printed < 8) {
+            std::fprintf(stderr,
+                "  [lod] mesh tris=%u target=%zu got=%zu err=%.4f\n",
+                tri_count, target_index_count / 3,
+                new_index_count / 3, result_error);
+            ++dbg_printed;
+        }
+
+        // Accept only if we actually saved a meaningful chunk of tris.
+        if (new_index_count == 0 || new_index_count >= mesh.index_count) {
+            ++dbg_rejected_noreduce;
+            continue;
+        }
+
+        const uint32_t saved = mesh.index_count - static_cast<uint32_t>(new_index_count);
+        if (static_cast<float>(saved) < min_savings * static_cast<float>(mesh.index_count)) {
+            ++dbg_rejected_savings;
+            continue;
+        }
+        ++dbg_accepted;
+
+        // Append the surviving indices to sd.indices; record the offset.
+        const size_t append_offset_bytes = sd.indices.size() * sizeof(uint32_t);
+        sd.indices.insert(sd.indices.end(),
+                          simplified.begin(),
+                          simplified.begin() + new_index_count);
+        mesh.lod1_ebo_byte_offset = static_cast<uint32_t>(append_offset_bytes);
+        mesh.lod1_index_count     = static_cast<uint32_t>(new_index_count);
+    }
+
+    if (debug) {
+        std::fprintf(stderr,
+            "  [lod] summary: accepted=%d rejected_noreduce=%d rejected_savings=%d "
+            "(lock_border=%d target_error=%.3f target_ratio=%.3f min_savings=%.3f)\n",
+            dbg_accepted, dbg_rejected_noreduce, dbg_rejected_savings,
+            lock_border ? 1 : 0, target_error, target_ratio, min_savings);
+    }
+}
+
+LodStats summariseLods(const SidecarData& sd) {
+    // Walk every mesh once, tallying LOD0 triangles and, where present, LOD1.
+    LodStats stats;
+    stats.meshes_total = static_cast<uint32_t>(sd.meshes.size());
+    for (const auto& mesh : sd.meshes) {
+        stats.tris_lod0 += mesh.index_count / 3;
+        if (mesh.lod1_index_count == 0) continue;   // no LOD1 slice for this mesh
+        ++stats.meshes_with_lod1;
+        stats.tris_lod1          += mesh.lod1_index_count / 3;
+        stats.tris_lod0_for_lod1 += mesh.index_count / 3;
+    }
+    return stats;
+}
diff --git a/src/ifcviewer/LodBuilder.h b/src/ifcviewer/LodBuilder.h
new file mode 100644
index 00000000000..a937ae49870
--- /dev/null
+++ b/src/ifcviewer/LodBuilder.h
@@ -0,0 +1,56 @@
+/********************************************************************************
+ *                                                                              *
+ * This file is part of IfcOpenShell.                                           *
+ *                                                                              *
+ * IfcOpenShell is free software: you can redistribute it and/or modify         *
+ * it under the terms of the Lesser GNU General Public License as published by  *
+ * the Free Software Foundation, either version 3.0 of the License, or          *
+ * (at your option) any later version.                                          *
+ *                                                                              *
+ * IfcOpenShell is distributed in the hope that it will be useful,              *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of               *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                 *
+ * Lesser GNU General Public License for more details.                          *
+ *                                                                              *
+ * You should have received a copy of the Lesser GNU General Public License     *
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.         *
+ *                                                                              *
+ ********************************************************************************/
+
+#ifndef LODBUILDER_H
+#define LODBUILDER_H
+
+#include "SidecarCache.h"
+
+// Build a LOD1 index slice for every mesh in `sd` with at least
+// `min_triangles` triangles, using meshoptimizer (the sloppy clustering
+// decimator by default; edge-collapse when IFC_LOD_SLOPPY=0).  The
+// LOD1 indices are appended to `sd.indices`; each MeshInfo's
+// lod1_ebo_byte_offset + lod1_index_count are populated to point at the
+// appended range.  Meshes that don't qualify (too small) or whose result
+// saves too few triangles keep lod1_index_count == 0 (LOD0 fallback).
+//
+// Defaults match the Phase 3B first-iteration design:
+//   min_triangles = 500     — below this the overhead dominates
+//   target_ratio  = 0.25    — aim for 25% of original tris
+//   target_error  = 0.05    — relative error budget (floored at 0.2 inside)
+//
+// `sd.vertices` is read (position is the first 3 floats of each
+// INSTANCED_VERTEX_STRIDE_FLOATS-wide vertex) but not modified — LOD1
+// reuses the same vertex buffer, just with a different index list.
+void buildLods(SidecarData& sd,
+               int min_triangles = 500,
+               float target_ratio = 0.25f,
+               float target_error = 0.05f);
+
+// Cheap summary for logging.  Safe to call before or after buildLods.
+struct LodStats {
+    uint32_t meshes_total       = 0;   // number of entries in sd.meshes
+    uint32_t meshes_with_lod1   = 0;   // meshes whose lod1_index_count > 0
+    uint32_t tris_lod0          = 0;   // sum across all meshes
+    uint32_t tris_lod1          = 0;   // only for meshes that got LOD1
+    uint32_t tris_lod0_for_lod1 = 0;   // LOD0 tris of the meshes that got LOD1
+};
+LodStats summariseLods(const SidecarData& sd);
+
+#endif // LODBUILDER_H
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 8b63f3bdf68..7dc5454700b 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -20,6 +20,7 @@
 #include "MainWindow.h"
 #include "AppSettings.h"
 #include "SettingsWindow.h"
+#include "LodBuilder.h"
 #include "SidecarCache.h"
 
 #include <QApplication>
@@ -395,6 +396,19 @@ void MainWindow::onStreamingFinished() {
                     sd.elements.push_back(pe);
                 }
 
+                // Build LOD1 for eligible meshes (extends sd.indices and
+                // populates MeshInfo::lod1_*), push the extension onto the
+                // live GPU state so this session benefits too, then cache.
+                QElapsedTimer t_lod; t_lod.start();
+                buildLods(sd);
+                LodStats ls = summariseLods(sd);
+                qDebug("  LOD build: %lld ms — %u/%u meshes got LOD1 "
+                       "(%u tris → %u tris for those meshes)",
+                       t_lod.elapsed(),
+                       ls.meshes_with_lod1, ls.meshes_total,
+                       ls.tris_lod0_for_lod1, ls.tris_lod1);
+                viewport_->applyLodExtension(loading_model_id_, sd);
+
                 std::string ifc_path = it->second.file_path.toStdString();
                 uint64_t file_size = static_cast<uint64_t>(
                     QFileInfo(it->second.file_path).size());
diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 7bb6972b839..82bd89555cd 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -92,7 +92,8 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
 | `GeometryStreamer.h/cpp` | Background iterator runner; emits `MeshChunk` + `InstanceChunk` |
 | `InstancedGeometry.h` | Shared structs: `MeshInfo`, `InstanceCpu`, `InstanceGpu`, chunk records |
 | `BvhAccel.h/cpp` | Median-split BVH builder; operates on instance world-AABBs |
-| `SidecarCache.h/cpp` | Raw binary `.ifcview` (v4) sidecar read/write |
+| `LodBuilder.h/cpp` | Post-stream decimation of unique meshes via meshoptimizer (`simplifySloppy`) |
+| `SidecarCache.h/cpp` | Raw binary `.ifcview` (v5) sidecar read/write |
 | `AppSettings.h/cpp` | Persisted preferences (geometry library, stats overlay, backface culling) |
 | `SettingsWindow.h/cpp` | Settings dialog |
 | `CMakeLists.txt` | Build configuration |
@@ -106,6 +107,9 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
   at GL 4.1).
 - **IfcOpenShell C++ libraries** (IfcParse, IfcGeom, and their
   dependencies: Open CASCADE, Boost, Eigen3, optionally CGAL).
+- **[meshoptimizer](https://github.com/zeux/meshoptimizer)** — linked via
+  `find_package(meshoptimizer REQUIRED)`. Used at sidecar-build time for LOD
+  decimation; not needed at runtime once a sidecar exists.
 
 ## Building
 
@@ -266,7 +270,7 @@ while stack not empty:
 Depth 64 is enough for billions of items on any balanced tree. The stack
 is on the C++ stack, zero per-frame allocation.
 
-#### Sidecar format (`.ifcview`, v4)
+#### Sidecar format (`.ifcview`, v5)
 
 Raw memory dump, Blender-`.blend`-style — no serialisation, no parsing.
 Stores everything needed to skip the `IfcGeom::Iterator` pass:
@@ -276,7 +280,7 @@ SidecarHeader            (magic "IFVW", version, endian, ...)
 uint64_t                 source_file_size
 uint32_t + float[]       vertex data    (7 floats × N_verts, local coords)
 uint32_t + uint32_t[]    index data     (mesh-local)
-uint32_t + MeshInfo[]    per-unique-mesh metadata (48 B each)
+uint32_t + MeshInfo[]    per-unique-mesh metadata (56 B each, incl. LOD1 slice)
 uint32_t + InstanceCpu[] per-placement records (transform + AABB + ids)
 uint32_t + PackedElementInfo[]   element tree records
 uint32_t + char[]        string table
@@ -449,14 +453,117 @@ At 4 px, frame time breakdown matches: ~16 ms non-draw baseline (from
 throughput on the post-cull geometry — next steps (LOD, HiZ) attack
 that directly.
 
-#### 3B. Distance / contribution LOD (medium-term)
-
-Pre-simplify unique representations at ingress time (store LOD 0 / 1 /
-2 meshes in the VBO/EBO with offsets), select LOD per instance per
-frame by the same projected-size metric as 3A. The visible-SSBO
-plumbing and MDI structure don't change — only `firstIndex`/`count` in
-the indirect command does. Ingress side needs a decimation pass
-(`meshoptimizer` or similar); GPU side is nearly free.
+#### 3B. Distance / contribution LOD — ✅ done
+
+Decimate each unique representation once (at sidecar-build time), store
+the reduced index slice in the same EBO, and switch to it per-instance
+per-frame whenever the projected sphere radius is small enough that the
+reduced silhouette is indistinguishable from the original.
+
+##### Pipeline
+
+1. **After streaming finishes**, `MainWindow` calls `buildLods(sd)` on
+   the snapshotted `SidecarData`. Each eligible mesh's decimated index
+   list is appended to `sd.indices`; the per-mesh `MeshInfo` gains two
+   new fields:
+
+   ```cpp
+   uint32_t lod1_ebo_byte_offset;  // appended slice, same VBO
+   uint32_t lod1_index_count;      // 0 = no LOD1 was built
+   ```
+
+   `MeshInfo` grew from 48 to 56 bytes, which also bumps the sidecar
+   format to v5.
+
+2. `viewport_->applyLodExtension(model_id, sd)` pushes the new index
+   suffix onto the live EBO via `glNamedBufferSubData` and replaces the
+   CPU-side `m.meshes` vector. The VBO and instance SSBO are untouched
+   — LOD1 reuses the same vertices, only the indices differ.
+
+3. The sidecar is then written with both LOD0 and LOD1 indices baked in,
+   so subsequent loads of the same file pick up LOD1 for free.
+
+##### Selection
+
+The contribution-cull pass already computes each instance's projected
+pixel radius. LOD1 is selected when that radius falls below
+`IFC_LOD1_PX` (default 30 px) and the mesh has a non-empty LOD1 slice.
+Camera-inside-AABB short-circuits select LOD0 (treated as "infinite
+radius") so you never accidentally see the reduced mesh up close.
+
+The visible-instance pipeline gains two more buckets (`fwd_lod1_`,
+`rev_lod1_`), so the four-way split is now `{fwd, rev} × {LOD0, LOD1}`.
+LOD0/LOD1 within a winding slice are contiguous — only winding requires
+`glFrontFace` to flip between MDI calls, LOD does not. `firstIndex` /
+`count` in the `DrawElementsIndirectCommand` pick which slice of the EBO
+to walk; everything else (base vertex, base instance, SSBO bindings,
+shader) is unchanged.
+
+##### Decimator choice: `meshopt_simplifySloppy`
+
+The first attempt used `meshopt_simplify`, which is an edge-collapse
+decimator. It returned every input mesh unchanged (`err = 0.0`) for two
+reasons, both inherent to BIM brep output:
+
+1. **Per-triangle vertex duplication.** The instanced VBO stores each
+   triangle's vertices separately so that hard-edge normals can differ
+   across triangles. Topologically there are no shared vertices, so no
+   edges exist for `meshopt_simplify` to collapse. A
+   `meshopt_generateShadowIndexBuffer` welding pass (hash xyz only,
+   ignore the interleaved normal/colour) fixes this half cheaply — the
+   VBO isn't touched, only a per-call shadow index buffer is built.
+2. **Non-manifold topology even after welding.** BIM brep output has
+   T-junctions, coplanar slivers, separate solids meeting at a plane,
+   and multi-material cuts. `meshopt_simplify` needs valid 2-manifold
+   edge pairs to score collapses; it refuses the non-manifold ones, the
+   priority queue never fires, and it returns the input untouched.
+
+`meshopt_simplifySloppy` is a **voxel-clustering decimator** — it
+quantises positions into cells and merges everything in a cell to a
+single point. Topology is irrelevant, so it works directly on the
+original indices (welding isn't even needed). The trade-off is that it
+rounds off sharp corners and can produce slightly degenerate triangles,
+so it doesn't look great at mid-screen size. For a LOD1 that only
+activates below 30 px projected radius that's invisible in practice. If
+you ever want LOD1 to remain active at larger sizes, the only robust
+fix is to pre-process BIM meshes into manifold form (fuse coplanar
+faces, split at T-junctions) — a significant project unto itself.
+
+##### Tuning knobs (env vars)
+
+| Var | Default | Effect |
+|-----|---------|--------|
+| `IFC_LOD1_PX` | `30` | Projected sphere radius (px) below which LOD1 kicks in. `0` disables LOD1 entirely. |
+| `IFC_LOD_SLOPPY` | `1` | `0` falls back to edge-collapse (`meshopt_simplify`) on shadow-welded indices. Typically produces zero LOD1 output for BIM — useful only for A/B comparison. |
+| `IFC_LOD_ERROR` | `0.2` | Target relative error passed to meshopt. |
+| `IFC_LOD_RATIO` | `0.25` | Target triangle-count ratio (LOD1 aims for 25 % of LOD0 tris). |
+| `IFC_LOD_MIN_SAVINGS` | `0.25` | Reject the LOD1 result if it doesn't shave at least this fraction of triangles. |
+| `IFC_LOD_LOCK_BORDER` | `0` | `1` re-enables `meshopt_SimplifyLockBorder` (only meaningful with `IFC_LOD_SLOPPY=0`). |
+| `IFC_LOD_DEBUG` | `0` | `1` prints per-mesh `tris / target / got / err` for the first 8 candidate meshes plus an accept/reject summary per model. |
+
+##### Measured results
+
+Same 10-model / 128 M-tri scene as Phase 3A (GTX 1650), 2 px contribution
+threshold, overview camera, all models finalised with LOD1 built:
+
+| Build | FPS | Frame time | Visible tris | Visible objs |
+|-------|-----|-----------|--------------|--------------|
+| Phase 3A alone (2 px) | 20.2 | 49 ms | 40 M | 89 k |
+| Phase 3A + 3B (LOD1 ≤ 30 px) | **43.2** | **23 ms** | 14 M | 81 k |
+
+Frame time roughly halves at a similar visible-object count (LOD is
+lossless w.r.t. visibility — swapping the index slice doesn't hide
+anything). The triangle reduction on meshes that qualified for LOD1 is
+~80 %: e.g. 4.17 M → 0.82 M tris for the 3618 eligible meshes of Model
+1, 3.25 M → 0.65 M for Model 2, etc. Only about 20 % of unique meshes
+qualify (the threshold is 500 tris — below that the indirect-command
+overhead dominates), but those are the fat tail carrying most of the
+rasterisation cost.
+
+LOD build itself runs on the main thread inside `onStreamingFinished`;
+typical cost is 100–600 ms per model, folded into the already-visible
+"finalizing" step. Cached into the sidecar afterwards, so subsequent
+opens skip it entirely.
 
 #### 3C. Hierarchical-Z occlusion culling (longer-term)
 
@@ -490,7 +597,7 @@ Scene size                      Bottleneck              Fix
 < 100k instances                CPU cull scan           Phase 1 only
 100k–500k                       CPU cull scan           BVH (Phase 2) — done
 500k+ tris / overview shot      GPU vertex + raster     Phase 3A contribution cull
-                                                        (+ 3B LOD for close-ups)
+                                                        + Phase 3B LOD (done)
 multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
 ```
 
@@ -508,10 +615,10 @@ multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
 - [x] Reflection-aware two-pass draw for mirrored placements
 - [x] Backface culling (user-toggleable, default on)
 - [x] `reorient-shells` enabled in iterator
-- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`)
+- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`, `IFC_LOD1_PX`)
 - [x] Phase 3A — screen-space contribution culling
-- [ ] **Phase 3B — distance / contribution LOD** (next)
-- [ ] Phase 3C — Hierarchical-Z occlusion culling
+- [x] Phase 3B — distance / contribution LOD (meshoptimizer `simplifySloppy`)
+- [ ] **Phase 3C — Hierarchical-Z occlusion culling** (next)
 - [ ] Phase 3D — GPU-side compute-shader culling
 - [ ] Vulkan/MoltenVK backend for macOS
 - [ ] Embedded Python scripting console
diff --git a/src/ifcviewer/SidecarCache.cpp b/src/ifcviewer/SidecarCache.cpp
index 3c5ca9cd8d5..da3943988d2 100644
--- a/src/ifcviewer/SidecarCache.cpp
+++ b/src/ifcviewer/SidecarCache.cpp
@@ -17,7 +17,11 @@
  *                                                                              *
  ********************************************************************************/
 
-// v4 layout (all multi-byte fields native-endian; endianness marker in header):
+// v5 layout (all multi-byte fields native-endian; endianness marker in header).
+// Same sequence as v4; the only change is that MeshInfo grew two uint32_ts
+// (lod1_ebo_byte_offset + lod1_index_count, 48 → 56 bytes) and `indices`
+// may contain extra appended LOD1 slices pointed at by those offsets
+// (lod1_index_count == 0 means the mesh has no LOD1 slice).
 //
 //   SidecarHeader (16 bytes)
 //   uint64_t  source_file_size
diff --git a/src/ifcviewer/SidecarCache.h b/src/ifcviewer/SidecarCache.h
index e14eb9d2561..332abdc8029 100644
--- a/src/ifcviewer/SidecarCache.h
+++ b/src/ifcviewer/SidecarCache.h
@@ -34,7 +34,10 @@
 #include <memory>
 
 static constexpr uint32_t SIDECAR_MAGIC   = 0x49465657;  // "IFVW"
-static constexpr uint32_t SIDECAR_VERSION = 4;
+// v5 = MeshInfo extended with lod1_ebo_byte_offset + lod1_index_count (56 B).
+//      sd.indices may contain an appended LOD1 index slice for each mesh
+//      where meshoptimizer decimation produced useful output.
+static constexpr uint32_t SIDECAR_VERSION = 5;
 static constexpr uint32_t SIDECAR_ENDIAN  = 0x01020304;
 
 // Fixed-size element record.  Strings are stored as (offset, length) pairs
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index db002c1870d..2606ffd3f3f 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -751,6 +751,34 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
            ssbo_bytes / (1024.0*1024.0));
 }
 
+void ViewportWindow::applyLodExtension(uint32_t model_id, const SidecarData& sd) {
+    // Push any index data beyond what the live EBO already holds, then adopt
+    // sd.meshes so the per-mesh lod1_* fields reach the culling pass.
+    if (!gl_initialized_) return;
+    const auto found = models_gpu_.find(model_id);
+    if (found == models_gpu_.end()) return;
+    ModelGpuData& model = found->second;
+    if (!model.finalized) return;
+
+    const size_t wanted_bytes = sd.indices.size() * sizeof(uint32_t);
+    if (wanted_bytes > model.ebo_used) {
+        // sd.indices is longer than the uploaded range: grow the EBO if
+        // needed (returning before the metadata swap on failure), upload the tail.
+        context_->makeCurrent(this);
+        if (wanted_bytes > model.ebo_capacity &&
+            !growModelEbo(model, wanted_bytes)) return;
+        const size_t tail_bytes = wanted_bytes - model.ebo_used;
+        const uint32_t* tail_src =
+            sd.indices.data() + (model.ebo_used / sizeof(uint32_t));
+        gl_->glNamedBufferSubData(model.ebo, model.ebo_used, tail_bytes, tail_src);
+        model.ebo_used = wanted_bytes;
+    }
+
+    // Whether or not anything was uploaded, swap in the mesh metadata so
+    // cullAndUploadVisible sees the new lod1_ fields.
+    model.meshes = sd.meshes;
+}
+
 void ViewportWindow::resetScene() {
     if (!gl_initialized_) return;
     context_->makeCurrent(this);
@@ -826,17 +854,33 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
 
 void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4],
                                           float focal_px, float min_pixel_radius) {
-    // Per-mesh scratch, split by winding: fwd = non-reflected (CCW in screen
-    // space), rev = reflected (CW in screen space).  Splitting lets the draw
+    // Per-mesh scratch, split by winding × LOD.  Winding split lets the draw
     // pass toggle glFrontFace once between two MDI calls so GL_CULL_FACE does
-    // the right thing for both.
-    if (visible_by_mesh_fwd_.size() < m.meshes.size()) visible_by_mesh_fwd_.resize(m.meshes.size());
-    if (visible_by_mesh_rev_.size() < m.meshes.size()) visible_by_mesh_rev_.resize(m.meshes.size());
+    // the right thing for both.  LOD split means instances that want the
+    // decimated mesh go into a different bucket that emits against
+    // mesh.lod1_ebo_byte_offset / lod1_index_count.
+    auto resize_if = [&](std::vector<std::vector<uint32_t>>& v) {
+        if (v.size() < m.meshes.size()) v.resize(m.meshes.size());
+    };
+    resize_if(visible_by_mesh_fwd_lod0_);
+    resize_if(visible_by_mesh_fwd_lod1_);
+    resize_if(visible_by_mesh_rev_lod0_);
+    resize_if(visible_by_mesh_rev_lod1_);
     for (size_t i = 0; i < m.meshes.size(); ++i) {
-        visible_by_mesh_fwd_[i].clear();
-        visible_by_mesh_rev_[i].clear();
+        visible_by_mesh_fwd_lod0_[i].clear();
+        visible_by_mesh_fwd_lod1_[i].clear();
+        visible_by_mesh_rev_lod0_[i].clear();
+        visible_by_mesh_rev_lod1_[i].clear();
     }
 
+    // LOD1 switches in when projected sphere radius (in pixels) drops below
+    // this threshold.  Overridable for tuning.  Set to 0 to disable LOD1
+    // entirely (always draw LOD0).
+    static const float lod1_px_threshold = []{
+        const char* e = std::getenv("IFC_LOD1_PX");
+        return (e && *e) ? static_cast<float>(std::atof(e)) : 30.0f;
+    }();
+
     // Bounding-sphere contribution test: approximate an AABB by its enclosing
     // sphere (centre = midpoint, radius = half-diagonal).  Project radius to
     // pixels as r_px = focal_px * r / distance (perspective).  Reject if
@@ -871,15 +915,44 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         return focal_px * radius >= min_pixel_radius * dist;
     };
 
+    // Returns projected sphere radius in pixels (or +inf when camera is
+    // inside the AABB).  Shares the geometry with contributionPasses; this
+    // version returns the value so we can also use it for LOD selection.
+    auto pixelRadius = [&](const float mn[3], const float mx[3]) -> float {
+        if (cx >= mn[0] && cx <= mx[0] &&
+            cy >= mn[1] && cy <= mx[1] &&
+            cz >= mn[2] && cz <= mx[2]) {
+            return std::numeric_limits<float>::infinity();
+        }
+        float ex = 0.5f * (mx[0] - mn[0]);
+        float ey = 0.5f * (mx[1] - mn[1]);
+        float ez = 0.5f * (mx[2] - mn[2]);
+        float radius = std::sqrt(ex*ex + ey*ey + ez*ez);
+        float dx = 0.5f * (mx[0] + mn[0]) - cx;
+        float dy = 0.5f * (mx[1] + mn[1]) - cy;
+        float dz = 0.5f * (mx[2] + mn[2]) - cz;
+        float dist = std::sqrt(dx*dx + dy*dy + dz*dz);
+        return dist > 0.0f ? focal_px * radius / dist
+                           : std::numeric_limits<float>::infinity();
+    };
+
     auto test_and_push = [&](uint32_t inst_idx) {
         const InstanceCpu& inst = m.instances[inst_idx];
         if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
         if (!contributionPasses(inst.world_aabb_min, inst.world_aabb_max)) return;
         if (inst.mesh_id >= m.meshes.size()) return;
+        const MeshInfo& mesh = m.meshes[inst.mesh_id];
+        const bool want_lod1 = mesh.lod1_index_count > 0 &&
+            lod1_px_threshold > 0.0f &&
+            pixelRadius(inst.world_aabb_min, inst.world_aabb_max) < lod1_px_threshold;
         const bool reflected = inst_idx < m.instance_reflected.size()
             && m.instance_reflected[inst_idx] != 0;
-        if (reflected) visible_by_mesh_rev_[inst.mesh_id].push_back(inst_idx);
-        else           visible_by_mesh_fwd_[inst.mesh_id].push_back(inst_idx);
+        auto& bucket =
+            reflected ? (want_lod1 ? visible_by_mesh_rev_lod1_
+                                   : visible_by_mesh_rev_lod0_)
+                      : (want_lod1 ? visible_by_mesh_fwd_lod1_
+                                   : visible_by_mesh_fwd_lod0_);
+        bucket[inst.mesh_id].push_back(inst_idx);
     };
 
     if (!m.bvh.nodes.empty()) {
@@ -911,22 +984,28 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         for (uint32_t i = 0; i < m.instances.size(); ++i) test_and_push(i);
     }
 
-    // Flatten fwd-slice first, then rev-slice, into visible_flat_.  Build
-    // matching DrawElementsIndirectCommands; commands for the fwd slice fill
-    // [0, indirect_forward_count), rev fills [indirect_forward_count, end).
+    // Flatten fwd-slice first (LOD0 then LOD1), then rev-slice (ditto), into
+    // visible_flat_.  Commands for the fwd slice fill [0, indirect_forward_count),
+    // rev fills [indirect_forward_count, end).  LOD0/LOD1 within a winding
+    // slice are contiguous — winding is what requires glFrontFace to flip
+    // between MDI calls, LOD is not.
     visible_flat_.clear();
     indirect_scratch_.clear();
 
-    auto emit_slice = [&](std::vector<std::vector<uint32_t>>& by_mesh) {
+    auto emit_slice = [&](std::vector<std::vector<uint32_t>>& by_mesh, int lod) {
         for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
             const auto& mesh = m.meshes[mi];
             const uint32_t vis_count = static_cast<uint32_t>(by_mesh[mi].size());
-            if (vis_count == 0 || mesh.index_count == 0) continue;
+            const uint32_t idx_count =
+                (lod == 1) ? mesh.lod1_index_count : mesh.index_count;
+            const uint32_t ebo_off =
+                (lod == 1) ? mesh.lod1_ebo_byte_offset : mesh.ebo_byte_offset;
+            if (vis_count == 0 || idx_count == 0) continue;
 
             DrawElementsIndirectCommand cmd;
-            cmd.count         = mesh.index_count;
+            cmd.count         = idx_count;
             cmd.instanceCount = vis_count;
-            cmd.firstIndex    = mesh.ebo_byte_offset / sizeof(uint32_t);
+            cmd.firstIndex    = ebo_off / sizeof(uint32_t);
             cmd.baseVertex    = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
             cmd.baseInstance  = static_cast<uint32_t>(visible_flat_.size());
             indirect_scratch_.push_back(cmd);
@@ -936,9 +1015,11 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         }
     };
 
-    emit_slice(visible_by_mesh_fwd_);
+    emit_slice(visible_by_mesh_fwd_lod0_, 0);
+    emit_slice(visible_by_mesh_fwd_lod1_, 1);
     m.indirect_forward_count = static_cast<uint32_t>(indirect_scratch_.size());
-    emit_slice(visible_by_mesh_rev_);
+    emit_slice(visible_by_mesh_rev_lod0_, 0);
+    emit_slice(visible_by_mesh_rev_lod1_, 1);
     m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
 
     // Upload visible list (keep binding alive even when empty).
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index a8d696121a2..fe54cce9210 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -127,6 +127,13 @@ class ViewportWindow : public QWindow {
     // any existing state for model_id and marks it drawable.
     void applyCachedModel(uint32_t model_id, SidecarData data);
 
+    // After buildLods() has extended sd.indices + populated lod1_* fields,
+    // push just the appended index slice + the refreshed mesh metadata onto
+    // the live GPU state for model_id.  VBO / SSBO / instance array are left
+    // alone; only the EBO grows and m.meshes is replaced.  No-op if the
+    // model isn't finalised on the viewport.
+    void applyLodExtension(uint32_t model_id, const SidecarData& sd);
+
     void hideModel(uint32_t model_id);
     void showModel(uint32_t model_id);
     void removeModel(uint32_t model_id);
@@ -223,8 +230,13 @@ class ViewportWindow : public QWindow {
     // per-frame allocation.  indirect_scratch_ is the matching array of
     // DrawElementsIndirectCommand records — forward-declared as bytes so
     // the header doesn't need the struct definition.
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_;
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_;
+    // Four buckets = {fwd, rev} × {LOD0, LOD1}.  LOD1 buckets are only
+    // populated when the mesh has lod1_index_count > 0 and the projected
+    // pixel radius is below the LOD switch threshold.
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_lod0_;
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_lod1_;
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_lod0_;
+    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_lod1_;
     std::vector<uint32_t>                  visible_flat_;
     std::vector<DrawElementsIndirectCommand> indirect_scratch_;
 

From 91c8e46d1de7a68ecf06517c44713a14ed16b0dd Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Mon, 13 Apr 2026 23:25:33 +1000
Subject: [PATCH 27/37] Phase 3C: Hierarchical-Z occlusion culling (CPU-side
 v1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the main draw, blit the MSAA default-framebuffer depth to a
single-sample 256×128 depth texture, read it back, and build a CPU
max-reduced mip pyramid.  Next frame's cullAndUploadVisible projects
each BVH node / instance AABB through the previous frame's VP and
compares the AABB's nearest depth against the pyramid's deepest value
at the matching mip level; strictly-beyond AABBs are rejected.

Conservative direction (aabb_near > hiz_max) — never wrongly rejects a
visible instance, so no flicker.  BVH subtree-level test lets a single
8-corner projection reject up to a leaf's worth of instances.

Tuning knobs: IFC_NO_HIZ=1 disables; IFC_HIZ_SIZE overrides base width.
New stats counter hiz_rej shows rejects/frame.

Measured: big win on interior views (GPU-bound), roughly zero net
effect on exterior overviews (CPU-bound on cull traversal, so the
saved GPU work is masked).  Tried a 3-deep PBO ring for async readback
and reverted — the extra frame of staleness produced visible flicker
on fast orbit, and the synchronous readback wasn't actually a measured
bottleneck at 256×128.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md          | 129 +++++++++++++--
 src/ifcviewer/ViewportWindow.cpp | 264 ++++++++++++++++++++++++++++++-
 src/ifcviewer/ViewportWindow.h   |  40 +++++
 3 files changed, 420 insertions(+), 13 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 82bd89555cd..afa20426103 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -565,15 +565,120 @@ typical cost is 100–600 ms per model, folded into the already-visible
 "finalizing" step. Cached into the sidecar afterwards, so subsequent
 opens skip it entirely.
 
-#### 3C. Hierarchical-Z occlusion culling (longer-term)
+#### 3C. Hierarchical-Z occlusion culling — ✅ done (v1, CPU-side)
 
-Render large occluders first, build a depth pyramid, test instance
-AABBs against it. In dense BIM most geometry is behind other geometry
-from any given interior viewpoint; historically a 3–10× reduction in
-drawn instances. Most valuable *after* 3A+3B, which together handle
-the far-away and small-detail cases. Pairs naturally with GPU-side
-culling (a compute shader doing the HiZ test and writing the visible
-list + indirect buffer in place).
+Reject frustum-visible instances whose AABB is fully behind something
+already drawn. The last drawn frame's depth buffer is the oracle — if a
+region's deepest rasterised fragment is closer than an AABB's nearest
+point, nothing in that AABB can win the depth test.
+
+In dense BIM this matters most on interior views: standing inside a
+building, 80–95 % of the model sits behind the walls of the current
+room and contributes nothing to the frame. Phase 3A drops the
+*distant-and-small* geometry, 3B drops its triangle count when kept,
+and 3C drops the *close-and-big-but-hidden* bulk that neither of those
+can touch. On an outdoor overview shot (nothing is occluded) 3C does
+almost nothing — which is fine, 3A+3B already cover that case.
+
+##### Pipeline (v1: CPU-side, 1-frame stale)
+
+```
+render():
+  draw main scene into MSAA default fb
+  axis gizmo
+  buildHizPyramid():          <-- new
+    glBlitFramebuffer ×2: MSAA depth → 1:1 resolve → depth tex (256×128)
+    glGetTextureImage depth tex → CPU
+    max-reduce mip chain on CPU (8–9 levels)
+    store the VP that produced this frame
+  swapBuffers
+
+cullAndUploadVisible():
+  per BVH node:     frustum ∧ contribution ∧ hiz  (subtree early-out)
+  per instance:     frustum ∧ contribution ∧ hiz
+```
+
+The pyramid is always the *previous* frame's depth, tested through the
+*previous* frame's VP. On a newly loaded scene the cull is too permissive
+for a frame (draws the occluded stuff by accident) and then settles. The
+comparison `aabb_near_depth > hiz_max` is conservative for the frame that
+produced the pyramid; under camera motion a newly disoccluded instance can
+appear one frame late, which is why staleness is kept to a single frame.
+
+##### Why CPU-side?
+
+Because the readback is cheap at this resolution (~128 KB / frame,
+single `glGetTextureImage` ≈ 0.5 ms on PCIe) and the test itself is trivial
+— ~100 k AABBs × 8 corners × a small mip lookup is well under a
+millisecond on one thread. Phase 3D will port the cull to a compute
+shader reading the pyramid as a texture, eliminating the readback; but
+Phase 3C's CPU implementation was small enough to do first and
+measure.
+
+Little MSAA complication on the write side: one 1:1 blit resolves the
+default framebuffer's multi-sample depth, a second scales it down (GL
+forbids scaling an MSAA blit). No separate occluder pass either — we use
+the previous completed frame's depth buffer directly, which is what a
+temporal-reprojection HiZ reduces to when the "occluder set" is
+"everything visible last frame".
+
+##### The test
+
+```cpp
+project 8 AABB corners through hiz_vp  →  NDC rect + min z
+if any corner has w ≤ 0:        return false  // crosses near plane
+if rect is outside [-1, 1]²:    return false
+pick mip level where rect ≤ 2×2 texels
+hiz_max = max(pyramid[mip][covered texels])
+return aabb_near_depth > hiz_max
+```
+
+Comparing the AABB's *closest* point against the pyramid's *deepest*
+value is the conservative direction — it only rejects when the AABB
+is strictly beyond everything we already drew in that region. We pick
+the mip at which the rect covers ≲ 2 texels on each axis so the lookup
+is O(1) regardless of AABB size.
+
+##### BVH integration
+
+The same test runs on interior BVH node AABBs before leaf expansion,
+so an occluded subtree skips all its instances in one shot. This is
+where most of the per-frame cost savings show up on interior shots —
+rejecting a 500-instance BVH subtree costs one 8-corner projection.
+
+##### Tuning knobs
+
+| Var | Default | Effect |
+|-----|---------|--------|
+| `IFC_NO_HIZ` | unset | `1` disables HiZ entirely (forces the Phase-3B-only path). |
+| `IFC_HIZ_SIZE` | `256` | Base pyramid width in texels; height tracks viewport aspect. Raise for more accurate near-silhouette occlusion, lower to shrink readback. |
+
+The stats overlay gains one counter, `hiz_rej`: instances rejected by the
+per-instance HiZ test each frame (instances skipped inside an occluded BVH
+subtree are not counted). On outdoor overviews it hovers near zero; on indoor
+shots it climbs into the hundreds of thousands and the frame time drops.
+
+##### Known caveats
+
+- **1 frame stale.** The pyramid is aligned to last frame's view, so
+  when you whip the camera across the scene we may draw one frame of
+  stuff that the new view would have occluded. Invisible in practice
+  at 60 fps. We tried a 3-deep PBO ring for async readback (2-frame
+  stale) and it produced visible flicker on fast orbits — reverted.
+- **Readback syncs the GPU.** `glGetTextureImage` is blocking.
+  Measured cost is well under a millisecond at 256×128; not a
+  bottleneck on the machines tested. Phase 3D's compute-shader cull
+  removes it entirely.
+- **Doesn't move the needle on overview shots.** Those scenes are
+  CPU-bound on the cull traversal itself, not GPU-bound on drawing,
+  so cutting the drawn-triangle count in half is invisible in the
+  frame time. `hiz_rej` still rises modestly on overviews (the frustum
+  hull contains everything behind visible walls) but saved GPU work
+  is masked by CPU cost. HiZ pays off on interior views, where the
+  GPU *was* the bottleneck. If a project never leaves overview,
+  `IFC_NO_HIZ=1` shaves the ~1 ms of HiZ cost.
+- **Transparent geometry would need special handling**, but the
+  current renderer doesn't have any, so no-op for now.
 
 #### 3D. GPU-side culling via compute (longer-term)
 
@@ -598,7 +703,7 @@ Scene size                      Bottleneck              Fix
 100k–500k                       CPU cull scan           BVH (Phase 2) — done
 500k+ tris / overview shot      GPU vertex + raster     Phase 3A contribution cull
                                                         + Phase 3B LOD (done)
-multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
+multi-million + occluders       redundant rasterisation Phase 3C HiZ (done, CPU readback)
 ```
 
 ## Roadmap
@@ -615,10 +720,10 @@ multi-million + occluders       redundant rasterisation Phase 3C HiZ occlusion
 - [x] Reflection-aware two-pass draw for mirrored placements
 - [x] Backface culling (user-toggleable, default on)
 - [x] `reorient-shells` enabled in iterator
-- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`, `IFC_LOD1_PX`)
+- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`, `IFC_LOD1_PX`, `IFC_NO_HIZ`, `IFC_HIZ_SIZE`)
 - [x] Phase 3A — screen-space contribution culling
 - [x] Phase 3B — distance / contribution LOD (meshoptimizer `simplifySloppy`)
-- [ ] **Phase 3C — Hierarchical-Z occlusion culling** (next)
-- [ ] Phase 3D — GPU-side compute-shader culling
+- [x] Phase 3C — Hierarchical-Z occlusion culling (v1, CPU-side readback)
+- [ ] **Phase 3D — GPU-side compute-shader culling** (next; replaces the readback)
 - [ ] Vulkan/MoltenVK backend for macOS
 - [ ] Embedded Python scripting console
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 2606ffd3f3f..fdfff63997c 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -343,6 +343,10 @@ ViewportWindow::~ViewportWindow() {
             if (pick_fbo_)      gl_->glDeleteFramebuffers(1, &pick_fbo_);
             if (pick_color_tex_) gl_->glDeleteTextures(1, &pick_color_tex_);
             if (pick_depth_rbo_) gl_->glDeleteRenderbuffers(1, &pick_depth_rbo_);
+            if (hiz_fbo_)         gl_->glDeleteFramebuffers(1, &hiz_fbo_);
+            if (hiz_depth_tex_)   gl_->glDeleteTextures(1, &hiz_depth_tex_);
+            if (hiz_resolve_fbo_) gl_->glDeleteFramebuffers(1, &hiz_resolve_fbo_);
+            if (hiz_resolve_depth_tex_) gl_->glDeleteTextures(1, &hiz_resolve_depth_tex_);
         }
         context_->doneCurrent();
     }
@@ -821,6 +825,240 @@ void ViewportWindow::removeModel(uint32_t model_id) {
 
 void ViewportWindow::setSelectedObjectId(uint32_t id) { selected_object_id_ = id; }
 
+// --- HiZ occlusion culling (Phase 3C) -----------------------------------
+
+// Baseline HiZ width.  256x128 (~128 KB readback/frame) is enough to cull
+// big occluders (walls, slabs) reliably; finer detail doesn't help much as
+// the pyramid is sampled at the mip where the AABB's rect is ~2 texels.
+// IFC_HIZ_SIZE=<N> overrides the width (height tracks aspect), clamped to
+// [64, 4096] so a bad value can't overflow the int size math downstream.
+static int hizBaseWidth() {
+    static const int w = []{   // env var is parsed once, on first call
+        const char* e = std::getenv("IFC_HIZ_SIZE");
+        return (e && *e) ? std::min(4096, std::max(64, std::atoi(e))) : 256;
+    }();
+    return w;
+}
+
+static bool hizEnabled() {
+    static const bool enabled = []{  // sampled once per process
+        const char* v = std::getenv("IFC_NO_HIZ");
+        return v == nullptr || v[0] != '1';  // only a leading '1' disables HiZ
+    }();
+    return enabled;
+}
+
+void ViewportWindow::buildHizPyramid() {
+    // Snapshot this frame's depth into a CPU max-mip pyramid for next frame's cull.
+    if (!gl_initialized_) return;
+
+    const int win_w = width()  * devicePixelRatio();
+    const int win_h = height() * devicePixelRatio();
+    if (win_w <= 0 || win_h <= 0) return;
+
+    const int base_w = hizBaseWidth();
+    const int base_h = std::max(1, (base_w * win_h) / win_w);
+
+    // Depth format must match the default FBO's depth format for the blit
+    // to succeed — GL spec requires identical internal formats for depth
+    // blits.  Qt's default surface uses 24-bit depth (setDepthBufferSize(24)
+    // in initGL), so we match with DEPTH_COMPONENT24 on both textures.
+    // Resolve target (full window size, single sample): needed because GL
+    // also forbids scale-blitting from an MSAA source — 1:1 first, then down.
+    if (win_w != hiz_resolve_w_ || win_h != hiz_resolve_h_) {
+        if (hiz_resolve_fbo_)        gl_->glDeleteFramebuffers(1, &hiz_resolve_fbo_);
+        if (hiz_resolve_depth_tex_)  gl_->glDeleteTextures(1, &hiz_resolve_depth_tex_);
+        gl_->glCreateTextures(GL_TEXTURE_2D, 1, &hiz_resolve_depth_tex_);
+        gl_->glTextureStorage2D(hiz_resolve_depth_tex_, 1,
+                                GL_DEPTH_COMPONENT24, win_w, win_h);
+        gl_->glCreateFramebuffers(1, &hiz_resolve_fbo_);
+        gl_->glNamedFramebufferTexture(hiz_resolve_fbo_, GL_DEPTH_ATTACHMENT,
+                                       hiz_resolve_depth_tex_, 0);
+        hiz_resolve_w_ = win_w;
+        hiz_resolve_h_ = win_h;
+    }
+
+    if (base_w != hiz_base_w_ || base_h != hiz_base_h_) {
+        if (hiz_fbo_)       gl_->glDeleteFramebuffers(1, &hiz_fbo_);
+        if (hiz_depth_tex_) gl_->glDeleteTextures(1, &hiz_depth_tex_);
+        gl_->glCreateTextures(GL_TEXTURE_2D, 1, &hiz_depth_tex_);
+        gl_->glTextureStorage2D(hiz_depth_tex_, 1, GL_DEPTH_COMPONENT24,
+                                base_w, base_h);
+        gl_->glCreateFramebuffers(1, &hiz_fbo_);
+        gl_->glNamedFramebufferTexture(hiz_fbo_, GL_DEPTH_ATTACHMENT,
+                                       hiz_depth_tex_, 0);
+
+        hiz_base_w_ = base_w;
+        hiz_base_h_ = base_h;
+        hiz_depth_readback_.assign(base_w * base_h, 1.0f);
+
+        // Build the mip-offset table.  Level 0 = base_w x base_h.
+        hiz_mip_offset_.clear();
+        hiz_mip_w_.clear();
+        hiz_mip_h_.clear();
+        uint32_t off = 0;
+        int mw = base_w, mh = base_h;
+        while (mw >= 1 && mh >= 1) {
+            hiz_mip_offset_.push_back(off);
+            hiz_mip_w_.push_back(static_cast<uint32_t>(mw));
+            hiz_mip_h_.push_back(static_cast<uint32_t>(mh));
+            off += static_cast<uint32_t>(mw) * static_cast<uint32_t>(mh);
+            if (mw == 1 && mh == 1) break;
+            mw = std::max(1, mw / 2);
+            mh = std::max(1, mh / 2);
+        }
+        hiz_pyramid_.assign(off, 1.0f);
+    }
+
+    // Two-step: MSAA default-fb → full-size SS resolve, then SS → down-scaled
+    // (GL forbids scaling a blit whose source is multisampled).  NOTE(review):
+    // the GL_NEAREST down-blit point-samples, so level 0 is not a true max.
+    gl_->glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+    gl_->glBindFramebuffer(GL_DRAW_FRAMEBUFFER, hiz_resolve_fbo_);
+    gl_->glBlitFramebuffer(0, 0, win_w, win_h,
+                           0, 0, win_w, win_h,
+                           GL_DEPTH_BUFFER_BIT, GL_NEAREST);
+
+    gl_->glBindFramebuffer(GL_READ_FRAMEBUFFER, hiz_resolve_fbo_);
+    gl_->glBindFramebuffer(GL_DRAW_FRAMEBUFFER, hiz_fbo_);
+    gl_->glBlitFramebuffer(0, 0, win_w, win_h,
+                           0, 0, hiz_base_w_, hiz_base_h_,
+                           GL_DEPTH_BUFFER_BIT, GL_NEAREST);
+    gl_->glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+    gl_->glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+
+    // Fail open on any GL error: an unwritten depth texture would read back
+    // as garbage and could cull the whole scene.  Warn only a few times.
+    static int err_warn_budget = 3;
+    const GLenum e = gl_->glGetError();
+    if (e != GL_NO_ERROR) {
+        if (err_warn_budget > 0) {
+            --err_warn_budget;
+            qWarning("HiZ blit/readback GL error 0x%04x (win %dx%d → %dx%d → %dx%d)",
+                     e, win_w, win_h, win_w, win_h, hiz_base_w_, hiz_base_h_);
+        }
+        hiz_vp_valid_ = false;
+        return;
+    }
+
+    // Synchronous readback into level 0 of the pyramid.  At 256x128 this is
+    // ~128 KB and fast enough not to matter in practice; PBO-ring async was
+    // tried and made orbiting flicker worse (2-frame-stale depth vs 1-frame).
+    gl_->glGetTextureImage(hiz_depth_tex_, 0, GL_DEPTH_COMPONENT, GL_FLOAT,
+                           static_cast<GLsizei>(hiz_depth_readback_.size() * sizeof(float)),
+                           hiz_depth_readback_.data());
+
+    // Level 0 is the readback; each coarser texel = max of every parent texel it overlaps.
+    std::memcpy(hiz_pyramid_.data() + hiz_mip_offset_[0],
+                hiz_depth_readback_.data(),
+                hiz_depth_readback_.size() * sizeof(float));
+    for (size_t lvl = 1; lvl < hiz_mip_offset_.size(); ++lvl) {
+        const uint32_t pw = hiz_mip_w_[lvl - 1];
+        const uint32_t ph = hiz_mip_h_[lvl - 1];
+        const uint32_t cw = hiz_mip_w_[lvl];
+        const uint32_t ch = hiz_mip_h_[lvl];
+        const float* parent = hiz_pyramid_.data() + hiz_mip_offset_[lvl - 1];
+        float* child  = hiz_pyramid_.data() + hiz_mip_offset_[lvl];
+        for (uint32_t y = 0; y < ch; ++y) {
+            const uint32_t py0 = (y * ph) / ch;                 // first parent row overlapped
+            const uint32_t py1 = ((y + 1) * ph + ch - 1) / ch;  // one past the last (ceil)
+            for (uint32_t x = 0; x < cw; ++x) {
+                const uint32_t px0 = (x * pw) / cw;             // same footprint rule in x, so
+                const uint32_t px1 = ((x + 1) * pw + cw - 1) / cw;  // odd sizes lose no texel
+                float deepest = parent[py0 * pw + px0];
+                for (uint32_t py = py0; py < py1; ++py)
+                    for (uint32_t px = px0; px < px1; ++px)
+                        deepest = std::max(deepest, parent[py * pw + px]);
+                child[y * cw + x] = deepest;
+            }
+        }
+    }
+
+    hiz_vp_ = proj_matrix_ * view_matrix_;
+    hiz_vp_valid_ = true;
+}
+
+bool ViewportWindow::aabbOccludedByHiz(const float mn[3], const float mx[3]) const {
+    if (!hiz_vp_valid_ || hiz_pyramid_.empty()) return false;  // no usable pyramid → keep
+
+    // Project all 8 corners through the VP stored by buildHizPyramid().
+    // Track NDC min/max over x and y, and the min over z.  If any corner has
+    // w <= 1e-4 the AABB reaches the near plane and we report "not occluded".
+    float sx_min =  std::numeric_limits<float>::infinity();
+    float sx_max = -std::numeric_limits<float>::infinity();
+    float sy_min =  std::numeric_limits<float>::infinity();
+    float sy_max = -std::numeric_limits<float>::infinity();
+    float sz_min =  std::numeric_limits<float>::infinity();
+    const float* vp = hiz_vp_.constData();  // column-major
+    for (int c = 0; c < 8; ++c) {
+        const float x = (c & 1) ? mx[0] : mn[0];
+        const float y = (c & 2) ? mx[1] : mn[1];
+        const float z = (c & 4) ? mx[2] : mn[2];
+        const float cx = vp[0]*x + vp[4]*y + vp[8]*z  + vp[12];
+        const float cy = vp[1]*x + vp[5]*y + vp[9]*z  + vp[13];
+        const float cz = vp[2]*x + vp[6]*y + vp[10]*z + vp[14];
+        const float cw = vp[3]*x + vp[7]*y + vp[11]*z + vp[15];
+        if (cw <= 1e-4f) return false;  // corner at/behind the eye plane → keep
+        const float inv = 1.0f / cw;
+        const float nx = cx * inv;
+        const float ny = cy * inv;
+        const float nz = cz * inv;
+        if (nx < sx_min) sx_min = nx;  if (nx > sx_max) sx_max = nx;
+        if (ny < sy_min) sy_min = ny;  if (ny > sy_max) sy_max = ny;
+        if (nz < sz_min) sz_min = nz;
+    }
+
+    if (sx_max < -1.0f || sx_min > 1.0f ||
+        sy_max < -1.0f || sy_min > 1.0f) return false;  // rect entirely off-screen → keep
+    if (sz_min < -1.0f) return false;  // nearest corner is in front of NDC z = -1 → keep
+
+    sx_min = std::max(sx_min, -1.0f);
+    sx_max = std::min(sx_max,  1.0f);
+    sy_min = std::max(sy_min, -1.0f);
+    sy_max = std::min(sy_max,  1.0f);
+
+    const float u_min = 0.5f * (sx_min + 1.0f);  // NDC [-1,1] → [0,1]
+    const float u_max = 0.5f * (sx_max + 1.0f);
+    const float v_min = 0.5f * (sy_min + 1.0f);
+    const float v_max = 0.5f * (sy_max + 1.0f);
+    const float aabb_near_depth = 0.5f * (sz_min + 1.0f);  // assumes default [0,1] depth range — TODO confirm
+
+    // Pick mip level where the projected rect covers at most 2 texels on
+    // each axis; sample the max over the covered texels there.
+    const float px_w = (u_max - u_min) * static_cast<float>(hiz_base_w_);
+    const float px_h = (v_max - v_min) * static_cast<float>(hiz_base_h_);
+    int mip = 0;
+    while ((int)hiz_mip_offset_.size() - 1 > mip &&
+           ((px_w / (1 << mip)) > 2.0f || (px_h / (1 << mip)) > 2.0f)) {
+        ++mip;
+    }
+
+    const uint32_t mw = hiz_mip_w_[mip];
+    const uint32_t mh = hiz_mip_h_[mip];
+    int x0 = static_cast<int>(std::floor(u_min * mw));  // half-open texel range [x0, x1)
+    int x1 = static_cast<int>(std::ceil (u_max * mw));
+    int y0 = static_cast<int>(std::floor(v_min * mh));
+    int y1 = static_cast<int>(std::ceil (v_max * mh));
+    if (x0 < 0) x0 = 0;
+    if (y0 < 0) y0 = 0;
+    if (x1 > (int)mw) x1 = mw;
+    if (y1 > (int)mh) y1 = mh;
+    if (x1 <= x0 || y1 <= y0) return false;  // empty footprint → keep
+
+    const float* level = hiz_pyramid_.data() + hiz_mip_offset_[mip];
+    float hiz_max = 0.0f;  // largest stored depth under the footprint
+    for (int y = y0; y < y1; ++y) {
+        const float* row = level + static_cast<size_t>(y) * mw;
+        for (int x = x0; x < x1; ++x) {
+            if (row[x] > hiz_max) hiz_max = row[x];
+        }
+    }
+
+    // AABB's closest point must be strictly farther than everything drawn
+    // in the region for it to be fully occluded.
+    return aabb_near_depth > hiz_max;
+}
+
 uint32_t ViewportWindow::pickObjectAt(int x, int y) {
     if (!gl_initialized_) return 0;
     context_->makeCurrent(this);
@@ -936,10 +1174,19 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
                            : std::numeric_limits<float>::infinity();
     };
 
+    // HiZ occlusion is skipped entirely when the pick pass runs
+    // (min_pixel_radius == 0 on that path), when the user disables it via
+    // env var, or before the first pyramid has been built.
+    const bool hiz_on = hizEnabled() && min_pixel_radius > 0.0f && hiz_vp_valid_;
+
     auto test_and_push = [&](uint32_t inst_idx) {
         const InstanceCpu& inst = m.instances[inst_idx];
         if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
         if (!contributionPasses(inst.world_aabb_min, inst.world_aabb_max)) return;
+        if (hiz_on && aabbOccludedByHiz(inst.world_aabb_min, inst.world_aabb_max)) {
+            ++hiz_reject_count_;
+            return;
+        }
         if (inst.mesh_id >= m.meshes.size()) return;
         const MeshInfo& mesh = m.meshes[inst.mesh_id];
         const bool want_lod1 = mesh.lod1_index_count > 0 &&
@@ -966,6 +1213,12 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
             // Contribution cull the whole subtree: if the node's enclosing
             // sphere is below threshold, every child is too.
             if (!contributionPasses(n.aabb_min, n.aabb_max)) continue;
+            // HiZ cull the whole subtree: if the node AABB is fully
+            // occluded, every leaf is too.  The conservative test (AABB
+            // near-depth vs max pyramid depth) never rejects a visible
+            // parent wrongly even when some children could have peeked
+            // through.
+            if (hiz_on && aabbOccludedByHiz(n.aabb_min, n.aabb_max)) continue;
             if (n.count > 0) {
                 for (uint32_t k = 0; k < n.count; ++k) {
                     uint32_t item_idx = m.bvh.item_indices[n.right_or_first + k];
@@ -1108,6 +1361,7 @@ void ViewportWindow::render() {
     visible_objects_ = 0;
     gl_draw_calls_ = 0;
     indirect_sub_draws_ = 0;
+    hiz_reject_count_ = 0;
 
     // Start each frame with CCW-is-front; the two-pass draw below flips
     // back and forth.  Harmless when culling is off.
@@ -1178,6 +1432,13 @@ void ViewportWindow::render() {
 
     renderAxisGizmo();
 
+    // Build HiZ from this frame's resolved depth for next frame's cull.
+    // Synchronous glGetTextureImage inside — cost ~0.5 ms at 256x128 on a
+    // mid-range dGPU.  Skippable via IFC_NO_HIZ=1.
+    if (hizEnabled()) {
+        buildHizPyramid();
+    }
+
     context_->swapBuffers(this);
 
     float dt = frame_clock_.restart() / 1000.0f;
@@ -1215,12 +1476,13 @@ void ViewportWindow::render() {
         emit frameStatsUpdated(stats);
 
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
-               "meshes %u  gl_draws %u  sub_draws %u  "
+               "meshes %u  gl_draws %u  sub_draws %u  hiz_rej %u  "
                "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
                visible_objects_, total_obj,
                visible_triangles_, total_tri,
                total_meshes, gl_draw_calls_, indirect_sub_draws_,
+               hiz_reject_count_,
                (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
                total_vbo / (1024.0*1024.0),
                total_ebo / (1024.0*1024.0),
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index fe54cce9210..fd584bb4cb8 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -172,6 +172,20 @@ class ViewportWindow : public QWindow {
     void buildShaders();
     void buildAxisGizmo();
     void setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo);
+
+    // Resolve the default framebuffer's MSAA depth into a single-sample
+    // texture, read it back, and max-reduce a mip pyramid on the CPU.  The
+    // resulting pyramid is stored in hiz_pyramid_ along with the VP matrix
+    // used to draw it; next frame's cullAndUploadVisible can test AABBs
+    // against it.  Synchronous readback — at 256×128 the cost is sub-ms
+    // and not a measured bottleneck; Phase 3D's compute-shader cull will
+    // eliminate the readback entirely.
+    void buildHizPyramid();
+
+    // True if the AABB is fully occluded by the previous frame's depth.
+    // Returns false when the HiZ is invalid, the AABB crosses the near
+    // plane, or the projection falls outside NDC.
+    bool aabbOccludedByHiz(const float mn[3], const float mx[3]) const;
     bool growModelVbo(ModelGpuData& m, size_t needed_total);
     bool growModelEbo(ModelGpuData& m, size_t needed_total);
     bool growModelSsbo(ModelGpuData& m, size_t needed_total);
@@ -219,6 +233,32 @@ class ViewportWindow : public QWindow {
     int pick_width_ = 0;
     int pick_height_ = 0;
 
+    // HiZ occlusion culling (Phase 3C).
+    //
+    // Each frame after the main draw we blit the MSAA depth buffer down
+    // into a single-sample depth texture (hiz_fbo_ / hiz_depth_tex_), then
+    // glReadPixels it into hiz_depth_readback_.  We max-reduce that into a
+    // mip pyramid (hiz_pyramid_) and remember the VP matrix used
+    // (hiz_vp_ + hiz_vp_valid_) so next frame's cull can test AABBs
+    // against a slightly-stale depth.  Skipped for the pick pass and when
+    // IFC_NO_HIZ=1.
+    GLuint hiz_fbo_ = 0;
+    GLuint hiz_depth_tex_ = 0;
+    GLuint hiz_resolve_fbo_ = 0;         // full-size single-sample resolve
+    GLuint hiz_resolve_depth_tex_ = 0;
+    int    hiz_resolve_w_ = 0;
+    int    hiz_resolve_h_ = 0;
+    int    hiz_base_w_ = 0;
+    int    hiz_base_h_ = 0;
+    std::vector<float>    hiz_depth_readback_;   // hiz_base_w_ * hiz_base_h_ floats
+    std::vector<float>    hiz_pyramid_;          // concatenated mip levels
+    std::vector<uint32_t> hiz_mip_offset_;       // into hiz_pyramid_
+    std::vector<uint32_t> hiz_mip_w_;
+    std::vector<uint32_t> hiz_mip_h_;
+    QMatrix4x4            hiz_vp_;
+    bool                  hiz_vp_valid_ = false;
+    uint32_t              hiz_reject_count_ = 0;  // per-frame stat
+
     // Per-frame stats
     uint32_t visible_triangles_ = 0;
     uint32_t visible_objects_ = 0;

From cc680df236e1cb3d79a3baf55157a5d4f44dd300 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Tue, 14 Apr 2026 20:33:21 +1000
Subject: [PATCH 28/37] Cull: read AABBs from compact bvh_items in the hot path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cullAndUploadVisible was reading each instance's AABB through
m.instances[idx] — a 104-byte InstanceCpu struct — for the frustum /
contribution / HiZ tests.  Only 24 of those bytes (the two float[3]
AABBs) are actually used by the tests; the rest (4×4 transform +
header) is pure cache-line waste, and with 569k instances the array
is 59 MB, well past any cache.

bvh_items[idx] already stores a 1:1 compact 28-byte record with the
same AABB, built unconditionally in buildBvhForModel().  Switch the
hot test path to read from it, and only touch InstanceCpu once an
instance has passed all three tests (for mesh_id).  Modest ~20 %
drop in cull-traverse time on a 569k-object overview (26 ms → 21 ms).

Also add four cull-phase timers (clr / trv / emt / upl) to the
per-second stats line so future optimisation work has concrete
numbers to chase.  Confirmed via these timers that bucket clears,
emit and GPU upload are all <1 ms combined; traversal is where the
remaining CPU cost lives.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 43 +++++++++++++++++++++++++++-----
 src/ifcviewer/ViewportWindow.h   | 10 ++++++++
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index fdfff63997c..a85b33bae40 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -1097,6 +1097,9 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     // the right thing for both.  LOD split means instances that want the
     // decimated mesh go into a different bucket that emits against
     // mesh.lod1_ebo_byte_offset / lod1_index_count.
+    QElapsedTimer phase_timer;
+    phase_timer.start();
+
     auto resize_if = [&](std::vector<std::vector<uint32_t>>& v) {
         if (v.size() < m.meshes.size()) v.resize(m.meshes.size());
     };
@@ -1110,6 +1113,8 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         visible_by_mesh_rev_lod0_[i].clear();
         visible_by_mesh_rev_lod1_[i].clear();
     }
+    cull_clear_ns_ += phase_timer.nsecsElapsed();
+    phase_timer.restart();
 
     // LOD1 switches in when projected sphere radius (in pixels) drops below
     // this threshold.  Overridable for tuning.  Set to 0 to disable LOD1
@@ -1179,19 +1184,26 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     // env var, or before the first pyramid has been built.
     const bool hiz_on = hizEnabled() && min_pixel_radius > 0.0f && hiz_vp_valid_;
 
+    // Hot path: read the AABB from the compact bvh_items array (28 B stride)
+    // rather than the wide InstanceCpu (104 B stride).  Most instances fail
+    // frustum or contribution, so we want to avoid touching the wider struct
+    // until a survivor needs its mesh_id.  This alone turns the cull from
+    // cache-miss-per-instance into stream-friendly linear reads.
     auto test_and_push = [&](uint32_t inst_idx) {
-        const InstanceCpu& inst = m.instances[inst_idx];
-        if (!aabbInFrustum(inst.world_aabb_min, inst.world_aabb_max, planes)) return;
-        if (!contributionPasses(inst.world_aabb_min, inst.world_aabb_max)) return;
-        if (hiz_on && aabbOccludedByHiz(inst.world_aabb_min, inst.world_aabb_max)) {
+        const BvhItem& item = m.bvh_items[inst_idx];
+        if (!aabbInFrustum(item.aabb_min, item.aabb_max, planes)) return;
+        if (!contributionPasses(item.aabb_min, item.aabb_max)) return;
+        if (hiz_on && aabbOccludedByHiz(item.aabb_min, item.aabb_max)) {
             ++hiz_reject_count_;
             return;
         }
+        // Survivor — now pay the wide-struct fetch for mesh_id.
+        const InstanceCpu& inst = m.instances[inst_idx];
         if (inst.mesh_id >= m.meshes.size()) return;
         const MeshInfo& mesh = m.meshes[inst.mesh_id];
         const bool want_lod1 = mesh.lod1_index_count > 0 &&
             lod1_px_threshold > 0.0f &&
-            pixelRadius(inst.world_aabb_min, inst.world_aabb_max) < lod1_px_threshold;
+            pixelRadius(item.aabb_min, item.aabb_max) < lod1_px_threshold;
         const bool reflected = inst_idx < m.instance_reflected.size()
             && m.instance_reflected[inst_idx] != 0;
         auto& bucket =
@@ -1236,6 +1248,8 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     } else {
         for (uint32_t i = 0; i < m.instances.size(); ++i) test_and_push(i);
     }
+    cull_traverse_ns_ += phase_timer.nsecsElapsed();
+    phase_timer.restart();
 
     // Flatten fwd-slice first (LOD0 then LOD1), then rev-slice (ditto), into
     // visible_flat_.  Commands for the fwd slice fill [0, indirect_forward_count),
@@ -1274,6 +1288,8 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     emit_slice(visible_by_mesh_rev_lod0_, 0);
     emit_slice(visible_by_mesh_rev_lod1_, 1);
     m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
+    cull_emit_ns_ += phase_timer.nsecsElapsed();
+    phase_timer.restart();
 
     // Upload visible list (keep binding alive even when empty).
     size_t vis_bytes = std::max<size_t>(visible_flat_.size() * sizeof(uint32_t),
@@ -1293,7 +1309,10 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
 
     // Upload indirect command buffer.
     size_t ind_bytes = indirect_scratch_.size() * sizeof(DrawElementsIndirectCommand);
-    if (ind_bytes == 0) return;
+    if (ind_bytes == 0) {
+        cull_upload_ns_ += phase_timer.nsecsElapsed();
+        return;
+    }
     if (m.indirect_buffer == 0 || m.indirect_capacity < ind_bytes) {
         if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
         size_t new_cap = m.indirect_capacity ? m.indirect_capacity : 4096;
@@ -1303,6 +1322,7 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         m.indirect_capacity = new_cap;
     }
     gl_->glNamedBufferSubData(m.indirect_buffer, 0, ind_bytes, indirect_scratch_.data());
+    cull_upload_ns_ += phase_timer.nsecsElapsed();
 }
 
 void ViewportWindow::updateCamera() {
@@ -1446,6 +1466,7 @@ void ViewportWindow::render() {
     frame_count_++;
     if (accumulated_time_ >= 1.0f) {
         last_fps_ = static_cast<float>(frame_count_) / accumulated_time_;
+        const uint32_t frames_in_window = static_cast<uint32_t>(frame_count_);
         frame_count_ = 0;
         accumulated_time_ = 0.0f;
 
@@ -1475,14 +1496,24 @@ void ViewportWindow::render() {
         stats.indirect_sub_draws = indirect_sub_draws_;
         emit frameStatsUpdated(stats);
 
+        const double inv_frames = frames_in_window > 0
+            ? 1.0 / static_cast<double>(frames_in_window) : 0.0;
+        const double clr_ms = cull_clear_ns_    * 1e-6 * inv_frames;
+        const double trv_ms = cull_traverse_ns_ * 1e-6 * inv_frames;
+        const double emt_ms = cull_emit_ns_     * 1e-6 * inv_frames;
+        const double upl_ms = cull_upload_ns_   * 1e-6 * inv_frames;
+        cull_clear_ns_ = cull_traverse_ns_ = cull_emit_ns_ = cull_upload_ns_ = 0;
+
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
                "meshes %u  gl_draws %u  sub_draws %u  hiz_rej %u  "
+               "cull[clr %.2f trv %.2f emt %.2f upl %.2f]ms  "
                "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
                visible_objects_, total_obj,
                visible_triangles_, total_tri,
                total_meshes, gl_draw_calls_, indirect_sub_draws_,
                hiz_reject_count_,
+               clr_ms, trv_ms, emt_ms, upl_ms,
                (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
                total_vbo / (1024.0*1024.0),
                total_ebo / (1024.0*1024.0),
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index fd584bb4cb8..5d22f892884 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -259,6 +259,16 @@ class ViewportWindow : public QWindow {
     bool                  hiz_vp_valid_ = false;
     uint32_t              hiz_reject_count_ = 0;  // per-frame stat
 
+    // Cull-phase timers.  Accumulated across all frames in the current
+    // 1-second stats window; divided by that window's frame count at
+    // print time to give per-frame average ms.  Reset each window.  Shows
+    // where CPU time actually goes: bucket clears vs BVH traversal vs
+    // emit vs GPU upload.
+    uint64_t cull_clear_ns_    = 0;
+    uint64_t cull_traverse_ns_ = 0;
+    uint64_t cull_emit_ns_     = 0;
+    uint64_t cull_upload_ns_   = 0;
+
     // Per-frame stats
     uint32_t visible_triangles_ = 0;
     uint32_t visible_objects_ = 0;

From a17399212969e7e23249cf3736a60ea92b892c39 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Tue, 14 Apr 2026 20:50:58 +1000
Subject: [PATCH 29/37] Cull: skip cullAndUploadVisible + HiZ on still frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

render() was re-running the full cull every 16 ms timer tick even when
nothing had changed — the camera matrices, scene state, and therefore
visible set were all identical to the previous frame's.  The GPU was
still happy to redraw from the cached indirect buffer, but the CPU was
burning 21 ms/frame rebuilding the same visible list.

Detect the no-op case by comparing view/proj against last_cull_view_ /
last_cull_proj_ and checking a scene-dirty flag (have_cached_cull_)
that every mutator on models_gpu_ invalidates — finalizeModel,
applyCachedModel, applyLodExtension, hide/show/remove/reset, and
uploadInstanceChunk.  When the check passes we skip both
cullAndUploadVisible and buildHizPyramid (the depth buffer is
bit-identical, so re-reading it produces the same pyramid).

Per-model visible_objects / visible_triangles stats now live on
ModelGpuData so the stats line reports correct numbers on skipped
frames instead of reading from a stale indirect_scratch_.

Measured on a 569k-object overview: still frames go 22 fps → 62 fps;
orbiting goes 23 fps → ~30-50 fps depending on how hard you move the
mouse (the cull only pays its full cost on the ~25 % of frames where
the camera actually moved).  The stats line gains a "skipped N/M"
field so you can see the ratio live.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 72 +++++++++++++++++++++++++++-----
 src/ifcviewer/ViewportWindow.h   | 16 +++++++
 2 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index a85b33bae40..2070467362f 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -612,6 +612,7 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     if (chunk.local_mesh_id < m.meshes.size()) {
         m.total_triangles += m.meshes[chunk.local_mesh_id].index_count / 3;
     }
+    have_cached_cull_ = false;
 }
 
 void ViewportWindow::finalizeModel(uint32_t model_id) {
@@ -635,6 +636,7 @@ void ViewportWindow::finalizeModel(uint32_t model_id) {
     buildBvhForModel(m, model_id);
 
     m.finalized = true;
+    have_cached_cull_ = false;
 
     const size_t ssbo_bytes = m.ssbo_instance_count * sizeof(InstanceGpu);
     qDebug("Model %u finalized: %zu verts, %zu meshes, %zu instances, %.1f MB vram "
@@ -743,6 +745,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
 
     m.finalized = true;
     models_gpu_.emplace(model_id, std::move(m));
+    have_cached_cull_ = false;
 
     qDebug("Sidecar apply: model %u  %zu verts, %zu meshes, %zu instances  "
            "%.1f MB vram (vbo %.1f + ebo %.1f + ssbo %.1f)",
@@ -766,6 +769,7 @@ void ViewportWindow::applyLodExtension(uint32_t model_id, const SidecarData& sd)
         // buildLods didn't add anything; just refresh the meshes vector in
         // case lod1_* fields were touched.
         m.meshes = sd.meshes;
+        have_cached_cull_ = false;
         return;
     }
 
@@ -781,6 +785,7 @@ void ViewportWindow::applyLodExtension(uint32_t model_id, const SidecarData& sd)
 
     // Replace mesh metadata so cullAndUploadVisible sees the new lod1_ fields.
     m.meshes = sd.meshes;
+    have_cached_cull_ = false;
 }
 
 void ViewportWindow::resetScene() {
@@ -796,16 +801,23 @@ void ViewportWindow::resetScene() {
     }
     models_gpu_.clear();
     selected_object_id_ = 0;
+    have_cached_cull_ = false;
 }
 
 void ViewportWindow::hideModel(uint32_t model_id) {
     auto it = models_gpu_.find(model_id);
-    if (it != models_gpu_.end()) it->second.hidden = true;
+    if (it != models_gpu_.end()) {
+        it->second.hidden = true;
+        have_cached_cull_ = false;
+    }
 }
 
 void ViewportWindow::showModel(uint32_t model_id) {
     auto it = models_gpu_.find(model_id);
-    if (it != models_gpu_.end()) it->second.hidden = false;
+    if (it != models_gpu_.end()) {
+        it->second.hidden = false;
+        have_cached_cull_ = false;
+    }
 }
 
 void ViewportWindow::removeModel(uint32_t model_id) {
@@ -820,6 +832,7 @@ void ViewportWindow::removeModel(uint32_t model_id) {
         if (it->second.visible_ssbo) gl_->glDeleteBuffers(1, &it->second.visible_ssbo);
         if (it->second.indirect_buffer) gl_->glDeleteBuffers(1, &it->second.indirect_buffer);
         models_gpu_.erase(it);
+        have_cached_cull_ = false;
     }
 }
 
@@ -1288,6 +1301,17 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     emit_slice(visible_by_mesh_rev_lod0_, 0);
     emit_slice(visible_by_mesh_rev_lod1_, 1);
     m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
+
+    // Per-model stats snapshot — summed into the frame counters regardless
+    // of whether this frame ran a full cull or reused the cached one.
+    uint32_t model_vis_obj = 0, model_vis_tri = 0;
+    for (const auto& cmd : indirect_scratch_) {
+        model_vis_tri += (cmd.count / 3) * cmd.instanceCount;
+        model_vis_obj += cmd.instanceCount;
+    }
+    m.cached_visible_objects   = model_vis_obj;
+    m.cached_visible_triangles = model_vis_tri;
+
     cull_emit_ns_ += phase_timer.nsecsElapsed();
     phase_timer.restart();
 
@@ -1381,7 +1405,23 @@ void ViewportWindow::render() {
     visible_objects_ = 0;
     gl_draw_calls_ = 0;
     indirect_sub_draws_ = 0;
-    hiz_reject_count_ = 0;
+    // Only reset hiz_reject_count_ on frames where we actually re-cull;
+    // otherwise we'd wipe the previous cull's number and print 0 every
+    // still frame.  See the cull_this_frame branch below.
+
+    // Decide whether this frame's view+scene is identical to the last
+    // successful cull.  If so the per-model indirect buffers / visible
+    // SSBOs are still valid — we just re-issue the draws from them and
+    // skip the expensive cull traversal entirely.
+    const bool camera_unchanged = have_cached_cull_
+        && last_cull_view_ == view_matrix_
+        && last_cull_proj_ == proj_matrix_;
+    const bool cull_this_frame = !camera_unchanged;
+    if (cull_this_frame) {
+        hiz_reject_count_ = 0;
+    } else {
+        ++cull_skipped_frames_;
+    }
 
     // Start each frame with CCW-is-front; the two-pass draw below flips
     // back and forth.  Harmless when culling is off.
@@ -1390,7 +1430,9 @@ void ViewportWindow::render() {
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
-        cullAndUploadVisible(m, planes, focal_px, min_pixel_radius);
+        if (cull_this_frame) {
+            cullAndUploadVisible(m, planes, focal_px, min_pixel_radius);
+        }
         if (m.indirect_command_count == 0) continue;
 
         gl_->glBindVertexArray(m.vao);
@@ -1442,20 +1484,25 @@ void ViewportWindow::render() {
             gl_->glFrontFace(GL_CCW);
         }
 
-        for (const auto& cmd : indirect_scratch_) {
-            visible_triangles_ += (cmd.count / 3) * cmd.instanceCount;
-            visible_objects_   += cmd.instanceCount;
-        }
+        visible_triangles_  += m.cached_visible_triangles;
+        visible_objects_    += m.cached_visible_objects;
         indirect_sub_draws_ += m.indirect_command_count;
     }
+    if (cull_this_frame) {
+        last_cull_view_     = view_matrix_;
+        last_cull_proj_     = proj_matrix_;
+        have_cached_cull_   = true;
+    }
     gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
 
     renderAxisGizmo();
 
     // Build HiZ from this frame's resolved depth for next frame's cull.
     // Synchronous glReadPixels inside — cost ~0.5 ms at 256x128 on a
-    // mid-range dGPU.  Skippable via IFC_NO_HIZ=1.
-    if (hizEnabled()) {
+    // mid-range dGPU.  Skippable via IFC_NO_HIZ=1.  Also skipped on
+    // still frames: if we didn't re-cull, the depth buffer is
+    // bit-identical to the one we already turned into a pyramid.
+    if (hizEnabled() && cull_this_frame) {
         buildHizPyramid();
     }
 
@@ -1503,10 +1550,12 @@ void ViewportWindow::render() {
         const double emt_ms = cull_emit_ns_     * 1e-6 * inv_frames;
         const double upl_ms = cull_upload_ns_   * 1e-6 * inv_frames;
         cull_clear_ns_ = cull_traverse_ns_ = cull_emit_ns_ = cull_upload_ns_ = 0;
+        const uint32_t skipped = cull_skipped_frames_;
+        cull_skipped_frames_ = 0;
 
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
                "meshes %u  gl_draws %u  sub_draws %u  hiz_rej %u  "
-               "cull[clr %.2f trv %.2f emt %.2f upl %.2f]ms  "
+               "cull[clr %.2f trv %.2f emt %.2f upl %.2f]ms  skipped %u/%u  "
                "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
                visible_objects_, total_obj,
@@ -1514,6 +1563,7 @@ void ViewportWindow::render() {
                total_meshes, gl_draw_calls_, indirect_sub_draws_,
                hiz_reject_count_,
                clr_ms, trv_ms, emt_ms, upl_ms,
+               skipped, frames_in_window,
                (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
                total_vbo / (1024.0*1024.0),
                total_ebo / (1024.0*1024.0),
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 5d22f892884..26c6d20b588 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -79,6 +79,13 @@ struct ModelGpuData {
     std::vector<uint8_t>     instance_reflected;
     uint32_t                 ssbo_instance_count = 0;
 
+    // Stats snapshot from the last cullAndUploadVisible call.  Cached so we
+    // can report the same numbers on skipped-cull frames (see
+    // have_cached_cull_ on ViewportWindow) without iterating the per-model
+    // scratch array again.
+    uint32_t cached_visible_objects   = 0;
+    uint32_t cached_visible_triangles = 0;
+
     // Per-instance world AABB + BVH (built at finalize).  The BVH is the
     // same ordering as `instances`; bvh_items[i] corresponds to instances[i].
     std::vector<BvhItem> bvh_items;
@@ -268,6 +275,15 @@ class ViewportWindow : public QWindow {
     uint64_t cull_traverse_ns_ = 0;
     uint64_t cull_emit_ns_     = 0;
     uint64_t cull_upload_ns_   = 0;
+    uint32_t cull_skipped_frames_ = 0;
+
+    // Skip cullAndUploadVisible + buildHizPyramid when the camera and scene
+    // haven't changed since the last cull.  The existing per-model
+    // indirect_buffer / visible_ssbo are still correct and just get
+    // redrawn.  Invalidated by any function that mutates models_gpu_.
+    QMatrix4x4 last_cull_view_;
+    QMatrix4x4 last_cull_proj_;
+    bool       have_cached_cull_ = false;
 
     // Per-frame stats
     uint32_t visible_triangles_ = 0;

From bbe644e92c283c945ba6840c4128fd98ff8ce551 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Tue, 14 Apr 2026 21:05:27 +1000
Subject: [PATCH 30/37] ifcviewer: event-driven rendering, idle scenes cost
 zero CPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaced the 16ms QTimer with QEvent::UpdateRequest delivered via
requestUpdate(), posted from every state mutator (mouse/wheel, model
lifecycle, selection, visibility, resize).  A static BIM scene — the
common case for a viewer — now does no work at all between user actions.

FPS is now measured as time spent inside render() rather than wall-clock
gap between frames, so idle gaps don't pollute the 1-second window and
the headline number reflects real render throughput.  Headline fps still
caps at vsync; sub-vsync profiling lives in the cull[...] phase timers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 53 +++++++++++++++++++++++++-------
 src/ifcviewer/ViewportWindow.h   |  8 +++--
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 2070467362f..a48ef7f6d42 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -317,10 +317,12 @@ ViewportWindow::ViewportWindow(QWindow* parent)
     fmt.setSamples(4);
     setFormat(fmt);
 
-    connect(&render_timer_, &QTimer::timeout, this, [this]() {
-        if (isExposed()) render();
-    });
-    render_timer_.setInterval(16);
+    // Redraw is driven by QEvent::UpdateRequest.  We post one via
+    // requestUpdate() from every function that mutates visible state
+    // (mouse/wheel, model lifecycle, selection, resize).  When nothing
+    // changes — the common case for a static BIM model — we don't burn
+    // CPU/GPU redrawing the same frame.  Qt coalesces multiple
+    // requestUpdate() calls inside a single vblank.
 }
 
 ViewportWindow::~ViewportWindow() {
@@ -381,11 +383,11 @@ void ViewportWindow::initGL() {
                 context_->makeCurrent(this);
                 if (on) gl_->glEnable(GL_CULL_FACE);
                 else    gl_->glDisable(GL_CULL_FACE);
+                requestUpdate();
             });
 
     gl_initialized_ = true;
-    frame_clock_.start();
-    render_timer_.start();
+    requestUpdate();
 
     emit initialized();
 }
@@ -613,6 +615,7 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
         m.total_triangles += m.meshes[chunk.local_mesh_id].index_count / 3;
     }
     have_cached_cull_ = false;
+    requestUpdate();
 }
 
 void ViewportWindow::finalizeModel(uint32_t model_id) {
@@ -637,6 +640,7 @@ void ViewportWindow::finalizeModel(uint32_t model_id) {
 
     m.finalized = true;
     have_cached_cull_ = false;
+    requestUpdate();
 
     const size_t ssbo_bytes = m.ssbo_instance_count * sizeof(InstanceGpu);
     qDebug("Model %u finalized: %zu verts, %zu meshes, %zu instances, %.1f MB vram "
@@ -746,6 +750,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
     m.finalized = true;
     models_gpu_.emplace(model_id, std::move(m));
     have_cached_cull_ = false;
+    requestUpdate();
 
     qDebug("Sidecar apply: model %u  %zu verts, %zu meshes, %zu instances  "
            "%.1f MB vram (vbo %.1f + ebo %.1f + ssbo %.1f)",
@@ -770,6 +775,7 @@ void ViewportWindow::applyLodExtension(uint32_t model_id, const SidecarData& sd)
         // case lod1_* fields were touched.
         m.meshes = sd.meshes;
         have_cached_cull_ = false;
+        requestUpdate();
         return;
     }
 
@@ -786,6 +792,7 @@ void ViewportWindow::applyLodExtension(uint32_t model_id, const SidecarData& sd)
     // Replace mesh metadata so cullAndUploadVisible sees the new lod1_ fields.
     m.meshes = sd.meshes;
     have_cached_cull_ = false;
+    requestUpdate();
 }
 
 void ViewportWindow::resetScene() {
@@ -802,6 +809,7 @@ void ViewportWindow::resetScene() {
     models_gpu_.clear();
     selected_object_id_ = 0;
     have_cached_cull_ = false;
+    requestUpdate();
 }
 
 void ViewportWindow::hideModel(uint32_t model_id) {
@@ -809,6 +817,7 @@ void ViewportWindow::hideModel(uint32_t model_id) {
     if (it != models_gpu_.end()) {
         it->second.hidden = true;
         have_cached_cull_ = false;
+        requestUpdate();
     }
 }
 
@@ -817,6 +826,7 @@ void ViewportWindow::showModel(uint32_t model_id) {
     if (it != models_gpu_.end()) {
         it->second.hidden = false;
         have_cached_cull_ = false;
+        requestUpdate();
     }
 }
 
@@ -833,10 +843,14 @@ void ViewportWindow::removeModel(uint32_t model_id) {
         if (it->second.indirect_buffer) gl_->glDeleteBuffers(1, &it->second.indirect_buffer);
         models_gpu_.erase(it);
         have_cached_cull_ = false;
+        requestUpdate();
     }
 }
 
-void ViewportWindow::setSelectedObjectId(uint32_t id) { selected_object_id_ = id; }
+void ViewportWindow::setSelectedObjectId(uint32_t id) {
+    selected_object_id_ = id;
+    requestUpdate();
+}
 
 // --- HiZ occlusion culling (Phase 3C) -----------------------------------
 
@@ -1367,6 +1381,9 @@ void ViewportWindow::updateCamera() {
 void ViewportWindow::render() {
     if (!gl_initialized_ || !isExposed()) return;
 
+    QElapsedTimer frame_cost_clock;
+    frame_cost_clock.start();
+
     context_->makeCurrent(this);
     updateCamera();
 
@@ -1508,8 +1525,13 @@ void ViewportWindow::render() {
 
     context_->swapBuffers(this);
 
-    float dt = frame_clock_.restart() / 1000.0f;
-    accumulated_time_ += dt;
+    // Measure frame *cost* (time spent inside render()) rather than the
+    // wall-clock gap between frames.  With event-driven rendering, idle gaps
+    // between requestUpdate() calls would otherwise pollute the FPS window.
+    // Reported fps = "if I rendered continuously, this is the rate I'd hit",
+    // which is what profiling actually wants.
+    const float frame_cost_s = frame_cost_clock.nsecsElapsed() * 1e-9f;
+    accumulated_time_ += frame_cost_s;
     frame_count_++;
     if (accumulated_time_ >= 1.0f) {
         last_fps_ = static_cast<float>(frame_count_) / accumulated_time_;
@@ -1649,13 +1671,19 @@ void ViewportWindow::renderAxisGizmo() {
 }
 
 void ViewportWindow::exposeEvent(QExposeEvent*) {
-    if (isExposed() && !gl_initialized_) initGL();
+    if (isExposed()) {
+        if (!gl_initialized_) initGL();
+        else                  requestUpdate();
+    }
 }
 void ViewportWindow::resizeEvent(QResizeEvent*) {
-    if (gl_initialized_) render();
+    if (gl_initialized_) requestUpdate();
 }
 bool ViewportWindow::event(QEvent* e) {
     switch (e->type()) {
+    case QEvent::UpdateRequest:
+        if (isExposed() && gl_initialized_) render();
+        return true;
     case QEvent::MouseButtonPress:   handleMousePress(static_cast<QMouseEvent*>(e));   return true;
     case QEvent::MouseButtonRelease: handleMouseRelease(static_cast<QMouseEvent*>(e)); return true;
     case QEvent::MouseMove:          handleMouseMove(static_cast<QMouseEvent*>(e));    return true;
@@ -1673,6 +1701,7 @@ void ViewportWindow::handleMouseRelease(QMouseEvent* e) {
         uint32_t id = pickObjectAt(e->pos().x(), e->pos().y());
         selected_object_id_ = id;
         emit objectPicked(id);
+        requestUpdate();  // selection highlight changed
     }
     active_button_ = Qt::NoButton;
 }
@@ -1695,10 +1724,12 @@ void ViewportWindow::handleMouseMove(QMouseEvent* e) {
             camera_pitch_ += delta.y() * 0.3f;
             camera_pitch_ = qBound(-89.0f, camera_pitch_, 89.0f);
         }
+        requestUpdate();
     }
 }
 void ViewportWindow::handleWheel(QWheelEvent* e) {
     float factor = e->angleDelta().y() > 0 ? 0.9f : 1.1f;
     camera_distance_ *= factor;
     camera_distance_ = qMax(0.1f, camera_distance_);
+    requestUpdate();
 }
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 26c6d20b588..0a95ede0775 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -23,7 +23,6 @@
 #include <QWindow>
 #include <QOpenGLContext>
 #include <QtOpenGL/QOpenGLFunctions_4_5_Core>
-#include <QTimer>
 #include <QElapsedTimer>
 #include <QMatrix4x4>
 #include <QVector3D>
@@ -107,6 +106,11 @@ struct ModelGpuData {
     bool hidden    = false;
 };
 
+// Rendering is event-driven: render() runs only when QEvent::UpdateRequest
+// is delivered, posted via requestUpdate().  An idle scene costs zero CPU.
+// INVARIANT: every public mutator that changes what should be on screen
+// (camera, selection, model lifecycle, visibility) MUST call requestUpdate()
+// before returning, or the viewport will go silently stale.
 class ViewportWindow : public QWindow {
     Q_OBJECT
 public:
@@ -217,8 +221,6 @@ class ViewportWindow : public QWindow {
 
     QOpenGLContext* context_ = nullptr;
     QOpenGLFunctions_4_5_Core* gl_ = nullptr;
-    QTimer render_timer_;
-    QElapsedTimer frame_clock_;
     bool gl_initialized_ = false;
 
     // Shaders

From 91198c99cf3660911dda839bfd281ab0a1cfe61c Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Tue, 14 Apr 2026 21:54:15 +1000
Subject: [PATCH 31/37] ifcviewer: quantize VBO to 16 B/vertex (sidecar v6)

Position now u16x3 normalized against each mesh's local AABB; normal
oct-encoded to i16x2; RGBA8 colour unchanged. Per-mesh dequant basis
lives in a new MeshGpu SSBO at binding 2; both main and pick shaders
mix() against it before applying the instance transform.

Drops VBO and sidecar size by ~43 % (28 -> 16 B/vert), which matters
mostly for warm-load downloads of precomputed sidecars and steady-state
VRAM. LodBuilder dequantizes positions into a scratch buffer before
calling meshopt, since meshoptimizer needs float positions.

Also fixes a streaming-time crash in cullAndUploadVisible: bvh_items
was only populated at finalize, but the linear fallback indexes it
during streaming. Mirror BvhItem appends in uploadInstanceChunk so the
hot path stays valid before the BVH is built.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/InstancedGeometry.h |  39 ++++-
 src/ifcviewer/LodBuilder.cpp      |  36 ++++-
 src/ifcviewer/LodBuilder.h        |   7 +-
 src/ifcviewer/MainWindow.cpp      |   2 +-
 src/ifcviewer/README.md           |  20 ++-
 src/ifcviewer/SidecarCache.cpp    |  11 +-
 src/ifcviewer/SidecarCache.h      |  11 +-
 src/ifcviewer/ViewportWindow.cpp  | 243 +++++++++++++++++++++++++-----
 src/ifcviewer/ViewportWindow.h    |   2 +
 9 files changed, 299 insertions(+), 72 deletions(-)

diff --git a/src/ifcviewer/InstancedGeometry.h b/src/ifcviewer/InstancedGeometry.h
index ef79751806a..729e4df1474 100644
--- a/src/ifcviewer/InstancedGeometry.h
+++ b/src/ifcviewer/InstancedGeometry.h
@@ -24,14 +24,36 @@
 #include <string>
 #include <vector>
 
-// Per-vertex layout for instanced meshes, stored in local coordinates.
-// 28 bytes per vertex:
-//   pos(3 float)    -- 12 B
-//   normal(3 float) -- 12 B
-//   color(4 bytes RGBA8, read as GL_UNSIGNED_BYTE*4 normalized) -- 4 B
-static constexpr int INSTANCED_VERTEX_STRIDE_BYTES = 28;
+// Per-vertex layout for instanced meshes, stored in local coordinates,
+// quantized against each mesh's local AABB.  16 bytes per vertex:
+//   offset 0   pos     3 x uint16   normalized -> [0,1]; dequant to
+//                                   mix(mesh.aabb_min, mesh.aabb_max, t)
+//   offset 6   _pad    2 bytes
+//   offset 8   normal  2 x int16    normalized -> [-1,1]; octahedral-decoded
+//   offset 12  color   4 x uint8    normalized -> [0,1]
+//
+// Quantization basis is per mesh, stored in the MeshGpu SSBO bound at
+// binding=2.  The vertex shader looks up its basis via the instance's mesh_id.
+static constexpr int INSTANCED_VERTEX_STRIDE_BYTES = 16;
+
+// Streamer-side transfer format, not the GPU layout: 7 floats per vertex
+// (pos3 + normal3 + one slot holding the 4 packed RGBA8 bytes).
+// GeometryStreamer writes this into MeshChunk.vertices; uploadMeshChunk in
+// ViewportWindow quantizes it to INSTANCED_VERTEX_STRIDE_BYTES for the VBO.
 static constexpr int INSTANCED_VERTEX_STRIDE_FLOATS = 7;
 
+static constexpr int INSTANCED_VERTEX_POS_OFFSET    = 0;
+static constexpr int INSTANCED_VERTEX_NORMAL_OFFSET = 8;
+static constexpr int INSTANCED_VERTEX_COLOR_OFFSET  = 12;
+
+// Per-mesh quantization basis, uploaded to a std430 SSBO.  Two vec4s so
+// std430 layout is trivial (no alignment surprises).  w components unused.
+struct alignas(16) MeshGpu {
+    float aabb_min[4];  // xyz = local AABB min; w = 0
+    float aabb_max[4];  // xyz = local AABB max; w = 0
+};
+static_assert(sizeof(MeshGpu) == 32, "MeshGpu must be 32 bytes");
+
 // Per-mesh metadata on the CPU side.  Meshes own a slice of the model's
 // VBO (shared across LODs) and one or more slices of the EBO, one per LOD.
 //
@@ -61,12 +83,13 @@ static_assert(sizeof(MeshInfo) == 56, "MeshInfo must be 56 bytes");
 //   mat4 transform (64 B column-major)
 //   uint object_id
 //   uint color_override_rgba8   -- 0 = use baked vertex color, else override
-//   uint _pad0, _pad1           -- align to 16 for std430
+//   uint mesh_id                -- index into per-model MeshGpu[]
+//   uint _pad1                  -- align to 16 for std430
 struct alignas(16) InstanceGpu {
     float    transform[16];
     uint32_t object_id            = 0;
     uint32_t color_override_rgba8 = 0;
-    uint32_t _pad0                = 0;
+    uint32_t mesh_id              = 0;   // index into per-model MeshGpu[]
     uint32_t _pad1                = 0;
 };
 static_assert(sizeof(InstanceGpu) == 80, "InstanceGpu must be 80 bytes");
diff --git a/src/ifcviewer/LodBuilder.cpp b/src/ifcviewer/LodBuilder.cpp
index 88b8c9f0468..35b97df44a6 100644
--- a/src/ifcviewer/LodBuilder.cpp
+++ b/src/ifcviewer/LodBuilder.cpp
@@ -33,9 +33,8 @@ void buildLods(SidecarData& sd,
                float target_error) {
     if (sd.meshes.empty() || sd.vertices.empty() || sd.indices.empty()) return;
 
-    const size_t vtx_stride_bytes  = INSTANCED_VERTEX_STRIDE_BYTES;
-    const size_t vtx_stride_floats = INSTANCED_VERTEX_STRIDE_FLOATS;
-    const size_t total_vertex_count = sd.vertices.size() / vtx_stride_floats;
+    const size_t vtx_stride_bytes   = INSTANCED_VERTEX_STRIDE_BYTES;
+    const size_t total_vertex_count = sd.vertices.size() / vtx_stride_bytes;
 
     // Env var knobs so we can tune without rebuilding.
     //   IFC_LOD_LOCK_BORDER=1      re-enable LockBorder (off by default: BIM
@@ -73,8 +72,10 @@ void buildLods(SidecarData& sd,
     // Scratch buffers reused across meshes so we only allocate once.
     std::vector<uint32_t> simplified;
     std::vector<uint32_t> shadow;
+    std::vector<float>    dequant_pos;   // 3 floats/vertex, dequantized
     simplified.reserve(1024);
     shadow.reserve(1024);
+    dequant_pos.reserve(1024 * 3);
 
     int dbg_printed = 0;
     int dbg_rejected_savings = 0;
@@ -101,8 +102,27 @@ void buildLods(SidecarData& sd,
         const uint32_t first_index = mesh.ebo_byte_offset / sizeof(uint32_t);
         if (first_index + mesh.index_count > sd.indices.size()) continue;
 
-        const float* positions =
-            sd.vertices.data() + base_vertex * vtx_stride_floats;
+        // Dequantize positions for this mesh into a temp float array.
+        // meshopt needs contiguous float3 positions with a known stride;
+        // quantized bytes aren't directly usable.
+        const uint8_t* quant_base =
+            sd.vertices.data() + base_vertex * vtx_stride_bytes;
+        dequant_pos.resize(static_cast<size_t>(mesh.vertex_count) * 3);
+        const float extent[3] = {
+            mesh.local_aabb_max[0] - mesh.local_aabb_min[0],
+            mesh.local_aabb_max[1] - mesh.local_aabb_min[1],
+            mesh.local_aabb_max[2] - mesh.local_aabb_min[2],
+        };
+        for (uint32_t v = 0; v < mesh.vertex_count; ++v) {
+            const uint16_t* p = reinterpret_cast<const uint16_t*>(
+                quant_base + v * vtx_stride_bytes);
+            for (int a = 0; a < 3; ++a) {
+                float t = p[a] / 65535.0f;
+                dequant_pos[v * 3 + a] = mesh.local_aabb_min[a] + t * extent[a];
+            }
+        }
+        const float* positions = dequant_pos.data();
+        const size_t local_pos_stride = sizeof(float) * 3;
         const uint32_t* indices = sd.indices.data() + first_index;
 
         const size_t target_index_count = std::max<size_t>(
@@ -121,7 +141,7 @@ void buildLods(SidecarData& sd,
             indices, mesh.index_count,
             positions, mesh.vertex_count,
             sizeof(float) * 3,       // compare only xyz
-            vtx_stride_bytes);
+            local_pos_stride);
 
         simplified.resize(mesh.index_count);
         float result_error = 0.0f;
@@ -135,7 +155,7 @@ void buildLods(SidecarData& sd,
             new_index_count = meshopt_simplifySloppy(
                 simplified.data(),
                 indices, mesh.index_count,
-                positions, mesh.vertex_count, vtx_stride_bytes,
+                positions, mesh.vertex_count, local_pos_stride,
                 target_index_count, target_error,
                 &result_error);
         } else {
@@ -144,7 +164,7 @@ void buildLods(SidecarData& sd,
             new_index_count = meshopt_simplify(
                 simplified.data(),
                 shadow.data(), mesh.index_count,
-                positions, mesh.vertex_count, vtx_stride_bytes,
+                positions, mesh.vertex_count, local_pos_stride,
                 target_index_count, target_error,
                 options, &result_error);
         }
diff --git a/src/ifcviewer/LodBuilder.h b/src/ifcviewer/LodBuilder.h
index a937ae49870..0147ba82f9b 100644
--- a/src/ifcviewer/LodBuilder.h
+++ b/src/ifcviewer/LodBuilder.h
@@ -35,9 +35,10 @@
 //   target_ratio  = 0.25    — aim for 25% of original tris
 //   target_error  = 0.05    — stop if relative error exceeds 5%
 //
-// `sd.vertices` is read (position is the first 3 floats of each
-// INSTANCED_VERTEX_STRIDE_FLOATS-wide vertex) but not modified — LOD1
-// reuses the same vertex buffer, just with a different index list.
+// `sd.vertices` is raw bytes at the quantized layout; positions are
+// dequantized per-mesh (using MeshInfo.local_aabb_min/max) into a temp
+// float array before feeding meshoptimizer.  Vertices are not modified —
+// LOD1 reuses the same VBO, just with a different index list.
 void buildLods(SidecarData& sd,
                int min_triangles = 500,
                float target_ratio = 0.25f,
diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 7dc5454700b..0e8162f0436 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -239,7 +239,7 @@ void MainWindow::applySidecarData(ModelId mid, SidecarData data) {
 
     qDebug("Sidecar hit: %s (%zu verts, %zu indices, %zu meshes, %zu instances, %zu elements)",
            model.file_path.toStdString().c_str(),
-           data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS,
+           data.vertices.size() / INSTANCED_VERTEX_STRIDE_BYTES,
            data.indices.size(),
            data.meshes.size(),
            data.instances.size(),
diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index afa20426103..be4a69ec42b 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -46,9 +46,12 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
 - **Per-model GPU buffers**: each loaded model gets its own
   VAO/VBO/EBO/instance-SSBO/visible-SSBO/indirect-buffer. No cross-model
   growth copies. Removing a model frees its GPU memory immediately.
-- **Local-coordinate vertex format (28 B):** position (3 floats) + normal
-  (3 floats) + packed RGBA8 colour (1 uint). The per-instance transform is
-  applied in the vertex shader via an SSBO lookup. No world-baked vertex data.
+- **Quantized local-coordinate vertex format (16 B):** position as
+  `u16x3` normalised against each mesh's local AABB, 2 B padding,
+  octahedral-encoded normal as `i16x2`, packed RGBA8 colour. Dequantisation
+  basis is per mesh, uploaded once in a `MeshGpu` SSBO at binding 2. The
+  per-instance transform is applied in the vertex shader. No world-baked
+  vertex data. ~43 % less vertex data than the previous 28 B float layout.
 - **Multi-draw indirect:** every frame the CPU builds a flat list of visible
   instance indices and one `DrawElementsIndirectCommand` per non-empty mesh,
   then issues a single `glMultiDrawElementsIndirect` per model. 50k visible
@@ -93,7 +96,7 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
 | `InstancedGeometry.h` | Shared structs: `MeshInfo`, `InstanceCpu`, `InstanceGpu`, chunk records |
 | `BvhAccel.h/cpp` | Median-split BVH builder; operates on instance world-AABBs |
 | `LodBuilder.h/cpp` | Post-stream decimation of unique meshes via meshoptimizer (`simplifySloppy`) |
-| `SidecarCache.h/cpp` | Raw binary `.ifcview` (v5) sidecar read/write |
+| `SidecarCache.h/cpp` | Raw binary `.ifcview` (v6) sidecar read/write |
 | `AppSettings.h/cpp` | Persisted preferences (geometry library, stats overlay, backface culling) |
 | `SettingsWindow.h/cpp` | Settings dialog |
 | `CMakeLists.txt` | Build configuration |
@@ -270,7 +273,7 @@ while stack not empty:
 Depth 64 is enough for billions of items on any balanced tree. The stack
 is on the C++ stack, zero per-frame allocation.
 
-#### Sidecar format (`.ifcview`, v5)
+#### Sidecar format (`.ifcview`, v6)
 
 Raw memory dump, Blender-`.blend`-style — no serialisation, no parsing.
 Stores everything needed to skip the `IfcGeom::Iterator` pass:
@@ -278,7 +281,7 @@ Stores everything needed to skip the `IfcGeom::Iterator` pass:
 ```
 SidecarHeader            (magic "IFVW", version, endian, ...)
 uint64_t                 source_file_size
-uint32_t + float[]       vertex data    (7 floats × N_verts, local coords)
+uint32_t + uint8_t[]     vertex data    (16 B/vert quantized; per-mesh basis in MeshInfo)
 uint32_t + uint32_t[]    index data     (mesh-local)
 uint32_t + MeshInfo[]    per-unique-mesh metadata (56 B each, incl. LOD1 slice)
 uint32_t + InstanceCpu[] per-placement records (transform + AABB + ids)
@@ -298,7 +301,8 @@ Per-model state on the GPU:
 
 | Buffer | Contents | Lifetime |
 |--------|----------|----------|
-| `VBO` | Interleaved local-coord vertex data (28 B/vert). One range per unique representation. | Grow-on-demand during streaming; static after finalize. |
+| `VBO` | Quantized local-coord vertex data (16 B/vert: u16x3 pos, oct i16x2 normal, RGBA8). One range per unique representation. | Grow-on-demand during streaming; static after finalize. |
+| `MeshGpu SSBO` (binding 2) | Per-mesh dequant basis (`vec4 aabb_min`, `vec4 aabb_max`). | Grow-on-demand; static after finalize. |
 | `EBO` | Mesh-local uint32 indices. One range per unique representation. | Same. |
 | `SSBO` (binding 0) | `InstanceGpu[]` (80 B each: mat4 transform, object_id, color_override, pad). | Appended during streaming, static after finalize. |
 | `visible SSBO` (binding 1) | `uint32[]` — flat list of visible instance indices, ordered by mesh, uploaded each frame. | Rewritten every frame. |
@@ -311,7 +315,7 @@ struct DrawElementsIndirectCommand {
     uint32_t count;         // mesh.index_count
     uint32_t instanceCount; // visible-list length for this mesh
     uint32_t firstIndex;    // mesh.ebo_byte_offset / 4
-    uint32_t baseVertex;    // mesh.vbo_byte_offset / 28
+    uint32_t baseVertex;    // mesh.vbo_byte_offset / 16
     uint32_t baseInstance;  // offset into the flat visible-index array
 };
 ```
diff --git a/src/ifcviewer/SidecarCache.cpp b/src/ifcviewer/SidecarCache.cpp
index da3943988d2..171bf4bda65 100644
--- a/src/ifcviewer/SidecarCache.cpp
+++ b/src/ifcviewer/SidecarCache.cpp
@@ -17,17 +17,16 @@
  *                                                                              *
  ********************************************************************************/
 
-// v5 layout (all multi-byte fields native-endian; endianness marker in header).
-// Same sequence as v4; the only change is that MeshInfo grew two uint32_ts
-// (lod1_ebo_byte_offset + lod1_index_count) and `indices` may contain extra
-// appended LOD1 slices pointed at by those offsets.
+// v6 layout (all multi-byte fields native-endian; endianness marker in header).
+// Same sequence as v5; the only change is that vertex data is now raw bytes
+// at the 16 B/vertex quantized layout (see InstancedGeometry.h).
 //
 //
 //   SidecarHeader (16 bytes)
 //   uint64_t  source_file_size
 //
-//   uint32_t  num_vertices_floats
-//   float[]   vertex data (28 B/vertex: pos3 + normal3 + color1_packed)
+//   uint32_t  num_vertex_bytes
+//   uint8_t[] vertex data (16 B/vertex: pos u16x3 + pad2 + oct-normal i16x2 + rgba8)
 //   uint32_t  num_indices
 //   uint32_t[] index data (mesh-local indices; base_vertex applied at draw time)
 //
diff --git a/src/ifcviewer/SidecarCache.h b/src/ifcviewer/SidecarCache.h
index 332abdc8029..e2e34373abe 100644
--- a/src/ifcviewer/SidecarCache.h
+++ b/src/ifcviewer/SidecarCache.h
@@ -37,7 +37,9 @@ static constexpr uint32_t SIDECAR_MAGIC   = 0x49465657;  // "IFVW"
 // v5 = MeshInfo extended with lod1_ebo_byte_offset + lod1_index_count (56 B).
 //      sd.indices may contain an appended LOD1 index slice for each mesh
 //      where meshoptimizer decimation produced useful output.
-static constexpr uint32_t SIDECAR_VERSION = 5;
+// v6 = VBO vertices quantized to 16 B/vertex (pos u16x3 + 2 B pad + normal
+//      oct i16x2 + color u8x4); per-mesh basis: MeshInfo.local_aabb_min/max.
+static constexpr uint32_t SIDECAR_VERSION = 6;
 static constexpr uint32_t SIDECAR_ENDIAN  = 0x01020304;
 
 // Fixed-size element record.  Strings are stored as (offset, length) pairs
@@ -56,10 +58,11 @@ struct PackedElementInfo {
 };
 
 // Everything needed to display an already-tessellated model without
-// re-running the iterator.  v4 schema: instanced geometry.
+// re-running the iterator.  v6 schema: instanced + quantized geometry.
 struct SidecarData {
-    // Per-model GPU geometry (local coords).  28 bytes/vertex.
-    std::vector<float>        vertices;
+    // Per-model GPU geometry (local coords).  Raw VBO bytes at the
+    // INSTANCED_VERTEX_STRIDE_BYTES layout (16 B/vertex as of v6).
+    std::vector<uint8_t>      vertices;
     std::vector<uint32_t>     indices;
 
     // Mesh dictionary and per-instance data.
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index a48ef7f6d42..4731a431867 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -44,16 +44,17 @@ static_assert(sizeof(DrawElementsIndirectCommand) == 20, "indirect cmd must be 2
 // Shaders
 // -----------------------------------------------------------------------------
 //
-// Vertex layout (GL side, 28 bytes):
-//   location 0: vec3 a_position     (local coords)
-//   location 1: vec3 a_normal       (local)
-//   location 2: vec4 a_color        (GL_UNSIGNED_BYTE * 4 normalized)
+// Vertex layout (GL side, 16 bytes — quantized; see InstancedGeometry.h):
+//   location 0: vec3 a_position_q   (u16x3 normalized, per-mesh AABB basis)
+//   location 1: vec2 a_normal_oct   (i16x2 normalized, octahedral)
+//   location 2: vec4 a_color        (u8x4 normalized)
 //
 // Per-instance record in SSBO std430 (80 bytes):
 //   mat4 transform
 //   uint object_id
 //   uint color_override_rgba8     -- 0 => use baked a_color
-//   uint _pad0, _pad1
+//   uint mesh_id                  -- index into per-model MeshGpu[]
+//   uint _pad1
 //
 // The draw calls pass `u_instance_offset = mesh.first_instance`; the shader
 // reads `instances[u_instance_offset + gl_InstanceID]`.
@@ -61,15 +62,16 @@ static_assert(sizeof(DrawElementsIndirectCommand) == 20, "indirect cmd must be 2
 static const char* MAIN_VERTEX_SHADER = R"(
 #version 450 core
 #extension GL_ARB_shader_draw_parameters : require
-layout(location = 0) in vec3 a_position;
-layout(location = 1) in vec3 a_normal;
+// Quantized vertex inputs — see InstancedGeometry.h for layout.
+layout(location = 0) in vec3 a_position_q;  // u16x3 normalized -> [0,1]
+layout(location = 1) in vec2 a_normal_oct;  // i16x2 normalized -> [-1,1]
 layout(location = 2) in vec4 a_color;
 
 struct InstanceRecord {
     mat4 transform;
     uint object_id;
     uint color_override;
-    uint _pad0;
+    uint mesh_id;
     uint _pad1;
 };
 layout(std430, binding = 0) readonly buffer Instances {
@@ -78,6 +80,10 @@ layout(std430, binding = 0) readonly buffer Instances {
 layout(std430, binding = 1) readonly buffer VisibleIndices {
     uint visible[];
 };
+struct MeshQuant { vec4 aabb_min; vec4 aabb_max; };
+layout(std430, binding = 2) readonly buffer Meshes {
+    MeshQuant meshes[];
+};
 
 uniform mat4 u_view_projection;
 uniform uint u_selected_id;
@@ -87,11 +93,24 @@ out vec4 v_color;
 flat out uint v_object_id;
 flat out uint v_selected;
 
+// Meyer et al. octahedral normal decode.  Input is in [-1,1]^2.
+vec3 octDecode(vec2 e) {
+    vec3 n = vec3(e.xy, 1.0 - abs(e.x) - abs(e.y));
+    if (n.z < 0.0) n.xy = (1.0 - abs(n.yx)) * vec2(n.x >= 0.0 ? 1.0 : -1.0,
+                                                    n.y >= 0.0 ? 1.0 : -1.0);
+    return normalize(n);
+}
+
 void main() {
     uint slot = uint(gl_BaseInstanceARB) + uint(gl_InstanceID);
     uint iid = visible[slot];
     InstanceRecord inst = instances[iid];
-    vec4 world = inst.transform * vec4(a_position, 1.0);
+    MeshQuant mq = meshes[inst.mesh_id];
+
+    // Dequantize local position against this mesh's AABB.
+    vec3 pos_local = mix(mq.aabb_min.xyz, mq.aabb_max.xyz, a_position_q);
+
+    vec4 world = inst.transform * vec4(pos_local, 1.0);
     gl_Position = u_view_projection * world;
 
     // Rotate the normal by the upper-3x3 of the transform.  BIM placements
@@ -101,8 +120,9 @@ void main() {
     // otherwise mirrored instances shade as if inside-out.  The same
     // determinant sign is what GL_CULL_FACE uses to decide winding, so
     // keeping them in agreement means backface culling is safe to enable.
+    vec3 n_local = octDecode(a_normal_oct);
     mat3 rot = mat3(inst.transform);
-    vec3 n = rot * a_normal;
+    vec3 n = rot * n_local;
     if (determinant(rot) < 0.0) n = -n;
     v_normal = normalize(n);
 
@@ -152,13 +172,13 @@ void main() {
 static const char* PICK_VERTEX_SHADER = R"(
 #version 450 core
 #extension GL_ARB_shader_draw_parameters : require
-layout(location = 0) in vec3 a_position;
+layout(location = 0) in vec3 a_position_q;
 
 struct InstanceRecord {
     mat4 transform;
     uint object_id;
     uint color_override;
-    uint _pad0;
+    uint mesh_id;
     uint _pad1;
 };
 layout(std430, binding = 0) readonly buffer Instances {
@@ -167,6 +187,10 @@ layout(std430, binding = 0) readonly buffer Instances {
 layout(std430, binding = 1) readonly buffer VisibleIndices {
     uint visible[];
 };
+struct MeshQuant { vec4 aabb_min; vec4 aabb_max; };
+layout(std430, binding = 2) readonly buffer Meshes {
+    MeshQuant meshes[];
+};
 
 uniform mat4 u_view_projection;
 
@@ -176,7 +200,9 @@ void main() {
     uint slot = uint(gl_BaseInstanceARB) + uint(gl_InstanceID);
     uint iid = visible[slot];
     InstanceRecord inst = instances[iid];
-    gl_Position = u_view_projection * inst.transform * vec4(a_position, 1.0);
+    MeshQuant mq = meshes[inst.mesh_id];
+    vec3 pos_local = mix(mq.aabb_min.xyz, mq.aabb_max.xyz, a_position_q);
+    gl_Position = u_view_projection * inst.transform * vec4(pos_local, 1.0);
     v_object_id = inst.object_id;
 }
 )";
@@ -240,6 +266,51 @@ static GLuint linkProgram(QOpenGLFunctions_4_5_Core* gl, GLuint vert, GLuint fra
 
 // -----------------------------------------------------------------------------
 
+// Meyer et al. octahedral normal encode.  Input unit vector -> [-1,1]^2.
+static void octEncode(const float n[3], float out[2]) {
+    float ax = std::fabs(n[0]), ay = std::fabs(n[1]), az = std::fabs(n[2]);
+    float denom = ax + ay + az;
+    if (denom < 1e-12f) { out[0] = 0.0f; out[1] = 0.0f; return; }
+    float px = n[0] / denom;
+    float py = n[1] / denom;
+    if (n[2] < 0.0f) {
+        float sx = px >= 0.0f ? 1.0f : -1.0f;
+        float sy = py >= 0.0f ? 1.0f : -1.0f;
+        float nx = (1.0f - std::fabs(py)) * sx;
+        float ny = (1.0f - std::fabs(px)) * sy;
+        px = nx; py = ny;
+    }
+    out[0] = px;
+    out[1] = py;
+}
+
+// Quantize a streamer-format vertex (pos3 + normal3 + RGBA8 in a float slot)
+// into the 16 B VBO record, given the mesh's tight local AABB.  `extent_recip`
+// is 1/(max-min) per axis, or 0 for a zero-extent axis (which quantizes to 0).
+static void quantizeVertex(const float src[7],
+                           const float aabb_min[3],
+                           const float extent_recip[3],
+                           uint8_t dst[INSTANCED_VERTEX_STRIDE_BYTES]) {
+    // Position -> u16 normalized.
+    uint16_t* p = reinterpret_cast<uint16_t*>(dst + INSTANCED_VERTEX_POS_OFFSET);
+    for (int a = 0; a < 3; ++a) {
+        float t = (src[a] - aabb_min[a]) * extent_recip[a];
+        if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f;
+        p[a] = static_cast<uint16_t>(t * 65535.0f + 0.5f);
+    }
+    // Normal -> oct i16x2.
+    float oct[2];
+    octEncode(src + 3, oct);
+    int16_t* n = reinterpret_cast<int16_t*>(dst + INSTANCED_VERTEX_NORMAL_OFFSET);
+    for (int a = 0; a < 2; ++a) {
+        float v = oct[a];
+        if (v < -1.0f) v = -1.0f; else if (v > 1.0f) v = 1.0f;
+        n[a] = static_cast<int16_t>(std::lrintf(v * 32767.0f));
+    }
+    // Color passes through — streamer packs 4 bytes into the 7th float slot.
+    std::memcpy(dst + INSTANCED_VERTEX_COLOR_OFFSET, src + 6, 4);
+}
+
 // Determinant of the upper-left 3x3 of a column-major mat4 stored as 16 floats.
 // Sign tells us whether the transform contains a reflection, which is what
 // decides which glFrontFace winding to draw the instance with.
@@ -334,6 +405,7 @@ ViewportWindow::~ViewportWindow() {
                 if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
                 if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
                 if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+                if (m.mesh_info_ssbo) gl_->glDeleteBuffers(1, &m.mesh_info_ssbo);
                 if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
                 if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
             }
@@ -396,19 +468,22 @@ void ViewportWindow::setupVaoLayout(GLuint vao, GLuint vbo, GLuint ebo) {
     gl_->glVertexArrayVertexBuffer(vao, 0, vbo, 0, INSTANCED_VERTEX_STRIDE_BYTES);
     gl_->glVertexArrayElementBuffer(vao, ebo);
 
-    // position (3 float @ 0)
+    // position (3 x u16 normalized @ 0)
     gl_->glEnableVertexArrayAttrib(vao, 0);
-    gl_->glVertexArrayAttribFormat(vao, 0, 3, GL_FLOAT, GL_FALSE, 0);
+    gl_->glVertexArrayAttribFormat(vao, 0, 3, GL_UNSIGNED_SHORT, GL_TRUE,
+                                   INSTANCED_VERTEX_POS_OFFSET);
     gl_->glVertexArrayAttribBinding(vao, 0, 0);
 
-    // normal (3 float @ 12)
+    // normal oct-encoded (2 x i16 normalized @ 8)
     gl_->glEnableVertexArrayAttrib(vao, 1);
-    gl_->glVertexArrayAttribFormat(vao, 1, 3, GL_FLOAT, GL_FALSE, 12);
+    gl_->glVertexArrayAttribFormat(vao, 1, 2, GL_SHORT, GL_TRUE,
+                                   INSTANCED_VERTEX_NORMAL_OFFSET);
     gl_->glVertexArrayAttribBinding(vao, 1, 0);
 
-    // color (4 ubyte @ 24, normalized)
+    // color (4 x u8 normalized @ 12)
     gl_->glEnableVertexArrayAttrib(vao, 2);
-    gl_->glVertexArrayAttribFormat(vao, 2, 4, GL_UNSIGNED_BYTE, GL_TRUE, 24);
+    gl_->glVertexArrayAttribFormat(vao, 2, 4, GL_UNSIGNED_BYTE, GL_TRUE,
+                                   INSTANCED_VERTEX_COLOR_OFFSET);
     gl_->glVertexArrayAttribBinding(vao, 2, 0);
 }
 
@@ -544,8 +619,44 @@ void ViewportWindow::uploadMeshChunk(const MeshChunk& chunk) {
 
     ModelGpuData& m = getOrCreateModel(chunk.model_id);
 
-    const size_t vb_size = chunk.vertices.size() * sizeof(float);
-    const size_t ib_size = chunk.indices.size()  * sizeof(uint32_t);
+    // Streamer format: 7 floats/vertex (pos3 + normal3 + color-as-float).
+    const size_t src_stride_floats = 7;
+    const size_t n_verts = chunk.vertices.size() / src_stride_floats;
+
+    // Recompute a tight local AABB from the actual vertex positions — the
+    // chunk-provided AABB can be slightly loose, which wastes quantization
+    // precision.  Also derives the dequant basis we'll ship to the GPU.
+    float bmin[3] = {  std::numeric_limits<float>::infinity(),
+                       std::numeric_limits<float>::infinity(),
+                       std::numeric_limits<float>::infinity() };
+    float bmax[3] = { -std::numeric_limits<float>::infinity(),
+                      -std::numeric_limits<float>::infinity(),
+                      -std::numeric_limits<float>::infinity() };
+    for (size_t i = 0; i < n_verts; ++i) {
+        const float* v = chunk.vertices.data() + i * src_stride_floats;
+        for (int a = 0; a < 3; ++a) {
+            if (v[a] < bmin[a]) bmin[a] = v[a];
+            if (v[a] > bmax[a]) bmax[a] = v[a];
+        }
+    }
+    // Degenerate / zero-extent axis: collapse to a single quantum.  The
+    // dequant shader will output bmin[a] for every vertex, which is correct.
+    float extent_recip[3];
+    for (int a = 0; a < 3; ++a) {
+        float ext = bmax[a] - bmin[a];
+        extent_recip[a] = ext > 0.0f ? 1.0f / ext : 0.0f;
+    }
+
+    // Quantize into a scratch buffer sized to the destination layout.
+    std::vector<uint8_t> quant(n_verts * INSTANCED_VERTEX_STRIDE_BYTES);
+    for (size_t i = 0; i < n_verts; ++i) {
+        quantizeVertex(chunk.vertices.data() + i * src_stride_floats,
+                       bmin, extent_recip,
+                       quant.data() + i * INSTANCED_VERTEX_STRIDE_BYTES);
+    }
+
+    const size_t vb_size = quant.size();
+    const size_t ib_size = chunk.indices.size() * sizeof(uint32_t);
 
     if (m.vbo_used + vb_size > m.vbo_capacity) {
         if (!growModelVbo(m, m.vbo_used + vb_size)) return;
@@ -556,18 +667,17 @@ void ViewportWindow::uploadMeshChunk(const MeshChunk& chunk) {
 
     MeshInfo info;
     info.vbo_byte_offset = static_cast<uint32_t>(m.vbo_used);
-    info.vertex_count    = static_cast<uint32_t>(
-        chunk.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS);
+    info.vertex_count    = static_cast<uint32_t>(n_verts);
     info.ebo_byte_offset = static_cast<uint32_t>(m.ebo_used);
     info.index_count     = static_cast<uint32_t>(chunk.indices.size());
     for (int a = 0; a < 3; ++a) {
-        info.local_aabb_min[a] = chunk.local_aabb_min[a];
-        info.local_aabb_max[a] = chunk.local_aabb_max[a];
+        info.local_aabb_min[a] = bmin[a];
+        info.local_aabb_max[a] = bmax[a];
     }
     info.first_instance = 0;
     info.instance_count = 0;
 
-    gl_->glNamedBufferSubData(m.vbo, m.vbo_used, vb_size, chunk.vertices.data());
+    gl_->glNamedBufferSubData(m.vbo, m.vbo_used, vb_size, quant.data());
     gl_->glNamedBufferSubData(m.ebo, m.ebo_used, ib_size, chunk.indices.data());
     m.vbo_used += vb_size;
     m.ebo_used += ib_size;
@@ -575,6 +685,33 @@ void ViewportWindow::uploadMeshChunk(const MeshChunk& chunk) {
 
     if (m.meshes.size() <= chunk.local_mesh_id) m.meshes.resize(chunk.local_mesh_id + 1);
     m.meshes[chunk.local_mesh_id] = info;
+
+    // Write the matching dequant basis into the MeshGpu SSBO.  Grow on
+    // demand; geometrically doubling keeps this amortized O(1) over streaming.
+    MeshGpu mg{};
+    for (int a = 0; a < 3; ++a) {
+        mg.aabb_min[a] = bmin[a];
+        mg.aabb_max[a] = bmax[a];
+    }
+    mg.aabb_min[3] = 0.0f;
+    mg.aabb_max[3] = 0.0f;
+
+    const size_t mg_offset = chunk.local_mesh_id * sizeof(MeshGpu);
+    if (mg_offset + sizeof(MeshGpu) > m.mesh_info_capacity) {
+        size_t new_cap = m.mesh_info_capacity ? m.mesh_info_capacity : 32 * sizeof(MeshGpu);
+        while (new_cap < mg_offset + sizeof(MeshGpu)) new_cap *= 2;
+        GLuint new_ssbo = 0;
+        gl_->glCreateBuffers(1, &new_ssbo);
+        gl_->glNamedBufferStorage(new_ssbo, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);
+        if (m.mesh_info_ssbo && m.mesh_info_capacity > 0) {
+            gl_->glCopyNamedBufferSubData(m.mesh_info_ssbo, new_ssbo, 0, 0,
+                                          m.mesh_info_capacity);
+            gl_->glDeleteBuffers(1, &m.mesh_info_ssbo);
+        }
+        m.mesh_info_ssbo = new_ssbo;
+        m.mesh_info_capacity = new_cap;
+    }
+    gl_->glNamedBufferSubData(m.mesh_info_ssbo, mg_offset, sizeof(MeshGpu), &mg);
 }
 
 void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
@@ -594,6 +731,15 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     m.instances.push_back(inst);
     m.instance_reflected.push_back(transformIsReflected(inst.transform) ? 1 : 0);
 
+    // Mirror into bvh_items so the hot cull path (which reads AABBs out of
+    // bvh_items even when no BVH has been built yet) stays correct during
+    // streaming.  finalizeModel rebuilds the real BVH over these items.
+    BvhItem bi;
+    std::memcpy(bi.aabb_min, inst.world_aabb_min, sizeof(bi.aabb_min));
+    std::memcpy(bi.aabb_max, inst.world_aabb_max, sizeof(bi.aabb_max));
+    bi.model_id = inst.model_id;
+    m.bvh_items.push_back(bi);
+
     // Append the GPU record to the instance SSBO so the model is drawable
     // immediately, without waiting for finalizeModel.  The visible-list
     // architecture means SSBO order is irrelevant to correctness.
@@ -601,7 +747,7 @@ void ViewportWindow::uploadInstanceChunk(const InstanceChunk& chunk) {
     std::memcpy(gpu.transform, inst.transform, sizeof(gpu.transform));
     gpu.object_id = inst.object_id;
     gpu.color_override_rgba8 = inst.color_override_rgba8;
-    gpu._pad0 = 0;
+    gpu.mesh_id = inst.mesh_id;
     gpu._pad1 = 0;
 
     const size_t offset = m.ssbo_instance_count * sizeof(InstanceGpu);
@@ -659,9 +805,10 @@ bool ViewportWindow::snapshotModel(uint32_t model_id, SidecarData& out) const {
     const auto& m = it->second;
     if (!m.finalized) return false;
 
-    // GPU readback of the packed VBO/EBO ranges actually in use.
+    // GPU readback of the packed VBO/EBO ranges actually in use.  VBO is
+    // raw bytes at the quantized layout.
     if (m.vbo_used > 0) {
-        out.vertices.resize(m.vbo_used / sizeof(float));
+        out.vertices.resize(m.vbo_used);
         gl_->glGetNamedBufferSubData(m.vbo, 0, m.vbo_used, out.vertices.data());
     }
     if (m.ebo_used > 0) {
@@ -685,6 +832,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
         if (existing->second.vbo)  gl_->glDeleteBuffers(1, &existing->second.vbo);
         if (existing->second.ebo)  gl_->glDeleteBuffers(1, &existing->second.ebo);
         if (existing->second.ssbo) gl_->glDeleteBuffers(1, &existing->second.ssbo);
+        if (existing->second.mesh_info_ssbo) gl_->glDeleteBuffers(1, &existing->second.mesh_info_ssbo);
         if (existing->second.visible_ssbo) gl_->glDeleteBuffers(1, &existing->second.visible_ssbo);
         if (existing->second.indirect_buffer) gl_->glDeleteBuffers(1, &existing->second.indirect_buffer);
         models_gpu_.erase(existing);
@@ -695,7 +843,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
     gl_->glCreateBuffers(1, &m.vbo);
     gl_->glCreateBuffers(1, &m.ebo);
 
-    const size_t vb_bytes = data.vertices.size() * sizeof(float);
+    const size_t vb_bytes = data.vertices.size();
     const size_t ib_bytes = data.indices.size()  * sizeof(uint32_t);
     m.vbo_capacity = std::max<size_t>(vb_bytes, 1);
     m.ebo_capacity = std::max<size_t>(ib_bytes, 1);
@@ -709,8 +857,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
 
     m.vbo_used = vb_bytes;
     m.ebo_used = ib_bytes;
-    m.vertex_count = static_cast<uint32_t>(
-        data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS);
+    m.vertex_count = static_cast<uint32_t>(vb_bytes / INSTANCED_VERTEX_STRIDE_BYTES);
     m.meshes = std::move(data.meshes);
     m.instances = std::move(data.instances);
 
@@ -728,7 +875,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
         std::memcpy(dst.transform, src.transform, sizeof(dst.transform));
         dst.object_id = src.object_id;
         dst.color_override_rgba8 = src.color_override_rgba8;
-        dst._pad0 = 0;
+        dst.mesh_id = src.mesh_id;
         dst._pad1 = 0;
     }
     gl_->glCreateBuffers(1, &m.ssbo);
@@ -738,6 +885,30 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
     }
     m.ssbo_instance_count = static_cast<uint32_t>(gpu.size());
 
+    // Build and upload the per-mesh quantization SSBO from cached meshes.
+    {
+        std::vector<MeshGpu> mesh_gpu(m.meshes.size());
+        for (size_t i = 0; i < m.meshes.size(); ++i) {
+            for (int a = 0; a < 3; ++a) {
+                mesh_gpu[i].aabb_min[a] = m.meshes[i].local_aabb_min[a];
+                mesh_gpu[i].aabb_max[a] = m.meshes[i].local_aabb_max[a];
+            }
+            mesh_gpu[i].aabb_min[3] = 0.0f;
+            mesh_gpu[i].aabb_max[3] = 0.0f;
+        }
+        const size_t mg_bytes = mesh_gpu.size() * sizeof(MeshGpu);
+        gl_->glCreateBuffers(1, &m.mesh_info_ssbo);
+        if (mg_bytes > 0) {
+            gl_->glNamedBufferStorage(m.mesh_info_ssbo, mg_bytes,
+                                      mesh_gpu.data(), GL_DYNAMIC_STORAGE_BIT);
+            m.mesh_info_capacity = mg_bytes;
+        } else {
+            gl_->glNamedBufferStorage(m.mesh_info_ssbo, sizeof(MeshGpu),
+                                      nullptr, GL_DYNAMIC_STORAGE_BIT);
+            m.mesh_info_capacity = sizeof(MeshGpu);
+        }
+    }
+
     // Recompute the reflection flag from each instance's transform — the
     // sidecar only caches InstanceCpu, not the parallel reflection flags.
     m.instance_reflected.resize(m.instances.size());
@@ -754,7 +925,7 @@ void ViewportWindow::applyCachedModel(uint32_t model_id, SidecarData data) {
 
     qDebug("Sidecar apply: model %u  %zu verts, %zu meshes, %zu instances  "
            "%.1f MB vram (vbo %.1f + ebo %.1f + ssbo %.1f)",
-           model_id, data.vertices.size() / INSTANCED_VERTEX_STRIDE_FLOATS,
+           model_id, vb_bytes / INSTANCED_VERTEX_STRIDE_BYTES,
            models_gpu_[model_id].meshes.size(),
            models_gpu_[model_id].instances.size(),
            (vb_bytes + ib_bytes + ssbo_bytes) / (1024.0*1024.0),
@@ -803,6 +974,7 @@ void ViewportWindow::resetScene() {
         if (m.vbo)  gl_->glDeleteBuffers(1, &m.vbo);
         if (m.ebo)  gl_->glDeleteBuffers(1, &m.ebo);
         if (m.ssbo) gl_->glDeleteBuffers(1, &m.ssbo);
+        if (m.mesh_info_ssbo) gl_->glDeleteBuffers(1, &m.mesh_info_ssbo);
         if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
         if (m.indirect_buffer) gl_->glDeleteBuffers(1, &m.indirect_buffer);
     }
@@ -839,6 +1011,7 @@ void ViewportWindow::removeModel(uint32_t model_id) {
         if (it->second.vbo)  gl_->glDeleteBuffers(1, &it->second.vbo);
         if (it->second.ebo)  gl_->glDeleteBuffers(1, &it->second.ebo);
         if (it->second.ssbo) gl_->glDeleteBuffers(1, &it->second.ssbo);
+        if (it->second.mesh_info_ssbo) gl_->glDeleteBuffers(1, &it->second.mesh_info_ssbo);
         if (it->second.visible_ssbo) gl_->glDeleteBuffers(1, &it->second.visible_ssbo);
         if (it->second.indirect_buffer) gl_->glDeleteBuffers(1, &it->second.indirect_buffer);
         models_gpu_.erase(it);
@@ -1455,6 +1628,7 @@ void ViewportWindow::render() {
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, m.mesh_info_ssbo);
         gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
 
         uint32_t fwd = m.indirect_forward_count;
@@ -1622,6 +1796,7 @@ void ViewportWindow::renderPickPass() {
         gl_->glBindVertexArray(m.vao);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m.ssbo);
         gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, m.visible_ssbo);
+        gl_->glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, m.mesh_info_ssbo);
         gl_->glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m.indirect_buffer);
 
         const uint32_t fwd = m.indirect_forward_count;
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index 0a95ede0775..ed6668cc116 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -60,6 +60,8 @@ struct ModelGpuData {
     GLuint vbo = 0;
     GLuint ebo = 0;
     GLuint ssbo = 0;
+    GLuint mesh_info_ssbo = 0;   // MeshGpu[] — per-mesh quantization basis
+    size_t mesh_info_capacity = 0;  // bytes
 
     size_t vbo_capacity = 0;
     size_t ebo_capacity = 0;

From 036864c7197b2097f09b78e447d6f6e5411bfdc6 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Tue, 14 Apr 2026 21:55:14 +1000
Subject: [PATCH 32/37] =?UTF-8?q?ifcviewer:=20README=20=E2=80=94=20documen?=
 =?UTF-8?q?t=20event-driven=20rendering=20and=20VBO=20quantization?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the event-driven rendering bullet (zero idle cost, in-render frame
timing) and roadmap entries for VBO quantization and event-driven
rendering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index be4a69ec42b..01ceef7ecfd 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -78,6 +78,14 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
   through load.
 - **Non-blocking sidecar loading**: sidecars are read on a background
   thread; only the final GPU upload touches the main thread.
+- **Event-driven rendering:** no continuous render timer. Frames are
+  scheduled via `QWindow::requestUpdate()` only when something changes
+  (camera move, streaming chunk, hover, settings). When the camera and
+  scene are idle the cull pass and HiZ readback are skipped entirely
+  and the main thread blocks in the Qt event loop — the viewer costs
+  zero CPU/GPU on a static scene. FPS is still reported accurately
+  because frame cost is measured *inside* `render()`, not as wall-clock
+  between frames.
 - **GPU object picking**: a second render pass writes object IDs into an
   R32UI framebuffer. Click reads back one pixel. No CPU-side raycasting.
 - **Multi-model support**: multiple IFCs can be loaded simultaneously.
@@ -728,6 +736,8 @@ multi-million + occluders       redundant rasterisation Phase 3C HiZ (done, CPU
 - [x] Phase 3A — screen-space contribution culling
 - [x] Phase 3B — distance / contribution LOD (meshoptimizer `simplifySloppy`)
 - [x] Phase 3C — Hierarchical-Z occlusion culling (v1, CPU-side readback)
+- [x] Quantized VBO (16 B/vert, sidecar v6)
+- [x] Event-driven rendering (zero idle CPU/GPU, cull skipped on still frames)
 - [ ] **Phase 3D — GPU-side compute-shader culling** (next; replaces the readback)
 - [ ] Vulkan/MoltenVK backend for macOS
 - [ ] Embedded Python scripting console

From f243f804da9aaedc54ef13ac100e3c58f6219c5b Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Wed, 15 Apr 2026 15:18:51 +1000
Subject: [PATCH 33/37] ifcviewer: parallel per-model CPU cull

Split cullAndUploadVisible into cullModelCpu (CPU-only, thread-safe) and
uploadCullResults (GL-only, main thread). render() fans the per-model
culls out via std::async and joins before the serial upload pass.

The cull scratch (vis_fwd/rev_lod0/1, visible_flat, indirect_scratch)
moved onto ModelGpuData so each worker owns its output buffers. Phase
timers and hiz_reject_count_ are atomic since workers fetch_add into
them. A new wall-clock timer around the dispatch block reports the
actual frame-time contribution; the existing clr/trv/emt counters are
now documented as per-thread sums.

Measured on the 18-model / 569k-instance test scene: wall-clock cull
dropped from ~25 ms to ~5 ms while the aggregate CPU work (trv) stayed
~30 ms. Frame time 34 ms -> 19 ms. IFC_CULL_THREADS=0 forces the
single-threaded fallback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 135 +++++++++++++++++++++----------
 src/ifcviewer/ViewportWindow.h   |  54 ++++++++-----
 2 files changed, 127 insertions(+), 62 deletions(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 4731a431867..66525e28cf5 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -1292,6 +1292,12 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
 
 void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6][4],
                                           float focal_px, float min_pixel_radius) {
+    cullModelCpu(m, planes, focal_px, min_pixel_radius);
+    uploadCullResults(m);
+}
+
+void ViewportWindow::cullModelCpu(ModelGpuData& m, const float planes[6][4],
+                                  float focal_px, float min_pixel_radius) {
     // Per-mesh scratch, split by winding × LOD.  Winding split lets the draw
     // pass toggle glFrontFace once between two MDI calls so GL_CULL_FACE does
     // the right thing for both.  LOD split means instances that want the
@@ -1303,15 +1309,15 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     auto resize_if = [&](std::vector<std::vector<uint32_t>>& v) {
         if (v.size() < m.meshes.size()) v.resize(m.meshes.size());
     };
-    resize_if(visible_by_mesh_fwd_lod0_);
-    resize_if(visible_by_mesh_fwd_lod1_);
-    resize_if(visible_by_mesh_rev_lod0_);
-    resize_if(visible_by_mesh_rev_lod1_);
+    resize_if(m.vis_fwd_lod0);
+    resize_if(m.vis_fwd_lod1);
+    resize_if(m.vis_rev_lod0);
+    resize_if(m.vis_rev_lod1);
     for (size_t i = 0; i < m.meshes.size(); ++i) {
-        visible_by_mesh_fwd_lod0_[i].clear();
-        visible_by_mesh_fwd_lod1_[i].clear();
-        visible_by_mesh_rev_lod0_[i].clear();
-        visible_by_mesh_rev_lod1_[i].clear();
+        m.vis_fwd_lod0[i].clear();
+        m.vis_fwd_lod1[i].clear();
+        m.vis_rev_lod0[i].clear();
+        m.vis_rev_lod1[i].clear();
     }
     cull_clear_ns_ += phase_timer.nsecsElapsed();
     phase_timer.restart();
@@ -1394,7 +1400,7 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         if (!aabbInFrustum(item.aabb_min, item.aabb_max, planes)) return;
         if (!contributionPasses(item.aabb_min, item.aabb_max)) return;
         if (hiz_on && aabbOccludedByHiz(item.aabb_min, item.aabb_max)) {
-            ++hiz_reject_count_;
+            hiz_reject_count_.fetch_add(1, std::memory_order_relaxed);
             return;
         }
         // Survivor — now pay the wide-struct fetch for mesh_id.
@@ -1407,10 +1413,10 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         const bool reflected = inst_idx < m.instance_reflected.size()
             && m.instance_reflected[inst_idx] != 0;
         auto& bucket =
-            reflected ? (want_lod1 ? visible_by_mesh_rev_lod1_
-                                   : visible_by_mesh_rev_lod0_)
-                      : (want_lod1 ? visible_by_mesh_fwd_lod1_
-                                   : visible_by_mesh_fwd_lod0_);
+            reflected ? (want_lod1 ? m.vis_rev_lod1
+                                   : m.vis_rev_lod0)
+                      : (want_lod1 ? m.vis_fwd_lod1
+                                   : m.vis_fwd_lod0);
         bucket[inst.mesh_id].push_back(inst_idx);
     };
 
@@ -1456,8 +1462,8 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     // rev fills [indirect_forward_count, end).  LOD0/LOD1 within a winding
     // slice are contiguous — winding is what requires glFrontFace to flip
     // between MDI calls, LOD is not.
-    visible_flat_.clear();
-    indirect_scratch_.clear();
+    m.visible_flat.clear();
+    m.indirect_scratch.clear();
 
     auto emit_slice = [&](std::vector<std::vector<uint32_t>>& by_mesh, int lod) {
         for (size_t mi = 0; mi < m.meshes.size(); ++mi) {
@@ -1474,25 +1480,25 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
             cmd.instanceCount = vis_count;
             cmd.firstIndex    = ebo_off / sizeof(uint32_t);
             cmd.baseVertex    = mesh.vbo_byte_offset / INSTANCED_VERTEX_STRIDE_BYTES;
-            cmd.baseInstance  = static_cast<uint32_t>(visible_flat_.size());
-            indirect_scratch_.push_back(cmd);
+            cmd.baseInstance  = static_cast<uint32_t>(m.visible_flat.size());
+            m.indirect_scratch.push_back(cmd);
 
-            visible_flat_.insert(visible_flat_.end(),
-                                 by_mesh[mi].begin(), by_mesh[mi].end());
+            m.visible_flat.insert(m.visible_flat.end(),
+                                  by_mesh[mi].begin(), by_mesh[mi].end());
         }
     };
 
-    emit_slice(visible_by_mesh_fwd_lod0_, 0);
-    emit_slice(visible_by_mesh_fwd_lod1_, 1);
-    m.indirect_forward_count = static_cast<uint32_t>(indirect_scratch_.size());
-    emit_slice(visible_by_mesh_rev_lod0_, 0);
-    emit_slice(visible_by_mesh_rev_lod1_, 1);
-    m.indirect_command_count = static_cast<uint32_t>(indirect_scratch_.size());
+    emit_slice(m.vis_fwd_lod0, 0);
+    emit_slice(m.vis_fwd_lod1, 1);
+    m.indirect_forward_count = static_cast<uint32_t>(m.indirect_scratch.size());
+    emit_slice(m.vis_rev_lod0, 0);
+    emit_slice(m.vis_rev_lod1, 1);
+    m.indirect_command_count = static_cast<uint32_t>(m.indirect_scratch.size());
 
     // Per-model stats snapshot — summed into the frame counters regardless
     // of whether this frame ran a full cull or reused the cached one.
     uint32_t model_vis_obj = 0, model_vis_tri = 0;
-    for (const auto& cmd : indirect_scratch_) {
+    for (const auto& cmd : m.indirect_scratch) {
         model_vis_tri += (cmd.count / 3) * cmd.instanceCount;
         model_vis_obj += cmd.instanceCount;
     }
@@ -1500,10 +1506,14 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
     m.cached_visible_triangles = model_vis_tri;
 
     cull_emit_ns_ += phase_timer.nsecsElapsed();
-    phase_timer.restart();
+}
+
+void ViewportWindow::uploadCullResults(ModelGpuData& m) {
+    QElapsedTimer phase_timer;
+    phase_timer.start();
 
     // Upload visible list (keep binding alive even when empty).
-    size_t vis_bytes = std::max<size_t>(visible_flat_.size() * sizeof(uint32_t),
+    size_t vis_bytes = std::max<size_t>(m.visible_flat.size() * sizeof(uint32_t),
                                         sizeof(uint32_t));
     if (m.visible_ssbo == 0 || m.visible_ssbo_capacity < vis_bytes) {
         if (m.visible_ssbo) gl_->glDeleteBuffers(1, &m.visible_ssbo);
@@ -1513,13 +1523,13 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         gl_->glNamedBufferStorage(m.visible_ssbo, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);
         m.visible_ssbo_capacity = new_cap;
     }
-    if (!visible_flat_.empty()) {
+    if (!m.visible_flat.empty()) {
         gl_->glNamedBufferSubData(m.visible_ssbo, 0,
-            visible_flat_.size() * sizeof(uint32_t), visible_flat_.data());
+            m.visible_flat.size() * sizeof(uint32_t), m.visible_flat.data());
     }
 
     // Upload indirect command buffer.
-    size_t ind_bytes = indirect_scratch_.size() * sizeof(DrawElementsIndirectCommand);
+    size_t ind_bytes = m.indirect_scratch.size() * sizeof(DrawElementsIndirectCommand);
     if (ind_bytes == 0) {
         cull_upload_ns_ += phase_timer.nsecsElapsed();
         return;
@@ -1532,7 +1542,7 @@ void ViewportWindow::cullAndUploadVisible(ModelGpuData& m, const float planes[6]
         gl_->glNamedBufferStorage(m.indirect_buffer, new_cap, nullptr, GL_DYNAMIC_STORAGE_BIT);
         m.indirect_capacity = new_cap;
     }
-    gl_->glNamedBufferSubData(m.indirect_buffer, 0, ind_bytes, indirect_scratch_.data());
+    gl_->glNamedBufferSubData(m.indirect_buffer, 0, ind_bytes, m.indirect_scratch.data());
     cull_upload_ns_ += phase_timer.nsecsElapsed();
 }
 
@@ -1608,7 +1618,7 @@ void ViewportWindow::render() {
         && last_cull_proj_ == proj_matrix_;
     const bool cull_this_frame = !camera_unchanged;
     if (cull_this_frame) {
-        hiz_reject_count_ = 0;
+        hiz_reject_count_.store(0, std::memory_order_relaxed);
     } else {
         ++cull_skipped_frames_;
     }
@@ -1617,11 +1627,47 @@ void ViewportWindow::render() {
     // back and forth.  Harmless when culling is off.
     gl_->glFrontFace(GL_CCW);
 
+    // Parallel cull: each model's CPU cull is independent (no shared mutable
+    // state other than the atomic timing and HiZ-reject counters), so we fan
+    // them out to std::async and join before the (serial, GL-touching) upload
+    // pass.  IFC_CULL_THREADS=0 forces the single-threaded fallback.
+    static const bool mt_cull_enabled = []{
+        const char* e = std::getenv("IFC_CULL_THREADS");
+        return !(e && e[0] == '0');
+    }();
+    QElapsedTimer cull_wall_timer;
+    if (cull_this_frame) {
+        cull_wall_timer.start();
+        std::vector<ModelGpuData*> cull_targets;
+        cull_targets.reserve(models_gpu_.size());
+        for (auto& [mid, m] : models_gpu_) {
+            if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
+            cull_targets.push_back(&m);
+        }
+        if (mt_cull_enabled && cull_targets.size() > 1) {
+            std::vector<std::future<void>> futs;
+            futs.reserve(cull_targets.size());
+            for (ModelGpuData* mp : cull_targets) {
+                const float mpr = min_pixel_radius;
+                futs.emplace_back(std::async(std::launch::async,
+                    [this, mp, &planes, focal_px, mpr]() {
+                        cullModelCpu(*mp, planes, focal_px, mpr);
+                    }));
+            }
+            for (auto& f : futs) f.get();
+        } else {
+            for (ModelGpuData* mp : cull_targets) {
+                cullModelCpu(*mp, planes, focal_px, min_pixel_radius);
+            }
+        }
+        cull_wall_ns_ += cull_wall_timer.nsecsElapsed();
+    }
+
     for (auto& [model_id, m] : models_gpu_) {
         if (m.hidden || !m.ssbo || m.ssbo_instance_count == 0) continue;
 
         if (cull_this_frame) {
-            cullAndUploadVisible(m, planes, focal_px, min_pixel_radius);
+            uploadCullResults(m);
         }
         if (m.indirect_command_count == 0) continue;
 
@@ -1741,24 +1787,29 @@ void ViewportWindow::render() {
 
         const double inv_frames = frames_in_window > 0
             ? 1.0 / static_cast<double>(frames_in_window) : 0.0;
-        const double clr_ms = cull_clear_ns_    * 1e-6 * inv_frames;
-        const double trv_ms = cull_traverse_ns_ * 1e-6 * inv_frames;
-        const double emt_ms = cull_emit_ns_     * 1e-6 * inv_frames;
-        const double upl_ms = cull_upload_ns_   * 1e-6 * inv_frames;
-        cull_clear_ns_ = cull_traverse_ns_ = cull_emit_ns_ = cull_upload_ns_ = 0;
+        const double clr_ms = cull_clear_ns_.load()    * 1e-6 * inv_frames;
+        const double trv_ms = cull_traverse_ns_.load() * 1e-6 * inv_frames;
+        const double emt_ms = cull_emit_ns_.load()     * 1e-6 * inv_frames;
+        const double upl_ms = cull_upload_ns_.load()   * 1e-6 * inv_frames;
+        const double wall_ms = cull_wall_ns_           * 1e-6 * inv_frames;
+        cull_clear_ns_.store(0);
+        cull_traverse_ns_.store(0);
+        cull_emit_ns_.store(0);
+        cull_upload_ns_.store(0);
+        cull_wall_ns_ = 0;
         const uint32_t skipped = cull_skipped_frames_;
         cull_skipped_frames_ = 0;
 
         qDebug("[frame] %.1f fps  %.2f ms  obj %u/%u  tri %u/%u  "
                "meshes %u  gl_draws %u  sub_draws %u  hiz_rej %u  "
-               "cull[clr %.2f trv %.2f emt %.2f upl %.2f]ms  skipped %u/%u  "
+               "cull[wall %.2f | work: clr %.2f trv %.2f emt %.2f upl %.2f]ms  skipped %u/%u  "
                "vram %.1f MB (vbo %.1f + ebo %.1f + ssbo %.1f)  models %zu (%zu hidden)",
                last_fps_, 1000.0f / last_fps_,
                visible_objects_, total_obj,
                visible_triangles_, total_tri,
                total_meshes, gl_draw_calls_, indirect_sub_draws_,
-               hiz_reject_count_,
-               clr_ms, trv_ms, emt_ms, upl_ms,
+               hiz_reject_count_.load(),
+               wall_ms, clr_ms, trv_ms, emt_ms, upl_ms,
                skipped, frames_in_window,
                (total_vbo + total_ebo + total_ssbo) / (1024.0*1024.0),
                total_vbo / (1024.0*1024.0),
diff --git a/src/ifcviewer/ViewportWindow.h b/src/ifcviewer/ViewportWindow.h
index ed6668cc116..30b9e8cfa19 100644
--- a/src/ifcviewer/ViewportWindow.h
+++ b/src/ifcviewer/ViewportWindow.h
@@ -32,6 +32,8 @@
 #include <cstdint>
 #include <mutex>
 #include <memory>
+#include <atomic>
+#include <future>
 
 #include "BvhAccel.h"
 #include "InstancedGeometry.h"
@@ -104,6 +106,15 @@ struct ModelGpuData {
     uint32_t indirect_command_count = 0;  // total valid commands this frame
     uint32_t indirect_forward_count = 0;  // first N are CCW-winding draws
 
+    // Per-model cull scratch — owned by the model so each cull job runs
+    // without sharing mutable state.  Four buckets = {fwd, rev} × {LOD0, LOD1}.
+    std::vector<std::vector<uint32_t>>       vis_fwd_lod0;
+    std::vector<std::vector<uint32_t>>       vis_fwd_lod1;
+    std::vector<std::vector<uint32_t>>       vis_rev_lod0;
+    std::vector<std::vector<uint32_t>>       vis_rev_lod1;
+    std::vector<uint32_t>                    visible_flat;
+    std::vector<DrawElementsIndirectCommand> indirect_scratch;
+
     bool finalized = false;
     bool hidden    = false;
 };
@@ -215,6 +226,18 @@ class ViewportWindow : public QWindow {
     void cullAndUploadVisible(ModelGpuData& m, const float planes[6][4],
                               float focal_px, float min_pixel_radius);
 
+    // Thread-safe: CPU-only cull (frustum + contribution + HiZ + bucketing +
+    // emit).  Writes survivors into m.vis_* / m.visible_flat / m.indirect_scratch
+    // and sets m.indirect_forward_count / m.indirect_command_count /
+    // m.cached_visible_*.  Touches no GL state and no ViewportWindow mutable
+    // state other than the atomic counters below — safe to run on a worker.
+    void cullModelCpu(ModelGpuData& m, const float planes[6][4],
+                      float focal_px, float min_pixel_radius);
+
+    // Main-thread only: uploads m.visible_flat / m.indirect_scratch into the
+    // model's SSBO + indirect buffer, growing them if needed.
+    void uploadCullResults(ModelGpuData& m);
+
     // Mouse interaction
     void handleMousePress(QMouseEvent* event);
     void handleMouseRelease(QMouseEvent* event);
@@ -268,17 +291,23 @@ class ViewportWindow : public QWindow {
     std::vector<uint32_t> hiz_mip_h_;
     QMatrix4x4            hiz_vp_;
     bool                  hiz_vp_valid_ = false;
-    uint32_t              hiz_reject_count_ = 0;  // per-frame stat
+    std::atomic<uint32_t> hiz_reject_count_{0};  // per-frame stat
 
     // Cull-phase timers.  Accumulated across all frames in the current
     // 1-second stats window; divided by frame_count_ at print time to
     // give per-frame average ms.  Reset each window.  Lets us see where
     // CPU time actually goes: bucket clears vs BVH traversal vs emit vs
     // GPU upload.
-    uint64_t cull_clear_ns_    = 0;
-    uint64_t cull_traverse_ns_ = 0;
-    uint64_t cull_emit_ns_     = 0;
-    uint64_t cull_upload_ns_   = 0;
+    // Atomic so parallel cull workers can fetch_add into them without
+    // contending on a lock.  clr/trv/emt are SUMS across all worker threads
+    // for the frame — they describe total CPU work, not wall-clock.  The
+    // wall counter is measured once around the dispatch block in render()
+    // and is what actually determines frame time.
+    std::atomic<uint64_t> cull_clear_ns_{0};
+    std::atomic<uint64_t> cull_traverse_ns_{0};
+    std::atomic<uint64_t> cull_emit_ns_{0};
+    std::atomic<uint64_t> cull_upload_ns_{0};
+    uint64_t              cull_wall_ns_ = 0;    // main-thread only
     uint32_t cull_skipped_frames_ = 0;
 
     // Skip cullAndUploadVisible + buildHizPyramid when the camera and scene
@@ -295,21 +324,6 @@ class ViewportWindow : public QWindow {
     uint32_t gl_draw_calls_ = 0;
     uint32_t indirect_sub_draws_ = 0;
 
-    // Reused scratch: visible-instance index lists per mesh, flattened into
-    // `visible_flat_` for upload.  Both live in the parent object to avoid
-    // per-frame allocation.  indirect_scratch_ is the matching array of
-    // DrawElementsIndirectCommand records — forward-declared as bytes so
-    // the header doesn't need the struct definition.
-    // Four buckets = {fwd, rev} × {LOD0, LOD1}.  LOD1 buckets are only
-    // populated when the mesh has lod1_index_count > 0 and the projected
-    // pixel radius is below the LOD switch threshold.
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_lod0_;
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_fwd_lod1_;
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_lod0_;
-    std::vector<std::vector<uint32_t>>     visible_by_mesh_rev_lod1_;
-    std::vector<uint32_t>                  visible_flat_;
-    std::vector<DrawElementsIndirectCommand> indirect_scratch_;
-
     // Camera
     QVector3D camera_target_{0, 0, 0};
     QVector3D camera_eye_{0, 0, 0};      // world-space eye, set in updateCamera

From c0e99a19fd2498612aafc7c3688942273f9b963d Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Wed, 15 Apr 2026 15:29:21 +1000
Subject: [PATCH 34/37] =?UTF-8?q?ifcviewer:=20README=20=E2=80=94=20documen?=
 =?UTF-8?q?t=20parallel=20per-model=20cull=20(Phase=203D)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the parallel cull bullet to the feature list, a Phase 3D section
explaining the fan-out / scratch-ownership design + measured 4x
speedup, and renumber the planned GPU compute cull to Phase 3E so it
can cite 3D as the CPU algorithm being ported.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/README.md | 44 +++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 01ceef7ecfd..8c09ed661f7 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -60,6 +60,13 @@ engine with a Qt6 interface and OpenGL 4.5 rendering.
 - **BVH frustum culling over instances**: per-model BVH trees cull whole
   subtrees of placements with one frustum test. Falls back to a linear scan
   during progressive upload and for very small models (< 32 instances).
+- **Parallel per-model cull:** each model's CPU cull (frustum + contribution
+  + HiZ + bucketing + indirect-command emit) is independent, so `render()`
+  fans them out via `std::async` and joins before the serial GL-upload
+  pass. On an 18-model scene this took wall-clock cull from ~25 ms to
+  ~5 ms. The cull scratch buffers live on `ModelGpuData` so each worker
+  owns its output storage; phase-timer counters are atomic for the same
+  reason. `IFC_CULL_THREADS=0` forces single-threaded fallback.
 - **Reflection-aware two-pass draw:** IFC placements can have negative-
   determinant transforms (mirrored families). These flip the screen-space
   winding of their triangles, which would make them vanish under
@@ -692,13 +699,35 @@ thousands and the frame time drops accordingly.
 - **Transparent geometry would need special handling**, but the
   current renderer doesn't have any, so no-op for now.
 
-#### 3D. GPU-side culling via compute (longer-term)
+#### 3D. Parallel per-model cull (CPU, done)
+
+A cheaper intermediate step before going full-GPU: each model's cull is
+independent (no shared mutable state beyond atomic stat counters), so
+`render()` fans the per-model culls out as `std::async` tasks and joins
+before the serial GL-upload pass. On the 18-model / 569 k-instance test
+scene this took the cull from ~25 ms wall-clock to ~5 ms — roughly a 4×
+speedup on an 8-core machine, tracking `std::thread::hardware_concurrency()`
+up to the model count. Load balancing is static (one job per model); a
+single massive model still bottlenecks to single-threaded speed and would
+need intra-model partitioning, but in practice BIM projects are
+multi-discipline so the coarse partition lands well.
+
+The stats line now reports `cull[wall X | work: clr Y trv Z emt W upl U]`:
+`wall` is the wall-clock cost of the cull dispatch; the `work` numbers are
+summed across all worker threads and show where CPU cycles went.
+`IFC_CULL_THREADS=0` forces single-threaded mode for comparison.
+
+#### 3E. GPU-side culling via compute (longer-term)
 
 Push the cull loop to a compute shader reading the per-instance SSBO +
 frustum planes + HiZ pyramid, emitting the visible list and indirect
-commands with atomic counters. Eliminates all CPU→GPU per-frame bytes
-and lets 3C scale to millions of instances. Worth doing once 3A–3C
-have stabilised the CPU-side algorithm we'd be porting.
+commands with atomic counters. Three compute dispatches per model: (1)
+count survivors per `(mesh, winding, LOD)` bucket, (2) prefix-sum the
+counts into `baseInstance` offsets and write the indirect command buffer,
+(3) re-test and compact survivors into the dense visible list. HiZ moves
+to a GPU depth texture sampled directly in the shader, eliminating the
+Phase 3C readback. Lets culling scale to millions of instances and
+single-model scenes where Phase 3D can't parallelise.
 
 ### Planned follow-ups (post-Phase-3)
 
@@ -716,6 +745,8 @@ Scene size                      Bottleneck              Fix
 500k+ tris / overview shot      GPU vertex + raster     Phase 3A contribution cull
                                                         + Phase 3B LOD (done)
 multi-million + occluders       redundant rasterisation Phase 3C HiZ (done, CPU readback)
+many models, serial cull        single-thread BVH trv   Phase 3D parallel cull (done)
+single giant model / <18 cores  CPU BVH trv             Phase 3E GPU cull (planned)
 ```
 
 ## Roadmap
@@ -732,12 +763,13 @@ multi-million + occluders       redundant rasterisation Phase 3C HiZ (done, CPU
 - [x] Reflection-aware two-pass draw for mirrored placements
 - [x] Backface culling (user-toggleable, default on)
 - [x] `reorient-shells` enabled in iterator
-- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`, `IFC_LOD1_PX`, `IFC_NO_HIZ`, `IFC_HIZ_SIZE`)
+- [x] Perf diagnostic env vars (`IFC_SKIP_MDI`, `IFC_MAX_SUBDRAWS`, `IFC_MIN_PX`, `IFC_LOD1_PX`, `IFC_NO_HIZ`, `IFC_HIZ_SIZE`, `IFC_CULL_THREADS`)
 - [x] Phase 3A — screen-space contribution culling
 - [x] Phase 3B — distance / contribution LOD (meshoptimizer `simplifySloppy`)
 - [x] Phase 3C — Hierarchical-Z occlusion culling (v1, CPU-side readback)
+- [x] Phase 3D — Parallel per-model CPU cull (`std::async` fan-out)
 - [x] Quantized VBO (16 B/vert, sidecar v6)
 - [x] Event-driven rendering (zero idle CPU/GPU, cull skipped on still frames)
-- [ ] **Phase 3D — GPU-side compute-shader culling** (next; replaces the readback)
+- [ ] **Phase 3E — GPU-side compute-shader culling** (next; replaces the HiZ readback)
 - [ ] Vulkan/MoltenVK backend for macOS
 - [ ] Embedded Python scripting console

From 6b496d802dc7102c4a1abc16ee959c42cf0ff854 Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Wed, 15 Apr 2026 17:46:47 +1000
Subject: [PATCH 35/37] ifcviewer: disable HiZ cull when camera has moved
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HiZ from last frame encodes depth from last frame's viewpoint. When
the camera moves, projecting a current-frame AABB through the stored
VP answers 'was this occluded last frame?' rather than 'is it occluded
now?' — a self-reinforcing feedback loop where objects culled in
prior frames never appear in any depth buffer and stay permanently
hidden at certain camera angles.

Fix: require hiz_vp_ == current VP for the HiZ test to apply. HiZ
still helps static views (kicks in one frame after camera stops) but
no longer produces false occlusions during orbit. The correct fix for
orbit coverage is a depth pre-pass feeding fresh HiZ — planned as
part of Phase 3E GPU compute cull.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/ifcviewer/ViewportWindow.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index 66525e28cf5..e96b45f9094 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -1388,7 +1388,21 @@ void ViewportWindow::cullModelCpu(ModelGpuData& m, const float planes[6][4],
     // HiZ occlusion is skipped entirely when the pick pass runs
     // (min_pixel_radius == 0 on that path), when the user disables it via
     // env var, or before the first pyramid has been built.
-    const bool hiz_on = hizEnabled() && min_pixel_radius > 0.0f && hiz_vp_valid_;
+    //
+    // Crucially, HiZ is also skipped when the stored VP (hiz_vp_, captured at
+    // the end of the previous frame) differs from this frame's VP — i.e.
+    // whenever the camera has moved.  The stored depth buffer encodes what
+    // was visible from hiz_vp_'s viewpoint; projecting a current-frame AABB
+    // through that VP answers "was this occluded LAST frame?", which is only
+    // a correct proxy for "is this occluded NOW?" when the camera is static.
+    // Orbiting past a wall would otherwise leave objects persistently culled
+    // because prior frames' depth buffers only ever contained the wall (the
+    // objects behind it were themselves HiZ-culled, never drawn, so never in
+    // the buffer — a self-reinforcing feedback loop).  On static views HiZ
+    // kicks in after a single frame of lag.
+    const QMatrix4x4 current_vp = proj_matrix_ * view_matrix_;
+    const bool hiz_vp_matches = hiz_vp_valid_ && hiz_vp_ == current_vp;
+    const bool hiz_on = hizEnabled() && min_pixel_radius > 0.0f && hiz_vp_matches;
 
     // Hot path: read the AABB from the compact bvh_items array (28 B stride)
     // rather than the wide InstanceCpu (104 B stride).  Most instances fail

From 1caf4496113b727bf0caa4d3dd3e990c3eccebcb Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Wed, 15 Apr 2026 18:30:03 +1000
Subject: [PATCH 36/37] ifcviewer: fix pick-pass cull corruption and
 cached-model ID collisions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two stability bugs:

1. Clicking an object left the scene with wrong shading until the camera
   moved.  The pick pass re-culls every model with its own parameters
   (min_pixel_radius=0, no HiZ) and overwrites each model's visible_ssbo
   and indirect buffer.  The next render() saw an unchanged camera,
   skipped the cull via the have_cached_cull_ shortcut, and drew the
   stale pick-pass buffers.  Fix: invalidate have_cached_cull_ in
   pickObjectAt(), immediately after renderPickPass() returns.

2. Loading two sidecar-cached models made the second model's picked
   properties resolve to the first model's elements.  Sidecars store raw
   object_id / model_id values from the session that wrote them, and
   both files start at object_id=1, so element_map_ entries collided.
   Fix: on load, rebase every PackedElementInfo and InstanceCpu by
   (next_object_id_ - min_id_in_sidecar) and overwrite model_id with
   the freshly-assigned handle before the elements hit element_map_.

Also document both in the README — the pick-pass note under 3A
contribution culling, the sidecar rebase under the sidecar format
section.
---
 src/ifcviewer/MainWindow.cpp     | 27 ++++++++++++++++++++++-----
 src/ifcviewer/README.md          | 15 +++++++++++++++
 src/ifcviewer/ViewportWindow.cpp |  7 +++++++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/ifcviewer/MainWindow.cpp b/src/ifcviewer/MainWindow.cpp
index 0e8162f0436..e75f7cf0dd1 100644
--- a/src/ifcviewer/MainWindow.cpp
+++ b/src/ifcviewer/MainWindow.cpp
@@ -248,11 +248,28 @@ void MainWindow::applySidecarData(ModelId mid, SidecarData data) {
     QElapsedTimer t;
     t.start();
 
-    // Update next_object_id_ past all objects in this model before the
-    // extracted `elements` is moved out of `data`.
-    for (const auto& elem : data.elements) {
-        if (elem.object_id >= next_object_id_)
-            next_object_id_ = elem.object_id + 1;
+    // Sidecars store raw object_ids and model_ids from the session that wrote
+    // them.  On load we must rebase both onto the current session's ID space,
+    // or two cached models collide (both starting at object_id=1, both
+    // claiming the original model_id).  Offset by (next_object_id_ - min_oid)
+    // so the first cached object takes the next free slot.
+    uint32_t min_oid = UINT32_MAX;
+    for (const auto& pe : data.elements) {
+        if (pe.object_id < min_oid) min_oid = pe.object_id;
+    }
+    uint32_t oid_offset = 0;
+    if (!data.elements.empty() && min_oid < UINT32_MAX) {
+        oid_offset = next_object_id_ - min_oid;
+    }
+    for (auto& pe : data.elements) {
+        pe.object_id += oid_offset;
+        pe.model_id   = mid;
+        if (pe.object_id >= next_object_id_)
+            next_object_id_ = pe.object_id + 1;
+    }
+    for (auto& inst : data.instances) {
+        inst.object_id += oid_offset;
+        inst.model_id   = mid;
     }
 
     // Hand off geometry to GPU in a single call.
diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 8c09ed661f7..70540abfb19 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -307,6 +307,14 @@ uint32_t + char[]        string table
 Staleness check: `source_file_size` vs actual file size. Mismatched →
 reject and rebuild. Endianness marker rejects cross-arch caches.
 
+Sidecars store the raw `object_id` / `model_id` values from the session
+that wrote them. On load they are rebased onto the current session's ID
+space (`object_id += next_object_id_ - min_id_in_sidecar`, `model_id`
+overwritten with the freshly-assigned handle) before the elements hit
+`element_map_` or the viewport. Without this, two cached models loaded
+back-to-back collide — both start at `object_id=1` and the second model's
+property lookups return the first model's data.
+
 ### GPU Instancing pipeline (the central pillar)
 
 Everything above plugs into a single data-flow, worth documenting on its
@@ -451,6 +459,13 @@ and per-instance level. Short-circuits when the camera is inside the
 AABB so nothing-you're-standing-next-to is ever lost. Pick pass uses
 threshold 0 so sub-pixel objects remain clickable.
 
+Because the pick pass re-runs the cull with its own parameters (no
+contribution cull, no HiZ) and writes into each model's shared
+`visible_ssbo` / indirect buffer, `pickObjectAt()` must invalidate
+`have_cached_cull_` on exit. Otherwise the next `render()` sees an
+unchanged camera, skips the cull, and draws the pick-pass buffers —
+the user sees obviously-wrong shading until they nudge the camera.
+
 Sphere-based (centre = AABB midpoint, radius = half-diagonal,
 r_px = focal_px · radius / distance). Loses a little precision on
 very elongated bounds vs. 8-corner projection, but costs ~5× less per
diff --git a/src/ifcviewer/ViewportWindow.cpp b/src/ifcviewer/ViewportWindow.cpp
index e96b45f9094..a97714950a2 100644
--- a/src/ifcviewer/ViewportWindow.cpp
+++ b/src/ifcviewer/ViewportWindow.cpp
@@ -1282,6 +1282,13 @@ uint32_t ViewportWindow::pickObjectAt(int x, int y) {
 
     renderPickPass();
 
+    // The pick pass overwrote each model's visible_ssbo / indirect_buffer with
+    // pick-specific cull params (no contribution cull, no HiZ).  Invalidate
+    // the cached cull so the next render() rebuilds them with main-render
+    // params; otherwise the viewport draws with stale pick-pass buffers and
+    // shading looks wrong until the camera moves.
+    have_cached_cull_ = false;
+
     int px = x * devicePixelRatio();
     int py = (height() - y) * devicePixelRatio();
     uint32_t pixel = 0;

From 196f98440de93be1bb8a67ecf6164ca96377d5ee Mon Sep 17 00:00:00 2001
From: Dion Moult <dion@thinkmoult.com>
Date: Wed, 15 Apr 2026 18:31:17 +1000
Subject: [PATCH 37/37] =?UTF-8?q?ifcviewer:=20README=20=E2=80=94=20documen?=
 =?UTF-8?q?t=20HiZ=20disabled=20during=20camera=20motion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'Known caveats' bullet still described the old 1-frame-stale
behavior.  Since 6b496d802 the cull compares hiz_vp_ to the current VP
and drops HiZ rejection whenever they differ, so HiZ only helps on
still frames — orbiting gets no benefit.  Call out the tradeoff and
the planned same-frame-depth-pre-pass fix slated for Phase 3E.
---
 src/ifcviewer/README.md | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/ifcviewer/README.md b/src/ifcviewer/README.md
index 70540abfb19..77ffefc40e7 100644
--- a/src/ifcviewer/README.md
+++ b/src/ifcviewer/README.md
@@ -694,11 +694,19 @@ thousands and the frame time drops accordingly.
 
 ##### Known caveats
 
-- **1 frame stale.** The pyramid is aligned to last frame's view, so
-  when you whip the camera across the scene we may draw one frame of
-  stuff that the new view would have occluded. Invisible in practice
-  at 60 fps. We tried a 3-deep PBO ring for async readback (2-frame
-  stale) and it produced visible flicker on fast orbits — reverted.
+- **Disabled while the camera moves.** The pyramid is aligned to the
+  VP matrix of the frame that produced it. On a moving camera the
+  stored VP no longer matches the current one, and reusing it would
+  pop objects in and out as the stale depth falsely claims they're
+  occluded. The cull now compares `hiz_vp_ == current_vp` and drops
+  HiZ rejection entirely when they differ, so HiZ only contributes on
+  still frames. The honest cost: orbiting — the exact motion where
+  the frame rate tends to dip — gets no HiZ help. A proper fix needs
+  a same-frame depth pre-pass (draw cheap depth, build HiZ from *that*
+  frame's VP, then issue the colour pass against it); deferred to the
+  GPU-compute cull rewrite in Phase 3E where we're touching this code
+  anyway. We also tried a 3-deep PBO ring for async readback (2-frame
+  stale) which produced visible flicker on fast orbits — reverted.
 - **Readback syncs the GPU.** `glGetTextureImage` is blocking.
   Measured cost is well under a millisecond at 256×128; not a
   bottleneck on the machines tested. Phase 3D's compute-shader cull