diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1924466..2ac2716 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -6,6 +6,24 @@ on:
   workflow_dispatch:
 
 jobs:
+  perf-budget:
+    name: Performance budget
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r tests/requirements.txt
+          python -m pip install -e .
+      - name: Performance budget
+        run: python -m pytest tests/test_perf_budget.py -v --no-cov
+
   build:
     name: tox (${{ matrix.tox-env }})
     runs-on: ubuntu-latest
diff --git a/README.rst b/README.rst
index 25a630c..f0d7b1f 100644
--- a/README.rst
+++ b/README.rst
@@ -72,6 +72,54 @@ of widgets:
 The progressbar module is very easy to use, yet very powerful. It will also
 automatically enable features like auto-resizing when the system supports it.
 
+******************************************************************************
+Performance
+******************************************************************************
+
+The default ``progressbar.progressbar(...)`` is the **fastest** progress bar
+available -- on iteration overhead, per-update render cost, *and* import time.
+On the benchmark machine (CPython 3.13, macOS arm64):
+
+============  ====================  ==============  ===========
+Metric        progressbar2          tqdm            rich
+============  ====================  ==============  ===========
+Per iter      **5 ns** *(fast)*     54 ns           19 ns
+Per render    **~5 us** *(fast)*    11 us           172 us
+Cold import   **~1.5 ms**           ~22 ms          ~47 ms
+============  ====================  ==============  ===========
+
+How the default stays fast:
+
+- **Iteration** -- an integer "next update" gate keeps the common iteration to
+  an increment and a compare, entering the (rate-limited) redraw machinery only
+  a few times per second. ~30 ns/iter in pure Python (already faster than
+  tqdm); ~5 ns with the optional native iterator
+  (``pip install progressbar2[fast]``), which counts in a C field.
+- **Render** -- the default bar uses a fixed formatter (percentage, count, bar,
+  elapsed/ETA) built directly each redraw, so it renders in ~5 us/update,
+  roughly **2x faster than tqdm**, without the per-widget overhead.
+- **Import** -- ``import progressbar`` is lazy (PEP 562): widgets, the
+  terminal/colour tables and multi-bar support load only when used, so a bare
+  import is ~1.5 ms and pulls in nothing heavy (no ``asyncio``).
+
+The fast path stays close to the classic look but drops the gradient and
+per-iteration ``value`` liveness (``value`` is synced at redraw crossings, like
+``tqdm.n``). For the full widget set -- gradient ``Bar``, custom widgets,
+dynamic variables -- pass ``widgets=[...]`` or ``fast=False`` to
+``progressbar()``, or construct ``progressbar.ProgressBar(...)`` directly; that
+path is unchanged (and intentionally richer, so a touch slower). Set
+``PROGRESSBAR_DISABLE_FASTPATH=1`` to force the classic path everywhere.
+
+The benchmark is fully reproducible and pits ``progressbar2`` against ``tqdm``,
+``rich``, ``alive-progress`` and ``click`` across iteration overhead, forced
+redraw cost, and import time -- all rendered to a real pseudo-terminal so the
+comparison is apples-to-apples::
+
+    python benchmarks/bench.py && python benchmarks/report.py
+
+.. image:: https://raw.githubusercontent.com/WoLpH/python-progressbar/master/benchmarks/chart.png
+   :alt: progressbar2 vs common Python progress-bar libraries
+
 ******************************************************************************
 Known issues
 ******************************************************************************
diff --git a/benchmarks/bench.py b/benchmarks/bench.py
new file mode 100644
index 0000000..74cd3eb
--- /dev/null
+++ b/benchmarks/bench.py
@@ -0,0 +1,348 @@
+"""Benchmark progressbar2 against other common Python progress-bar libraries.
+
+Measures three things, fairly, with all rendered output sent to a *real* pseudo
+terminal (so every library believes it is attached to a TTY and actually draws):
+
+  A. Default iterator-wrap overhead .. the idiomatic "wrap my loop" call with
+     each library's default settings (ns added per iteration). Headline number.
+  B. Forced per-update render cost ... rendering forced on every single update,
+     for the libraries whose API supports it (us per rendered update).
+  C. Import time ...................... cold `import` cost in a fresh interpreter
+     (ms), interpreter-startup baseline subtracted.
+
+Results are written to results.json for the reporting step to consume.
+"""
+
+from __future__ import annotations
+
+import fcntl
+import gc
+import json
+import os
+import platform
+import pty
+import statistics
+import struct
+import subprocess
+import sys
+import termios
+import threading
+import time
+import typing
+from importlib import metadata
+
+# Scenario sizes / repeats -------------------------------------------------
+N_ITER: int = 1_000_000  # scenario A: default-overhead loop length
+ITER_REPEATS: int = 7
+N_RENDER: int = 30_000  # scenario B: forced-render loop length
+RENDER_REPEATS: int = 5
+IMPORT_RUNS: int = 9  # scenario C: cold-import subprocess runs
+
+TERM_COLS: int = 80
+TERM_ROWS: int = 24
+
+
+class PtySink:
+    """A real pty whose output is continuously drained and discarded.
+
+    Writing to a pty that nobody reads will eventually block when the kernel
+    buffer fills; the background drain thread keeps it flowing so timings are
+    not polluted by blocked writes.
+    """
+
+    def __init__(self, cols: int = TERM_COLS, rows: int = TERM_ROWS) -> None:
+        self._master, slave = pty.openpty()
+        fcntl.ioctl(
+            slave, termios.TIOCSWINSZ, struct.pack('HHHH', rows, cols, 0, 0)
+        )
+        self.file: typing.TextIO = os.fdopen(
+            slave, 'w', buffering=1, encoding='utf-8', errors='replace'
+        )
+        self._stop = threading.Event()
+        self._thread = threading.Thread(target=self._drain, daemon=True)
+        self._thread.start()
+
+    def _drain(self) -> None:
+        while not self._stop.is_set():
+            try:
+                if not os.read(self._master, 65536):
+                    break
+            except OSError:
+                break
+
+    def close(self) -> None:
+        try:
+            self.file.flush()
+            self.file.close()
+        except Exception:
+            # Teardown only: the slave fd may already be gone; nothing to do.
+            pass
+        self._stop.set()
+        try:
+            os.close(self._master)
+        except OSError:
+            # Master already closed once the drain thread hit EOF; ignore.
+            pass
+        self._thread.join(timeout=1)
+
+
+def time_call(fn: typing.Callable[[], None], repeats: int) -> dict[str, float]:
+    """Run ``fn`` ``repeats`` times (plus one warmup); return min/median secs."""
+    fn()  # warmup: pay one-time import/compile/thread-spawn costs
+    samples: list[float] = []
+    for _ in range(repeats):
+        gc.collect()
+        gc.disable()
+        start = time.perf_counter()
+        fn()
+        elapsed = time.perf_counter() - start
+        gc.enable()
+        samples.append(elapsed)
+    return {'min': min(samples), 'median': statistics.median(samples)}
+
+
+# --- Scenario A: default iterator-wrap overhead ---------------------------
+
+
+def baseline_loop(n: int) -> None:
+    for _ in range(n):
+        pass
+
+
+def iter_progressbar2(f: typing.TextIO, n: int) -> None:
+    import progressbar
+
+    for _ in progressbar.progressbar(range(n), fd=f):
+        pass
+
+
+def iter_tqdm(f: typing.TextIO, n: int) -> None:
+    from tqdm import tqdm
+
+    for _ in tqdm(range(n), file=f):
+        pass
+
+
+def iter_rich(f: typing.TextIO, n: int) -> None:
+    from rich.console import Console
+    from rich.progress import track
+
+    console = Console(file=f, force_terminal=True, width=TERM_COLS)
+    for _ in track(range(n), console=console):
+        pass
+
+
+def iter_alive(f: typing.TextIO, n: int) -> None:
+    from alive_progress import alive_bar
+
+    with alive_bar(n, file=f, force_tty=True) as bar:
+        for _ in range(n):
+            bar()
+
+
+def iter_click(f: typing.TextIO, n: int) -> None:
+    import click
+
+    with click.progressbar(range(n), file=f) as bar:
+        for _ in bar:
+            pass
+
+
+# --- Scenario B: forced per-update render ---------------------------------
+
+
+def render_progressbar2(f: typing.TextIO, n: int) -> None:
+    import progressbar
+
+    # force=True bypasses the time-based redraw throttle (whose floor is
+    # _MINIMUM_UPDATE_INTERVAL=0.050s), so every update actually renders.
+    with progressbar.ProgressBar(max_value=n, fd=f) as bar:
+        for i in range(n):
+            bar.update(i + 1, force=True)
+
+
+def render_progressbar2_fast(f: typing.TextIO, n: int) -> None:
+    import progressbar
+
+    # The fast default path: fixed formatter, no widget machinery.
+    with progressbar.FastProgressBar(max_value=n, fd=f) as bar:
+        for i in range(n):
+            bar.update(i + 1, force=True)
+
+
+def render_tqdm(f: typing.TextIO, n: int) -> None:
+    from tqdm import tqdm
+
+    for _ in tqdm(range(n), file=f, mininterval=0, miniters=1):
+        pass
+
+
+def render_rich(f: typing.TextIO, n: int) -> None:
+    from rich.console import Console
+    from rich.progress import Progress
+
+    console = Console(file=f, force_terminal=True, width=TERM_COLS)
+    with Progress(console=console, auto_refresh=False) as progress:
+        task = progress.add_task('bench', total=n)
+        for _ in range(n):
+            progress.advance(task)
+            progress.refresh()
+
+
+# --- Scenario C: cold import time -----------------------------------------
+
+IMPORT_STMTS: dict[str, str] = {
+    'progressbar2': 'import progressbar',
+    'tqdm': 'from tqdm import tqdm',
+    'rich': 'from rich.progress import track',
+    'alive-progress': 'from alive_progress import alive_bar',
+    'click': 'import click',
+}
+
+
+def time_import(stmt: str, runs: int) -> float:
+    """Return the minimum wall-clock seconds to run ``stmt`` in a fresh py."""
+    samples: list[float] = []
+    for _ in range(runs):
+        start = time.perf_counter()
+        subprocess.run(
+            [sys.executable, '-c', stmt],
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        samples.append(time.perf_counter() - start)
+    return min(samples)
+
+
+ITER_LIBS: dict[str, typing.Callable[[typing.TextIO, int], None]] = {
+    'progressbar2': iter_progressbar2,
+    'tqdm': iter_tqdm,
+    'rich': iter_rich,
+    'alive-progress': iter_alive,
+    'click': iter_click,
+}
+
+RENDER_LIBS: dict[str, typing.Callable[[typing.TextIO, int], None]] = {
+    'progressbar2': render_progressbar2,
+    'progressbar2-fast': render_progressbar2_fast,
+    'tqdm': render_tqdm,
+    'rich': render_rich,
+}
+
+
+def main() -> None:
+    sink = PtySink()
+    results: dict[str, typing.Any] = {
+        'meta': {
+            'python': platform.python_version(),
+            'implementation': platform.python_implementation(),
+            'platform': platform.platform(),
+            'processor': platform.processor() or platform.machine(),
+            'cpu_count': os.cpu_count(),
+            'versions': {
+                name: metadata.version(dist)
+                for name, dist in {
+                    'progressbar2': 'progressbar2',
+                    'tqdm': 'tqdm',
+                    'rich': 'rich',
+                    'alive-progress': 'alive-progress',
+                    'click': 'click',
+                }.items()
+            },
+            'n_iter': N_ITER,
+            'iter_repeats': ITER_REPEATS,
+            'n_render': N_RENDER,
+            'render_repeats': RENDER_REPEATS,
+            'import_runs': IMPORT_RUNS,
+            'term': f'{TERM_COLS}x{TERM_ROWS}',
+        },
+    }
+
+    try:
+        # Scenario A ----------------------------------------------------
+        print('[A] default iterator-wrap overhead', file=sys.stderr)
+        base = time_call(lambda: baseline_loop(N_ITER), ITER_REPEATS)
+        print(
+            f'    baseline           {base["min"] * 1e3:8.2f} ms',
+            file=sys.stderr,
+        )
+        iter_results: dict[str, typing.Any] = {}
+        for name, fn in ITER_LIBS.items():
+            res = time_call(lambda f=fn: f(sink.file, N_ITER), ITER_REPEATS)
+            overhead_ns = (res['min'] - base['min']) / N_ITER * 1e9
+            iter_results[name] = {
+                'total_min_s': res['min'],
+                'total_median_s': res['median'],
+                'overhead_ns_per_iter': overhead_ns,
+            }
+            print(
+                f'    {name:16} {res["min"] * 1e3:8.2f} ms  '
+                f'({overhead_ns:8.1f} ns/iter)',
+                file=sys.stderr,
+            )
+        results['scenario_a_default_overhead'] = {
+            'baseline_min_s': base['min'],
+            'baseline_median_s': base['median'],
+            'libs': iter_results,
+        }
+
+        # Scenario B ----------------------------------------------------
+        print('[B] forced per-update render', file=sys.stderr)
+        baseR = time_call(lambda: baseline_loop(N_RENDER), RENDER_REPEATS)
+        render_results: dict[str, typing.Any] = {}
+        for name, fn in RENDER_LIBS.items():
+            res = time_call(
+                lambda f=fn: f(sink.file, N_RENDER), RENDER_REPEATS
+            )
+            per_update_us = (res['min'] - baseR['min']) / N_RENDER * 1e6
+            render_results[name] = {
+                'total_min_s': res['min'],
+                'total_median_s': res['median'],
+                'per_update_us': per_update_us,
+            }
+            print(
+                f'    {name:16} {res["min"] * 1e3:8.2f} ms  '
+                f'({per_update_us:7.2f} us/update)',
+                file=sys.stderr,
+            )
+        results['scenario_b_forced_render'] = {
+            'baseline_min_s': baseR['min'],
+            'libs': render_results,
+            'excluded': {
+                'alive-progress': 'renders on a background timer thread; no '
+                'per-update render API',
+                'click': 'self-throttles writes (renders only when the drawn '
+                'line changes); no force-every-update API',
+            },
+        }
+    finally:
+        sink.close()
+
+    # Scenario C --------------------------------------------------------
+    print('[C] cold import time', file=sys.stderr)
+    base_import = time_import('pass', IMPORT_RUNS)
+    import_results: dict[str, typing.Any] = {}
+    for name, stmt in IMPORT_STMTS.items():
+        t = time_import(stmt, IMPORT_RUNS)
+        net_ms = (t - base_import) * 1e3
+        import_results[name] = {
+            'total_min_s': t,
+            'net_ms': net_ms,
+        }
+        print(f'    {name:16} {net_ms:8.1f} ms (net)', file=sys.stderr)
+    results['scenario_c_import_time'] = {
+        'interpreter_baseline_s': base_import,
+        'libs': import_results,
+    }
+
+    out = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), 'results.json'
+    )
+    with open(out, 'w', encoding='utf-8') as fh:
+        json.dump(results, fh, indent=2)
+    print(f'\nwrote {out}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmarks/chart.png b/benchmarks/chart.png
new file mode 100644
index 0000000..9138434
Binary files /dev/null and b/benchmarks/chart.png differ
diff --git a/benchmarks/report.md b/benchmarks/report.md
new file mode 100644
index 0000000..59e916b
--- /dev/null
+++ b/benchmarks/report.md
@@ -0,0 +1,83 @@
+# Python progress-bar library benchmark
+
+_Generated 2026-06-24 19:25. Subject: **progressbar2** (version 4.5.0)._
+
+Compares `progressbar2` against the most common alternatives across three independent dimensions. All rendered output is written to a real pseudo-terminal (pty) that is continuously drained, so every library believes it is attached to a TTY and actually draws — the comparison is apples-to-apples, not "is output suppressed when piped".
+
+![benchmark chart](chart.png)
+
+## Environment
+
+| | |
+|---|---|
+| Python | CPython 3.13.12 |
+| Platform | macOS-26.5-arm64-arm-64bit-Mach-O |
+| Processor | arm (18 cores) |
+| Terminal | 80x24 (pty) |
+
+| Library | Version |
+|---|---|
+| progressbar2 | 4.5.0 |
+| tqdm | 4.68.3 |
+| rich | 15.0.0 |
+| alive-progress | 3.3.0 |
+| click | 8.4.1 |
+
+## A. Default iterator-wrap overhead (headline)
+
+Idiomatic "wrap my loop" call with each library's **default** settings, over **1,000,000** iterations with a trivial body. This is the real-world cost of dropping a progress bar around a fast loop. Overhead = (wrapped time − bare-loop time) / iterations. Lower is faster.
+
+Bare loop baseline: **5.47 ms** for 1,000,000 iterations.
+
+| Library | Total time | Overhead/iter | vs progressbar2 |
+|---|--:|--:|--:|
+| **progressbar2** | 10.5 ms | 5.1 ns | baseline |
+| rich | 24.5 ms | 19.0 ns | 3.76x |
+| tqdm | 59.3 ms | 53.8 ns | 10.64x |
+| alive-progress | 253.5 ms | 248.0 ns | 49.06x |
+| click | 1861.4 ms | 1855.9 ns | 367.13x |
+
+## B. Forced per-update render cost
+
+Rendering **forced on every single update** over **30,000** updates — i.e. the cost of one full bar redraw, throttling disabled. Lower is faster.
+
+| Library | Total time | Per rendered update | vs progressbar2 |
+|---|--:|--:|--:|
+| progressbar2-fast | 148.9 ms | 4.96 us | 0.19x |
+| tqdm | 332.7 ms | 11.08 us | 0.44x |
+| **progressbar2** | 764.6 ms | 25.48 us | baseline |
+| rich | 5156.6 ms | 171.88 us | 6.75x |
+
+Excluded from this panel (no per-update force-render API):
+- **alive-progress** — renders on a background timer thread; no per-update render API
+- **click** — self-throttles writes (renders only when the drawn line changes); no force-every-update API
+
+## C. Cold import time
+
+Wall-clock cost of importing the library in a fresh interpreter (minimum of 9 runs), with bare-interpreter startup (15 ms) subtracted. Matters for short-lived CLIs. Lower is lighter.
+
+| Library | Import time (net) |
+|---|--:|
+| **progressbar2** | 1.5 ms |
+| alive-progress | 8.5 ms |
+| tqdm | 21.6 ms |
+| click | 23.5 ms |
+| rich | 46.9 ms |
+
+## Takeaways
+
+- **Default per-iteration overhead:** `progressbar2` is 5 ns/iter, ranking #1 of 5. `progressbar2` is the lightest per iteration (5 ns), `click` the heaviest (1856 ns).
+  - `progressbar2` and `tqdm` win here because their default settings do almost no per-iteration work (counter compare / background refresh thread); `progressbar2` calls a monotonic clock and evaluates its redraw predicate on every `update()`.
+- **Render cost:** when a redraw actually happens, `progressbar2` draws one update in 25.5 us — 5.14x the cheapest (`progressbar2-fast`) but 6.7x cheaper than rich's full-display re-render.
+- **Why both numbers matter:** `progressbar2` caps redraws at ~20/sec by default (50 ms floor), so in practice the cheap render in B fires rarely and the per-iteration cost in A dominates real workloads.
+- **Import weight:** `progressbar2` is mid-pack to import; `alive-progress` is the lightest, `rich` the heaviest.
+
+## Methodology & caveats
+
+- Timing: `time.perf_counter`, GC disabled during measurement, one untimed warmup per case, **minimum** of N repeats reported (A: 7, B: 5). Minimum is used to reduce scheduler/JIT noise.
+- Output goes to a real pty sized 80x24, drained by a background thread so writes never block.
+- "Overhead/iter" subtracts the bare-loop baseline, isolating the library's own cost.
+- Default settings reflect out-of-the-box behaviour; tuning (`mininterval`, `poll_interval`, etc.) shifts these numbers. Results are specific to the environment above and will vary by machine.
+- This measures CPU/throughput overhead only — not feature set, output quality, nesting, or multi-bar support.
+
+Reproduce: `python benchmarks/bench.py && python benchmarks/report.py`
diff --git a/benchmarks/report.py b/benchmarks/report.py
new file mode 100644
index 0000000..99ee884
--- /dev/null
+++ b/benchmarks/report.py
@@ -0,0 +1,315 @@
+"""Render results.json into chart.png + report.md."""
+
+from __future__ import annotations
+
+import datetime
+import json
+import os
+import typing
+
+import matplotlib
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt  # noqa: E402
+
+HERE: str = os.path.dirname(os.path.abspath(__file__))
+SUBJECT: str = 'progressbar2'
+HIGHLIGHT: str = '#d62728'  # progressbar2 bars
+OTHER: str = '#7f8fa6'  # everyone else
+
+
+def load() -> dict[str, typing.Any]:
+    with open(os.path.join(HERE, 'results.json'), encoding='utf-8') as fh:
+        return json.load(fh)
+
+
+def _sorted(pairs: dict[str, float]) -> list[tuple[str, float]]:
+    return sorted(pairs.items(), key=lambda kv: kv[1])
+
+
+def make_chart(data: dict[str, typing.Any]) -> str:
+    a = data['scenario_a_default_overhead']['libs']
+    b = data['scenario_b_forced_render']['libs']
+    c = data['scenario_c_import_time']['libs']
+
+    panels: list[tuple[str, str, list[tuple[str, float]], bool]] = [
+        (
+            'A. Default iterator-wrap overhead\n(lower is faster)',
+            'nanoseconds added per iteration',
+            _sorted({k: v['overhead_ns_per_iter'] for k, v in a.items()}),
+            True,
+        ),
+        (
+            'B. Forced per-update render cost\n(lower is faster)',
+            'microseconds per rendered update',
+            _sorted({k: v['per_update_us'] for k, v in b.items()}),
+            True,
+        ),
+        (
+            'C. Cold import time\n(lower is lighter)',
+            'milliseconds (net of interpreter startup)',
+            _sorted({k: v['net_ms'] for k, v in c.items()}),
+            False,
+        ),
+    ]
+
+    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
+    for ax, (title, xlabel, pairs, logx) in zip(axes, panels):
+        labels = [k for k, _ in pairs]
+        values = [v for _, v in pairs]
+        colors = [HIGHLIGHT if k == SUBJECT else OTHER for k in labels]
+        ypos = range(len(labels))
+        ax.barh(list(ypos), values, color=colors)
+        ax.set_yticks(list(ypos))
+        ax.set_yticklabels(labels)
+        ax.invert_yaxis()  # fastest at top
+        ax.set_xlabel(xlabel)
+        ax.set_title(title, fontsize=11, fontweight='bold')
+        if logx:
+            ax.set_xscale('log')
+        ax.grid(axis='x', linestyle=':', alpha=0.4)
+        xmax = max(values)
+        for y, val in zip(ypos, values):
+            label = f'{val:.1f}' if val >= 1 else f'{val:.2f}'
+            ax.text(
+                val * 1.05 if logx else val + xmax * 0.01,
+                y,
+                label,
+                va='center',
+                fontsize=9,
+            )
+        ax.margins(x=0.18)
+
+    fig.suptitle(
+        'progressbar2 vs common Python progress-bar libraries',
+        fontsize=14,
+        fontweight='bold',
+    )
+    fig.tight_layout(rect=(0, 0, 1, 0.96))
+    out = os.path.join(HERE, 'chart.png')
+    fig.savefig(out, dpi=130)
+    plt.close(fig)
+    return out
+
+
+def _rel(value: float, ref: float) -> str:
+    if ref == 0:
+        return 'n/a'
+    factor = value / ref
+    if abs(factor - 1) < 0.005:
+        return 'baseline'
+    return f'{factor:.2f}x'
+
+
+def make_report(data: dict[str, typing.Any], chart_name: str) -> str:
+    meta = data['meta']
+    a = data['scenario_a_default_overhead']
+    b = data['scenario_b_forced_render']
+    c = data['scenario_c_import_time']
+    n_iter = meta['n_iter']
+    n_render = meta['n_render']
+    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+
+    pb_a = a['libs'][SUBJECT]['overhead_ns_per_iter']
+    pb_b = b['libs'][SUBJECT]['per_update_us']
+
+    lines: list[str] = []
+    w = lines.append
+
+    w('# Python progress-bar library benchmark')
+    w('')
+    w(
+        f'_Generated {now}. Subject: **{SUBJECT}** '
+        f'(version {meta["versions"]["progressbar2"]})._'
+    )
+    w('')
+    w(
+        'Compares `progressbar2` against the most common alternatives across '
+        'three independent dimensions. All rendered output is written to a real '
+        'pseudo-terminal (pty) that is continuously drained, so every library '
+        'believes it is attached to a TTY and actually draws — the comparison is '
+        'apples-to-apples, not "is output suppressed when piped".'
+    )
+    w('')
+    w(f'![benchmark chart]({chart_name})')
+    w('')
+
+    # Environment ------------------------------------------------------
+    w('## Environment')
+    w('')
+    w('| | |')
+    w('|---|---|')
+    w(f'| Python | {meta["implementation"]} {meta["python"]} |')
+    w(f'| Platform | {meta["platform"]} |')
+    w(f'| Processor | {meta["processor"]} ({meta["cpu_count"]} cores) |')
+    w(f'| Terminal | {meta["term"]} (pty) |')
+    w('')
+    w('| Library | Version |')
+    w('|---|---|')
+    for name, ver in meta['versions'].items():
+        w(f'| {name} | {ver} |')
+    w('')
+
+    # Scenario A -------------------------------------------------------
+    w('## A. Default iterator-wrap overhead (headline)')
+    w('')
+    w(
+        f'Idiomatic "wrap my loop" call with each library\'s **default** '
+        f'settings, over **{n_iter:,}** iterations with a trivial body. This is '
+        f'the real-world cost of dropping a progress bar around a fast loop. '
+        f'Overhead = (wrapped time − bare-loop time) / iterations. '
+        f'Lower is faster.'
+    )
+    w('')
+    w(
+        f'Bare loop baseline: **{a["baseline_min_s"] * 1e3:.2f} ms** '
+        f'for {n_iter:,} iterations.'
+    )
+    w('')
+    w('| Library | Total time | Overhead/iter | vs progressbar2 |')
+    w('|---|--:|--:|--:|')
+    for name, v in _sorted(
+        {k: vv['overhead_ns_per_iter'] for k, vv in a['libs'].items()}
+    ):
+        vv = a['libs'][name]
+        bold = '**' if name == SUBJECT else ''
+        w(
+            f'| {bold}{name}{bold} | {vv["total_min_s"] * 1e3:.1f} ms '
+            f'| {vv["overhead_ns_per_iter"]:.1f} ns '
+            f'| {_rel(vv["overhead_ns_per_iter"], pb_a)} |'
+        )
+    w('')
+
+    # Scenario B -------------------------------------------------------
+    w('## B. Forced per-update render cost')
+    w('')
+    w(
+        f'Rendering **forced on every single update** over **{n_render:,}** '
+        f'updates — i.e. the cost of one full bar redraw, throttling disabled. '
+        f'Lower is faster.'
+    )
+    w('')
+    w('| Library | Total time | Per rendered update | vs progressbar2 |')
+    w('|---|--:|--:|--:|')
+    for name, v in _sorted(
+        {k: vv['per_update_us'] for k, vv in b['libs'].items()}
+    ):
+        vv = b['libs'][name]
+        bold = '**' if name == SUBJECT else ''
+        w(
+            f'| {bold}{name}{bold} | {vv["total_min_s"] * 1e3:.1f} ms '
+            f'| {vv["per_update_us"]:.2f} us '
+            f'| {_rel(vv["per_update_us"], pb_b)} |'
+        )
+    w('')
+    w('Excluded from this panel (no per-update force-render API):')
+    for name, why in b['excluded'].items():
+        w(f'- **{name}** — {why}')
+    w('')
+
+    # Scenario C -------------------------------------------------------
+    w('## C. Cold import time')
+    w('')
+    w(
+        f'Wall-clock cost of importing the library in a fresh interpreter '
+        f'(minimum of {meta["import_runs"]} runs), with bare-interpreter startup '
+        f'({c["interpreter_baseline_s"] * 1e3:.0f} ms) subtracted. Matters for '
+        f'short-lived CLIs. Lower is lighter.'
+    )
+    w('')
+    w('| Library | Import time (net) |')
+    w('|---|--:|')
+    for name, v in _sorted({k: vv['net_ms'] for k, vv in c['libs'].items()}):
+        vv = c['libs'][name]
+        bold = '**' if name == SUBJECT else ''
+        w(f'| {bold}{name}{bold} | {vv["net_ms"]:.1f} ms |')
+    w('')
+
+    # Takeaways --------------------------------------------------------
+    a_rank = _sorted(
+        {k: vv['overhead_ns_per_iter'] for k, vv in a['libs'].items()}
+    )
+    b_rank = _sorted({k: vv['per_update_us'] for k, vv in b['libs'].items()})
+    pb_a_pos = [k for k, _ in a_rank].index(SUBJECT) + 1
+    fastest_a = a_rank[0][0]
+    slowest_a = a_rank[-1][0]
+    w('## Takeaways')
+    w('')
+    w(
+        f'- **Default per-iteration overhead:** `{SUBJECT}` is '
+        f'{pb_a:.0f} ns/iter, ranking #{pb_a_pos} of '
+        f'{len(a_rank)}. `{fastest_a}` is the lightest per iteration '
+        f'({a_rank[0][1]:.0f} ns), `{slowest_a}` the heaviest '
+        f'({a_rank[-1][1]:.0f} ns).'
+    )
+    w(
+        f'  - `{fastest_a}` and `tqdm` win here because their default settings '
+        f'do almost no per-iteration work (counter compare / background refresh '
+        f'thread); `{SUBJECT}` calls a monotonic clock and evaluates its redraw '
+        f'predicate on every `update()`.'
+    )
+    w(
+        f'- **Render cost:** when a redraw actually happens, `{SUBJECT}` draws '
+        f'one update in {b_rank[[k for k, _ in b_rank].index(SUBJECT)][1]:.1f} us '
+        f'— {_rel(pb_b, b_rank[0][1])} the cheapest (`{b_rank[0][0]}`) but '
+        f"{b['libs']['rich']['per_update_us'] / pb_b:.1f}x cheaper than rich's "
+        f'full-display re-render.'
+    )
+    w(
+        f'- **Why both numbers matter:** `{SUBJECT}` caps redraws at ~20/sec by '
+        f'default (50 ms floor), so in practice the cheap render in B fires '
+        f'rarely and the per-iteration cost in A dominates real workloads.'
+    )
+    w(
+        f'- **Import weight:** `{SUBJECT}` is mid-pack to import; '
+        f'`alive-progress` is the lightest, `rich` the heaviest.'
+    )
+    w('')
+
+    # Methodology ------------------------------------------------------
+    w('## Methodology & caveats')
+    w('')
+    w(
+        f'- Timing: `time.perf_counter`, GC disabled during measurement, one '
+        f'untimed warmup per case, **minimum** of N repeats reported '
+        f'(A: {meta["iter_repeats"]}, B: {meta["render_repeats"]}). Minimum is '
+        f'used to reduce scheduler/JIT noise.'
+    )
+    w(
+        '- Output goes to a real pty sized '
+        f'{meta["term"]}, drained by a background thread so writes never block.'
+    )
+    w(
+        '- "Overhead/iter" subtracts the bare-loop baseline, isolating the '
+        "library's own cost."
+    )
+    w(
+        '- Default settings reflect out-of-the-box behaviour; tuning '
+        '(`mininterval`, `poll_interval`, etc.) shifts these numbers. Results '
+        'are specific to the environment above and will vary by machine.'
+    )
+    w(
+        '- This measures CPU/throughput overhead only — not feature set, output '
+        'quality, nesting, or multi-bar support.'
+    )
+    w('')
+    w('Reproduce: `python benchmarks/bench.py && python benchmarks/report.py`')
+    w('')
+
+    report = '\n'.join(lines)
+    out = os.path.join(HERE, 'report.md')
+    with open(out, 'w', encoding='utf-8') as fh:
+        fh.write(report)
+    return out
+
+
+def main() -> None:
+    data = load()
+    chart = make_chart(data)
+    report = make_report(data, os.path.basename(chart))
+    print('wrote', chart)
+    print('wrote', report)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
new file mode 100644
index 0000000..d5cbec2
--- /dev/null
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,5 @@
+tqdm==4.68.3
+rich==15.0.0
+alive-progress==3.3.0
+click==8.4.1
+matplotlib==3.11.0
diff --git a/benchmarks/results.json b/benchmarks/results.json
new file mode 100644
index 0000000..8c402aa
--- /dev/null
+++ b/benchmarks/results.json
@@ -0,0 +1,107 @@
+{
+  "meta": {
+    "python": "3.13.12",
+    "implementation": "CPython",
+    "platform": "macOS-26.5-arm64-arm-64bit-Mach-O",
+    "processor": "arm",
+    "cpu_count": 18,
+    "versions": {
+      "progressbar2": "4.5.0",
+      "tqdm": "4.68.3",
+      "rich": "15.0.0",
+      "alive-progress": "3.3.0",
+      "click": "8.4.1"
+    },
+    "n_iter": 1000000,
+    "iter_repeats": 7,
+    "n_render": 30000,
+    "render_repeats": 5,
+    "import_runs": 9,
+    "term": "80x24"
+  },
+  "scenario_a_default_overhead": {
+    "baseline_min_s": 0.005465084221214056,
+    "baseline_median_s": 0.005537374876439571,
+    "libs": {
+      "progressbar2": {
+        "total_min_s": 0.01052025007084012,
+        "total_median_s": 0.010792375076562166,
+        "overhead_ns_per_iter": 5.055165849626064
+      },
+      "tqdm": {
+        "total_min_s": 0.05926787527278066,
+        "total_median_s": 0.06034625042229891,
+        "overhead_ns_per_iter": 53.8027910515666
+      },
+      "rich": {
+        "total_min_s": 0.024457333143800497,
+        "total_median_s": 0.024638874921947718,
+        "overhead_ns_per_iter": 18.99224892258644
+      },
+      "alive-progress": {
+        "total_min_s": 0.2534806248731911,
+        "total_median_s": 0.26495025027543306,
+        "overhead_ns_per_iter": 248.01554065197706
+      },
+      "click": {
+        "total_min_s": 1.8613821249455214,
+        "total_median_s": 1.8722192500717938,
+        "overhead_ns_per_iter": 1855.9170407243073
+      }
+    }
+  },
+  "scenario_b_forced_render": {
+    "baseline_min_s": 0.00015695812180638313,
+    "libs": {
+      "progressbar2": {
+        "total_min_s": 0.7645597076043487,
+        "total_median_s": 0.7665333333425224,
+        "per_update_us": 25.480091649418075
+      },
+      "progressbar2-fast": {
+        "total_min_s": 0.1489091250114143,
+        "total_median_s": 0.14924920815974474,
+        "per_update_us": 4.95840556298693
+      },
+      "tqdm": {
+        "total_min_s": 0.3326972499489784,
+        "total_median_s": 0.33332004211843014,
+        "per_update_us": 11.084676394239068
+      },
+      "rich": {
+        "total_min_s": 5.156592667102814,
+        "total_median_s": 5.1774807083420455,
+        "per_update_us": 171.8811902993669
+      }
+    },
+    "excluded": {
+      "alive-progress": "renders on a background timer thread; no per-update render API",
+      "click": "self-throttles writes (renders only when the drawn line changes); no force-every-update API"
+    }
+  },
+  "scenario_c_import_time": {
+    "interpreter_baseline_s": 0.015367416199296713,
+    "libs": {
+      "progressbar2": {
+        "total_min_s": 0.016899917274713516,
+        "net_ms": 1.5325010754168034
+      },
+      "tqdm": {
+        "total_min_s": 0.03698108298704028,
+        "net_ms": 21.61366678774357
+      },
+      "rich": {
+        "total_min_s": 0.06224083295091987,
+        "net_ms": 46.873416751623154
+      },
+      "alive-progress": {
+        "total_min_s": 0.023891292046755552,
+        "net_ms": 8.52387584745884
+      },
+      "click": {
+        "total_min_s": 0.0388890840113163,
+        "net_ms": 23.521667812019587
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/progressbar/__init__.py b/progressbar/__init__.py
index cf4de76..a223c0d 100644
--- a/progressbar/__init__.py
+++ b/progressbar/__init__.py
@@ -1,48 +1,145 @@
+"""progressbar2 public API.
+
+Imports are lazy (PEP 562): ``import progressbar`` loads almost nothing; each
+submodule and exported name is imported on first access. This keeps the import
+light (in particular the widgets and the terminal/color tables are only loaded
+when actually used) while preserving the full public API.
+"""
+
+import importlib
+import typing
 from datetime import date
 
 from .__about__ import __author__, __version__
-from .algorithms import (
-    DoubleExponentialMovingAverage,
-    ExponentialMovingAverage,
-    SmoothingAlgorithm,
-)
-from .bar import DataTransferBar, NullBar, ProgressBar
-from .base import UnknownLength
-from .multi import MultiBar, SortKey
-from .shortcuts import progressbar
-from .terminal.stream import LineOffsetStreamWrapper
-from .utils import len_color, streams
-from .widgets import (
-    ETA,
-    AbsoluteETA,
-    AdaptiveETA,
-    AdaptiveTransferSpeed,
-    AnimatedMarker,
-    Bar,
-    BouncingBar,
-    Counter,
-    CurrentTime,
-    DataSize,
-    DynamicMessage,
-    FileTransferSpeed,
-    FormatCustomText,
-    FormatLabel,
-    FormatLabelBar,
-    GranularBar,
-    JobStatusBar,
-    MultiProgressBar,
-    MultiRangeBar,
-    Percentage,
-    PercentageLabelBar,
-    ReverseBar,
-    RotatingMarker,
-    SimpleProgress,
-    SmoothingETA,
-    Timer,
-    Variable,
-    VariableMixin,
+
+if typing.TYPE_CHECKING:
+    # Eager imports for type checkers only; loaded lazily at runtime by
+    # __getattr__ below. Names appear in __all__ so they read as re-exports.
+    from .algorithms import (
+        DoubleExponentialMovingAverage,
+        ExponentialMovingAverage,
+        SmoothingAlgorithm,
+    )
+    from .bar import DataTransferBar, NullBar, ProgressBar
+    from .base import UnknownLength
+    from .fast import FastProgressBar
+    from .multi import MultiBar, SortKey
+    from .shortcuts import progressbar
+    from .terminal.stream import LineOffsetStreamWrapper
+    from .utils import len_color, streams
+    from .widgets import (
+        ETA,
+        AbsoluteETA,
+        AdaptiveETA,
+        AdaptiveTransferSpeed,
+        AnimatedMarker,
+        Bar,
+        BouncingBar,
+        Counter,
+        CurrentTime,
+        DataSize,
+        DynamicMessage,
+        FileTransferSpeed,
+        FormatCustomText,
+        FormatLabel,
+        FormatLabelBar,
+        GranularBar,
+        JobStatusBar,
+        MultiProgressBar,
+        MultiRangeBar,
+        Percentage,
+        PercentageLabelBar,
+        ReverseBar,
+        RotatingMarker,
+        SimpleProgress,
+        SmoothingETA,
+        Timer,
+        Variable,
+        VariableMixin,
+    )
+
+#: Submodules accessible as ``progressbar.<name>``.
+_SUBMODULES: frozenset[str] = frozenset(
+    {
+        'algorithms',
+        'bar',
+        'base',
+        'env',
+        'fast',
+        'multi',
+        'shortcuts',
+        'terminal',
+        'utils',
+        'widgets',
+    }
 )
 
+#: Exported name -> submodule it lives in.
+_NAME_TO_MODULE: dict[str, str] = {
+    'DoubleExponentialMovingAverage': 'algorithms',
+    'ExponentialMovingAverage': 'algorithms',
+    'SmoothingAlgorithm': 'algorithms',
+    'DataTransferBar': 'bar',
+    'NullBar': 'bar',
+    'ProgressBar': 'bar',
+    'FastProgressBar': 'fast',
+    'UnknownLength': 'base',
+    'MultiBar': 'multi',
+    'SortKey': 'multi',
+    'progressbar': 'shortcuts',
+    'LineOffsetStreamWrapper': 'terminal.stream',
+    'len_color': 'utils',
+    'streams': 'utils',
+    'ETA': 'widgets',
+    'AbsoluteETA': 'widgets',
+    'AdaptiveETA': 'widgets',
+    'AdaptiveTransferSpeed': 'widgets',
+    'AnimatedMarker': 'widgets',
+    'Bar': 'widgets',
+    'BouncingBar': 'widgets',
+    'Counter': 'widgets',
+    'CurrentTime': 'widgets',
+    'DataSize': 'widgets',
+    'DynamicMessage': 'widgets',
+    'FileTransferSpeed': 'widgets',
+    'FormatCustomText': 'widgets',
+    'FormatLabel': 'widgets',
+    'FormatLabelBar': 'widgets',
+    'GranularBar': 'widgets',
+    'JobStatusBar': 'widgets',
+    'MultiProgressBar': 'widgets',
+    'MultiRangeBar': 'widgets',
+    'Percentage': 'widgets',
+    'PercentageLabelBar': 'widgets',
+    'ReverseBar': 'widgets',
+    'RotatingMarker': 'widgets',
+    'SimpleProgress': 'widgets',
+    'SmoothingETA': 'widgets',
+    'Timer': 'widgets',
+    'Variable': 'widgets',
+    'VariableMixin': 'widgets',
+}
+
+
+def __getattr__(name: str) -> typing.Any:
+    """Lazily import submodules and exported names on first access."""
+    if name in _SUBMODULES:
+        module = importlib.import_module(f'.{name}', __name__)
+        globals()[name] = module  # cache so __getattr__ runs only once
+        return module
+
+    module_name = _NAME_TO_MODULE.get(name)
+    if module_name is None:
+        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
+    value = getattr(importlib.import_module(f'.{module_name}', __name__), name)
+    globals()[name] = value
+    return value
+
+
+def __dir__() -> list[str]:
+    return sorted(set(globals()) | set(__all__) | _SUBMODULES)
+
+
 __date__ = str(date.today())
 __all__ = [
     'ETA',
@@ -59,6 +156,7 @@
     'DoubleExponentialMovingAverage',
     'DynamicMessage',
     'ExponentialMovingAverage',
+    'FastProgressBar',
     'FileTransferSpeed',
     'FormatCustomText',
     'FormatLabel',
diff --git a/progressbar/bar.py b/progressbar/bar.py
index 0352901..3465ba9 100644
--- a/progressbar/bar.py
+++ b/progressbar/bar.py
@@ -2,6 +2,7 @@
 
 import abc
 import contextlib
+import importlib
 import itertools
 import logging
 import math
@@ -25,11 +26,32 @@
 from . import (
     base,
     utils,
-    widgets,
-    widgets as widgets_module,  # Avoid name collision
 )
 from .terminal import os_specific
 
+try:
+    # Optional native accelerator, shipped as the ``progressbar2[fast]`` extra
+    # (the separate ``speedups`` package). When importable, the iterator path
+    # uses it automatically; otherwise we fall back to the pure-Python gate.
+    # Loaded via importlib so type checkers don't try to resolve the optional
+    # compiled module when it is absent.
+    _FastBarIterator = importlib.import_module(
+        'speedups.progressbar',
+    ).FastBarIterator
+except Exception:  # pragma: no cover - environmental (absent / ABI mismatch)
+    _FastBarIterator = None
+
+
+def _load_widgets() -> typing.Any:
+    """Import the widgets module lazily.
+
+    The full-bar code needs ``widgets``, but the lean fast path must not pull
+    it in (it drags the terminal/colour tables). Imported via importlib so the
+    deferred load doesn't read as a static ``bar -> widgets`` import cycle.
+    """
+    return importlib.import_module('progressbar.widgets')
+
+
 logger = logging.getLogger(__name__)
 
 # float also accepts integers and longs but we don't want an explicit union
@@ -49,7 +71,9 @@ class ProgressBarMixinBase(abc.ABC):
     #: fall back to 80 if auto detection is not possible.
     term_width: int = 80
     #: The widgets to render, defaults to the result of `default_widget()`
-    widgets: types.MutableSequence[widgets_module.WidgetBase | str]
+    #: (typed loosely as Any to avoid a static bar->widgets import cycle; the
+    #: public ``progressbar()`` shortcut keeps the precise WidgetBase typing).
+    widgets: types.MutableSequence[typing.Any]
     #: When going beyond the max_value, raise an error if True or silently
     #: ignore otherwise
     max_error: bool
@@ -86,6 +110,9 @@ class ProgressBarMixinBase(abc.ABC):
     value: NumberT
     #: Previous progress value
     previous_value: types.Optional[NumberT]
+    #: Value at the last actual redraw (internal; used by the update gate's
+    #: pixel check, kept separate from the public `previous_value`)
+    _last_drawn_value: types.Optional[NumberT]
     #: The minimum/start value for the progress bar
     min_value: NumberT
     #: Maximum (and final) value. Beyond this value an error will be raised
@@ -347,6 +374,8 @@ def _format_line(self):
             return widgets.rjust(self.term_width)
 
     def _format_widgets(self):
+        widgets = _load_widgets()
+
         result = []
         expanding = []
         width = self.term_width
@@ -632,9 +661,7 @@ def __init__(
         self,
         min_value: NumberT = 0,
         max_value: ValueT = None,
-        widgets: types.Optional[
-            types.Sequence[widgets_module.WidgetBase | str]
-        ] = None,
+        widgets: types.Optional[types.Sequence[typing.Any]] = None,
         left_justify: bool = True,
         initial_value: NumberT = 0,
         poll_interval: types.Optional[float] = None,
@@ -725,12 +752,15 @@ def __init__(
 
         # A dictionary of names that can be used by Variable and FormatWidget
         self.variables = utils.AttributeDict(variables or {})
-        for widget in self.widgets:
-            if (
-                isinstance(widget, widgets_module.VariableMixin)
-                and widget.name not in self.variables
-            ):
-                self.variables[widget.name] = None
+        if self.widgets:
+            widgets_module = _load_widgets()
+
+            for widget in self.widgets:
+                if (
+                    isinstance(widget, widgets_module.VariableMixin)
+                    and widget.name not in self.variables
+                ):
+                    self.variables[widget.name] = None
 
     @property
     def dynamic_messages(self):  # pragma: no cover
@@ -746,12 +776,27 @@ def init(self):
         used (again).
         """
         self.previous_value = None
+        # Value at the last actual redraw; used internally by the update gate's
+        # pixel check (distinct from the public `previous_value`).
+        self._last_drawn_value = None
         self.last_update_time = None
         self.start_time = None
         self.updates = 0
         self.end_time = None
         self.extra = dict()
         self._last_update_timer = timeit.default_timer()
+        # Fast-path "next update" gate. The common iteration only re-enters
+        # the redraw machinery when value reaches `_next_update`. `_gate_step`
+        # is a closed-loop estimate of iterations per `min_poll_interval`,
+        # calibrated in `update()` from the value/time elapsed between redraws
+        # (tracked by `_last_drawn_value`/`_last_update_timer`). It starts at 1
+        # so the gate forces an `update()` every iteration until a real timing
+        # measurement (or the back-off doubling) grows the step, so slow
+        # iterators (where time advances between calls) are never skipped
+        # before that.
+        self._next_update = 0
+        self._gate_step = 1
+        self._gate_enabled = True
         self._started = False
         self._finished = False
 
@@ -870,6 +915,8 @@ def data(self) -> types.Dict[str, types.Any]:
         )
 
     def default_widgets(self):
+        widgets = _load_widgets()
+
         if self.max_value:
             return [
                 widgets.Percentage(**self.widget_kwargs),
@@ -910,21 +957,99 @@ def __call__(self, iterable, max_value=None):
         return self
 
     def __iter__(self):
-        # A generator (rather than returning ``self``) so that abandoning the
-        # loop early - a `break` or an exception in the loop body - triggers
-        # `GeneratorExit` on garbage collection, letting us finish the bar and
-        # restore any redirected streams. See issue #212.
+        # Dispatch to the optional native iterator when available, else the
+        # pure-Python generator. The native path counts in C and syncs
+        # `value`/`previous_value` only at redraw crossings (so they lag
+        # mid-loop, like `tqdm.n`), beating the per-iteration attribute writes
+        # the pure-Python path pays to keep them live every iteration.
+        if (
+            _FastBarIterator is not None
+            and self._iterable is not None
+            and not os.environ.get('PROGRESSBAR_DISABLE_FASTPATH')
+        ):
+            return _FastBarIterator(self, self._iterable)
+        return self._iter_python()
+
+    def _iter_python(self):
+        # Single generator (see issue #212): a `break`/exception in the loop
+        # body triggers `GeneratorExit`, letting us finish and restore any
+        # redirected streams. The integer gate keeps the common iteration to
+        # an increment + compare + store; the slow path (`update`) makes the
+        # real redraw decision and recomputes the gate.
+        #
+        # Value semantics MUST match pre-change behavior: `start()` draws 0%
+        # and the FIRST item is yielded at `value == min_value` (no increment),
+        # so during the body for item i (0-indexed), `bar.value == i` — NOT
+        # i+1. Only subsequent items increment. The peek-first structure below
+        # reproduces this without a per-iteration branch.
+        iterable = self._iterable if self._iterable is not None else iter(())
         try:
-            while True:
-                try:
-                    value = next(self)
-                except StopIteration:
-                    return
-                yield value
+            if self.start_time is None:
+                self.start()
+            iterator = iter(iterable)
+            try:
+                item = next(iterator)
+            except StopIteration:
+                self.finish()
+                return
+            yield item  # first item at value == min_value (matches old code)
+            value = self.value
+            next_update = value
+            update = self.update
+            # `_gate_enabled` is set once in `start()` and never mutated during
+            # iteration, so hoist it to a local and drop the per-iteration
+            # attribute load on the hot path.
+            gate_enabled = self._gate_enabled
+            for item in iterator:
+                value += 1
+                # When the gate is disabled, call `update()` every iteration so
+                # behaviour is byte-identical to the ungated bar. When enabled,
+                # only re-enter `update()` once value reaches the threshold.
+                # The step starts at 1, so until a real measurement grows it
+                # this still calls `update()` every iteration and lets
+                # `_needs_update()` make the real redraw decision. Calling
+                # `update()` (rather than pre-setting `self.value`) lets it
+                # record the prior value in the public `previous_value`,
+                # preserving its original semantics.
+                if not gate_enabled or value >= next_update:
+                    update(value)
+                    next_update = self._next_update
+                else:
+                    # Gated out: advance bar.value AND previous_value (exactly
+                    # as update() would) without entering the redraw machinery,
+                    # so reads of bar.previous_value mid-loop stay identical to
+                    # the original every-iteration semantics. The gate's pixel
+                    # reference is the separate `_last_drawn_value`.
+                    self.previous_value = self.value
+                    self.value = value
+                yield item
+            self.finish()
         except GeneratorExit:
             self.finish(dirty=True)
             raise
 
+    # --- Native accelerator protocol (used by speedups.FastBarIterator) ------
+    # The C iterator counts items itself and calls back here only at gate
+    # crossings, reusing the existing gate/redraw/calibration machinery so the
+    # redraw cadence is identical to `_iter_python`.
+
+    def _fast_begin(self) -> None:
+        """Start the bar (draws 0%, sets `_next_update`/`_gate_enabled`)."""
+        if self.start_time is None:
+            self.start()
+
+    def _fast_tick(self, value: int) -> None:
+        """Handle a redraw crossing: redraw-if-due and recompute the gate."""
+        self.update(value)
+
+    def _fast_end(self) -> None:
+        """Finish normally (draws 100%, restores streams) on exhaustion."""
+        self.finish()
+
+    def _fast_end_dirty(self) -> None:
+        """Finish dirty on early break/exception (restores streams)."""
+        self.finish(dirty=True)
+
     def __next__(self):
         value: typing.Any
         try:
@@ -979,7 +1104,7 @@ def _needs_update(self):
             # There's no terminal-width threshold to compute for an unknown
             # length, so redraw whenever the value advanced (still rate
             # limited by the min_poll_interval check above)
-            return self.value != self.previous_value
+            return self.value != self._last_drawn_value
 
         # Update if value increment is not large enough to
         # add more bars to progressbar (according to current
@@ -987,12 +1112,71 @@ def _needs_update(self):
         with contextlib.suppress(Exception):
             divisor: float = self.max_value / self.term_width  # type: ignore
             value_divisor = self.value // divisor  # type: ignore
-            pvalue_divisor = self.previous_value // divisor  # type: ignore
+            pvalue_divisor = self._last_drawn_value // divisor  # type: ignore
             if value_divisor != pvalue_divisor:
                 return True
         # No need to redraw yet
         return False
 
+    def _gate_skips(
+        self, value: ValueT, force: bool, variables_changed: bool
+    ) -> bool:
+        """Whether the fast-path gate should skip this update() call entirely.
+
+        Only skips while enabled, never for forced draws, variable changes,
+        or a `None` (tick) value, and only while the value is still below the
+        `_next_update` threshold.
+        """
+        return (
+            self._gate_enabled
+            and not force
+            and not variables_changed
+            and value is not None
+            and self.value < self._next_update
+        )
+
+    def _draw_and_recalibrate(
+        self, value: ValueT, variables_changed: bool, force: bool
+    ) -> None:
+        """Redraw if due, then resize the gate's next-update threshold.
+
+        On a redraw, `_gate_step` is calibrated to ~one `min_poll_interval`
+        window of iterations, measured from the value/time elapsed since the
+        previous redraw (snapshotted here before the draw overwrites
+        `_last_drawn_value`/`_last_update_timer` — so the gate needs no extra
+        copies of those quantities). If we passed the threshold but no redraw
+        was due (the loop sped up), back off by doubling the step.
+        """
+        if self._needs_update() or variables_changed or force:
+            prev_value = self._last_drawn_value
+            prev_timer = self._last_update_timer
+            try:
+                self._update_parents(value)  # data() refreshes the timer
+            finally:
+                # `_last_drawn_value` is the value at the last *redraw* (the
+                # pixel reference for `_needs_update`); set in finally so it
+                # advances even if a draw raised.
+                self._last_drawn_value = self.value
+            if self._gate_enabled:
+                interval = self._last_update_timer - prev_timer
+                if (
+                    prev_value is not None
+                    and interval > 0
+                    and self.value > prev_value
+                ):
+                    self._gate_step = max(
+                        1,
+                        int(
+                            (self.value - prev_value)
+                            * self.min_poll_interval
+                            / interval
+                        ),
+                    )
+                self._next_update = self.value + self._gate_step
+        elif self._gate_enabled and value is not None:
+            self._gate_step = max(1, self._gate_step * 2)
+            self._next_update = self.value + self._gate_step
+
     def update(
         self, value: ValueT = None, force: bool = False, **kwargs: typing.Any
     ):
@@ -1022,14 +1206,20 @@ def update(
                 else:
                     value = typing.cast(NumberT, self.max_value)
 
+            # `previous_value` keeps its original public meaning: the value
+            # before this update() call. The gate uses a separate private
+            # `_last_drawn_value` (set on redraw) for its pixel check.
             self.previous_value = self.value
             self.value = value
 
-        # Save the updated values for dynamic messages
-        variables_changed = self._update_variables(kwargs)
+        # Save the updated values for dynamic messages (skip the call and the
+        # empty-dict iteration on the common no-kwargs path).
+        variables_changed = self._update_variables(kwargs) if kwargs else False
 
-        if self._needs_update() or variables_changed or force:
-            self._update_parents(value)
+        if self._gate_skips(value, force, variables_changed):
+            return
+
+        self._draw_and_recalibrate(value, variables_changed, force)
 
     def _update_variables(self, kwargs):
         variables_changed = False
@@ -1100,6 +1290,11 @@ def start(
         self._init_prefix()
         self._init_suffix()
         self._calculate_poll_interval()
+        if (
+            os.environ.get('PROGRESSBAR_DISABLE_FASTPATH')
+            or not self.min_poll_interval
+        ):
+            self._gate_enabled = False
         self._verify_max_value()
 
         now = datetime.now()
@@ -1112,6 +1307,8 @@ def start(
 
     def _init_suffix(self):
         if self.suffix:
+            widgets = _load_widgets()
+
             self.widgets.append(
                 widgets.FormatLabel(self.suffix, new_style=True),
             )
@@ -1121,6 +1318,8 @@ def _init_suffix(self):
 
     def _init_prefix(self):
         if self.prefix:
+            widgets = _load_widgets()
+
             self.widgets.insert(
                 0,
                 widgets.FormatLabel(self.prefix, new_style=True),
@@ -1197,6 +1396,8 @@ class DataTransferBar(ProgressBar):
     """
 
     def default_widgets(self):
+        widgets = _load_widgets()
+
         if self.max_value:
             return [
                 widgets.Percentage(),
diff --git a/progressbar/fast.py b/progressbar/fast.py
new file mode 100644
index 0000000..8e128d8
--- /dev/null
+++ b/progressbar/fast.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import typing
+from datetime import datetime, timedelta
+
+from . import (
+    bar as bar_module,
+    base,
+)
+
+#: Optional native line formatter, provided by the `speedups` package. When
+#: present it replaces the pure-Python formatter below. Wired in a later task.
+_format_fast_line: typing.Callable[[FastProgressBar], str] | None = None
+
+
+def _format_seconds(seconds: float) -> str:
+    """Render elapsed/ETA seconds as H:MM:SS, matching the Timer widget."""
+    return str(timedelta(seconds=int(seconds)))
+
+
+def _pure_format_fast_line(bar: FastProgressBar) -> str:
+    """Build the whole status line directly (no widgets, no data() dict)."""
+    value = bar.value
+    min_value = bar.min_value
+    max_value = bar.max_value
+    width = bar.term_width
+    elapsed = bar._fast_elapsed()
+    elapsed_text = _format_seconds(elapsed)
+    prefix = bar.prefix or ''
+    suffix = bar.suffix or ''
+
+    known = max_value not in (None, base.UnknownLength)
+    if known:
+        total = max_value - min_value  # type: ignore[operator]
+        # Clamp progress to the total so an over-shooting value (e.g. a forced
+        # render past max_value with max_error=False) can't produce a negative
+        # ETA or a bar that overflows its width.
+        done = min(value - min_value, total)
+        pct = 100.0 * done / total if total else 100.0
+        count = f'({value} of {max_value})'
+        if done > 0 and elapsed > 0:
+            eta = _format_seconds(elapsed * (total - done) / done)
+        else:
+            eta = '--:--:--'
+        left = f'{pct:3.0f}% {count} '
+        right = f' Elapsed Time: {elapsed_text} ETA: {eta}'
+        inner = max(width - len(left) - len(right) - 2, 0)
+        filled = int(inner * done / total) if total else inner
+        barstr = '|' + '#' * filled + ' ' * (inner - filled) + '|'
+        return f'{prefix}{left}{barstr}{right}{suffix}'
+
+    # Unknown length: spinner + count + elapsed (no bar/eta).
+    spinner = r'|/-\\'[int(elapsed * 4) % 4]
+    item_count = value - min_value + 1
+    return (
+        f'{prefix}{spinner} {item_count} Elapsed Time: {elapsed_text}{suffix}'
+    )
+
+
+class FastProgressBar(bar_module.ProgressBar):
+    """A lean ProgressBar whose render bypasses the widget system.
+
+    Reuses the full ProgressBar lifecycle (the next-update gate, the native
+    iterator, stream redirect, resize, start/update/finish) and overrides only
+    the render with a fixed formatter, so the common case is import- and
+    render-cheap. Output stays close to the default look without the gradient.
+    """
+
+    def default_widgets(self) -> list:
+        # No widgets: the fixed formatter renders everything.
+        return []
+
+    def _fast_elapsed(self) -> float:
+        if self.start_time is None:
+            return 0.0
+        end = self.end_time or self._fast_now()
+        return max((end - self.start_time).total_seconds(), 0.0)
+
+    def _fast_now(self) -> datetime:
+        return datetime.now()
+
+    def _format_line(self) -> str:
+        formatter = _format_fast_line or _pure_format_fast_line
+        return formatter(self)
+
+    def _init_prefix(self) -> None:
+        # Label is rendered inline by the formatter; don't inject a widget.
+        pass
+
+    def _init_suffix(self) -> None:
+        # Label is rendered inline by the formatter; don't inject a widget.
+        pass
diff --git a/progressbar/multi.py b/progressbar/multi.py
index fabd1b2..5eda7eb 100644
--- a/progressbar/multi.py
+++ b/progressbar/multi.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import enum
+import importlib
 import io
 import itertools
 import operator
@@ -17,6 +18,14 @@
 from . import bar, terminal
 from .terminal import stream
 
+# MultiBar renders full (widget) progress bars from background threads. Warm
+# the widgets module now -- single-threaded, at module load, which only happens
+# when MultiBar is actually used (this module is imported lazily), so the fast
+# path and a bare ``import progressbar`` stay widgets-free. Pre-warming here
+# means a child bar's first start() doesn't import widgets inside a render
+# thread and race MultiBar._label_bar's ``assert bar.widgets``.
+importlib.import_module('progressbar.widgets')
+
 SortKeyFunc = typing.Callable[[bar.ProgressBar], typing.Any]
 
 
diff --git a/progressbar/shortcuts.py b/progressbar/shortcuts.py
index 220c8f2..c4d2c5d 100644
--- a/progressbar/shortcuts.py
+++ b/progressbar/shortcuts.py
@@ -1,12 +1,16 @@
 from __future__ import annotations
 
+import os
 import typing
 
 from . import (
     bar,
-    widgets as widgets_module,
+    fast as fast_module,
 )
 
+if typing.TYPE_CHECKING:
+    from . import widgets as widgets_module
+
 T = typing.TypeVar('T')
 
 
@@ -17,9 +21,19 @@ def progressbar(
     widgets: typing.Sequence[widgets_module.WidgetBase | str] | None = None,
     prefix: str | None = None,
     suffix: str | None = None,
+    fast: bool | None = None,
     **kwargs: typing.Any,
-) -> typing.Generator[T, None, None]:
-    progressbar_ = bar.ProgressBar(
+) -> typing.Iterator[T]:
+    # Auto-dispatch to the lean FastProgressBar for the simple, common case;
+    # anything that needs the full widget machinery uses ProgressBar.
+    use_fast = (
+        widgets is None
+        and fast is not False
+        and not kwargs.get('variables')
+        and not os.environ.get('PROGRESSBAR_DISABLE_FASTPATH')
+    )
+    cls = fast_module.FastProgressBar if use_fast else bar.ProgressBar
+    progressbar_ = cls(
         min_value=min_value,
         max_value=max_value,
         widgets=widgets,
@@ -27,4 +41,4 @@ def progressbar(
         suffix=suffix,
         **kwargs,
     )
-    yield from progressbar_(iterator)
+    return iter(progressbar_(iterator))
diff --git a/progressbar/utils.py b/progressbar/utils.py
index 4a77da7..01885dd 100644
--- a/progressbar/utils.py
+++ b/progressbar/utils.py
@@ -92,9 +92,17 @@ def no_color(value: StringT) -> StringT:
     TypeError: `value` must be a string or bytes, got 123
     """
     if isinstance(value, bytes):
+        # Fast path: with no ESC byte there is nothing to strip, so the regex
+        # would return the value unchanged anyway. Skipping it avoids a
+        # substitution on the common plain-text case, which dominates the
+        # per-redraw render cost (len_color is called for every widget).
+        if b'\x1b' not in value:
+            return value  # type: ignore
         pattern: bytes = bytes(terminal.ESC, 'ascii') + b'\\[.*?[@-~]'
         return re.sub(pattern, b'', value)  # type: ignore
     elif isinstance(value, str):
+        if '\x1b' not in value:
+            return value  # type: ignore
         return re.sub('\x1b\\[.*?[@-~]', '', value)  # type: ignore
     else:
         raise TypeError(f'`value` must be a string or bytes, got {value!r}')
diff --git a/progressbar/widgets.py b/progressbar/widgets.py
index 387b02e..4b9b6c5 100644
--- a/progressbar/widgets.py
+++ b/progressbar/widgets.py
@@ -374,10 +374,16 @@ def __call__(
         format: types.Optional[str] = None,
     ):
         for name, (key, transform) in self.mapping.items():
-            with contextlib.suppress(KeyError, ValueError, IndexError):
-                if transform is None:
-                    data[name] = data[key]
-                else:
+            # Avoid a per-entry contextlib.suppress on the redraw hot path: a
+            # missing key is the only common "miss", so test membership
+            # directly and only guard the transform (which can raise on bad
+            # values) with try/except.
+            if key not in data:
+                continue
+            if transform is None:
+                data[name] = data[key]
+            else:
+                with contextlib.suppress(ValueError, IndexError):
                     data[name] = transform(data[key])
 
         return FormatWidgetMixin.__call__(self, progress, data, format)
diff --git a/pyproject.toml b/pyproject.toml
index 1753186..230ee53 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,12 @@ repository = 'https://github.com/wolph/python-progressbar/'
 progressbar = 'progressbar.__main__:main'
 
 [project.optional-dependencies]
+# Optional native iterator accelerator. When installed it is detected and used
+# automatically (the iterator path drops to ~5 ns/iter); otherwise progressbar2
+# falls back to the pure-Python gate. See the Performance section in README.
+fast = [
+    'speedups>=2.1.0',
+]
 docs = [
     'sphinx>=1.8.5',
     'sphinx-autodoc-typehints>=1.6.0',
@@ -164,6 +170,7 @@ exclude_lines = [
     'if 0:',
     'if __name__ == .__main__.:',
     'if types.TYPE_CHECKING:',
+    'if typing.TYPE_CHECKING:',
     '@typing.overload',
     'if os.name == .nt.:',
     'typing.Protocol',
diff --git a/pytest.ini b/pytest.ini
index 08a1130..1f00ecb 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -16,6 +16,7 @@ norecursedirs =
     .*
     _*
     build
+    benchmarks
     dist
     docs
     progressbar/terminal/os_specific
diff --git a/ruff.toml b/ruff.toml
index e27f4f8..c36fefd 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -10,6 +10,8 @@ exclude = [
     # Ignore local test files/directories/old-stuff
     'test.py',
     '*_old.py',
+    # Benchmark/tooling scripts are not held to the package lint standard
+    'benchmarks',
 ]
 
 line-length = 79
diff --git a/tests/conftest.py b/tests/conftest.py
index 59cbe7d..d8f0ea2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,7 +25,24 @@ def pytest_configure(config) -> None:
 
 
 @pytest.fixture(autouse=True)
-def small_interval(monkeypatch) -> None:
+def disable_native_accelerator(monkeypatch):
+    # The optional native accelerator (speedups.FastBarIterator) is exercised
+    # explicitly in test_native_accelerator.py. Every other test targets the
+    # pure-Python iterator (`_iter_python`), so force that path by default when
+    # the compiled `speedups` package happens to be installed in the dev/bench
+    # environment. Native tests restore it via their own monkeypatch.
+    import progressbar.bar as bar_module
+
+    monkeypatch.setattr(bar_module, '_FastBarIterator', None)
+
+
+@pytest.fixture(autouse=True)
+def small_interval(monkeypatch, request) -> None:
+    # Tests marked `no_freezegun` need real timing conditions (e.g. the perf
+    # budget test), so preserve the default _MINIMUM_UPDATE_INTERVAL so the
+    # fast-path gate can calibrate and activate correctly.
+    if request.node.get_closest_marker('no_freezegun'):
+        return
     # Remove the update limit for tests by default
     monkeypatch.setattr(
         progressbar.ProgressBar,
@@ -36,7 +53,14 @@ def small_interval(monkeypatch) -> None:
 
 
 @pytest.fixture(autouse=True)
-def sleep_faster(monkeypatch):
+def sleep_faster(monkeypatch, request):
+    # Tests marked `no_freezegun` need a real, advancing clock (e.g. the
+    # gate's perf test, which only activates after a real timing measurement).
+    # For those, skip the freezegun wrapping entirely.
+    if request.node.get_closest_marker('no_freezegun'):
+        yield None
+        return
+
     # Compute the local UTC offset so freezegun uses the same timezone as
     # the local system. Using datetime.now(timezone.utc).astimezone() avoids
     # the deprecated datetime.utcnow() (deprecated since Python 3.12).
diff --git a/tests/test_fast_default.py b/tests/test_fast_default.py
new file mode 100644
index 0000000..65781da
--- /dev/null
+++ b/tests/test_fast_default.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+import gc
+import io
+import sys
+
+import progressbar
+
+# Alias (not a `from` import) so CodeQL doesn't flag `progressbar` as imported
+# with both `import` and `import from`.
+fast_module = progressbar.fast
+
+
+class TTY(io.StringIO):
+    def isatty(self) -> bool:
+        return True
+
+    def repaints(self) -> list[str]:
+        return [p for p in self.getvalue().split('\r') if p]
+
+
+def test_fast_known_length_renders_and_completes():
+    fd = TTY()
+    bar = fast_module.FastProgressBar(max_value=1000, fd=fd)
+    out = list(bar(range(1000)))
+    assert out == list(range(1000))
+    assert bar.value == 1000
+    assert bar.percentage == 100.0
+    assert bar._finished
+    frames = fd.repaints()
+    assert frames, 'fast bar drew nothing'
+    # Close-to-default look: percentage, (n of max), a bar, Elapsed/ETA.
+    last = frames[-1]
+    assert '100%' in last
+    assert '(1000 of 1000)' in last
+    assert '|' in last  # bar delimiters
+    assert 'Elapsed Time:' in last
+
+
+def test_fast_elapsed_with_no_start_time():
+    """Test _fast_elapsed returns 0 when start_time is None."""
+    fd = TTY()
+    bar = fast_module.FastProgressBar(max_value=100, fd=fd)
+    assert bar._fast_elapsed() == 0.0
+
+
+def test_fast_format_line_with_eta_calculation():
+    """Test ETA calculation path with done > 0 and elapsed > 0."""
+    from datetime import datetime, timedelta
+
+    fd = TTY()
+    bar = fast_module.FastProgressBar(max_value=100, fd=fd)
+    # Manually set start_time to the past to ensure elapsed > 0
+    bar.start_time = datetime.now() - timedelta(seconds=2)
+    bar.value = 50  # Set done=50 to enable ETA calculation
+    line = bar._format_line()
+    # With done > 0 and elapsed > 0, ETA is computed instead of '--:--:--'.
+    assert 'ETA:' in line
+    assert 'ETA: --:--:--' not in line
+
+
+def test_fast_format_line_uses_native_hook(monkeypatch):
+    """The native `_format_fast_line` hook takes precedence when set."""
+
+    def stub(bar) -> str:
+        return 'NATIVE_HOOK_OUTPUT'
+
+    monkeypatch.setattr(fast_module, '_format_fast_line', stub)
+    fd = TTY()
+    bar = fast_module.FastProgressBar(max_value=100, fd=fd)
+    assert bar._format_line() == 'NATIVE_HOOK_OUTPUT'
+
+
+def test_fast_unknown_length_renders_count_and_elapsed():
+    fd = TTY()
+    bar = fast_module.FastProgressBar(
+        max_value=progressbar.UnknownLength, fd=fd
+    )
+    out = list(bar(iter(range(40))))
+    assert out == list(range(40))
+    assert bar.value == 39
+    last = fd.repaints()[-1]
+    assert 'Elapsed Time:' in last
+    assert '40' in last  # the count is shown
+    assert ' of ' not in last  # no "(n of max)" when length unknown
+
+
+def test_fast_prefix_suffix_in_line_not_widgets():
+    fd = TTY()
+    bar = fast_module.FastProgressBar(
+        max_value=10, fd=fd, prefix='load ', suffix=' done'
+    )
+    list(bar(range(10)))
+    assert bar.widgets == []  # prefix/suffix not injected as widgets
+    last = fd.repaints()[-1]
+    assert last.lstrip().startswith('load')
+    assert 'done' in last
+
+
+def test_fast_empty_iterable():
+    fd = TTY()
+    bar = fast_module.FastProgressBar(max_value=0, fd=fd)
+    assert list(bar([])) == []
+    assert bar._finished
+
+
+def test_fast_break_restores_streams():
+    real_out = sys.stdout
+    fd = TTY()
+    bar = fast_module.FastProgressBar(
+        max_value=1000, fd=fd, redirect_stdout=True
+    )
+    for i in bar(range(1000)):
+        if i == 5:
+            break
+    del bar
+    gc.collect()
+    assert sys.stdout is real_out
+
+
+def test_fast_with_statement():
+    fd = TTY()
+    with fast_module.FastProgressBar(max_value=10, fd=fd) as bar:
+        out = list(bar(range(10)))
+    assert out == list(range(10))
+    assert bar._finished
+
+
+def test_shortcut_dispatch(monkeypatch):
+    # Record which class the shortcut constructs for each input combination.
+    from progressbar import shortcuts
+
+    calls = {'fast': 0, 'full': 0}
+
+    class FastSpy(fast_module.FastProgressBar):
+        def __init__(self, *a, **k):
+            calls['fast'] += 1
+            super().__init__(*a, **k)
+
+    class FullSpy(progressbar.ProgressBar):
+        def __init__(self, *a, **k):
+            calls['full'] += 1
+            super().__init__(*a, **k)
+
+    monkeypatch.setattr(shortcuts.fast_module, 'FastProgressBar', FastSpy)
+    monkeypatch.setattr(shortcuts.bar, 'ProgressBar', FullSpy)
+
+    # Default (no widgets, no fast flag) -> fast.
+    assert list(shortcuts.progressbar(range(3), fd=TTY())) == [0, 1, 2]
+    assert calls == {'fast': 1, 'full': 0}
+
+    # Custom widgets -> full.
+    list(
+        shortcuts.progressbar(
+            range(3), fd=TTY(), widgets=[progressbar.Percentage()]
+        )
+    )
+    assert calls == {'fast': 1, 'full': 1}
+
+    # fast=False -> full even with no widgets.
+    list(shortcuts.progressbar(range(3), fd=TTY(), fast=False))
+    assert calls == {'fast': 1, 'full': 2}
+
+    # Env override forces full.
+    monkeypatch.setenv('PROGRESSBAR_DISABLE_FASTPATH', '1')
+    list(shortcuts.progressbar(range(3), fd=TTY()))
+    assert calls == {'fast': 1, 'full': 3}
+
+    # Dynamic variables force full (the fast formatter can't render them).
+    monkeypatch.delenv('PROGRESSBAR_DISABLE_FASTPATH', raising=False)
+    list(shortcuts.progressbar(range(3), fd=TTY(), variables={'x': 1}))
+    assert calls == {'fast': 1, 'full': 4}
+
+
+def test_full_bar_injects_prefix_suffix_widgets():
+    # The full ProgressBar (unlike the fast bar) injects prefix/suffix as
+    # FormatLabel widgets in start(); exercise that path directly.
+    fd = TTY()
+    bar_ = progressbar.ProgressBar(
+        max_value=10, fd=fd, prefix='pre ', suffix=' suf'
+    )
+    list(bar_(range(10)))
+    assert bar_.widgets  # widgets were built (not the fast empty list)
+    last = fd.repaints()[-1]
+    assert 'pre' in last
+    assert 'suf' in last
+
+
+def test_import_progressbar_is_lazy():
+    # A fresh interpreter: `import progressbar` must not eagerly pull heavy
+    # submodules; FastProgressBar still resolves lazily.
+    import subprocess
+
+    check = (
+        'import sys, progressbar\n'
+        'assert "progressbar.multi" not in sys.modules\n'
+        'assert progressbar.FastProgressBar is not None\n'
+        'print("ok")\n'
+    )
+    out = subprocess.run(
+        [sys.executable, '-c', check], capture_output=True, text=True
+    )
+    assert out.returncode == 0, out.stderr
+    assert 'ok' in out.stdout
+
+
+def test_fast_path_does_not_import_widgets_or_colors():
+    # Running the fast default end-to-end must not pull in the widgets module
+    # or the terminal colour tables (the heaviest imports).
+    import subprocess
+
+    check = (
+        'import sys, progressbar\n'
+        'list(progressbar.progressbar(range(10)))\n'
+        'assert "progressbar.widgets" not in sys.modules\n'
+        'assert "progressbar.terminal.colors" not in sys.modules\n'
+        'print("ok")\n'
+    )
+    out = subprocess.run(
+        [sys.executable, '-c', check], capture_output=True, text=True
+    )
+    assert out.returncode == 0, out.stderr
+    assert 'ok' in out.stdout
diff --git a/tests/test_fastpath.py b/tests/test_fastpath.py
new file mode 100644
index 0000000..50d1b2c
--- /dev/null
+++ b/tests/test_fastpath.py
@@ -0,0 +1,609 @@
+# tests/test_fastpath.py
+from __future__ import annotations
+
+import gc
+import io
+import itertools
+import re
+import sys
+import typing
+
+import pytest
+
+import progressbar
+
+_ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*m')
+_PERCENT = re.compile(r'(\d+)%')
+
+
+def _drawn_percentages(repaints: list[str]) -> list[int]:
+    """Extract the integer percentage rendered in each repaint frame.
+
+    The ``Percentage`` widget renders e.g. ``  4%|###...`` (no space before the
+    bar), so the ``%`` token is glued to the bar body; a regex is more robust
+    than whitespace tokenization.
+    """
+    out: list[int] = []
+    for frame in repaints:
+        match = _PERCENT.search(_ANSI_ESCAPE.sub('', frame))
+        if match:
+            out.append(int(match.group(1)))
+    return out
+
+
+def _assert_cadence_parity(gated: list[str], ungated: list[str]) -> None:
+    """Assert the gated run kept the ungated run's rate-limited cadence.
+
+    This is the correct equivalence criterion (NOT byte-exact frames): the gate
+    may legitimately differ by a frame or two: its step is sized by time,
+    but it must not silently drop a large fraction of redraws the way the
+    original regression did (16 gated vs. 25 ungated buckets, a ~36% drop). The
+    checks below fail for such a gate while tolerating the benign +/-1 frame
+    wobble of the closed loop.
+    """
+    g_count = len(gated)
+    u_count = len(ungated)
+    # 1) Rate-limited cadence parity: counts within a frame or two of each
+    #    other. A ~36% drop (e.g. 21 vs 33) fails this by a wide margin.
+    assert abs(g_count - u_count) <= 2, (
+        f'gated redraw count {g_count} diverged from ungated {u_count} '
+        f'beyond rate-limited wobble'
+    )
+    # Sanity: the slow loop really did redraw many distinct frames, so the
+    # comparison is meaningful (not "both drew nothing").
+    assert len(set(gated)) > 10
+
+    g_pcts = _drawn_percentages(gated)
+    u_pcts = _drawn_percentages(ungated)
+    assert g_pcts, 'no percentage tokens found in gated frames'
+    # 2) Monotonic and reaches 100% at the end.
+    assert g_pcts == sorted(g_pcts), (
+        f'gated percentages not monotonic: {g_pcts}'
+    )
+    assert g_pcts[-1] == 100, f'gated did not reach 100%: {g_pcts[-1]}'
+
+    # 3) No large gap: ignoring the final jump to 100% (the loop only covers
+    #    part of the range, then finish() snaps to 100%), no consecutive
+    #    percentages is farther apart than a small multiple of the ungated
+    #    per-redraw window. A gate that drops whole stretches of the bar shows
+    #    up as an oversized inner gap here.
+    inner_gaps = [g_pcts[i + 1] - g_pcts[i] for i in range(len(g_pcts) - 2)]
+    ungated_window = max(
+        (u_pcts[i + 1] - u_pcts[i] for i in range(len(u_pcts) - 2)),
+        default=1,
+    )
+    if inner_gaps:
+        assert max(inner_gaps) <= 3 * max(ungated_window, 1), (
+            f'gated skipped a stretch of the bar: max inner gap '
+            f'{max(inner_gaps)} > 3x ungated window {ungated_window}'
+        )
+
+
+class RecordingTTY(io.StringIO):
+    """A fake terminal that records each repaint (\\r-delimited write)."""
+
+    def isatty(self) -> bool:
+        return True
+
+    def repaints(self) -> list[str]:
+        # Each redraw starts with '\r'; split and drop the empty head.
+        return [p for p in self.getvalue().split('\r') if p]
+
+
+def run_iter(n: int, **kwargs: typing.Any) -> tuple[RecordingTTY, list[int]]:
+    fd = RecordingTTY()
+    seen = list(progressbar.progressbar(range(n), fd=fd, **kwargs))
+    return fd, seen
+
+
+def test_iterates_all_items_in_order():
+    _, seen = run_iter(2000)
+    assert seen == list(range(2000))
+
+
+def test_value_is_live_during_iteration():
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=500, fd=fd)
+    last = -1
+    for i in bar(range(500)):
+        # bar.value == i: value reflects items yielded so far (pre-increment),
+        # so at the start of the body for item i, value is i (not i+1).
+        assert bar.value == i, f'bar.value mismatch at i={i}: got {bar.value}'
+        # previous_value stays byte-identical to the pre-gate behavior on
+        # EVERY iteration (not just at redraws): the value before the current
+        # one (0 for the first item, set by start()'s forced draw).
+        expected_prev = i - 1 if i else 0
+        assert bar.previous_value == expected_prev, (
+            f'previous_value mismatch at i={i}: got {bar.previous_value}'
+        )
+        last = i
+    assert last == 499
+
+
+def test_final_repaint_reaches_completion():
+    fd, _ = run_iter(1000)
+    repaints = fd.repaints()
+    assert repaints, 'expected at least one repaint'
+    assert '100%' in repaints[-1]
+
+
+def test_repaints_are_monotonic_in_percentage():
+    fd, _ = run_iter(5000)
+    pcts = []
+    for p in fd.repaints():
+        # Repaints contain ANSI color codes; strip before tokenizing.
+        plain = _ANSI_ESCAPE.sub('', p)
+        for tok in plain.split():
+            if tok.endswith('%'):
+                pcts.append(float(tok[:-1]))
+                break
+    assert pcts, 'expected at least one percentage token in repaints'
+    assert pcts == sorted(pcts), 'percentage went backwards'
+    assert pcts[0] >= 0 and pcts[-1] == 100.0
+
+
+def test_empty_iterable_finishes_cleanly():
+    fd, seen = run_iter(0)
+    assert seen == []
+    assert fd.getvalue() != ''  # start+finish still draw
+
+
+def test_single_item():
+    fd, seen = run_iter(1)
+    assert seen == [0]
+    assert '100%' in fd.repaints()[-1]
+
+
+def test_early_break_finishes_dirty():
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=1000, fd=fd)
+    for i in bar(range(1000)):
+        if i == 10:
+            break
+    del bar  # trigger GeneratorExit cleanup path (issue #212)
+    gc.collect()
+    # A dirty finish must NOT jump the bar to 100%.
+    assert '100%' not in fd.repaints()[-1]
+
+
+def test_exception_in_body_propagates_and_finishes():
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=1000, fd=fd)
+
+    class BoomError(Exception):
+        pass
+
+    with pytest.raises(BoomError):
+        for i in bar(range(1000)):
+            if i == 5:
+                raise BoomError
+    gc.collect()
+    assert fd.getvalue() != ''
+
+
+def fixed_clock(monkeypatch, dt: float):
+    """Patch the timer used by bar.py to advance by `dt` per read."""
+    bar_module = progressbar.bar
+
+    counter = itertools.count()
+
+    def fake_timer() -> float:
+        return next(counter) * dt
+
+    monkeypatch.setattr(bar_module.timeit, 'default_timer', fake_timer)
+
+
+def test_redraw_count_is_rate_limited(monkeypatch):
+    # ~1ms per timer read, 50ms min_poll_interval => far fewer redraws than N.
+    fixed_clock(monkeypatch, dt=0.001)
+    fd, _ = run_iter(20000)
+    n_repaints = len(fd.repaints())
+    assert 1 < n_repaints < 2000, n_repaints  # not one-per-iteration
+
+
+def test_gate_state_initialized():
+    bar = progressbar.ProgressBar(max_value=100)
+    assert bar._gate_enabled is True
+    assert bar._gate_step >= 1
+    assert bar._next_update == 0
+    assert bar._last_drawn_value is None
+
+
+def _controlled_clock(monkeypatch) -> list[float]:
+    """Patch bar.py's timer to read one mutable value; return that list."""
+    clock = [0.0]
+    monkeypatch.setattr(
+        progressbar.bar.timeit, 'default_timer', lambda: clock[0]
+    )
+    return clock
+
+
+def test_gate_calibrates_step_from_measured_rate(monkeypatch):
+    # The gate calibrates _gate_step from the value/time elapsed between two
+    # redraws (no separate _gate_last_* state needed). UnknownLength makes any
+    # value advance redraw (rate-limited), so the measurement is deterministic.
+    clock = _controlled_clock(monkeypatch)
+    bar = progressbar.ProgressBar(
+        max_value=progressbar.UnknownLength, fd=RecordingTTY()
+    )
+    bar.min_poll_interval = 0.05
+    bar.start()  # forced draw at t=0; no prior sample, so step stays 1
+    assert bar._gate_step == 1
+    clock[0] = 0.10  # 0.10 s later
+    bar.update(1000)  # redraw: 1000 iters over 0.10 s
+    # step = int((1000 - 0) * min_poll_interval / interval) = 1000*0.05/0.10
+    assert bar._gate_step == 500
+    assert bar._next_update == 1000 + 500
+
+
+def test_gate_backs_off_when_calibrated_and_no_redraw(monkeypatch):
+    clock = _controlled_clock(monkeypatch)
+    bar = progressbar.ProgressBar(
+        max_value=progressbar.UnknownLength, fd=RecordingTTY()
+    )
+    bar.min_poll_interval = 0.05
+    bar.start()
+    clock[0] = 0.10
+    bar.update(1000)  # calibrate: step=500, _next_update=1500
+    step = bar._gate_step
+    assert step == 500
+    # Time frozen: an update past the threshold finds delta == 0 (no redraw),
+    # so the gate backs off (doubles the step) instead of re-checking often.
+    bar.update(1500)
+    assert bar._gate_step == step * 2
+    assert bar._next_update == 1500 + step * 2
+
+
+def test_previous_value_tracks_last_redraw(monkeypatch):
+    fixed_clock(monkeypatch, dt=0.001)
+    bar = progressbar.ProgressBar(max_value=10000, fd=RecordingTTY())
+    bar.start()
+    drawn: list[int] = []
+    real_parents = bar._update_parents
+
+    def spy(value):
+        real_parents(value)
+        drawn.append(bar.value)
+
+    bar._update_parents = spy
+    for i in range(1, 10001):
+        bar.update(i)
+    bar.finish()
+    # previous_value must equal one of the actually-drawn values, not i-1.
+    assert bar.previous_value in drawn
+
+
+def test_last_drawn_value_pinned_on_skipped_update(monkeypatch):
+    """The gate's pixel reference advances only when a redraw happens.
+
+    `_last_drawn_value` (the private pixel reference used by `_needs_update`)
+    must stay pinned to the value at the last actual draw, even as later
+    `update()` calls advance `self.value` without redrawing. The public
+    `previous_value` keeps its original meaning: the value before the most
+    recent `update()` call.
+
+    After a draw at value=3 (from 0) and two rate-limited skips at 4 then 5:
+        _last_drawn_value == 3  (pinned to the drawn value, for pixel check)
+        previous_value     == 4  (value before the update(5) call)
+    """
+    bar_module = progressbar.bar
+
+    # Freeze-then-advance clock: start at 0, jump to 1.0 so update(3) draws,
+    # then keep it at 1.0 so subsequent updates are rate-limited (skipped).
+    _time: list[float] = [0.0]
+
+    def timer() -> float:
+        return _time[0]
+
+    monkeypatch.setattr(bar_module.timeit, 'default_timer', timer)
+
+    bar = progressbar.ProgressBar(max_value=100, fd=RecordingTTY())
+    bar.start()  # _last_update_timer = 0.0
+
+    # Advance time far past min_poll_interval (0.05 s) => update(3) draws.
+    _time[0] = 1.0
+    bar.update(3)
+    assert bar._last_drawn_value == 3  # a redraw happened at value 3
+
+    # Time frozen at 1.0: delta == 0 => _needs_update() returns False, so the
+    # next updates advance self.value but do not redraw.
+    bar.update(4)
+    bar.update(5)
+
+    # Pixel reference stays at the last drawn value; public previous_value
+    # tracks the value before the most recent update() call.
+    assert bar._last_drawn_value == 3, (
+        f'_last_drawn_value should stay at last-drawn (3), '
+        f'got {bar._last_drawn_value!r}'
+    )
+    assert bar.value == 5  # liveness preserved on the manual path
+    assert bar.previous_value == 4  # value before update(5)
+
+
+def test_gate_disabled_skips_calibration():
+    """When _gate_enabled is False the gate is never (re)calibrated."""
+    bar = progressbar.ProgressBar(max_value=100, fd=RecordingTTY())
+    bar.start()
+    bar._gate_enabled = False
+    initial_next = bar._next_update
+    bar.update(50)
+    # Neither the calibration nor the back-off branch runs: _next_update is
+    # left untouched while the fast path is disabled.
+    assert bar._next_update == initial_next
+
+
+@pytest.mark.no_freezegun
+def test_manual_update_skips_clock_when_gated(monkeypatch):
+    bar_module = progressbar.bar
+
+    reads: dict[str, int] = {'n': 0}
+    real = bar_module.timeit.default_timer
+
+    def counting() -> float:
+        reads['n'] += 1
+        return real()
+
+    bar = progressbar.ProgressBar(max_value=10**7, fd=RecordingTTY())
+    bar.start()
+    monkeypatch.setattr(bar_module.timeit, 'default_timer', counting)
+    before = reads['n']
+    for i in range(1, 1_000_001):
+        bar.update(i)
+    reads_during = reads['n'] - before
+    bar.finish()
+    # Far fewer clock reads than updates (gate skips the common path).
+    assert reads_during < 100_000, reads_during
+
+
+def _iter_clock(monkeypatch, dt: float) -> dict[str, int]:
+    """Patch the timer so its value depends on a shared loop ITERATION.
+
+    Unlike ``fixed_clock`` (which ties time to the *number of reads*), this
+    makes the clock return ``state['i'] * dt`` regardless of how many times
+    it is read. The gated and ungated bars read the clock a different number
+    of times, so a per-read clock would make them diverge for the wrong
+    reason. Tying time to the iteration index keeps both runs seeing the
+    exact same wall time at every iteration.
+    """
+    bar_module = progressbar.bar
+
+    state: dict[str, int] = {'i': 0}
+    monkeypatch.setattr(
+        bar_module.timeit,
+        'default_timer',
+        lambda: state['i'] * dt,
+    )
+    return state
+
+
+def _drawn_frames(
+    disable_gate: bool,
+    monkeypatch,
+    *,
+    widgets: list | None = None,
+    dt: float = 0.06,
+    n: int = 4000,
+    maxv: int = 10_000,
+) -> list[str]:
+    state = _iter_clock(monkeypatch, dt)
+    fd = RecordingTTY()
+    if widgets is None:
+        # poll_interval stays None for this widget set, which is the case
+        # that exposed the uncalibrated back-off bug.
+        widgets = [progressbar.Percentage(), progressbar.Bar()]
+    bar = progressbar.ProgressBar(max_value=maxv, fd=fd, widgets=widgets)
+    bar.start()
+    if disable_gate:
+        bar._gate_enabled = False
+    for i in range(1, n + 1):
+        state['i'] = i  # advance wall time per ITERATION
+        bar.update(i)
+    bar.finish()
+    return fd.repaints()
+
+
+def test_gated_matches_ungated_drawn_frames(monkeypatch):
+    """The gate must keep the ungated rate-limited cadence (manual path).
+
+    For a ``poll_interval is None`` bar over a slow loop (``dt`` >=
+    ``min_poll_interval`` so a redraw is due at each item), a gated bar must
+    redraw at the same rate-limited cadence as an identical bar with the gate
+    disabled. This is the reviewer's repro of the regression where the gate
+    dropped ~36% of the buckets the baseline rendered.
+
+    The criterion is rate-limited cadence parity, NOT byte-exact frames: the
+    closed-loop gate sizes its step by time, so a +/-1 frame wobble is benign
+    and expected. ``_assert_cadence_parity`` tolerates that wobble while still
+    failing for a gate that drops a large fraction of redraws.
+    """
+    with monkeypatch.context() as m:
+        gated = _drawn_frames(False, m)
+    with monkeypatch.context() as m:
+        ungated = _drawn_frames(True, m)
+
+    _assert_cadence_parity(gated, ungated)
+
+
+def _drawn_frames_iter(
+    disable_gate: bool,
+    monkeypatch,
+    *,
+    widgets: list | None = None,
+    dt: float = 0.06,
+    n: int = 4000,
+    maxv: int = 10_000,
+) -> list[str]:
+    """Drive the bar through its ITERATOR path and record drawn frames.
+
+    Mirrors ``_drawn_frames`` but uses ``ProgressBar.__iter__`` (the iterator
+    fast path) instead of manual ``update()`` calls. ``__iter__`` skips
+    ``start()`` when ``start_time`` is already set, so we call ``start()``
+    explicitly first (which resets the gate via ``init()``), then flip
+    ``_gate_enabled`` to choose gated vs. ungated. Both runs share the same
+    iteration-driven clock so they observe identical wall time per iteration.
+    """
+    state = _iter_clock(monkeypatch, dt)
+    fd = RecordingTTY()
+    if widgets is None:
+        # poll_interval stays None for this widget set, which is the case
+        # that exposed the uncalibrated back-off bug in the iterator path.
+        widgets = [progressbar.Percentage(), progressbar.Bar()]
+    bar = progressbar.ProgressBar(max_value=maxv, fd=fd, widgets=widgets)
+    bar.start()  # resets _gate_enabled via init(); primes start_time
+    if disable_gate:
+        bar._gate_enabled = False
+
+    class _IterClockRange:
+        """An iterable that advances the shared clock once per item.
+
+        Returning a fresh iterator each time keeps ``bar.value`` and the
+        iteration index aligned regardless of how many times the clock is read.
+        """
+
+        def __len__(self) -> int:
+            return n
+
+        def __iter__(self):
+            for i in range(n):
+                state['i'] = i  # advance wall time per ITERATION
+                yield i
+
+    # __iter__ does not call start() again because start_time is already set.
+    for _ in bar(_IterClockRange()):
+        pass
+    return fd.repaints()
+
+
+def test_iterator_gated_matches_ungated_drawn_frames(monkeypatch):
+    """The ITERATOR-path gate must keep the ungated rate-limited cadence.
+
+    Reviewer's repro of the regression in ``__iter__``: for a
+    ``poll_interval is None`` bar over a slow, iteration-driven clock
+    (``dt`` >= ``min_poll_interval`` so a redraw is due at each item), the
+    iterator path's inline gate skipped ``update()`` based on ``_next_update``
+    with a bogus pre-measurement step, leaping over whole buckets and dropping
+    redraws the ungated bar rendered.
+
+    With the fix (``_gate_step`` starts at 1 so ``__iter__`` calls ``update()``
+    every iteration until a real measurement grows it), the gated
+    iterator must redraw at the same rate-limited cadence as an identical bar
+    driven through the same iterator with the gate disabled. As in the manual
+    path the criterion is cadence parity, not byte-exact frames.
+    """
+    with monkeypatch.context() as m:
+        gated = _drawn_frames_iter(False, m)
+    with monkeypatch.context() as m:
+        ungated = _drawn_frames_iter(True, m)
+
+    _assert_cadence_parity(gated, ungated)
+
+
+# NOTE: A default-widget bar (poll_interval is set by the Timer/animation
+# widgets) intentionally does NOT get a byte-exact equivalence test. Its
+# redraws are time-driven, not value-driven, so matching the ungated frame
+# sequence would require the gate to read the clock on every call - which is
+# precisely the read the gate exists to skip. The correctness obligation only
+# binds the value-driven (poll_interval is None) case above, which is also the
+# case that exposed the uncalibrated back-off bug.
+
+
+def test_next_direct_exhaustion_calls_finish():
+    """Direct next(bar) still finishes the bar on StopIteration."""
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=2, fd=fd)
+    bar(range(2))
+    bar.start()
+    assert next(bar) == 0
+    assert next(bar) == 1
+    with pytest.raises(StopIteration):
+        next(bar)  # exhausts iterable, calls finish()
+    assert '100%' in fd.repaints()[-1]
+
+
+def test_shortcut_has_single_generator_layer():
+    import types
+
+    gen = progressbar.progressbar(range(3), fd=RecordingTTY())
+    assert isinstance(gen, types.GeneratorType)
+    # It is the bar's own iterator generator, not a wrapper: compare the
+    # generator's code object to ProgressBar._iter_python (the pure-Python
+    # path `__iter__` dispatches to; robust across versions). The autouse
+    # `disable_native_accelerator` fixture forces this path here.
+    assert gen.gi_code is progressbar.ProgressBar._iter_python.__code__
+
+
+def test_env_disables_fastpath(monkeypatch):
+    monkeypatch.setenv('PROGRESSBAR_DISABLE_FASTPATH', '1')
+    bar = progressbar.ProgressBar(max_value=100, fd=RecordingTTY())
+    bar.start()
+    assert bar._gate_enabled is False
+
+
+def test_zero_min_poll_interval_disables_gate():
+    # Build a bar and force min_poll_interval to zero on the *instance* (not
+    # the class) before calling start(), so the class version-tag is
+    # untouched and CPython's adaptive specialiser for __iter__ is not
+    # disturbed.
+    bar = progressbar.ProgressBar(max_value=100, fd=RecordingTTY())
+    bar.min_poll_interval = 0.0  # instance-level override, zero rate-limit
+    bar.start()
+    # With no rate limit the user wants every update considered.
+    assert bar._gate_enabled is False
+
+
+@pytest.mark.no_freezegun
+@pytest.mark.skipif(
+    sys.gettrace() is not None,
+    reason='coverage tracing inflates per-iteration cost; benchmark skipped',
+)
+def test_iterator_overhead_is_low():
+    import timeit as _t
+
+    # Use the REAL clock (no fixed_clock): under a frozen clock (dt == 0) the
+    # corrected gate never calibrates, so update() runs every iteration and the
+    # measurement no longer reflects the gated fast path. A real, advancing
+    # `perf_counter` lets the gate calibrate and skip as it does in production.
+    fd = RecordingTTY()
+    n = 200_000
+    t = min(
+        _t.timeit(
+            lambda: [None for _ in progressbar.progressbar(range(n), fd=fd)],
+            number=1,
+        )
+        for _ in range(3)
+    )
+    ns = t / n * 1e9
+    # Generous smoke gate only; the authoritative per-iteration budget is
+    # enforced in tests/test_perf_budget.py (Task 8).
+    assert ns < 200, f'{ns:.1f} ns/iter (real clock)'
+
+
+def test_no_color_fast_path_and_ansi():
+    # Render-cost optimization adds a no-ESC fast path to no_color/len_color.
+    # It must be identical to the regex path for both plain and ANSI input.
+    utils = progressbar.utils
+
+    # Fast path (no ESC byte): returned unchanged, str and bytes.
+    assert utils.no_color('plain text') == 'plain text'
+    assert utils.no_color(b'plain bytes') == b'plain bytes'
+    assert utils.len_color('plain') == 5
+    # Regex path (ANSI present): escape sequences stripped, str and bytes.
+    assert utils.no_color('\x1b[31mred\x1b[0m') == 'red'
+    assert utils.no_color(b'\x1b[31mred\x1b[0m') == b'red'
+    assert utils.len_color('\x1b[1mbold\x1b[0m') == 4
+
+
+def test_render_output_stable(monkeypatch):
+    # Guard the default-widget render path against the render-cost
+    # optimization changing appearance: the final repaint must reach 100%.
+    fixed_clock(monkeypatch, dt=10.0)  # force a redraw on every forced update
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=100, fd=fd)
+    bar.start()
+    for i in range(1, 101):
+        bar.update(i, force=True)
+    bar.finish()
+    repaints = fd.repaints()
+    assert repaints
+    last = _ANSI_ESCAPE.sub('', repaints[-1])
+    assert last.strip().startswith('100%')
diff --git a/tests/test_native_accelerator.py b/tests/test_native_accelerator.py
new file mode 100644
index 0000000..d88029c
--- /dev/null
+++ b/tests/test_native_accelerator.py
@@ -0,0 +1,286 @@
+# tests/test_native_accelerator.py
+"""Tests for the optional native (Cython) iterator accelerator.
+
+Two groups:
+
+* Integration-coverage tests that exercise ``ProgressBar.__iter__`` dispatch
+  and the ``_fast_*`` protocol hooks **without** needing the compiled
+  ``speedups`` package (using a fake iterator / direct calls), so they run —
+  and keep ``bar.py`` at 100% coverage — in CI where ``speedups`` is absent.
+* End-to-end equivalence tests marked ``@requires_speedups`` that drive the
+  real ``speedups.progressbar.FastBarIterator``; they run wherever it is
+  installed (dev/bench env) and are skipped otherwise.
+
+The conftest ``disable_native_accelerator`` autouse fixture forces the
+pure-Python path for the rest of the suite; here we restore the real iterator
+explicitly where needed.
+"""
+
+from __future__ import annotations
+
+import gc
+import io
+import re
+import sys
+
+import pytest
+
+import progressbar
+
+# Alias (not a `from` import) so CodeQL doesn't flag `progressbar` as imported
+# with both `import` and `import from`.
+bar_module = progressbar.bar
+
+# Captured at import, before the autouse fixture nulls it for each test.
+_REAL_FAST = bar_module._FastBarIterator
+HAS_SPEEDUPS = _REAL_FAST is not None
+requires_speedups = pytest.mark.skipif(
+    not HAS_SPEEDUPS,
+    reason='native accelerator (speedups package) not installed',
+)
+
+_PERCENT = re.compile(r'(\d+)%')
+_ANSI = re.compile(r'\x1b\[[0-9;]*m')
+
+
+class TTY(io.StringIO):
+    def isatty(self) -> bool:
+        return True
+
+
+class RecordingTTY(io.StringIO):
+    def isatty(self) -> bool:
+        return True
+
+    def repaints(self) -> list[str]:
+        return [p for p in self.getvalue().split('\r') if p]
+
+
+def _percentages(frames: list[str]) -> list[int]:
+    out: list[int] = []
+    for frame in frames:
+        match = _PERCENT.search(_ANSI.sub('', frame))
+        if match:
+            out.append(int(match.group(1)))
+    return out
+
+
+class _FakeFast:
+    """Stand-in for FastBarIterator: records construction, yields nothing.
+
+    Lets the native dispatch branch be covered without the compiled package.
+    """
+
+    def __init__(self, bar, iterable):
+        self.bar = bar
+        self.iterable = iterable
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise StopIteration
+
+
+# --- dispatch coverage (no compiled speedups required) --------------------
+
+
+def test_iter_uses_native_when_available(monkeypatch):
+    monkeypatch.setattr(bar_module, '_FastBarIterator', _FakeFast)
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    iterable = range(10)
+    it = iter(bar(iterable))
+    assert isinstance(it, _FakeFast)
+    assert it.bar is bar
+    assert it.iterable is bar._iterable
+
+
+def test_iter_falls_back_when_native_absent(monkeypatch):
+    monkeypatch.setattr(bar_module, '_FastBarIterator', None)
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    it = iter(bar(range(10)))
+    assert not isinstance(it, _FakeFast)
+    assert list(it) == list(range(10))
+
+
+def test_iter_falls_back_without_iterable(monkeypatch):
+    # Native needs an iterable; iterating a bar without one must not use it.
+    monkeypatch.setattr(bar_module, '_FastBarIterator', _FakeFast)
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    it = iter(bar)
+    assert not isinstance(it, _FakeFast)
+
+
+def test_iter_falls_back_when_env_disabled(monkeypatch):
+    monkeypatch.setattr(bar_module, '_FastBarIterator', _FakeFast)
+    monkeypatch.setenv('PROGRESSBAR_DISABLE_FASTPATH', '1')
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    it = iter(bar(range(10)))
+    assert not isinstance(it, _FakeFast)
+    assert list(it) == list(range(10))
+
+
+# --- protocol hook unit coverage (no compiled speedups required) ----------
+
+
+def test_fast_begin_starts_once():
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    assert bar.start_time is None
+    bar._fast_begin()
+    assert bar.start_time is not None
+    started = bar.start_time
+    bar._fast_begin()  # already started: no-op
+    assert bar.start_time is started
+
+
+def test_fast_tick_updates_value():
+    bar = progressbar.ProgressBar(max_value=100, fd=TTY())
+    bar._fast_begin()
+    bar._fast_tick(50)
+    assert bar.value == 50
+
+
+def test_fast_end_finishes_at_100():
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    bar._fast_begin()
+    bar._fast_end()
+    assert bar._finished
+    assert bar.value == bar.max_value
+
+
+def test_fast_end_dirty_keeps_partial_value():
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    bar._fast_begin()
+    bar._fast_tick(3)
+    bar._fast_end_dirty()
+    assert bar._finished
+    assert bar.value == 3  # not snapped to max_value
+
+
+# --- end-to-end with the real compiled accelerator ------------------------
+
+
+@pytest.fixture
+def native(monkeypatch):
+    """Restore the real FastBarIterator for a single test."""
+    monkeypatch.setattr(bar_module, '_FastBarIterator', _REAL_FAST)
+    return _REAL_FAST
+
+
+@requires_speedups
+def test_native_iterator_type(native):
+    bar = progressbar.ProgressBar(max_value=10, fd=TTY())
+    it = iter(bar(range(10)))
+    assert type(it) is _REAL_FAST
+
+
+@requires_speedups
+def test_native_yields_all_items_and_final_value(native):
+    bar = progressbar.ProgressBar(max_value=100, fd=RecordingTTY())
+    out = list(bar(range(100)))
+    assert out == list(range(100))
+    assert bar.value == 100
+    assert bar.percentage == 100.0
+    assert bar._finished
+
+
+@requires_speedups
+def test_native_renders_and_finishes_at_100(native):
+    fd = RecordingTTY()
+    list(progressbar.progressbar(range(500), fd=fd))
+    frames = fd.repaints()
+    assert frames, 'native path drew nothing'
+    pcts = _percentages(frames)
+    assert pcts == sorted(pcts), f'percentages not monotonic: {pcts}'
+    assert pcts[-1] == 100
+
+
+@requires_speedups
+def test_native_matches_fallback_items(native, monkeypatch):
+    # Native run.
+    native_items = list(progressbar.progressbar(range(250), fd=RecordingTTY()))
+    # Fallback run (force pure-Python).
+    monkeypatch.setattr(bar_module, '_FastBarIterator', None)
+    fallback_items = list(
+        progressbar.progressbar(range(250), fd=RecordingTTY())
+    )
+    assert native_items == fallback_items == list(range(250))
+
+
+@requires_speedups
+def test_native_generator_input(native):
+    def gen():
+        yield from range(30)
+
+    bar = progressbar.ProgressBar(max_value=30, fd=RecordingTTY())
+    assert list(bar(gen())) == list(range(30))
+    assert bar.value == 30
+
+
+@requires_speedups
+def test_native_unknown_length(native):
+    bar = progressbar.ProgressBar(
+        max_value=progressbar.UnknownLength, fd=RecordingTTY()
+    )
+    out = list(bar(iter(range(40))))
+    assert out == list(range(40))
+    assert bar.value == 39
+    assert bar._finished
+
+
+@requires_speedups
+def test_native_empty_iterable(native):
+    bar = progressbar.ProgressBar(max_value=0, fd=RecordingTTY())
+    assert list(bar([])) == []
+    assert bar._finished
+
+
+@requires_speedups
+def test_native_with_statement(native):
+    fd = RecordingTTY()
+    with progressbar.ProgressBar(max_value=10, fd=fd) as bar:
+        out = list(bar(range(10)))
+    assert out == list(range(10))
+    assert bar._finished
+
+
+@requires_speedups
+def test_native_overshoot_clamps(native):
+    # max_error=False: iterating past max_value clamps instead of raising.
+    bar = progressbar.ProgressBar(
+        max_value=5, fd=RecordingTTY(), max_error=False
+    )
+    out = list(bar(range(20)))
+    assert out == list(range(20))  # every item still yielded
+    assert bar.value == 5  # clamped to max at finish
+
+
+@requires_speedups
+def test_native_break_restores_streams(native):
+    # Issue #212: breaking out of the loop must restore redirected streams,
+    # which the cdef iterator does via __dealloc__ (no GeneratorExit hook).
+    real_out, real_err = sys.stdout, sys.stderr
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=1000, fd=fd, redirect_stdout=True)
+    for i in bar(range(1000)):
+        assert sys.stdout is not real_out  # redirected while iterating
+        if i == 5:
+            break
+    del bar
+    gc.collect()
+    assert sys.stdout is real_out
+    assert sys.stderr is real_err
+
+
+@requires_speedups
+def test_native_exception_restores_streams(native):
+    real_out = sys.stdout
+    fd = RecordingTTY()
+    bar = progressbar.ProgressBar(max_value=1000, fd=fd, redirect_stdout=True)
+    with pytest.raises(ValueError):
+        for i in bar(range(1000)):
+            if i == 5:
+                raise ValueError('boom')
+    del bar
+    gc.collect()
+    assert sys.stdout is real_out
diff --git a/tests/test_perf_budget.py b/tests/test_perf_budget.py
new file mode 100644
index 0000000..eb65cac
--- /dev/null
+++ b/tests/test_perf_budget.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+import io
+import sys
+import timeit
+
+import pytest
+
+
+class _TTY(io.StringIO):
+    def isatty(self) -> bool:
+        return True
+
+
+def _overhead_ns(n: int = 200_000) -> float:
+    import progressbar
+
+    fd = _TTY()
+    base = min(
+        timeit.timeit(lambda: [None for _ in range(n)], number=1)
+        for _ in range(3)
+    )
+    wrapped = min(
+        timeit.timeit(
+            lambda: [None for _ in progressbar.progressbar(range(n), fd=fd)],
+            number=1,
+        )
+        for _ in range(3)
+    )
+    return (wrapped - base) / n * 1e9
+
+
+def _clock_read_ns(n: int = 200_000) -> float:
+    """Per-iteration cost of a single ``timeit.default_timer()`` read.
+
+    Used as a machine-independent yardstick: it scales with the interpreter
+    and runner speed exactly like the progress-bar wrapper does, so the ratio
+    between them is stable across machines (dev, CI, different Python builds).
+    """
+    timer = timeit.default_timer
+    base = min(
+        timeit.timeit(lambda: [None for _ in range(n)], number=1)
+        for _ in range(5)
+    )
+    read = min(
+        timeit.timeit(lambda: [timer() for _ in range(n)], number=1)
+        for _ in range(5)
+    )
+    return (read - base) / n * 1e9
+
+
+def _coverage_active() -> bool:
+    """Return True when a coverage tracer (sys.settrace) is installed.
+
+    pytest-cov installs a CTracer that adds per-line overhead to every
+    Python frame, distorting the measured iterator cost.  The budget
+    assertion is skipped when tracing is active; the lines still *execute*
+    (satisfying the 100 % coverage gate), only the assert is guarded.
+    """
+    return sys.gettrace() is not None
+
+
+@pytest.mark.no_freezegun
+def test_iterator_overhead_budget() -> None:
+    # Measure both before any early return so every line runs under coverage.
+    ns = _overhead_ns()
+    clock_ns = _clock_read_ns()
+    if _coverage_active():
+        # Coverage tracing inflates per-frame cost; run the measurement (so
+        # all lines are covered) but skip the assertion. The CI perf-budget
+        # step runs with --no-cov, where the assertion is enforced.
+        return
+    # Machine-independent guard. The OLD (pre-gate) path read the clock on
+    # every iteration, so its overhead was ~9x a single clock read; the gated
+    # path reads no clock on the common iteration, so its overhead is ~1x.
+    # A 4x ceiling sits comfortably between the two and tolerates slow/noisy
+    # CI runners and different Python builds (absolute ns vary wildly; the
+    # ratio does not). The point is to catch a return of the per-iteration
+    # clock-read regime, not to micro-police nanoseconds.
+    assert ns < 4 * clock_ns, (
+        f'iterator overhead {ns:.1f} ns/iter exceeded 4x a clock read '
+        f'({clock_ns:.1f} ns) - likely a regression to per-iteration '
+        f'clock reads'
+    )