diff --git a/tests/test_interactiveshell.py b/tests/test_interactiveshell.py
index a66172e46a..083c213dc0 100644
--- a/tests/test_interactiveshell.py
+++ b/tests/test_interactiveshell.py
@@ -50,16 +50,46 @@ class DerivedInterrupt(KeyboardInterrupt):
     pass
 
 
-def test_stream_performance(capsys) -> None:
-    """It should be fast to execute."""
-    src = "for i in range(250_000): print(i)"
-    start = time.perf_counter()
-    ip.run_cell(src)
-    end = time.perf_counter()
-    # We try to read as otherwise on failure, pytest will print the 250k lines to stdout.
-    capsys.readouterr()
-    duration = end - start
-    assert duration < 10
+def test_stream_scales_linearly(capsys) -> None:
+    """Output streaming must scale ~linearly, not O(n²).
+
+    Regression test for #14937: in v9.1.0, ``for i in range(N): print(i)``
+    became ~50x slower because the displayhook accumulated stream output
+    via repeated ``str += data`` (re-allocating the entire string on every
+    print). Fixed in #14941 by switching the bundle to a list and using
+    ``.append``, which is O(1) amortised.
+
+    Rather than asserting an absolute wall-clock budget (which depends on
+    the reviewer's machine and flakes on shared CI / distro build hosts),
+    this measures the timing ratio between two sample sizes that differ
+    by 10x. For O(n) behaviour the time ratio should be ~10; for an O(n²)
+    regression the same input increase produces ~100x time. The empirical
+    measurements on this code path:
+
+      - With the fix in place: ratio is ~5–12 across noisy trials
+      - With the regression reverted: ratio is ~40
+
+    A threshold of 25 catches the regression (40 > 25) with comfortable
+    margin against post-fix noise (worst observed: 12), independent of
+    machine speed.
+    """
+    timings = {}
+    for n in (10_000, 100_000):
+        src = f"for i in range({n}): print(i)"
+        start = time.perf_counter()
+        ip.run_cell(src)
+        # Drain capsys so a failure here doesn't spam pytest output.
+        capsys.readouterr()
+        timings[n] = time.perf_counter() - start
+
+    small, big = timings[10_000], timings[100_000]
+    # Guard against divide-by-zero on absurdly fast machines.
+    ratio = big / max(small, 1e-6)
+    assert ratio < 25, (
+        f"O(n²)-like scaling detected: {small=:.3f}s, {big=:.3f}s, "
+        f"ratio={ratio:.1f} (expected ~10 for linear, ~100 for quadratic — "
+        f"see #14937)"
+    )
 
 
 class InteractiveShellTestCase(unittest.TestCase):