From e2b1b385c9bc3d2b081d1e1895849bf48f5e843c Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 8 Aug 2020 13:32:20 -0700
Subject: [PATCH 01/10] frexp/ldexp variant

---
 Modules/mathmodule.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 411c6eb1935fab..fe1ce6612387c0 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2438,6 +2438,7 @@ static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
     double x, csum = 1.0, oldcsum, frac = 0.0;
+    int max_e;
     Py_ssize_t i;
 
     if (Py_IS_INFINITY(max)) {
@@ -2449,17 +2450,21 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     if (max == 0.0 || n <= 1) {
         return max;
     }
+    frexp(max, &max_e);
     for (i=0 ; i < n ; i++) {
+        int e;
+
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);
-        x /= max;
+        x = frexp(x, &e);
+        x = ldexp(x, e - max_e);
         x = x*x;
         oldcsum = csum;
         csum += x;
         assert(csum >= x);
         frac += (oldcsum - csum) + x;
     }
-    return max * sqrt(csum - 1.0 + frac);
+    return ldexp(sqrt(csum - 1.0 + frac), max_e);
 }
 
 #define NUM_STACK_ELEMS 16

From 7cb895d1bbb8a0a3fa658792fbe740cb2622ebde Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 8 Aug 2020 13:43:57 -0700
Subject: [PATCH 02/10] Multiply by power of two variant

---
 Modules/mathmodule.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index fe1ce6612387c0..ee900ea3f7c5ef 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2437,7 +2437,7 @@ fractional digits to be dropped from *csum*.
 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
-    double x, csum = 1.0, oldcsum, frac = 0.0;
+    double x, csum = 1.0, oldcsum, frac = 0.0, scale;
     int max_e;
     Py_ssize_t i;
 
@@ -2451,20 +2451,18 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         return max;
     }
     frexp(max, &max_e);
+    scale = ldexp(1.0, -max_e);
     for (i=0 ; i < n ; i++) {
-        int e;
-
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);
-        x = frexp(x, &e);
-        x = ldexp(x, e - max_e);
+        x *= scale;
         x = x*x;
         oldcsum = csum;
         csum += x;
         assert(csum >= x);
         frac += (oldcsum - csum) + x;
     }
-    return ldexp(sqrt(csum - 1.0 + frac), max_e);
+    return sqrt(csum - 1.0 + frac) / scale;
 }
 
 #define NUM_STACK_ELEMS 16

From 2d586b19fcaa15d7a33a9e45f0372ca78229fd9c Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 8 Aug 2020 17:23:37 -0700
Subject: [PATCH 03/10] Strengthen assertions

---
 Modules/mathmodule.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index ee900ea3f7c5ef..273e5533214125 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2452,14 +2452,17 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     }
     frexp(max, &max_e);
     scale = ldexp(1.0, -max_e);
+    assert(max * scale >= 0.5);
+    assert(max * scale < 1.0);
     for (i=0 ; i < n ; i++) {
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);
         x *= scale;
         x = x*x;
+        assert(x <= 1.0);
+        assert(csum >= x);
         oldcsum = csum;
         csum += x;
-        assert(csum >= x);
         frac += (oldcsum - csum) + x;
     }
     return sqrt(csum - 1.0 + frac) / scale;

From c15ce79ecfc94c53b4ead47aa612aaa7ec38cf02 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 10:58:06 -0700
Subject: [PATCH 04/10] Added notes and commented out assertions to reach a
 stable resting point

---
 Lib/test/test_math.py |  4 ++--
 Modules/mathmodule.c  | 25 +++++++++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index e06b1e6a5b9b7c..b00256dd2a98e1 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -904,8 +904,8 @@ class T(tuple):
         for n in range(32):
             p = (fourthmax,) * n
             q = (0.0,) * n
-            self.assertEqual(dist(p, q), fourthmax * math.sqrt(n))
-            self.assertEqual(dist(q, p), fourthmax * math.sqrt(n))
+            self.assertTrue(math.isclose(dist(p, q), fourthmax * math.sqrt(n)))
+            self.assertTrue(math.isclose(dist(q, p), fourthmax * math.sqrt(n)))
 
         # Verify scaling for extremely small values
         for exp in range(32):
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 273e5533214125..5d8638db13ddf8 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -56,6 +56,7 @@ raised for division by zero and mod by zero.
 #include "pycore_bitutils.h"      // _Py_bit_length()
 #include "pycore_dtoa.h"
 #include "_math.h"
+#include <stdio.h>
 
 #include "clinic/mathmodule.c.h"
 
@@ -2434,6 +2435,21 @@ fractional digits to be dropped from *csum*.
 
 */
 
+/*
+    def scaled(x):
+        'View scaling calculation'
+        # Limits for ldexp(1, 1023) and at 1024 goes to inf
+        #            ldexp(1.0, -1022) <- normal and at -1023 goes subnormal
+        #            ldexp(1.0, -1074) <-- subnormal and at -1075 goes to zero
+        # Failed example:
+        #            x = 2.78134e-309 --> (0.9999991647435902, -1025)
+        #            1.0 / x --> inf
+        m, e = frexp(x)
+        # e = max(e, 1023)
+        scale = ldexp(1.0, -e)
+        return x*scale, scale.hex(), m, e
+ */
+
 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
@@ -2452,15 +2468,16 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     }
     frexp(max, &max_e);
     scale = ldexp(1.0, -max_e);
-    assert(max * scale >= 0.5);
-    assert(max * scale < 1.0);
+    // fprintf(stderr, "<<max: %g  max_e: %d   scale: %g>>\n", max, max_e, scale);
+    // assert(max * scale >= 0.5);
+    // assert(max * scale < 1.0);
     for (i=0 ; i < n ; i++) {
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);
         x *= scale;
         x = x*x;
-        assert(x <= 1.0);
-        assert(csum >= x);
+        // assert(x <= 1.0);
+        // assert(csum >= x);
         oldcsum = csum;
         csum += x;
         frac += (oldcsum - csum) + x;

From c853597f555db0b85fb28b4161615d9361c24ab4 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 11:24:50 -0700
Subject: [PATCH 05/10] Two code paths.  All tests pass.

---
 Modules/mathmodule.c | 56 +++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 5d8638db13ddf8..4bf383fda0f588 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2450,6 +2450,13 @@ fractional digits to be dropped from *csum*.
         return x*scale, scale.hex(), m, e
  */
 
+/*
+     A way forward:
+             Use scaling for a broad range, perhaps -1000 to 1000
+             and have a separate code path with division for the rest.
+ */
+
+
 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
@@ -2467,22 +2474,39 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         return max;
     }
     frexp(max, &max_e);
-    scale = ldexp(1.0, -max_e);
-    // fprintf(stderr, "<<max: %g  max_e: %d   scale: %g>>\n", max, max_e, scale);
-    // assert(max * scale >= 0.5);
-    // assert(max * scale < 1.0);
-    for (i=0 ; i < n ; i++) {
-        x = vec[i];
-        assert(Py_IS_FINITE(x) && fabs(x) <= max);
-        x *= scale;
-        x = x*x;
-        // assert(x <= 1.0);
-        // assert(csum >= x);
-        oldcsum = csum;
-        csum += x;
-        frac += (oldcsum - csum) + x;
-    }
-    return sqrt(csum - 1.0 + frac) / scale;
+    if (-1000 <= max_e && max_e <= 1000) {
+        scale = ldexp(1.0, -max_e);
+        // fprintf(stderr, "<<max: %g  max_e: %d   scale: %g>>\n", max, max_e, scale);
+        assert(max * scale >= 0.5);
+        assert(max * scale < 1.0);
+        for (i=0 ; i < n ; i++) {
+            x = vec[i];
+            assert(Py_IS_FINITE(x) && fabs(x) <= max);
+            x *= scale;
+            x = x*x;
+            assert(x <= 1.0);
+            assert(csum >= x);
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+        }
+        return sqrt(csum - 1.0 + frac) / scale;
+    }
+    else
+    {
+        for (i=0 ; i < n ; i++) {
+            x = vec[i];
+            assert(Py_IS_FINITE(x) && fabs(x) <= max);
+            x /= max;
+            x = x*x;
+            assert(x <= 1.0);
+            assert(csum >= x);
+            oldcsum = csum;
+            csum += x;
+            frac += (oldcsum - csum) + x;
+        }
+        return max * sqrt(csum - 1.0 + frac);
+    }
 }
 
 #define NUM_STACK_ELEMS 16

From 03ec58b8ef2589b808bf4e5431d0fb4364e397ba Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 17:09:18 -0700
Subject: [PATCH 06/10] Clean-up comments and ranges

---
 Modules/mathmodule.c | 65 ++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 4bf383fda0f588..61373627f67f09 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2407,6 +2407,13 @@ math_fmod_impl(PyObject *module, double x, double y)
 /*
 Given an *n* length *vec* of values and a value *max*, compute:
 
+    sqrt(sum((x * scale) ** 2 for x in vec)) / scale
+
+           where scale is the first power of two
+           greater than max.
+
+or compute:
+
     max * sqrt(sum((x / max) ** 2 for x in vec))
 
 The value of the *max* variable must be non-negative and
@@ -2435,28 +2442,6 @@ fractional digits to be dropped from *csum*.
 
 */
 
-/*
-    def scaled(x):
-        'View scaling calculation'
-        # Limits for ldexp(1, 1023) and at 1024 goes to inf
-        #            ldexp(1.0, -1022) <- normal and at -1023 goes subnormal
-        #            ldexp(1.0, -1074) <-- subnormal and at -1075 goes to zero
-        # Failed example:
-        #            x = 2.78134e-309 --> (0.9999991647435902, -1025)
-        #            1.0 / x --> inf
-        m, e = frexp(x)
-        # e = max(e, 1023)
-        scale = ldexp(1.0, -e)
-        return x*scale, scale.hex(), m, e
- */
-
-/*
-     A way forward:
-             Use scaling for a broad range, perhaps -1000 to 1000
-             and have a separate code path with division for the rest.
- */
-
-
 static inline double
 vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
 {
@@ -2474,9 +2459,13 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         return max;
     }
     frexp(max, &max_e);
-    if (-1000 <= max_e && max_e <= 1000) {
+    if (-1020 <= max_e && max_e <= 1020) {
+        /* Stay away from extreme values.
+           ldexp(1.0, -1075) returns zero.
+           ldexp(1.0, -1023) is subnormal.
+           ldexp(1.0, 1024) gives Inf.
+        */
         scale = ldexp(1.0, -max_e);
-        // fprintf(stderr, "<<max: %g  max_e: %d   scale: %g>>\n", max, max_e, scale);
         assert(max * scale >= 0.5);
         assert(max * scale < 1.0);
         for (i=0 ; i < n ; i++) {
@@ -2492,21 +2481,19 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         }
         return sqrt(csum - 1.0 + frac) / scale;
     }
-    else
-    {
-        for (i=0 ; i < n ; i++) {
-            x = vec[i];
-            assert(Py_IS_FINITE(x) && fabs(x) <= max);
-            x /= max;
-            x = x*x;
-            assert(x <= 1.0);
-            assert(csum >= x);
-            oldcsum = csum;
-            csum += x;
-            frac += (oldcsum - csum) + x;
-        }
-        return max * sqrt(csum - 1.0 + frac);
-    }
+    /* Near extreme values, just divide by *max*. */
+    for (i=0 ; i < n ; i++) {
+        x = vec[i];
+        assert(Py_IS_FINITE(x) && fabs(x) <= max);
+        x /= max;
+        x = x*x;
+        assert(x <= 1.0);
+        assert(csum >= x);
+        oldcsum = csum;
+        csum += x;
+        frac += (oldcsum - csum) + x;
+    }
+    return max * sqrt(csum - 1.0 + frac);
 }
 
 #define NUM_STACK_ELEMS 16

From ebf6a31812477ccbc83d5183d28f1cda826561e5 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 17:14:21 -0700
Subject: [PATCH 07/10] Remove debugging code

---
 Modules/mathmodule.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 61373627f67f09..c6939da71028e9 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -56,7 +56,6 @@ raised for division by zero and mod by zero.
 #include "pycore_bitutils.h"      // _Py_bit_length()
 #include "pycore_dtoa.h"
 #include "_math.h"
-#include <stdio.h>
 
 #include "clinic/mathmodule.c.h"
 

From 4742ef8aaac225f52e682e04a30882716cad14a9 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 17:28:01 -0700
Subject: [PATCH 08/10] Clean-up comments and over-specified tests

---
 Lib/test/test_math.py |  3 ++-
 Modules/mathmodule.c  | 12 +++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index b00256dd2a98e1..4d62eb1b119930 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -795,7 +795,8 @@ def testHypot(self):
         # Verify scaling for extremely large values
         fourthmax = FLOAT_MAX / 4.0
         for n in range(32):
-            self.assertEqual(hypot(*([fourthmax]*n)), fourthmax * math.sqrt(n))
+            self.assertTrue(math.isclose(hypot(*([fourthmax]*n)),
+                                         fourthmax * math.sqrt(n)))
 
         # Verify scaling for extremely small values
         for exp in range(32):
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index c6939da71028e9..d0621f59dfbbf1 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2458,12 +2458,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         return max;
     }
     frexp(max, &max_e);
-    if (-1020 <= max_e && max_e <= 1020) {
-        /* Stay away from extreme values.
-           ldexp(1.0, -1075) returns zero.
-           ldexp(1.0, -1023) is subnormal.
-           ldexp(1.0, 1024) gives Inf.
-        */
+    if (max_e >= -1023) {
         scale = ldexp(1.0, -max_e);
         assert(max * scale >= 0.5);
         assert(max * scale < 1.0);
@@ -2480,7 +2475,10 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         }
         return sqrt(csum - 1.0 + frac) / scale;
     }
-    /* Near extreme values, just divide by *max*. */
+    /* When max_e < -1023, a scale computed given by ldexp(1.0, -max_e)
+       would be subnormal and break range invariants for max*scale.
+       Instead, we just divide by *max*.
+    */
     for (i=0 ; i < n ; i++) {
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);

From 5e9f1a9cbf5a83188b8d866be60e9671be9989b2 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 9 Aug 2020 18:16:14 -0700
Subject: [PATCH 09/10] Add blurb

---
 .../next/Library/2020-08-09-18-16-05.bpo-41513.e6K6EK.rst       | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2020-08-09-18-16-05.bpo-41513.e6K6EK.rst

diff --git a/Misc/NEWS.d/next/Library/2020-08-09-18-16-05.bpo-41513.e6K6EK.rst b/Misc/NEWS.d/next/Library/2020-08-09-18-16-05.bpo-41513.e6K6EK.rst
new file mode 100644
index 00000000000000..cfb9f98c376a02
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-08-09-18-16-05.bpo-41513.e6K6EK.rst
@@ -0,0 +1,2 @@
+Minor algorithmic improvement to math.hypot() and math.dist() giving small
+gains in speed and accuracy.

From 5c6478f2f88f0cb6ccd65f0fe17f25ffbf059a12 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 15 Aug 2020 10:53:38 -0700
Subject: [PATCH 10/10] Improve the comments.

---
 Modules/mathmodule.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index d0621f59dfbbf1..489802cc367450 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2432,13 +2432,18 @@ The *csum* variable tracks the cumulative sum and *frac* tracks
 the cumulative fractional errors at each step.  Since this
 variant assumes that |csum| >= |x| at each step, we establish
 the precondition by starting the accumulation from 1.0 which
-represents the largest possible value of (x/max)**2.
+represents the largest possible value of (x*scale)**2 or (x/max)**2.
 
 After the loop is finished, the initial 1.0 is subtracted out
 for a net zero effect on the final sum.  Since *csum* will be
 greater than 1.0, the subtraction of 1.0 will not cause
 fractional digits to be dropped from *csum*.
 
+To get the full benefit from compensated summation, the
+largest addend should be in the range:   0.5 <= x <= 1.0.
+Accordingly, scaling or division by *max* should not be skipped
+even if not otherwise needed to prevent overflow or loss of precision.
+
 */
 
 static inline double
@@ -2475,9 +2480,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
         }
         return sqrt(csum - 1.0 + frac) / scale;
     }
-    /* When max_e < -1023, a scale computed given by ldexp(1.0, -max_e)
-       would be subnormal and break range invariants for max*scale.
-       Instead, we just divide by *max*.
+    /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
+       So instead of multiplying by a scale, we just divide by *max*.
     */
     for (i=0 ; i < n ; i++) {
         x = vec[i];