Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Lib/test/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,8 @@ def testHypot(self):
# Verify scaling for extremely large values
fourthmax = FLOAT_MAX / 4.0
for n in range(32):
self.assertEqual(hypot(*([fourthmax]*n)), fourthmax * math.sqrt(n))
self.assertTrue(math.isclose(hypot(*([fourthmax]*n)),
Comment thread
rhettinger marked this conversation as resolved.
fourthmax * math.sqrt(n)))

# Verify scaling for extremely small values
for exp in range(32):
Expand Down Expand Up @@ -904,8 +905,8 @@ class T(tuple):
for n in range(32):
p = (fourthmax,) * n
q = (0.0,) * n
self.assertEqual(dist(p, q), fourthmax * math.sqrt(n))
self.assertEqual(dist(q, p), fourthmax * math.sqrt(n))
self.assertTrue(math.isclose(dist(p, q), fourthmax * math.sqrt(n)))
self.assertTrue(math.isclose(dist(q, p), fourthmax * math.sqrt(n)))

# Verify scaling for extremely small values
for exp in range(32):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Minor algorithmic improvement to math.hypot() and math.dist() giving small
gains in speed and accuracy.
41 changes: 38 additions & 3 deletions Modules/mathmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -2406,6 +2406,13 @@ math_fmod_impl(PyObject *module, double x, double y)
/*
Given an *n* length *vec* of values and a value *max*, compute:

sqrt(sum((x * scale) ** 2 for x in vec)) / scale

where 1/scale is the first power of two
greater than max (so scale = 2**-max_e, and max * scale lies in [0.5, 1.0)).

or compute:

max * sqrt(sum((x / max) ** 2 for x in vec))

The value of the *max* variable must be non-negative and
Expand All @@ -2425,19 +2432,25 @@ The *csum* variable tracks the cumulative sum and *frac* tracks
the cumulative fractional errors at each step. Since this
variant assumes that |csum| >= |x| at each step, we establish
the precondition by starting the accumulation from 1.0 which
represents the largest possible value of (x/max)**2.
represents the largest possible value of (x*scale)**2 or (x/max)**2.

After the loop is finished, the initial 1.0 is subtracted out
for a net zero effect on the final sum. Since *csum* will be
greater than 1.0, the subtraction of 1.0 will not cause
fractional digits to be dropped from *csum*.

To get the full benefit from compensated summation, the
largest addend should be in the range: 0.5 <= x <= 1.0.
Accordingly, scaling or division by *max* should not be skipped
even if not otherwise needed to prevent overflow or loss of precision.

*/

static inline double
vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
{
double x, csum = 1.0, oldcsum, frac = 0.0;
double x, csum = 1.0, oldcsum, frac = 0.0, scale;
int max_e;
Py_ssize_t i;

if (Py_IS_INFINITY(max)) {
Expand All @@ -2449,14 +2462,36 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
if (max == 0.0 || n <= 1) {
return max;
}
frexp(max, &max_e);
if (max_e >= -1023) {
Comment thread
rhettinger marked this conversation as resolved.
scale = ldexp(1.0, -max_e);
assert(max * scale >= 0.5);
assert(max * scale < 1.0);
for (i=0 ; i < n ; i++) {
x = vec[i];
assert(Py_IS_FINITE(x) && fabs(x) <= max);
x *= scale;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is faster, x *= scale or x = ldexp(x, -max_e)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • The test as written was over-specified. It should have been written this way from the start.

  • The x *= scale is faster than x = ldexp(x, -max_e). The former is a single, fast in-line instruction and the latter is an external library call.

Here's the generated code for the loop:

L284:
    movsd   (%r12,%rax,8), %xmm0
    addq    $1, %rax
    cmpq    %rax, %rbp
    mulsd   %xmm4, %xmm0           <-- x *= scale
    movapd  %xmm0, %xmm1
    mulsd   %xmm0, %xmm1           <-- x *= x
    movapd  %xmm2, %xmm0
    addsd   %xmm1, %xmm2            <-- csum += x
    subsd   %xmm2, %xmm0
    addsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm3
    jg  L284
    subsd   %xmm

x = x*x;
assert(x <= 1.0);
assert(csum >= x);
oldcsum = csum;
csum += x;
frac += (oldcsum - csum) + x;
}
return sqrt(csum - 1.0 + frac) / scale;
}
/* When max_e < -1023, ldexp(1.0, -max_e) overflows.
So instead of multiplying by a scale, we just divide by *max*.
*/
for (i=0 ; i < n ; i++) {
x = vec[i];
assert(Py_IS_FINITE(x) && fabs(x) <= max);
x /= max;
x = x*x;
assert(x <= 1.0);
assert(csum >= x);
oldcsum = csum;
csum += x;
assert(csum >= x);
frac += (oldcsum - csum) + x;
}
return max * sqrt(csum - 1.0 + frac);
Expand Down