gh-87790: support underscore for formatting fractional part of floats

skirpichev · skirpichev · commit 5858d8c84f33 · 2024-10-11T12:04:08.000+03:00
```pycon
>>> f"{123_456.123_456:_._f}"  # Whole and fractional
'123_456.123_456'
>>> f"{123_456.123_456:_f}"    # Integer component only
'123_456.123456'
>>> f"{123_456.123_456:._f}"   # Fractional component only
'123456.123_456'
>>> f"{123_456.123_456:.4_f}"  # With precision
'123456.1_235'
```
diff --git a/Doc/library/string.rst b/Doc/library/string.rst
@@ -312,12 +312,13 @@ non-empty format specification typically modifies the result.
 The general form of a *standard format specifier* is:
 
 .. productionlist:: format-spec
-   format_spec: [[`fill`]`align`][`sign`]["z"]["#"]["0"][`width`][`grouping_option`]["." `precision`][`type`]
+   format_spec: [[`fill`]`align`][`sign`]["z"]["#"]["0"][`width`][`grouping_option`]["." (`precision` [`fraction_grouping`] | `fraction_grouping`)][`type`]
    fill: <any character>
    align: "<" | ">" | "=" | "^"
    sign: "+" | "-" | " "
    width: `~python-grammar:digit`+
    grouping_option: "_" | ","
+   fraction_grouping: "_"
    precision: `~python-grammar:digit`+
    type: "b" | "c" | "d" | "e" | "E" | "f" | "F" | "g" | "G" | "n" | "o" | "s" | "x" | "X" | "%"
 
@@ -448,6 +449,13 @@ indicates the maximum field size - in other words, how many characters will be
 used from the field content.  The *precision* is not allowed for integer
 presentation types.
 
+The ``'_'`` option after *precision* signals the use of an underscore as a
+thousands separator for the fractional part of floating-point presentation
+types.
+
+.. versionchanged:: 3.14
+   Support underscore as a thousands separator for the fractional part.
+
 Finally, the *type* determines how the data should be presented.
 
 The available string presentation types are:
diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py
@@ -754,6 +754,18 @@ def test_format(self):
         self.assertEqual(format(INF, 'f'), 'inf')
         self.assertEqual(format(INF, 'F'), 'INF')
 
+        # underscores
+        x = 123_456.123_456
+        self.assertEqual(format(x, '_f'), '123_456.123456')
+        self.assertEqual(format(x, '._f'), '123456.123_456')
+        self.assertEqual(format(x, '_._f'), '123_456.123_456')
+        self.assertEqual(format(x, '.10_f'), '123456.1_234_560_000')
+        self.assertEqual(format(x, '>21._f'), '       123456.123_456')
+        self.assertEqual(format(x, '<21._f'), '123456.123_456       ')
+        self.assertEqual(format(x, '+.11_e'), '+1.23_456_123_456e+05')
+
+        self.assertRaises(ValueError, format, x , '._6f')
+
     @support.requires_IEEE_754
     def test_format_testfile(self):
         with open(format_testfile, encoding="utf-8") as testfile:
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-10-11-10-41-05.gh-issue-87790.mlfEGl.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-11-10-41-05.gh-issue-87790.mlfEGl.rst
@@ -0,0 +1,3 @@
+Support underscore as a thousands separator in the fractional part for
+floating-point presentation types of the new-style string formatting (with
+:func:`format` or :ref:`f-strings`).  Patch by Sergey B Kirpichev.
diff --git a/Python/formatter_unicode.c b/Python/formatter_unicode.c
@@ -135,6 +135,7 @@ typedef struct {
     Py_ssize_t width;
     enum LocaleType thousands_separators;
     Py_ssize_t precision;
+    enum LocaleType frac_thousands_separator;
     Py_UCS4 type;
 } InternalFormatSpec;
 
@@ -171,6 +172,7 @@ parse_internal_render_format_spec(PyObject *obj,
     format->sign = '\0';
     format->width = -1;
     format->thousands_separators = LT_NO_LOCALE;
+    format->frac_thousands_separator = LT_NO_LOCALE;
     format->precision = -1;
     format->type = default_type;
 
@@ -260,7 +262,16 @@ parse_internal_render_format_spec(PyObject *obj,
             /* Overflow error. Exception already set. */
             return 0;
 
-        /* Not having a precision after a dot is an error. */
+        if (end-pos && READ_spec(pos) == '_') {
+            if (consumed == 0) {
+                format->precision = -1;
+            }
+            format->frac_thousands_separator = LT_UNDERSCORE_LOCALE;
+            ++pos;
+            ++consumed;
+        }
+
+        /* Not having a precision or underscore after a dot is an error. */
         if (consumed == 0) {
             PyErr_Format(PyExc_ValueError,
                          "Format specifier missing precision");
@@ -402,6 +413,7 @@ fill_padding(_PyUnicodeWriter *writer,
 typedef struct {
     PyObject *decimal_point;
     PyObject *thousands_sep;
+    PyObject *frac_thousands_sep;
     const char *grouping;
     char *grouping_buffer;
 } LocaleInfo;
@@ -423,6 +435,8 @@ typedef struct {
     Py_ssize_t n_remainder; /* Digits in decimal and/or exponent part,
                                excluding the decimal itself, if
                                present. */
+    Py_ssize_t n_frac;
+    Py_ssize_t n_grouped_frac_digits;
 
     /* These 2 are not the widths of fields, but are needed by
        STRINGLIB_GROUPING. */
@@ -445,24 +459,32 @@ typedef struct {
 */
 static void
 parse_number(PyObject *s, Py_ssize_t pos, Py_ssize_t end,
-             Py_ssize_t *n_remainder, int *has_decimal)
+             Py_ssize_t *n_remainder, Py_ssize_t *n_frac, int *has_decimal)
 {
-    Py_ssize_t remainder;
+    Py_ssize_t frac;
     int kind = PyUnicode_KIND(s);
     const void *data = PyUnicode_DATA(s);
 
-    while (pos<end && Py_ISDIGIT(PyUnicode_READ(kind, data, pos)))
+    while (pos<end && Py_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
         ++pos;
-    remainder = pos;
+    }
+    frac = pos;
 
     /* Does remainder start with a decimal point? */
-    *has_decimal = pos<end && PyUnicode_READ(kind, data, remainder) == '.';
+    *has_decimal = pos<end && PyUnicode_READ(kind, data, frac) == '.';
 
     /* Skip the decimal point. */
-    if (*has_decimal)
-        remainder++;
+    if (*has_decimal) {
+        frac++;
+        pos++;
+    }
+
+    while (pos<end && Py_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
+        ++pos;
+    }
 
-    *n_remainder = end - remainder;
+    *n_frac = pos - frac;
+    *n_remainder = end - pos;
 }
 
 /* not all fields of format are used.  for example, precision is
@@ -473,18 +495,19 @@ parse_number(PyObject *s, Py_ssize_t pos, Py_ssize_t end,
 static Py_ssize_t
 calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
                    Py_UCS4 sign_char, Py_ssize_t n_start,
-                   Py_ssize_t n_end, Py_ssize_t n_remainder,
+                   Py_ssize_t n_end, Py_ssize_t n_remainder, Py_ssize_t n_frac,
                    int has_decimal, const LocaleInfo *locale,
                    const InternalFormatSpec *format, Py_UCS4 *maxchar)
 {
     Py_ssize_t n_non_digit_non_padding;
     Py_ssize_t n_padding;
 
-    spec->n_digits = n_end - n_start - n_remainder - (has_decimal?1:0);
+    spec->n_digits = n_end - n_start - n_frac - n_remainder - (has_decimal?1:0);
     spec->n_lpadding = 0;
     spec->n_prefix = n_prefix;
     spec->n_decimal = has_decimal ? PyUnicode_GET_LENGTH(locale->decimal_point) : 0;
     spec->n_remainder = n_remainder;
+    spec->n_frac = n_frac;
     spec->n_spadding = 0;
     spec->n_rpadding = 0;
     spec->sign = '\0';
@@ -530,7 +553,7 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
 
     /* The number of chars used for non-digits and non-padding. */
     n_non_digit_non_padding = spec->n_sign + spec->n_prefix + spec->n_decimal +
-        spec->n_remainder;
+        + spec->n_frac + spec->n_remainder;
 
     /* min_width can go negative, that's okay. format->width == -1 means
        we don't care. */
@@ -557,12 +580,29 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
         *maxchar = Py_MAX(*maxchar, grouping_maxchar);
     }
 
+    if (spec->n_frac == 0) {
+        spec->n_grouped_frac_digits = 0;
+    }
+    else {
+        Py_UCS4 grouping_maxchar;
+        spec->n_grouped_frac_digits = _PyUnicode_InsertThousandsGrouping(
+            NULL, 0,
+            NULL, 0, spec->n_frac,
+            spec->n_frac,
+            locale->grouping, locale->frac_thousands_sep, &grouping_maxchar);
+        if (spec->n_grouped_frac_digits == -1) {
+            return -1;
+        }
+        *maxchar = Py_MAX(*maxchar, grouping_maxchar);
+    }
+
     /* Given the desired width and the total of digit and non-digit
        space we consume, see if we need any padding. format->width can
        be negative (meaning no padding), but this code still works in
        that case. */
     n_padding = format->width -
-                        (n_non_digit_non_padding + spec->n_grouped_digits);
+                        (n_non_digit_non_padding + spec->n_grouped_digits
+                         + spec->n_grouped_frac_digits - spec->n_frac);
     if (n_padding > 0) {
         /* Some padding is needed. Determine if it's left, space, or right. */
         switch (format->align) {
@@ -593,7 +633,7 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
 
     return spec->n_lpadding + spec->n_sign + spec->n_prefix +
         spec->n_spadding + spec->n_grouped_digits + spec->n_decimal +
-        spec->n_remainder + spec->n_rpadding;
+        spec->n_grouped_frac_digits + spec->n_remainder + spec->n_rpadding;
 }
 
 /* Fill in the digit parts of a number's string representation,
@@ -677,6 +717,19 @@ fill_number(_PyUnicodeWriter *writer, const NumberFieldWidths *spec,
         d_pos += 1;
     }
 
+    if (spec->n_frac) {
+        r = _PyUnicode_InsertThousandsGrouping(
+                writer, spec->n_grouped_frac_digits,
+                digits, d_pos, spec->n_frac, spec->n_frac,
+                locale->grouping, locale->frac_thousands_sep, NULL);
+        if (r == -1) {
+            return -1;
+        }
+        assert(r == spec->n_grouped_frac_digits);
+        d_pos += spec->n_frac;
+        writer->pos += spec->n_grouped_frac_digits;
+    }
+
     if (spec->n_remainder) {
         _PyUnicode_FastCopyCharacters(
             writer->buffer, writer->pos,
@@ -701,7 +754,8 @@ static const char no_grouping[1] = {CHAR_MAX};
    LT_CURRENT_LOCALE, a hard-coded locale if LT_DEFAULT_LOCALE or
    LT_UNDERSCORE_LOCALE/LT_UNDER_FOUR_LOCALE, or none if LT_NO_LOCALE. */
 static int
-get_locale_info(enum LocaleType type, LocaleInfo *locale_info)
+get_locale_info(enum LocaleType type, enum LocaleType frac_type,
+                 LocaleInfo *locale_info)
 {
     switch (type) {
     case LT_CURRENT_LOCALE: {
@@ -746,6 +800,15 @@ get_locale_info(enum LocaleType type, LocaleInfo *locale_info)
         locale_info->grouping = no_grouping;
         break;
     }
+    if (frac_type == LT_UNDERSCORE_LOCALE) {
+        locale_info->frac_thousands_sep = PyUnicode_FromOrdinal('_');
+        if (locale_info->grouping == no_grouping) {
+            locale_info->grouping = "\3";
+        }
+    }
+    else {
+        locale_info->frac_thousands_sep = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
+    }
     return 0;
 }
 
@@ -754,6 +817,7 @@ free_locale_info(LocaleInfo *locale_info)
 {
     Py_XDECREF(locale_info->decimal_point);
     Py_XDECREF(locale_info->thousands_sep);
+    Py_XDECREF(locale_info->frac_thousands_sep);
     PyMem_Free(locale_info->grouping_buffer);
 }
 
@@ -1005,13 +1069,13 @@ format_long_internal(PyObject *value, const InternalFormatSpec *format,
 
     /* Determine the grouping, separator, and decimal point, if any. */
     if (get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE :
-                        format->thousands_separators,
+                        format->thousands_separators, 0,
                         &locale) == -1)
         goto done;
 
     /* Calculate how much memory we'll need. */
     n_total = calc_number_widths(&spec, n_prefix, sign_char, inumeric_chars,
-                                 inumeric_chars + n_digits, n_remainder, 0,
+                                 inumeric_chars + n_digits, n_remainder, 0, 0,
                                  &locale, format, &maxchar);
     if (n_total == -1) {
         goto done;
@@ -1046,6 +1110,7 @@ format_float_internal(PyObject *value,
     char *buf = NULL;       /* buffer returned from PyOS_double_to_string */
     Py_ssize_t n_digits;
     Py_ssize_t n_remainder;
+    Py_ssize_t n_frac;
     Py_ssize_t n_total;
     int has_decimal;
     double val;
@@ -1125,7 +1190,8 @@ format_float_internal(PyObject *value,
     if (format->sign != '+' && format->sign != ' '
         && format->width == -1
         && format->type != 'n'
-        && !format->thousands_separators)
+        && !format->thousands_separators
+        && !format->frac_thousands_separator)
     {
         /* Fast path */
         result = _PyUnicodeWriter_WriteASCIIString(writer, buf, n_digits);
@@ -1151,18 +1217,20 @@ format_float_internal(PyObject *value,
 
     /* Determine if we have any "remainder" (after the digits, might include
        decimal or exponent or both (or neither)) */
-    parse_number(unicode_tmp, index, index + n_digits, &n_remainder, &has_decimal);
+    parse_number(unicode_tmp, index, index + n_digits,
+                 &n_remainder, &n_frac, &has_decimal);
 
     /* Determine the grouping, separator, and decimal point, if any. */
     if (get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE :
                         format->thousands_separators,
+                        format->frac_thousands_separator,
                         &locale) == -1)
         goto done;
 
     /* Calculate how much memory we'll need. */
     n_total = calc_number_widths(&spec, 0, sign_char, index,
-                                 index + n_digits, n_remainder, has_decimal,
-                                 &locale, format, &maxchar);
+                                 index + n_digits, n_remainder, n_frac,
+                                 has_decimal, &locale, format, &maxchar);
     if (n_total == -1) {
         goto done;
     }
@@ -1202,6 +1270,8 @@ format_complex_internal(PyObject *value,
     Py_ssize_t n_im_digits;
     Py_ssize_t n_re_remainder;
     Py_ssize_t n_im_remainder;
+    Py_ssize_t n_re_frac;
+    Py_ssize_t n_im_frac;
     Py_ssize_t n_re_total;
     Py_ssize_t n_im_total;
     int re_has_decimal;
@@ -1330,13 +1400,14 @@ format_complex_internal(PyObject *value,
     /* Determine if we have any "remainder" (after the digits, might include
        decimal or exponent or both (or neither)) */
     parse_number(re_unicode_tmp, i_re, i_re + n_re_digits,
-                 &n_re_remainder, &re_has_decimal);
+                 &n_re_remainder, &n_re_frac, &re_has_decimal);
     parse_number(im_unicode_tmp, i_im, i_im + n_im_digits,
-                 &n_im_remainder, &im_has_decimal);
+                 &n_im_remainder, &n_im_frac, &im_has_decimal);
 
     /* Determine the grouping, separator, and decimal point, if any. */
     if (get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE :
                         format->thousands_separators,
+                        format->frac_thousands_separator,
                         &locale) == -1)
         goto done;
 
@@ -1349,8 +1420,8 @@ format_complex_internal(PyObject *value,
     /* Calculate how much memory we'll need. */
     n_re_total = calc_number_widths(&re_spec, 0, re_sign_char,
                                     i_re, i_re + n_re_digits, n_re_remainder,
-                                    re_has_decimal, &locale, &tmp_format,
-                                    &maxchar);
+                                    n_re_frac, re_has_decimal, &locale,
+                                    &tmp_format, &maxchar);
     if (n_re_total == -1) {
         goto done;
     }
@@ -1362,8 +1433,8 @@ format_complex_internal(PyObject *value,
         tmp_format.sign = '+';
     n_im_total = calc_number_widths(&im_spec, 0, im_sign_char,
                                     i_im, i_im + n_im_digits, n_im_remainder,
-                                    im_has_decimal, &locale, &tmp_format,
-                                    &maxchar);
+                                    n_im_frac, im_has_decimal, &locale,
+                                    &tmp_format, &maxchar);
     if (n_im_total == -1) {
         goto done;
     }

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Support underscore as a thousands separator in the fractional part for`
	`2`	`+floating-point presentation types of the new-style string formatting (with`
	`3`	``+:func:`format` or :ref:`f-strings`). Patch by Sergey B Kirpichev.``