diff --git a/doc/release/upcoming_changes/31332.improvement.rst b/doc/release/upcoming_changes/31332.improvement.rst new file mode 100644 index 000000000000..f8a0661ff0c3 --- /dev/null +++ b/doc/release/upcoming_changes/31332.improvement.rst @@ -0,0 +1,6 @@ +Structured dtypes now support larger field sizes +------------------------------------------------ +It is now possible to construct structured data types with +field sizes and offsets that exceed the size of a standard C +integer. Arrays using these structured data types are now +also possible to construct. diff --git a/numpy/_core/src/multiarray/ctors.c b/numpy/_core/src/multiarray/ctors.c index 7d5bc2d79c41..68f684bec757 100644 --- a/numpy/_core/src/multiarray/ctors.c +++ b/numpy/_core/src/multiarray/ctors.c @@ -3460,7 +3460,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea { PyArrayObject *r; npy_off_t start, numbytes; - int elsize; + npy_intp elsize; if (num < 0) { int fail = 0; diff --git a/numpy/_core/src/multiarray/descriptor.c b/numpy/_core/src/multiarray/descriptor.c index a45206e3b5ef..e37db292b6d8 100644 --- a/numpy/_core/src/multiarray/descriptor.c +++ b/numpy/_core/src/multiarray/descriptor.c @@ -316,22 +316,16 @@ _convert_from_tuple(PyObject *obj, int align) "dimension smaller then zero."); goto fail; } - if (shape.ptr[i] > NPY_MAX_INT) { - PyErr_SetString(PyExc_ValueError, - "invalid shape in fixed-type tuple: " - "dimension does not fit into a C int."); - goto fail; - } } npy_intp items = PyArray_OverflowMultiplyList(shape.ptr, shape.len); int overflowed; - int nbytes; - if (items < 0 || items > NPY_MAX_INT) { + npy_intp nbytes; + if (items < 0) { overflowed = 1; } else { - overflowed = npy_mul_with_overflow_int( - &nbytes, type->elsize, (int) items); + overflowed = npy_mul_sizes_with_overflow( + &nbytes, type->elsize, items); } if (overflowed) { PyErr_SetString(PyExc_ValueError, @@ -370,7 +364,7 @@ _convert_from_tuple(PyObject *obj, int align) } for (int i=0; i < shape.len; i++) { PyTuple_SET_ITEM(newdescr->subarray->shape, i, - PyLong_FromLong((long)shape.ptr[i])); + PyLong_FromSsize_t(shape.ptr[i])); if (PyTuple_GET_ITEM(newdescr->subarray->shape, i) == NULL) { Py_DECREF(newdescr); @@ -410,7 +404,7 @@ _convert_from_array_descr(PyObject *obj, int align) /* Types with fields need the Python C API for field access */ npy_uint64 dtypeflags = NPY_NEEDS_PYAPI; int maxalign = 1; - int totalsize = 0; + npy_intp totalsize = 0; PyObject *fields = PyDict_New(); if (!fields) { Py_DECREF(nameslist); @@ -527,7 +521,7 @@ _convert_from_array_descr(PyObject *obj, int align) goto fail; } PyTuple_SET_ITEM(tup, 0, (PyObject *)conv); - PyTuple_SET_ITEM(tup, 1, PyLong_FromLong((long) totalsize)); + PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(totalsize)); /* * Title can be "meta-data". Only insert it @@ -633,7 +627,7 @@ _convert_from_list(PyObject *obj, int align) /* Types with fields need the Python C API for field access */ npy_uint64 dtypeflags = NPY_NEEDS_PYAPI; int maxalign = 1; - int totalsize = 0; + npy_intp totalsize = 0; for (int i = 0; i < n; i++) { PyArray_Descr *conv = _convert_from_any( PyList_GET_ITEM(obj, i), align); // noqa: borrowed-ref OK @@ -648,7 +642,7 @@ _convert_from_list(PyObject *obj, int align) } maxalign = PyArray_MAX(maxalign, _align); } - PyObject *size_obj = PyLong_FromLong((long) totalsize); + PyObject *size_obj = PyLong_FromSsize_t(totalsize); if (!size_obj) { Py_DECREF(conv); goto fail; @@ -1101,7 +1095,7 @@ _convert_from_dict(PyObject *obj, int align) /* Types with fields need the Python C API for field access */ npy_uint64 dtypeflags = NPY_NEEDS_PYAPI; - int totalsize = 0; + npy_intp totalsize = 0; int maxalign = 1; int has_out_of_order_fields = 0; for (int i = 0; i < n; i++) { @@ -1146,7 +1140,7 @@ _convert_from_dict(PyObject *obj, int align) Py_DECREF(ind); goto fail; } - long offset = PyArray_PyIntAsInt(off); + npy_intp offset = PyArray_PyIntAsIntp(off); if (error_converting(offset)) { Py_DECREF(off); Py_DECREF(tup); @@ -1162,7 +1156,7 @@ _convert_from_dict(PyObject *obj, int align) goto fail; } - PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(offset)); + PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(offset)); /* Flag whether the fields are specified out of order */ if (offset < totalsize) { has_out_of_order_fields = 1; @@ -1186,7 +1180,7 @@ _convert_from_dict(PyObject *obj, int align) if (align && _align > 1) { totalsize = NPY_NEXT_ALIGNED_OFFSET(totalsize, _align); } - PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(totalsize)); + PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(totalsize)); totalsize += newdescr->elsize; } if (len == 3) { @@ -1803,7 +1797,7 @@ _convert_from_str(PyObject *obj, int align) } int check_num = NPY_NOTYPE + 10; - int elsize = 0; + npy_intp elsize = 0; /* A typecode like 'd' */ if (len == 1) { /* Python byte string characters are unsigned */ @@ -1816,7 +1810,7 @@ _convert_from_str(PyObject *obj, int align) /* Attempt to parse the integer, make sure it's the rest of the string */ errno = 0; - long result = strtol(type + 1, &typeend, 10); + npy_intp result = strtol(type + 1, &typeend, 10); npy_bool some_parsing_happened = !(type == typeend); npy_bool entire_string_consumed = *typeend == '\0'; npy_bool parsing_succeeded = @@ -1826,7 +1820,7 @@ _convert_from_str(PyObject *obj, int align) goto fail; } - elsize = (int)result; + elsize = result; if (parsing_succeeded && typeend - type == len) { @@ -2723,7 +2717,8 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args)) PyObject *ret, *mod, *obj; PyObject *state; char endian; - int elsize, alignment; + npy_intp elsize; + int alignment; ret = PyTuple_New(3); if (ret == NULL) { @@ -2825,7 +2820,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args)) elsize = -1; alignment = -1; } - PyTuple_SET_ITEM(state, 5, PyLong_FromLong(elsize)); + PyTuple_SET_ITEM(state, 5, PyLong_FromSsize_t(elsize)); PyTuple_SET_ITEM(state, 6, PyLong_FromLong(alignment)); PyTuple_SET_ITEM(state, 7, PyLong_FromUnsignedLongLong( self->flags & ~NPY_NOT_TRIVIALLY_COPYABLE)); diff --git a/numpy/_core/tests/test_dtype.py b/numpy/_core/tests/test_dtype.py index 6464ccd61f9d..a503057ca48f 100644 --- a/numpy/_core/tests/test_dtype.py +++ b/numpy/_core/tests/test_dtype.py @@ -26,7 +26,7 @@ assert_equal, assert_raises, ) -from numpy.testing._private.utils import requires_deep_recursion +from numpy.testing._private.utils import requires_deep_recursion, requires_memory def assert_dtype_equal(a, b): @@ -737,13 +737,10 @@ def test_shape_matches_ndim(self): def test_shape_invalid(self): # Check that the shape is valid. - max_int = np.iinfo(np.intc).max max_intp = np.iinfo(np.intp).max # Too large values (the datatype is part of this) - assert_raises(ValueError, np.dtype, [('a', 'f4', max_int // 4 + 1)]) - assert_raises(ValueError, np.dtype, [('a', 'f4', max_int + 1)]) - assert_raises(ValueError, np.dtype, [('a', 'f4', (max_int, 2))]) - # Takes a different code path (fails earlier: + assert_raises(ValueError, np.dtype, [('a', 'f8', max_intp // 8 + 1)]) + assert_raises(ValueError, np.dtype, [('a', 'f4', max_intp // 4 + 1)]) assert_raises(ValueError, np.dtype, [('a', 'f4', max_intp + 1)]) # Negative values assert_raises(ValueError, np.dtype, [('a', 'f4', -1)]) @@ -1252,7 +1249,7 @@ def test_structured(self, dtype, random): class TestPickling: - def check_pickling(self, dtype): + def check_pickling(self, dtype, arr_assert=True): for proto in range(pickle.HIGHEST_PROTOCOL + 1): buf = pickle.dumps(dtype, proto) # The dtype pickling itself pickles `np.dtype` if it is pickled @@ -1262,22 +1259,25 @@ def check_pickling(self, dtype): pickled = pickle.loads(buf) assert_equal(pickled, dtype) assert_equal(pickled.descr, dtype.descr) + assert_equal(pickled.itemsize, dtype.itemsize) if dtype.metadata is not None: assert_equal(pickled.metadata, dtype.metadata) - # Check the reconstructed dtype is functional - x = np.zeros(3, dtype=dtype) - y = np.zeros(3, dtype=pickled) - assert_equal(x, y) - assert_equal(x[0], y[0]) + # some large structured dtypes are too large to + # reasonably compare across all elements + if arr_assert: + # Check the reconstructed dtype is functional + x = np.zeros(3, dtype=dtype) + y = np.zeros(3, dtype=pickled) + assert_equal(x, y) + assert_equal(x[0], y[0]) @pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system") - @pytest.mark.xfail(reason="dtype conversion doesn't allow this yet.") def test_pickling_large(self): # The actual itemsize is larger than a c-integer here. dtype = np.dtype(f"({2**31},)i") - self.check_pickling(dtype) + self.check_pickling(dtype, False) dtype = np.dtype(f"({2**31},)i", metadata={"a": "b"}) - self.check_pickling(dtype) + self.check_pickling(dtype, False) @pytest.mark.parametrize('t', [int, float, complex, np.int32, str, object, bool]) @@ -2049,3 +2049,39 @@ def test_signature_dtypes_classes(self, typename: str): params_actual = set(sig.parameters) assert params_actual == params_expect + + +@pytest.mark.parametrize("kind, exp", [ + ([("x", np.float64, 2 ** 28)], (2 ** 28 * 8)), + ([("x", np.float64, 2 ** 27), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)), + ([("x", np.float32, 2 ** 28), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)), + ([("x", np.float16, 2 ** 29), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)), + ("2147483648i,2147483648i", 17179869184), + ("2147483648f,2147483648f", 17179869184), + ("2147483648d,2147483648d", 34359738368), + ("2b,2147483648b,2f,4i", 2147483674), + (dict(names=["a"], formats=["2147483648i"]), 8589934592), + (dict(names=["a"], formats=["2147483648i"], offsets=[1]), 8589934593), + (dict(names=["a"], formats=["2147483648i"], offsets=[2 ** 31 - 100]), 10737418140), + (dict(names=["a"], formats=["2147483648i"], offsets=[2 ** 31]), 10737418240), + (dict(names=["a", "b", "c"], formats=["2147483648b", "16i", "12f"], + offsets=[2 ** 31, 2 ** 32, 2 ** 32 + 69]), 4294967413), +]) +@pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system") +def test_gh_31308(kind, exp): + kind_dtype = np.dtype(kind) + assert kind_dtype.itemsize == exp + for name in kind_dtype.names: + assert kind_dtype[name].shape[0] > 0 + + +@pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system") +@requires_memory(free_bytes=2e9) +@pytest.mark.parametrize("val, kind, exp", [ + ((1,), [("x", np.float64, 2 ** 28)], 2 ** 28), + ((1, 1), [("x", np.float64, 2 ** 28), ("y", np.float64, 1)], 2 ** 28), +]) +def test_gh_31308_materialized(val, kind, exp): + kind_dtype = np.dtype(kind) + rec_arr = np.array(val, dtype=kind_dtype) + assert rec_arr["x"].size == exp diff --git a/numpy/_core/tests/test_records.py b/numpy/_core/tests/test_records.py index 9387e8aa9a83..9624589cb761 100644 --- a/numpy/_core/tests/test_records.py +++ b/numpy/_core/tests/test_records.py @@ -9,13 +9,16 @@ import numpy as np from numpy.testing import ( + IS_64BIT, assert_, assert_array_almost_equal, assert_array_equal, assert_equal, + assert_allclose, assert_raises, temppath, ) +from numpy.testing._private.utils import requires_memory class TestFromrecords: @@ -108,6 +111,19 @@ def test_recarray_fromfile(self): assert_equal(r1, r2) assert_equal(r2, r3) + @pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system") + @requires_memory(free_bytes=4.3e9) + def test_recarray_fromfile_massive(self, tmpdir): + kind = [("x", np.float64, 2 ** 29)] + kind_dtype = np.dtype(kind) + rec_arr = np.array((1,), dtype=kind_dtype) + with tmpdir.as_cwd(): + rec_arr.tofile("f.data") + actual = np.fromfile("f.data", dtype=kind_dtype) + assert actual.itemsize == 2 ** 29 * 8 + item = actual["x"][0][1] + assert_allclose(item, 1) + def test_recarray_from_obj(self): count = 10 a = np.zeros(count, dtype='O')