6 changes: 6 additions & 0 deletions doc/release/upcoming_changes/31332.improvement.rst
@@ -0,0 +1,6 @@
+Structured dtypes now support larger field sizes
+------------------------------------------------
+It is now possible to construct structured data types with
+field sizes and offsets that exceed the size of a standard C
+integer. It is now also possible to construct arrays that
+use these structured data types.
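
A minimal sketch of what the release note describes (assuming a 64-bit build with this change applied):

import numpy as np

# 2**31 float32 values -> an itemsize of 2**33 bytes, which does not fit
# in a 32-bit C int but now fits in npy_intp on 64-bit builds.
dt = np.dtype([("a", "f4", 2**31)])
assert dt.itemsize == 2**31 * 4  # 8589934592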
2 changes: 1 addition & 1 deletion numpy/_core/src/multiarray/ctors.c
@@ -3460,7 +3460,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nread
 {
     PyArrayObject *r;
     npy_off_t start, numbytes;
-    int elsize;
+    npy_intp elsize;
 
     if (num < 0) {
         int fail = 0;
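Editorial note: the widening of elsize from int to npy_intp is purely about range; a quick Python illustration (values assume a typical 64-bit platform):

import numpy as np

itemsize = 2**31 * 4                       # an 8 GiB element, as above
print(itemsize > np.iinfo(np.intc).max)    # True: overflows a 32-bit C int
print(itemsize <= np.iinfo(np.intp).max)   # True on 64-bit: fits in npy_intp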
43 changes: 19 additions & 24 deletions numpy/_core/src/multiarray/descriptor.c
@@ -316,22 +316,16 @@ _convert_from_tuple(PyObject *obj, int align)
                 "dimension smaller then zero.");
             goto fail;
         }
-        if (shape.ptr[i] > NPY_MAX_INT) {
-            PyErr_SetString(PyExc_ValueError,
-                "invalid shape in fixed-type tuple: "
-                "dimension does not fit into a C int.");
-            goto fail;
-        }
     }
     npy_intp items = PyArray_OverflowMultiplyList(shape.ptr, shape.len);
     int overflowed;
-    int nbytes;
-    if (items < 0 || items > NPY_MAX_INT) {
+    npy_intp nbytes;
+    if (items < 0) {
        overflowed = 1;
     }
     else {
-        overflowed = npy_mul_with_overflow_int(
-            &nbytes, type->elsize, (int) items);
+        overflowed = npy_mul_sizes_with_overflow(
+            &nbytes, type->elsize, items);
     }
     if (overflowed) {
         PyErr_SetString(PyExc_ValueError,
@@ -370,7 +364,7 @@ _convert_from_tuple(PyObject *obj, int align)
     }
     for (int i=0; i < shape.len; i++) {
         PyTuple_SET_ITEM(newdescr->subarray->shape, i,
-                PyLong_FromLong((long)shape.ptr[i]));
+                PyLong_FromSsize_t(shape.ptr[i]));
 
         if (PyTuple_GET_ITEM(newdescr->subarray->shape, i) == NULL) {
             Py_DECREF(newdescr);
@@ -410,7 +404,7 @@ _convert_from_array_descr(PyObject *obj, int align)
     /* Types with fields need the Python C API for field access */
     npy_uint64 dtypeflags = NPY_NEEDS_PYAPI;
     int maxalign = 1;
-    int totalsize = 0;
+    npy_intp totalsize = 0;
     PyObject *fields = PyDict_New();
     if (!fields) {
         Py_DECREF(nameslist);
@@ -527,7 +521,7 @@ _convert_from_array_descr(PyObject *obj, int align)
             goto fail;
         }
         PyTuple_SET_ITEM(tup, 0, (PyObject *)conv);
-        PyTuple_SET_ITEM(tup, 1, PyLong_FromLong((long) totalsize));
+        PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(totalsize));
 
         /*
          * Title can be "meta-data". Only insert it
@@ -633,7 +627,7 @@ _convert_from_list(PyObject *obj, int align)
     /* Types with fields need the Python C API for field access */
     npy_uint64 dtypeflags = NPY_NEEDS_PYAPI;
     int maxalign = 1;
-    int totalsize = 0;
+    npy_intp totalsize = 0;
     for (int i = 0; i < n; i++) {
         PyArray_Descr *conv = _convert_from_any(
             PyList_GET_ITEM(obj, i), align); // noqa: borrowed-ref OK
@@ -648,7 +642,7 @@ _convert_from_list(PyObject *obj, int align)
         }
         maxalign = PyArray_MAX(maxalign, _align);
     }
-    PyObject *size_obj = PyLong_FromLong((long) totalsize);
+    PyObject *size_obj = PyLong_FromSsize_t(totalsize);
     if (!size_obj) {
         Py_DECREF(conv);
         goto fail;
@@ -1101,7 +1095,7 @@ _convert_from_dict(PyObject *obj, int align)
 
     /* Types with fields need the Python C API for field access */
     npy_uint64 dtypeflags = NPY_NEEDS_PYAPI;
-    int totalsize = 0;
+    npy_intp totalsize = 0;
     int maxalign = 1;
     int has_out_of_order_fields = 0;
     for (int i = 0; i < n; i++) {
@@ -1146,7 +1140,7 @@ _convert_from_dict(PyObject *obj, int align)
             Py_DECREF(ind);
             goto fail;
         }
-        long offset = PyArray_PyIntAsInt(off);
+        npy_intp offset = PyArray_PyIntAsIntp(off);
         if (error_converting(offset)) {
             Py_DECREF(off);
             Py_DECREF(tup);
@@ -1162,7 +1156,7 @@ _convert_from_dict(PyObject *obj, int align)
             goto fail;
         }
 
-        PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(offset));
+        PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(offset));
         /* Flag whether the fields are specified out of order */
         if (offset < totalsize) {
             has_out_of_order_fields = 1;
@@ -1186,7 +1180,7 @@ _convert_from_dict(PyObject *obj, int align)
         if (align && _align > 1) {
             totalsize = NPY_NEXT_ALIGNED_OFFSET(totalsize, _align);
         }
-        PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(totalsize));
+        PyTuple_SET_ITEM(tup, 1, PyLong_FromSsize_t(totalsize));
         totalsize += newdescr->elsize;
     }
     if (len == 3) {
@@ -1803,7 +1797,7 @@ _convert_from_str(PyObject *obj, int align)
     }
 
     int check_num = NPY_NOTYPE + 10;
-    int elsize = 0;
+    npy_intp elsize = 0;
     /* A typecode like 'd' */
     if (len == 1) {
         /* Python byte string characters are unsigned */
@@ -1816,7 +1810,7 @@
 
     /* Attempt to parse the integer, make sure it's the rest of the string */
     errno = 0;
-    long result = strtol(type + 1, &typeend, 10);
+    npy_intp result = strtol(type + 1, &typeend, 10);
     npy_bool some_parsing_happened = !(type == typeend);
     npy_bool entire_string_consumed = *typeend == '\0';
     npy_bool parsing_succeeded =
@@ -1826,7 +1820,7 @@
         goto fail;
     }
 
-    elsize = (int)result;
+    elsize = result;
 
 
     if (parsing_succeeded && typeend - type == len) {
@@ -2723,7 +2717,8 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
     PyObject *ret, *mod, *obj;
     PyObject *state;
     char endian;
-    int elsize, alignment;
+    npy_intp elsize;
+    int alignment;
 
     ret = PyTuple_New(3);
     if (ret == NULL) {
@@ -2825,7 +2820,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
         elsize = -1;
         alignment = -1;
     }
-    PyTuple_SET_ITEM(state, 5, PyLong_FromLong(elsize));
+    PyTuple_SET_ITEM(state, 5, PyLong_FromSsize_t(elsize));
     PyTuple_SET_ITEM(state, 6, PyLong_FromLong(alignment));
     PyTuple_SET_ITEM(state, 7, PyLong_FromUnsignedLongLong(
         self->flags & ~NPY_NOT_TRIVIALLY_COPYABLE));
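The Python-visible effect of switching these offsets and sizes to npy_intp, sketched under the assumption of a 64-bit build with this change applied:

import numpy as np

# A field offset beyond INT_MAX is now accepted and round-trips;
# the computed itemsize is offset + field size.
dt = np.dtype(dict(names=["a"], formats=["i4"], offsets=[2**31]))
assert dt.fields["a"][1] == 2**31
assert dt.itemsize == 2**31 + 4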
63 changes: 48 additions & 15 deletions numpy/_core/tests/test_dtype.py
@@ -26,7 +26,7 @@
     assert_equal,
     assert_raises,
 )
-from numpy.testing._private.utils import requires_deep_recursion
+from numpy.testing._private.utils import requires_deep_recursion, requires_memory
 
 
 def assert_dtype_equal(a, b):
@@ -737,13 +737,10 @@ def test_shape_matches_ndim(self):
 
     def test_shape_invalid(self):
         # Check that the shape is valid.
-        max_int = np.iinfo(np.intc).max
         max_intp = np.iinfo(np.intp).max
         # Too large values (the datatype is part of this)
-        assert_raises(ValueError, np.dtype, [('a', 'f4', max_int // 4 + 1)])
-        assert_raises(ValueError, np.dtype, [('a', 'f4', max_int + 1)])
-        assert_raises(ValueError, np.dtype, [('a', 'f4', (max_int, 2))])
-        # Takes a different code path (fails earlier:
-        assert_raises(ValueError, np.dtype, [('a', 'f8', max_intp // 8 + 1)])
+        assert_raises(ValueError, np.dtype, [('a', 'f4', max_intp // 4 + 1)])
+        assert_raises(ValueError, np.dtype, [('a', 'f4', max_intp + 1)])
         # Negative values
         assert_raises(ValueError, np.dtype, [('a', 'f4', -1)])
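A quick sanity check on the bound used above (plain Python arithmetic; max_intp is 2**63 - 1 on a 64-bit system):

import numpy as np

max_intp = np.iinfo(np.intp).max
items = max_intp // 4 + 1
assert items * 4 > max_intp   # the f4 field's byte size overflows npy_intp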
@@ -1252,7 +1249,7 @@ def test_structured(self, dtype, random):
 
 class TestPickling:
 
-    def check_pickling(self, dtype):
+    def check_pickling(self, dtype, arr_assert=True):
         for proto in range(pickle.HIGHEST_PROTOCOL + 1):
             buf = pickle.dumps(dtype, proto)
             # The dtype pickling itself pickles `np.dtype` if it is pickled
@@ -1262,22 +1259,25 @@ def check_pickling(self, dtype):
             pickled = pickle.loads(buf)
             assert_equal(pickled, dtype)
             assert_equal(pickled.descr, dtype.descr)
+            assert_equal(pickled.itemsize, dtype.itemsize)
             if dtype.metadata is not None:
                 assert_equal(pickled.metadata, dtype.metadata)
-            # Check the reconstructed dtype is functional
-            x = np.zeros(3, dtype=dtype)
-            y = np.zeros(3, dtype=pickled)
-            assert_equal(x, y)
-            assert_equal(x[0], y[0])
+            # some large structured dtypes are too large to
+            # reasonably compare across all elements
+            if arr_assert:
+                # Check the reconstructed dtype is functional
+                x = np.zeros(3, dtype=dtype)
+                y = np.zeros(3, dtype=pickled)
+                assert_equal(x, y)
+                assert_equal(x[0], y[0])
 
-    @pytest.mark.xfail(reason="dtype conversion doesn't allow this yet.")
+    @pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system")
     def test_pickling_large(self):
         # The actual itemsize is larger than a c-integer here.
         dtype = np.dtype(f"({2**31},)i")
-        self.check_pickling(dtype)
+        self.check_pickling(dtype, False)
         dtype = np.dtype(f"({2**31},)i", metadata={"a": "b"})
-        self.check_pickling(dtype)
+        self.check_pickling(dtype, False)
 
     @pytest.mark.parametrize('t', [int, float, complex, np.int32, str, object,
                                    bool])
@@ -2049,3 +2049,36 @@ def test_signature_dtypes_classes(self, typename: str):
 
         params_actual = set(sig.parameters)
         assert params_actual == params_expect
+
+
+@pytest.mark.parametrize("kind, exp", [
+    ([("x", np.float64, 2 ** 28)], (2 ** 28 * 8)),
+    ([("x", np.float64, 2 ** 27), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)),
+    ([("x", np.float32, 2 ** 28), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)),
+    ([("x", np.float16, 2 ** 29), ("y", np.float64, 2 ** 27)], (2 ** 28 * 8)),
+    ("2147483648i,2147483648i", 17179869184),
+    ("2147483648f,2147483648f", 17179869184),
+    ("2147483648d,2147483648d", 34359738368),
+    ("2b,2147483648b,2f,4i", 2147483674),
+    (dict(names=["a"], formats=["2147483648i"]), 8589934592),
+    (dict(names=["a"], formats=["2147483648i"], offsets=[1]), 8589934593),
+    (dict(names=["a"], formats=["2147483648i"], offsets=[2 ** 31 - 100]), 10737418140),
+    (dict(names=["a"], formats=["2147483648i"], offsets=[2 ** 31]), 10737418240),
+    (dict(names=["a", "b", "c"], formats=["2147483648b", "16i", "12f"],
+          offsets=[2 ** 31, 2 ** 32, 2 ** 32 + 69]), 4294967413),
+])
+@pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system")
+def test_gh_31308(kind, exp):
+    kind_dtype = np.dtype(kind)
+    assert kind_dtype.itemsize == exp
+    for name in kind_dtype.names:
+        assert kind_dtype[name].shape[0] > 0
+
+
+@pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system")
+@requires_memory(free_bytes=2e9)
+def test_gh_31308_materialized():
+    kind = [("x", np.float64, 2 ** 28)]
+    kind_dtype = np.dtype(kind)
+    rec_arr = np.array((1,), dtype=kind_dtype)
+    assert rec_arr["x"].size == 2 ** 28
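
For reference, the expected itemsizes in the table above follow directly from element count times element size:

assert 2**31 * 4 == 8589934592            # "2147483648i": 2**31 int32 values
assert 2 * 2**31 * 4 == 17179869184       # "2147483648i,2147483648i"
assert 2 * 2**31 * 8 == 34359738368       # "2147483648d,2147483648d"
assert 2 + 2**31 + 8 + 16 == 2147483674   # "2b,2147483648b,2f,4i"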
13 changes: 13 additions & 0 deletions numpy/_core/tests/test_records.py
@@ -9,13 +9,15 @@
 
 import numpy as np
 from numpy.testing import (
+    IS_64BIT,
     assert_,
     assert_array_almost_equal,
     assert_array_equal,
     assert_equal,
     assert_raises,
     temppath,
 )
+from numpy.testing._private.utils import requires_memory
 
 
 class TestFromrecords:
@@ -108,6 +110,17 @@ def test_recarray_fromfile(self):
         assert_equal(r1, r2)
         assert_equal(r2, r3)
 
+    @pytest.mark.skipif(not IS_64BIT, reason="test requires 64-bit system")
+    @requires_memory(free_bytes=2e9)
+    def test_recarray_fromfile_massive(self, tmpdir):
+        kind = [("x", np.float64, 2 ** 28)]
+        kind_dtype = np.dtype(kind)
+        rec_arr = np.array((1,), dtype=kind_dtype)
+        with tmpdir.as_cwd():
+            rec_arr.tofile("f.data")
+            actual = np.fromfile("f.data", dtype=kind_dtype)
+            assert actual.itemsize == 2 ** 28 * 8
[Inline review thread on test_recarray_fromfile_massive]

Member:
Ohh, fun. OTOH, the error here would be that we are not reading everything (i.e. the last bit of the result not being -1). But I think there is a fun little thing happening here:

  • This is a signed int overflow: 2**31 -> -2**31.
  • Cast to size_t for reading, it will just wrap back to 2**31, since that is unsigned.

So, no problem until we reach 2**32, at which point the error would be reading nothing at all.

Possibly we could employ a funny trick here: just try to read 1 element of a 2**32+1 sized dtype from a short file (not empty, hence the +1).
With fromfile(..., count=1) that should try to read one element, fail, and return an empty array. But with the bug I think it'll actually return an array of length 1.
(That said, clearly that is more of a regression test designed for the specific error in this code...)
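
For concreteness, a rough sketch of that suggested trick (hypothetical and untested; it assumes the oversized dtype can be constructed at all, i.e. a 64-bit build with this PR applied):

import numpy as np

dt = np.dtype([("x", np.uint8, 2**32 + 1)])  # itemsize 2**32 + 1
with open("short.data", "wb") as f:
    f.write(b"\x00" * 8)                     # non-empty, but far too short
arr = np.fromfile("short.data", dtype=dt, count=1)
# Per the reasoning above: with a correct elsize no full element can be
# read, so arr should come back empty; with the 32-bit truncation bug the
# elsize wraps to 1 byte and a bogus "element" would be read.
print(arr.size)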

Contributor Author:

Playing with this a bit, something really strange/bad happens somewhere: I can retrieve items from the reconstituted array quickly, but attempting to perform assertions on even those individual elements is incredibly slow.

For example,

item = actual["x"][0][1].item()  # fast
assert item > 0  # prohibitively slow

It is only slow if it is destined to fail (!), with or without the npy_intp patch applied. How strange; I wonder if something is getting corrupted, or if I'm missing something obvious that gets expanded too large (item prints out innocently enough as either 0.0 or 1.0, depending on whether the patch is applied). I spent a few hours poking at it because it looked so strange. It will also hang on the mismatch of the tobytes patterns, which might suggest a corruption somewhere, I suppose (the same byte-pattern comparison is fine in the raw interpreter).

I didn't find any advantage to using count=1. Maybe I misunderstood, but our problem is that we have a single massive "item", so count=1 doesn't cut that down/discretize? The test works just fine with it, but the runtime/result is the same. Perhaps you meant reading back in with a separate, smaller dtype itemsize and looking at a slice of the data that way; I'm not sure.

For now, I've pushed a revision to test_recarray_fromfile_massive() so that it uses a larger itemsize (that part of your suggestion was indeed needed); it passes within 1-10 seconds locally with the earlier patch applied to array_fromfile_binary. Without that small patch, the test basically hangs, for reasons I don't fully understand. I suppose that's at least a bit closer to what I want, i.e. a test that doesn't reasonably pass without the patch, though it is still a bit annoying.

Member:

> Maybe I misunderstood, but our problem is that we have a single massive "item", so count=1 doesn't cut that down/discretize?

I think I just skipped over that; it seemed to me that if we pass count=1 it'll assume it can read 1 item, while otherwise it checks how big the file actually is.
I.e. my thought was that we can just put a few bytes into the file, and NumPy will incorrectly attempt to read an actual element.

> assert item > 0  # prohibitively slow

In general, or in pytest? Because if it is in pytest, I could imagine it wants to do something like print `actual` or so?

Contributor Author:

Ah yeah, thanks. It looks like suppressing the traceback lets the failure scenario surface in a second or two (when the type change is reverted): spin test -t numpy/_core/tests/test_records.py::TestFromrecords::test_recarray_fromfile_massive -- --tb=no

It's annoying that pytest doesn't auto-truncate/summarize.

This also "works", though it is not very nice:

--- a/numpy/_core/tests/test_records.py
+++ b/numpy/_core/tests/test_records.py
@@ -122,7 +122,9 @@ def test_recarray_fromfile_massive(self, tmpdir):
             actual = np.fromfile("f.data", dtype=kind_dtype)
             assert actual.itemsize == 2 ** 29 * 8
             item = actual["x"][0][1]
-            assert_allclose(item, 1)
+            if item != 1:
+                # avoid hang from pytest traceback dumping massive array
+                pytest.fail("fromfile elsize error", pytrace=False)

I'll leave this one alone for now; it seems to be getting closer to something sensible, perhaps...


     def test_recarray_from_obj(self):
         count = 10
         a = np.zeros(count, dtype='O')