Skip to content

Commit 9ab9532

Browse files
raulcd, jorisvandenbossche, pitrou
authored
GH-25118: [Python] Make NumPy an optional runtime dependency (#41904)
### Rationale for this change

Being able to run pyarrow without requiring numpy.

### What changes are included in this PR?

If numpy is not present we are able to import pyarrow and run functionality. A new CI job has been created to run some basic tests without numpy.

### Are these changes tested?

Yes via CI.

### Are there any user-facing changes?

Yes, NumPy can be removed from the user installation and pyarrow functionality still works.

* GitHub Issue: #25118

Lead-authored-by: Raúl Cumplido <raulcumplido@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent a8df190 commit 9ab9532

62 files changed

Lines changed: 1008 additions & 420 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/python.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ jobs:
5959
- conda-python-3.9-nopandas
6060
- conda-python-3.8-pandas-1.0
6161
- conda-python-3.10-pandas-latest
62+
- conda-python-3.10-no-numpy
6263
include:
6364
- name: conda-python-docs
6465
cache: conda-python-3.9
@@ -83,6 +84,11 @@ jobs:
8384
title: AMD64 Conda Python 3.10 Pandas latest
8485
python: "3.10"
8586
pandas: latest
87+
- name: conda-python-3.10-no-numpy
88+
cache: conda-python-3.10
89+
image: conda-python-no-numpy
90+
title: AMD64 Conda Python 3.10 without NumPy
91+
python: "3.10"
8692
env:
8793
PYTHON: ${{ matrix.python || 3.8 }}
8894
UBUNTU: ${{ matrix.ubuntu || 20.04 }}

docker-compose.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ x-hierarchy:
126126
- conda-python-hdfs
127127
- conda-python-java-integration
128128
- conda-python-jpype
129+
- conda-python-no-numpy
129130
- conda-python-spark
130131
- conda-python-substrait
131132
- conda-verify-rc
@@ -1258,6 +1259,37 @@ services:
12581259
volumes: *conda-volumes
12591260
command: *python-conda-command
12601261

1262+
conda-python-no-numpy:
1263+
# Usage:
1264+
# docker-compose build conda
1265+
# docker-compose build conda-cpp
1266+
# docker-compose build conda-python
1267+
# docker-compose build conda-python-no-numpy
1268+
# docker-compose run --rm conda-python-no-numpy
1269+
image: ${REPO}:${ARCH}-conda-python-${PYTHON}-no-numpy
1270+
build:
1271+
context: .
1272+
dockerfile: ci/docker/conda-python.dockerfile
1273+
cache_from:
1274+
- ${REPO}:${ARCH}-conda-python-${PYTHON}
1275+
args:
1276+
repo: ${REPO}
1277+
arch: ${ARCH}
1278+
python: ${PYTHON}
1279+
shm_size: *shm-size
1280+
environment:
1281+
<<: [*common, *ccache, *sccache]
1282+
PARQUET_REQUIRE_ENCRYPTION: # inherit
1283+
HYPOTHESIS_PROFILE: # inherit
1284+
PYARROW_TEST_HYPOTHESIS: # inherit
1285+
volumes: *conda-volumes
1286+
command:
1287+
["
1288+
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
1289+
/arrow/ci/scripts/python_build.sh /arrow /build &&
1290+
mamba uninstall -y numpy &&
1291+
/arrow/ci/scripts/python_test.sh /arrow"]
1292+
12611293
conda-python-docs:
12621294
# Usage:
12631295
# archery docker run conda-python-docs

python/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -339,17 +339,17 @@ set(PYARROW_CPP_SRCS
339339
${PYARROW_CPP_SOURCE_DIR}/gdb.cc
340340
${PYARROW_CPP_SOURCE_DIR}/helpers.cc
341341
${PYARROW_CPP_SOURCE_DIR}/inference.cc
342-
${PYARROW_CPP_SOURCE_DIR}/init.cc
343342
${PYARROW_CPP_SOURCE_DIR}/io.cc
344343
${PYARROW_CPP_SOURCE_DIR}/ipc.cc
345344
${PYARROW_CPP_SOURCE_DIR}/numpy_convert.cc
345+
${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
346346
${PYARROW_CPP_SOURCE_DIR}/numpy_to_arrow.cc
347347
${PYARROW_CPP_SOURCE_DIR}/python_test.cc
348348
${PYARROW_CPP_SOURCE_DIR}/python_to_arrow.cc
349349
${PYARROW_CPP_SOURCE_DIR}/pyarrow.cc
350350
${PYARROW_CPP_SOURCE_DIR}/serialize.cc
351351
${PYARROW_CPP_SOURCE_DIR}/udf.cc)
352-
set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/init.cc
352+
set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
353353
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
354354
SKIP_UNITY_BUILD_INCLUSION ON)
355355

python/pyarrow/_compute.pyx

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ from pyarrow.util import _DEPR_MSG
3333
from libcpp cimport bool as c_bool
3434

3535
import inspect
36-
import numpy as np
36+
try:
37+
import numpy as np
38+
except ImportError:
39+
np = None
3740
import warnings
3841

3942

@@ -43,6 +46,11 @@ _substrait_msg = (
4346
)
4447

4548

49+
SUPPORTED_INPUT_ARR_TYPES = (list, tuple)
50+
if np is not None:
51+
SUPPORTED_INPUT_ARR_TYPES += (np.ndarray, )
52+
53+
4654
def _pas():
4755
global __pas
4856
if __pas is None:
@@ -473,7 +481,7 @@ cdef class MetaFunction(Function):
473481

474482
cdef _pack_compute_args(object values, vector[CDatum]* out):
475483
for val in values:
476-
if isinstance(val, (list, np.ndarray)):
484+
if isinstance(val, SUPPORTED_INPUT_ARR_TYPES):
477485
val = lib.asarray(val)
478486

479487
if isinstance(val, Array):
@@ -2189,7 +2197,7 @@ class QuantileOptions(_QuantileOptions):
21892197

21902198
def __init__(self, q=0.5, *, interpolation="linear", skip_nulls=True,
21912199
min_count=0):
2192-
if not isinstance(q, (list, tuple, np.ndarray)):
2200+
if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
21932201
q = [q]
21942202
self._set_options(q, interpolation, skip_nulls, min_count)
21952203

@@ -2222,7 +2230,7 @@ class TDigestOptions(_TDigestOptions):
22222230

22232231
def __init__(self, q=0.5, *, delta=100, buffer_size=500, skip_nulls=True,
22242232
min_count=0):
2225-
if not isinstance(q, (list, tuple, np.ndarray)):
2233+
if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
22262234
q = [q]
22272235
self._set_options(q, delta, buffer_size, skip_nulls, min_count)
22282236

python/pyarrow/array.pxi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ cdef _sequence_to_array(object sequence, object mask, object size,
5050

5151

5252
cdef inline _is_array_like(obj):
53+
if np is None:
54+
return False
5355
if isinstance(obj, np.ndarray):
5456
return True
5557
return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
@@ -1608,6 +1610,9 @@ cdef class Array(_PandasConvertible):
16081610
"""
16091611
self._assert_cpu()
16101612

1613+
if np is None:
1614+
raise ImportError(
1615+
"Cannot return a numpy.ndarray if NumPy is not present")
16111616
cdef:
16121617
PyObject* out
16131618
PandasOptions c_options

python/pyarrow/builder.pxi

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
import math
19+
1820

1921
cdef class StringBuilder(_Weakrefable):
2022
"""
@@ -42,10 +44,10 @@ cdef class StringBuilder(_Weakrefable):
4244
value : string/bytes or np.nan/None
4345
The value to append to the string array builder.
4446
"""
45-
if value is None or value is np.nan:
46-
self.builder.get().AppendNull()
47-
elif isinstance(value, (bytes, str)):
47+
if isinstance(value, (bytes, str)):
4848
self.builder.get().Append(tobytes(value))
49+
elif value is None or math.isnan(value):
50+
self.builder.get().AppendNull()
4951
else:
5052
raise TypeError('StringBuilder only accepts string objects')
5153

@@ -108,10 +110,10 @@ cdef class StringViewBuilder(_Weakrefable):
108110
value : string/bytes or np.nan/None
109111
The value to append to the string array builder.
110112
"""
111-
if value is None or value is np.nan:
112-
self.builder.get().AppendNull()
113-
elif isinstance(value, (bytes, str)):
113+
if isinstance(value, (bytes, str)):
114114
self.builder.get().Append(tobytes(value))
115+
elif value is None or math.isnan(value):
116+
self.builder.get().AppendNull()
115117
else:
116118
raise TypeError('StringViewBuilder only accepts string objects')
117119

python/pyarrow/conftest.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from pyarrow.tests.util import windows_has_tzdata
2626
import sys
2727

28-
import numpy as np
2928

3029
groups = [
3130
'acero',
@@ -46,6 +45,8 @@
4645
'lz4',
4746
'memory_leak',
4847
'nopandas',
48+
'nonumpy',
49+
'numpy',
4950
'orc',
5051
'pandas',
5152
'parquet',
@@ -81,6 +82,8 @@
8182
'lz4': Codec.is_available('lz4'),
8283
'memory_leak': False,
8384
'nopandas': False,
85+
'nonumpy': False,
86+
'numpy': False,
8487
'orc': False,
8588
'pandas': False,
8689
'parquet': False,
@@ -158,6 +161,12 @@
158161
except ImportError:
159162
defaults['nopandas'] = True
160163

164+
try:
165+
import numpy # noqa
166+
defaults['numpy'] = True
167+
except ImportError:
168+
defaults['nonumpy'] = True
169+
161170
try:
162171
import pyarrow.parquet # noqa
163172
defaults['parquet'] = True
@@ -327,6 +336,7 @@ def unary_agg_func_fixture():
327336
Register a unary aggregate function (mean)
328337
"""
329338
from pyarrow import compute as pc
339+
import numpy as np
330340

331341
def func(ctx, x):
332342
return pa.scalar(np.nanmean(x))
@@ -352,6 +362,7 @@ def varargs_agg_func_fixture():
352362
Register a unary aggregate function
353363
"""
354364
from pyarrow import compute as pc
365+
import numpy as np
355366

356367
def func(ctx, *args):
357368
sum = 0.0

python/pyarrow/includes/libarrow_python.pxd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
248248
CResult[PyObject*] StringToTzinfo(c_string)
249249

250250

251-
cdef extern from "arrow/python/init.h":
251+
cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
252252
int arrow_init_numpy() except -1
253253

254254

python/pyarrow/lib.pyx

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121

2222
import datetime
2323
import decimal as _pydecimal
24-
import numpy as np
24+
try:
25+
import numpy as np
26+
except ImportError:
27+
np = None
2528
import os
2629
import sys
2730

@@ -32,8 +35,11 @@ from pyarrow.includes.common cimport PyObject_to_object
3235
cimport pyarrow.includes.libarrow_python as libarrow_python
3336
cimport cpython as cp
3437

35-
# Initialize NumPy C API
36-
arrow_init_numpy()
38+
39+
# Initialize NumPy C API only if numpy was able to be imported
40+
if np is not None:
41+
arrow_init_numpy()
42+
3743
# Initialize PyArrow C++ API
3844
# (used from some of our C++ code, see e.g. ARROW-5260)
3945
import_pyarrow()

python/pyarrow/pandas_compat.py

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,17 @@
3030
import re
3131
import warnings
3232

33-
import numpy as np
34-
33+
try:
34+
import numpy as np
35+
except ImportError:
36+
np = None
3537
import pyarrow as pa
3638
from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa
3739

3840

3941
_logical_type_map = {}
42+
_numpy_logical_type_map = {}
43+
_pandas_logical_type_map = {}
4044

4145

4246
def get_logical_type_map():
@@ -85,27 +89,32 @@ def get_logical_type(arrow_type):
8589
return 'object'
8690

8791

88-
_numpy_logical_type_map = {
89-
np.bool_: 'bool',
90-
np.int8: 'int8',
91-
np.int16: 'int16',
92-
np.int32: 'int32',
93-
np.int64: 'int64',
94-
np.uint8: 'uint8',
95-
np.uint16: 'uint16',
96-
np.uint32: 'uint32',
97-
np.uint64: 'uint64',
98-
np.float32: 'float32',
99-
np.float64: 'float64',
100-
'datetime64[D]': 'date',
101-
np.str_: 'string',
102-
np.bytes_: 'bytes',
103-
}
92+
def get_numpy_logical_type_map():
93+
global _numpy_logical_type_map
94+
if not _numpy_logical_type_map:
95+
_numpy_logical_type_map.update({
96+
np.bool_: 'bool',
97+
np.int8: 'int8',
98+
np.int16: 'int16',
99+
np.int32: 'int32',
100+
np.int64: 'int64',
101+
np.uint8: 'uint8',
102+
np.uint16: 'uint16',
103+
np.uint32: 'uint32',
104+
np.uint64: 'uint64',
105+
np.float32: 'float32',
106+
np.float64: 'float64',
107+
'datetime64[D]': 'date',
108+
np.str_: 'string',
109+
np.bytes_: 'bytes',
110+
})
111+
return _numpy_logical_type_map
104112

105113

106114
def get_logical_type_from_numpy(pandas_collection):
115+
numpy_logical_type_map = get_numpy_logical_type_map()
107116
try:
108-
return _numpy_logical_type_map[pandas_collection.dtype.type]
117+
return numpy_logical_type_map[pandas_collection.dtype.type]
109118
except KeyError:
110119
if hasattr(pandas_collection.dtype, 'tz'):
111120
return 'datetimetz'
@@ -1023,18 +1032,23 @@ def _is_generated_index_name(name):
10231032
return re.match(pattern, name) is not None
10241033

10251034

1026-
_pandas_logical_type_map = {
1027-
'date': 'datetime64[D]',
1028-
'datetime': 'datetime64[ns]',
1029-
'datetimetz': 'datetime64[ns]',
1030-
'unicode': np.str_,
1031-
'bytes': np.bytes_,
1032-
'string': np.str_,
1033-
'integer': np.int64,
1034-
'floating': np.float64,
1035-
'decimal': np.object_,
1036-
'empty': np.object_,
1037-
}
1035+
def get_pandas_logical_type_map():
1036+
global _pandas_logical_type_map
1037+
1038+
if not _pandas_logical_type_map:
1039+
_pandas_logical_type_map.update({
1040+
'date': 'datetime64[D]',
1041+
'datetime': 'datetime64[ns]',
1042+
'datetimetz': 'datetime64[ns]',
1043+
'unicode': np.str_,
1044+
'bytes': np.bytes_,
1045+
'string': np.str_,
1046+
'integer': np.int64,
1047+
'floating': np.float64,
1048+
'decimal': np.object_,
1049+
'empty': np.object_,
1050+
})
1051+
return _pandas_logical_type_map
10381052

10391053

10401054
def _pandas_type_to_numpy_type(pandas_type):
@@ -1050,8 +1064,9 @@ def _pandas_type_to_numpy_type(pandas_type):
10501064
dtype : np.dtype
10511065
The dtype that corresponds to `pandas_type`.
10521066
"""
1067+
pandas_logical_type_map = get_pandas_logical_type_map()
10531068
try:
1054-
return _pandas_logical_type_map[pandas_type]
1069+
return pandas_logical_type_map[pandas_type]
10551070
except KeyError:
10561071
if 'mixed' in pandas_type:
10571072
# catching 'mixed', 'mixed-integer' and 'mixed-integer-float'

0 commit comments

Comments
 (0)