Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit d1ea8ff

Browse files
authored
DataFrame.values impl (#526)
* DataFrame.values impl * Add example * Remove list in loop * Refactoring * Reimplemented with code generating * Added limitation block to docstring * Add missed import
1 parent bb2a3b4 commit d1ea8ff

4 files changed

Lines changed: 203 additions & 20 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
from numba import njit
29+
30+
31+
@njit
def dataframe_values():
    # Three integer columns with two rows each.
    frame = pd.DataFrame({'age': [3, 29], 'height': [94, 170], 'weight': [31, 115]})

    # Numpy array of dataframe values: array([[3, 94, 31], [29, 170, 115]], dtype=int64)
    return frame.values


print(dataframe_values())

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@
3535
import numpy
3636
import sdc
3737

38+
3839
from numba import types
3940
from numba.special import literally
4041
from sdc.hiframes.pd_dataframe_ext import DataFrameType
4142
from sdc.hiframes.pd_series_type import SeriesType
4243
from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric,
4344
check_types_comparable,
44-
gen_df_impl_generator)
45+
gen_df_impl_generator, find_common_dtype_from_numpy_dtypes)
4546
from sdc.str_arr_ext import StringArrayType
4647

4748
from sdc.hiframes.pd_dataframe_type import DataFrameType
@@ -105,6 +106,132 @@ def hpat_pandas_df_index_impl(df):
105106
return hpat_pandas_df_index_impl
106107

107108

109+
def sdc_pandas_dataframe_values_codegen(df, numba_common_dtype):
    """
    Generate the source text of the ``DataFrame.values`` implementation.

    The generated function copies every column, element by element, into a
    flat array of the common dtype and reshapes it to
    ``(row_len, column_len)``.

    Locals of the generated function are named after the column *position*
    (``df_col_0``, ``df_col_1``, ...) and not after the column name, so that
    column names which are not valid Python identifiers (for example
    ``'my col'`` or ``'a-b'``) still yield source that can be exec'ed.

    Input:
    column_len = 3
    numba_common_dtype = float64

    Func generated:
    def sdc_pandas_dataframe_values_impl(df):
        row_len = len(get_dataframe_data(df, 0))
        df_col_0 = get_dataframe_data(df, 0)
        df_col_1 = get_dataframe_data(df, 1)
        df_col_2 = get_dataframe_data(df, 2)
        df_values = numpy.empty(row_len*3, numpy.dtype("float64"))
        for i in range(row_len):
            df_values[i * 3 + 0] = df_col_0[i]
            df_values[i * 3 + 1] = df_col_1[i]
            df_values[i * 3 + 2] = df_col_2[i]
        return df_values.reshape(row_len, 3)

    Parameters
    ----------
    df
        Object exposing ``columns``; only the number of columns is used here.
    numba_common_dtype
        Common dtype of all columns; its string form is substituted into
        ``numpy.dtype("...")`` in the generated source.

    Returns
    -------
    tuple
        ``(func_def, global_vars)``: the generated source text and the globals
        dictionary it has to be exec'ed with.
    """

    indent = 4 * ' '
    column_len = len(df.columns)

    # Positional names are always valid identifiers, whatever the column labels are.
    column_vars = [f'df_col_{index}' for index in range(column_len)]

    func_text = ['row_len = len(get_dataframe_data(df, 0))']
    func_text.extend(f'{column_var} = get_dataframe_data(df, {index})'
                     for index, column_var in enumerate(column_vars))

    func_text.append(f'df_values = numpy.empty(row_len*{column_len}, numpy.dtype("{numba_common_dtype}"))')
    func_text.append('for i in range(row_len):')
    # Row-major fill: element (i, j) of the result lives at flat offset i * column_len + j.
    func_text.extend(indent + f'df_values[i * {column_len} + {index}] = {column_var}[i]'
                     for index, column_var in enumerate(column_vars))

    func_text.append(f'return df_values.reshape(row_len, {column_len})\n')

    func_definition = ['def sdc_pandas_dataframe_values_impl(df):']
    func_definition.extend(indent + func_line for func_line in func_text)
    func_def = '\n'.join(func_definition)

    global_vars = {'pandas': pandas, 'numpy': numpy,
                   'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data}

    return func_def, global_vars
156+
157+
158+
@sdc_overload_attribute(DataFrameType, 'values')
def hpat_pandas_dataframe_values(df):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************
    Pandas API: pandas.DataFrame.values

    Limitations
    -----------
    Only numeric values supported as an output

    Examples
    --------
    .. literalinclude:: ../../../examples/dataframe/dataframe_values.py
       :language: python
       :lines: 27-
       :caption: The values data of the DataFrame.
       :name: ex_dataframe_values

    .. command-output:: python ./dataframe/dataframe_values.py
       :cwd: ../../../examples

    .. seealso::

        :ref:`DataFrame.to_numpy <pandas.DataFrame.to_numpy>`
            Recommended alternative to this method.
        :ref:`DataFrame.index <pandas.DataFrame.index>`
            Retrieve the index labels.
        :ref:`DataFrame.columns <pandas.DataFrame.columns>`
            Retrieving the column names.

    .. note::

        The dtype will be a lower-common-denominator dtype (implicit upcasting);
        that is to say if the dtypes (even of numeric types) are mixed, the one that accommodates all will be chosen.
        Use this with care if you are not dealing with the blocks.
        e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8,
        dtype will be upcast to int32. By numpy.find_common_type() convention,
        mixing int64 and uint64 will result in a float64 dtype.

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas DataFrame attribute :attr:`pandas.DataFrame.values` implementation.

    .. only:: developer
        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_values*

    Parameters
    ----------
    df: :obj:`pandas.DataFrame`
        input arg

    Returns
    -------
    :obj:`numpy.ndarray`
        return a Numpy representation of the DataFrame
    """

    checker = TypeChecker('Attribute values.')
    checker.check(df, DataFrameType)

    # TODO: Handle StringArrayType
    for position, column_type in enumerate(df.data):
        if isinstance(column_type, StringArrayType):
            checker.raise_exc(column_type, 'Numeric type', f'df.data["{df.columns[position]}"]')

    # One dtype able to hold the values of every column.
    column_dtypes = [column_type.dtype for column_type in df.data]
    numba_common_dtype = find_common_dtype_from_numpy_dtypes(column_dtypes, [])

    # Generate the implementation source for this particular dataframe type,
    # then exec it and hand the resulting function back.
    impl_source, impl_globals = sdc_pandas_dataframe_values_codegen(df, numba_common_dtype)
    impl_locals = {}
    exec(impl_source, impl_globals, impl_locals)

    return impl_locals['sdc_pandas_dataframe_values_impl']
233+
234+
108235
def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args):
109236
"""
110237
Input:

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,12 @@ def resolve_iloc(self, ary):
6161
def resolve_loc(self, ary):
6262
return DataFrameLocType(ary)
6363

64-
def resolve_values(self, ary):
65-
# using np.stack(data, 1) for both typing and implementation
66-
stack_sig = self.context.resolve_function_type(
67-
np.stack, (types.Tuple(ary.data), types.IntegerLiteral(1)), {})
68-
return stack_sig.return_type
64+
if sdc.config.config_pipeline_hpat_default:
    def resolve_values(self, ary):
        # using np.stack(data, 1) for both typing and implementation:
        # the attribute is typed as whatever np.stack returns for a tuple of
        # the column types stacked along literal axis 1.
        stacked_columns = types.Tuple(ary.data)
        stack_axis = types.IntegerLiteral(1)
        stack_signature = self.context.resolve_function_type(
            np.stack, (stacked_columns, stack_axis), {})
        return stack_signature.return_type
6970

7071
@bound_function("df.apply")
7172
def resolve_apply(self, df, args, kws):

sdc/tests/test_dataframe.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -576,25 +576,41 @@ def test_impl(df, arr):
576576
df2 = df.copy()
577577
np.testing.assert_almost_equal(hpat_func(df, arr), test_impl(df2, arr))
578578

579-
@skip_numba_jit
580-
def test_df_values1(self):
581-
def test_impl(n):
582-
df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
579+
def _test_df_values_unboxing(self, df):
580+
def test_impl(df):
583581
return df.values
584582

585-
hpat_func = self.jit(test_impl)
586-
n = 11
587-
np.testing.assert_array_equal(hpat_func(n), test_impl(n))
583+
sdc_func = self.jit(test_impl)
584+
np.testing.assert_array_equal(sdc_func(df), test_impl(df))
588585

589-
@skip_numba_jit
590-
def test_df_values2(self):
591-
def test_impl(df):
586+
def test_df_values_unboxing(self):
587+
values_to_test = [[1, 2, 3, 4, 5],
588+
[.1, .2, .3, .4, .5],
589+
[np.nan, np.inf, .0, .1, -1.]]
590+
n = 5
591+
np.random.seed(0)
592+
A = np.ones(n)
593+
B = np.random.ranf(n)
594+
595+
for values in values_to_test:
596+
with self.subTest(values=values):
597+
df = pd.DataFrame({'A': A, 'B': B, 'C': values})
598+
self._test_df_values_unboxing(df)
599+
600+
def test_df_values(self):
601+
def test_impl(n, values):
602+
df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n), 'C': values})
592603
return df.values
593604

594-
hpat_func = self.jit(test_impl)
595-
n = 11
596-
df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
597-
np.testing.assert_array_equal(hpat_func(df), test_impl(df))
605+
sdc_func = self.jit(test_impl)
606+
n = 5
607+
values_to_test = [[1, 2, 3, 4, 5],
608+
[.1, .2, .3, .4, .5],
609+
[np.nan, np.inf, .0, .1, -1.]]
610+
611+
for values in values_to_test:
612+
with self.subTest(values=values):
613+
np.testing.assert_array_equal(sdc_func(n, values), test_impl(n, values))
598614

599615
@skip_numba_jit
600616
def test_df_values_parallel1(self):

0 commit comments

Comments
 (0)