Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 3a537d3

Browse files
authored
Overload df.rolling.sum() (#480)
* Overload df.rolling.sum() * Add perf.test for df.rolling.sum()
1 parent c60a26d commit 3a537d3

File tree

4 files changed

+202
-44
lines changed

4 files changed

+202
-44
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
from numba import njit
29+
30+
31+
@njit
def df_rolling_sum():
    # Two integer columns that mirror each other in sign.
    frame = pd.DataFrame({'A': [4, 3, 5, 2, 6], 'B': [-4, -3, -5, -2, -6]})

    # Rolling sum over a window of 3 rows. Expect DataFrame of
    # {'A': [NaN, NaN, 12.0, 10.0, 13.0], 'B': [NaN, NaN, -12.0, -10.0, -13.0]}
    return frame.rolling(3).sum()


print(df_rolling_sum())

sdc/datatypes/hpat_pandas_dataframe_rolling_functions.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,15 @@ def sdc_pandas_dataframe_rolling_skew(self):
361361
return gen_df_rolling_method_impl('skew', self)
362362

363363

364+
@sdc_overload_method(DataFrameRollingType, 'sum')
def sdc_pandas_dataframe_rolling_sum(self):
    # Check the receiver type first; the message prefix names the method
    # being overloaded.
    checker = TypeChecker('Method rolling.sum().')
    checker.check(self, DataFrameRollingType)

    # Delegate to the shared generator used for DataFrame rolling methods.
    return gen_df_rolling_method_impl('sum', self)
371+
372+
364373
sdc_pandas_dataframe_rolling_apply.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
365374
'method_name': 'apply',
366375
'example_caption': 'Calculate the rolling apply.',
@@ -452,3 +461,15 @@ def sdc_pandas_dataframe_rolling_skew(self):
452461
'limitations_block': '',
453462
'extra_params': ''
454463
})
464+
465+
# Documentation for DataFrame.rolling.sum(): fill the shared rolling docstring
# template. The caption refers to a DataFrame because this overload is
# registered on DataFrameRollingType.
sdc_pandas_dataframe_rolling_sum.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
    'method_name': 'sum',
    'example_caption': 'Calculate rolling sum of given DataFrame.',
    'limitations_block':
    """
    Limitations
    -----------
    DataFrame elements cannot be max/min float/integer. Otherwise SDC and Pandas results are different.
    """,
    'extra_params': ''
})

sdc/tests/test_rolling.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,21 @@ def test_impl(obj, window, min_periods):
721721
jit_result = hpat_func(obj, window, min_periods)
722722
assert_equal(jit_result, ref_result)
723723

724+
def _test_rolling_sum(self, obj):
    """Compare jitted and plain ``obj.rolling(window, min_periods).sum()``.

    :param obj: object exposing ``rolling``; it is also passed to
        ``self._get_assert_equal`` to pick the comparison function
    """
    def test_impl(obj, window, min_periods):
        return obj.rolling(window, min_periods).sum()

    hpat_func = self.jit(test_impl)
    assert_equal = self._get_assert_equal(obj)

    # Even windows from 0 up to at most len(obj) + 2, each paired with every
    # even min_periods that does not exceed the window.
    param_grid = [(win, periods)
                  for win in range(0, len(obj) + 3, 2)
                  for periods in range(0, win + 1, 2)]

    for win, periods in param_grid:
        with self.subTest(obj=obj, window=win, min_periods=periods):
            actual = hpat_func(obj, win, periods)
            expected = test_impl(obj, win, periods)
            assert_equal(actual, expected)
738+
724739
@skip_sdc_jit('DataFrame.rolling.min() unsupported exceptions')
725740
def test_df_rolling_unsupported_values(self):
726741
all_data = test_global_input_data_float64
@@ -923,6 +938,19 @@ def test_df_rolling_skew(self):
923938

924939
self._test_rolling_skew(df)
925940

941+
@skip_sdc_jit('DataFrame.rolling.sum() unsupported')
def test_df_rolling_sum(self):
    # Column sources: plain integers, small floats, and sequences mixing
    # infinities, NaN and negative zero.
    columns = [
        list(range(10)), [1., -1., 0., 0.1, -0.1],
        [1., np.inf, np.inf, -1., 0., np.inf, np.NINF, np.NINF],
        [np.nan, np.inf, np.inf, np.nan, np.nan, np.nan, np.NINF, np.NZERO]
    ]
    # Cut every column to the shortest one and name them 'A', 'B', ...
    shortest = min(map(len, columns))
    frame = pd.DataFrame({
        label: values[:shortest]
        for label, values in zip(string.ascii_uppercase, columns)
    })

    self._test_rolling_sum(frame)
953+
926954
@skip_sdc_jit('Series.rolling.min() unsupported exceptions')
927955
def test_series_rolling_unsupported_values(self):
928956
series = pd.Series(test_global_input_data_float64[0])
@@ -1224,11 +1252,6 @@ def test_series_rolling_std_exception_unsupported_ddof(self):
12241252

12251253
@skip_sdc_jit('Series.rolling.sum() unsupported Series index')
12261254
def test_series_rolling_sum(self):
1227-
def test_impl(series, window, min_periods):
1228-
return series.rolling(window, min_periods).sum()
1229-
1230-
hpat_func = self.jit(test_impl)
1231-
12321255
all_data = [
12331256
list(range(10)), [1., -1., 0., 0.1, -0.1],
12341257
[1., np.inf, np.inf, -1., 0., np.inf, np.NINF, np.NINF],
@@ -1237,13 +1260,7 @@ def test_impl(series, window, min_periods):
12371260
indices = [list(range(len(data)))[::-1] for data in all_data]
12381261
for data, index in zip(all_data, indices):
12391262
series = pd.Series(data, index, name='A')
1240-
for window in range(0, len(series) + 3, 2):
1241-
for min_periods in range(0, window + 1, 2):
1242-
with self.subTest(series=series, window=window,
1243-
min_periods=min_periods):
1244-
jit_result = hpat_func(series, window, min_periods)
1245-
ref_result = test_impl(series, window, min_periods)
1246-
pd.testing.assert_series_equal(jit_result, ref_result)
1263+
self._test_rolling_sum(series)
12471264

12481265
@skip_sdc_jit('Series.rolling.var() unsupported Series index')
12491266
def test_series_rolling_var(self):

sdc/tests/tests_perf/test_perf_df_rolling.py

Lines changed: 111 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,22 @@
2828
import time
2929

3030
import numba
31-
import numpy as np
31+
import numpy
3232
import pandas
3333

3434
from sdc.tests.test_utils import test_global_input_data_float64
3535
from sdc.tests.tests_perf.test_perf_base import TestBase
3636
from sdc.tests.tests_perf.test_perf_utils import (calc_compilation, get_times,
3737
perf_data_gen_fixed_len)
38-
from .generator import generate_test_cases
39-
from .generator import TestCase as TC
38+
39+
40+
# Source template for a timed rolling use case. After formatting it defines
# df_rolling_<method_name>_usecase(data, <extra params>), which returns
# (elapsed seconds, result). The body lines must stay indented: the formatted
# text is executed as Python source.
rolling_usecase_tmpl = """
def df_rolling_{method_name}_usecase(data, {extra_usecase_params}):
    start_time = time.time()
    res = data.rolling({rolling_params}).{method_name}({method_params})
    end_time = time.time()
    return end_time - start_time, res
"""
4047

4148

4249
def get_rolling_params(window=100, min_periods=None):
@@ -48,6 +55,27 @@ def get_rolling_params(window=100, min_periods=None):
4855
return ', '.join(rolling_params)
4956

5057

58+
def gen_df_rolling_usecase(method_name, rolling_params=None,
                           extra_usecase_params='', method_params=''):
    """Build the timed use-case function for one rolling method.

    :param method_name: rolling method to call, e.g. ``'sum'``
    :param rolling_params: source text of the ``rolling(...)`` arguments;
        when empty, ``get_rolling_params()`` supplies it
    :param extra_usecase_params: source text of extra use-case parameters
    :param method_params: source text of the method call arguments
    :return: the function ``df_rolling_<method_name>_usecase`` defined by
        executing the formatted template
    """
    source = rolling_usecase_tmpl.format(
        method_name=method_name,
        extra_usecase_params=extra_usecase_params,
        rolling_params=rolling_params or get_rolling_params(),
        method_params=method_params,
    )

    # The generated code sees only 'np' and 'time' as globals; the function
    # it defines is collected from the separate locals mapping.
    namespace = {}
    exec(source, {'np': numpy, 'time': time}, namespace)

    return namespace[f'df_rolling_{method_name}_usecase']
77+
78+
5179
# python -m sdc.runtests sdc.tests.tests_perf.test_perf_df_rolling.TestDFRollingMethods
5280
class TestDFRollingMethods(TestBase):
5381
# more than 19 columns raises SystemError: CPUDispatcher() returned a result with an error set
@@ -56,6 +84,19 @@ class TestDFRollingMethods(TestBase):
5684
@classmethod
5785
def setUpClass(cls):
5886
super().setUpClass()
87+
cls.total_data_length = {
88+
'apply': [2 * 10 ** 5],
89+
'corr': [10 ** 5],
90+
'count': [8 * 10 ** 5],
91+
'cov': [10 ** 5],
92+
'kurt': [4 * 10 ** 5],
93+
'max': [2 * 10 ** 5],
94+
'mean': [2 * 10 ** 5],
95+
'median': [2 * 10 ** 5],
96+
'min': [2 * 10 ** 5],
97+
'skew': [2 * 10 ** 5],
98+
'sum': [2 * 10 ** 5],
99+
}
59100

60101
def _test_jitted(self, pyfunc, record, *args, **kwargs):
61102
# compilation time
@@ -67,36 +108,47 @@ def _test_jitted(self, pyfunc, record, *args, **kwargs):
67108
cfunc(*args, **kwargs)
68109

69110
# execution and boxing time
70-
record['test_results'], record['boxing_results'] = get_times(cfunc, *args, **kwargs)
111+
record['test_results'], record['boxing_results'] = get_times(cfunc,
112+
*args,
113+
**kwargs)
71114

72115
def _test_python(self, pyfunc, record, *args, **kwargs):
    """Run ``get_times`` on ``pyfunc`` and keep its first result.

    The first of the two values returned by ``get_times`` is stored in
    ``record['test_results']``; the second one is discarded.
    """
    timings, _unused = get_times(pyfunc, *args, **kwargs)
    record['test_results'] = timings
74117

75-
def _test_case(self, pyfunc, name, total_data_length, data_num=1,
76-
input_data=test_global_input_data_float64, columns_num=10):
77-
118+
def _gen_df(self, data, columns_num=10):
    """Build a DataFrame whose columns all hold the same ``data``.

    Columns are named with the first ``columns_num`` uppercase ASCII letters.
    """
    column_names = string.ascii_uppercase[:columns_num]
    return pandas.DataFrame(dict.fromkeys(column_names, data))
122+
123+
def _test_case(self, pyfunc, name,
124+
input_data=test_global_input_data_float64,
125+
columns_num=10, extra_data_num=0):
126+
"""
127+
Test DataFrame.rolling method
128+
:param pyfunc: Python function to test which calls tested method inside
129+
:param name: name of the tested method, e.g. min
130+
:param input_data: initial data used for generating test data
131+
:param columns_num: number of columns in generated DataFrame
132+
:param extra_data_num: number of additionally generated DataFrames
133+
"""
78134
if columns_num > self.max_columns_num:
79135
columns_num = self.max_columns_num
80136

81-
if input_data is None:
82-
input_data = test_global_input_data_float64
83-
84-
test_name = 'DataFrame.rolling.{}'.format(name)
85-
86137
full_input_data_length = sum(len(i) for i in input_data)
87-
for data_length in total_data_length:
138+
for data_length in self.total_data_length[name]:
88139
base = {
89-
'test_name': test_name,
140+
'test_name': f'DF.rolling.{name}',
90141
'data_size': data_length,
91142
}
92-
data = perf_data_gen_fixed_len(input_data, full_input_data_length, data_length)
93-
test_data = pandas.DataFrame({col: data for col in string.ascii_uppercase[:columns_num]})
143+
data = perf_data_gen_fixed_len(input_data, full_input_data_length,
144+
data_length)
145+
test_data = self._gen_df(data, columns_num=columns_num)
94146

95147
args = [test_data]
96-
for i in range(data_num - 1):
97-
np.random.seed(i)
98-
extra_data = np.random.ranf(data_length)
99-
args.append(pandas.DataFrame({col: extra_data for col in string.ascii_uppercase[:columns_num]}))
148+
for i in range(extra_data_num):
149+
numpy.random.seed(i)
150+
extra_data = numpy.random.ranf(data_length)
151+
args.append(self._gen_df(extra_data, columns_num=columns_num))
100152

101153
record = base.copy()
102154
record['test_type'] = 'SDC'
@@ -108,17 +160,44 @@ def _test_case(self, pyfunc, name, total_data_length, data_num=1,
108160
self._test_python(pyfunc, record, *args)
109161
self.test_results.add(**record)
110162

163+
def _test_df_rolling_method(self, name, rolling_params=None,
                            extra_usecase_params='', method_params=''):
    """Generate the use case for a rolling method and run the perf test case.

    :param name: name of the tested rolling method, e.g. ``'sum'``
    :param rolling_params: source text of the ``rolling(...)`` arguments
    :param extra_usecase_params: comma-separated names of extra use-case
        parameters; one additional DataFrame is requested for each name
    :param method_params: source text of the method call arguments
    """
    usecase = gen_df_rolling_usecase(name, rolling_params=rolling_params,
                                     extra_usecase_params=extra_usecase_params,
                                     method_params=method_params)

    # Count the extra parameters independently of the spacing around commas,
    # so 'a,b' and 'a, b' both request two additional DataFrames. Empty
    # pieces (empty string, trailing comma) are ignored.
    extra_names = [piece.strip() for piece in extra_usecase_params.split(',')]
    extra_data_num = sum(1 for piece in extra_names if piece)

    self._test_case(usecase, name, extra_data_num=extra_data_num)
172+
173+
def test_df_rolling_apply_mean(self):
174+
method_params = 'lambda x: np.nan if len(x) == 0 else x.mean()'
175+
self._test_df_rolling_method('apply', method_params=method_params)
176+
177+
def test_df_rolling_corr(self):
178+
self._test_df_rolling_method('corr', extra_usecase_params='other',
179+
method_params='other=other')
180+
181+
def test_df_rolling_count(self):
182+
self._test_df_rolling_method('count')
183+
184+
def test_df_rolling_kurt(self):
185+
self._test_df_rolling_method('kurt')
186+
187+
def test_df_rolling_max(self):
188+
self._test_df_rolling_method('max')
189+
190+
def test_df_rolling_mean(self):
191+
self._test_df_rolling_method('mean')
192+
193+
def test_df_rolling_median(self):
194+
self._test_df_rolling_method('median')
195+
196+
def test_df_rolling_min(self):
197+
self._test_df_rolling_method('min')
111198

112-
cases = [
113-
TC(name='apply', params='lambda x: np.nan if len(x) == 0 else x.mean()', size=[2 * 10 ** 5]),
114-
TC(name='corr', size=[10 ** 5], params='other', data_num=2),
115-
TC(name='count', size=[8 * 10 ** 5]),
116-
TC(name='kurt', size=[4 * 10 ** 5]),
117-
TC(name='max', size=[2 * 10 ** 5]),
118-
TC(name='mean', size=[2 * 10 ** 5]),
119-
TC(name='median', size=[2 * 10 ** 5]),
120-
TC(name='min', size=[2 * 10 ** 5]),
121-
TC(name='skew', size=[2 * 10 ** 5])
122-
]
199+
def test_df_rolling_skew(self):
200+
self._test_df_rolling_method('skew')
123201

124-
generate_test_cases(cases, TestDFRollingMethods, 'df', 'rolling({})'.format(get_rolling_params()))
202+
def test_df_rolling_sum(self):
203+
self._test_df_rolling_method('sum')

0 commit comments

Comments
 (0)