Overload df.rolling.sum() (#480)

densmirn · web-flow · commit 3a537d3de0cb · 2020-01-30T18:49:25.000+03:00
* Overload df.rolling.sum()

* Add perf.test for df.rolling.sum()
diff --git a/examples/dataframe/dataframe_rolling_sum.py b/examples/dataframe/dataframe_rolling_sum.py
@@ -0,0 +1,41 @@
+# *****************************************************************************
+# Copyright (c) 2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+#     Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pandas as pd
+from numba import njit
+
+
+@njit
+def df_rolling_sum():
+    df = pd.DataFrame({'A': [4, 3, 5, 2, 6], 'B': [-4, -3, -5, -2, -6]})
+    out_df = df.rolling(3).sum()
+
+    # Expect DataFrame of
+    # {'A': [NaN, NaN, 12.0, 10.0, 13.0], 'B': [NaN, NaN, -12.0, -10.0, -13.0]}
+    return out_df
+
+
+print(df_rolling_sum())
diff --git a/sdc/datatypes/hpat_pandas_dataframe_rolling_functions.py b/sdc/datatypes/hpat_pandas_dataframe_rolling_functions.py
@@ -361,6 +361,15 @@ def sdc_pandas_dataframe_rolling_skew(self):
     return gen_df_rolling_method_impl('skew', self)
 
 
+@sdc_overload_method(DataFrameRollingType, 'sum')
+def sdc_pandas_dataframe_rolling_sum(self):
+
+    ty_checker = TypeChecker('Method rolling.sum().')
+    ty_checker.check(self, DataFrameRollingType)
+
+    return gen_df_rolling_method_impl('sum', self)
+
+
 sdc_pandas_dataframe_rolling_apply.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
     'method_name': 'apply',
     'example_caption': 'Calculate the rolling apply.',
@@ -452,3 +461,15 @@ def sdc_pandas_dataframe_rolling_skew(self):
     'limitations_block': '',
     'extra_params': ''
 })
+
+sdc_pandas_dataframe_rolling_sum.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
+    'method_name': 'sum',
+    'example_caption': 'Calculate the rolling sum.',
+    'limitations_block':
+    """
+    Limitations
+    -----------
+    DataFrame elements cannot be max/min float/integer. Otherwise SDC and Pandas results are different.
+    """,
+    'extra_params': ''
+})
diff --git a/sdc/tests/test_rolling.py b/sdc/tests/test_rolling.py
@@ -721,6 +721,21 @@ def test_impl(obj, window, min_periods):
                     jit_result = hpat_func(obj, window, min_periods)
                     assert_equal(jit_result, ref_result)
 
+    def _test_rolling_sum(self, obj):
+        def test_impl(obj, window, min_periods):
+            return obj.rolling(window, min_periods).sum()
+
+        hpat_func = self.jit(test_impl)
+        assert_equal = self._get_assert_equal(obj)
+
+        for window in range(0, len(obj) + 3, 2):
+            for min_periods in range(0, window + 1, 2):
+                with self.subTest(obj=obj, window=window,
+                                  min_periods=min_periods):
+                    jit_result = hpat_func(obj, window, min_periods)
+                    ref_result = test_impl(obj, window, min_periods)
+                    assert_equal(jit_result, ref_result)
+
     @skip_sdc_jit('DataFrame.rolling.min() unsupported exceptions')
     def test_df_rolling_unsupported_values(self):
         all_data = test_global_input_data_float64
@@ -923,6 +938,19 @@ def test_df_rolling_skew(self):
 
         self._test_rolling_skew(df)
 
+    @skip_sdc_jit('DataFrame.rolling.sum() unsupported')
+    def test_df_rolling_sum(self):
+        all_data = [
+            list(range(10)), [1., -1., 0., 0.1, -0.1],
+            [1., np.inf, np.inf, -1., 0., np.inf, np.NINF, np.NINF],
+            [np.nan, np.inf, np.inf, np.nan, np.nan, np.nan, np.NINF, np.NZERO]
+        ]
+        length = min(len(d) for d in all_data)
+        data = {n: d[:length] for n, d in zip(string.ascii_uppercase, all_data)}
+        df = pd.DataFrame(data)
+
+        self._test_rolling_sum(df)
+
     @skip_sdc_jit('Series.rolling.min() unsupported exceptions')
     def test_series_rolling_unsupported_values(self):
         series = pd.Series(test_global_input_data_float64[0])
@@ -1224,11 +1252,6 @@ def test_series_rolling_std_exception_unsupported_ddof(self):
 
     @skip_sdc_jit('Series.rolling.sum() unsupported Series index')
     def test_series_rolling_sum(self):
-        def test_impl(series, window, min_periods):
-            return series.rolling(window, min_periods).sum()
-
-        hpat_func = self.jit(test_impl)
-
         all_data = [
             list(range(10)), [1., -1., 0., 0.1, -0.1],
             [1., np.inf, np.inf, -1., 0., np.inf, np.NINF, np.NINF],
@@ -1237,13 +1260,7 @@ def test_impl(series, window, min_periods):
         indices = [list(range(len(data)))[::-1] for data in all_data]
         for data, index in zip(all_data, indices):
             series = pd.Series(data, index, name='A')
-            for window in range(0, len(series) + 3, 2):
-                for min_periods in range(0, window + 1, 2):
-                    with self.subTest(series=series, window=window,
-                                      min_periods=min_periods):
-                        jit_result = hpat_func(series, window, min_periods)
-                        ref_result = test_impl(series, window, min_periods)
-                        pd.testing.assert_series_equal(jit_result, ref_result)
+            self._test_rolling_sum(series)
 
     @skip_sdc_jit('Series.rolling.var() unsupported Series index')
     def test_series_rolling_var(self):
diff --git a/sdc/tests/tests_perf/test_perf_df_rolling.py b/sdc/tests/tests_perf/test_perf_df_rolling.py
@@ -28,15 +28,22 @@
 import time
 
 import numba
-import numpy as np
+import numpy
 import pandas
 
 from sdc.tests.test_utils import test_global_input_data_float64
 from sdc.tests.tests_perf.test_perf_base import TestBase
 from sdc.tests.tests_perf.test_perf_utils import (calc_compilation, get_times,
                                                   perf_data_gen_fixed_len)
-from .generator import generate_test_cases
-from .generator import TestCase as TC
+
+
+rolling_usecase_tmpl = """
+def df_rolling_{method_name}_usecase(data, {extra_usecase_params}):
+    start_time = time.time()
+    res = data.rolling({rolling_params}).{method_name}({method_params})
+    end_time = time.time()
+    return end_time - start_time, res
+"""
 
 
 def get_rolling_params(window=100, min_periods=None):
@@ -48,6 +55,27 @@ def get_rolling_params(window=100, min_periods=None):
     return ', '.join(rolling_params)
 
 
+def gen_df_rolling_usecase(method_name, rolling_params=None,
+                           extra_usecase_params='', method_params=''):
+    """Build a timed `data.rolling(...).<method_name>(...)` use case.
+
+    The function source is rendered from `rolling_usecase_tmpl` and compiled
+    with `exec`; falsy `rolling_params` falls back to `get_rolling_params()`.
+    The generated function returns `(elapsed_seconds, result)`.
+    """
+    func_name = f'df_rolling_{method_name}_usecase'
+    func_text = rolling_usecase_tmpl.format(
+        method_name=method_name,
+        extra_usecase_params=extra_usecase_params,
+        rolling_params=rolling_params or get_rolling_params(),
+        method_params=method_params,
+    )
+
+    namespace = {}
+    exec(func_text, {'np': numpy, 'time': time}, namespace)
+    return namespace[func_name]
+
+
 # python -m sdc.runtests sdc.tests.tests_perf.test_perf_df_rolling.TestDFRollingMethods
 class TestDFRollingMethods(TestBase):
     # more than 19 columns raise SystemError: CPUDispatcher() returned a result with an error set
@@ -56,6 +84,19 @@ class TestDFRollingMethods(TestBase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
+        cls.total_data_length = {
+            'apply': [2 * 10 ** 5],
+            'corr': [10 ** 5],
+            'count': [8 * 10 ** 5],
+            'cov': [10 ** 5],
+            'kurt': [4 * 10 ** 5],
+            'max': [2 * 10 ** 5],
+            'mean': [2 * 10 ** 5],
+            'median': [2 * 10 ** 5],
+            'min': [2 * 10 ** 5],
+            'skew': [2 * 10 ** 5],
+            'sum': [2 * 10 ** 5],
+        }
 
     def _test_jitted(self, pyfunc, record, *args, **kwargs):
         # compilation time
@@ -67,36 +108,47 @@ def _test_jitted(self, pyfunc, record, *args, **kwargs):
         cfunc(*args, **kwargs)
 
         # execution and boxing time
-        record['test_results'], record['boxing_results'] = get_times(cfunc, *args, **kwargs)
+        record['test_results'], record['boxing_results'] = get_times(cfunc,
+                                                                     *args,
+                                                                     **kwargs)
 
     def _test_python(self, pyfunc, record, *args, **kwargs):
         record['test_results'], _ = get_times(pyfunc, *args, **kwargs)
 
-    def _test_case(self, pyfunc, name, total_data_length, data_num=1,
-                   input_data=test_global_input_data_float64, columns_num=10):
-
+    def _gen_df(self, data, columns_num=10):
+        """Generate DataFrame based on input data"""
+        return pandas.DataFrame(
+            {col: data for col in string.ascii_uppercase[:columns_num]})
+
+    def _test_case(self, pyfunc, name,
+                   input_data=test_global_input_data_float64,
+                   columns_num=10, extra_data_num=0):
+        """
+        Test DataFrame.rolling method
+        :param pyfunc: Python function to test which calls tested method inside
+        :param name: name of the tested method, e.g. min
+        :param input_data: initial data used for generating test data
+        :param columns_num: number of columns in generated DataFrame
+        :param extra_data_num: number of additionally generated DataFrames
+        """
         if columns_num > self.max_columns_num:
             columns_num = self.max_columns_num
 
-        if input_data is None:
-            input_data = test_global_input_data_float64
-
-        test_name = 'DataFrame.rolling.{}'.format(name)
-
         full_input_data_length = sum(len(i) for i in input_data)
-        for data_length in total_data_length:
+        for data_length in self.total_data_length[name]:
             base = {
-                'test_name': test_name,
+                'test_name': f'DF.rolling.{name}',
                 'data_size': data_length,
             }
-            data = perf_data_gen_fixed_len(input_data, full_input_data_length, data_length)
-            test_data = pandas.DataFrame({col: data for col in string.ascii_uppercase[:columns_num]})
+            data = perf_data_gen_fixed_len(input_data, full_input_data_length,
+                                           data_length)
+            test_data = self._gen_df(data, columns_num=columns_num)
 
             args = [test_data]
-            for i in range(data_num - 1):
-                np.random.seed(i)
-                extra_data = np.random.ranf(data_length)
-                args.append(pandas.DataFrame({col: extra_data for col in string.ascii_uppercase[:columns_num]}))
+            for i in range(extra_data_num):
+                numpy.random.seed(i)
+                extra_data = numpy.random.ranf(data_length)
+                args.append(self._gen_df(extra_data, columns_num=columns_num))
 
             record = base.copy()
             record['test_type'] = 'SDC'
@@ -108,17 +160,44 @@ def _test_case(self, pyfunc, name, total_data_length, data_num=1,
             self._test_python(pyfunc, record, *args)
             self.test_results.add(**record)
 
+    def _test_df_rolling_method(self, name, rolling_params=None,
+                                extra_usecase_params='', method_params=''):
+        """Generate the use case for rolling method `name` and measure it."""
+        usecase = gen_df_rolling_usecase(name, rolling_params=rolling_params,
+                                         extra_usecase_params=extra_usecase_params,
+                                         method_params=method_params)
+        # one extra DataFrame per extra use case parameter; ignore blank names
+        extra_names = [p for p in extra_usecase_params.split(',') if p.strip()]
+        self._test_case(usecase, name, extra_data_num=len(extra_names))
+
+    def test_df_rolling_apply_mean(self):
+        method_params = 'lambda x: np.nan if len(x) == 0 else x.mean()'
+        self._test_df_rolling_method('apply', method_params=method_params)
+
+    def test_df_rolling_corr(self):
+        self._test_df_rolling_method('corr', extra_usecase_params='other',
+                                     method_params='other=other')
+
+    def test_df_rolling_count(self):
+        self._test_df_rolling_method('count')
+
+    def test_df_rolling_kurt(self):
+        self._test_df_rolling_method('kurt')
+
+    def test_df_rolling_max(self):
+        self._test_df_rolling_method('max')
+
+    def test_df_rolling_mean(self):
+        self._test_df_rolling_method('mean')
+
+    def test_df_rolling_median(self):
+        self._test_df_rolling_method('median')
+
+    def test_df_rolling_min(self):
+        self._test_df_rolling_method('min')
 
-cases = [
-    TC(name='apply', params='lambda x: np.nan if len(x) == 0 else x.mean()', size=[2 * 10 ** 5]),
-    TC(name='corr', size=[10 ** 5], params='other', data_num=2),
-    TC(name='count', size=[8 * 10 ** 5]),
-    TC(name='kurt', size=[4 * 10 ** 5]),
-    TC(name='max', size=[2 * 10 ** 5]),
-    TC(name='mean', size=[2 * 10 ** 5]),
-    TC(name='median', size=[2 * 10 ** 5]),
-    TC(name='min', size=[2 * 10 ** 5]),
-    TC(name='skew', size=[2 * 10 ** 5])
-]
+    def test_df_rolling_skew(self):
+        self._test_df_rolling_method('skew')
 
-generate_test_cases(cases, TestDFRollingMethods, 'df', 'rolling({})'.format(get_rolling_params()))
+    def test_df_rolling_sum(self):
+        self._test_df_rolling_method('sum')