Re-implement df structure: refactor len (#868)

akharche · densmirn · web-flow · commit 46c1212d2fcf · 2020-05-29T23:05:20.000+03:00
* Re-implement df structure: refactor len

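* Remove the dfRefactoringNotImplemented decorator from all remaining tests (replaced with unittest.skip where the feature is still unsupported)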

Co-authored-by: Denis <denis.smirnov@intel.com>
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -160,7 +160,7 @@ def df_len_overload(df):
 
     if len(df.columns) == 0:  # empty df
         return lambda df: 0
-    return lambda df: len(df._data[0])
+    return lambda df: len(df._data[0][0])
 
 
 # handle getitem for Tuples because sometimes df._data[i] in
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -53,8 +53,7 @@
                                   skip_numba_jit,
                                   skip_sdc_jit,
                                   test_global_input_data_float64,
-                                  test_global_input_data_unicode_kind4,
-                                  dfRefactoringNotImplemented)
+                                  test_global_input_data_unicode_kind4)
 
 
 @sdc.jit
@@ -156,7 +155,7 @@ def test_impl(A, B, c):
         c = 2
         pd.testing.assert_frame_equal(hpat_func(A, B, c), test_impl(A, B, c))
 
-    @dfRefactoringNotImplemented
+    @unittest.skip('Implement feature to create DataFrame without column names')
     def test_create_without_column_names(self):
         def test_impl():
             df = pd.DataFrame([100, 200, 300, 400, 200, 100])
@@ -1405,7 +1404,7 @@ def test_impl():
         sdc_func = sdc.jit(test_impl)
         pd.testing.assert_frame_equal(sdc_func(), test_impl())
 
-    @dfRefactoringNotImplemented
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
     def test_df_loc_str(self):
         def test_impl(df):
             return df.loc['c']
@@ -1417,7 +1416,7 @@ def test_impl(df):
                            "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx)
         pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
 
-    @dfRefactoringNotImplemented
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
     def test_df_loc_no_idx(self):
         def test_impl(df):
             return df.loc[2]
@@ -2658,7 +2657,6 @@ def test_impl():
         self.assertTrue(isinstance(two, np.ndarray))
         self.assertTrue(isinstance(three, np.ndarray))
 
-    @dfRefactoringNotImplemented
     def test_df_len(self):
         def test_impl(df):
             return len(df)
@@ -2709,27 +2707,27 @@ def test_impl():
         hpat_func = self.jit(test_impl)
         pd.testing.assert_series_equal(hpat_func(), test_impl())
 
-    @dfRefactoringNotImplemented
     def test_df_iterate_over_columns2(self):
         """ Verifies iteration over unboxed df columns using literal unroll. """
         from sdc.hiframes.api import get_nan_mask
 
         @self.jit
-        def jitted_func(df):
+        def jitted_func():
+            cols = ('A', 'B', 'C', 'D')
+            df = pd.DataFrame({
+                'A': ['a', 'b', None, 'a', '', None, 'b'],
+                'B': ['a', 'b', 'd', 'a', '', 'c', 'b'],
+                'C': [np.nan, 1, 2, 1, np.nan, 2, 1],
+                'D': [1, 2, 9, 5, 2, 1, 0]
+            })
             res_nan_mask = np.zeros(len(df), dtype=np.bool_)
-            for col in literal_unroll(df._data):
-                res_nan_mask += get_nan_mask(col)
+            for col in literal_unroll(cols):
+                res_nan_mask += get_nan_mask(df[col].values)
             return res_nan_mask
 
-        df = pd.DataFrame({
-                    'A': ['a', 'b', None, 'a', '', None, 'b'],
-                    'B': ['a', 'b', 'd', 'a', '', 'c', 'b'],
-                    'C': [np.nan, 1, 2, 1, np.nan, 2, 1],
-                    'D': [1, 2, 9, 5, 2, 1, 0]
-        })
         # expected is a boolean mask of df rows that have None values
         expected = np.asarray([True, False, True, False, True, True, False])
-        result = jitted_func(df)
+        result = jitted_func()
         np.testing.assert_array_equal(result, expected)
 
 
diff --git a/sdc/tests/test_groupby.py b/sdc/tests/test_groupby.py
@@ -42,8 +42,7 @@
                                   get_start_end,
                                   skip_numba_jit,
                                   skip_sdc_jit,
-                                  sdc_limitation,
-                                  dfRefactoringNotImplemented)
+                                  sdc_limitation)
 from sdc.tests.test_series import gen_frand_array
 
 
diff --git a/sdc/tests/test_rolling.py b/sdc/tests/test_rolling.py
@@ -41,8 +41,7 @@
 from sdc.tests.test_series import gen_frand_array
 from sdc.tests.test_utils import (count_array_REPs, count_parfor_REPs,
                                   skip_numba_jit, skip_sdc_jit,
-                                  test_global_input_data_float64,
-                                  dfRefactoringNotImplemented)
+                                  test_global_input_data_float64)
 
 
 LONG_TEST = (int(os.environ['SDC_LONG_ROLLING_TEST']) != 0
diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py
@@ -212,9 +212,6 @@ def skip_inline(msg_or_func):
     return wrapper(func) if func else wrapper
 
 
-dfRefactoringNotImplemented = unittest.expectedFailure
-
-
 def take_k_elements(k, data, repeat=False, seed=None):
     if seed is not None:
         np.random.seed(seed)