Re-implement df.getitem based on new structure (#845)

densmirn · web-flow · commit c06c3497009a · 2020-05-19T17:20:24.000+03:00
* Re-implement df.getitem based on new structure

* Re-implemented remaining getitem overloads, add tests
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -416,13 +416,13 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexe
 #           return pandas.Series([result_A, result_B], ['A', 'B'])
 
 
-def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns, df_structure):
+def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns, column_loc):
     result_name_list = []
     joined = ', '.join(func_params)
     func_lines = [f'def _df_{func_name}_impl({joined}):']
     for i, c in enumerate(columns):
-        type_id = df_structure[c].type_id
-        col_id = df_structure[c].col_type_id
+        col_loc = column_loc[c]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         result_c = f'result_{i}'
         func_lines += [f'  series_{i} = pandas.Series({func_params[0]}._data[{type_id}][{col_id}])',
                        f'  {result_c} = series_{i}.{func_name}({series_params})']
@@ -452,7 +452,7 @@ def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):
     df_func_name = f'_df_{func_name}_impl'
 
     func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns,
-                                                               df.df_structure)
+                                                               df.column_loc)
     loc_vars = {}
     exec(func_text, global_vars, loc_vars)
     _reduce_impl = loc_vars[df_func_name]
@@ -1453,7 +1453,7 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
 def df_length_expr(self):
     """Generate expression to get length of DF"""
     if self.columns:
-        return 'len(self._data[0])'
+        return 'len(self._data[0][0])'
 
     return '0'
 
@@ -1475,16 +1475,22 @@ def df_index_expr(self, length_expr=None, as_range=False):
 def df_getitem_slice_idx_main_codelines(self, idx):
     """Generate main code lines for df.getitem with idx of slice"""
     results = []
-    func_lines = [f'  res_index = {df_index_expr(self)}']
+    func_lines = [
+        f'  self_index = {df_index_expr(self)}',
+        f'  index = self_index[idx]',
+    ]
     for i, col in enumerate(self.columns):
+        col_loc = self.column_loc[col]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         res_data = f'res_data_{i}'
         func_lines += [
-            f'  {res_data} = pandas.Series((self._data[{i}])[idx], index=res_index[idx], name="{col}")'
+            f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
+            f'  {res_data} = pandas.Series(data_{i}, index=index, name="{col}")',
         ]
         results.append((col, res_data))
 
     data = ', '.join(f'"{col}": {data}' for col, data in results)
-    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=res_index[idx])']
+    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=index)']
 
     return func_lines
 
@@ -1495,9 +1501,11 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
     func_lines = [f'  res_index = {df_index_expr(self)}']
     needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx}
     for col, i in needed_cols.items():
+        col_loc = self.column_loc[col]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         res_data = f'res_data_{i}'
         func_lines += [
-            f'  data_{i} = self._data [{i}]',
+            f'  data_{i} = self._data[{type_id}][{col_id}]',
             f'  {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
         ]
         results.append((col, res_data))
@@ -1510,23 +1518,28 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
 
 def df_getitem_bool_series_idx_main_codelines(self, idx):
     """Generate main code lines for df.getitem"""
+    length_expr = df_length_expr(self)
 
     # optimization for default indexes in df and idx when index alignment is trivial
-    if (isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType)):
-        func_lines = [f'  length = {df_length_expr(self)}',
-                      f'  self_index = {df_index_expr(self, as_range=True)}',
-                      f'  if length > len(idx):',
-                      f'    msg = "Unalignable boolean Series provided as indexer " + \\',
-                      f'          "(index of the boolean Series and of the indexed object do not match)."',
-                      f'    raise IndexingError(msg)',
-                      f'  # do not trim idx._data to length as getitem_by_mask handles such case',
-                      f'  res_index = getitem_by_mask(self_index, idx._data)',
-                      f'  # df index is default, same as positions so it can be used in take']
+    if isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType):
+        func_lines = [
+            f'  length = {length_expr}',
+            f'  self_index = {df_index_expr(self, length_expr=length_expr, as_range=True)}',
+            f'  if length > len(idx):',
+            f'    msg = "Unalignable boolean Series provided as indexer " + \\',
+            f'          "(index of the boolean Series and of the indexed object do not match)."',
+            f'    raise IndexingError(msg)',
+            f'  # do not trim idx._data to length as getitem_by_mask handles such case',
+            f'  res_index = getitem_by_mask(self_index, idx._data)',
+            f'  # df index is default, same as positions so it can be used in take'
+        ]
         results = []
         for i, col in enumerate(self.columns):
+            col_loc = self.column_loc[col]
+            type_id, col_id = col_loc.type_id, col_loc.col_id
             res_data = f'res_data_{i}'
             func_lines += [
-                f'  data_{i} = self._data[{i}]',
+                f'  data_{i} = self._data[{type_id}][{col_id}]',
                 f'  {res_data} = sdc_take(data_{i}, res_index)'
             ]
             results.append((col, res_data))
@@ -1536,17 +1549,20 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
             f'  return pandas.DataFrame({{{data}}}, index=res_index)'
         ]
     else:
-        func_lines = [f'  length = {df_length_expr(self)}',
-                      f'  self_index = self.index',
-                      f'  reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
-                      f'  res_index = getitem_by_mask(self_index, reindexed_idx._data)',
-                      f'  selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)']
-
+        func_lines = [
+            f'  length = {length_expr}',
+            f'  self_index = self.index',
+            f'  reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
+            f'  res_index = getitem_by_mask(self_index, reindexed_idx._data)',
+            f'  selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)'
+        ]
         results = []
         for i, col in enumerate(self.columns):
+            col_loc = self.column_loc[col]
+            type_id, col_id = col_loc.type_id, col_loc.col_id
             res_data = f'res_data_{i}'
             func_lines += [
-                f'  data_{i} = self._data[{i}]',
+                f'  data_{i} = self._data[{type_id}][{col_id}]',
                 f'  {res_data} = sdc_take(data_{i}, selected_pos)'
             ]
             results.append((col, res_data))
@@ -1570,9 +1586,11 @@ def df_getitem_bool_array_idx_main_codelines(self, idx):
                   f'  res_index = sdc_take(self_index, taken_pos)']
     results = []
     for i, col in enumerate(self.columns):
+        col_loc = self.column_loc[col]
+        type_id, col_id = col_loc.type_id, col_loc.col_id
         res_data = f'res_data_{i}'
         func_lines += [
-            f'  data_{i} = self._data[{i}]',
+            f'  data_{i} = self._data[{type_id}][{col_id}]',
             f'  {res_data} = sdc_take(data_{i}, taken_pos)'
         ]
         results.append((col, res_data))
@@ -1593,13 +1611,13 @@ def df_getitem_key_error_codelines():
 def df_getitem_slice_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
-        def _df_getitem_slice_idx_impl(self, idx)
-          res_index = self._index
-          data_0 = self._data[0]
-          res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
-          data_1 = self._data [1]
-          res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
-          return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
+        def _df_getitem_slice_idx_impl(self, idx):
+          self_index = numpy.arange(len(self._data[0][0]))
+          index = self_index[idx]
+          data_0 = self._data[0][0][idx]
+          res_data_0 = pandas.Series(data_0, index=index, name="A")
+          data_1 = self._data[1][0][idx]
+          return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=index)
     """
     func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
     if self.columns:
@@ -1616,13 +1634,13 @@ def _df_getitem_slice_idx_impl(self, idx)
 def df_getitem_tuple_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
-        def _df_getitem_tuple_idx_impl(self, idx)
-          res_index = self._index
-          data_1 = self._data[1]
-          res_data_1 = pandas.Series(data_1, index=res_index, name="B")
-          data_2 = self._data[2]
+        def _df_getitem_tuple_idx_impl(self, idx):
+          res_index = numpy.arange(len(self._data[0][0]))
+          data_0 = self._data[0][0]
+          res_data_0 = pandas.Series(data_0, index=res_index, name="A")
+          data_2 = self._data[0][1]
           res_data_2 = pandas.Series(data_2, index=res_index, name="C")
-          return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index)
+          return pandas.DataFrame({"A": res_data_0, "C": res_data_2}, index=res_index)
     """
     func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):']
     literal_idx = {col.literal_value for col in idx}
@@ -1644,18 +1662,18 @@ def df_getitem_bool_series_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
         def _df_getitem_bool_series_idx_impl(self, idx):
-          length = len(self._data[0])
-          self_index = range(len(self._data[0]))
+          length = len(self._data[0][0])
+          self_index = range(len(self._data[0][0]))
           if length > len(idx):
             msg = "Unalignable boolean Series provided as indexer " + \
                   "(index of the boolean Series and of the indexed object do not match)."
             raise IndexingError(msg)
           # do not trim idx._data to length as getitem_by_mask handles such case
           res_index = getitem_by_mask(self_index, idx._data)
           # df index is default, same as positions so it can be used in take
-          data_0 = self._data[0]
+          data_0 = self._data[0][0]
           res_data_0 = sdc_take(data_0, res_index)
-          data_1 = self._data[1]
+          data_1 = self._data[1][0]
           res_data_1 = sdc_take(data_1, res_index)
           return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
     """
@@ -1675,15 +1693,15 @@ def df_getitem_bool_array_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
         def _df_getitem_bool_array_idx_impl(self, idx):
-          length = len(self._data[0])
+          length = len(self._data[0][0])
           if length != len(idx):
             raise ValueError("Item wrong length.")
-          self_index = range(len(self._data[0]))
+          self_index = range(len(self._data[0][0]))
           taken_pos = getitem_by_mask(self_index, idx)
           res_index = sdc_take(self_index, taken_pos)
-          data_0 = self._data[0]
+          data_0 = self._data[0][0]
           res_data_0 = sdc_take(data_0, taken_pos)
-          data_1 = self._data[1]
+          data_1 = self._data[1][0]
           res_data_1 = sdc_take(data_1, taken_pos)
           return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
     """
@@ -1823,15 +1841,16 @@ def sdc_pandas_dataframe_getitem(self, idx):
         return None
 
     if isinstance(idx, types.StringLiteral):
-        try:
-            col_idx = self.columns.index(idx.literal_value)
-            key_error = False
-        except ValueError:
+        col_loc = self.column_loc.get(idx.literal_value)
+        if col_loc is None:
             key_error = True
+        else:
+            type_id, col_id = col_loc.type_id, col_loc.col_id
+            key_error = False
 
         def _df_getitem_str_literal_idx_impl(self, idx):
             if key_error == False:  # noqa
-                data = self._data[col_idx]
+                data = self._data[type_id][col_id]
                 return pandas.Series(data, index=self._index, name=idx)
             else:
                 raise KeyError('Column is not in the DataFrame')
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -54,9 +54,9 @@ def generic_resolve(self, df, attr):
             return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
 
 
-class ColumnId(NamedTuple):
+class ColumnLoc(NamedTuple):
     type_id: int
-    col_type_id: int
+    col_id: int
 
 
 @intrinsic
@@ -73,27 +73,27 @@ def init_dataframe(typingctx, *args):
     index_typ = args[n_cols]
     column_names = tuple(a.literal_value for a in args[n_cols + 1:])
 
-    # Define df structure, map column name to column position ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
-    df_structure = {}
+    # Map each column name to its location (type_id, col_id), e.g. {'A': (0, 0), 'B': (1, 0), 'C': (0, 1)}
+    column_loc = {}
     # Store unique types of columns ex. {'int64': (0, [0, 2]), 'float64': (1, [1])}
     data_typs_map = {}
     types_order = []
     type_id = 0
-    for col_id, col_typ in enumerate(data_typs):
-        col_name = column_names[col_id]
+    for i, col_typ in enumerate(data_typs):
+        col_name = column_names[i]
 
         if col_typ not in data_typs_map:
-            data_typs_map[col_typ] = (type_id, [col_id])
+            data_typs_map[col_typ] = (type_id, [i])
             # The first column in each type always has 0 index
-            df_structure[col_name] = ColumnId(type_id, 0)
+            column_loc[col_name] = ColumnLoc(type_id, col_id=0)
             types_order.append(col_typ)
             type_id += 1
         else:
             # Get index of column in list of types
-            type_idx, col_indices = data_typs_map[col_typ]
-            col_idx_list = len(col_indices)
-            df_structure[col_name] = ColumnId(type_idx, col_idx_list)
-            col_indices.append(col_id)
+            existing_type_id, col_indices = data_typs_map[col_typ]
+            col_id = len(col_indices)
+            column_loc[col_name] = ColumnLoc(existing_type_id, col_id)
+            col_indices.append(i)
 
     def codegen(context, builder, signature, args):
         in_tup = args[0]
@@ -134,7 +134,7 @@ def codegen(context, builder, signature, args):
 
         return dataframe._getvalue()
 
-    ret_typ = DataFrameType(data_typs, index_typ, column_names, df_structure=df_structure)
+    ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc)
     sig = signature(ret_typ, types.Tuple(args))
     return sig, codegen
 
diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py
@@ -37,7 +37,7 @@ class DataFrameType(types.Type):  # TODO: IterableType over column names
     """Temporary type class for DataFrame objects.
     """
 
-    def __init__(self, data=None, index=None, columns=None, has_parent=False, df_structure=None):
+    def __init__(self, data=None, index=None, columns=None, has_parent=False, column_loc=None):
         self.data = data
         if index is None:
             index = types.none
@@ -46,7 +46,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, df_str
         # keeping whether it is unboxed from Python to enable reflection of new
         # columns
         self.has_parent = has_parent
-        self.df_structure = df_structure
+        self.column_loc = column_loc
         super(DataFrameType, self).__init__(
             name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))
 
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py