Merge branch 'master' into dfat

IntelPython · AlexanderKalistratov · Apr 23, 2020 · Mar 19, 2020 · Mar 20, 2020 · Mar 20, 2020
commit 261b5b0ad7800f244d5a6e596ccd0c217159d753
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -1906,6 +1906,58 @@ def _df_getitem_tuple_at_impl(self, idx):
     return func_text, global_vars
 
 
+def df_getitem_single_label_loc_codegen(self, idx):
+    """
+    Build the source text of the ``DataFrame.loc[label]`` implementation.
+
+    The generated function takes the rows selected by ``idx_list`` from every
+    column, raises ``KeyError`` when ``idx_list`` is empty and returns a new
+    DataFrame built from the taken columns and the matching index values.
+
+    Parameters
+    ----------
+    self
+        DataFrame type being indexed; only ``self.index`` and ``self.columns``
+        are read here.
+    idx
+        Type of the label. It is not inspected by this function; the name only
+        appears as a parameter of the generated implementation.
+
+    Returns
+    -------
+    tuple
+        ``(func_text, global_vars)``: the generated source and the globals
+        dictionary it has to be executed with.
+
+    Example of generated implementation (DataFrame with an explicit index):
+        def _df_getitem_single_label_loc_impl(self, idx):
+            idx_list = find_idx(self._dataframe._index, idx)
+            data_0 = _sdc_take(self._dataframe._data[0], idx_list)
+            res_data_0 = pandas.Series(data_0)
+            data_1 = _sdc_take(self._dataframe._data[1], idx_list)
+            res_data_1 = pandas.Series(data_1)
+            if len(idx_list) < 1:
+                raise KeyError('Index is not in the DataFrame')
+            new_index = _sdc_take(self._dataframe._index, idx_list)
+            return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=new_index)
+    """
+    if isinstance(self.index, types.NoneType):
+        # No explicit index: the label itself is used as the row position and
+        # becomes the only entry of the resulting index.
+        # NOTE(review): this branch never range-checks ``idx``, and the
+        # generated ``len(idx_list) < 1`` test cannot fire for a one-element
+        # array -- confirm how _sdc_take behaves for an out-of-range position.
+        fill_list_text = '  idx_list =  numpy.array([idx])'
+        new_index_text = '  new_index = numpy.array([idx])'
+    else:
+        # Explicit index: find_idx presumably yields the positions whose index
+        # value equals ``idx`` (verify against its definition).
+        fill_list_text = '  idx_list = find_idx(self._dataframe._index, idx)'
+        new_index_text = '  new_index = _sdc_take(self._dataframe._index, idx_list)'
+
+    func_lines = ['def _df_getitem_single_label_loc_impl(self, idx):',
+                  fill_list_text]
+    results = []
+    for i, col in enumerate(self.columns):
+        data = f'data_{i}'
+        res_data = f'res_data_{i}'
+        func_lines += [f'  {data} = _sdc_take(self._dataframe._data[{i}], idx_list)',
+                       f'  {res_data} = pandas.Series({data})']
+        results.append((col, res_data))
+
+    # The emptiness check is emitted after the per-column takes, exactly as in
+    # the example above.
+    func_lines += ['  if len(idx_list) < 1:',
+                   "    raise KeyError('Index is not in the DataFrame')"]
+
+    df_items = ', '.join(f'"{col}": {res_data}' for col, res_data in results)
+    func_lines += [new_index_text,
+                   f'  return pandas.DataFrame({{{df_items}}}, index=new_index)']
+
+    func_text = '\n'.join(func_lines)
+    global_vars = {'pandas': pandas, 'numpy': numpy,
+                   'numba': numba,
+                   '_sdc_take': _sdc_take,
+                   'find_idx': find_idx,
+                   'KeyError': KeyError}
+
+    return func_text, global_vars
+
+
 def df_getitem_int_iloc_codegen(self, idx):
     """
     Example of generated implementation:
@@ -2049,6 +2101,9 @@ def gen_df_getitem_tuple_at_impl(self, row, col):
     return _reduce_impl
 
 
+# Implementation generator for ``DataFrame.loc[label]``: pairs the codegen
+# function with the name of the function defined by the text it produces.
+gen_df_getitem_loc_single_label_impl = gen_impl_generator(
+    df_getitem_single_label_loc_codegen, '_df_getitem_single_label_loc_impl')
+
 gen_df_getitem_iloc_int_impl = gen_impl_generator(
     df_getitem_int_iloc_codegen, '_df_getitem_int_iloc_impl')
 
@@ -2084,6 +2139,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
 
         raise TypingError('Attribute at(). The index must be a row and literal column. Given: {}'.format(idx))
 
+    if accessor == 'loc':
+        if isinstance(idx, (types.Integer, types.UnicodeType, types.StringLiteral)):
+            return gen_df_getitem_loc_single_label_impl(self.dataframe, idx)
+
+        ty_checker = TypeChecker('Attribute loc().')
+        ty_checker.raise_exc(idx, 'int or str', 'idx')
+
     if accessor == 'iat':
         if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
             col = idx[1].literal_value
@@ -2287,6 +2349,57 @@ def sdc_pandas_dataframe_at_impl(self):
     return sdc_pandas_dataframe_at_impl
 
 
+@sdc_overload_attribute(DataFrameType, 'loc')
+def sdc_pandas_dataframe_loc(self):
+    """
+    Intel Scalable Dataframe Compiler User Guide
+    ********************************************
+
+    Pandas API: pandas.DataFrame.loc
+
+    Limitations
+    -----------
+    - ``loc`` always returns a DataFrame.
+    - Parameter ``idx`` must be a single label (an integer or a string), e.g. :obj:`df.loc['A']`.
+
+    Examples
+    --------
+    .. literalinclude:: ../../../examples/dataframe/dataframe_loc.py
+       :language: python
+       :lines: 36-
+       :caption: Access a group of rows and columns by label(s) or a boolean array.
+       :name: ex_dataframe_loc
+
+    .. command-output:: python ./dataframe/dataframe_loc.py
+       :cwd: ../../../examples
+
+    .. seealso::
+        :ref:`DataFrame.at <pandas.DataFrame.at>`
+            Access a single value for a row/column label pair.
+        :ref:`DataFrame.iloc <pandas.DataFrame.iloc>`
+            Access group of rows and columns by integer position(s).
+        :ref:`DataFrame.xs <pandas.DataFrame.xs>`
+            Returns a cross-section (row(s) or column(s)) from the Series/DataFrame.
+        :ref:`Series.loc <pandas.Series.loc>`
+            Access group of values using labels.
+
+    Intel Scalable Dataframe Compiler Developer Guide
+    *************************************************
+    Pandas DataFrame attribute :attr:`pandas.DataFrame.loc` implementation.
+
+    .. only:: developer
+        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_loc*
+    """
+
+    # Validate that the receiver is a DataFrame type (TypeChecker presumably
+    # raises a typing error otherwise -- verify against its definition).
+    ty_checker = TypeChecker('Attribute loc().')
+    ty_checker.check(self, DataFrameType)
+
+    def sdc_pandas_dataframe_loc_impl(self):
+        # Hand back an accessor object tagged 'loc'; the row lookup itself is
+        # performed later, when that accessor is subscripted.
+        return sdc.datatypes.hpat_pandas_dataframe_getitem_types.dataframe_getitem_accessor_init(self, 'loc')
+
+    return sdc_pandas_dataframe_loc_impl
+
+
 @sdc_overload_method(DataFrameType, 'pct_change')
 def pct_change_overload(df, periods=1, fill_method='pad', limit=None, freq=None):
     """

diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -1237,6 +1237,40 @@ def test_impl(df):
         msg = 'Index is not in the Series'
         self.assertIn(msg, str(raises.exception))
 
+    def test_df_loc(self):
+        # df.loc[<int label>] on an explicit integer index in which the
+        # requested label (4) occurs twice; the jitted result must equal the
+        # pandas result as a DataFrame.
+        def test_impl(df):
+            return df.loc[4]
+
+        sdc_func = sdc.jit(test_impl)
+        idx = [3, 4, 1, 4, 0]
+        df = pd.DataFrame({"A": [3.2, 4.4, 7.0, 3.3, 1.0],
+                           "B": [3, 4, 1, 0, 222],
+                           "C": [3.1, 8.4, 7.1, 3.2, 1]}, index=idx)
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
+    def test_df_loc_str(self):
+        # df.loc[<str label>] on a string index with string-valued columns.
+        # Skipped: per the skip reason, SDC always returns a DataFrame, so the
+        # frame comparison against pandas does not hold here.
+        def test_impl(df):
+            return df.loc['c']
+
+        sdc_func = sdc.jit(test_impl)
+        # NOTE(review): the fourth label looks like a Cyrillic 'с' (U+0441),
+        # not a Latin 'c' -- confirm whether a duplicate 'c' or 'd' was intended.
+        idx = ['a', 'b', 'c', 'с', 'e']
+        df = pd.DataFrame({"A": ['3.2', '4.4', '7.0', '3.3', '1.0'],
+                           "B": ['3', '4', '1', '0', '222'],
+                           "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx)
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
+    def test_df_loc_no_idx(self):
+        # df.loc[<int label>] on a DataFrame built without an explicit index.
+        # Skipped: per the skip reason, SDC always returns a DataFrame, so the
+        # frame comparison against pandas does not hold here.
+        def test_impl(df):
+            return df.loc[2]
+
+        sdc_func = sdc.jit(test_impl)
+        df = pd.DataFrame({"A": [3.2, 4.4, 7.0, 3.3, 1.0],
+                           "B": [3, 4, 1, 0, 222],
+                           "C": [3.1, 8.4, 7.1, 3.2, 1]})
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
     def test_df_head(self):
         def get_func(n):
             def impl(a):