Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit c06c349

Browse files
authored
Re-implement df.getitem based on new structure (#845)
* Re-implement df.getitem based on new structure
* Re-implemented remaining getitem overloads, add tests
1 parent 682f5fc commit c06c349

File tree

4 files changed

+190
-81
lines changed

4 files changed

+190
-81
lines changed

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 73 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -416,13 +416,13 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexe
416416
# return pandas.Series([result_A, result_B], ['A', 'B'])
417417

418418

419-
def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns, df_structure):
419+
def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns, column_loc):
420420
result_name_list = []
421421
joined = ', '.join(func_params)
422422
func_lines = [f'def _df_{func_name}_impl({joined}):']
423423
for i, c in enumerate(columns):
424-
type_id = df_structure[c].type_id
425-
col_id = df_structure[c].col_type_id
424+
col_loc = column_loc[c]
425+
type_id, col_id = col_loc.type_id, col_loc.col_id
426426
result_c = f'result_{i}'
427427
func_lines += [f' series_{i} = pandas.Series({func_params[0]}._data[{type_id}][{col_id}])',
428428
f' {result_c} = series_{i}.{func_name}({series_params})']
@@ -452,7 +452,7 @@ def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):
452452
df_func_name = f'_df_{func_name}_impl'
453453

454454
func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns,
455-
df.df_structure)
455+
df.column_loc)
456456
loc_vars = {}
457457
exec(func_text, global_vars, loc_vars)
458458
_reduce_impl = loc_vars[df_func_name]
@@ -1453,7 +1453,7 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
14531453
def df_length_expr(self):
14541454
"""Generate expression to get length of DF"""
14551455
if self.columns:
1456-
return 'len(self._data[0])'
1456+
return 'len(self._data[0][0])'
14571457

14581458
return '0'
14591459

@@ -1475,16 +1475,22 @@ def df_index_expr(self, length_expr=None, as_range=False):
14751475
def df_getitem_slice_idx_main_codelines(self, idx):
14761476
"""Generate main code lines for df.getitem with idx of slice"""
14771477
results = []
1478-
func_lines = [f' res_index = {df_index_expr(self)}']
1478+
func_lines = [
1479+
f' self_index = {df_index_expr(self)}',
1480+
f' index = self_index[idx]',
1481+
]
14791482
for i, col in enumerate(self.columns):
1483+
col_loc = self.column_loc[col]
1484+
type_id, col_id = col_loc.type_id, col_loc.col_id
14801485
res_data = f'res_data_{i}'
14811486
func_lines += [
1482-
f' {res_data} = pandas.Series((self._data[{i}])[idx], index=res_index[idx], name="{col}")'
1487+
f' data_{i} = self._data[{type_id}][{col_id}][idx]',
1488+
f' {res_data} = pandas.Series(data_{i}, index=index, name="{col}")',
14831489
]
14841490
results.append((col, res_data))
14851491

14861492
data = ', '.join(f'"{col}": {data}' for col, data in results)
1487-
func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])']
1493+
func_lines += [f' return pandas.DataFrame({{{data}}}, index=index)']
14881494

14891495
return func_lines
14901496

@@ -1495,9 +1501,11 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
14951501
func_lines = [f' res_index = {df_index_expr(self)}']
14961502
needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx}
14971503
for col, i in needed_cols.items():
1504+
col_loc = self.column_loc[col]
1505+
type_id, col_id = col_loc.type_id, col_loc.col_id
14981506
res_data = f'res_data_{i}'
14991507
func_lines += [
1500-
f' data_{i} = self._data [{i}]',
1508+
f' data_{i} = self._data[{type_id}][{col_id}]',
15011509
f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
15021510
]
15031511
results.append((col, res_data))
@@ -1510,23 +1518,28 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
15101518

15111519
def df_getitem_bool_series_idx_main_codelines(self, idx):
15121520
"""Generate main code lines for df.getitem"""
1521+
length_expr = df_length_expr(self)
15131522

15141523
# optimization for default indexes in df and idx when index alignment is trivial
1515-
if (isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType)):
1516-
func_lines = [f' length = {df_length_expr(self)}',
1517-
f' self_index = {df_index_expr(self, as_range=True)}',
1518-
f' if length > len(idx):',
1519-
f' msg = "Unalignable boolean Series provided as indexer " + \\',
1520-
f' "(index of the boolean Series and of the indexed object do not match)."',
1521-
f' raise IndexingError(msg)',
1522-
f' # do not trim idx._data to length as getitem_by_mask handles such case',
1523-
f' res_index = getitem_by_mask(self_index, idx._data)',
1524-
f' # df index is default, same as positions so it can be used in take']
1524+
if isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType):
1525+
func_lines = [
1526+
f' length = {length_expr}',
1527+
f' self_index = {df_index_expr(self, length_expr=length_expr, as_range=True)}',
1528+
f' if length > len(idx):',
1529+
f' msg = "Unalignable boolean Series provided as indexer " + \\',
1530+
f' "(index of the boolean Series and of the indexed object do not match)."',
1531+
f' raise IndexingError(msg)',
1532+
f' # do not trim idx._data to length as getitem_by_mask handles such case',
1533+
f' res_index = getitem_by_mask(self_index, idx._data)',
1534+
f' # df index is default, same as positions so it can be used in take'
1535+
]
15251536
results = []
15261537
for i, col in enumerate(self.columns):
1538+
col_loc = self.column_loc[col]
1539+
type_id, col_id = col_loc.type_id, col_loc.col_id
15271540
res_data = f'res_data_{i}'
15281541
func_lines += [
1529-
f' data_{i} = self._data[{i}]',
1542+
f' data_{i} = self._data[{type_id}][{col_id}]',
15301543
f' {res_data} = sdc_take(data_{i}, res_index)'
15311544
]
15321545
results.append((col, res_data))
@@ -1536,17 +1549,20 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15361549
f' return pandas.DataFrame({{{data}}}, index=res_index)'
15371550
]
15381551
else:
1539-
func_lines = [f' length = {df_length_expr(self)}',
1540-
f' self_index = self.index',
1541-
f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
1542-
f' res_index = getitem_by_mask(self_index, reindexed_idx._data)',
1543-
f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)']
1544-
1552+
func_lines = [
1553+
f' length = {length_expr}',
1554+
f' self_index = self.index',
1555+
f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
1556+
f' res_index = getitem_by_mask(self_index, reindexed_idx._data)',
1557+
f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)'
1558+
]
15451559
results = []
15461560
for i, col in enumerate(self.columns):
1561+
col_loc = self.column_loc[col]
1562+
type_id, col_id = col_loc.type_id, col_loc.col_id
15471563
res_data = f'res_data_{i}'
15481564
func_lines += [
1549-
f' data_{i} = self._data[{i}]',
1565+
f' data_{i} = self._data[{type_id}][{col_id}]',
15501566
f' {res_data} = sdc_take(data_{i}, selected_pos)'
15511567
]
15521568
results.append((col, res_data))
@@ -1570,9 +1586,11 @@ def df_getitem_bool_array_idx_main_codelines(self, idx):
15701586
f' res_index = sdc_take(self_index, taken_pos)']
15711587
results = []
15721588
for i, col in enumerate(self.columns):
1589+
col_loc = self.column_loc[col]
1590+
type_id, col_id = col_loc.type_id, col_loc.col_id
15731591
res_data = f'res_data_{i}'
15741592
func_lines += [
1575-
f' data_{i} = self._data[{i}]',
1593+
f' data_{i} = self._data[{type_id}][{col_id}]',
15761594
f' {res_data} = sdc_take(data_{i}, taken_pos)'
15771595
]
15781596
results.append((col, res_data))
@@ -1593,13 +1611,13 @@ def df_getitem_key_error_codelines():
15931611
def df_getitem_slice_idx_codegen(self, idx):
15941612
"""
15951613
Example of generated implementation with provided index:
1596-
def _df_getitem_slice_idx_impl(self, idx)
1597-
res_index = self._index
1598-
data_0 = self._data[0]
1599-
res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
1600-
data_1 = self._data [1]
1601-
res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
1602-
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
1614+
def _df_getitem_slice_idx_impl(self, idx):
1615+
self_index = numpy.arange(len(self._data[0][0]))
1616+
index = self_index[idx]
1617+
data_0 = self._data[0][0][idx]
1618+
res_data_0 = pandas.Series(data_0, index=index, name="A")
1619+
data_1 = self._data[1][0][idx]
res_data_1 = pandas.Series(data_1, index=index, name="B")
1620+
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=index)
16031621
"""
16041622
func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
16051623
if self.columns:
@@ -1616,13 +1634,13 @@ def _df_getitem_slice_idx_impl(self, idx)
16161634
def df_getitem_tuple_idx_codegen(self, idx):
16171635
"""
16181636
Example of generated implementation with provided index:
1619-
def _df_getitem_tuple_idx_impl(self, idx)
1620-
res_index = self._index
1621-
data_1 = self._data[1]
1622-
res_data_1 = pandas.Series(data_1, index=res_index, name="B")
1623-
data_2 = self._data[2]
1637+
def _df_getitem_tuple_idx_impl(self, idx):
1638+
res_index = numpy.arange(len(self._data[0][0]))
1639+
data_0 = self._data[0][0]
1640+
res_data_0 = pandas.Series(data_0, index=res_index, name="A")
1641+
data_2 = self._data[0][1]
16241642
res_data_2 = pandas.Series(data_2, index=res_index, name="C")
1625-
return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index)
1643+
return pandas.DataFrame({"A": res_data_0, "C": res_data_2}, index=res_index)
16261644
"""
16271645
func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):']
16281646
literal_idx = {col.literal_value for col in idx}
@@ -1644,18 +1662,18 @@ def df_getitem_bool_series_idx_codegen(self, idx):
16441662
"""
16451663
Example of generated implementation with provided index:
16461664
def _df_getitem_bool_series_idx_impl(self, idx):
1647-
length = len(self._data[0])
1648-
self_index = range(len(self._data[0]))
1665+
length = len(self._data[0][0])
1666+
self_index = range(len(self._data[0][0]))
16491667
if length > len(idx):
16501668
msg = "Unalignable boolean Series provided as indexer " + \
16511669
"(index of the boolean Series and of the indexed object do not match)."
16521670
raise IndexingError(msg)
16531671
# do not trim idx._data to length as getitem_by_mask handles such case
16541672
res_index = getitem_by_mask(self_index, idx._data)
16551673
# df index is default, same as positions so it can be used in take
1656-
data_0 = self._data[0]
1674+
data_0 = self._data[0][0]
16571675
res_data_0 = sdc_take(data_0, res_index)
1658-
data_1 = self._data[1]
1676+
data_1 = self._data[1][0]
16591677
res_data_1 = sdc_take(data_1, res_index)
16601678
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
16611679
"""
@@ -1675,15 +1693,15 @@ def df_getitem_bool_array_idx_codegen(self, idx):
16751693
"""
16761694
Example of generated implementation with provided index:
16771695
def _df_getitem_bool_array_idx_impl(self, idx):
1678-
length = len(self._data[0])
1696+
length = len(self._data[0][0])
16791697
if length != len(idx):
16801698
raise ValueError("Item wrong length.")
1681-
self_index = range(len(self._data[0]))
1699+
self_index = range(len(self._data[0][0]))
16821700
taken_pos = getitem_by_mask(self_index, idx)
16831701
res_index = sdc_take(self_index, taken_pos)
1684-
data_0 = self._data[0]
1702+
data_0 = self._data[0][0]
16851703
res_data_0 = sdc_take(data_0, taken_pos)
1686-
data_1 = self._data[1]
1704+
data_1 = self._data[1][0]
16871705
res_data_1 = sdc_take(data_1, taken_pos)
16881706
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
16891707
"""
@@ -1823,15 +1841,16 @@ def sdc_pandas_dataframe_getitem(self, idx):
18231841
return None
18241842

18251843
if isinstance(idx, types.StringLiteral):
1826-
try:
1827-
col_idx = self.columns.index(idx.literal_value)
1828-
key_error = False
1829-
except ValueError:
1844+
col_loc = self.column_loc.get(idx.literal_value)
1845+
if col_loc is None:
18301846
key_error = True
1847+
else:
1848+
type_id, col_id = col_loc.type_id, col_loc.col_id
1849+
key_error = False
18311850

18321851
def _df_getitem_str_literal_idx_impl(self, idx):
18331852
if key_error == False: # noqa
1834-
data = self._data[col_idx]
1853+
data = self._data[type_id][col_id]
18351854
return pandas.Series(data, index=self._index, name=idx)
18361855
else:
18371856
raise KeyError('Column is not in the DataFrame')

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ def generic_resolve(self, df, attr):
5454
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
5555

5656

57-
class ColumnId(NamedTuple):
57+
class ColumnLoc(NamedTuple):
5858
type_id: int
59-
col_type_id: int
59+
col_id: int
6060

6161

6262
@intrinsic
@@ -73,27 +73,27 @@ def init_dataframe(typingctx, *args):
7373
index_typ = args[n_cols]
7474
column_names = tuple(a.literal_value for a in args[n_cols + 1:])
7575

76-
# Define df structure, map column name to column position ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
77-
df_structure = {}
76+
# Map each column name to its location as (type_id, col_id), e.g. {'A': (0, 0), 'B': (1, 0), 'C': (0, 1)}
77+
column_loc = {}
7878
# Map each unique column type to (type_id, [positions of the columns of that type]), e.g. {'int64': (0, [0, 2]), 'float64': (1, [1])}
7979
data_typs_map = {}
8080
types_order = []
8181
type_id = 0
82-
for col_id, col_typ in enumerate(data_typs):
83-
col_name = column_names[col_id]
82+
for i, col_typ in enumerate(data_typs):
83+
col_name = column_names[i]
8484

8585
if col_typ not in data_typs_map:
86-
data_typs_map[col_typ] = (type_id, [col_id])
86+
data_typs_map[col_typ] = (type_id, [i])
8787
# The first column of each type always gets col_id 0
88-
df_structure[col_name] = ColumnId(type_id, 0)
88+
column_loc[col_name] = ColumnLoc(type_id, col_id=0)
8989
types_order.append(col_typ)
9090
type_id += 1
9191
else:
9292
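# Type already seen: the column's col_id is its position among the columns of that type
# NOTE(review): the added line `type_id, col_indices = data_typs_map[col_typ]` rebinds the running
# `type_id` counter, so a new type encountered after a repeated one may reuse an existing id - confirm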
93-
type_idx, col_indices = data_typs_map[col_typ]
94-
col_idx_list = len(col_indices)
95-
df_structure[col_name] = ColumnId(type_idx, col_idx_list)
96-
col_indices.append(col_id)
93+
type_id, col_indices = data_typs_map[col_typ]
94+
col_id = len(col_indices)
95+
column_loc[col_name] = ColumnLoc(type_id, col_id)
96+
col_indices.append(i)
9797

9898
def codegen(context, builder, signature, args):
9999
in_tup = args[0]
@@ -134,7 +134,7 @@ def codegen(context, builder, signature, args):
134134

135135
return dataframe._getvalue()
136136

137-
ret_typ = DataFrameType(data_typs, index_typ, column_names, df_structure=df_structure)
137+
ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc)
138138
sig = signature(ret_typ, types.Tuple(args))
139139
return sig, codegen
140140

sdc/hiframes/pd_dataframe_type.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class DataFrameType(types.Type): # TODO: IterableType over column names
3737
"""Temporary type class for DataFrame objects.
3838
"""
3939

40-
def __init__(self, data=None, index=None, columns=None, has_parent=False, df_structure=None):
40+
def __init__(self, data=None, index=None, columns=None, has_parent=False, column_loc=None):
4141
self.data = data
4242
if index is None:
4343
index = types.none
@@ -46,7 +46,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, df_str
4646
# keeping whether it is unboxed from Python to enable reflection of new
4747
# columns
4848
self.has_parent = has_parent
49-
self.df_structure = df_structure
49+
self.column_loc = column_loc
5050
super(DataFrameType, self).__init__(
5151
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))
5252

0 commit comments

Comments
 (0)