@@ -416,13 +416,13 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexe
416416# return pandas.Series([result_A, result_B], ['A', 'B'])
417417
418418
419- def _dataframe_reduce_columns_codegen (func_name , func_params , series_params , columns , df_structure ):
419+ def _dataframe_reduce_columns_codegen (func_name , func_params , series_params , columns , column_loc ):
420420 result_name_list = []
421421 joined = ', ' .join (func_params )
422422 func_lines = [f'def _df_{ func_name } _impl({ joined } ):' ]
423423 for i , c in enumerate (columns ):
424- type_id = df_structure [c ]. type_id
425- col_id = df_structure [ c ]. col_type_id
424+ col_loc = column_loc [c ]
425+ type_id , col_id = col_loc . type_id , col_loc . col_id
426426 result_c = f'result_{ i } '
427427 func_lines += [f' series_{ i } = pandas.Series({ func_params [0 ]} ._data[{ type_id } ][{ col_id } ])' ,
428428 f' { result_c } = series_{ i } .{ func_name } ({ series_params } )' ]
@@ -452,7 +452,7 @@ def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):
452452 df_func_name = f'_df_{ func_name } _impl'
453453
454454 func_text , global_vars = _dataframe_reduce_columns_codegen (func_name , all_params , s_par , df .columns ,
455- df .df_structure )
455+ df .column_loc )
456456 loc_vars = {}
457457 exec (func_text , global_vars , loc_vars )
458458 _reduce_impl = loc_vars [df_func_name ]
@@ -1453,7 +1453,7 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
def df_length_expr(self):
    """Generate expression to get length of DF

    The returned value is a source-code string, not a number: it is meant to
    be spliced into generated code in which ``self`` names the dataframe.

    :param self: dataframe type object; only ``self.columns`` is read here
    :return: ``'len(self._data[0][0])'`` when ``self.columns`` is non-empty,
        otherwise the literal string ``'0'``
    """
    if self.columns:
        # self._data is addressed with two subscripts ([type_id][col_id] in the
        # sibling code generators), so [0][0] picks one concrete column whose
        # length stands in for the length of the whole dataframe.
        return 'len(self._data[0][0])'

    # no columns: there is nothing to measure, so emit a constant
    return '0'
14591459
@@ -1475,16 +1475,22 @@ def df_index_expr(self, length_expr=None, as_range=False):
def df_getitem_slice_idx_main_codelines(self, idx):
    """Generate main code lines for df.getitem with idx of slice

    The lines form the body of a generated function taking ``(self, idx)``:
    the dataframe index is sliced once, every column is sliced with the same
    ``idx`` and wrapped into a named Series, and a DataFrame is returned.

    :param self: dataframe type object; ``self.columns`` and
        ``self.column_loc`` are read, and it is passed to ``df_index_expr``
    :param idx: slice index type; not inspected here, the generated code
        refers to it by the name ``idx``
    :return: list of source-code lines (strings), all at the same indent
    """
    results = []
    # Slice the index once up front and reuse it for every column Series and
    # for the resulting DataFrame.
    func_lines = [
        f'  self_index = {df_index_expr(self)}',
        f'  index = self_index[idx]',
    ]
    for i, col in enumerate(self.columns):
        # column_loc maps a column name to its position inside self._data
        col_loc = self.column_loc[col]
        type_id, col_id = col_loc.type_id, col_loc.col_id
        res_data = f'res_data_{i}'
        func_lines += [
            f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
            f'  {res_data} = pandas.Series(data_{i}, index=index, name="{col}")',
        ]
        results.append((col, res_data))

    # '"A": res_data_0, "B": res_data_1, ...' - the dict literal contents
    data = ', '.join(f'"{col}": {series_var}' for col, series_var in results)
    # triple braces: '{{' / '}}' emit literal braces around the substituted text
    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=index)']

    return func_lines
14901496
@@ -1495,9 +1501,11 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
14951501 func_lines = [f' res_index = { df_index_expr (self )} ' ]
14961502 needed_cols = {col : i for i , col in enumerate (self .columns ) if col in literal_idx }
14971503 for col , i in needed_cols .items ():
1504+ col_loc = self .column_loc [col ]
1505+ type_id , col_id = col_loc .type_id , col_loc .col_id
14981506 res_data = f'res_data_{ i } '
14991507 func_lines += [
1500- f' data_{ i } = self._data [ { i } ]' ,
1508+ f' data_{ i } = self._data[ { type_id } ][ { col_id } ]' ,
15011509 f' { res_data } = pandas.Series(data_{ i } , index=res_index, name="{ col } ")'
15021510 ]
15031511 results .append ((col , res_data ))
@@ -1510,23 +1518,28 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
15101518
15111519def df_getitem_bool_series_idx_main_codelines (self , idx ):
15121520 """Generate main code lines for df.getitem"""
1521+ length_expr = df_length_expr (self )
15131522
15141523 # optimization for default indexes in df and idx when index alignment is trivial
1515- if (isinstance (self .index , types .NoneType ) and isinstance (idx .index , types .NoneType )):
1516- func_lines = [f' length = { df_length_expr (self )} ' ,
1517- f' self_index = { df_index_expr (self , as_range = True )} ' ,
1518- f' if length > len(idx):' ,
1519- f' msg = "Unalignable boolean Series provided as indexer " + \\ ' ,
1520- f' "(index of the boolean Series and of the indexed object do not match)."' ,
1521- f' raise IndexingError(msg)' ,
1522- f' # do not trim idx._data to length as getitem_by_mask handles such case' ,
1523- f' res_index = getitem_by_mask(self_index, idx._data)' ,
1524- f' # df index is default, same as positions so it can be used in take' ]
1524+ if isinstance (self .index , types .NoneType ) and isinstance (idx .index , types .NoneType ):
1525+ func_lines = [
1526+ f' length = { length_expr } ' ,
1527+ f' self_index = { df_index_expr (self , length_expr = length_expr , as_range = True )} ' ,
1528+ f' if length > len(idx):' ,
1529+ f' msg = "Unalignable boolean Series provided as indexer " + \\ ' ,
1530+ f' "(index of the boolean Series and of the indexed object do not match)."' ,
1531+ f' raise IndexingError(msg)' ,
1532+ f' # do not trim idx._data to length as getitem_by_mask handles such case' ,
1533+ f' res_index = getitem_by_mask(self_index, idx._data)' ,
1534+ f' # df index is default, same as positions so it can be used in take'
1535+ ]
15251536 results = []
15261537 for i , col in enumerate (self .columns ):
1538+ col_loc = self .column_loc [col ]
1539+ type_id , col_id = col_loc .type_id , col_loc .col_id
15271540 res_data = f'res_data_{ i } '
15281541 func_lines += [
1529- f' data_{ i } = self._data[{ i } ]' ,
1542+ f' data_{ i } = self._data[{ type_id } ][ { col_id } ]' ,
15301543 f' { res_data } = sdc_take(data_{ i } , res_index)'
15311544 ]
15321545 results .append ((col , res_data ))
@@ -1536,17 +1549,20 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15361549 f' return pandas.DataFrame({{{ data } }}, index=res_index)'
15371550 ]
15381551 else :
1539- func_lines = [f' length = { df_length_expr (self )} ' ,
1540- f' self_index = self.index' ,
1541- f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)' ,
1542- f' res_index = getitem_by_mask(self_index, reindexed_idx._data)' ,
1543- f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)' ]
1544-
1552+ func_lines = [
1553+ f' length = { length_expr } ' ,
1554+ f' self_index = self.index' ,
1555+ f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)' ,
1556+ f' res_index = getitem_by_mask(self_index, reindexed_idx._data)' ,
1557+ f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)'
1558+ ]
15451559 results = []
15461560 for i , col in enumerate (self .columns ):
1561+ col_loc = self .column_loc [col ]
1562+ type_id , col_id = col_loc .type_id , col_loc .col_id
15471563 res_data = f'res_data_{ i } '
15481564 func_lines += [
1549- f' data_{ i } = self._data[{ i } ]' ,
1565+ f' data_{ i } = self._data[{ type_id } ][ { col_id } ]' ,
15501566 f' { res_data } = sdc_take(data_{ i } , selected_pos)'
15511567 ]
15521568 results .append ((col , res_data ))
@@ -1570,9 +1586,11 @@ def df_getitem_bool_array_idx_main_codelines(self, idx):
15701586 f' res_index = sdc_take(self_index, taken_pos)' ]
15711587 results = []
15721588 for i , col in enumerate (self .columns ):
1589+ col_loc = self .column_loc [col ]
1590+ type_id , col_id = col_loc .type_id , col_loc .col_id
15731591 res_data = f'res_data_{ i } '
15741592 func_lines += [
1575- f' data_{ i } = self._data[{ i } ]' ,
1593+ f' data_{ i } = self._data[{ type_id } ][ { col_id } ]' ,
15761594 f' { res_data } = sdc_take(data_{ i } , taken_pos)'
15771595 ]
15781596 results .append ((col , res_data ))
@@ -1593,13 +1611,13 @@ def df_getitem_key_error_codelines():
15931611def df_getitem_slice_idx_codegen (self , idx ):
15941612 """
15951613 Example of generated implementation with provided index:
1596- def _df_getitem_slice_idx_impl(self, idx)
1597- res_index = self._index
1598- data_0 = self._data[0 ]
1599- res_data_0 = pandas.Series(data_0[idx], index=res_index[ idx], name="A")
1600- data_1 = self._data [1]
1601- res_data_1 = pandas.Series(data_1[ idx], index=res_index, name="B")
1602- return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx] )
1614+ def _df_getitem_slice_idx_impl(self, idx):
1615+ self_index = numpy.arange(len( self._data[0][0]))
1616+ index = self_index[idx ]
1617+ data_0 = self._data[0][0][ idx]
1618+ res_data_0 = pandas.Series(data_0, index=index, name="A")
1619+ data_1 = self._data[1][0][ idx]
1620+ return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=index )
16031621 """
16041622 func_lines = ['def _df_getitem_slice_idx_impl(self, idx):' ]
16051623 if self .columns :
@@ -1616,13 +1634,13 @@ def _df_getitem_slice_idx_impl(self, idx)
16161634def df_getitem_tuple_idx_codegen (self , idx ):
16171635 """
16181636 Example of generated implementation with provided index:
1619- def _df_getitem_tuple_idx_impl(self, idx)
1620- res_index = self._index
1621- data_1 = self._data[1 ]
1622- res_data_1 = pandas.Series(data_1 , index=res_index, name="B ")
1623- data_2 = self._data[2 ]
1637+ def _df_getitem_tuple_idx_impl(self, idx):
1638+ res_index = numpy.arange(len( self._data[0][0]))
1639+ data_0 = self._data[0][0 ]
1640+ res_data_0 = pandas.Series(data_0 , index=res_index, name="A ")
1641+ data_2 = self._data[0][1 ]
16241642 res_data_2 = pandas.Series(data_2, index=res_index, name="C")
1625- return pandas.DataFrame({"B ": res_data_1 , "C": res_data_2}, index=res_index)
1643+ return pandas.DataFrame({"A ": res_data_0 , "C": res_data_2}, index=res_index)
16261644 """
16271645 func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):' ]
16281646 literal_idx = {col .literal_value for col in idx }
@@ -1644,18 +1662,18 @@ def df_getitem_bool_series_idx_codegen(self, idx):
16441662 """
16451663 Example of generated implementation with provided index:
16461664 def _df_getitem_bool_series_idx_impl(self, idx):
1647- length = len(self._data[0])
1648- self_index = range(len(self._data[0]))
1665+ length = len(self._data[0][0] )
1666+ self_index = range(len(self._data[0][0] ))
16491667 if length > len(idx):
16501668 msg = "Unalignable boolean Series provided as indexer " + \
16511669 "(index of the boolean Series and of the indexed object do not match)."
16521670 raise IndexingError(msg)
16531671 # do not trim idx._data to length as getitem_by_mask handles such case
16541672 res_index = getitem_by_mask(self_index, idx._data)
16551673 # df index is default, same as positions so it can be used in take
1656- data_0 = self._data[0]
1674+ data_0 = self._data[0][0]
16571675 res_data_0 = sdc_take(data_0, res_index)
1658- data_1 = self._data[1]
1676+ data_1 = self._data[1][0]
16591677 res_data_1 = sdc_take(data_1, res_index)
16601678 return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
16611679 """
@@ -1675,15 +1693,15 @@ def df_getitem_bool_array_idx_codegen(self, idx):
16751693 """
16761694 Example of generated implementation with provided index:
16771695 def _df_getitem_bool_array_idx_impl(self, idx):
1678- length = len(self._data[0])
1696+ length = len(self._data[0][0] )
16791697 if length != len(idx):
16801698 raise ValueError("Item wrong length.")
1681- self_index = range(len(self._data[0]))
1699+ self_index = range(len(self._data[0][0] ))
16821700 taken_pos = getitem_by_mask(self_index, idx)
16831701 res_index = sdc_take(self_index, taken_pos)
1684- data_0 = self._data[0]
1702+ data_0 = self._data[0][0]
16851703 res_data_0 = sdc_take(data_0, taken_pos)
1686- data_1 = self._data[1]
1704+ data_1 = self._data[1][0]
16871705 res_data_1 = sdc_take(data_1, taken_pos)
16881706 return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
16891707 """
@@ -1823,15 +1841,16 @@ def sdc_pandas_dataframe_getitem(self, idx):
18231841 return None
18241842
18251843 if isinstance (idx , types .StringLiteral ):
1826- try :
1827- col_idx = self .columns .index (idx .literal_value )
1828- key_error = False
1829- except ValueError :
1844+ col_loc = self .column_loc .get (idx .literal_value )
1845+ if col_loc is None :
18301846 key_error = True
1847+ else :
1848+ type_id , col_id = col_loc .type_id , col_loc .col_id
1849+ key_error = False
18311850
18321851 def _df_getitem_str_literal_idx_impl (self , idx ):
18331852 if key_error == False : # noqa
1834- data = self ._data [col_idx ]
1853+ data = self ._data [type_id ][ col_id ]
18351854 return pandas .Series (data , index = self ._index , name = idx )
18361855 else :
18371856 raise KeyError ('Column is not in the DataFrame' )
0 commit comments