1919from patsy .desc import ModelDesc
2020from patsy .contrasts import code_contrast_matrix , Treatment
2121from patsy .compat import itertools_product , OrderedDict
22+ from patsy .missing import NAAction
2223
2324if have_pandas :
2425 import pandas
@@ -174,8 +175,7 @@ def __init__(self, factor, state, postprocessor, expected_levels):
174175 def eval (self , data ):
175176 # returns either a 2d ndarray or a DataFrame
176177 result = self .factor .eval (self ._state , data )
177- if self ._postprocessor is not None :
178- result = self ._postprocessor .transform (result )
178+ result = self ._postprocessor .transform (result )
179179 if not isinstance (result , Categorical ):
180180 msg = ("when evaluating categoric factor %r, I got a "
181181 "result that is not of type Categorical (but rather %s)"
@@ -189,10 +189,7 @@ def eval(self, data):
189189 % (self .factor .name (), self ._expected_levels , result .levels ))
190190 raise PatsyError (msg , self .factor )
191191 _max_allowed_dim (1 , result .int_array , self .factor )
192- # For consistency, evaluators *always* return 2d arrays (though in
193- # this case it will always have only 1 column):
194- return atleast_2d_column_default (result .int_array ,
195- preserve_pandas = True )
192+ return result
196193
197194def test__CatFactorEvaluator ():
198195 from nose .tools import assert_raises
@@ -292,8 +289,11 @@ def build(self, factor_values, out):
292289 for factor , column_idx in zip (self ._factors , column_idxs ):
293290 if factor in self ._cat_contrasts :
294291 contrast = self ._cat_contrasts [factor ]
295- out [:, i ] *= contrast .matrix [factor_values [factor ].ravel (),
296- column_idx ]
292+ int_array = factor_values [factor ].int_array
293+ if np .any (int_array < 0 ):
294+ raise PatsyError ("can't build a design matrix "
295+ "containing missing values" , factor )
296+ out [:, i ] *= contrast .matrix [int_array , column_idx ]
297297 else :
298298 assert (factor_values [factor ].shape [1 ]
299299 == self ._num_columns [factor ])
@@ -768,10 +768,14 @@ def _build(self, evaluator_to_values, dtype):
768768 for evaluator , value in evaluator_to_values .iteritems ():
769769 if evaluator in self ._evaluators :
770770 factor_to_values [evaluator .factor ] = value
771+ if isinstance (value , Categorical ):
772+ this_num_rows = value .int_array .shape [0 ]
773+ else :
774+ this_num_rows = value .shape [0 ]
771775 if num_rows is not None :
772- assert num_rows == value . shape [ 0 ]
776+ assert num_rows == this_num_rows
773777 else :
774- num_rows = value . shape [ 0 ]
778+ num_rows = this_num_rows
775779 if num_rows is None :
776780 # We have no dependence on the data -- e.g. an empty termlist, or
777781 # only an intercept term.
@@ -789,7 +793,9 @@ def _build(self, evaluator_to_values, dtype):
789793 assert start_column == self .total_columns
790794 return need_reshape , m
791795
792- def build_design_matrices (builders , data , return_type = "matrix" ,
796+ def build_design_matrices (builders , data ,
797+ NA_action = "drop" ,
798+ return_type = "matrix" ,
793799 dtype = np .dtype (float )):
794800 """Construct several design matrices from :class:`DesignMatrixBuilder`
795801 objects.
@@ -801,6 +807,10 @@ def build_design_matrices(builders, data, return_type="matrix",
801807 :arg builders: A list of :class:`DesignMatrixBuilder` objects specifying
802808 the design matrices to be built.
803809 :arg data: A dict-like object which will be used to look up data.
810+ :arg NA_action: What to do with rows that contain missing values. Either
811+ ``"drop"``, ``"raise"``, or an :class:`NAAction` object. See
812+ :class:`NAAction` for details on what values count as 'missing' (and how
813+ to alter this).
804814 :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
805815 :arg dtype: The dtype of the returned matrix. Useful if you want to use
806816 single-precision or extended-precision.
@@ -820,12 +830,15 @@ def build_design_matrices(builders, data, return_type="matrix",
820830 incrementally processing a large data set, simply call this function for
821831 each chunk.
822832 """
833+ if isinstance (NA_action , basestring ):
834+ NA_action = NAAction (NA_action )
823835 if return_type == "dataframe" and not have_pandas :
824836 raise PatsyError ("pandas.DataFrame was requested, but pandas "
825837 "is not installed" )
826838 if return_type not in ("matrix" , "dataframe" ):
827839 raise PatsyError ("unrecognized output type %r, should be "
828840 "'matrix' or 'dataframe'" % (return_type ,))
841+ # Evaluate factors
829842 evaluator_to_values = {}
830843 num_rows = None
831844 pandas_index = None
@@ -836,30 +849,48 @@ def build_design_matrices(builders, data, return_type="matrix",
836849 for evaluator in builder ._evaluators :
837850 if evaluator not in evaluator_to_values :
838851 value = evaluator .eval (data )
839- assert value .ndim == 2
852+ if isinstance (value , Categorical ):
853+ unboxed = value .int_array
854+ else :
855+ unboxed = value
856+ # unboxed may now be a Series, DataFrame, or ndarray
840857 if num_rows is None :
841- num_rows = value .shape [0 ]
858+ num_rows = unboxed .shape [0 ]
842859 else :
843- if num_rows != value .shape [0 ]:
860+ if num_rows != unboxed .shape [0 ]:
844861 msg = ("Row mismatch: factor %s had %s rows, when "
845862 "previous factors had %s rows"
846- % (evaluator .factor .name (), value .shape [0 ],
863+ % (evaluator .factor .name (), unboxed .shape [0 ],
847864 num_rows ))
848865 raise PatsyError (msg , evaluator .factor )
849866 if (have_pandas
850- and isinstance (value , (pandas .Series , pandas .DataFrame ))):
867+ and isinstance (unboxed , (pandas .Series , pandas .DataFrame ))):
851868 if pandas_index is None :
852- pandas_index = value .index
869+ pandas_index = unboxed .index
853870 else :
854- if not pandas_index .equals (value .index ):
871+ if not pandas_index .equals (unboxed .index ):
855872 msg = ("Index mismatch: pandas objects must "
856873 "have aligned indexes" )
857874 raise PatsyError (msg , evaluator .factor )
858875 # Strategy: we work with raw ndarrays for doing the actual
859876 # combining; DesignMatrixBuilder objects never see pandas
860877 # objects. Then at the end, if a DataFrame was requested, we
861- # convert.
862- evaluator_to_values [evaluator ] = np .asarray (value )
878+ # convert. So every entry in this dict is either a
879+ # Categorical object, or a 2-d array of values.
880+ if not isinstance (value , Categorical ):
881+ value = np .asarray (value )
882+ evaluator_to_values [evaluator ] = value
883+ # Handle NAs
884+ if pandas_index is None and num_rows is not None :
885+ pandas_index = np .arange (num_rows )
886+ factor_values = evaluator_to_values .values ()
887+ origins = [evaluator .factor .origin for evaluator in evaluator_to_values ]
888+ new_index , new_factor_values = NA_action .handle_NA (pandas_index ,
889+ factor_values ,
890+ origins )
891+ pandas_index = new_index
892+ evaluator_to_values = dict (zip (evaluator_to_values , new_factor_values ))
893+ # Build factor values into matrices
863894 results = []
864895 for builder in builders :
865896 results .append (builder ._build (evaluator_to_values , dtype ))
0 commit comments