@@ -827,10 +827,34 @@ def _build(self, evaluator_to_values, dtype):
827827 assert start_column == self .total_columns
828828 return need_reshape , m
829829
830+ class _CheckMatch (object ):
831+ def __init__ (self , name , value_type , eq_fn ):
832+ self ._name = name
833+ self ._value_type = value_type
834+ self ._eq_fn = eq_fn
835+ self .value = None
836+ self ._value_desc = None
837+ self ._value_origin = None
838+
839+ def check (self , seen_value , desc , origin ):
840+ if self .value is None :
841+ self .value = seen_value
842+ self ._value_desc = desc
843+ self ._value_origin = origin
844+ else :
845+ if not self ._eq_fn (self .value , seen_value ):
846+ # XX FIXME: this is a case where having discontiguous Origins
847+ # would be useful...
848+ raise PatsyError ("%s mismatch: %s and %s have different %s"
849+ % (self ._name , self ._value_desc , desc ,
850+ self ._value_type ),
851+ origin )
852+
830853def build_design_matrices (builders , data ,
831854 NA_action = "drop" ,
832855 return_type = "matrix" ,
833- dtype = np .dtype (float )):
856+ dtype = np .dtype (float ),
857+ data_index = None ):
834858 """Construct several design matrices from :class:`DesignMatrixBuilder`
835859 objects.
836860
@@ -848,24 +872,62 @@ def build_design_matrices(builders, data,
848872 :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
849873 :arg dtype: The dtype of the returned matrix. Useful if you want to use
850874 single-precision or extended-precision.
875+ :arg data_index: A single-dimensional array-like object containing
876+ hashable objects (i.e., a valid :class:`pandas.Index`), or None. If
877+ ``None``, then defaults to the value of ``data.index``, if this
878+ attribute exists.
851879
852880 This function returns either a list of :class:`DesignMatrix` objects (for
853881 ``return_type="matrix"``) or a list of :class:`pandas.DataFrame` objects
854- (for ``return_type="dataframe"``). In the latter case, the DataFrames will
855- preserve any (row) indexes that were present in the input, which may be
856- useful for time-series models etc. In any case, all returned design
882+ (for ``return_type="dataframe"``). In both cases, all returned design
857883 matrices will have ``.design_info`` attributes containing the appropriate
858884 :class:`DesignInfo` objects.
859885
860- Unlike :func:`design_matrix_builders`, this function takes only a simple
861- data argument, not any kind of iterator. That's because this function
862- doesn't need a global view of the data -- everything that depends on the
863- whole data set is already encapsulated in the `builders`. If you are
864- incrementally processing a large data set, simply call this function for
865- each chunk.
886+ Index handling: this function checks for indexes in the following places:
887+
888+ * The ``data_index`` argument.
889+ * If ``data`` is a :class:`pandas.DataFrame`, its ``.index`` attribute.
890+ * If any factors evaluate to a :class:`pandas.Series` or
891+ :class:`pandas.DataFrame`, then their ``.index`` attributes.
892+
893+ If multiple indexes are found, they must be identical (same values in the
894+ same order). If no indexes are found, then a default index is generated
895+ using ``np.arange(num_rows)``. One way or another, we end up with a single
896+ index for all the data. If ``return_type="dataframe"``, then this index is
897+ used as the index of the returned DataFrame objects. Examining this index
898+ makes it possible to determine which rows were removed due to NAs.
899+
900+ Indexes are also critical in one further case, even if not using
901+ pandas. Most of the time, it's obvious how many rows the design matrices
902+ are supposed to have, because in a formula like ``"y ~ x"``, we can look
903+ at the values of ``y`` and ``x`` and see how many entries they have. But
904+ some formulas don't actually depend on the data, e.g. ``"~ 1"``. In this
905+ case it's impossible just from looking at the formula to know how many
906+ rows the design matrices should have. But, if ``data_index`` is specified
907+ (or ``data`` has a ``.index`` attribute), then we can (and do) use this to
908+ determine the shape of the output design matrix. If the data index is
909+ *not* available, then trying to build a formula like ``"~ 1"`` will raise
910+ an error.
911+
912+ One situation where ``data_index=`` is critical, therefore, is when
913+ implementing a model that has an implicit dependent variable, where your
914+ users will specify a one-sided formula like ``"~ 1 + x"`` and you will
915+ determine the effective left-hand side by other means. In this case, a
916+ model like ``"~ 1"`` makes perfect sense, but will be an error if
917+ ``data_index=`` is not specified.
918+
919+ Note that unlike :func:`design_matrix_builders`, this function takes only
920+ a simple data argument, not any kind of iterator. That's because this
921+ function doesn't need a global view of the data -- everything that depends
922+ on the whole data set is already encapsulated in the `builders`. If you
923+ are incrementally processing a large data set, simply call this function
924+ for each chunk.
866925
867926 .. versionadded:: 0.2.0
868927 The ``NA_action`` argument.
928+
929+ .. versionadded:: 0.3.0
930+ The ``data_index`` argument.
869931 """
870932 if isinstance (NA_action , basestring ):
871933 NA_action = NAAction (NA_action )
@@ -878,8 +940,21 @@ def build_design_matrices(builders, data,
878940 # Evaluate factors
879941 evaluator_to_values = {}
880942 evaluator_to_isNAs = {}
881- num_rows = None
882- pandas_index = None
943+ import operator
944+ rows_checker = _CheckMatch ("Shape" , "number of rows" , lambda a , b : a == b )
945+ index_checker = _CheckMatch ("Index" , "index" , lambda a , b : a .equals (b ))
946+ if data_index is not None :
947+ if have_pandas :
948+ data_index = pandas .Index (data_index )
949+ index_checker .check (data_index , "data_index argument" , None )
950+ else :
951+ data_index = np .asarray (data_index )
952+ if data_index .ndim != 1 :
953+ raise PatsyError ("data_index argument is not 1-d" )
954+ rows_checker .check (data_index .shape [0 ], "data_index" , None )
955+ if have_pandas and isinstance (data , pandas .DataFrame ):
956+ index_checker .check (data .index , "data.index" , None )
957+ rows_checker .check (data .shape [0 ], "data argument" , None )
883958 for builder in builders :
884959 # We look at evaluators rather than factors here, because it might
885960 # happen that we have the same factor twice, but with different
@@ -889,24 +964,12 @@ def build_design_matrices(builders, data,
889964 value , is_NA = evaluator .eval (data , NA_action )
890965 evaluator_to_isNAs [evaluator ] = is_NA
891966 # value may now be a Series, DataFrame, or ndarray
892- if num_rows is None :
893- num_rows = value .shape [0 ]
894- else :
895- if num_rows != value .shape [0 ]:
896- msg = ("Row mismatch: factor %s had %s rows, when "
897- "previous factors had %s rows"
898- % (evaluator .factor .name (), value .shape [0 ],
899- num_rows ))
900- raise PatsyError (msg , evaluator .factor )
967+ name = evaluator .factor .name ()
968+ origin = evaluator .factor .origin
969+ rows_checker .check (value .shape [0 ], name , origin )
901970 if (have_pandas
902971 and isinstance (value , (pandas .Series , pandas .DataFrame ))):
903- if pandas_index is None :
904- pandas_index = value .index
905- else :
906- if not pandas_index .equals (value .index ):
907- msg = ("Index mismatch: pandas objects must "
908- "have aligned indexes" )
909- raise PatsyError (msg , evaluator .factor )
972+ index_checker .check (value .index , name , origin )
910973 # Strategy: we work with raw ndarrays for doing the actual
911974 # combining; DesignMatrixBuilder objects never sees pandas
912975 # objects. Then at the end, if a DataFrame was requested, we
@@ -918,6 +981,9 @@ def build_design_matrices(builders, data,
918981 # Handle NAs
919982 values = evaluator_to_values .values ()
920983 is_NAs = evaluator_to_isNAs .values ()
984+ origins = [evaluator .factor .origin for evaluator in evaluator_to_values ]
985+ pandas_index = index_checker .value
986+ num_rows = rows_checker .value
921987 # num_rows is None iff evaluator_to_values (and associated sets like
922988 # 'values') are empty, i.e., we have no actual evaluators involved
923989 # (formulas like "~ 1").
@@ -926,10 +992,10 @@ def build_design_matrices(builders, data,
926992 pandas_index = np .arange (num_rows )
927993 values .append (pandas_index )
928994 is_NAs .append (np .zeros (len (pandas_index ), dtype = bool ))
929- origins = [ evaluator . factor . origin for evaluator in evaluator_to_values ]
995+ origins . append ( None )
930996 new_values = NA_action .handle_NA (values , is_NAs , origins )
931997 # NA_action may have changed the number of rows.
932- if num_rows is not None :
998+ if new_values :
933999 num_rows = new_values [0 ].shape [0 ]
9341000 if return_type == "dataframe" and num_rows is not None :
9351001 pandas_index = new_values .pop ()
@@ -940,19 +1006,24 @@ def build_design_matrices(builders, data,
9401006 results .append (builder ._build (evaluator_to_values , dtype ))
9411007 matrices = []
9421008 for need_reshape , matrix in results :
943- if need_reshape and num_rows is not None :
1009+ if need_reshape :
1010+ # There is no data-dependence, at all -- a formula like "1 ~ 1".
1011+ # In this case the builder just returns a single-row matrix, and
1012+ # we have to broadcast it vertically to the appropriate size. If
1013+ # we can figure out what that is...
9441014 assert matrix .shape [0 ] == 1
945- matrices .append (DesignMatrix (np .repeat (matrix , num_rows , axis = 0 ),
946- matrix .design_info ))
947- else :
948- # There is no data-dependence, at all -- a formula like "1 ~ 1". I
949- # guess we'll just return some single-row matrices. Perhaps it
950- # would be better to figure out how many rows are in the input
951- # data and broadcast to that size, but eh. Input data is optional
952- # in the first place, so even that would be no guarantee... let's
953- # wait until someone actually has a relevant use case before we
954- # worry about it.
955- matrices .append (matrix )
1015+ if num_rows is not None :
1016+ matrix = DesignMatrix (np .repeat (matrix , num_rows , axis = 0 ),
1017+ matrix .design_info )
1018+ else :
1019+ raise PatsyError (
1020+ "No design matrix has any non-trivial factors, "
1021+ "the data object is not a DataFrame, "
1022+ "and no data_index= argument was supplied. "
1023+ "I can't tell how many rows the design matrix should "
1024+ "have!"
1025+ )
1026+ matrices .append (matrix )
9561027 if return_type == "dataframe" :
9571028 assert have_pandas
9581029 for i , matrix in enumerate (matrices ):
0 commit comments