overhaul index and num_rows handling in build_design_matrices

njsmith · njsmith · commit 67b8c71580b1 · 2013-08-29T13:45:15.000+01:00
new argument data_index; better handling for data-independent formulas like "~ 1"

Still needs:
- tests for data_index and data.index
- data_index argument added to highlevel interface
- changes.rst (esp. that "~ 1" can now raise an error if data_index= not given)
diff --git a/patsy/build.py b/patsy/build.py
@@ -827,10 +827,34 @@ def _build(self, evaluator_to_values, dtype):
         assert start_column == self.total_columns
         return need_reshape, m
 
+class _CheckMatch(object):
+    def __init__(self, name, value_type, eq_fn):
+        self._name = name
+        self._value_type = value_type
+        self._eq_fn = eq_fn
+        self.value = None
+        self._value_desc = None
+        self._value_origin = None
+
+    def check(self, seen_value, desc, origin):
+        if self.value is None:
+            self.value = seen_value
+            self._value_desc = desc
+            self._value_origin = origin
+        else:
+            if not self._eq_fn(self.value, seen_value):
+                # XX FIXME: this is a case where having discontiguous Origins
+                # would be useful...
+                raise PatsyError("%s mismatch: %s and %s have different %s"
+                                 % (self._name, self._value_desc, desc,
+                                    self._value_type),
+                                 origin)
+
 def build_design_matrices(builders, data,
                           NA_action="drop",
                           return_type="matrix",
-                          dtype=np.dtype(float)):
+                          dtype=np.dtype(float),
+                          data_index=None):
     """Construct several design matrices from :class:`DesignMatrixBuilder`
     objects.
 
@@ -848,24 +872,62 @@ def build_design_matrices(builders, data,
     :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
     :arg dtype: The dtype of the returned matrix. Useful if you want to use
       single-precision or extended-precision.
+    :arg data_index: A single-dimensional array-like object containing
+      hashable objects (i.e., a valid :class:`pandas.Index`), or None. If
+      ``None``, then defaults to the value of ``data.index``, if this
+      attribute exists.
 
     This function returns either a list of :class:`DesignMatrix` objects (for
     ``return_type="matrix"``) or a list of :class:`pandas.DataFrame` objects
-    (for ``return_type="dataframe"``). In the latter case, the DataFrames will
-    preserve any (row) indexes that were present in the input, which may be
-    useful for time-series models etc. In any case, all returned design
+    (for ``return_type="dataframe"``). In both cases, all returned design
     matrices will have ``.design_info`` attributes containing the appropriate
     :class:`DesignInfo` objects.
 
-    Unlike :func:`design_matrix_builders`, this function takes only a simple
-    data argument, not any kind of iterator. That's because this function
-    doesn't need a global view of the data -- everything that depends on the
-    whole data set is already encapsulated in the `builders`. If you are
-    incrementally processing a large data set, simply call this function for
-    each chunk.
+    Index handling: this function checks for indexes in the following places:
+
+    * The ``data_index`` argument.
+    * If ``data`` is a :class:`pandas.DataFrame`, its ``.index`` attribute.
+    * If any factors evaluate to a :class:`pandas.Series` or
+      :class:`pandas.DataFrame`, then their ``.index`` attributes.
+
+    If multiple indexes are found, they must be identical (same values in the
+    same order). If no indexes are found, then a default index is generated
+    using ``np.arange(num_rows)``. One way or another, we end up with a single
+    index for all the data. If ``return_type="dataframe"``, then this index is
+    used as the index of the returned DataFrame objects. Examining this index
+    makes it possible to determine which rows were removed due to NAs.
+
+    Indexes are also critical in one further case, even if not using
+    pandas. Most of the time, it's obvious how many rows the design matrices
+    are supposed to have, because in a formula like ``"y ~ x"``, we can look
+    at the values of ``y`` and ``x`` and see how many entries they have. But
+    some formulas don't actually depend on the data, e.g. ``"~ 1"``. In this
+    case it's impossible just from looking at the formula to know how many
+    rows the design matrices should have. But, if ``data_index`` is specified
+    (or ``data`` has a ``.index`` attribute), then we can (and do) use this to
+    determine the shape of the output design matrix. If the data index is
+    *not* available, then trying to build a formula like ``"~ 1"`` will raise
+    an error.
+
+    One situation where ``data_index=`` is critical, therefore, is when
+    implementing a model that has an implicit dependent variable, where your
+    users will specify a one-sided formula like ``"~ 1 + x"`` and you will
+    determine the effective left-hand side by other means. In this case, a
+    model like ``"~ 1"`` makes perfect sense, but will be an error if
+    ``data_index=`` is not specified.
+
+    Note that unlike :func:`design_matrix_builders`, this function takes only
+    a simple data argument, not any kind of iterator. That's because this
+    function doesn't need a global view of the data -- everything that depends
+    on the whole data set is already encapsulated in the `builders`. If you
+    are incrementally processing a large data set, simply call this function
+    for each chunk.
 
     .. versionadded:: 0.2.0
        The ``NA_action`` argument.
+
+    .. versionadded:: 0.3.0
+       The ``data_index`` argument.
     """
     if isinstance(NA_action, basestring):
         NA_action = NAAction(NA_action)
@@ -878,8 +940,21 @@ def build_design_matrices(builders, data,
     # Evaluate factors
     evaluator_to_values = {}
     evaluator_to_isNAs = {}
-    num_rows = None
-    pandas_index = None
+    import operator
+    rows_checker = _CheckMatch("Shape", "number of rows", lambda a, b: a == b)
+    index_checker = _CheckMatch("Index", "index", lambda a, b: a.equals(b))
+    if data_index is not None:
+        if have_pandas:
+            data_index = pandas.Index(data_index)
+            index_checker.check(data_index, "data_index argument", None)
+        else:
+            data_index = np.asarray(data_index)
+        if data_index.ndim != 1:
+            raise PatsyError("data_index argument is not 1-d")
+        rows_checker.check(data_index.shape[0], "data_index", None)
+    if have_pandas and isinstance(data, pandas.DataFrame):
+        index_checker.check(data.index, "data.index", None)
+        rows_checker.check(data.shape[0], "data argument", None)
     for builder in builders:
         # We look at evaluators rather than factors here, because it might
         # happen that we have the same factor twice, but with different
@@ -889,24 +964,12 @@ def build_design_matrices(builders, data,
                 value, is_NA = evaluator.eval(data, NA_action)
                 evaluator_to_isNAs[evaluator] = is_NA
                 # value may now be a Series, DataFrame, or ndarray
-                if num_rows is None:
-                    num_rows = value.shape[0]
-                else:
-                    if num_rows != value.shape[0]:
-                        msg = ("Row mismatch: factor %s had %s rows, when "
-                               "previous factors had %s rows"
-                               % (evaluator.factor.name(), value.shape[0],
-                                  num_rows))
-                        raise PatsyError(msg, evaluator.factor)
+                name = evaluator.factor.name()
+                origin = evaluator.factor.origin
+                rows_checker.check(value.shape[0], name, origin)
                 if (have_pandas
                     and isinstance(value, (pandas.Series, pandas.DataFrame))):
-                    if pandas_index is None:
-                        pandas_index = value.index
-                    else:
-                        if not pandas_index.equals(value.index):
-                            msg = ("Index mismatch: pandas objects must "
-                                   "have aligned indexes")
-                            raise PatsyError(msg, evaluator.factor)
+                    index_checker.check(value.index, name, origin)
                 # Strategy: we work with raw ndarrays for doing the actual
                 # combining; DesignMatrixBuilder objects never sees pandas
                 # objects. Then at the end, if a DataFrame was requested, we
@@ -918,6 +981,9 @@ def build_design_matrices(builders, data,
     # Handle NAs
     values = evaluator_to_values.values()
     is_NAs = evaluator_to_isNAs.values()
+    origins = [evaluator.factor.origin for evaluator in evaluator_to_values]
+    pandas_index = index_checker.value
+    num_rows = rows_checker.value
     # num_rows is None iff evaluator_to_values (and associated sets like
     # 'values') are empty, i.e., we have no actual evaluators involved
     # (formulas like "~ 1").
@@ -926,10 +992,10 @@ def build_design_matrices(builders, data,
             pandas_index = np.arange(num_rows)
         values.append(pandas_index)
         is_NAs.append(np.zeros(len(pandas_index), dtype=bool))
-    origins = [evaluator.factor.origin for evaluator in evaluator_to_values]
+        origins.append(None)
     new_values = NA_action.handle_NA(values, is_NAs, origins)
     # NA_action may have changed the number of rows.
-    if num_rows is not None:
+    if new_values:
         num_rows = new_values[0].shape[0]
     if return_type == "dataframe" and num_rows is not None:
         pandas_index = new_values.pop()
@@ -940,19 +1006,24 @@ def build_design_matrices(builders, data,
         results.append(builder._build(evaluator_to_values, dtype))
     matrices = []
     for need_reshape, matrix in results:
-        if need_reshape and num_rows is not None:
+        if need_reshape:
+            # There is no data-dependence, at all -- a formula like "1 ~ 1".
+            # In this case the builder just returns a single-row matrix, and
+            # we have to broadcast it vertically to the appropriate size. If
+            # we can figure out what that is...
             assert matrix.shape[0] == 1
-            matrices.append(DesignMatrix(np.repeat(matrix, num_rows, axis=0),
-                                         matrix.design_info))
-        else:
-            # There is no data-dependence, at all -- a formula like "1 ~ 1". I
-            # guess we'll just return some single-row matrices. Perhaps it
-            # would be better to figure out how many rows are in the input
-            # data and broadcast to that size, but eh. Input data is optional
-            # in the first place, so even that would be no guarantee... let's
-            # wait until someone actually has a relevant use case before we
-            # worry about it.
-            matrices.append(matrix)
+            if num_rows is not None:
+                matrix = DesignMatrix(np.repeat(matrix, num_rows, axis=0),
+                                      matrix.design_info)
+            else:
+                raise PatsyError(
+                    "No design matrix has any non-trivial factors, "
+                    "the data object is not a DataFrame, "
+                    "and no data_index= argument was supplied. "
+                    "I can't tell how many rows the design matrix should "
+                    "have!"
+                    )
+        matrices.append(matrix)
     if return_type == "dataframe":
         assert have_pandas
         for i, matrix in enumerate(matrices):
diff --git a/patsy/missing.py b/patsy/missing.py
@@ -155,6 +155,7 @@ def handle_NA(self, values, is_NAs, origins):
         :returns: A list of new values (which may have a differing number of
           rows.)
         """
+        assert len(values) == len(is_NAs) == len(origins)
         if len(values) == 0:
             return values
         if self.on_NA == "raise":
diff --git a/patsy/test_build.py b/patsy/test_build.py
@@ -471,15 +471,38 @@ def test_data_independent_builder():
     def iter_maker():
         yield data
 
-    # If building a formula that doesn't depend on the data at all, we just
-    # return a single-row matrix.
-    m = make_matrix(data, 0, [], column_names=[])
-    assert m.shape == (1, 0)
+    # Trying to build a matrix that doesn't depend on the data at all is an
+    # error, if:
+    # - the data_index argument is not given
+    # - the data is not a DataFrame
+    # - there are no other matrices
+    null_builder = design_matrix_builders([make_termlist()], iter_maker)[0]
+    assert_raises(PatsyError, build_design_matrices, [null_builder], data)
+
+    intercept_builder = design_matrix_builders([make_termlist([])],
+                                               iter_maker)[0]
+    assert_raises(PatsyError, build_design_matrices, [intercept_builder], data)
 
-    m = make_matrix(data, 1, [[]], column_names=["Intercept"])
-    assert np.allclose(m, [[1]])
+    assert_raises(PatsyError,
+                  build_design_matrices,
+                  [null_builder, intercept_builder], data)
+
+    # If data_index is given, it sets the number of rows.
+    int_m, null_m = build_design_matrices([intercept_builder, null_builder],
+                                          data,
+                                          data_index=[1, 2, 3, 4])
+    assert np.allclose(int_m, [[1], [1], [1], [1]])
+    assert null_m.shape == (4, 0)
+
+    # If data is a DataFrame, it sets the number of rows.
+    if have_pandas:
+        int_m, null_m = build_design_matrices([intercept_builder,
+                                               null_builder],
+                                              pandas.DataFrame(data))
+        assert np.allclose(int_m, [[1], [1], [1]])
+        assert null_m.shape == (3, 0)
 
-    # Or, if there are other matrices that do depend on the data, we make the
+    # If there are other matrices that do depend on the data, we make the
     # data-independent matrices have the same number of rows.
     x_termlist = make_termlist(["x"])
 
diff --git a/patsy/tokens.py b/patsy/tokens.py
@@ -35,8 +35,8 @@ def python_tokenize(code):
             assert pytype not in (tokenize.NL, tokenize.NEWLINE)
             if pytype == tokenize.ERRORTOKEN:
                 raise PatsyError("error tokenizing input "
-                                    "(maybe an unclosed string?)",
-                                    origin)
+                                 "(maybe an unclosed string?)",
+                                 origin)
             if pytype == tokenize.COMMENT:
                 raise PatsyError("comments are not allowed", origin)
             yield (pytype, string, origin)