Skip to content

Commit 67b8c71

Browse files
committed
overhaul index and num_rows handling in build_design_matrices
Adds a new data_index argument and improves handling of indexes and row counts (including data-independent formulas such as "~ 1"). Still needs: - tests for data_index and data.index - data_index argument added to the high-level interface - changes.rst (esp. that "~ 1" can now raise an error if data_index= is not given)
1 parent 892c37e commit 67b8c71

File tree

4 files changed

+146
-51
lines changed

4 files changed

+146
-51
lines changed

patsy/build.py

Lines changed: 113 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -827,10 +827,34 @@ def _build(self, evaluator_to_values, dtype):
827827
assert start_column == self.total_columns
828828
return need_reshape, m
829829

830+
class _CheckMatch(object):
831+
def __init__(self, name, value_type, eq_fn):
832+
self._name = name
833+
self._value_type = value_type
834+
self._eq_fn = eq_fn
835+
self.value = None
836+
self._value_desc = None
837+
self._value_origin = None
838+
839+
def check(self, seen_value, desc, origin):
840+
if self.value is None:
841+
self.value = seen_value
842+
self._value_desc = desc
843+
self._value_origin = origin
844+
else:
845+
if not self._eq_fn(self.value, seen_value):
846+
# XX FIXME: this is a case where having discontiguous Origins
847+
# would be useful...
848+
raise PatsyError("%s mismatch: %s and %s have different %s"
849+
% (self._name, self._value_desc, desc,
850+
self._value_type),
851+
origin)
852+
830853
def build_design_matrices(builders, data,
831854
NA_action="drop",
832855
return_type="matrix",
833-
dtype=np.dtype(float)):
856+
dtype=np.dtype(float),
857+
data_index=None):
834858
"""Construct several design matrices from :class:`DesignMatrixBuilder`
835859
objects.
836860
@@ -848,24 +872,62 @@ def build_design_matrices(builders, data,
848872
:arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
849873
:arg dtype: The dtype of the returned matrix. Useful if you want to use
850874
single-precision or extended-precision.
875+
:arg data_index: A single-dimensional array-like object containing
876+
hashable objects (i.e., a valid :class:`pandas.Index`), or None. If
877+
``None``, then defaults to the value of ``data.index``, if this
878+
attribute exists.
851879
852880
This function returns either a list of :class:`DesignMatrix` objects (for
853881
``return_type="matrix"``) or a list of :class:`pandas.DataFrame` objects
854-
(for ``return_type="dataframe"``). In the latter case, the DataFrames will
855-
preserve any (row) indexes that were present in the input, which may be
856-
useful for time-series models etc. In any case, all returned design
882+
(for ``return_type="dataframe"``). In both cases, all returned design
857883
matrices will have ``.design_info`` attributes containing the appropriate
858884
:class:`DesignInfo` objects.
859885
860-
Unlike :func:`design_matrix_builders`, this function takes only a simple
861-
data argument, not any kind of iterator. That's because this function
862-
doesn't need a global view of the data -- everything that depends on the
863-
whole data set is already encapsulated in the `builders`. If you are
864-
incrementally processing a large data set, simply call this function for
865-
each chunk.
886+
Index handling: this function checks for indexes in the following places:
887+
888+
* The ``data_index`` argument.
889+
* If ``data`` is a :class:`pandas.DataFrame`, its ``.index`` attribute.
890+
* If any factors evaluate to a :class:`pandas.Series` or
891+
:class:`pandas.DataFrame`, then their ``.index`` attributes.
892+
893+
If multiple indexes are found, they must be identical (same values in the
894+
same order). If no indexes are found, then a default index is generated
895+
using ``np.arange(num_rows)``. One way or another, we end up with a single
896+
index for all the data. If ``return_type="dataframe"``, then this index is
897+
used as the index of the returned DataFrame objects. Examining this index
898+
makes it possible to determine which rows were removed due to NAs.
899+
900+
Indexes are also critical in one further case, even if not using
901+
pandas. Most of the time, it's obvious how many rows the design matrices
902+
are supposed to have, because in a formula like ``"y ~ x"``, we can look
903+
at the values of ``y`` and ``x`` and see how many entries they have. But
904+
some formulas don't actually depend on the data, e.g. ``"~ 1"``. In this
905+
case it's impossible just from looking at the formula to know how many
906+
rows the design matrices should have. But, if ``data_index`` is specified
907+
(or ``data`` has a ``.index`` attribute), then we can (and do) use this to
908+
determine the shape of the output design matrix. If the data index is
909+
*not* available, then trying to build a formula like ``"~ 1"`` will raise
910+
an error.
911+
912+
One situation where ``data_index=`` is critical, therefore, is when
913+
implementing a model that has an implicit dependent variable, where your
914+
users will specify a one-sided formula like ``"~ 1 + x"`` and you will
915+
determine the effective left-hand side by other means. In this case, a
916+
model like ``"~ 1"`` makes perfect sense, but will be an error if
917+
``data_index=`` is not specified.
918+
919+
Note that unlike :func:`design_matrix_builders`, this function takes only
920+
a simple data argument, not any kind of iterator. That's because this
921+
function doesn't need a global view of the data -- everything that depends
922+
on the whole data set is already encapsulated in the `builders`. If you
923+
are incrementally processing a large data set, simply call this function
924+
for each chunk.
866925
867926
.. versionadded:: 0.2.0
868927
The ``NA_action`` argument.
928+
929+
.. versionadded:: 0.3.0
930+
The ``data_index`` argument.
869931
"""
870932
if isinstance(NA_action, basestring):
871933
NA_action = NAAction(NA_action)
@@ -878,8 +940,21 @@ def build_design_matrices(builders, data,
878940
# Evaluate factors
879941
evaluator_to_values = {}
880942
evaluator_to_isNAs = {}
881-
num_rows = None
882-
pandas_index = None
943+
import operator
944+
rows_checker = _CheckMatch("Shape", "number of rows", lambda a, b: a == b)
945+
index_checker = _CheckMatch("Index", "index", lambda a, b: a.equals(b))
946+
if data_index is not None:
947+
if have_pandas:
948+
data_index = pandas.Index(data_index)
949+
index_checker.check(data_index, "data_index argument", None)
950+
else:
951+
data_index = np.asarray(data_index)
952+
if data_index.ndim != 1:
953+
raise PatsyError("data_index argument is not 1-d")
954+
rows_checker.check(data_index.shape[0], "data_index", None)
955+
if have_pandas and isinstance(data, pandas.DataFrame):
956+
index_checker.check(data.index, "data.index", None)
957+
rows_checker.check(data.shape[0], "data argument", None)
883958
for builder in builders:
884959
# We look at evaluators rather than factors here, because it might
885960
# happen that we have the same factor twice, but with different
@@ -889,24 +964,12 @@ def build_design_matrices(builders, data,
889964
value, is_NA = evaluator.eval(data, NA_action)
890965
evaluator_to_isNAs[evaluator] = is_NA
891966
# value may now be a Series, DataFrame, or ndarray
892-
if num_rows is None:
893-
num_rows = value.shape[0]
894-
else:
895-
if num_rows != value.shape[0]:
896-
msg = ("Row mismatch: factor %s had %s rows, when "
897-
"previous factors had %s rows"
898-
% (evaluator.factor.name(), value.shape[0],
899-
num_rows))
900-
raise PatsyError(msg, evaluator.factor)
967+
name = evaluator.factor.name()
968+
origin = evaluator.factor.origin
969+
rows_checker.check(value.shape[0], name, origin)
901970
if (have_pandas
902971
and isinstance(value, (pandas.Series, pandas.DataFrame))):
903-
if pandas_index is None:
904-
pandas_index = value.index
905-
else:
906-
if not pandas_index.equals(value.index):
907-
msg = ("Index mismatch: pandas objects must "
908-
"have aligned indexes")
909-
raise PatsyError(msg, evaluator.factor)
972+
index_checker.check(value.index, name, origin)
910973
# Strategy: we work with raw ndarrays for doing the actual
911974
# combining; DesignMatrixBuilder objects never sees pandas
912975
# objects. Then at the end, if a DataFrame was requested, we
@@ -918,6 +981,9 @@ def build_design_matrices(builders, data,
918981
# Handle NAs
919982
values = evaluator_to_values.values()
920983
is_NAs = evaluator_to_isNAs.values()
984+
origins = [evaluator.factor.origin for evaluator in evaluator_to_values]
985+
pandas_index = index_checker.value
986+
num_rows = rows_checker.value
921987
# num_rows is None iff evaluator_to_values (and associated sets like
922988
# 'values') are empty, i.e., we have no actual evaluators involved
923989
# (formulas like "~ 1").
@@ -926,10 +992,10 @@ def build_design_matrices(builders, data,
926992
pandas_index = np.arange(num_rows)
927993
values.append(pandas_index)
928994
is_NAs.append(np.zeros(len(pandas_index), dtype=bool))
929-
origins = [evaluator.factor.origin for evaluator in evaluator_to_values]
995+
origins.append(None)
930996
new_values = NA_action.handle_NA(values, is_NAs, origins)
931997
# NA_action may have changed the number of rows.
932-
if num_rows is not None:
998+
if new_values:
933999
num_rows = new_values[0].shape[0]
9341000
if return_type == "dataframe" and num_rows is not None:
9351001
pandas_index = new_values.pop()
@@ -940,19 +1006,24 @@ def build_design_matrices(builders, data,
9401006
results.append(builder._build(evaluator_to_values, dtype))
9411007
matrices = []
9421008
for need_reshape, matrix in results:
943-
if need_reshape and num_rows is not None:
1009+
if need_reshape:
1010+
# There is no data-dependence, at all -- a formula like "1 ~ 1".
1011+
# In this case the builder just returns a single-row matrix, and
1012+
# we have to broadcast it vertically to the appropriate size. If
1013+
# we can figure out what that is...
9441014
assert matrix.shape[0] == 1
945-
matrices.append(DesignMatrix(np.repeat(matrix, num_rows, axis=0),
946-
matrix.design_info))
947-
else:
948-
# There is no data-dependence, at all -- a formula like "1 ~ 1". I
949-
# guess we'll just return some single-row matrices. Perhaps it
950-
# would be better to figure out how many rows are in the input
951-
# data and broadcast to that size, but eh. Input data is optional
952-
# in the first place, so even that would be no guarantee... let's
953-
# wait until someone actually has a relevant use case before we
954-
# worry about it.
955-
matrices.append(matrix)
1015+
if num_rows is not None:
1016+
matrix = DesignMatrix(np.repeat(matrix, num_rows, axis=0),
1017+
matrix.design_info)
1018+
else:
1019+
raise PatsyError(
1020+
"No design matrix has any non-trivial factors, "
1021+
"the data object is not a DataFrame, "
1022+
"and no data_index= argument was supplied. "
1023+
"I can't tell how many rows the design matrix should "
1024+
"have!"
1025+
)
1026+
matrices.append(matrix)
9561027
if return_type == "dataframe":
9571028
assert have_pandas
9581029
for i, matrix in enumerate(matrices):

patsy/missing.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def handle_NA(self, values, is_NAs, origins):
155155
:returns: A list of new values (which may have a differing number of
156156
rows.)
157157
"""
158+
assert len(values) == len(is_NAs) == len(origins)
158159
if len(values) == 0:
159160
return values
160161
if self.on_NA == "raise":

patsy/test_build.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -471,15 +471,38 @@ def test_data_independent_builder():
471471
def iter_maker():
472472
yield data
473473

474-
# If building a formula that doesn't depend on the data at all, we just
475-
# return a single-row matrix.
476-
m = make_matrix(data, 0, [], column_names=[])
477-
assert m.shape == (1, 0)
474+
# Trying to build a matrix that doesn't depend on the data at all is an
475+
# error, if:
476+
# - the data_index argument is not given
477+
# - the data is not a DataFrame
478+
# - there are no other matrices
479+
null_builder = design_matrix_builders([make_termlist()], iter_maker)[0]
480+
assert_raises(PatsyError, build_design_matrices, [null_builder], data)
481+
482+
intercept_builder = design_matrix_builders([make_termlist([])],
483+
iter_maker)[0]
484+
assert_raises(PatsyError, build_design_matrices, [intercept_builder], data)
478485

479-
m = make_matrix(data, 1, [[]], column_names=["Intercept"])
480-
assert np.allclose(m, [[1]])
486+
assert_raises(PatsyError,
487+
build_design_matrices,
488+
[null_builder, intercept_builder], data)
489+
490+
# If data_index is given, it sets the number of rows.
491+
int_m, null_m = build_design_matrices([intercept_builder, null_builder],
492+
data,
493+
data_index=[1, 2, 3, 4])
494+
assert np.allclose(int_m, [[1], [1], [1], [1]])
495+
assert null_m.shape == (4, 0)
496+
497+
# If data is a DataFrame, it sets the number of rows.
498+
if have_pandas:
499+
int_m, null_m = build_design_matrices([intercept_builder,
500+
null_builder],
501+
pandas.DataFrame(data))
502+
assert np.allclose(int_m, [[1], [1], [1]])
503+
assert null_m.shape == (3, 0)
481504

482-
# Or, if there are other matrices that do depend on the data, we make the
505+
# If there are other matrices that do depend on the data, we make the
483506
# data-independent matrices have the same number of rows.
484507
x_termlist = make_termlist(["x"])
485508

patsy/tokens.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def python_tokenize(code):
3535
assert pytype not in (tokenize.NL, tokenize.NEWLINE)
3636
if pytype == tokenize.ERRORTOKEN:
3737
raise PatsyError("error tokenizing input "
38-
"(maybe an unclosed string?)",
39-
origin)
38+
"(maybe an unclosed string?)",
39+
origin)
4040
if pytype == tokenize.COMMENT:
4141
raise PatsyError("comments are not allowed", origin)
4242
yield (pytype, string, origin)

0 commit comments

Comments
 (0)