Skip to content

Commit d317c69

Browse files
committed
ENH: add missing value handling
Fixes pydata#1.
1 parent ab6e226 commit d317c69

File tree

10 files changed

+519
-30
lines changed

10 files changed

+519
-30
lines changed

doc/API-reference.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ Handling categorical data
166166
.. autoclass:: ContrastMatrix
167167

168168
.. autoclass:: Categorical
169+
:members:
169170

170171
Working with formulas
171172
---------------------
@@ -211,6 +212,12 @@ Building design matrices
211212

212213
.. autofunction:: build_design_matrices
213214

215+
Missing values
216+
--------------
217+
218+
.. autoclass:: NAAction
219+
:members:
220+
214221
Linear constraints
215222
------------------
216223

patsy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def _reexport(modname):
7474

7575
for child in ["highlevel", "build", "categorical", "constraint", "contrasts",
7676
"desc", "design_info", "eval", "origin", "state",
77-
"user_util"]:
77+
"user_util", "missing"]:
7878
_reexport("patsy." + child)
7979
# XX FIXME: we aren't exporting any of the explicit parsing interface
8080
# yet. Need to figure out how to do that.

patsy/build.py

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from patsy.desc import ModelDesc
2020
from patsy.contrasts import code_contrast_matrix, Treatment
2121
from patsy.compat import itertools_product, OrderedDict
22+
from patsy.missing import NAAction
2223

2324
if have_pandas:
2425
import pandas
@@ -174,8 +175,7 @@ def __init__(self, factor, state, postprocessor, expected_levels):
174175
def eval(self, data):
175176
# returns either a 2d ndarray or a DataFrame
176177
result = self.factor.eval(self._state, data)
177-
if self._postprocessor is not None:
178-
result = self._postprocessor.transform(result)
178+
result = self._postprocessor.transform(result)
179179
if not isinstance(result, Categorical):
180180
msg = ("when evaluating categoric factor %r, I got a "
181181
"result that is not of type Categorical (but rather %s)"
@@ -189,10 +189,7 @@ def eval(self, data):
189189
% (self.factor.name(), self._expected_levels, result.levels))
190190
raise PatsyError(msg, self.factor)
191191
_max_allowed_dim(1, result.int_array, self.factor)
192-
# For consistency, evaluators *always* return 2d arrays (though in
193-
# this case it will always have only 1 column):
194-
return atleast_2d_column_default(result.int_array,
195-
preserve_pandas=True)
192+
return result
196193

197194
def test__CatFactorEvaluator():
198195
from nose.tools import assert_raises
@@ -292,8 +289,11 @@ def build(self, factor_values, out):
292289
for factor, column_idx in zip(self._factors, column_idxs):
293290
if factor in self._cat_contrasts:
294291
contrast = self._cat_contrasts[factor]
295-
out[:, i] *= contrast.matrix[factor_values[factor].ravel(),
296-
column_idx]
292+
int_array = factor_values[factor].int_array
293+
if np.any(int_array < 0):
294+
raise PatsyError("can't build a design matrix "
295+
"containing missing values", factor)
296+
out[:, i] *= contrast.matrix[int_array, column_idx]
297297
else:
298298
assert (factor_values[factor].shape[1]
299299
== self._num_columns[factor])
@@ -768,10 +768,14 @@ def _build(self, evaluator_to_values, dtype):
768768
for evaluator, value in evaluator_to_values.iteritems():
769769
if evaluator in self._evaluators:
770770
factor_to_values[evaluator.factor] = value
771+
if isinstance(value, Categorical):
772+
this_num_rows = value.int_array.shape[0]
773+
else:
774+
this_num_rows = value.shape[0]
771775
if num_rows is not None:
772-
assert num_rows == value.shape[0]
776+
assert num_rows == this_num_rows
773777
else:
774-
num_rows = value.shape[0]
778+
num_rows = this_num_rows
775779
if num_rows is None:
776780
# We have no dependence on the data -- e.g. an empty termlist, or
777781
# only an intercept term.
@@ -789,7 +793,9 @@ def _build(self, evaluator_to_values, dtype):
789793
assert start_column == self.total_columns
790794
return need_reshape, m
791795

792-
def build_design_matrices(builders, data, return_type="matrix",
796+
def build_design_matrices(builders, data,
797+
NA_action="drop",
798+
return_type="matrix",
793799
dtype=np.dtype(float)):
794800
"""Construct several design matrices from :class:`DesignMatrixBuilder`
795801
objects.
@@ -801,6 +807,10 @@ def build_design_matrices(builders, data, return_type="matrix",
801807
:arg builders: A list of :class:`DesignMatrixBuilder` objects specifying the
802808
design matrices to be built.
803809
:arg data: A dict-like object which will be used to look up data.
810+
:arg NA_action: What to do with rows that contain missing values. Either
811+
``"drop"``, ``"raise"``, or an :class:`NAAction` object. See
812+
:class:`NAAction` for details on what values count as 'missing' (and how
813+
to alter this).
804814
:arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
805815
:arg dtype: The dtype of the returned matrix. Useful if you want to use
806816
single-precision or extended-precision.
@@ -820,12 +830,15 @@ def build_design_matrices(builders, data, return_type="matrix",
820830
incrementally processing a large data set, simply call this function for
821831
each chunk.
822832
"""
833+
if isinstance(NA_action, basestring):
834+
NA_action = NAAction(NA_action)
823835
if return_type == "dataframe" and not have_pandas:
824836
raise PatsyError("pandas.DataFrame was requested, but pandas "
825837
"is not installed")
826838
if return_type not in ("matrix", "dataframe"):
827839
raise PatsyError("unrecognized output type %r, should be "
828840
"'matrix' or 'dataframe'" % (return_type,))
841+
# Evaluate factors
829842
evaluator_to_values = {}
830843
num_rows = None
831844
pandas_index = None
@@ -836,30 +849,48 @@ def build_design_matrices(builders, data, return_type="matrix",
836849
for evaluator in builder._evaluators:
837850
if evaluator not in evaluator_to_values:
838851
value = evaluator.eval(data)
839-
assert value.ndim == 2
852+
if isinstance(value, Categorical):
853+
unboxed = value.int_array
854+
else:
855+
unboxed = value
856+
# unboxed may now be a Series, DataFrame, or ndarray
840857
if num_rows is None:
841-
num_rows = value.shape[0]
858+
num_rows = unboxed.shape[0]
842859
else:
843-
if num_rows != value.shape[0]:
860+
if num_rows != unboxed.shape[0]:
844861
msg = ("Row mismatch: factor %s had %s rows, when "
845862
"previous factors had %s rows"
846-
% (evaluator.factor.name(), value.shape[0],
863+
% (evaluator.factor.name(), unboxed.shape[0],
847864
num_rows))
848865
raise PatsyError(msg, evaluator.factor)
849866
if (have_pandas
850-
and isinstance(value, (pandas.Series, pandas.DataFrame))):
867+
and isinstance(unboxed, (pandas.Series, pandas.DataFrame))):
851868
if pandas_index is None:
852-
pandas_index = value.index
869+
pandas_index = unboxed.index
853870
else:
854-
if not pandas_index.equals(value.index):
871+
if not pandas_index.equals(unboxed.index):
855872
msg = ("Index mismatch: pandas objects must "
856873
"have aligned indexes")
857874
raise PatsyError(msg, evaluator.factor)
858875
# Strategy: we work with raw ndarrays for doing the actual
859876
# combining; DesignMatrixBuilder objects never see pandas
860877
# objects. Then at the end, if a DataFrame was requested, we
861-
# convert.
862-
evaluator_to_values[evaluator] = np.asarray(value)
878+
# convert. So every entry in this dict is either a
879+
# Categorical object, or a 2-d array of values.
880+
if not isinstance(value, Categorical):
881+
value = np.asarray(value)
882+
evaluator_to_values[evaluator] = value
883+
# Handle NAs
884+
if pandas_index is None and num_rows is not None:
885+
pandas_index = np.arange(num_rows)
886+
factor_values = evaluator_to_values.values()
887+
origins = [evaluator.factor.origin for evaluator in evaluator_to_values]
888+
new_index, new_factor_values = NA_action.handle_NA(pandas_index,
889+
factor_values,
890+
origins)
891+
pandas_index = new_index
892+
evaluator_to_values = dict(zip(evaluator_to_values, new_factor_values))
893+
# Build factor values into matrices
863894
results = []
864895
for builder in builders:
865896
results.append(builder._build(evaluator_to_values, dtype))

patsy/categorical.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
from patsy.util import (SortAnythingKey,
1212
have_pandas, have_pandas_categorical,
1313
asarray_or_pandas,
14-
pandas_friendly_reshape)
14+
pandas_friendly_reshape,
15+
safe_scalar_isnan)
1516

1617
if have_pandas:
1718
import pandas
@@ -26,6 +27,10 @@ class Categorical(object):
2627
You should not normally need to use this class directly; it's mostly used
2728
as a way for :func:`C` to pass information back to the formula evaluation
2829
machinery.
30+
31+
The special integer -1 is used to indicate a missing value. (This is
32+
compatible with how :class:`pandas.Categorical` represents missing
33+
values.)
2934
"""
3035
def __init__(self, int_array, levels, contrast=None):
3136
self.int_array = asarray_or_pandas(int_array, dtype=int)
@@ -42,6 +47,8 @@ def __init__(self, int_array, levels, contrast=None):
4247

4348
@classmethod
4449
def from_pandas_categorical(cls, pandas_categorical):
50+
"""Create a Categorical object given a :class:`pandas.Categorical`
51+
object."""
4552
return Categorical(pandas_categorical.labels,
4653
pandas_categorical.levels)
4754

@@ -51,13 +58,22 @@ def from_sequence(cls, sequence, levels=None, **kwargs):
5158
5259
Create a Categorical object given a sequence of data. Levels will be
5360
auto-detected if not given.
61+
62+
As far as this function is concerned, 'None' and 'NaN' values are not
63+
possible levels; they will be treated as indicating missing values.
5464
"""
65+
def missing_level(level):
66+
return level is None or safe_scalar_isnan(level)
5567
if levels is None:
5668
try:
5769
levels = list(set(sequence))
5870
except TypeError:
5971
raise PatsyError("Error converting data to categorical: "
6072
"all items must be hashable")
73+
# Filter out any missing values. (Let's do this before sorting,
74+
# just to avoid any weirdness that might arise when trying to sort
75+
# NaNs...)
76+
levels = [level for level in levels if not missing_level(level)]
6177
levels.sort(key=SortAnythingKey)
6278
level_to_int = {}
6379
for i, level in enumerate(levels):
@@ -70,7 +86,10 @@ def from_sequence(cls, sequence, levels=None, **kwargs):
7086
int_array = np.empty(len(sequence), dtype=int)
7187
for i, entry in enumerate(sequence):
7288
try:
73-
int_array[i] = level_to_int[entry]
89+
if missing_level(entry):
90+
int_array[i] = -1
91+
else:
92+
int_array[i] = level_to_int[entry]
7493
except KeyError:
7594
sorted_levels = sorted(level_to_int)
7695
SHOW_LEVELS = 4
@@ -163,6 +182,31 @@ def test_Categorical():
163182
assert_raises(PatsyError,
164183
Categorical.from_sequence, ["a", "b"], levels=["a", "b", {}])
165184

185+
def test_Categorical_missing():
    """Check that None and NaN entries are encoded as the missing code -1.

    Exercises Categorical.from_sequence with auto-detected levels (for
    lists, object arrays, tuples-as-levels, and pandas Series when pandas
    is available), with explicitly supplied levels, and -- when pandas
    Categorical support is available -- Categorical.from_pandas_categorical.
    """
    seqs = [["a", "c", None, np.nan, "b"],
            # np.asarray takes one array-like argument, so the entries have
            # to be wrapped in a list rather than passed positionally.
            np.asarray(["a", "c", None, np.nan, "b"], dtype=object),
            # Tuple-valued levels, chosen so that ordinary tuple ordering
            # gives the same codes as the string cases:
            #   ("hi", 1) < ("hi", 2) < ("zzz", 1)
            [("hi", 1), ("zzz", 1), None, np.nan, ("hi", 2)],
            ]
    if have_pandas:
        seqs.append(pandas.Series(["a", "c", None, np.nan, "b"]))
    for seq in seqs:
        c = Categorical.from_sequence(seq)
        # None/NaN must not show up as levels...
        assert len(c.levels) == 3
        # ...and must be coded as -1 in the integer array.
        assert np.array_equal(c.int_array, [0, 2, -1, -1, 1])

    # Explicit levels fix the coding order, but missing entries still map
    # to -1.
    c = Categorical.from_sequence(["a", "c", None, np.nan, "b"],
                                  levels=["c", "a", "b"])
    assert c.levels == ("c", "a", "b")
    assert np.array_equal(c.int_array, [1, 0, -1, -1, 2])

    if have_pandas_categorical:
        # Make sure that from_pandas_categorical works too
        pc = pandas.Categorical(["a", "c", None, np.nan, "b"])
        # NOTE(review): safe_isnan is not among the names visibly imported
        # from patsy.util in this module (only safe_scalar_isnan is) --
        # confirm it is in scope before relying on this branch.
        assert np.array_equal(safe_isnan(pc),
                              [False, False, True, True, False])
        c = Categorical.from_pandas_categorical(pc)
        assert np.array_equal(c.int_array, [0, 2, -1, -1, 1])
209+
166210
# contrast= can be:
167211
# -- a ContrastMatrix
168212
# -- a simple np.ndarray

patsy/design_info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ class DesignMatrix(np.ndarray):
490490
DesignMatrix, but that are not actually design matrices (and such
491491
objects will behave like regular ndarrays in every way). Instead, check
492492
for the presence of a ``.design_info`` attribute -- this will be
493-
present only on"real" DesignMatrix objects.
493+
present only on "real" DesignMatrix objects.
494494
"""
495495

496496
def __new__(cls, input_array, design_info=None,

patsy/highlevel.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0):
129129
# DesignMatrixBuilder
130130
# (DesignMatrixBuilder, DesignMatrixBuilder)
131131
# any object with a special method __patsy_get_model_desc__
132-
def _do_highlevel_design(formula_like, data, eval_env, return_type):
132+
def _do_highlevel_design(formula_like, data, eval_env,
133+
NA_action, return_type):
133134
if return_type == "dataframe" and not have_pandas:
134135
raise PatsyError("pandas.DataFrame was requested, but pandas "
135136
"is not installed")
@@ -141,6 +142,7 @@ def data_iter_maker():
141142
builders = _try_incr_builders(formula_like, data_iter_maker, eval_env)
142143
if builders is not None:
143144
return build_design_matrices(builders, data,
145+
NA_action=NA_action,
144146
return_type=return_type)
145147
else:
146148
# No builders, but maybe we can still get matrices
@@ -197,7 +199,8 @@ def _regularize_matrix(m, default_column_prefix):
197199
rhs.index = lhs.index
198200
return (lhs, rhs)
199201

200-
def dmatrix(formula_like, data={}, eval_env=0, return_type="matrix"):
202+
def dmatrix(formula_like, data={}, eval_env=0,
203+
NA_action="drop", return_type="matrix"):
201204
"""Construct a single design matrix given a formula_like and data.
202205
203206
:arg formula_like: An object that can be used to construct a design
@@ -212,6 +215,10 @@ def dmatrix(formula_like, data={}, eval_env=0, return_type="matrix"):
212215
:func:`dmatrix` for lookups. If calling this function from a library,
213216
you probably want ``eval_env=1``, which means that variables should be
214217
resolved in *your* caller's namespace.
218+
:arg NA_action: What to do with rows that contain missing values. Either
219+
``"drop"``, ``"raise"``, or an :class:`NAAction` object. See
220+
:class:`NAAction` for details on what values count as 'missing' (and how
221+
to alter this).
215222
:arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
216223
217224
The `formula_like` can take a variety of forms:
@@ -252,13 +259,15 @@ def dmatrix(formula_like, data={}, eval_env=0, return_type="matrix"):
252259
preserved, which may be useful for e.g. time-series models.
253260
"""
254261
eval_env = EvalEnvironment.capture(eval_env, reference=1)
255-
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, return_type)
262+
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
263+
NA_action, return_type)
256264
if lhs.shape[1] != 0:
257265
raise PatsyError("encountered outcome variables for a model "
258266
"that does not expect them")
259267
return rhs
260268

261-
def dmatrices(formula_like, data={}, eval_env=0, return_type="matrix"):
269+
def dmatrices(formula_like, data={}, eval_env=0,
270+
NA_action="drop", return_type="matrix"):
262271
"""Construct two design matrices given a formula_like and data.
263272
264273
This function is identical to :func:`dmatrix`, except that it requires
@@ -274,7 +283,8 @@ def dmatrices(formula_like, data={}, eval_env=0, return_type="matrix"):
274283
See :func:`dmatrix` for details.
275284
"""
276285
eval_env = EvalEnvironment.capture(eval_env, reference=1)
277-
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, return_type)
286+
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
287+
NA_action, return_type)
278288
if lhs.shape[1] == 0:
279289
raise PatsyError("model is missing required outcome variables")
280290
return (lhs, rhs)

0 commit comments

Comments
 (0)