Skip to content

Commit 34425be

Browse files
committed
patsy.build and patsy.categorical tests pass
1 parent 109419b commit 34425be

File tree

4 files changed

+252
-405
lines changed

4 files changed

+252
-405
lines changed

patsy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def _reexport(modname):
7272
__all__.append(var)
7373
globals()[var] = getattr(mod, var)
7474

75-
for child in ["highlevel", "build", "categorical", "constraint", "contrasts",
75+
for child in ["highlevel", "build", "constraint", "contrasts",
7676
"desc", "design_info", "eval", "origin", "state",
7777
"user_util", "missing"]:
7878
_reexport("patsy." + child)

patsy/build.py

Lines changed: 63 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import numpy as np
1212
from patsy import PatsyError
1313
from patsy.categorical import (guess_categorical,
14-
CatLevelSniffer,
14+
CategoricalSniffer,
1515
categorical_to_int)
1616
from patsy.util import (atleast_2d_column_default,
1717
have_pandas, have_pandas_categorical,
@@ -62,8 +62,8 @@ def __init__(self, factor, state, expected_columns):
6262
self._state = state
6363
self._expected_columns = expected_columns
6464

65+
# Returns either a 2d ndarray, or a DataFrame
6566
def eval(self, data):
66-
# Returns either a 2d ndarray, or a DataFrame
6767
result = self.factor.eval(self._state, data)
6868
result = atleast_2d_column_default(result, preserve_pandas=True)
6969
_max_allowed_dim(2, result, self.factor)
@@ -135,59 +135,48 @@ def __init__(self, factor, state, levels):
135135
self._state = state
136136
self._levels = tuple(levels)
137137

138+
# returns either a 1d ndarray or a pandas.Series
138139
def eval(self, data):
139-
# returns either a 2d ndarray or a DataFrame
140140
result = self.factor.eval(self._state, data)
141141
# XX FIXME: use the real NA action
142142
result = categorical_to_int(result, self._levels, NAAction())
143-
if result.levels != self._expected_levels:
144-
msg = ("when evaluating categoric factor %r, I got Categorical "
145-
"data with unexpected levels (wanted %s, got %s)"
146-
% (self.factor.name(), self._expected_levels, result.levels))
147-
raise PatsyError(msg, self.factor)
148-
_max_allowed_dim(1, result.int_array, self.factor)
143+
assert result.ndim == 1
149144
return result
150145

151146
def test__CatFactorEvaluator():
152147
from nose.tools import assert_raises
153-
from patsy.categorical import Categorical
148+
from patsy.categorical import C
154149
f = _MockFactor()
155-
ct = CategoricalTransform()
156-
ct.memorize_chunk(Categorical([0, 1], levels=("a", "b")))
157-
ct.memorize_finish()
158-
cf1 = _CatFactorEvaluator(f, {}, ct, ["a", "b"])
150+
cf1 = _CatFactorEvaluator(f, {}, ["a", "b"])
159151
assert cf1.factor is f
160-
cat1 = cf1.eval({"mock": Categorical.from_sequence(["b", "a", "b"])})
161-
assert cat1.int_array.shape == (3,)
162-
assert np.all(cat1.int_array == [1, 0, 1])
152+
cat1 = cf1.eval({"mock": ["b", "a", "b"]})
153+
assert cat1.shape == (3,)
154+
assert np.all(cat1 == [1, 0, 1])
163155
assert_raises(PatsyError, cf1.eval, {"mock": ["c"]})
156+
assert_raises(PatsyError, cf1.eval, {"mock": C(["a", "c"])})
164157
assert_raises(PatsyError, cf1.eval,
165-
{"mock": Categorical.from_sequence(["a", "c"])})
166-
assert_raises(PatsyError, cf1.eval,
167-
{"mock": Categorical.from_sequence(["a", "b"],
168-
levels=["b", "a"])})
158+
{"mock": C(["a", "b"], levels=["b", "a"])})
169159
assert_raises(PatsyError, cf1.eval, {"mock": [1, 0, 1]})
170-
bad_cat = Categorical.from_sequence(["b", "a", "a", "b"])
171-
bad_cat.int_array.resize((2, 2))
160+
bad_cat = np.asarray(["b", "a", "a", "b"])
161+
bad_cat.resize((2, 2))
172162
assert_raises(PatsyError, cf1.eval, {"mock": bad_cat})
173163

174-
btc = _BoolToCat(_MockFactor())
175-
cf2 = _CatFactorEvaluator(_MockFactor(), {}, btc, [False, True])
164+
cf2 = _CatFactorEvaluator(_MockFactor(), {}, [False, True])
176165
cat2 = cf2.eval({"mock": [True, False, False, True]})
177-
assert cat2.int_array.shape == (4,)
178-
assert np.all(cat2.int_array == [1, 0, 0, 1])
166+
assert cat2.shape == (4,)
167+
assert np.all(cat2 == [1, 0, 0, 1])
179168

180169
if have_pandas:
181170
s = pandas.Series(["b", "a"], index=[10, 20])
182-
cat_s = cf1.eval({"mock": Categorical.from_sequence(s)})
183-
assert isinstance(cat_s.int_array, pandas.Series)
184-
assert np.array_equal(cat_s.int_array, [1, 0])
185-
assert np.array_equal(cat_s.int_array.index, [10, 20])
171+
cat_s = cf1.eval({"mock": s})
172+
assert isinstance(cat_s, pandas.Series)
173+
assert np.array_equal(cat_s, [1, 0])
174+
assert np.array_equal(cat_s.index, [10, 20])
186175
sbool = pandas.Series([True, False], index=[11, 21])
187176
cat_sbool = cf2.eval({"mock": sbool})
188-
assert isinstance(cat_sbool.int_array, pandas.Series)
189-
assert np.array_equal(cat_sbool.int_array, [1, 0])
190-
assert np.array_equal(cat_sbool.int_array.index, [11, 21])
177+
assert isinstance(cat_sbool, pandas.Series)
178+
assert np.array_equal(cat_sbool, [1, 0])
179+
assert np.array_equal(cat_sbool.index, [11, 21])
191180

192181
def _column_combinations(columns_per_factor):
193182
# For consistency with R, the left-most item iterates fastest:
@@ -249,18 +238,20 @@ def build(self, factor_values, out):
249238
for factor, column_idx in zip(self._factors, column_idxs):
250239
if factor in self._cat_contrasts:
251240
contrast = self._cat_contrasts[factor]
252-
int_array = factor_values[factor].int_array
253-
if np.any(int_array < 0):
241+
if np.any(factor_values[factor] < 0):
254242
raise PatsyError("can't build a design matrix "
255243
"containing missing values", factor)
256-
out[:, i] *= contrast.matrix[int_array, column_idx]
244+
out[:, i] *= contrast.matrix[factor_values[factor],
245+
column_idx]
257246
else:
258247
assert (factor_values[factor].shape[1]
259248
== self._num_columns[factor])
260249
out[:, i] *= factor_values[factor][:, column_idx]
261250

262251
def test__ColumnBuilder():
252+
from nose.tools import assert_raises
263253
from patsy.contrasts import ContrastMatrix
254+
from patsy.categorical import C
264255
f1 = _MockFactor("f1")
265256
f2 = _MockFactor("f2")
266257
f3 = _MockFactor("f3")
@@ -272,16 +263,23 @@ def test__ColumnBuilder():
272263
mat = np.empty((3, 2))
273264
assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
274265
cb.build({f1: atleast_2d_column_default([1, 2, 3]),
275-
f2: Categorical([0, 0, 1], levels=("c1", "c2")),
266+
f2: np.asarray([0, 0, 1]),
276267
f3: atleast_2d_column_default([7.5, 2, -12])},
277268
mat)
278269
assert np.allclose(mat, [[0, 0.5 * 1 * 7.5],
279270
[0, 0.5 * 2 * 2],
280271
[3 * 3 * -12, 0]])
272+
# Check that missing categorical values blow up
273+
assert_raises(PatsyError, cb.build,
274+
{f1: atleast_2d_column_default([1, 2, 3]),
275+
f2: np.asarray([0, -1, 1]),
276+
f3: atleast_2d_column_default([7.5, 2, -12])},
277+
mat)
278+
281279
cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
282280
mat2 = np.empty((3, 4))
283281
cb2.build({f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
284-
f2: Categorical([0, 0, 1], levels=("c1", "c2")),
282+
f2: np.asarray([0, 0, 1]),
285283
f3: atleast_2d_column_default([7.5, 2, -12])},
286284
mat2)
287285
assert cb2.column_names() == ["f1[0]:f2[c1]:f3",
@@ -388,28 +386,20 @@ def __call__(self):
388386
}
389387
assert factor_states == expected
390388

391-
def _examine_factor_types(factors, factor_states, data_iter_maker,
392-
NA_action):
389+
def _examine_factor_types(factors, factor_states, data_iter_maker):
393390
num_column_counts = {}
394-
cat_levels_contrasts = {}
395-
cat_level_sniffers = {}
391+
cat_sniffers = {}
396392
examine_needed = set(factors)
397393
for data in data_iter_maker():
398-
# We might have gathered all the information we need after the first
399-
# chunk of data. If so, then we shouldn't spend time loading all the
400-
# rest of the chunks.
401-
if not examine_needed:
402-
break
403394
for factor in list(examine_needed):
404395
value = factor.eval(factor_states[factor], data)
405-
if factor in cat_level_sniffers or guess_categorical(value):
406-
if factor not in cat_level_sniffers:
407-
cat_level_sniffers[factor] = CatLevelSniffer(NA_action)
408-
done = cat_level_sniffers[factor].sniff_levels(value)
396+
if factor in cat_sniffers or guess_categorical(value):
397+
# XX FIXME: use real NAAction
398+
if factor not in cat_sniffers:
399+
cat_sniffers[factor] = CategoricalSniffer(NAAction(),
400+
factor.origin)
401+
done = cat_sniffers[factor].sniff(value)
409402
if done:
410-
levels = cat_level_sniffers.pop(factor).levels()
411-
contrast = getattr(value, "contrast", None)
412-
cat_levels_contrasts[factor] = (levels, contrast)
413403
examine_needed.remove(factor)
414404
else:
415405
# Numeric
@@ -418,9 +408,16 @@ def _examine_factor_types(factors, factor_states, data_iter_maker,
418408
column_count = value.shape[1]
419409
num_column_counts[factor] = column_count
420410
examine_needed.remove(factor)
411+
if not examine_needed:
412+
break
413+
# Pull out the levels
414+
cat_levels_contrasts = {}
415+
for factor, sniffer in cat_sniffers.iteritems():
416+
cat_levels_contrasts[factor] = sniffer.levels_contrast()
421417
return (num_column_counts, cat_levels_contrasts)
422418

423419
def test__examine_factor_types():
420+
from patsy.categorical import C
424421
class MockFactor(object):
425422
def __init__(self):
426423
# You should check this using 'is', not '=='
@@ -463,18 +460,18 @@ def next(self):
463460
num_1dim: ([1, 2, 3], [4, 5, 6]),
464461
num_1col: ([[1], [2], [3]], [[4], [5], [6]]),
465462
num_4col: (np.zeros((3, 4)), np.ones((3, 4))),
466-
categ_1col: (Categorical([0, 1, 2], levels=("a", "b", "c"),
467-
contrast="MOCK CONTRAST"),
468-
Categorical([2, 1, 0], levels=("a", "b", "c"),
469-
contrast="MOCK CONTRAST")),
463+
categ_1col: (C(["a", "b", "c"], levels=("a", "b", "c"),
464+
contrast="MOCK CONTRAST"),
465+
C(["c", "b", "a"], levels=("a", "b", "c"),
466+
contrast="MOCK CONTRAST")),
470467
bool_1col: ([True, True, False], [False, True, True]),
471468
# It has to read through all the data to see all the possible levels:
472469
string_1col: (["a", "a", "a"], ["c", "b", "a"]),
473470
object_1col: ([object_levels[0]] * 3, object_levels),
474471
}
475472

476473
it = DataIterMaker()
477-
(num_column_counts, cat_levels_contrasts, cat_postprocessors
474+
(num_column_counts, cat_levels_contrasts,
478475
) = _examine_factor_types(factor_states.keys(), factor_states, it)
479476
assert it.i == 2
480477
iterations = 0
@@ -485,21 +482,18 @@ def next(self):
485482
string_1col: (("a", "b", "c"), None),
486483
object_1col: (tuple(sorted(object_levels, key=id)), None),
487484
}
488-
assert (set(cat_postprocessors.keys())
489-
== set([categ_1col, bool_1col, string_1col, object_1col]))
490485

491486
# Check that it doesn't read through all the data if that's not necessary:
492487
it = DataIterMaker()
493488
no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col]
494-
(num_column_counts, cat_levels_contrasts, cat_postprocessors
489+
(num_column_counts, cat_levels_contrasts,
495490
) = _examine_factor_types(no_read_necessary, factor_states, it)
496-
assert it.i == 1
491+
assert it.i == 0
497492
assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
498493
assert cat_levels_contrasts == {
499494
categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
500495
bool_1col: ((False, True), None),
501496
}
502-
assert set(cat_postprocessors) == set([categ_1col, bool_1col])
503497

504498
# Illegal inputs:
505499
bool_3col = MockFactor()
@@ -621,10 +615,9 @@ def design_matrix_builders(termlists, data_iter_maker):
621615
# Now all the factors have working eval methods, so we can evaluate them
622616
# on some data to find out what type of data they return.
623617
(num_column_counts,
624-
cat_levels_contrasts,
625-
cat_postprocessors) = _examine_factor_types(all_factors,
626-
factor_states,
627-
data_iter_maker)
618+
cat_levels_contrasts) = _examine_factor_types(all_factors,
619+
factor_states,
620+
data_iter_maker)
628621
# Now we need the factor evaluators, which encapsulate the knowledge of
629622
# how to turn any given factor into a chunk of data:
630623
factor_evaluators = {}
@@ -635,10 +628,9 @@ def design_matrix_builders(termlists, data_iter_maker):
635628
num_column_counts[factor])
636629
else:
637630
assert factor in cat_levels_contrasts
638-
postprocessor = cat_postprocessors.get(factor)
639631
levels = cat_levels_contrasts[factor][0]
640632
evaluator = _CatFactorEvaluator(factor, factor_states[factor],
641-
postprocessor, levels)
633+
levels)
642634
factor_evaluators[factor] = evaluator
643635
# And now we can construct the DesignMatrixBuilder for each termlist:
644636
builders = []

0 commit comments

Comments
 (0)