1111import numpy as np
1212from patsy import PatsyError
1313from patsy .categorical import (guess_categorical ,
14- CatLevelSniffer ,
14+ CategoricalSniffer ,
1515 categorical_to_int )
1616from patsy .util import (atleast_2d_column_default ,
1717 have_pandas , have_pandas_categorical ,
@@ -62,8 +62,8 @@ def __init__(self, factor, state, expected_columns):
6262 self ._state = state
6363 self ._expected_columns = expected_columns
6464
65+ # Returns either a 2d ndarray, or a DataFrame
6566 def eval (self , data ):
66- # Returns either a 2d ndarray, or a DataFrame
6767 result = self .factor .eval (self ._state , data )
6868 result = atleast_2d_column_default (result , preserve_pandas = True )
6969 _max_allowed_dim (2 , result , self .factor )
@@ -135,59 +135,48 @@ def __init__(self, factor, state, levels):
135135 self ._state = state
136136 self ._levels = tuple (levels )
137137
138+ # returns either a 1d ndarray or a pandas.Series
138139 def eval (self , data ):
139- # returns either a 2d ndarray or a DataFrame
140140 result = self .factor .eval (self ._state , data )
141141 # XX FIXME: use the real NA action
142142 result = categorical_to_int (result , self ._levels , NAAction ())
143- if result .levels != self ._expected_levels :
144- msg = ("when evaluating categoric factor %r, I got Categorical "
145- "data with unexpected levels (wanted %s, got %s)"
146- % (self .factor .name (), self ._expected_levels , result .levels ))
147- raise PatsyError (msg , self .factor )
148- _max_allowed_dim (1 , result .int_array , self .factor )
143+ assert result .ndim == 1
149144 return result
150145
151146def test__CatFactorEvaluator ():
152147 from nose .tools import assert_raises
153- from patsy .categorical import Categorical
148+ from patsy .categorical import C
154149 f = _MockFactor ()
155- ct = CategoricalTransform ()
156- ct .memorize_chunk (Categorical ([0 , 1 ], levels = ("a" , "b" )))
157- ct .memorize_finish ()
158- cf1 = _CatFactorEvaluator (f , {}, ct , ["a" , "b" ])
150+ cf1 = _CatFactorEvaluator (f , {}, ["a" , "b" ])
159151 assert cf1 .factor is f
160- cat1 = cf1 .eval ({"mock" : Categorical . from_sequence ( ["b" , "a" , "b" ]) })
161- assert cat1 .int_array . shape == (3 ,)
162- assert np .all (cat1 . int_array == [1 , 0 , 1 ])
152+ cat1 = cf1 .eval ({"mock" : ["b" , "a" , "b" ]})
153+ assert cat1 .shape == (3 ,)
154+ assert np .all (cat1 == [1 , 0 , 1 ])
163155 assert_raises (PatsyError , cf1 .eval , {"mock" : ["c" ]})
156+ assert_raises (PatsyError , cf1 .eval , {"mock" : C (["a" , "c" ])})
164157 assert_raises (PatsyError , cf1 .eval ,
165- {"mock" : Categorical .from_sequence (["a" , "c" ])})
166- assert_raises (PatsyError , cf1 .eval ,
167- {"mock" : Categorical .from_sequence (["a" , "b" ],
168- levels = ["b" , "a" ])})
158+ {"mock" : C (["a" , "b" ], levels = ["b" , "a" ])})
169159 assert_raises (PatsyError , cf1 .eval , {"mock" : [1 , 0 , 1 ]})
170- bad_cat = Categorical . from_sequence (["b" , "a" , "a" , "b" ])
171- bad_cat .int_array . resize ((2 , 2 ))
160+ bad_cat = np . asarray (["b" , "a" , "a" , "b" ])
161+ bad_cat .resize ((2 , 2 ))
172162 assert_raises (PatsyError , cf1 .eval , {"mock" : bad_cat })
173163
174- btc = _BoolToCat (_MockFactor ())
175- cf2 = _CatFactorEvaluator (_MockFactor (), {}, btc , [False , True ])
164+ cf2 = _CatFactorEvaluator (_MockFactor (), {}, [False , True ])
176165 cat2 = cf2 .eval ({"mock" : [True , False , False , True ]})
177- assert cat2 .int_array . shape == (4 ,)
178- assert np .all (cat2 . int_array == [1 , 0 , 0 , 1 ])
166+ assert cat2 .shape == (4 ,)
167+ assert np .all (cat2 == [1 , 0 , 0 , 1 ])
179168
180169 if have_pandas :
181170 s = pandas .Series (["b" , "a" ], index = [10 , 20 ])
182- cat_s = cf1 .eval ({"mock" : Categorical . from_sequence ( s ) })
183- assert isinstance (cat_s . int_array , pandas .Series )
184- assert np .array_equal (cat_s . int_array , [1 , 0 ])
185- assert np .array_equal (cat_s .int_array . index , [10 , 20 ])
171+ cat_s = cf1 .eval ({"mock" : s })
172+ assert isinstance (cat_s , pandas .Series )
173+ assert np .array_equal (cat_s , [1 , 0 ])
174+ assert np .array_equal (cat_s .index , [10 , 20 ])
186175 sbool = pandas .Series ([True , False ], index = [11 , 21 ])
187176 cat_sbool = cf2 .eval ({"mock" : sbool })
188- assert isinstance (cat_sbool . int_array , pandas .Series )
189- assert np .array_equal (cat_sbool . int_array , [1 , 0 ])
190- assert np .array_equal (cat_sbool .int_array . index , [11 , 21 ])
177+ assert isinstance (cat_sbool , pandas .Series )
178+ assert np .array_equal (cat_sbool , [1 , 0 ])
179+ assert np .array_equal (cat_sbool .index , [11 , 21 ])
191180
192181def _column_combinations (columns_per_factor ):
193182 # For consistency with R, the left-most item iterates fastest:
@@ -249,18 +238,20 @@ def build(self, factor_values, out):
249238 for factor , column_idx in zip (self ._factors , column_idxs ):
250239 if factor in self ._cat_contrasts :
251240 contrast = self ._cat_contrasts [factor ]
252- int_array = factor_values [factor ].int_array
253- if np .any (int_array < 0 ):
241+ if np .any (factor_values [factor ] < 0 ):
254242 raise PatsyError ("can't build a design matrix "
255243 "containing missing values" , factor )
256- out [:, i ] *= contrast .matrix [int_array , column_idx ]
244+ out [:, i ] *= contrast .matrix [factor_values [factor ],
245+ column_idx ]
257246 else :
258247 assert (factor_values [factor ].shape [1 ]
259248 == self ._num_columns [factor ])
260249 out [:, i ] *= factor_values [factor ][:, column_idx ]
261250
262251def test__ColumnBuilder ():
252+ from nose .tools import assert_raises
263253 from patsy .contrasts import ContrastMatrix
254+ from patsy .categorical import C
264255 f1 = _MockFactor ("f1" )
265256 f2 = _MockFactor ("f2" )
266257 f3 = _MockFactor ("f3" )
@@ -272,16 +263,23 @@ def test__ColumnBuilder():
272263 mat = np .empty ((3 , 2 ))
273264 assert cb .column_names () == ["f1:f2[c1]:f3" , "f1:f2[c2]:f3" ]
274265 cb .build ({f1 : atleast_2d_column_default ([1 , 2 , 3 ]),
275- f2 : Categorical ([0 , 0 , 1 ], levels = ( "c1" , "c2" ) ),
266+ f2 : np . asarray ([0 , 0 , 1 ]),
276267 f3 : atleast_2d_column_default ([7.5 , 2 , - 12 ])},
277268 mat )
278269 assert np .allclose (mat , [[0 , 0.5 * 1 * 7.5 ],
279270 [0 , 0.5 * 2 * 2 ],
280271 [3 * 3 * - 12 , 0 ]])
272+ # Check that missing categorical values blow up
273+ assert_raises (PatsyError , cb .build ,
274+ {f1 : atleast_2d_column_default ([1 , 2 , 3 ]),
275+ f2 : np .asarray ([0 , - 1 , 1 ]),
276+ f3 : atleast_2d_column_default ([7.5 , 2 , - 12 ])},
277+ mat )
278+
281279 cb2 = _ColumnBuilder ([f1 , f2 , f3 ], {f1 : 2 , f3 : 1 }, {f2 : contrast })
282280 mat2 = np .empty ((3 , 4 ))
283281 cb2 .build ({f1 : atleast_2d_column_default ([[1 , 2 ], [3 , 4 ], [5 , 6 ]]),
284- f2 : Categorical ([0 , 0 , 1 ], levels = ( "c1" , "c2" ) ),
282+ f2 : np . asarray ([0 , 0 , 1 ]),
285283 f3 : atleast_2d_column_default ([7.5 , 2 , - 12 ])},
286284 mat2 )
287285 assert cb2 .column_names () == ["f1[0]:f2[c1]:f3" ,
@@ -388,28 +386,20 @@ def __call__(self):
388386 }
389387 assert factor_states == expected
390388
391- def _examine_factor_types (factors , factor_states , data_iter_maker ,
392- NA_action ):
389+ def _examine_factor_types (factors , factor_states , data_iter_maker ):
393390 num_column_counts = {}
394- cat_levels_contrasts = {}
395- cat_level_sniffers = {}
391+ cat_sniffers = {}
396392 examine_needed = set (factors )
397393 for data in data_iter_maker ():
398- # We might have gathered all the information we need after the first
399- # chunk of data. If so, then we shouldn't spend time loading all the
400- # rest of the chunks.
401- if not examine_needed :
402- break
403394 for factor in list (examine_needed ):
404395 value = factor .eval (factor_states [factor ], data )
405- if factor in cat_level_sniffers or guess_categorical (value ):
406- if factor not in cat_level_sniffers :
407- cat_level_sniffers [factor ] = CatLevelSniffer (NA_action )
408- done = cat_level_sniffers [factor ].sniff_levels (value )
396+ if factor in cat_sniffers or guess_categorical (value ):
397+ # XX FIXME: use real NAAction
398+ if factor not in cat_sniffers :
399+ cat_sniffers [factor ] = CategoricalSniffer (NAAction (),
400+ factor .origin )
401+ done = cat_sniffers [factor ].sniff (value )
409402 if done :
410- levels = cat_level_sniffers .pop (factor ).levels ()
411- contrast = getattr (value , "contrast" , None )
412- cat_levels_contrasts [factor ] = (levels , contrast )
413403 examine_needed .remove (factor )
414404 else :
415405 # Numeric
@@ -418,9 +408,16 @@ def _examine_factor_types(factors, factor_states, data_iter_maker,
418408 column_count = value .shape [1 ]
419409 num_column_counts [factor ] = column_count
420410 examine_needed .remove (factor )
411+ if not examine_needed :
412+ break
413+ # Pull out the levels
414+ cat_levels_contrasts = {}
415+ for factor , sniffer in cat_sniffers .iteritems ():
416+ cat_levels_contrasts [factor ] = sniffer .levels_contrast ()
421417 return (num_column_counts , cat_levels_contrasts )
422418
423419def test__examine_factor_types ():
420+ from patsy .categorical import C
424421 class MockFactor (object ):
425422 def __init__ (self ):
426423 # You should check this using 'is', not '=='
@@ -463,18 +460,18 @@ def next(self):
463460 num_1dim : ([1 , 2 , 3 ], [4 , 5 , 6 ]),
464461 num_1col : ([[1 ], [2 ], [3 ]], [[4 ], [5 ], [6 ]]),
465462 num_4col : (np .zeros ((3 , 4 )), np .ones ((3 , 4 ))),
466- categ_1col : (Categorical ([ 0 , 1 , 2 ], levels = ("a" , "b" , "c" ),
467- contrast = "MOCK CONTRAST" ),
468- Categorical ([ 2 , 1 , 0 ], levels = ("a" , "b" , "c" ),
469- contrast = "MOCK CONTRAST" )),
463+ categ_1col : (C ([ "a" , "b" , "c" ], levels = ("a" , "b" , "c" ),
464+ contrast = "MOCK CONTRAST" ),
465+ C ([ "c" , "b" , "a" ], levels = ("a" , "b" , "c" ),
466+ contrast = "MOCK CONTRAST" )),
470467 bool_1col : ([True , True , False ], [False , True , True ]),
471468 # It has to read through all the data to see all the possible levels:
472469 string_1col : (["a" , "a" , "a" ], ["c" , "b" , "a" ]),
473470 object_1col : ([object_levels [0 ]] * 3 , object_levels ),
474471 }
475472
476473 it = DataIterMaker ()
477- (num_column_counts , cat_levels_contrasts , cat_postprocessors
474+ (num_column_counts , cat_levels_contrasts ,
478475 ) = _examine_factor_types (factor_states .keys (), factor_states , it )
479476 assert it .i == 2
480477 iterations = 0
@@ -485,21 +482,18 @@ def next(self):
485482 string_1col : (("a" , "b" , "c" ), None ),
486483 object_1col : (tuple (sorted (object_levels , key = id )), None ),
487484 }
488- assert (set (cat_postprocessors .keys ())
489- == set ([categ_1col , bool_1col , string_1col , object_1col ]))
490485
491486 # Check that it doesn't read through all the data if that's not necessary:
492487 it = DataIterMaker ()
493488 no_read_necessary = [num_1dim , num_1col , num_4col , categ_1col , bool_1col ]
494- (num_column_counts , cat_levels_contrasts , cat_postprocessors
489+ (num_column_counts , cat_levels_contrasts ,
495490 ) = _examine_factor_types (no_read_necessary , factor_states , it )
496- assert it .i == 1
491+ assert it .i == 0
497492 assert num_column_counts == {num_1dim : 1 , num_1col : 1 , num_4col : 4 }
498493 assert cat_levels_contrasts == {
499494 categ_1col : (("a" , "b" , "c" ), "MOCK CONTRAST" ),
500495 bool_1col : ((False , True ), None ),
501496 }
502- assert set (cat_postprocessors ) == set ([categ_1col , bool_1col ])
503497
504498 # Illegal inputs:
505499 bool_3col = MockFactor ()
@@ -621,10 +615,9 @@ def design_matrix_builders(termlists, data_iter_maker):
621615 # Now all the factors have working eval methods, so we can evaluate them
622616 # on some data to find out what type of data they return.
623617 (num_column_counts ,
624- cat_levels_contrasts ,
625- cat_postprocessors ) = _examine_factor_types (all_factors ,
626- factor_states ,
627- data_iter_maker )
618+ cat_levels_contrasts ) = _examine_factor_types (all_factors ,
619+ factor_states ,
620+ data_iter_maker )
628621 # Now we need the factor evaluators, which encapsulate the knowledge of
629622 # how to turn any given factor into a chunk of data:
630623 factor_evaluators = {}
@@ -635,10 +628,9 @@ def design_matrix_builders(termlists, data_iter_maker):
635628 num_column_counts [factor ])
636629 else :
637630 assert factor in cat_levels_contrasts
638- postprocessor = cat_postprocessors .get (factor )
639631 levels = cat_levels_contrasts [factor ][0 ]
640632 evaluator = _CatFactorEvaluator (factor , factor_states [factor ],
641- postprocessor , levels )
633+ levels )
642634 factor_evaluators [factor ] = evaluator
643635 # And now we can construct the DesignMatrixBuilder for each termlist:
644636 builders = []
0 commit comments