3838from patsy import PatsyError
3939from patsy .state import stateful_transform
4040from patsy .util import (SortAnythingKey ,
41- have_pandas , have_pandas_categorical ,
4241 safe_scalar_isnan ,
43- iterable )
42+ iterable ,
43+ have_pandas , have_pandas_categorical ,
44+ have_pandas_categorical_dtype ,
45+ safe_is_pandas_categorical ,
46+ pandas_Categorical_from_codes ,
47+ pandas_Categorical_categories ,
48+ pandas_Categorical_codes ,
49+ safe_issubdtype )
4450
4551if have_pandas :
4652 import pandas
@@ -113,18 +119,21 @@ def test_C():
113119 assert c4 .levels == "LEVELS"
114120
115121def guess_categorical (data ):
116- if have_pandas_categorical and isinstance (data , pandas . Categorical ):
122+ if safe_is_pandas_categorical (data ):
117123 return True
118124 if isinstance (data , _CategoricalBox ):
119125 return True
120126 data = np .asarray (data )
121- if np . issubdtype (data .dtype , np .number ):
127+ if safe_issubdtype (data .dtype , np .number ):
122128 return False
123129 return True
124130
125131def test_guess_categorical ():
126132 if have_pandas_categorical :
127- assert guess_categorical (pandas .Categorical .from_array ([1 , 2 , 3 ]))
133+ c = pandas .Categorical .from_array ([1 , 2 , 3 ])
134+ assert guess_categorical (c )
135+ if have_pandas_categorical_dtype :
136+ assert guess_categorical (pandas .Series (c ))
128137 assert guess_categorical (C ([1 , 2 , 3 ]))
129138 assert guess_categorical ([True , False ])
130139 assert guess_categorical (["a" , "b" ])
@@ -168,21 +177,21 @@ def sniff(self, data):
168177 if hasattr (data , "contrast" ):
169178 self ._contrast = data .contrast
170179 # returns a bool: are we confident that we found all the levels?
171- if have_pandas_categorical and isinstance (data , pandas .Categorical ):
172- # pandas.Categorical has its own NA detection, so don't try to
173- # second-guess it.
174- self ._levels = tuple (data .levels )
175- return True
176180 if isinstance (data , _CategoricalBox ):
177181 if data .levels is not None :
178182 self ._levels = tuple (data .levels )
179183 return True
180184 else :
181185 # unbox and fall through
182186 data = data .data
187+ if safe_is_pandas_categorical (data ):
188+ # pandas.Categorical has its own NA detection, so don't try to
189+ # second-guess it.
190+ self ._levels = tuple (pandas_Categorical_categories (data ))
191+ return True
183192 # fastpath to avoid doing an item-by-item iteration over boolean
184193 # arrays, as requested by #44
185- if hasattr (data , "dtype" ) and np . issubdtype (data .dtype , np .bool_ ):
194+ if hasattr (data , "dtype" ) and safe_issubdtype (data .dtype , np .bool_ ):
186195 self ._level_set = set ([True , False ])
187196 return True
188197
@@ -218,18 +227,26 @@ def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
218227 assert sniffer .levels_contrast () == (exp_levels , exp_contrast )
219228
220229 if have_pandas_categorical :
221- t ([], [pandas .Categorical .from_array ([1 , 2 , None ])],
222- True , (1 , 2 ))
223- # check order preservation
224- t ([], [pandas .Categorical ([1 , 0 ], ["a" , "b" ])],
225- True , ("a" , "b" ))
226- t ([], [pandas .Categorical ([1 , 0 ], ["b" , "a" ])],
227- True , ("b" , "a" ))
228- # check that if someone sticks a .contrast field onto a Categorical
229- # object, we pick it up:
230- c = pandas .Categorical .from_array (["a" , "b" ])
231- c .contrast = "CONTRAST"
232- t ([], [c ], True , ("a" , "b" ), "CONTRAST" )
230+ # We make sure to test with both boxed and unboxed pandas objects,
231+ # because we used to have a bug where boxed pandas objects would be
232+ # treated as categorical, but their levels would be lost...
233+ preps = [lambda x : x ,
234+ C ]
235+ if have_pandas_categorical_dtype :
236+ preps += [pandas .Series ,
237+ lambda x : C (pandas .Series (x ))]
238+ for prep in preps :
239+ t ([], [prep (pandas .Categorical .from_array ([1 , 2 , None ]))],
240+ True , (1 , 2 ))
241+ # check order preservation
242+ t ([], [prep (pandas_Categorical_from_codes ([1 , 0 ], ["a" , "b" ]))],
243+ True , ("a" , "b" ))
244+ t ([], [prep (pandas_Categorical_from_codes ([1 , 0 ], ["b" , "a" ]))],
245+ True , ("b" , "a" ))
246+ # check that if someone sticks a .contrast field onto our object
247+ obj = prep (pandas .Categorical .from_array (["a" , "b" ]))
248+ obj .contrast = "CONTRAST"
249+ t ([], [obj ], True , ("a" , "b" ), "CONTRAST" )
233250
234251 t ([], [C ([1 , 2 ]), C ([3 , 2 ])], False , (1 , 2 , 3 ))
235252 # check order preservation
@@ -286,14 +303,14 @@ def categorical_to_int(data, levels, NA_action, origin=None):
286303 assert isinstance (levels , tuple )
287304 # In this function, missing values are always mapped to -1
288305
289- if have_pandas_categorical and isinstance (data , pandas . Categorical ):
290- data_levels_tuple = tuple (data . levels )
306+ if safe_is_pandas_categorical (data ):
307+ data_levels_tuple = tuple (pandas_Categorical_categories ( data ) )
291308 if not data_levels_tuple == levels :
292309 raise PatsyError ("mismatching levels: expected %r, got %r"
293310 % (levels , data_levels_tuple ), origin )
294311 # pandas.Categorical also uses -1 to indicate NA, and we don't try to
295312 # second-guess its NA detection, so we can just pass it back.
296- return data . labels
313+ return pandas_Categorical_codes ( data )
297314
298315 if isinstance (data , _CategoricalBox ):
299316 if data .levels is not None and tuple (data .levels ) != levels :
@@ -311,7 +328,7 @@ def categorical_to_int(data, levels, NA_action, origin=None):
311328
312329 # fastpath to avoid doing an item-by-item iteration over boolean arrays,
313330 # as requested by #44
314- if hasattr (data , "dtype" ) and np . issubdtype (data .dtype , np .bool_ ):
331+ if hasattr (data , "dtype" ) and safe_issubdtype (data .dtype , np .bool_ ):
315332 if level_to_int [False ] == 0 and level_to_int [True ] == 1 :
316333 return data .astype (np .int_ )
317334 out = np .empty (len (data ), dtype = int )
@@ -358,24 +375,32 @@ def test_categorical_to_int():
358375 categorical_to_int ,
359376 pandas .DataFrame ({10 : s }), ("a" , "b" , "c" ), NAAction ())
360377 if have_pandas_categorical :
361- cat = pandas .Categorical ([1 , 0 , - 1 ], ("a" , "b" ))
362- conv = categorical_to_int (cat , ("a" , "b" ), NAAction ())
363- assert np .all (conv == [1 , 0 , - 1 ])
364- # Trust pandas NA marking
365- cat2 = pandas .Categorical ([1 , 0 , - 1 ], ("a" , "None" ))
366- conv2 = categorical_to_int (cat , ("a" , "b" ), NAAction (NA_types = ["None" ]))
367- assert np .all (conv2 == [1 , 0 , - 1 ])
368- # But levels must match
369- assert_raises (PatsyError ,
370- categorical_to_int ,
371- pandas .Categorical ([1 , 0 ], ("a" , "b" )),
372- ("a" , "c" ),
373- NAAction ())
374- assert_raises (PatsyError ,
375- categorical_to_int ,
376- pandas .Categorical ([1 , 0 ], ("a" , "b" )),
377- ("b" , "a" ),
378- NAAction ())
378+ constructors = [pandas_Categorical_from_codes ]
379+ if have_pandas_categorical_dtype :
380+ def Series_from_codes (codes , categories ):
381+ c = pandas_Categorical_from_codes (codes , categories )
382+ return pandas .Series (c )
383+ constructors .append (Series_from_codes )
384+ for con in constructors :
385+ cat = con ([1 , 0 , - 1 ], ("a" , "b" ))
386+ conv = categorical_to_int (cat , ("a" , "b" ), NAAction ())
387+ assert np .all (conv == [1 , 0 , - 1 ])
388+ # Trust pandas NA marking
389+ cat2 = con ([1 , 0 , - 1 ], ("a" , "None" ))
390+ conv2 = categorical_to_int (cat , ("a" , "b" ),
391+ NAAction (NA_types = ["None" ]))
392+ assert np .all (conv2 == [1 , 0 , - 1 ])
393+ # But levels must match
394+ assert_raises (PatsyError ,
395+ categorical_to_int ,
396+ con ([1 , 0 ], ("a" , "b" )),
397+ ("a" , "c" ),
398+ NAAction ())
399+ assert_raises (PatsyError ,
400+ categorical_to_int ,
401+ con ([1 , 0 ], ("a" , "b" )),
402+ ("b" , "a" ),
403+ NAAction ())
379404
380405 def t (data , levels , expected , NA_action = NAAction ()):
381406 got = categorical_to_int (data , levels , NA_action )
0 commit comments