1717if have_pandas :
1818 import pandas
1919
20+ # Conundrum:
21+ # - We don't want to screw around with missing value handling in Categorical;
22+ # that logic is all localized inside the builder code. (And in particular,
23+ # all the configurability is there as well.)
24+ # - Therefore, we just pass through possible missing values, treating them
25+ # like ordinary levels.
26+ # - All Categorical levels must be hashable.
27+ # - But the np.ma.masked object is non-hashable on Py3.
28+ # Solution: replace np.ma.masked with an equivalent, hashable object.
class HashableMaskedConstant(object):
    """Hashable singleton used as a stand-in for ``np.ma.masked``.

    Categorical levels are collected into sets and used as dictionary keys,
    so every level must be hashable. ``np.ma.masked`` is not hashable on
    Python 3, so masked entries are swapped for the single instance of this
    class before being treated as an ordinary level.

    The class defines no ``__eq__``, so instances keep the default
    identity-based equality and hashing. Calling the class always returns
    the same object.
    """

    # The one shared instance; created lazily on the first construction.
    _instance = None

    def __new__(cls):
        # Test against the None sentinel by identity rather than by
        # truthiness, so the guard never depends on how an instance
        # evaluates in a boolean context.
        if cls._instance is None:
            cls._instance = object.__new__(cls)
        return cls._instance

    def __str__(self):
        # Same text numpy prints for a masked element.
        return "--"

    def __repr__(self):
        # Levels are rendered with repr() in error messages; give a stable,
        # readable name instead of the default "<... object at 0x...>".
        return "HashableMaskedConstant()"


# Module-level handle on the singleton, for identity comparisons and for
# substituting in place of np.ma.masked.
hashable_masked = HashableMaskedConstant()
41+
2042# A simple wrapper around some categorical data. Provides basically no
2143# services, but it holds data fine... eventually it'd be nice to make a custom
2244# dtype for this, but doing that right will require fixes to numpy itself.
@@ -53,29 +75,20 @@ def from_pandas_categorical(cls, pandas_categorical):
5375 pandas_categorical .levels )
5476
5577 @classmethod
56- def from_sequence (cls , sequence , levels = None , ** kwargs ):
57- """from_sequence(sequence, levels=None, contrast=None)
78+ def from_sequence (cls , sequence ,
79+ levels = None , NA_policy = "default" , ** kwargs ):
80+ """from_sequence(sequence, levels=None, NA_policy="default", contrast=None)
5881
5982 Create a Categorical object given a sequence of data. Levels will be
6083 auto-detected if not given.
6184
62- As far as this function is concerned, 'None' and 'NaN' values are not
63- possible levels; they will be treated as indicating missing
64- values. Likewise for masked elements in numpy masked arrays.
85+ NA_policy is either an :class:`NAAction` object used to identify which
6586 """
66- def missing_level (level ):
67- # Check for np.ma.masked must come before the call to
68- # safe_scalar_isnan, because safe_scalar_isnan coerces its input
69- # to float, and float(np.ma.masked) raises a spurious warning
70- # that we want to avoid (and then returns nan).
71- return (level is None
72- or level is np .ma .masked
73- or safe_scalar_isnan (level ))
7487 if levels is None :
7588 level_set = set ()
7689 for level in sequence :
77- if missing_level ( level ) :
78- continue
90+ if level is np . ma . masked :
91+ level = hashable_masked
7992 try :
8093 level_set .add (level )
8194 except TypeError :
@@ -94,22 +107,18 @@ def missing_level(level):
94107 int_array = np .empty (len (sequence ), dtype = int )
95108 for i , entry in enumerate (sequence ):
96109 try :
97- if missing_level (entry ):
98- int_array [i ] = - 1
99- else :
100- int_array [i ] = level_to_int [entry ]
110+ int_array [i ] = level_to_int [entry ]
101111 except KeyError :
102- sorted_levels = sorted (level_to_int )
103112 SHOW_LEVELS = 4
104113 level_strs = []
105- if len (sorted_levels ) <= SHOW_LEVELS :
106- level_strs += [repr (level ) for level in sorted_levels ]
114+ if len (levels ) <= SHOW_LEVELS :
115+ level_strs += [repr (level ) for level in levels ]
107116 else :
108117 level_strs += [repr (level )
109- for level in sorted_levels [:SHOW_LEVELS // 2 ]]
118+ for level in levels [:SHOW_LEVELS // 2 ]]
110119 level_strs .append ("..." )
111120 level_strs += [repr (level )
112- for level in sorted_levels [- SHOW_LEVELS // 2 :]]
121+ for level in levels [- SHOW_LEVELS // 2 :]]
113122 level_str = "[%s]" % (", " .join (level_strs ))
114123 raise PatsyError ("Error converting data to categorical: "
115124 "observation with value %r does not match "
0 commit comments