Skip to content

Commit ed653c1

Browse files
committed
checkpoint
1 parent c63f202 commit ed653c1

File tree

2 files changed

+45
-35
lines changed

2 files changed

+45
-35
lines changed

patsy/categorical.py

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,28 @@
1717
if have_pandas:
1818
import pandas
1919

20+
# Conundrum:
21+
# - We don't want to screw around with missing value handling in Categorical;
22+
# that logic is all localized inside the builder code. (And in particular,
23+
# all the configurability is there as well.)
24+
# - Therefore, we just pass through possible missing values, treating them
25+
# like ordinary levels.
26+
# - All Categorical levels must be hashable.
27+
# - But the np.ma.masked object is non-hashable on Py3.
28+
# Solution: replace np.ma.masked with an equivalent, hashable object.
29+
class HashableMaskedConstant(object):
    """Hashable singleton used in place of ``np.ma.masked`` as a level.

    Categorical levels must be hashable, but ``np.ma.masked`` is not
    hashable on Python 3. This class relies on the default identity-based
    ``object`` hash, and every call to the constructor returns the same
    instance, so the replacement value can be stored in sets and used as a
    dict key like any ordinary level.
    """

    # The one shared instance; created lazily by the first __new__ call.
    _instance = None

    def __new__(cls):
        # Compare against None explicitly: a truthiness test would silently
        # re-create the instance if the class ever grew __bool__/__len__.
        if cls._instance is None:
            cls._instance = object.__new__(cls)
        return cls._instance

    def __repr__(self):
        # Levels are formatted with repr() in error messages, so give a
        # stable, readable form instead of the default "<... at 0x...>".
        return "HashableMaskedConstant()"

    def __str__(self):
        return "--"
39+
40+
hashable_masked = HashableMaskedConstant()
41+
2042
# A simple wrapper around some categorical data. Provides basically no
2143
# services, but it holds data fine... eventually it'd be nice to make a custom
2244
# dtype for this, but doing that right will require fixes to numpy itself.
@@ -53,29 +75,20 @@ def from_pandas_categorical(cls, pandas_categorical):
5375
pandas_categorical.levels)
5476

5577
@classmethod
56-
def from_sequence(cls, sequence, levels=None, **kwargs):
57-
"""from_sequence(sequence, levels=None, contrast=None)
78+
def from_sequence(cls, sequence,
79+
levels=None, NA_policy="default", **kwargs):
80+
"""from_sequence(sequence, levels=None, NA_policy="default", contrast=None)
5881
5982
Create a Categorical object given a sequence of data. Levels will be
6083
auto-detected if not given.
6184
62-
As far as this function is concerned, 'None' and 'NaN' values are not
63-
possible levels; they will be treated as indicating missing
64-
values. Likewise for masked elements in numpy masked arrays.
85+
NA_policy is either an :class:`NAAction` object used to identify which
6586
"""
66-
def missing_level(level):
67-
# Check for np.ma.masked must come before the call to
68-
# safe_scalar_isnan, because safe_scalar_isnan coerces its input
69-
# to float, and float(np.ma.masked) raises a spurious warning
70-
# that we want to avoid (and then returns nan).
71-
return (level is None
72-
or level is np.ma.masked
73-
or safe_scalar_isnan(level))
7487
if levels is None:
7588
level_set = set()
7689
for level in sequence:
77-
if missing_level(level):
78-
continue
90+
if level is np.ma.masked:
91+
level = hashable_masked
7992
try:
8093
level_set.add(level)
8194
except TypeError:
@@ -94,22 +107,18 @@ def missing_level(level):
94107
int_array = np.empty(len(sequence), dtype=int)
95108
for i, entry in enumerate(sequence):
96109
try:
97-
if missing_level(entry):
98-
int_array[i] = -1
99-
else:
100-
int_array[i] = level_to_int[entry]
110+
int_array[i] = level_to_int[entry]
101111
except KeyError:
102-
sorted_levels = sorted(level_to_int)
103112
SHOW_LEVELS = 4
104113
level_strs = []
105-
if len(sorted_levels) <= SHOW_LEVELS:
106-
level_strs += [repr(level) for level in sorted_levels]
114+
if len(levels) <= SHOW_LEVELS:
115+
level_strs += [repr(level) for level in levels]
107116
else:
108117
level_strs += [repr(level)
109-
for level in sorted_levels[:SHOW_LEVELS//2]]
118+
for level in levels[:SHOW_LEVELS//2]]
110119
level_strs.append("...")
111120
level_strs += [repr(level)
112-
for level in sorted_levels[-SHOW_LEVELS//2:]]
121+
for level in levels[-SHOW_LEVELS//2:]]
113122
level_str = "[%s]" % (", ".join(level_strs))
114123
raise PatsyError("Error converting data to categorical: "
115124
"observation with value %r does not match "

patsy/missing.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,11 @@
4444
# These are made available in the patsy.* namespace
4545
__all__ = ["NAAction"]
4646

47-
_valid_NA_types = ["NaN"]
47+
import threading


class _CurrentNAAction(threading.local):
    """Thread-local holder for the currently active NA action.

    The default lives on the class rather than on the instance: an attribute
    assigned on a plain ``threading.local()`` at import time exists only in
    the importing thread, so every other thread would hit ``AttributeError``
    when reading ``.value``. A class attribute is visible from all threads,
    while assignments to ``.value`` remain per-thread.
    """

    value = None


# Still an instance of threading.local, so existing users are unaffected.
current_NA_action = _CurrentNAAction()
50+
51+
_valid_NA_types = ["None", "NaN", "numpy.ma"]
4852
_valid_NA_responses = ["raise", "drop"]
4953
def _desc_options(options):
5054
return ", ".join([repr(opt) for opt in options])
@@ -84,18 +88,15 @@ class NAAction(object):
8488
class, or your own object that implements :meth:`handle_NA`, and pass that
8589
as the `NA_action=` argument instead.
8690
"""
87-
def __init__(self, on_NA="drop", NA_types=["NaN"]):
91+
def __init__(self, on_NA="drop", NA_types=["None", "NaN", "numpy.ma"]):
8892
"""The `NAAction` constructor takes the following arguments:
8993
9094
:arg on_NA: How to handle missing values. The default is "drop", which
9195
removes all rows from all matrices which contain any missing
9296
values. Also available is "raise", which raises an exception when
9397
any missing values are encountered.
9498
:arg NA_types: Which values count as missing, as a list of
95-
strings. Categorical missing data always count as missing. If the
96-
string "NaN" is given in this argument, then not-a-number values in
97-
numeric arrays also count as missing. It's anticipated that in the
98-
future there will be more options here as well.
99+
strings.
99100
"""
100101
self.on_NA = on_NA
101102
if self.on_NA not in _valid_NA_responses:
@@ -111,11 +112,11 @@ def __init__(self, on_NA="drop", NA_types=["NaN"]):
111112
"(should be one of %s)"
112113
% (NA_type, _desc_options(_valid_NA_types)))
113114

114-
def _where_NA(self, vector):
115-
if isinstance(vector, Categorical):
116-
return (vector.int_array == -1)
115+
def is_NA(self, arr):
116+
if isinstance(arr, Categorical):
117+
return (arr.int_array == -1)
117118
else:
118-
mask = np.zeros(vector.shape, dtype=bool)
119+
mask = np.zeros(arr.shape, dtype=bool)
119120
if "NaN" in self.NA_types:
120121
if np.issubdtype(vector.dtype, np.inexact):
121122
mask |= np.isnan(vector)
@@ -162,7 +163,7 @@ def handle_NA(self, index, factor_values, origins):
162163

163164
def _handle_NA_raise(self, index, factor_values, origins):
164165
for factor_value, origin in zip(factor_values, origins):
165-
this_mask = self._where_NA(factor_value)
166+
this_mask = self.is_NA(factor_value)
166167
if np.any(this_mask):
167168
raise PatsyError("factor contains missing values", origin)
168169
return (index, factor_values)

0 commit comments

Comments
 (0)