Skip to content

Commit 85b8ff1

Browse files
committed
Add a bunch of compatibility code for categorical handling in pandas <0.15 and >=0.15
1 parent 5532c66 commit 85b8ff1

File tree

1 file changed

+124
-0
lines changed

1 file changed

+124
-0
lines changed

patsy/util.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
# Can drop this guard whenever we drop support for such older versions of
2929
# pandas.
3030
have_pandas_categorical = have_pandas and hasattr(pandas, "Categorical")
# Separate flag for the dtype-detection helper: only set when pandas is
# available and pandas.core.common provides is_categorical_dtype.
have_pandas_categorical_dtype = have_pandas and hasattr(
    pandas.core.common, "is_categorical_dtype"
)
3134

3235
# Passes through Series and DataFrames, call np.asarray() on everything else
3336
def asarray_or_pandas(a, copy=False, dtype=None, subok=False):
@@ -550,3 +553,124 @@ def test_iterable():
550553
assert iterable({"a": 1})
551554
assert not iterable(1)
552555
assert not iterable(iterable)
556+
557+
##### Handling Pandas's categorical stuff is horrible and hateful
558+
559+
# Basically they decided that they didn't like how numpy does things, so their
560+
# categorical stuff is *kinda* like how numpy would do it (e.g. they have a
561+
# special ".dtype" attribute to mark categorical data), so by default you'll
562+
# find yourself using the same code paths to handle pandas categorical data
563+
# and other non-categorical data. BUT, all the idioms for detecting
564+
# categorical data blow up with errors if you try them with real numpy dtypes,
565+
# and all numpy's idioms for detecting non-categorical types blow up with
566+
# errors if you try them with pandas categorical stuff. So basically they have
567+
# just poisoned all code that touches dtypes; the old numpy stuff is unsafe,
568+
# and you must use special code like below.
569+
#
570+
# Also there are hoops to jump through to handle both the old style
571+
# (Categorical objects) and new-style (Series with dtype="category").
572+
573+
# Needed to support pandas < 0.15
574+
def pandas_Categorical_from_codes(codes, categories):
    """Build a pandas Categorical from integer codes plus their categories.

    Calls ``pandas.Categorical.from_codes`` when this pandas provides it;
    otherwise falls back to calling the ``pandas.Categorical`` constructor
    directly with ``(codes, categories)`` (the pandas < 0.15 path).
    """
    assert have_pandas_categorical

    # No dedicated alternate constructor: hand the codes straight to the
    # class itself.
    if not hasattr(pandas.Categorical, "from_codes"):
        return pandas.Categorical(codes, categories)
    return pandas.Categorical.from_codes(codes, categories)
581+
582+
def test_pandas_Categorical_from_codes():
    """Codes must map onto the categories, with -1 becoming a NaN entry."""
    cat = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    values = np.asarray(cat)
    # All but the last code index into ["a", "b"].
    assert np.all(values[:-1] == ["b", "b", "a"])
    # The trailing -1 code is expected to come back as a missing value.
    assert np.isnan(values[-1])
586+
587+
# Needed to support pandas < 0.15
588+
def pandas_Categorical_categories(cat):
    """Return the categories of a categorical object, old- or new-style.

    If *cat* has a ``.cat`` attribute (as a categorical Series does in
    pandas 0.15+), the lookup is done on that attribute instead. The result
    is the ``.categories`` attribute when present, else ``.levels``.
    """
    # Unwrap the Series accessor first so both spellings below are looked up
    # on the object that actually carries them.
    target = cat.cat if hasattr(cat, "cat") else cat
    if not hasattr(target, "categories"):
        return target.levels
    return target.categories
598+
599+
# Needed to support pandas < 0.15
600+
def pandas_Categorical_codes(cat):
    """Return the integer codes of a categorical object, old- or new-style.

    If *cat* has a ``.cat`` attribute (as a categorical Series does in
    pandas 0.15+), the lookup is done on that attribute instead. The result
    is the ``.codes`` attribute when present, else ``.labels``.
    """
    # Unwrap the Series accessor first so both spellings below are looked up
    # on the object that actually carries them.
    target = cat.cat if hasattr(cat, "cat") else cat
    if not hasattr(target, "codes"):
        return target.labels
    return target.codes
610+
611+
def test_pandas_Categorical_accessors():
    """Both accessors must work on a Categorical and on a categorical Series."""

    def check(obj):
        # Same expectations regardless of which wrapper carries the data.
        assert np.all(pandas_Categorical_categories(obj) == ["a", "b"])
        assert np.all(pandas_Categorical_codes(obj) == [1, 1, 0, -1])

    cat = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    check(cat)

    # Only wrap in a Series when the categorical dtype flag is set.
    if have_pandas_categorical_dtype:
        check(pandas.Series(cat))
620+
621+
# Needed to support pandas >= 0.15 (!)
622+
def safe_is_pandas_categorical_dtype(dt):
    """Report whether *dt* is a pandas categorical dtype.

    Returns False outright when ``have_pandas_categorical_dtype`` is unset;
    otherwise defers to ``pandas.core.common.is_categorical_dtype``.
    """
    if have_pandas_categorical_dtype:
        # This crucial helper is not even publicly exported by pandas. Its
        # source also relies on a bare ``except:`` block, which is broken by
        # definition, but there is not much we can do about that from here.
        return pandas.core.common.is_categorical_dtype(dt)
    return False
629+
630+
# Needed to support pandas >= 0.15 (!)
631+
def safe_is_pandas_categorical(data):
    """Report whether *data* is pandas categorical data of either style.

    True for ``pandas.Categorical`` instances, and for any object whose
    ``.dtype`` is recognized by ``safe_is_pandas_categorical_dtype``.
    Always False when ``have_pandas_categorical`` is unset.
    """
    if have_pandas_categorical:
        # Old style: a bare Categorical object.
        if isinstance(data, pandas.Categorical):
            return True
        # New style: anything carrying a categorical dtype.
        if hasattr(data, "dtype"):
            return safe_is_pandas_categorical_dtype(data.dtype)
    return False
639+
640+
def test_safe_is_pandas_categorical():
    """Exercise safe_is_pandas_categorical on plain and categorical inputs.

    A plain numpy array must never be reported as categorical. When pandas
    has a Categorical class, a Categorical object must be detected; when the
    categorical dtype helper is available, a Series with dtype="category"
    must be detected too.
    """
    assert not safe_is_pandas_categorical(np.arange(10))

    if have_pandas_categorical:
        # Build the object through the version-agnostic helper rather than
        # Categorical.from_array, which does not exist in every pandas
        # release and would make this test die with AttributeError.
        c_obj = pandas_Categorical_from_codes([0, 1], ["a", "b"])
        assert safe_is_pandas_categorical(c_obj)

    if have_pandas_categorical_dtype:
        s_obj = pandas.Series(["a", "b"], dtype="category")
        assert safe_is_pandas_categorical(s_obj)
650+
651+
# Needed to support pandas >= 0.15 (!)
652+
# Calling np.issubdtype on a pandas categorical will blow up -- the officially
653+
# recommended solution is to replace every piece of code like
654+
# np.issubdtype(foo.dtype, bool)
655+
# with code like
656+
# isinstance(foo.dtype, np.dtype) and np.issubdtype(foo.dtype, bool)
657+
# or
658+
#   not pandas.is_categorical_dtype(foo.dtype) and np.issubdtype(foo.dtype, bool)
659+
# We do the latter (with extra hoops) because the isinstance check is not
660+
# safe. See
661+
# https://github.com/pydata/pandas/issues/9581
662+
# https://github.com/pydata/pandas/issues/9581#issuecomment-77099564
663+
def safe_issubdtype(dt1, dt2):
    """Variant of ``np.issubdtype`` that tolerates pandas categorical dtypes.

    Returns False when *dt1* is recognized as a pandas categorical dtype, so
    np.issubdtype is never called on it; otherwise returns
    ``np.issubdtype(dt1, dt2)``.
    """
    is_categorical = safe_is_pandas_categorical_dtype(dt1)
    return False if is_categorical else np.issubdtype(dt1, dt2)
667+
668+
def test_safe_issubdtype():
    """Check safe_issubdtype on ordinary dtypes and on categorical input."""
    # Ordinary numpy-compatible types behave exactly like np.issubdtype.
    assert safe_issubdtype(int, np.integer)
    assert safe_issubdtype(np.dtype(float), np.floating)
    assert not safe_issubdtype(int, np.floating)
    assert not safe_issubdtype(np.dtype(float), np.integer)

    if have_pandas_categorical_dtype:
        cat_series = pandas.Series(["a", "b"], dtype="category")
        # The function's contract is about dtypes, so test the dtype object
        # itself; previously only the whole Series was ever passed in.
        assert not safe_issubdtype(cat_series.dtype, np.integer)
        # Original check, kept as-is: passing the Series is rejected too.
        assert not safe_issubdtype(cat_series, np.integer)

0 commit comments

Comments
 (0)