|
28 | 28 | # Can drop this guard whenever we drop support for such older versions of |
29 | 29 | # pandas. |
# True only when pandas imported successfully and exposes a Categorical type.
have_pandas_categorical = have_pandas and hasattr(pandas, "Categorical")
# True only when pandas imported successfully and pandas.core.common provides
# is_categorical_dtype (the hook used to recognise dtype="category").
# NOTE(review): newer pandas releases may expose this check elsewhere (e.g.
# pandas.api.types) -- confirm this probe still finds it on the versions we
# support.
have_pandas_categorical_dtype = have_pandas and hasattr(
    pandas.core.common, "is_categorical_dtype"
)
31 | 34 |
|
32 | 35 | # Passes through Series and DataFrames, call np.asarray() on everything else |
33 | 36 | def asarray_or_pandas(a, copy=False, dtype=None, subok=False): |
@@ -550,3 +553,124 @@ def test_iterable(): |
550 | 553 | assert iterable({"a": 1}) |
551 | 554 | assert not iterable(1) |
552 | 555 | assert not iterable(iterable) |
| 556 | + |
| 557 | +##### Handling Pandas's categorical stuff is horrible and hateful |
| 558 | + |
| 559 | +# Basically they decided that they didn't like how numpy does things, so their |
| 560 | +# categorical stuff is *kinda* like how numpy would do it (e.g. they have a |
| 561 | +# special ".dtype" attribute to mark categorical data), so by default you'll |
| 562 | +# find yourself using the same code paths to handle pandas categorical data |
| 563 | +# and other non-categorical data. BUT, all the idioms for detecting |
| 564 | +# categorical data blow up with errors if you try them with real numpy dtypes, |
| 565 | +# and all numpy's idioms for detecting non-categorical types blow up with |
| 566 | +# errors if you try them with pandas categorical stuff. So basically they have |
| 567 | +# just poisoned all code that touches dtypes; the old numpy stuff is unsafe, |
| 568 | +# and you must use special code like below. |
| 569 | +# |
| 570 | +# Also there are hoops to jump through to handle both the old style |
| 571 | +# (Categorical objects) and new-style (Series with dtype="category"). |
| 572 | + |
| 573 | +# Needed to support pandas < 0.15 |
def pandas_Categorical_from_codes(codes, categories):
    """Build a pandas Categorical from integer codes and a category list.

    Uses ``pandas.Categorical.from_codes`` when that constructor exists and
    otherwise falls back to calling ``pandas.Categorical(codes, categories)``
    directly.

    :arg codes: sequence of integer codes (the tests in this file use -1 for
      a missing entry).
    :arg categories: sequence of category values indexed by ``codes``.
    :returns: a ``pandas.Categorical`` object.
    :raises AssertionError: if ``have_pandas_categorical`` is false.
    """
    assert have_pandas_categorical

    # Coerce up front so that both constructor paths receive an ndarray
    # rather than whatever sequence type the caller happened to pass.
    # NOTE(review): some old pandas releases reportedly keep the raw sequence
    # and hand it back unchanged from .labels -- confirm against those
    # versions.
    codes = np.asarray(codes)
    if hasattr(pandas.Categorical, "from_codes"):
        return pandas.Categorical.from_codes(codes, categories)
    return pandas.Categorical(codes, categories)
| 581 | + |
def test_pandas_Categorical_from_codes():
    # Nothing to check without pandas.Categorical; the helper itself asserts
    # on have_pandas_categorical, so calling it here would fail the test
    # instead of skipping it.
    if not have_pandas_categorical:
        return
    c = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    # The first three codes map onto categories...
    assert np.all(np.asarray(c)[:-1] == ["b", "b", "a"])
    # ...and the -1 code comes back as NaN.
    assert np.isnan(np.asarray(c)[-1])
| 586 | + |
| 587 | +# Needed to support pandas < 0.15 |
def pandas_Categorical_categories(cat):
    """Return the categories of a Categorical or a categorical Series.

    In pandas 0.15+ a categorical Series carries a ``.cat`` attribute, and it
    is that object (or a Categorical itself) that has ``.categories``. Objects
    without ``.categories`` are read through ``.levels`` instead.

    :arg cat: an object exposing ``.cat``, ``.categories`` or ``.levels``.
    :returns: whatever ``.categories`` (preferred) or ``.levels`` holds.
    """
    # Unwrap the Series accessor when there is one.
    target = cat.cat if hasattr(cat, "cat") else cat
    if not hasattr(target, "categories"):
        # Old-style attribute name.
        return target.levels
    return target.categories
| 598 | + |
| 599 | +# Needed to support pandas < 0.15 |
def pandas_Categorical_codes(cat):
    """Return the integer codes of a Categorical or a categorical Series.

    In pandas 0.15+ a categorical Series carries a ``.cat`` attribute, and it
    is that object (or a Categorical itself) that has ``.codes``. Objects
    without ``.codes`` are read through ``.labels`` instead.

    :arg cat: an object exposing ``.cat``, ``.codes`` or ``.labels``.
    :returns: whatever ``.codes`` (preferred) or ``.labels`` holds.
    """
    # Unwrap the Series accessor when there is one.
    target = cat.cat if hasattr(cat, "cat") else cat
    if not hasattr(target, "codes"):
        # Old-style attribute name.
        return target.labels
    return target.codes
| 610 | + |
def test_pandas_Categorical_accessors():
    # Nothing to check without pandas.Categorical; building the fixture
    # below would otherwise trip the helper's assertion.
    if not have_pandas_categorical:
        return
    c = pandas_Categorical_from_codes([1, 1, 0, -1], ["a", "b"])
    assert np.all(pandas_Categorical_categories(c) == ["a", "b"])
    assert np.all(pandas_Categorical_codes(c) == [1, 1, 0, -1])

    # The same accessors must also work through a categorical Series.
    if have_pandas_categorical_dtype:
        s = pandas.Series(c)
        assert np.all(pandas_Categorical_categories(s) == ["a", "b"])
        assert np.all(pandas_Categorical_codes(s) == [1, 1, 0, -1])
| 620 | + |
| 621 | +# Needed to support pandas >= 0.15 (!) |
def safe_is_pandas_categorical_dtype(dt):
    """Report whether ``dt`` is a pandas categorical dtype.

    Returns False outright when ``have_pandas_categorical_dtype`` is false;
    otherwise returns the result of pandas' own ``is_categorical_dtype``.
    """
    if have_pandas_categorical_dtype:
        # This crucial function is not publicly exported by pandas, so it is
        # reached through pandas.core.common. Its source reportedly uses a
        # bare except: block, which we cannot do anything about from here.
        return pandas.core.common.is_categorical_dtype(dt)
    return False
| 629 | + |
| 630 | +# Needed to support pandas >= 0.15 (!) |
def safe_is_pandas_categorical(data):
    """Report whether ``data`` holds pandas categorical data.

    Recognises old-style ``pandas.Categorical`` objects directly, and any
    other object with a ``.dtype`` attribute via
    ``safe_is_pandas_categorical_dtype``. Always False when
    ``have_pandas_categorical`` is false.
    """
    if not have_pandas_categorical:
        return False
    is_categorical = isinstance(data, pandas.Categorical)
    if not is_categorical and hasattr(data, "dtype"):
        # Not a Categorical object itself; fall back to inspecting its dtype.
        is_categorical = safe_is_pandas_categorical_dtype(data.dtype)
    return is_categorical
| 639 | + |
def test_safe_is_pandas_categorical():
    # Plain numpy arrays are never categorical.
    assert not safe_is_pandas_categorical(np.arange(10))

    if have_pandas_categorical:
        # Build the object through this module's version-agnostic helper
        # rather than Categorical.from_array, which is not available on
        # every pandas release.
        c_obj = pandas_Categorical_from_codes([0, 1], ["a", "b"])
        assert safe_is_pandas_categorical(c_obj)

    if have_pandas_categorical_dtype:
        s_obj = pandas.Series(["a", "b"], dtype="category")
        assert safe_is_pandas_categorical(s_obj)
| 650 | + |
| 651 | +# Needed to support pandas >= 0.15 (!) |
| 652 | +# Calling np.issubdtype on a pandas categorical will blow up -- the officially |
| 653 | +# recommended solution is to replace every piece of code like |
| 654 | +# np.issubdtype(foo.dtype, bool) |
| 655 | +# with code like |
| 656 | +# isinstance(foo.dtype, np.dtype) and np.issubdtype(foo.dtype, bool) |
| 657 | +# or |
| 658 | +# not pandas.is_categorical_dtype(foo.dtype) and issubdtype(foo.dtype, bool) |
| 659 | +# We do the latter (with extra hoops) because the isinstance check is not |
| 660 | +# safe. See |
| 661 | +# https://github.com/pydata/pandas/issues/9581 |
| 662 | +# https://github.com/pydata/pandas/issues/9581#issuecomment-77099564 |
def safe_issubdtype(dt1, dt2):
    """``np.issubdtype`` that tolerates pandas categorical dtypes.

    A pandas categorical ``dt1`` is reported as not being a subtype of
    anything (False) without ever reaching ``np.issubdtype``; every other
    input is forwarded to ``np.issubdtype(dt1, dt2)``.
    """
    # Short-circuit keeps numpy away from the categorical dtype.
    return (not safe_is_pandas_categorical_dtype(dt1)) and np.issubdtype(dt1, dt2)
| 667 | + |
def test_safe_issubdtype():
    # Ordinary Python types and numpy dtypes behave like np.issubdtype.
    assert safe_issubdtype(int, np.integer)
    assert safe_issubdtype(np.dtype(float), np.floating)
    assert not safe_issubdtype(int, np.floating)
    assert not safe_issubdtype(np.dtype(float), np.integer)

    if have_pandas_categorical_dtype:
        bad_series = pandas.Series(["a", "b"], dtype="category")
        # Callers pass foo.dtype, so exercise the actual dtype object...
        assert not safe_issubdtype(bad_series.dtype, np.integer)
        # ...and keep covering the case where the Series itself is passed.
        assert not safe_issubdtype(bad_series, np.integer)
0 commit comments