Skip to content
Next Next commit
Removing support for pandas SparseDataFrame
  • Loading branch information
Neeratyoy committed Feb 17, 2020
commit 419aba3e61909512da477a5f6be66c41ed4dd9e4
2 changes: 1 addition & 1 deletion examples/30_extended/create_upload_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
print(df.info())

xor_dataset = create_dataset(
Expand Down
2 changes: 1 addition & 1 deletion examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
# Get the actual data.
#
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is
# sparse matrix, or as a Pandas DataFrame. The format is
# controlled with the parameter ``dataset_format`` which can be either 'array'
# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
# and manually create a dataframe.
Expand Down
10 changes: 8 additions & 2 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas as pd
import scipy.sparse
from warnings import warn
from distutils.version import LooseVersion

from openml.base import OpenMLBase
from .data_feature import OpenMLDataFeature
Expand Down Expand Up @@ -509,7 +510,12 @@ def _encode_if_category(column):
)
elif array_format == "dataframe":
if scipy.sparse.issparse(data):
return pd.SparseDataFrame(data, columns=attribute_names)
# SparseDataFrame removed in pandas 1.0.0
if LooseVersion(pd.__version__) >= "1.0.0":
Comment thread
mfeurer marked this conversation as resolved.
Outdated
return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
else:
raise Exception("Current pandas version found {}. OpenML supports pandas "
"1.0.0 or higher.".format(LooseVersion(pd.__version__)))
else:
return data
else:
Expand Down Expand Up @@ -560,7 +566,7 @@ def get_data(
dataset_format : string (default='dataframe')
The format of returned dataset.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
If ``dataframe``, the returned dataset will be a Pandas DataFrame.

Returns
-------
Expand Down
5 changes: 2 additions & 3 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def create_dataset(name, description, creator, contributor,
class:`openml.OpenMLDataset`
Dataset description."""

if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
if isinstance(data, pd.DataFrame):
# infer the row id from the index of the dataset
if row_id_attribute is None:
row_id_attribute = data.index.name
Expand All @@ -676,8 +676,7 @@ def create_dataset(name, description, creator, contributor,
if attributes == 'auto' or isinstance(attributes, dict):
if not hasattr(data, "columns"):
raise ValueError("Automatically inferring attributes requires "
"a pandas DataFrame or SparseDataFrame. "
"A {!r} was given instead.".format(data))
"a pandas DataFrame. A {!r} was given instead.".format(data))
# infer the type of data for each column of the DataFrame
attributes_ = attributes_arff_from_df(data)
if isinstance(attributes, dict):
Expand Down
10 changes: 9 additions & 1 deletion tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
import pytest
from scipy import sparse

import unittest
from distutils.version import LooseVersion

import openml
from openml.testing import TestBase
from openml.exceptions import PyOpenMLError
Expand Down Expand Up @@ -284,9 +287,14 @@ def test_get_sparse_dataset(self):
self.assertEqual(len(attribute_names), 20001)
self.assertTrue(all([isinstance(att, str) for att in attribute_names]))

@unittest.skipIf(LooseVersion(pd.__version__) < "1.0.0",
Comment thread
mfeurer marked this conversation as resolved.
Outdated
reason="SparseDataFrame support removed from pandas 1.0.0 and onwards.")
def test_get_sparse_dataframe(self):
rval, *_ = self.sparse_dataset.get_data()
self.assertTrue(isinstance(rval, pd.SparseDataFrame))
dtypes = all([dtype == pd.SparseDtype(np.float32, fill_value=0.0)
for dtype in rval.dtypes])
self.assertTrue(isinstance(rval, pd.DataFrame))
Comment thread
mfeurer marked this conversation as resolved.
Outdated
self.assertTrue(dtypes)
Comment thread
mfeurer marked this conversation as resolved.
Outdated
self.assertEqual((600, 20001), rval.shape)

def test_get_sparse_dataset_with_rowid(self):
Expand Down
17 changes: 7 additions & 10 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,9 @@ def test_attributes_arff_from_df(self):
('string', 'STRING'),
('category', ['A', 'B']),
('boolean', ['True', 'False'])])
# SparseDataFrame case
df = pd.SparseDataFrame([[1, 1.0],
[2, 2.0],
[0, 0]],
columns=['integer', 'floating'],
default_fill_value=0)
# DataFrame with Sparse columns case
df = pd.DataFrame({"integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0),
"floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0)})
df['integer'] = df['integer'].astype(np.int64)
attributes = attributes_arff_from_df(df)
self.assertEqual(attributes, [('integer', 'INTEGER'),
Expand Down Expand Up @@ -925,15 +922,15 @@ def test_create_dataset_pandas(self):
"Uploaded ARFF does not match original one"
)

# Check that SparseDataFrame are supported properly
# Check that DataFrames with sparse columns are supported properly
sparse_data = scipy.sparse.coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
# meta-information
description = 'Synthetic dataset created from a Pandas SparseDataFrame'
description = 'Synthetic dataset created from a Pandas DataFrame with Sparse columns'
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
Expand Down Expand Up @@ -963,7 +960,7 @@ def test_create_dataset_pandas(self):
)
self.assertEqual(
_get_online_dataset_format(dataset.id),
'sparse_arff',
'arff',
Comment thread
mfeurer marked this conversation as resolved.
Outdated
"Wrong format for dataset"
)

Expand Down