Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
24faeb7
bump to 0.11.1dev to continue developing (#971)
mfeurer Oct 25, 2020
e84cdf9
update home page example to numerical dataset (pendigits) (#976)
a-moadel Oct 26, 2020
07e87ad
Speed up tests (#977)
PGijsbers Oct 29, 2020
4923e5b
Additional fixes to PR 777 (#967)
Neeratyoy Oct 29, 2020
f2af798
Improving the performance of check_datasets_active (#980)
ArlindKadra Oct 29, 2020
756e747
Add CI through Github Actions (#975)
PGijsbers Oct 29, 2020
3132dac
add validation for ignore_attributes and default_target_attribute at …
a-moadel Oct 29, 2020
6afc880
Updated the way 'image features' are stored, updated old unit tests, …
ArlindKadra Oct 29, 2020
5b6de8a
Retry on database error to reduce number of test failures (#984)
mfeurer Oct 30, 2020
63ec0ae
Transition other Travis jobs to Github Actions (#988)
PGijsbers Nov 2, 2020
9a3a6dd
update progress file (#991)
a-moadel Nov 2, 2020
81cc423
docs: add a-moadel as a contributor (#992)
allcontributors[bot] Nov 2, 2020
51eaff6
docs: add Neeratyoy as a contributor (#998)
allcontributors[bot] Nov 2, 2020
a629562
Improve unit tests (#985)
mfeurer Nov 3, 2020
accde88
Warning if fitted sklearn model being used (#989)
Neeratyoy Nov 3, 2020
560e952
Cache dataset features and qualities as pickle (#979)
mfeurer Nov 3, 2020
5d5a48e
Update string formatting (#1001)
PGijsbers Nov 17, 2020
16799ad
Specify encoding for README file (#1004)
PGijsbers Nov 18, 2020
fba6aab
Making some unit tests work (#1000)
Neeratyoy Dec 24, 2020
e074c14
Refactor data loading/storing (#1018)
PGijsbers Jan 19, 2021
ab793a6
Adding helper functions to support ColumnTransformer (#982)
Neeratyoy Jan 28, 2021
47cda65
Rework local openml directory (#987)
mfeurer Feb 10, 2021
80ae046
Feature/give possibility to not download the dataset qualities (#1017)
a-moadel Feb 11, 2021
d2945ba
Adding sklearn 0.24 support (#1016)
Neeratyoy Feb 11, 2021
3c680c1
improve path detection (#1021)
mfeurer Feb 12, 2021
7553281
Removing flaky decorator for study unit test (#1024)
Neeratyoy Feb 16, 2021
ff7a251
Adding sklearn min. dependencies for all versions (#1022)
Neeratyoy Feb 18, 2021
4ff66ed
Parallel evaluation of tasks (#1020)
Neeratyoy Feb 18, 2021
38f9bf0
Parquet Support (#1029)
PGijsbers Mar 4, 2021
6c609b8
API for topics (#1023)
sahithyaravi Mar 9, 2021
4aec00a
Remove nan-likes from category header (#1037)
PGijsbers Mar 12, 2021
f94672e
Measuring runtimes (#1031)
Neeratyoy Mar 12, 2021
bd8ae14
Fix 1013: Store run `setup_string` (#1015)
PGijsbers Mar 25, 2021
11e6235
Fix #1033: skip two unit tests on Windows (#1040)
mfeurer Mar 26, 2021
d9037e7
bump version for new release (#1041)
mfeurer Mar 29, 2021
5511fa0
fix loky/concurrency issue (#1042)
mfeurer Mar 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updated the way 'image features' are stored, updated old unit tests, …
…added unit test, fixed typo (#983)
  • Loading branch information
ArlindKadra authored Oct 29, 2020
commit 6afc8806d97be3c2ba3bc067ce3d3c3cab9d5bc8
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changelog

0.11.1
~~~~~~
* MAINT #891: Changed the way numerical features are stored. Numerical features whose values all lie in the range 0–255 are now stored as uint8, which reduces both the required storage space and the time needed to store and load the data.
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.

0.11.0
Expand Down
14 changes: 13 additions & 1 deletion openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def _parse_data_from_arff(
categories_names = {}
categorical = []
for i, (name, type_) in enumerate(data["attributes"]):
# if the feature is nominal and the a sparse matrix is
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if isinstance(type_, list) and self.format.lower() == "sparse_arff":
try:
Expand Down Expand Up @@ -456,6 +456,18 @@ def _parse_data_from_arff(
col.append(
self._unpack_categories(X[column_name], categories_names[column_name])
)
elif attribute_dtype[column_name] in ('floating',
'integer'):
X_col = X[column_name]
if X_col.min() >= 0 and X_col.max() <= 255:
try:
X_col_uint = X_col.astype('uint8')
if (X_col == X_col_uint).all():
col.append(X_col_uint)
continue
except ValueError:
pass
col.append(X[column_name])
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)
Expand Down
41 changes: 23 additions & 18 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ def test_get_data_pandas(self):
self.assertEqual(data.shape[1], len(self.titanic.features))
self.assertEqual(data.shape[0], 1309)
col_dtype = {
"pclass": "float64",
"pclass": "uint8",
"survived": "category",
"name": "object",
"sex": "category",
"age": "float64",
"sibsp": "float64",
"parch": "float64",
"sibsp": "uint8",
"parch": "uint8",
"ticket": "object",
"fare": "float64",
"cabin": "object",
Expand Down Expand Up @@ -118,21 +118,29 @@ def test_get_data_no_str_data_for_nparrays(self):
with pytest.raises(PyOpenMLError, match=err_msg):
self.titanic.get_data(dataset_format="array")

def _check_expected_type(self, dtype, is_cat, col):
if is_cat:
expected_type = 'category'
elif not col.isna().any() and (col.astype('uint8') == col).all():
expected_type = 'uint8'
else:
expected_type = 'float64'

self.assertEqual(dtype.name, expected_type)

def test_get_data_with_rowid(self):
    """Row-id column is included or dropped according to ``include_row_id``.

    NOTE(review): the source span contained leftover pre-change assertion
    lines (a duplicate dtype loop expecting only category/float64) from a
    diff view; this is the reconstructed post-change method.
    """
    self.dataset.row_id_attribute = "condition"
    rval, _, categorical, _ = self.dataset.get_data(include_row_id=True)
    self.assertIsInstance(rval, pd.DataFrame)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 39))
    self.assertEqual(len(categorical), 39)

    rval, _, categorical, _ = self.dataset.get_data()
    self.assertIsInstance(rval, pd.DataFrame)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 38))
    self.assertEqual(len(categorical), 38)

Expand All @@ -149,9 +157,8 @@ def test_get_data_with_target_array(self):
def test_get_data_with_target_pandas(self):
    """``get_data(target=...)`` splits X/y with correct pandas dtypes.

    NOTE(review): the source span contained a leftover pre-change dtype
    loop (diff residue); this is the reconstructed post-change method.
    """
    X, y, categorical, attribute_names = self.dataset.get_data(target="class")
    self.assertIsInstance(X, pd.DataFrame)
    for (dtype, is_cat, col) in zip(X.dtypes, categorical, X):
        self._check_expected_type(dtype, is_cat, X[col])
    self.assertIsInstance(y, pd.Series)
    # The target itself is nominal and must be loaded as a category.
    self.assertEqual(y.dtype.name, "category")

Expand All @@ -174,16 +181,14 @@ def test_get_data_rowid_and_ignore_and_target(self):
def test_get_data_with_ignore_attributes(self):
    """Ignored attributes are kept or dropped per ``include_ignore_attribute``.

    NOTE(review): the source span contained leftover pre-change dtype
    loops (diff residue); this is the reconstructed post-change method.
    """
    self.dataset.ignore_attribute = ["condition"]
    rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 39))
    self.assertEqual(len(categorical), 39)

    rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
    for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
        self._check_expected_type(dtype, is_cat, rval[col])
    self.assertEqual(rval.shape, (898, 38))
    self.assertEqual(len(categorical), 38)

Expand Down
7 changes: 7 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,13 @@ def test_get_dataset_by_name(self):
openml.config.server = self.production_server
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)

def test_get_dataset_uint8_dtype(self):
    """Integer-valued columns in [0, 255] are loaded as uint8.

    Uses the anneal dataset (id 1); its 'carbon' feature holds small
    non-negative integers, so it must be downcast on load.
    """
    dataset = openml.datasets.get_dataset(1)
    # isinstance, not type(...) ==, is the idiomatic (and subclass-safe) check.
    self.assertIsInstance(dataset, OpenMLDataset)
    self.assertEqual(dataset.name, 'anneal')
    df, _, _, _ = dataset.get_data()
    self.assertEqual(df['carbon'].dtype, 'uint8')

def test_get_dataset(self):
# This is the only non-lazy load to ensure default behaviour works.
dataset = openml.datasets.get_dataset(1)
Expand Down