Skip to content

Commit 4e84e17

Browse files
authored
Merge branch 'develop' into joblib_evals
2 parents 17e3916 + 7553281 commit 4e84e17

File tree

17 files changed

+123
-105
lines changed

17 files changed

+123
-105
lines changed

.github/workflows/ubuntu-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
strategy:
1010
matrix:
1111
python-version: [3.6, 3.7, 3.8]
12-
scikit-learn: [0.21.2, 0.22.2, 0.23.1]
12+
scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
1313
exclude: # no scikit-learn 0.21.2 release for Python 3.8
1414
- python-version: 3.8
1515
scikit-learn: 0.21.2

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Changelog
1313
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
1414
* FIX #964 : Validate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.
1515
* DOC #973 : Change the task used in the welcome page example so it no longer fails using numerical dataset.
16+
* ADD #1009 : Give the possibility to not download the dataset qualities. The cached version is used even when the download attribute is False.
1617
0.11.0
1718
~~~~~~
1819
* ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# License: BSD 3-Clause
99

1010
import openml
11-
import numpy as np
1211
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1312

1413
############################################################################
@@ -54,7 +53,7 @@
5453
task = openml.tasks.get_task(403)
5554

5655
# Build any classifier or pipeline
57-
clf = tree.ExtraTreeClassifier()
56+
clf = tree.DecisionTreeClassifier()
5857

5958
# Run the flow
6059
run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
8382
# ############################
8483
#
8584
# When you need to handle 'dirty' data, build pipelines to model then automatically.
86-
task = openml.tasks.get_task(1)
85+
# To demonstrate this using the dataset `credit-a <https://test.openml.org/d/16>`_ via
86+
# `task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical
87+
# variables and missing values in both.
88+
task = openml.tasks.get_task(96)
8789

8890
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
8991
from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
9698
[
9799
(
98100
"categorical",
99-
pipeline.Pipeline(
100-
[
101-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102-
(
103-
"Encoder",
104-
preprocessing.OneHotEncoder(
105-
sparse=False, handle_unknown="ignore"
106-
),
107-
),
108-
]
109-
),
101+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
110102
cat, # returns the categorical feature indices
111103
),
112-
("continuous", "passthrough", cont), # returns the numeric feature indices
104+
(
105+
"continuous",
106+
impute.SimpleImputer(strategy="median"),
107+
cont,
108+
), # returns the numeric feature indices
113109
]
114110
),
115111
),
@@ -146,20 +142,14 @@
146142
[
147143
(
148144
"categorical",
149-
pipeline.Pipeline(
150-
[
151-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152-
(
153-
"Encoder",
154-
preprocessing.OneHotEncoder(
155-
sparse=False, handle_unknown="ignore"
156-
),
157-
),
158-
]
159-
),
145+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
160146
categorical_feature_indices,
161147
),
162-
("continuous", "passthrough", numeric_feature_indices),
148+
(
149+
"continuous",
150+
impute.SimpleImputer(strategy="median"),
151+
numeric_feature_indices,
152+
),
163153
]
164154
),
165155
),
@@ -182,7 +172,9 @@
182172
task = openml.tasks.get_task(6)
183173

184174
# The following lines can then be executed offline:
185-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
175+
run = openml.runs.run_model_on_task(
176+
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
177+
)
186178

187179
# The run may be stored offline, and the flow will be stored along with it:
188180
run.to_filesystem(directory="myrun")

examples/30_extended/run_setup_tutorial.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@
5959
# easy as you want it to be
6060

6161

62-
cat_imp = make_pipeline(
63-
SimpleImputer(strategy="most_frequent"),
64-
OneHotEncoder(handle_unknown="ignore", sparse=False),
65-
TruncatedSVD(),
66-
)
67-
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
62+
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
63+
cont_imp = SimpleImputer(strategy="median")
64+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
6865
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
6966

7067
# Let's change some hyperparameters. Of course, in any good application we

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
177177
cat_cols = list_categorical_attributes(flow_type=flow_type)
178178
num_cols = list(set(X.columns) - set(cat_cols))
179179

180-
# Missing value imputers
181-
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
180+
# Missing value imputers for numeric columns
182181
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
183182

184-
# Creating the one-hot encoder
183+
# Creating the one-hot encoder for numerical representation of categorical columns
185184
enc = OneHotEncoder(handle_unknown="ignore")
186185

187-
# Pipeline to handle categorical column transformations
188-
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
189-
190186
# Combining column transformers
191-
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
187+
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
192188

193189
# Creating the full pipeline with the surrogate model
194190
clf = RandomForestRegressor(n_estimators=50)

openml/_api_calls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def _read_url_files(url, data=None, file_elements=None):
155155

156156
def __read_url(url, request_method, data=None, md5_checksum=None):
157157
data = {} if data is None else data
158-
if config.apikey is not None:
158+
if config.apikey:
159159
data["api_key"] = config.apikey
160160
return _send_request(
161161
request_method=request_method, url=url, data=data, md5_checksum=md5_checksum

openml/datasets/functions.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ def _name_to_id(
290290
error_if_multiple : bool (default=False)
291291
If `False`, if multiple datasets match, return the least recent active dataset.
292292
If `True`, if multiple datasets match, raise an error.
293+
download_qualities : bool, optional (default=True)
294+
If `True`, also download the qualities.xml file. If `False`, skip downloading the qualities.xml file.
293295
294296
Returns
295297
-------
@@ -310,7 +312,7 @@ def _name_to_id(
310312

311313

312314
def get_datasets(
313-
dataset_ids: List[Union[str, int]], download_data: bool = True,
315+
dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
314316
) -> List[OpenMLDataset]:
315317
"""Download datasets.
316318
@@ -326,6 +328,8 @@ def get_datasets(
326328
make the operation noticeably slower. Metadata is also still retrieved.
327329
If False, create the OpenMLDataset and only populate it with the metadata.
328330
The data may later be retrieved through the `OpenMLDataset.get_data` method.
331+
download_qualities : bool, optional (default=True)
332+
If True, also download the qualities.xml file. If False, skip downloading the qualities.xml file.
329333
330334
Returns
331335
-------
@@ -334,7 +338,9 @@ def get_datasets(
334338
"""
335339
datasets = []
336340
for dataset_id in dataset_ids:
337-
datasets.append(get_dataset(dataset_id, download_data))
341+
datasets.append(
342+
get_dataset(dataset_id, download_data, download_qualities=download_qualities)
343+
)
338344
return datasets
339345

340346

@@ -345,6 +351,7 @@ def get_dataset(
345351
version: int = None,
346352
error_if_multiple: bool = False,
347353
cache_format: str = "pickle",
354+
download_qualities: bool = True,
348355
) -> OpenMLDataset:
349356
""" Download the OpenML dataset representation, optionally also download actual data file.
350357
@@ -405,7 +412,10 @@ def get_dataset(
405412
features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
406413

407414
try:
408-
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
415+
if download_qualities:
416+
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
417+
else:
418+
qualities_file = ""
409419
except OpenMLServerException as e:
410420
if e.code == 362 and str(e) == "No qualities found - None":
411421
logger.warning("No qualities found for dataset {}".format(dataset_id))
@@ -996,6 +1006,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
9961006
dataset_id : int
9971007
Dataset ID
9981008
1009+
download_qualities : bool
1010+
Whether to download the qualities file or use the cached version.
9991011
Returns
10001012
-------
10011013
str
@@ -1009,10 +1021,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
10091021
except (OSError, IOError):
10101022
url_extension = "data/qualities/{}".format(dataset_id)
10111023
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
1012-
10131024
with io.open(qualities_file, "w", encoding="utf8") as fh:
10141025
fh.write(qualities_xml)
1015-
10161026
return qualities_file
10171027

10181028

openml/testing.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,8 @@
88
import time
99
from typing import Dict, Union, cast
1010
import unittest
11-
import warnings
1211
import pandas as pd
1312

14-
# Currently, importing oslo raises a lot of warning that it will stop working
15-
# under python3.8; remove this once they disappear
16-
with warnings.catch_warnings():
17-
warnings.simplefilter("ignore")
18-
from oslo_concurrency import lockutils
19-
2013
import openml
2114
from openml.tasks import TaskType
2215
from openml.exceptions import OpenMLServerException
@@ -100,13 +93,6 @@ def setUp(self, n_levels: int = 1):
10093
openml.config.avoid_duplicate_runs = False
10194
openml.config.cache_directory = self.workdir
10295

103-
# If we're on travis, we save the api key in the config file to allow
104-
# the notebook tests to read them.
105-
if os.environ.get("TRAVIS") or os.environ.get("APPVEYOR"):
106-
with lockutils.external_lock("config", lock_path=self.workdir):
107-
with open(openml.config.config_file, "w") as fh:
108-
fh.write("apikey = %s" % openml.config.apikey)
109-
11096
# Increase the number of retries to avoid spurious server failures
11197
self.connection_n_retries = openml.config.connection_n_retries
11298
openml.config.connection_n_retries = 10

openml/utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ def _list_all(listing_call, output_format="dict", *args, **filters):
244244
limit=batch_size,
245245
offset=current_offset,
246246
output_format=output_format,
247-
**active_filters
247+
**active_filters,
248248
)
249249
except openml.exceptions.OpenMLServerNoResult:
250250
# we want to return an empty dict in this case
@@ -277,9 +277,11 @@ def _create_cache_directory(key):
277277
cache = config.get_cache_directory()
278278
cache_dir = os.path.join(cache, key)
279279
try:
280-
os.makedirs(cache_dir)
281-
except OSError:
282-
pass
280+
os.makedirs(cache_dir, exist_ok=True)
281+
except Exception as e:
282+
raise openml.exceptions.OpenMLCacheException(
283+
f"Cannot create cache directory {cache_dir}."
284+
) from e
283285
return cache_dir
284286

285287

tests/conftest.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,6 @@
3535
logger.setLevel(logging.DEBUG)
3636

3737
file_list = []
38-
directory = None
39-
40-
# finding the root directory of conftest.py and going up to OpenML main directory
41-
# exploiting the fact that conftest.py always resides in the root directory for tests
42-
static_dir = os.path.dirname(os.path.abspath(__file__))
43-
logger.info("static directory: {}".format(static_dir))
44-
while True:
45-
if "openml" in os.listdir(static_dir):
46-
break
47-
static_dir = os.path.join(static_dir, "..")
4838

4939

5040
def worker_id() -> str:
@@ -66,12 +56,11 @@ def read_file_list() -> List[str]:
6656
6757
:return: List[str]
6858
"""
69-
directory = os.path.join(static_dir, "tests/files/")
70-
if worker_id() == "master":
71-
logger.info("Collecting file lists from: {}".format(directory))
72-
files = os.walk(directory)
59+
this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
60+
directory = os.path.join(this_dir, "..")
61+
logger.info("Collecting file lists from: {}".format(directory))
7362
file_list = []
74-
for root, _, filenames in files:
63+
for root, _, filenames in os.walk(directory):
7564
for filename in filenames:
7665
file_list.append(os.path.join(root, filename))
7766
return file_list

0 commit comments

Comments
 (0)