-
-
Notifications
You must be signed in to change notification settings - Fork 270
Feature/give possibility to not download the dataset qualities #1017
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
48653e2
6cc1952
4ec995f
611d2bf
d5d3a15
08a3c41
c3d9684
84b9a8f
63b791d
55c7196
1b6467d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -290,6 +290,8 @@ def _name_to_id( | |
| error_if_multiple : bool (default=False) | ||
| If `False`, if multiple datasets match, return the least recent active dataset. | ||
| If `True`, if multiple datasets match, raise an error. | ||
| download_qualities : bool, optional (default=True) | ||
| If `True`, also download the qualities.xml file. If `False`, the qualities file is not downloaded and no cached version is used. | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -310,7 +312,7 @@ def _name_to_id( | |
|
|
||
|
|
||
| def get_datasets( | ||
| dataset_ids: List[Union[str, int]], download_data: bool = True, | ||
| dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True | ||
| ) -> List[OpenMLDataset]: | ||
| """Download datasets. | ||
|
|
||
|
|
@@ -326,6 +328,8 @@ def get_datasets( | |
| make the operation noticeably slower. Metadata is also still retrieved. | ||
| If False, create the OpenMLDataset and only populate it with the metadata. | ||
| The data may later be retrieved through the `OpenMLDataset.get_data` method. | ||
| download_qualities : bool, optional (default=True) | ||
| If True, also download the qualities.xml file. If False, the qualities file is not downloaded and no cached version is used. | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -334,7 +338,9 @@ def get_datasets( | |
| """ | ||
| datasets = [] | ||
| for dataset_id in dataset_ids: | ||
| datasets.append(get_dataset(dataset_id, download_data)) | ||
| datasets.append( | ||
| get_dataset(dataset_id, download_data, download_qualities=download_qualities) | ||
| ) | ||
| return datasets | ||
|
|
||
|
|
||
|
|
@@ -345,6 +351,7 @@ def get_dataset( | |
| version: int = None, | ||
| error_if_multiple: bool = False, | ||
| cache_format: str = "pickle", | ||
| download_qualities: bool = True, | ||
| ) -> OpenMLDataset: | ||
| """ Download the OpenML dataset representation, optionally also download actual data file. | ||
|
|
||
|
|
@@ -405,7 +412,9 @@ def get_dataset( | |
| features_file = _get_dataset_features_file(did_cache_dir, dataset_id) | ||
|
|
||
| try: | ||
| qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) | ||
| qualities_file = _get_dataset_qualities_file( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it makes more sense to use |
||
| did_cache_dir, dataset_id, download_qualities | ||
| ) | ||
| except OpenMLServerException as e: | ||
| if e.code == 362 and str(e) == "No qualities found - None": | ||
| logger.warning("No qualities found for dataset {}".format(dataset_id)) | ||
|
|
@@ -981,7 +990,7 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str: | |
| return features_file | ||
|
|
||
|
|
||
| def _get_dataset_qualities_file(did_cache_dir, dataset_id): | ||
| def _get_dataset_qualities_file(did_cache_dir, dataset_id, download_qualities=True): | ||
| """API call to load dataset qualities. Loads from cache or downloads them. | ||
|
|
||
| Features are metafeatures (number of features, number of classes, ...) | ||
|
|
@@ -996,11 +1005,17 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id): | |
| dataset_id : int | ||
| Dataset ID | ||
|
|
||
| download_qualities : bool | ||
| Whether to download the qualities file. If False, an empty path is returned. | ||
| Returns | ||
| ------- | ||
| str | ||
| Path of the cached qualities file | ||
| """ | ||
| # Return an empty path to avoid using a cached version; this makes the output consistent | ||
| # regardless of the cache state. | ||
| if not download_qualities: | ||
| return "" | ||
| # Dataset qualities are subject to change and must be fetched every time | ||
| qualities_file = os.path.join(did_cache_dir, "qualities.xml") | ||
| try: | ||
|
|
@@ -1009,10 +1024,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id): | |
| except (OSError, IOError): | ||
| url_extension = "data/qualities/{}".format(dataset_id) | ||
| qualities_xml = openml._api_calls._perform_api_call(url_extension, "get") | ||
|
|
||
| with io.open(qualities_file, "w", encoding="utf8") as fh: | ||
| fh.write(qualities_xml) | ||
|
|
||
| return qualities_file | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -433,6 +433,11 @@ def test__get_dataset_qualities(self): | |
| qualities_xml_path = os.path.join(self.workdir, "qualities.xml") | ||
| self.assertTrue(os.path.exists(qualities_xml_path)) | ||
|
|
||
| def test__get_dataset_qualities_skip_download(self): | ||
| qualities = _get_dataset_qualities_file(self.workdir, 2, False) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you use keywords here (e.g. |
||
| self.assertIsInstance(qualities, str) | ||
| self.assertEqual(qualities, "") | ||
|
|
||
| def test_deletion_of_cache_dir(self): | ||
| # Simple removal | ||
| did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please update this and the other references where it says the cached version is used if
`false` is passed, as this is not the behavior anymore.