Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
30dd55f
Add deprecation warning for retrieving dict
PGijsbers Jun 15, 2023
b502312
Refactor check_datasets_active to work with dataframe
PGijsbers Jun 15, 2023
357bb7d
Update unit tests to use list_datasets with output_format dataframe
PGijsbers Jun 15, 2023
29bbb57
Move list_datasets test to proper file
PGijsbers Jun 15, 2023
464e5dd
Remove list_datasets test, duplicate in test_datasets_functions
PGijsbers Jun 15, 2023
aaad25f
Update list_flows calls to use output_format='dataframe'
PGijsbers Jun 15, 2023
cf9dd7b
Update list_runs calls to require dataframe output
PGijsbers Jun 15, 2023
13f2fb5
Update list_setup calls for deprecation
PGijsbers Jun 15, 2023
d3342a1
Update list_study calls
PGijsbers Jun 15, 2023
b8a915b
Update list_tasks to specify output_format dataframe
PGijsbers Jun 15, 2023
3361b15
Add `output_format` to `list_datasets` call
PGijsbers Jun 15, 2023
be16355
Add TODO markers for removing `dict` support of `list_*` functions
PGijsbers Jun 15, 2023
5cc1287
Make status check less strict, call list_dataset with output_format
PGijsbers Jun 15, 2023
576e09c
Change index on id to did, since that's the dataset id's column name
PGijsbers Jun 15, 2023
b82febe
Update test to reflect new error message
PGijsbers Jun 15, 2023
cc944b5
Fix bug introduced by refactor
PGijsbers Jun 15, 2023
dca2590
Fix minor oversights of refactoring
PGijsbers Jun 15, 2023
5240504
Merge branch 'develop' into pandas_default
PGijsbers Jun 15, 2023
3cff453
Rename variables to reflect they are no longer lists
PGijsbers Jun 16, 2023
c130c41
Fix unsafe indexing on dataframe and remaining unit tests
PGijsbers Jun 16, 2023
22a6dd3
Perform safer check for integer dtypes
PGijsbers Jun 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update list_tasks to specify output_format dataframe
  • Loading branch information
PGijsbers committed Jun 15, 2023
commit b8a915be146af722688e3c967b4b1143ebc64a0c
2 changes: 1 addition & 1 deletion examples/30_extended/suites_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@

# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
all_tasks = list(openml.tasks.list_tasks().keys())
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))

# The study needs a machine-readable and unique alias. To obtain this,
Expand Down
23 changes: 7 additions & 16 deletions examples/30_extended/tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,19 @@
# Listing tasks
# ^^^^^^^^^^^^^
#
# We will start by simply listing only *supervised classification* tasks:

tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)

############################################################################
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert
# into a
# We will start by simply listing only *supervised classification* tasks.
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
# request a
# `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
# to have better visualization capabilities and easier access:
# instead to have better visualization capabilities and easier access:

tasks = pd.DataFrame.from_dict(tasks, orient="index")
tasks = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# As conversion to a pandas dataframe is a common task, we have added this functionality to the
# OpenML-Python library which can be used by passing ``output_format='dataframe'``:
tasks_df = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
print(tasks_df.head())

############################################################################
# We can filter the list of tasks to only contain datasets with more than
# 500 samples, but less than 1000 samples:
Expand Down
49 changes: 26 additions & 23 deletions tests/test_tasks/test_task_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# License: BSD 3-Clause

import os
from typing import cast
from unittest import mock

import pytest
Expand Down Expand Up @@ -56,7 +57,7 @@ def test__get_estimation_procedure_list(self):
def test_list_clustering_task(self):
# as shown by #383, clustering tasks can give list/dict casting problems
openml.config.server = self.production_server
openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10)
openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10, output_format="dataframe")
# the expected outcome is that it doesn't crash. No assertions.

def _check_task(self, task):
Expand All @@ -71,11 +72,11 @@ def _check_task(self, task):
def test_list_tasks_by_type(self):
num_curves_tasks = 198 # number is flexible, check server if fails
ttid = TaskType.LEARNING_CURVE
tasks = openml.tasks.list_tasks(task_type=ttid)
tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe")
self.assertGreaterEqual(len(tasks), num_curves_tasks)
for tid in tasks:
self.assertEqual(ttid, tasks[tid]["ttid"])
self._check_task(tasks[tid])
for task in tasks.to_dict(orient="index").values():
self.assertEqual(ttid, task["ttid"])
self._check_task(task)

def test_list_tasks_output_format(self):
ttid = TaskType.LEARNING_CURVE
Expand All @@ -84,33 +85,33 @@ def test_list_tasks_output_format(self):
self.assertGreater(len(tasks), 100)

def test_list_tasks_empty(self):
tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag")
if len(tasks) > 0:
raise ValueError("UnitTest Outdated, got somehow results (tag is used, please adapt)")

self.assertIsInstance(tasks, dict)
tasks = cast(
pd.DataFrame,
openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag", output_format="dataframe"),
)
assert tasks.empty

def test_list_tasks_by_tag(self):
num_basic_tasks = 100 # number is flexible, check server if fails
tasks = openml.tasks.list_tasks(tag="OpenML100")
tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
self.assertGreaterEqual(len(tasks), num_basic_tasks)
for tid in tasks:
self._check_task(tasks[tid])
for task in tasks.to_dict(orient="index").values():
self._check_task(task)

def test_list_tasks(self):
tasks = openml.tasks.list_tasks()
tasks = openml.tasks.list_tasks(output_format="dataframe")
self.assertGreaterEqual(len(tasks), 900)
for tid in tasks:
self._check_task(tasks[tid])
for task in tasks.to_dict(orient="index").values():
self._check_task(task)

def test_list_tasks_paginate(self):
size = 10
max = 100
for i in range(0, max, size):
tasks = openml.tasks.list_tasks(offset=i, size=size)
tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe")
self.assertGreaterEqual(size, len(tasks))
for tid in tasks:
self._check_task(tasks[tid])
for task in tasks.to_dict(orient="index").values():
self._check_task(task)

def test_list_tasks_per_type_paginate(self):
size = 40
Expand All @@ -122,11 +123,13 @@ def test_list_tasks_per_type_paginate(self):
]
for j in task_types:
for i in range(0, max, size):
tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size)
tasks = openml.tasks.list_tasks(
task_type=j, offset=i, size=size, output_format="dataframe"
)
self.assertGreaterEqual(size, len(tasks))
for tid in tasks:
self.assertEqual(j, tasks[tid]["ttid"])
self._check_task(tasks[tid])
for task in tasks.to_dict(orient="index").values():
self.assertEqual(j, task["ttid"])
self._check_task(task)

def test__get_task(self):
openml.config.set_root_cache_directory(self.static_cache_dir)
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tasks/test_task_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ def tearDown(self):
def test_tagging(self):
task = openml.tasks.get_task(1) # anneal; crossvalidation
tag = "testing_tag_{}_{}".format(self.id(), time())
task_list = openml.tasks.list_tasks(tag=tag)
task_list = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
Comment thread
PGijsbers marked this conversation as resolved.
Outdated
self.assertEqual(len(task_list), 0)
task.push_tag(tag)
task_list = openml.tasks.list_tasks(tag=tag)
task_list = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
self.assertEqual(len(task_list), 1)
self.assertIn(1, task_list)
self.assertIn(1, task_list["tid"])
task.remove_tag(tag)
task_list = openml.tasks.list_tasks(tag=tag)
task_list = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
self.assertEqual(len(task_list), 0)

def test_get_train_and_test_split_indices(self):
Expand Down
5 changes: 3 additions & 2 deletions tests/test_utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def test_list_all_for_datasets(self):

def test_list_all_for_tasks(self):
required_size = 1068 # default test server reset value
tasks = openml.tasks.list_tasks(batch_size=1000, size=required_size)

tasks = openml.tasks.list_tasks(
batch_size=1000, size=required_size, output_format="dataframe"
)
self.assertEqual(len(tasks), required_size)

def test_list_all_for_flows(self):
Expand Down