61 commits
b45f6f2
Adding importable helper functions
Neeratyoy Oct 29, 2020
8e7ea0b
Changing import of cat, cont
Neeratyoy Oct 29, 2020
102a084
Merge branch 'develop' into fix_773
Neeratyoy Oct 29, 2020
18a2dba
Better docstrings
Neeratyoy Oct 30, 2020
381c267
Adding unit test to check ColumnTransformer
Neeratyoy Oct 30, 2020
5dbff2e
Refinements from @mfeurer
Neeratyoy Nov 2, 2020
fc4ec73
Editing example to support both NumPy and Pandas
Neeratyoy Nov 2, 2020
8d5cad9
Merge branch 'develop' into fix_773
Neeratyoy Nov 3, 2020
3d66404
Merge branch 'develop' into fix_773
Neeratyoy Nov 4, 2020
90c8de6
Unit test fix to mark for deletion
Neeratyoy Nov 4, 2020
e0af15e
Making some unit tests work
Neeratyoy Nov 10, 2020
14aa11d
Waiting for dataset to be processed
Neeratyoy Nov 16, 2020
31d48d8
Minor test collection fix
Neeratyoy Nov 16, 2020
431447c
Template to handle missing tasks
Neeratyoy Nov 30, 2020
cc3199e
Accounting for more missing tasks:
Neeratyoy Nov 30, 2020
8a29668
Fixing some more unit tests
Neeratyoy Nov 30, 2020
405e03c
Simplifying check_task_existence
Neeratyoy Nov 30, 2020
caf4f46
black changes
Neeratyoy Dec 4, 2020
b308e71
Minor formatting
Neeratyoy Dec 8, 2020
436a9fe
Handling task exists check
Neeratyoy Dec 9, 2020
ddd8b04
Testing edited check task func
Neeratyoy Dec 14, 2020
74ae622
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 14, 2020
50ce90e
Flake fix
Neeratyoy Dec 15, 2020
aea2832
Updating with fixed unit tests from PR #1000
Neeratyoy Dec 15, 2020
56cd639
More retries on connection error
Neeratyoy Dec 16, 2020
8e8ea2e
Adding max_retries to config default
Neeratyoy Dec 17, 2020
d518beb
Update database retry unit test
Neeratyoy Dec 17, 2020
37d9f6b
Print to debug hash exception
Neeratyoy Dec 17, 2020
9bd4892
Fixing checksum unit test
Neeratyoy Dec 17, 2020
dc41b5d
Retry on _download_text_file
Neeratyoy Dec 18, 2020
396cb8d
Update datasets_tutorial.py
mfeurer Dec 21, 2020
8f380de
Update custom_flow_tutorial.py
mfeurer Dec 21, 2020
bc1745e
Update test_study_functions.py
mfeurer Dec 21, 2020
d95b5e6
Update test_dataset_functions.py
mfeurer Dec 21, 2020
d58ca5a
Merge branch 'fix_unit_tests' into fix_773
Neeratyoy Dec 21, 2020
91c6cf5
more retries, but also more time between retries
mfeurer Dec 21, 2020
b43a0e0
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
a9430b3
allow for even more retries on get calls
mfeurer Dec 21, 2020
e9cfba8
Catching failed get task
Neeratyoy Dec 21, 2020
c13f6ce
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
3d7abc2
undo stupid change
mfeurer Dec 21, 2020
94576b1
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
b5e1242
fix one more test
mfeurer Dec 21, 2020
d764aad
Merge branch 'fix_unit_tests' into fix_773
Neeratyoy Dec 21, 2020
f5e4a3e
Refactoring md5 hash check inside _send_request
Neeratyoy Dec 21, 2020
c065dfc
Merge branch 'fix_unit_tests' into fix_773
Neeratyoy Dec 21, 2020
07ce722
Fixing a fairly common unit test fail
Neeratyoy Dec 22, 2020
82e1b72
Reverting loose check on unit test
Neeratyoy Dec 23, 2020
936c252
Merge branch 'fix_unit_tests' into fix_773
Neeratyoy Dec 23, 2020
fc8b464
Merge branch 'develop' into fix_773
PGijsbers Dec 24, 2020
7ef965b
Updating examples to run on sklearn 0.24
Jan 8, 2021
8f693e4
Spawning tests for sklearn 0.24
Jan 8, 2021
9198489
Adding numpy import
Jan 8, 2021
46ab043
Fixing integer type check to allow np.integer
Neeratyoy Jan 22, 2021
c892b6b
Making unit tests run on sklearn 0.24
Neeratyoy Jan 22, 2021
ac173aa
black fix
Neeratyoy Jan 25, 2021
1be82c3
Trying to loosen check on unit test as fix
Neeratyoy Jan 25, 2021
902cd3f
Updating with PR #982
Neeratyoy Jan 26, 2021
0e44a0b
Merge branch 'develop' into sklearn24-support
Neeratyoy Jan 28, 2021
2fd4849
simplify examples
mfeurer Jan 28, 2021
0ae7075
disable test for old python version
mfeurer Jan 28, 2021
Testing edited check task func
Neeratyoy committed Dec 14, 2020
commit ddd8b04f59669346c857002bd76e24f086333810
57 changes: 55 additions & 2 deletions openml/testing.py
@@ -6,9 +6,10 @@
import shutil
import sys
import time
from typing import Dict
from typing import Dict, Union, cast
import unittest
import warnings
import pandas as pd

# Currently, importing oslo raises a lot of warnings that it will stop working
# under python3.8; remove this once they disappear
@@ -252,6 +253,58 @@ def _check_fold_timing_evaluations(
self.assertLessEqual(evaluation, max_val)


def check_task_existence(
task_type: TaskType, dataset_id: int, target_name: str, **kwargs
) -> Union[int, None]:
"""Checks if any task with exists on test server that matches the meta data.

Parameters
----------
task_type : openml.tasks.TaskType
ID of the task type as detailed `here <https://www.openml.org/search?type=task_type>`_.
- Supervised classification: 1
- Supervised regression: 2
- Learning curve: 3
- Supervised data stream classification: 4
- Clustering: 5
- Machine Learning Challenge: 6
- Survival Analysis: 7
- Subgroup Discovery: 8
dataset_id : int
ID of the dataset on which the task is defined.
target_name : str
Name of the target feature of the task.
**kwargs
Any further task attributes to match, e.g. estimation_procedure_id or class_labels.

Returns
-------
int or None
The ID of a matching task if one exists, otherwise None.
"""
return_val = None
tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
if len(tasks) == 0:
return None
tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id]
if len(tasks) == 0:
return None
tasks = tasks.loc[tasks["target_feature"] == target_name]
if len(tasks) == 0:
return None
task_match = []
for task_id in tasks["tid"].to_list():
task_match.append(task_id)
task = openml.tasks.get_task(task_id)
for k, v in kwargs.items():
if getattr(task, k) != v:
# if even one meta-data key mismatches, task_id is not a match
task_match.pop(-1)
break
# if task_id is still in task_match, it matched all meta-data key-value pairs
if len(task_match) == 1:
return_val = task_id
break
if len(task_match) == 0:
return_val = None
return return_val


try:
from sklearn.impute import SimpleImputer
except ImportError:
@@ -275,4 +328,4 @@ def cat(X):
return X.dtypes == "category"


__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"]
__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
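
The helper above matches any extra attributes passed as keyword arguments via getattr on the fetched task, which is exactly how the updated tests feed it their TASK_META_DATA entries. A minimal usage sketch, not part of this diff, assuming a client configured for the OpenML test server; the target name "class" for dataset 128 (iris) is an assumption:

```python
import openml
from openml.tasks import TaskType
from openml.testing import check_task_existence

# point the client at the test server (an API key must also be configured)
openml.config.server = "https://test.openml.org/api/v1/xml"

# look for a supervised classification task on dataset 128 (iris);
# extra attributes such as estimation_procedure_id are compared via getattr
task_id = check_task_existence(
    TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=128,
    target_name="class",  # assumed target feature name
    estimation_procedure_id=1,
)
if task_id is None:
    # no match found: create and publish a new task, as the updated tests do
    new_task = openml.tasks.create_task(
        task_type=TaskType.SUPERVISED_CLASSIFICATION,
        dataset_id=128,
        target_name="class",
        estimation_procedure_id=1,
    ).publish()
    task_id = new_task.task_id
```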
47 changes: 0 additions & 47 deletions openml/utils.py
@@ -3,7 +3,6 @@
import os
import xmltodict
import shutil
import typing
from typing import TYPE_CHECKING, List, Tuple, Union, Type
import warnings
import pandas as pd
@@ -33,52 +32,6 @@
pass


def check_task_existence(task_meta_data: dict) -> Union[int, None]:
"""Checks if any task with exists on test server that matches the meta data.

Parameters
----------
task_meta_data : dict
A dictionary containing meta-information on the task fetched from the test server.

Returns
-------
int, None
"""
return_val = None
try:
tasks = openml.tasks.list_tasks(output_format="dataframe")
tasks = typing.cast(pd.DataFrame, tasks).loc[
tasks["task_type"] == task_meta_data["task_type"]
]
if len(tasks) == 0:
return None
tasks = tasks.loc[tasks.did == task_meta_data["dataset_id"]]
if len(tasks) == 0:
return None
tasks = tasks.loc[tasks.target_feature == task_meta_data["target_name"]]
if len(tasks) == 0:
return None
task_match = []
for task_id in tasks.tid.values:
task_match.append(task_id)
task = openml.tasks.get_task(task_id)
for k, v in task_meta_data.items():
if getattr(task, k) != v:
# if even one meta-data key mismatches, task_id is not a match
task_match.pop(-1)
break
# if task_id is still in task_match, it matched all meta-data key-value pairs
if len(task_match) == 1:
return_val = task_id
break
if len(task_match) == 0:
return_val = None
except openml.exceptions.OpenMLServerException:
return_val = None
return return_val


def extract_xml_tags(xml_tag_name, node, allow_none=True):
"""Helper to extract xml tags from xmltodict.

2 changes: 1 addition & 1 deletion tests/test_datasets/test_dataset_functions.py
@@ -1351,7 +1351,7 @@ def test_data_edit_errors(self):
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
data_id=64,
data_id=64, # blood-transfusion-service-center
)
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
55 changes: 40 additions & 15 deletions tests/test_runs/test_run_functions.py
@@ -7,6 +7,7 @@
import random
import time
import sys
import ast
import unittest.mock

import numpy as np
@@ -24,7 +25,8 @@
from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction
from openml.runs.trace import OpenMLRunTrace
from openml.tasks import TaskType
from openml.utils import check_task_existence
from openml.testing import check_task_existence
from openml.exceptions import OpenMLServerException

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection._search import BaseSearchCV
@@ -60,13 +62,13 @@ class TestRun(TestBase):
# unit tests to pass by uploading a similar task at runtime
TASK_META_DATA = {
1605: {
"task_type": "Supervised Regression",
"dataset_id": 123,
"task_type": TaskType.SUPERVISED_REGRESSION,
"dataset_id": 123, # quake
"estimation_procedure_id": 7,
"target_name": "richter",
},
1481: {
"task_type": "Supervised Classification",
"task_type": TaskType.SUPERVISED_CLASSIFICATION,
"dataset_id": 128, # iris
"estimation_procedure_id": 1,
"class_labels": ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
@@ -517,7 +519,7 @@ def _run_and_upload_classification(
def _run_and_upload_regression(
self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None
):
num_folds = 10 # because of holdout
num_folds = 10 # because of cross-validation
num_iterations = 5 # for base search algorithms
metric = sklearn.metrics.mean_absolute_error # metric class
metric_name = "mean_absolute_error" # openml metric name
@@ -549,15 +551,23 @@ def test_run_and_upload_linear_regression(self):
task_id = self.TEST_SERVER_TASK_REGRESSION[0]

task_meta_data = self.TASK_META_DATA[task_id]
_task_id = check_task_existence(task_meta_data)
_task_id = check_task_existence(**task_meta_data)
if _task_id is not None:
task_id = _task_id
else:
task_meta_data["task_type"] = TaskType.SUPERVISED_REGRESSION
new_task = openml.tasks.create_task(**task_meta_data)
# publishes the new task
new_task = new_task.publish()
task_id = new_task.task_id
try:
new_task = new_task.publish()
task_id = new_task.task_id
except OpenMLServerException as e:
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
TestBase._mark_entity_for_removal("task", task_id)
TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
@@ -966,15 +976,23 @@ def test_initialize_model_from_run(self):

task_id = 1481 # this task may be deleted during test server maintenance
task_meta_data = self.TASK_META_DATA[task_id]
_task_id = check_task_existence(task_meta_data)
_task_id = check_task_existence(**task_meta_data)
if _task_id is not None:
task_id = _task_id
else:
task_meta_data["task_type"] = TaskType.SUPERVISED_CLASSIFICATION
new_task = openml.tasks.create_task(**task_meta_data)
# publishes the new task
new_task = new_task.publish()
task_id = new_task.task_id
try:
new_task = new_task.publish()
task_id = new_task.task_id
except OpenMLServerException as e:
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
TestBase._mark_entity_for_removal("task", task_id)
TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
@@ -1514,15 +1532,23 @@ def test_format_prediction_task_regression(self):
task_id = self.TEST_SERVER_TASK_REGRESSION[0]

task_meta_data = self.TASK_META_DATA[task_id]
_task_id = check_task_existence(task_meta_data)
_task_id = check_task_existence(**task_meta_data)
if _task_id is not None:
task_id = _task_id
else:
task_meta_data["task_type"] = TaskType.SUPERVISED_REGRESSION
new_task = openml.tasks.create_task(**task_meta_data)
# publishes the new task
new_task = new_task.publish()
task_id = new_task.task_id
try:
new_task = new_task.publish()
task_id = new_task.task_id
except OpenMLServerException as e:
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
TestBase._mark_entity_for_removal("task", task_id)
TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
@@ -1531,4 +1557,3 @@ def test_format_prediction_task_regression(self):
ignored_input = [0] * 5
res = format_prediction(regression, *ignored_input)
self.assertListEqual(res, [0] * 5)
self.assertListEqual(res, [0] * 5)
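
Each call site wraps publish() in the same fallback: when the server responds with error code 614, the id of the already-existing task is parsed out of the exception message instead of failing the test. A standalone sketch of just that parsing step; the helper name is hypothetical, and the message format is the one quoted in the diff comments:

```python
import ast

def task_id_from_614_message(message: str) -> int:
    # hypothetical helper, not part of this diff; expected message format:
    # 'Task already exists. - matched id(s): [1593]'
    # everything after the marker is a Python list literal, e.g. '[1593]'
    matched_ids = ast.literal_eval(message.split("matched id(s):")[-1].strip())
    return matched_ids[0]

assert task_id_from_614_message("Task already exists. - matched id(s): [1593]") == 1593
```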
22 changes: 16 additions & 6 deletions tests/test_tasks/test_regression_task.py
@@ -1,11 +1,13 @@
# License: BSD 3-Clause

import ast
import numpy as np

import openml
from openml.tasks import TaskType
from openml.testing import TestBase
from openml.utils import check_task_existence
from openml.testing import check_task_existence
from openml.exceptions import OpenMLServerException
from .test_supervised_task import OpenMLSupervisedTaskTest


@@ -17,20 +19,28 @@ def setUp(self, n_levels: int = 1):
super(OpenMLRegressionTaskTest, self).setUp()

task_meta_data = {
"task_type": "Supervised Regression",
"dataset_id": 105,
"task_type": TaskType.SUPERVISED_REGRESSION,
"dataset_id": 105, # wisconsin
"estimation_procedure_id": 7,
"target_name": "time",
}
_task_id = check_task_existence(task_meta_data)
_task_id = check_task_existence(**task_meta_data)
if _task_id is not None:
task_id = _task_id
else:
task_meta_data["task_type"] = TaskType.SUPERVISED_REGRESSION
new_task = openml.tasks.create_task(**task_meta_data)
# publishes the new task
new_task = new_task.publish()
task_id = new_task.task_id
try:
new_task = new_task.publish()
task_id = new_task.task_id
except OpenMLServerException as e:
if e.code == 614: # Task already exists
# the exception message contains the task_id that was matched in the format
# 'Task already exists. - matched id(s): [xxxx]'
task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
else:
raise Exception(repr(e))
# mark to remove the uploaded task
TestBase._mark_entity_for_removal("task", task_id)
TestBase.logger.info("collected from test_run_functions: {}".format(task_id))