Skip to content

Commit aea2832

Browse files
committed
Updating with fixed unit tests from PR #1000
2 parents 90c8de6 + 50ce90e commit aea2832

File tree

13 files changed

+331
-100
lines changed

13 files changed

+331
-100
lines changed

openml/testing.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
import shutil
77
import sys
88
import time
9-
from typing import Dict
9+
from typing import Dict, Union, cast
1010
import unittest
1111
import warnings
12+
import pandas as pd
1213

1314
# Currently, importing oslo raises a lot of warning that it will stop working
1415
# under python3.8; remove this once they disappear
@@ -252,6 +253,58 @@ def _check_fold_timing_evaluations(
252253
self.assertLessEqual(evaluation, max_val)
253254

254255

256+
def check_task_existence(
257+
task_type: TaskType, dataset_id: int, target_name: str, **kwargs
258+
) -> Union[int, None]:
259+
"""Checks if any task exists on the test server that matches the given meta-data.
260+
261+
Parameters
262+
----------
263+
task_type : openml.tasks.TaskType
264+
ID of the task type as detailed `here <https://www.openml.org/search?type=task_type>`_.
265+
- Supervised classification: 1
266+
- Supervised regression: 2
267+
- Learning curve: 3
268+
- Supervised data stream classification: 4
269+
- Clustering: 5
270+
- Machine Learning Challenge: 6
271+
- Survival Analysis: 7
272+
- Subgroup Discovery: 8
273+
dataset_id : int
274+
target_name : str
275+
276+
Returns
277+
-------
278+
int, None
279+
"""
280+
return_val = None
281+
tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
282+
if len(tasks) == 0:
283+
return None
284+
tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id]
285+
if len(tasks) == 0:
286+
return None
287+
tasks = tasks.loc[tasks["target_feature"] == target_name]
288+
if len(tasks) == 0:
289+
return None
290+
task_match = []
291+
for task_id in tasks["tid"].to_list():
292+
task_match.append(task_id)
293+
task = openml.tasks.get_task(task_id)
294+
for k, v in kwargs.items():
295+
if getattr(task, k) != v:
296+
# even if one of the meta-data keys mismatches, then task_id is not a match
297+
task_match.pop(-1)
298+
break
299+
# if task_id is retained in the task_match list, it passed all meta key-value matches
300+
if len(task_match) == 1:
301+
return_val = task_id
302+
break
303+
if len(task_match) == 0:
304+
return_val = None
305+
return return_val
306+
307+
255308
try:
256309
from sklearn.impute import SimpleImputer
257310
except ImportError:

openml/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from functools import wraps
1010
import collections
1111

12+
import openml
1213
import openml._api_calls
1314
import openml.exceptions
1415
from . import config

tests/test_datasets/test_dataset_functions.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
DATASETS_CACHE_DIR_NAME,
3737
)
3838
from openml.datasets import fork_dataset, edit_dataset
39+
from openml.tasks import TaskType, create_task
3940

4041

4142
class TestOpenMLDataset(TestBase):
@@ -1350,7 +1351,7 @@ def test_data_edit_errors(self):
13501351
"original_data_url, default_target_attribute, row_id_attribute, "
13511352
"ignore_attribute or paper_url to edit.",
13521353
edit_dataset,
1353-
data_id=564,
1354+
data_id=64, # blood-transfusion-service-center
13541355
)
13551356
# Check server exception when unknown dataset is provided
13561357
self.assertRaisesRegex(
@@ -1360,15 +1361,32 @@ def test_data_edit_errors(self):
13601361
data_id=999999,
13611362
description="xor operation dataset",
13621363
)
1364+
1365+
# Need to own a dataset to be able to edit meta-data
1366+
# Will be creating a forked version of an existing dataset to allow the unit test user
1367+
# to edit meta-data of a dataset
1368+
did = fork_dataset(1)
1369+
self._wait_for_dataset_being_processed(did)
1370+
TestBase._mark_entity_for_removal("data", did)
1371+
# Need to upload a task attached to this data to test edit failure
1372+
task = create_task(
1373+
task_type=TaskType.SUPERVISED_CLASSIFICATION,
1374+
dataset_id=did,
1375+
target_name="class",
1376+
estimation_procedure_id=1,
1377+
)
1378+
task = task.publish()
1379+
TestBase._mark_entity_for_removal("task", task.task_id)
13631380
# Check server exception when owner/admin edits critical fields of dataset with tasks
13641381
self.assertRaisesRegex(
13651382
OpenMLServerException,
13661383
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
13671384
"can only be edited for datasets without any tasks.",
13681385
edit_dataset,
1369-
data_id=223,
1386+
data_id=did,
13701387
default_target_attribute="y",
13711388
)
1389+
13721390
# Check server exception when a non-owner or non-admin tries to edit critical fields
13731391
self.assertRaisesRegex(
13741392
OpenMLServerException,

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,7 +1465,7 @@ def test_openml_param_name_to_sklearn(self):
14651465
)
14661466
model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)])
14671467
flow = self.extension.model_to_flow(model)
1468-
task = openml.tasks.get_task(115)
1468+
task = openml.tasks.get_task(115) # diabetes; crossvalidation
14691469
run = openml.runs.run_flow_on_task(flow, task)
14701470
run = run.publish()
14711471
TestBase._mark_entity_for_removal("run", run.run_id)
@@ -1561,7 +1561,7 @@ def setUp(self):
15611561
# Test methods for performing runs with this extension module
15621562

15631563
def test_run_model_on_task(self):
1564-
task = openml.tasks.get_task(1)
1564+
task = openml.tasks.get_task(1) # anneal; crossvalidation
15651565
# using most_frequent imputer since dataset has mixed types and to keep things simple
15661566
pipe = sklearn.pipeline.Pipeline(
15671567
[
@@ -1626,7 +1626,7 @@ def test_seed_model_raises(self):
16261626
self.extension.seed_model(model=clf, seed=42)
16271627

16281628
def test_run_model_on_fold_classification_1_array(self):
1629-
task = openml.tasks.get_task(1)
1629+
task = openml.tasks.get_task(1) # anneal; crossvalidation
16301630

16311631
X, y = task.get_X_and_y()
16321632
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
@@ -1689,7 +1689,7 @@ def test_run_model_on_fold_classification_1_array(self):
16891689
def test_run_model_on_fold_classification_1_dataframe(self):
16901690
from sklearn.compose import ColumnTransformer
16911691

1692-
task = openml.tasks.get_task(1)
1692+
task = openml.tasks.get_task(1) # anneal; crossvalidation
16931693

16941694
# diff test_run_model_on_fold_classification_1_array()
16951695
X, y = task.get_X_and_y(dataset_format="dataframe")
@@ -1753,7 +1753,7 @@ def test_run_model_on_fold_classification_1_dataframe(self):
17531753
)
17541754

17551755
def test_run_model_on_fold_classification_2(self):
1756-
task = openml.tasks.get_task(7)
1756+
task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
17571757

17581758
X, y = task.get_X_and_y()
17591759
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
@@ -1815,7 +1815,11 @@ def predict_proba(*args, **kwargs):
18151815
raise AttributeError("predict_proba is not available when " "probability=False")
18161816

18171817
# task 1 (test server) is important: it is a task with an unused class
1818-
tasks = [1, 3, 115]
1818+
tasks = [
1819+
1, # anneal; crossvalidation
1820+
3, # anneal; crossvalidation
1821+
115, # diabetes; crossvalidation
1822+
]
18191823
flow = unittest.mock.Mock()
18201824
flow.name = "dummy"
18211825

@@ -1969,7 +1973,7 @@ def test__extract_trace_data(self):
19691973
"max_iter": [10, 20, 40, 80],
19701974
}
19711975
num_iters = 10
1972-
task = openml.tasks.get_task(20)
1976+
task = openml.tasks.get_task(20) # balance-scale; crossvalidation
19731977
clf = sklearn.model_selection.RandomizedSearchCV(
19741978
sklearn.neural_network.MLPClassifier(), param_grid, num_iters,
19751979
)
@@ -2080,8 +2084,8 @@ def test_run_on_model_with_empty_steps(self):
20802084
from sklearn.compose import ColumnTransformer
20812085

20822086
# testing 'drop', 'passthrough', None as non-actionable sklearn estimators
2083-
dataset = openml.datasets.get_dataset(128)
2084-
task = openml.tasks.get_task(59)
2087+
dataset = openml.datasets.get_dataset(128) # iris
2088+
task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation
20852089

20862090
X, y, categorical_ind, feature_names = dataset.get_data(
20872091
target=dataset.default_target_attribute, dataset_format="array"
@@ -2198,7 +2202,7 @@ def test_failed_serialization_of_custom_class(self):
21982202
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
21992203
) # build a sklearn classifier
22002204

2201-
task = openml.tasks.get_task(253) # data with mixed types from test server
2205+
task = openml.tasks.get_task(253) # profb; crossvalidation
22022206
try:
22032207
_ = openml.runs.run_model_on_task(clf, task)
22042208
except AttributeError as e:

tests/test_flows/test_flow_functions.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,11 +345,15 @@ def test_get_flow_id(self):
345345
with patch("openml.utils._list_all", list_all):
346346
clf = sklearn.tree.DecisionTreeClassifier()
347347
flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
348+
TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name))
349+
TestBase.logger.info(
350+
"collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)
351+
)
348352

349353
self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id)
350354
flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
351355
self.assertIn(flow.flow_id, flow_ids)
352-
self.assertGreater(len(flow_ids), 2)
356+
self.assertGreater(len(flow_ids), 0)
353357

354358
# Check that the output of get_flow_id is identical if only the name is given, no matter
355359
# whether exact_version is set to True or False.
@@ -361,4 +365,3 @@ def test_get_flow_id(self):
361365
)
362366
self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False)
363367
self.assertIn(flow.flow_id, flow_ids_exact_version_True)
364-
self.assertGreater(len(flow_ids_exact_version_True), 2)

tests/test_runs/test_run.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def test_to_from_filesystem_vanilla(self):
102102
("classifier", DecisionTreeClassifier(max_depth=1)),
103103
]
104104
)
105-
task = openml.tasks.get_task(119)
105+
task = openml.tasks.get_task(119) # diabetes; crossvalidation
106106
run = openml.runs.run_model_on_task(
107107
model=model,
108108
task=task,
@@ -142,7 +142,7 @@ def test_to_from_filesystem_search(self):
142142
},
143143
)
144144

145-
task = openml.tasks.get_task(119)
145+
task = openml.tasks.get_task(119) # diabetes; crossvalidation
146146
run = openml.runs.run_model_on_task(
147147
model=model, task=task, add_local_measures=False, avoid_duplicate_runs=False,
148148
)
@@ -163,7 +163,7 @@ def test_to_from_filesystem_no_model(self):
163163
model = Pipeline(
164164
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
165165
)
166-
task = openml.tasks.get_task(119)
166+
task = openml.tasks.get_task(119) # diabetes; crossvalidation
167167
run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False)
168168

169169
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
@@ -184,7 +184,7 @@ def test_publish_with_local_loaded_flow(self):
184184
model = Pipeline(
185185
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
186186
)
187-
task = openml.tasks.get_task(119)
187+
task = openml.tasks.get_task(119) # diabetes; crossvalidation
188188

189189
# Make sure the flow does not exist on the server yet.
190190
flow = extension.model_to_flow(model)

0 commit comments

Comments
 (0)