Reiterating with changes to example from @mfeurer suggestions
Neeratyoy committed Apr 5, 2021
commit ba19ab74527d773113d00ebff08e7b93cf7d02ff
68 changes: 43 additions & 25 deletions examples/30_extended/fetch_runtimes_tutorial.py

We shall cover these 3 representative scenarios:

* Retrieve runtimes for Random Forest training and prediction on each of the cross-validation folds

* Test the above setting in a parallel setup and monitor the difference using runtimes retrieved

* Compare RandomSearchCV and GridSearchCV on the above task based on runtimes
"""

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


############################################################################
# Preparing tasks and scikit-learn models
# ***************************************

# Viewing associated data
X, y = task.get_X_and_y(dataset_format="array")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
        task_id, n_repeats, n_folds, n_samples,
    )
)


# Creating utility function
def print_compare_runtimes(measures):
    # Prints the CPU vs wall-clock training time comparison per repeat-fold
    for repeat, val1 in measures["usercpu_time_millis_training"].items():
        for fold, val2 in val1.items():
            wall = measures["wall_clock_time_millis_training"][repeat][fold]
            print("Repeat #{}-Fold #{}: CPU-{:.3f} vs Wall-{:.3f}".format(repeat, fold, val2, wall))
# We'll run a Random Forest model and obtain an OpenML run object. We can
# see the evaluations recorded per fold for the dataset and the information
# available for this run.
clf = RandomForestClassifier(n_estimators=10)

run1 = openml.runs.run_model_on_task(
    model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False,
)
print("Repeat #{}-Fold #{}: {:.4f}".format(repeat, fold, val2))
print()

################################################################################
# The remaining entries recorded in `measures` are the runtime records
# related as:
#
# usercpu_time_millis = usercpu_time_millis_training + usercpu_time_millis_testing
# wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing
#
# The timing measures recorded as `*_millis_training` contain the per
# repeat-fold timing incurred for the executing of the `.fit()` procedure
# of the model. For `usercpu_time_*` the time recorded using `time.process_time()`
# is converted to milliseconds and stored. Similarly, `time.time()` is used
# to record the time entry for `wall_clock_time_*`. The `*_millis_testing` entry
# follows the same procedure but for time taken for the `.predict()` procedure.
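############################################################################
# As a hedged sketch (an illustration of the description above, not OpenML's
# actual implementation), the two clocks can be recorded around a call and
# converted to milliseconds:

```python
import time

def timed_call(func, *args, **kwargs):
    # Record both clocks before and after the call
    cpu_start, wall_start = time.process_time(), time.time()
    result = func(*args, **kwargs)
    usercpu_time_millis = (time.process_time() - cpu_start) * 1000
    wall_clock_time_millis = (time.time() - wall_start) * 1000
    return result, usercpu_time_millis, wall_clock_time_millis

# `timed_call` is an illustrative helper name, not part of the OpenML API
total, cpu_ms, wall_ms = timed_call(sum, range(100000))
print(cpu_ms, wall_ms)
```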

# Comparing the CPU and wall-clock training times of the Random Forest model
print_compare_runtimes(measures)
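############################################################################
# The additive relation above can be sanity-checked per repeat-fold entry; the
# values below are hypothetical stand-ins, not taken from an actual run:

```python
# Hypothetical per repeat-fold timings (milliseconds), illustrative only
measures_demo = {
    "usercpu_time_millis_training": {0: {0: 120.0, 1: 110.0}},
    "usercpu_time_millis_testing": {0: {0: 30.0, 1: 25.0}},
    "usercpu_time_millis": {0: {0: 150.0, 1: 135.0}},
}
for repeat, folds in measures_demo["usercpu_time_millis"].items():
    for fold, total in folds.items():
        train = measures_demo["usercpu_time_millis_training"][repeat][fold]
        test = measures_demo["usercpu_time_millis_testing"][repeat][fold]
        assert total == train + test  # total time = training + testing
```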

################################################################################
# Case 2: Running a Random Forest model on an OpenML task in parallel (2 cores)
# *****************************************************************************
# Redefining the model to allow parallelism with `n_jobs=2`
clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

# The wall-clock time recorded per fold should be less than in Case 1 above
print_compare_runtimes(measures)

####################################################################################
# Running a Random Forest model on an OpenML task in parallel (all cores available):

# Redefining the model to use all available cores with `n_jobs=-1`
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)

# if more than 2 CPU cores are available
print_compare_runtimes(measures)

#############################################################################
# It should be noted that there are multiple levels at which parallelism can
# occur here.
#
# * At the outermost level, OpenML tasks contain fixed data splits, on which the
# defined model/flow is executed. Thus, a model can be fit on each OpenML dataset fold
# in parallel using the `n_jobs` parameter to `run_model_on_task` or `run_flow_on_task`.
#
# * The model/flow specified can also include scikit-learn models that perform their own
# parallelisation. For instance, by specifying `n_jobs` in the Random Forest model definition.
#
# * The sklearn model can further be an HPO estimator and contain its own parallelisation.
# If the base estimator used supports parallelisation, then a 2-level nested
# parallelisation is possible. Such an example is explored next.
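############################################################################
# As a minimal local sketch of such 2-level nesting (on a synthetic dataset,
# assuming at least 2 CPU cores are available; this is not the OpenML task
# used in this tutorial):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=200, random_state=1)

# Inner level: each forest fits its trees on 2 cores
base = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=1)

# Outer level: the HPO wrapper evaluates candidate configurations on 2 cores
hpo = GridSearchCV(base, param_grid={"n_estimators": [5, 10]}, cv=2, n_jobs=2)
hpo.fit(X_demo, y_demo)
print(hpo.best_params_)
```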

#####################################################################
# Case 3: Running and benchmarking HPO algorithms with their runtimes
# *******************************************************************
# We shall now optimize a similar RandomForest model for the same task using
# scikit-learn HPO support by using GridSearchCV to optimize our earlier
# RandomForest model's parameter `n_estimators`. scikit-learn also provides a
# `refit_time_` for such HPO models, i.e., the time incurred by training
# and evaluating the model on the best found parameter setting. OpenML
# includes this in the `wall_clock_time_millis_training` measure recorded.
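############################################################################
# A short sketch of reading `refit_time_` directly from scikit-learn, on a
# synthetic dataset rather than the OpenML task:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
# With refit=True (the default), GridSearchCV retrains the best found
# configuration on the full data and stores the time taken in `refit_time_`
search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid={"n_estimators": [5, 10]},
    cv=2,
    refit=True,
)
search.fit(X_demo, y_demo)
print(search.refit_time_)  # in seconds, unlike OpenML's millisecond measures
```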
print_compare_runtimes(measures)
print()

##################################################################################
# Like any optimisation problem, scikit-learn's HPO estimators also generate
# a sequence of configurations which are evaluated, and from which the best found
# configuration is tracked throughout the trace.
# The OpenML run object stores these traces as OpenMLRunTrace objects accessible
# using keys of the pattern (repeat, fold, iterations). Here `fold` implies the
# outer cross-validation fold as obtained from the task data splits available in OpenML.
# Since we are running `GridSearchCV` here, which itself performs a 2-fold cross-validation,
# the runtime recorded per repeat-fold in the run object is for the entire `fit()`
# procedure of GridSearchCV, thus subsuming the runtimes of the inner 2-fold CV search.
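############################################################################
# A sketch of tracking the best found configuration over such keys; the trace
# below is hypothetical and its per-entry layout is illustrative, not the
# actual OpenMLRunTrace structure:

```python
# Hypothetical trace entries keyed by (repeat, fold, iteration), holding the
# configuration evaluated at each HPO iteration and its score
trace_demo = {
    (0, 3, 0): {"n_estimators": 5, "evaluation": 0.81},
    (0, 3, 1): {"n_estimators": 10, "evaluation": 0.86},
    (0, 3, 2): {"n_estimators": 20, "evaluation": 0.84},
}
# Track the best found configuration across the iterations of one repeat-fold
best_key, best_entry = max(trace_demo.items(), key=lambda kv: kv[1]["evaluation"])
print(best_key, best_entry)
```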

# We earlier extracted the number of repeats and folds for this task:
print("# repeats: {}\n# folds: {}".format(n_repeats, n_folds))

# To extract the training runtime of the first repeat, first fold:
print(run4.fold_evaluations["wall_clock_time_millis_training"][0][0])

##################################################################################
# To extract the training runtime of the 1-st repeat, 4-th (outer) fold and also
# to fetch the parameters and performance of the evaluations made during
# the 1-st repeat, 4-th fold evaluation by the Grid Search model.

_repeat = 0
_fold = 3
print(
    run4.fold_evaluations["wall_clock_time_millis_training"][_repeat][_fold]
)

############################################################################
# Along with the GridSearchCV already used above, we demonstrate how such
# optimisation traces can be retrieved by showing an application of these
# traces - comparing the speed of finding the best configuration using
)
run5 = openml.runs.run_model_on_task(model=rs_pipe, task=task)


################################################################################
# Since for the call to `openml.runs.run_model_on_task` the parameter
# `n_jobs` is set to its default None, the evaluations across the OpenML folds
# are not parallelized. Hence, the time recorded is agnostic to the `n_jobs`