|
| 1 | +from typing import NamedTuple |
| 2 | + |
| 3 | +from kfp.components import create_component_from_func |
| 4 | + |
def train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML(
    # AutoMLTabularTrainingJob.run required parameters
    dataset_name: 'GoogleCloudVertexAiTabularDatasetName',
    target_column: str,

    # AutoMLTabularTrainingJob.__init__ required parameters
    # display_name: str,
    optimization_prediction_type: str,

    # AutoMLTabularTrainingJob.run parameters
    training_fraction_split: float = 0.8,
    validation_fraction_split: float = 0.1,
    test_fraction_split: float = 0.1,
    predefined_split_column_name: str = None,
    weight_column: str = None,
    budget_milli_node_hours: int = 1000,
    model_display_name: str = None,
    disable_early_stopping: bool = False,

    # AutoMLTabularTrainingJob.__init__ parameters
    optimization_objective: str = None,
    #column_transformations: Union[Dict, List[Dict], NoneType] = None,
    optimization_objective_recall_value: float = None,
    optimization_objective_precision_value: float = None,

    project: str = None,
    location: str = 'us-central1',
    #training_encryption_spec_key_name: str = None,
    #model_encryption_spec_key_name: str = None,
    encryption_spec_key_name: str = None,
# BUGFIX: the function returns a 3-tuple, but only two outputs were declared,
# which makes KFP reject the component's return value at runtime. Declaring
# the third output (model_web_url) is backward-compatible for existing callers.
) -> NamedTuple('Outputs', [
    ('model_name', 'GoogleCloudVertexAiModelName'),
    ('model_dict', dict),
    ('model_web_url', str),
]):
    '''Trains model using Google Cloud Vertex AI AutoML.

    Data fraction splits:
    Any of ``training_fraction_split``, ``validation_fraction_split`` and
    ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If
    the provided ones sum to less than 1, the remainder is assigned to sets as
    decided by Vertex AI. If none of the fractions are set, by default roughly 80%
    of data will be used for training, 10% for validation, and 10% for test.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        dataset_name:
            Required. The full name of dataset (datasets.TabularDataset) within the same Project from which data will be used to train the Model. The
            Dataset must use schema compatible with Model being trained,
            and what is compatible should be described in the used
            TrainingPipeline's [training_task_definition]
            [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
            For tabular Datasets, all their data is exported to
            training, to pick and choose from.
        target_column (str):
            Required. The name of the column values of which the Model is to predict.
        training_fraction_split (float):
            Required. The fraction of the input data that is to be
            used to train the Model. This is ignored if Dataset is not provided.
        validation_fraction_split (float):
            Required. The fraction of the input data that is to be
            used to validate the Model. This is ignored if Dataset is not provided.
        test_fraction_split (float):
            Required. The fraction of the input data that is to be
            used to evaluate the Model. This is ignored if Dataset is not provided.
        predefined_split_column_name (str):
            Optional. The key is a name of one of the Dataset's data
            columns. The value of the key (either the label's value or
            value in the column) must be one of {``training``,
            ``validation``, ``test``}, and it defines to which set the
            given piece of data is assigned. If for a piece of data the
            key is not present or has an invalid value, that piece is
            ignored by the pipeline.

            Supported only for tabular and time series Datasets.
        weight_column (str):
            Optional. Name of the column that should be used as the weight column.
            Higher values in this column give more importance to the row
            during Model training. The column must have numeric values between 0 and
            10000 inclusively, and 0 value means that the row is ignored.
            If the weight column field is not set, then all rows are assumed to have
            equal weight of 1.
        budget_milli_node_hours (int):
            Optional. The train budget of creating this Model, expressed in milli node
            hours i.e. 1,000 value in this field means 1 node hour.
            The training cost of the model will not exceed this budget. The final
            cost will be attempted to be close to the budget, though may end up
            being (even) noticeably smaller - at the backend's discretion. This
            especially may happen when further model training ceases to provide
            any improvements.
            If the budget is set to a value known to be insufficient to train a
            Model for the given training set, the training won't be attempted and
            will error.
            The minimum value is 1000 and the maximum is 72000.
        model_display_name (str):
            Optional. If the script produces a managed Vertex AI Model. The display name of
            the Model. The name can be up to 128 characters long and can be consist
            of any UTF-8 characters.

            If not provided upon creation, the job's display_name is used.
        disable_early_stopping (bool):
            Required. If true, the entire budget is used. This disables the early stopping
            feature. By default, the early stopping feature is enabled, which means
            that training might stop before the entire training budget has been
            used, if further training does no longer brings significant improvement
            to the model.

        optimization_prediction_type (str):
            The type of prediction the Model is to produce.
            "classification" - Predict one out of multiple target values is
            picked for each row.
            "regression" - Predict a value based on its relation to other values.
            This type is available only to columns that contain
            semantically numeric values, i.e. integers or floating
            point number, even if stored as e.g. strings.
        optimization_objective (str):
            Optional. Objective function the Model is to be optimized towards. The training
            task creates a Model that maximizes/minimizes the value of the objective
            function over the validation set.

            The supported optimization objectives depend on the prediction type, and
            in the case of classification also the number of distinct values in the
            target column (two distinct values -> binary, 3 or more distinct values
            -> multi class).
            If the field is not set, the default objective function is used.

            Classification (binary):
            "maximize-au-roc" (default) - Maximize the area under the receiver
            operating characteristic (ROC) curve.
            "minimize-log-loss" - Minimize log loss.
            "maximize-au-prc" - Maximize the area under the precision-recall curve.
            "maximize-precision-at-recall" - Maximize precision for a specified
            recall value.
            "maximize-recall-at-precision" - Maximize recall for a specified
            precision value.

            Classification (multi class):
            "minimize-log-loss" (default) - Minimize log loss.

            Regression:
            "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
            "minimize-mae" - Minimize mean-absolute error (MAE).
            "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
        column_transformations (Optional[Union[Dict, List[Dict]]]):
            Optional. Transformations to apply to the input columns (i.e. columns other
            than the targetColumn). Each transformation may produce multiple
            result values from the column's value, and all are used for training.
            When creating transformation for BigQuery Struct column, the column
            should be flattened using "." as the delimiter.
            If an input column has no transformations on it, such a column is
            ignored by the training, except for the targetColumn, which should have
            no transformations defined on.
        optimization_objective_recall_value (float):
            Optional. Required when maximize-precision-at-recall optimizationObjective was
            picked, represents the recall value at which the optimization is done.

            The minimum value is 0 and the maximum is 1.0.
        optimization_objective_precision_value (float):
            Optional. Required when maximize-recall-at-precision optimizationObjective was
            picked, represents the precision value at which the optimization is
            done.

            The minimum value is 0 and the maximum is 1.0.

    Returns:
        model_name: Model name (fully-qualified)
        model_dict: Model metadata in JSON format
        model_web_url: Link to the model's evaluation page in the Google Cloud Console
    '''

    import datetime
    import logging

    from google.cloud import aiplatform
    from google.protobuf import json_format

    logging.getLogger().setLevel(logging.INFO)

    # Generate a unique default display name so repeated runs don't collide.
    if not model_display_name:
        model_display_name = 'TablesModel_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

    aiplatform.init(
        project=project,
        location=location,
        encryption_spec_key_name=encryption_spec_key_name,
    )

    # AutoMLTabularTrainingJob.run blocks until training finishes and returns
    # the resulting aiplatform.Model.
    model = aiplatform.AutoMLTabularTrainingJob(
        display_name='AutoMLTabularTrainingJob_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
        optimization_prediction_type=optimization_prediction_type,
        optimization_objective=optimization_objective,
        #column_transformations=column_transformations,
        optimization_objective_recall_value=optimization_objective_recall_value,
        optimization_objective_precision_value=optimization_objective_precision_value,
    ).run(
        dataset=aiplatform.TabularDataset(dataset_name=dataset_name),
        target_column=target_column,
        training_fraction_split=training_fraction_split,
        validation_fraction_split=validation_fraction_split,
        test_fraction_split=test_fraction_split,
        predefined_split_column_name=predefined_split_column_name,
        weight_column=weight_column,
        budget_milli_node_hours=budget_milli_node_hours,
        model_display_name=model_display_name,
        disable_early_stopping=disable_early_stopping,
    )

    # Resource name format: projects/{project}/locations/{location}/models/{id}
    (_, model_project, _, model_location, _, model_id) = model.resource_name.split('/')
    model_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{model_location}/models/{model_id}/evaluate?project={model_project}'
    logging.info(f'Created model {model.name}.')
    logging.info(f'Link: {model_web_url}')
    # Serialize the underlying protobuf resource so the output is plain JSON.
    model_json = json_format.MessageToJson(model._gca_resource._pb)
    print(model_json)
    return (model.resource_name, model_json, model_web_url)
| 219 | + |
| 220 | + |
if __name__ == '__main__':
    # Component metadata attached to the generated component.yaml.
    component_annotations = {
        "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
        "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Train_model/component.yaml",
    }
    # TODO: Update after my fix for TabularDataset.column_names bug is released https://github.com/googleapis/python-aiplatform/pull/590
    pinned_packages = ['git+https://github.com/googleapis/python-aiplatform.git@refs/pull/590/head']

    train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML_op = create_component_from_func(
        func=train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML,
        base_image='python:3.9',
        packages_to_install=pinned_packages,
        output_component_file='component.yaml',
        annotations=component_annotations,
    )
0 commit comments