Commit bc4e012

Google Cloud - Vertex_AI - AutoML - Tables - Added the "Train_model" component
There is a hack to work around issue googleapis/python-aiplatform#589, which I fixed in googleapis/python-aiplatform#590.
1 parent: e68d53b

2 files changed: 756 additions & 0 deletions

File shown below: 232 additions & 0 deletions
@@ -0,0 +1,232 @@
from typing import NamedTuple

from kfp.components import create_component_from_func

def train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML(
    # AutoMLTabularTrainingJob.run required parameters
    dataset_name: 'GoogleCloudVertexAiTabularDatasetName',
    target_column: str,

    # AutoMLTabularTrainingJob.__init__ required parameters
    # display_name: str,
    optimization_prediction_type: str,

    # AutoMLTabularTrainingJob.run parameters
    training_fraction_split: float = 0.8,
    validation_fraction_split: float = 0.1,
    test_fraction_split: float = 0.1,
    predefined_split_column_name: str = None,
    weight_column: str = None,
    budget_milli_node_hours: int = 1000,
    model_display_name: str = None,
    disable_early_stopping: bool = False,

    # AutoMLTabularTrainingJob.__init__ parameters
    optimization_objective: str = None,
    #column_transformations: Union[Dict, List[Dict], NoneType] = None,
    optimization_objective_recall_value: float = None,
    optimization_objective_precision_value: float = None,

    project: str = None,
    location: str = 'us-central1',
    #training_encryption_spec_key_name: str = None,
    #model_encryption_spec_key_name: str = None,
    encryption_spec_key_name: str = None,
) -> NamedTuple('Outputs', [
    ('model_name', 'GoogleCloudVertexAiModelName'),
    ('model_dict', dict),
]):
    '''Trains a model using Google Cloud Vertex AI AutoML.

    Data fraction splits:
        Any of ``training_fraction_split``, ``validation_fraction_split`` and
        ``test_fraction_split`` may optionally be provided; together they must
        sum to at most 1. If the provided fractions sum to less than 1, the
        remainder is assigned to sets as decided by Vertex AI. If none of the
        fractions are set, by default roughly 80% of the data will be used for
        training, 10% for validation, and 10% for test.

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>

    Args:
        dataset_name:
            Required. The full resource name of the dataset
            (datasets.TabularDataset) within the same Project from which data
            will be used to train the Model. The Dataset must use a schema
            compatible with the Model being trained, and what is compatible
            should be described in the used TrainingPipeline's
            [training_task_definition]
            [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
            For tabular Datasets, all their data is exported to training, to
            pick and choose from.
        target_column (str):
            Required. The name of the column whose values the Model is to
            predict.
        training_fraction_split (float):
            Required. The fraction of the input data that is to be used to
            train the Model. This is ignored if Dataset is not provided.
        validation_fraction_split (float):
            Required. The fraction of the input data that is to be used to
            validate the Model. This is ignored if Dataset is not provided.
        test_fraction_split (float):
            Required. The fraction of the input data that is to be used to
            evaluate the Model. This is ignored if Dataset is not provided.
        predefined_split_column_name (str):
            Optional. The key is a name of one of the Dataset's data columns.
            The value of the key (either the label's value or value in the
            column) must be one of {``training``, ``validation``, ``test``},
            and it defines to which set the given piece of data is assigned.
            If for a piece of data the key is not present or has an invalid
            value, that piece is ignored by the pipeline.

            Supported only for tabular and time series Datasets.
        weight_column (str):
            Optional. Name of the column that should be used as the weight
            column. Higher values in this column give more importance to the
            row during Model training. The column must have numeric values
            between 0 and 10000 inclusively, and a value of 0 means that the
            row is ignored. If the weight column field is not set, then all
            rows are assumed to have an equal weight of 1.
        budget_milli_node_hours (int):
            Optional. The train budget of creating this Model, expressed in
            milli node hours, i.e. a value of 1,000 in this field means 1 node
            hour. The training cost of the model will not exceed this budget.
            The final cost will be attempted to be close to the budget, though
            it may end up being (even) noticeably smaller, at the backend's
            discretion. This especially may happen when further model training
            ceases to provide any improvements. If the budget is set to a
            value known to be insufficient to train a Model for the given
            training set, the training won't be attempted and will error.
            The minimum value is 1000 and the maximum is 72000.
        model_display_name (str):
            Optional. The display name of the produced managed Vertex AI
            Model. The name can be up to 128 characters long and can consist
            of any UTF-8 characters.

            If not provided upon creation, the job's display_name is used.
        disable_early_stopping (bool):
            Optional. If true, the entire budget is used. This disables the
            early stopping feature. By default, the early stopping feature is
            enabled, which means that training might stop before the entire
            training budget has been used, if further training no longer
            brings significant improvement to the model.

        optimization_prediction_type (str):
            The type of prediction the Model is to produce.
            "classification" - One out of multiple target values is picked
                for each row.
            "regression" - Predict a value based on its relation to other
                values. This type is available only to columns that contain
                semantically numeric values, i.e. integers or floating point
                numbers, even if stored as e.g. strings.
        optimization_objective (str):
            Optional. Objective function the Model is to be optimized towards.
            The training task creates a Model that maximizes/minimizes the
            value of the objective function over the validation set.

            The supported optimization objectives depend on the prediction
            type and, in the case of classification, also on the number of
            distinct values in the target column (two distinct values ->
            binary, 3 or more distinct values -> multi class).
            If the field is not set, the default objective function is used.

            Classification (binary):
            "maximize-au-roc" (default) - Maximize the area under the receiver
                operating characteristic (ROC) curve.
            "minimize-log-loss" - Minimize log loss.
            "maximize-au-prc" - Maximize the area under the precision-recall
                curve.
            "maximize-precision-at-recall" - Maximize precision for a
                specified recall value.
            "maximize-recall-at-precision" - Maximize recall for a specified
                precision value.

            Classification (multi class):
            "minimize-log-loss" (default) - Minimize log loss.

            Regression:
            "minimize-rmse" (default) - Minimize root-mean-squared error
                (RMSE).
            "minimize-mae" - Minimize mean-absolute error (MAE).
            "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
        column_transformations (Optional[Union[Dict, List[Dict]]]):
            Optional. Transformations to apply to the input columns (i.e.
            columns other than the targetColumn). Each transformation may
            produce multiple result values from the column's value, and all
            are used for training. When creating a transformation for a
            BigQuery Struct column, the column should be flattened using "."
            as the delimiter. If an input column has no transformations on it,
            such a column is ignored by the training, except for the
            targetColumn, which should have no transformations defined on it.
        optimization_objective_recall_value (float):
            Optional. Required when the maximize-precision-at-recall
            optimizationObjective was picked; represents the recall value at
            which the optimization is done.

            The minimum value is 0 and the maximum is 1.0.
        optimization_objective_precision_value (float):
            Optional. Required when the maximize-recall-at-precision
            optimizationObjective was picked; represents the precision value
            at which the optimization is done.

            The minimum value is 0 and the maximum is 1.0.

    Returns:
        model_name: Model name (fully-qualified)
        model_dict: Model metadata in JSON format
    '''
    import datetime
    import logging

    from google.cloud import aiplatform
    from google.protobuf import json_format

    logging.getLogger().setLevel(logging.INFO)

    if not model_display_name:
        model_display_name = 'TablesModel_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

    aiplatform.init(
        project=project,
        location=location,
        encryption_spec_key_name=encryption_spec_key_name,
    )

    model = aiplatform.AutoMLTabularTrainingJob(
        display_name='AutoMLTabularTrainingJob_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
        optimization_prediction_type=optimization_prediction_type,
        optimization_objective=optimization_objective,
        #column_transformations=column_transformations,
        optimization_objective_recall_value=optimization_objective_recall_value,
        optimization_objective_precision_value=optimization_objective_precision_value,
    ).run(
        dataset=aiplatform.TabularDataset(dataset_name=dataset_name),
        target_column=target_column,
        training_fraction_split=training_fraction_split,
        validation_fraction_split=validation_fraction_split,
        test_fraction_split=test_fraction_split,
        predefined_split_column_name=predefined_split_column_name,
        weight_column=weight_column,
        budget_milli_node_hours=budget_milli_node_hours,
        model_display_name=model_display_name,
        disable_early_stopping=disable_early_stopping,
    )

    # The model resource name has the format:
    # projects/<project>/locations/<location>/models/<model_id>
    (_, model_project, _, model_location, _, model_id) = model.resource_name.split('/')
    model_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{model_location}/models/{model_id}/evaluate?project={model_project}'
    logging.info(f'Created model {model.name}.')
    logging.info(f'Link: {model_web_url}')
    model_json = json_format.MessageToJson(model._gca_resource._pb)
    print(model_json)
    # Return only the two declared outputs; the Web console URL is logged above.
    return (model.resource_name, model_json)


if __name__ == '__main__':
    train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML_op = create_component_from_func(
        train_tabular_model_using_Google_Cloud_Vertex_AI_AutoML,
        base_image='python:3.9',
        # TODO: Update after my fix for the TabularDataset.column_names bug is released: https://github.com/googleapis/python-aiplatform/pull/590
        packages_to_install=['git+https://github.com/googleapis/python-aiplatform.git@refs/pull/590/head'],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Train_model/component.yaml",
        },
    )
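
Usage note (not part of this commit): a minimal sketch of how the generated component could be wired into a KFP v1 pipeline, assuming the component.yaml produced by the `if __name__ == '__main__'` block above has been published at its canonical location. The dataset resource name, target column, and output package path below are placeholder assumptions.

import kfp
from kfp import components

# Load the component from the canonical location declared in the annotations above.
train_tabular_model_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Train_model/component.yaml'
)

def train_pipeline():
    # Placeholder values; substitute a real TabularDataset resource name and target column.
    train_tabular_model_op(
        dataset_name='projects/<project>/locations/us-central1/datasets/<dataset_id>',
        target_column='<target_column>',
        optimization_prediction_type='classification',
    )

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(train_pipeline, 'train_pipeline.yaml')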
