11# ##################################################################################
22# ## Goal: demonstrate usage of H2O's Random Forest and GBM algorithms
3- # ## Task: predict
3+ # ## Task: Predicting forest cover type from cartographic variables only
4+ # # The actual forest cover type for a given observation
5+ # # (30 x 30 meter cell) was determined from US Forest Service (USFS) data.
46
57# # H2O is an R package
68library(h2o )
@@ -10,9 +12,9 @@ h2o.init(
1012 max_mem_size = " 2G" ) # # specify the memory size for the H2O cloud
1113
1214# # Load a file from disk
13- df <- h2o.importFile(" covtype.full.csv" )
15+ df <- h2o.importFile(" ../data/ covtype.full.csv" )
1416
15- # # To run machine learning , we will create three splits for train/test/valid
17+ # # First , we will create three splits for train/test/valid independent data sets.
1618# # We will train a model on one set and use the others to test the validity
1719# # of the model by ensuring that it can predict accurately on data the model
1820# # has not been shown.
@@ -189,18 +191,18 @@ h2o.hit_ratio_table(gbm3,valid = T)[1,2] ## review the newest model's accurac
189191# # is being run. The default for classification is the square root of the
190192# # number of columns. The default for regression is one-third of the columns.
191193
192- rf2 <- h2o.randomForest(
193- training_frame = train ,
194- validation_frame = valid ,
195- x = 1 : 12 ,
196- y = 13 ,
197- model_id = " rf_covType2" ,
198- ntrees = 200 ,
199- max_depth = 30 ,
200- stopping_rounds = 2 ,
201- stopping_tolerance = 1e-2 ,
202- score_each_iteration = T ,
203- seed = 3000000 )
194+ rf2 <- h2o.randomForest( # # Second RF model: deeper trees plus early stopping
195+ training_frame = train , # # the H2OFrame the model is built on
196+ validation_frame = valid , # # H2OFrame used for scoring and early stopping
197+ x = 1 : 12 , # # predictor columns, by index
198+ y = 13 , # # response column, by index
199+ model_id = " rf_covType2" , # # name this model within the H2O cloud
200+ ntrees = 200 , # # cap of 200 trees; stopping may build fewer
201+ max_depth = 30 , # # Increase depth, from 20
202+ stopping_rounds = 2 , # # halt when 2 scoring rounds show no improvement
203+ stopping_tolerance = 1e-2 , # # ...of at least 1% on the stopping metric
204+ score_each_iteration = T , # # score after each tree so stopping can trigger
205+ seed = 3000000 ) # # fix the seed for reproducibility
204206# ##############################################################################
205207summary(rf2 )
206208h2o.hit_ratio_table(gbm3 ,valid = T )[1 ,2 ] # # review the newest GBM accuracy
@@ -209,6 +211,29 @@ h2o.hit_ratio_table(rf2,valid = T)[1,2] ## newest random forest accuracy
209211# ##############################################################################
210212
211213# # So we now have our accuracy up beyond 95%.
214+ # # We have withheld an extra test set to ensure that, after all the parameter
215+ # # tuning we have done (repeatedly applied to the validation data), our
216+ # # model produces similar results against the third data set.
217+
218+ # # Create predictions using our latest RF model against the test set.
218+ # # Create predictions using our latest RF model against the test set.
219+ finalRf_predictions <- h2o.predict(
220+ object = rf2 # # the tuned random forest model
221+ ,newdata = test ) # # the withheld test set, unseen during tuning
222+
223+ # # Glance at what that prediction set looks like
224+ # # We see a final prediction in the "predict" column,
225+ # # and then the predicted probabilities per class.
226+ finalRf_predictions
227+
228+ # # Compare these predictions to the accuracy we got from our experimentation
229+ h2o.hit_ratio_table(rf2 ,valid = T )[1 ,2 ] # # validation set accuracy
230+ mean(finalRf_predictions $ predict == test $ Cover_Type ) # # test set accuracy
231+
232+ # # We have very similar error rates on both sets, so it would not seem
233+ # # that we have overfit the validation set through our experimentation.
234+ # #
235+ # # This concludes the demo, but what might we try next, if we were to continue?
236+ # #
212237# # We could further experiment with deeper trees or a higher percentage of
213238# # columns used (mtries).
214239# # Also we could experiment with the nbins and nbins_cats settings to control
0 commit comments