Skip to content

Commit fcd0f98

Browse files
author
avniwadhwa
committed
airlines demo R template
1 parent c79dfb8 commit fcd0f98

1 file changed

Lines changed: 87 additions & 0 deletions

File tree

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
## Code for scatterplots
2+
#' Scatter plot of two columns of an H2O frame, drawn from a random row sample.
#'
#' Only the sampled rows are pulled into R, so the frame can be much larger
#' than what is comfortable to plot locally.
#'
#' @param data H2O frame containing both columns.
#' @param x,y Column names (single strings) for the horizontal and vertical
#'   axes. `y` is also the response when `fit = TRUE`.
#' @param max_points Target number of rows to plot. Each row is kept with
#'   probability `max_points / nrow(data)`, so the actual count varies.
#' @param fit If `TRUE`, fit a gaussian `h2o.glm` of `y` on `x` and overlay
#'   the fitted line.
#' @param ylim y-axis limits passed to `plot()`. The default keeps the
#'   previously hard-coded range; pass `NULL` to let `plot()` choose.
#' @return `NULL`, invisibly. Called for its plotting side effect.
scatter_plot <- function(data, x, y, max_points = 1000, fit = FALSE,
                         ylim = c(0, 550)) {
  stopifnot(
    is.character(x), length(x) == 1L,
    is.character(y), length(y) == 1L
  )
  n_rows <- nrow(data)
  if (n_rows == 0) {
    stop("`data` has no rows to plot.", call. = FALSE)
  }

  line_coef <- NULL
  if (fit) {
    lr <- h2o.glm(x = x, y = y, training_frame = data, family = "gaussian")
    # Remove the temporary model from the cluster even if a later step fails.
    on.exit(h2o.rm(lr@model_id), add = TRUE)
    # The plot axes are in the columns' original units, so the line must use
    # the de-standardized coefficients, not the standardized ones.
    coef_table <- lr@model$coefficients_table
    coefs <- setNames(coef_table$coefficients, coef_table$names)
    # Look up by name so the intercept/slope order never depends on row order.
    line_coef <- c(coefs[["Intercept"]], coefs[[x]])
  }

  cols <- data[, c(x, y)]
  # Bernoulli sampling on the cluster: keep rows whose uniform draw falls
  # below the target fraction (a fraction >= 1 keeps every row).
  keep <- h2o.runif(cols) < max_points / n_rows
  sampled <- as.data.frame(cols[keep, ])

  plot(
    x = sampled[[x]], y = sampled[[y]], col = "blue",
    xlab = x, ylab = y, ylim = ylim
  )
  if (fit) abline(coef = line_coef, col = "black")
  invisible(NULL)
}
21+
22+
## Load library and initialize h2o
23+
library(h2o)
24+
# Start (or connect to) the H2O instance used by every h2o.* call below.
# NOTE(review): per the h2o.init documentation, nthreads = -1 requests all
# available cores -- confirm for the installed h2o version.
h2o.init(nthreads = -1)
25+
26+
27+
## Set file path and import data. Drop constant column (23).
28+
# Remote CSV location; importing requires network access to this URL.
pathToAirlines <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
29+
30+
# Import the CSV into an H2O frame stored under the key "airlines.hex".
airlines.hex <- h2o.importFile(path = pathToAirlines, destination_frame = "airlines.hex")
31+
32+
# Negative index drops the 23rd column. The position is hard-coded, so this
# silently drops a different column if the file layout changes.
airlines.hex <- airlines.hex[-23]
# Print the frame dimensions after the drop.
dim(airlines.hex)
34+
35+
36+
## Get a summary of the data. Build a histogram examining the "Year" column using h2o.hist()
37+
# Per-column summary of the imported frame.
summary(airlines.hex)
38+
39+
# Runs while "Year" is still in its imported type; the as.factor conversion
# of this column happens further down the script.
h2o.hist(airlines.hex$Year)
40+
41+
42+
## Scatter plot of airlines dataset examining the relationship between the "Distance" and "AirTime" columns
43+
# `fit` is left at its default, so no regression line is overlaid here.
scatter_plot(data = airlines.hex, x = "Distance", y = "AirTime", max_points = 10000)
44+
45+
46+
## Use h2o.group_by to calculate the number of flights in a given month
47+
48+
49+
## Use as.factor to change the "Year," "Month," "DayOfWeek," and "Cancelled" columns to factors
50+
# Convert each listed column to a factor in place, one column at a time.
for (factor_col in c("Year", "Month", "DayOfWeek", "Cancelled")) {
  airlines.hex[[factor_col]] <- as.factor(airlines.hex[[factor_col]])
}
54+
55+
## Calculate and plot travel time
56+
# Scheduled arrival as minutes: the value is split into a part above the
# hundreds (used as hours) and a remainder (used as minutes).
# NOTE(review): assumes CRSArrTime / CRSDepTime are HHMM-encoded -- confirm.
crs_arr <- airlines.hex$CRSArrTime
hour1 <- crs_arr %/% 100
mins1 <- crs_arr %% 100
arrTime <- hour1 * 60 + mins1

# Scheduled departure, converted the same way.
crs_dep <- airlines.hex$CRSDepTime
hour2 <- crs_dep %/% 100
mins2 <- crs_dep %% 100
depTime <- hour2 * 60 + mins2
63+
64+
65+
66+
67+
## Impute missing travel times by the "Origin" and "Dest" columns and re-plot.
68+
69+
70+
## Create test/train split
71+
72+
73+
## Set predictor and response variables
74+
# Name of the response column for the models below.
# NOTE(review): no predictor list is defined in the visible part of this
# template -- it must be set before any model is trained.
myY <- "IsDepDelayed"
75+
76+
77+
78+
## Simple GLM and GBM models - Predict Delays
79+
80+
81+
82+
## Get summary of models
83+
84+
85+
## Get variable importances for both models
86+
87+

0 commit comments

Comments
 (0)