Skip to content

Commit 371f7bd

Browse files
author
Amy Wang
committed
training material for hands on session
1 parent 8f7685e commit 371f7bd

1 file changed

Lines changed: 131 additions & 0 deletions

File tree

Lines changed: 131 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,131 @@
1+
library(h2o)
## Start (or attach to) an H2O instance; nthreads = -1 asks H2O to use all available cores.
h2o.init(nthreads = -1)

## Location of the Lending Club data set.
## If possible, download the file from this S3 link first and point `small_test` at the local copy.
small_test <- "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

## Task 1: Import Data
## Import without parsing (parse = FALSE) so the column types can be supplied explicitly
## in the h2o.parseRaw call below.
loanStats <- h2o.importFile(path = small_test, parse = FALSE)
## Types to force on the columns at parse time, listed positionally.
col_types <- c("numeric", "numeric", "numeric", "numeric", "numeric", "enum", "string", "numeric",
               "enum", "enum", "enum", "string", "enum", "numeric", "enum", "enum", "enum", "enum",
               "string", "enum", "enum", "enum", "enum", "enum", "numeric", "numeric", "enum",
               "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "string", "numeric",
               "enum", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
               "numeric", "numeric", "enum", "numeric", "enum", "enum", "numeric", "enum", "numeric")
loanStats <- h2o.parseRaw(data = loanStats, destination_frame = "loanStats", col.types = col_types)
16+
17+
## Task 2: Look at the levels in the response column loan_status
## Hint: h2o.table tabulates the column; as.data.frame returns the table to R
status_counts <- h2o.table(loanStats$loan_status)
as.data.frame(status_counts)

## Task 3: Drop the loans that are still ongoing, i.e. subset the data to the remaining statuses
## Hint: "Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)" are ongoing loans
ongoing_statuses <- c("Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)")
is_ongoing <- loanStats$loan_status %in% ongoing_statuses
loanStats <- loanStats[!is_ongoing, ]

## Task 4: Bin the response variable to good/bad loans only, use your best judgment for what is a good/bad loan
## Create a new binary column called bad_loan
## Hint: as.factor turns the bad_loan column into a factor
bad_statuses <- c("Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off")
loanStats$bad_loan <- loanStats$loan_status %in% bad_statuses
loanStats$bad_loan <- as.factor(loanStats$bad_loan)
30+
31+
## Task 5: String munging to clean string columns before converting to numeric
## Hint: Columns that need munging include "int_rate", "revol_util", "emp_length"

## Example for int_rate: split on "%", trim whitespace, then convert to numeric
int_rate_text <- h2o.trim(h2o.strsplit(loanStats$int_rate, split = "%"))
loanStats$int_rate <- as.numeric(int_rate_text)

## Now try for revol_util yourself (same three steps)
revol_util_text <- h2o.trim(h2o.strsplit(loanStats$revol_util, split = "%"))
loanStats$revol_util <- as.numeric(revol_util_text)

## Now we're going to clean up emp_length.
## Step 1: h2o.sub removes " year" / " years" and translates n/a to ""
emp_years <- h2o.sub(x = loanStats$emp_length, pattern = "([ ]*+[a-zA-Z].*)|(n/a)", replacement = "")
## Step 2: h2o.trim removes any trailing spaces
emp_years <- h2o.trim(emp_years)
## Step 3: h2o.sub converts "< 1" to 0 and "10+" to 10
## Hint: Be mindful of spaces between characters
emp_years <- h2o.sub(x = emp_years, pattern = "< 1", replacement = "0")
emp_years <- h2o.sub(x = emp_years, pattern = "10\\+", replacement = "10")
## Step 4: store the cleaned values back as a numeric column
loanStats$emp_length <- as.numeric(emp_years)
54+
55+
## Task 6: Convert string columns to dates
## Also create new feature called "credit_length_in_years"
## Hint: Use the columns "earliest_cr_line" and "issue_d"
## Split each column on "-" and keep the second piece, which is parsed below with the "%Y" format
## (presumably the raw values look like "Mon-YYYY" -- confirm against the data).
earliest_credit_year <- h2o.strsplit(x = loanStats$earliest_cr_line, split = "-")[, 2]
issue_year <- h2o.strsplit(x = loanStats$issue_d, split = "-")[, 2]
time1 <- as.Date(earliest_credit_year, format = "%Y")
time2 <- as.Date(issue_year, format = "%Y")
## Credit length = year the loan was issued minus year of the earliest credit line
loanStats$credit_length_in_years <- year(time2) - year(time1)
61+
62+
## Task 7: Use h2o.sub to create two levels for column "verification_status" ie "verified" and "not verified"
## Hint: Use h2o.table to examine levels within "verification_status", warning messages can be ignored
## The longer label is replaced first so that the shorter pattern cannot match inside it beforehand.
for (verified_label in c("VERIFIED - income source", "VERIFIED - income")) {
  loanStats$verification_status <- h2o.sub(x = loanStats$verification_status, pattern = verified_label, replacement = "verified")
}
## Round-trip the column through an R matrix and upload it again as a fresh H2O column.
loanStats$verification_status <- as.h2o(as.matrix(loanStats$verification_status))
67+
68+
## Task 8: Define your response and predictor variables
## Response: the binary good/bad loan flag
myY <- "bad_loan"
## Predictors: one column name per line, in the order they are handed to the model
myX <- c(
  "loan_amnt",
  "term",
  "home_ownership",
  "annual_inc",
  "verification_status",
  "purpose",
  "addr_state",
  "dti",
  "delinq_2yrs",
  "open_acc",
  "pub_rec",
  "revol_bal",
  "total_acc",
  "emp_length",
  "credit_length_in_years",
  "inq_last_6mths",
  "revol_util"
)
73+
74+
## Task 9: Do a test-train split (80-20)
## Hint: Use h2o.splitFrame ONLY once
## ratios = 0.8 puts about 80% of the rows in the first frame and the remainder in the second.
split <- h2o.splitFrame(loanStats, ratios = 0.8)
train <- split[[1]]
valid <- split[[2]]

## Task 10: Build model predicting good/bad loan
## Note: Use any of the classification methods available including GLM, GBM, Random Forest, and Deep Learning
## score_each_iteration = TRUE records metrics after every tree so the scoring history
## plotted in Task 11 has one point per iteration.
gbm_model <- h2o.gbm(x = myX, y = myY, training_frame = train, validation_frame = valid,
                     learn_rate = 0.05, score_each_iteration = TRUE, ntrees = 100)

## Task 11: Plot the scoring history to make sure you're not overfitting
## Hint: Use plot function on the model object
plot(gbm_model)
87+
88+
## Task 12: Plot the ROC curve for the binomial models and get auc using h2o.auc
## Hint: Use h2o.performance to grab the model metrics, then plot the metrics object
## A metrics object describes exactly one data set, so training and validation metrics are
## fetched separately; plotting the same object twice would draw the same curve twice.
perf <- h2o.performance(model = gbm_model)                      # training metrics (the default)
perf_valid <- h2o.performance(model = gbm_model, valid = TRUE)  # validation metrics
plot(perf, type = "roc")
plot(perf_valid, type = "roc")
## AUC on the training and on the validation frame
h2o.auc(gbm_model, train = TRUE)
h2o.auc(gbm_model, valid = TRUE)
95+
96+
## Task 13: Check the variable importance and generate confusion matrix for max F1 threshold
## Hint: Use h2o.varimp for non-GLM model and use h2o.confusionMatrix
variable_importance <- h2o.varimp(object = gbm_model)
variable_importance
confusion <- h2o.confusionMatrix(object = gbm_model)
confusion

## Task 14: Score the entire data set using the model
## Hint: Use h2o.predict.
pred <- h2o.predict(object = gbm_model, newdata = loanStats)
104+
105+
## Extra: Calculate the money gain/loss if model is implemented
## Money earned or lost per loan: total payments received minus the amount lent
loanStats$earned <- loanStats$total_pymnt - loanStats$loan_amnt

## Calculate how much money will be lost to false negative, vs how much will be saved due to true positives
## Attach the first column of the prediction frame as the predicted class
loanStats$pred <- pred[, 1]
## Sum `earned` for every (actual, predicted) combination and bring the small result table into R
earned_by_outcome <- h2o.group_by(data = loanStats, by = c("bad_loan", "pred"), gb.control = list(na.methods = "ignore"), sum("earned"))
net <- as.data.frame(earned_by_outcome)
## Look up the summed `earned` (third column of `net`) for one actual/predicted combination
earned_for <- function(actual, predicted) {
  net[net$bad_loan == actual & net$pred == predicted, 3]
}
n1 <- earned_for(0, 0)  # actual good, predicted good
n2 <- earned_for(0, 1)  # actual good, predicted bad
n3 <- earned_for(1, 1)  # actual bad,  predicted bad
n4 <- earned_for(1, 0)  # actual bad,  predicted good
116+
117+
118+
## Pretty-print a numeric amount as a dollar figure (without the "$" sign).
## The sign is discarded, the value is rounded to two decimals and formatted with
## "," as thousands separator and "." as decimal mark.
printMoney <- function(x) {
  magnitude <- abs(x)
  rounded <- round(magnitude, digits = 2)
  format(rounded, big.mark = ",", decimal.mark = ".", nsmall = 2, digits = 10)
}
123+
124+
## Print one labelled dollar amount; `label` already ends with the "$" sign
report_amount <- function(label, amount) {
  print(paste0(label, printMoney(amount), ""))
}

## Calculate the amount of earned
report_amount("Total amount of profit still earned using the model : $", n1)
report_amount("Total amount of profit forfeitted using the model : $", n2)
report_amount("Total amount of loss that could have been prevented : $", n3)
report_amount("Total amount of loss that still would've accrued : $", n4)
## Calculate Net
## NOTE(review): printMoney() discards the sign, so a negative net is printed as a positive
## amount. n3 and n4 are sums of `earned` over bad loans and are presumably negative, while the
## lines above print their magnitudes -- confirm whether the net should combine the signed
## sums (as it does here) or the magnitudes.
print(paste0("Total profit by implementing model : $", printMoney(n1 - n2 + n3 - n4)))
131+

0 commit comments

Comments
 (0)