## Lending Club loan-default walkthrough with H2O (R)
## Load the H2O R client and start (or connect to) an H2O instance.
library(h2o)
h2o.init(nthreads = -1)

## If possible download from the s3 link and change the path to the dataset.
small_test <- "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

## Task 1: Import Data
## The file is imported unparsed first (parse = FALSE) so that an explicit type
## for each of the 52 columns can be handed to h2o.parseRaw below.
loanStats <- h2o.importFile(path = small_test, parse = FALSE)
col_types <- c("numeric", "numeric", "numeric", "numeric", "numeric", "enum", "string", "numeric",
               "enum", "enum", "enum", "string", "enum", "numeric", "enum", "enum", "enum", "enum",
               "string", "enum", "enum", "enum", "enum", "enum", "numeric", "numeric", "enum",
               "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "string", "numeric",
               "enum", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
               "numeric", "numeric", "enum", "numeric", "enum", "enum", "numeric", "enum", "numeric")
loanStats <- h2o.parseRaw(data = loanStats, destination_frame = "loanStats", col.types = col_types)
## Task 2: Look at the levels in the response column loan_status
## Hint: Use h2o.table function on the response column, use as.data.frame to return the table to R
status_counts <- h2o.table(loanStats$loan_status)
as.data.frame(status_counts)

## Task 3: Filter out all loans that are completed, aka subset data
## Hint: the statuses listed below are ongoing loans; rows carrying them are dropped
ongoing_statuses <- c("Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)")
is_ongoing <- loanStats$loan_status %in% ongoing_statuses
loanStats <- loanStats[!is_ongoing, ]

## Task 4: Bin the response variable to good/bad loans only, use your best judgment for what is a good/bad loan
## bad_loan flags rows whose loan_status is one of the statuses below, and is
## then converted to a factor so it can serve as a binary response.
bad_statuses <- c("Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off")
loanStats$bad_loan <- loanStats$loan_status %in% bad_statuses
loanStats$bad_loan <- as.factor(loanStats$bad_loan)
## Task 5: String munging to clean string columns before converting to numeric
## Hint: Columns that need munging includes "int_rate", "revol_util", "emp_length"

## int_rate and revol_util get the same treatment, one column at a time:
## split on "%", trim whitespace, then convert to numeric.
for (pct_col in c("int_rate", "revol_util")) {
  loanStats[[pct_col]] <- h2o.strsplit(loanStats[[pct_col]], split = "%")
  loanStats[[pct_col]] <- h2o.trim(loanStats[[pct_col]])
  loanStats[[pct_col]] <- as.numeric(loanStats[[pct_col]])
}

## Now clean up emp_length.
## First strip everything from the first letter onwards (together with any
## spaces directly before it); the pattern also lists "n/a" explicitly.
loanStats$emp_length <- h2o.sub(x = loanStats$emp_length, pattern = "([ ]*+[a-zA-Z].*)|(n/a)", replacement = "")
## Remove any leftover surrounding spaces
loanStats$emp_length <- h2o.trim(loanStats$emp_length)
## Map "< 1" to 0 and "10+" to 10 (patterns are applied in this order)
## Hint: Be mindful of spaces between characters
emp_length_fixes <- c("< 1" = "0", "10\\+" = "10")
for (emp_pattern in names(emp_length_fixes)) {
  loanStats$emp_length <- h2o.sub(x = loanStats$emp_length, pattern = emp_pattern,
                                  replacement = emp_length_fixes[[emp_pattern]])
}
loanStats$emp_length <- as.numeric(loanStats$emp_length)
## Task 6: Convert string columns to dates
## Also create new feature called "credit_length_in_years"
## Hint: Use the columns "earliest_cr_line" and "issue_d"
## Both columns are split on "-" and the second piece is parsed with "%Y"
## (presumably the values look like "Mon-YYYY" -- confirm against the data).
earliest_year_part <- h2o.strsplit(x = loanStats$earliest_cr_line, split = "-")[, 2]
issue_year_part <- h2o.strsplit(x = loanStats$issue_d, split = "-")[, 2]
earliest_credit_date <- as.Date(earliest_year_part, format = "%Y")
issue_date <- as.Date(issue_year_part, format = "%Y")
## Credit length = issue year minus year of the earliest credit line
loanStats$credit_length_in_years <- year(issue_date) - year(earliest_credit_date)
## Task 7: Use h2o.sub to create two levels for column "verification_status" ie "verified" and "not verified"
## Hint: Use h2o.table to examine levels within "verification_status", warning messages can be ignored
## The longer label goes first: "VERIFIED - income" is a prefix of
## "VERIFIED - income source", so the order of the two substitutions matters.
for (verified_label in c("VERIFIED - income source", "VERIFIED - income")) {
  loanStats$verification_status <- h2o.sub(x = loanStats$verification_status,
                                           pattern = verified_label, replacement = "verified")
}
## Round-trip the column through an R matrix and back into H2O
loanStats$verification_status <- as.h2o(as.matrix(loanStats$verification_status))
## Task 8: Define your response and predictor variables
myY <- "bad_loan"
myX <- c(
  "loan_amnt", "term", "home_ownership", "annual_inc", "verification_status", "purpose",
  "addr_state", "dti", "delinq_2yrs", "open_acc", "pub_rec", "revol_bal", "total_acc",
  "emp_length", "credit_length_in_years", "inq_last_6mths", "revol_util"
)

## Task 9: Do a test-train split (80-20)
## Hint: Use h2o.splitFrame ONLY once
## The first piece becomes the training frame, the second the validation frame.
frame_parts <- h2o.splitFrame(loanStats, ratios = 0.8)
train <- frame_parts[[1]]
valid <- frame_parts[[2]]
## Task 10: Build model predicting good/bad loan
## Note: Use any of the classification methods available including GLM, GBM, Random Forest, and Deep Learning
gbm_model <- h2o.gbm(x = myX, y = myY, training_frame = train, validation_frame = valid,
                     learn_rate = 0.05, score_each_iteration = TRUE, ntrees = 100)

## Task 11: Plot the scoring history to make sure you're not overfitting
## Hint: Use plot function on the model object
plot(gbm_model)

## Task 12: Plot the ROC curve for the binomial models and get auc using h2o.auc
## Hint: Use h2o.performance and plot to grab the modelmetrics and then plotting the modelmetrics
## h2o.performance() without further arguments yields the training metrics, so
## a second metrics object is requested with valid = TRUE; plotting the same
## object twice would only draw the training ROC curve twice.
perf <- h2o.performance(model = gbm_model)
perf_valid <- h2o.performance(model = gbm_model, valid = TRUE)
plot(perf)
plot(perf_valid)
h2o.auc(gbm_model, train = TRUE)
h2o.auc(gbm_model, valid = TRUE)

## Task 13: Check the variable importance and generate confusion matrix for max F1 threshold
## Hint: Use h2o.varimp for non-GLM model and use h2o.confusionMatrix
h2o.varimp(gbm_model)
h2o.confusionMatrix(gbm_model)
## Task 14: Score the entire data set using the model
## Hint: Use h2o.predict.
pred <- h2o.predict(gbm_model, loanStats)

## Extra: Calculate the money gain/loss if model is implemented
## Calculate the total amount of money earned or lost per loan
loanStats$earned <- loanStats$total_pymnt - loanStats$loan_amnt

## Calculate how much money will be lost to false negative, vs how much will be saved due to true positives
## The first column of the prediction frame is stored as the predicted label.
loanStats$pred <- pred[, 1]
net <- as.data.frame(h2o.group_by(data = loanStats, by = c("bad_loan", "pred"),
                                  gb.control = list(na.methods = "ignore"), sum("earned")))

## Look up the summed "earned" (third column of `net`) for one actual/predicted
## combination. A combination that never occurs has no row in `net`; 0 is
## returned in that case so the report below still prints a number instead of
## an empty amount. which() also discards rows where the comparison is NA.
earned_for <- function(actual, predicted) {
  total <- net[which(net$bad_loan == actual & net$pred == predicted), 3]
  if (length(total) == 0) 0 else total
}

n1 <- earned_for(0, 0)  # good loans predicted good
n2 <- earned_for(0, 1)  # good loans predicted bad
n3 <- earned_for(1, 1)  # bad loans predicted bad
n4 <- earned_for(1, 0)  # bad loans predicted good
## Function defined to pretty print numerics as dollars
##
## x: numeric amount(s). The sign is discarded (abs) and the value is rounded
##    to 2 decimal places before formatting.
## Returns a character vector with "," as thousands separator and at least two
## decimals, e.g. "1,234.50".
printMoney <- function(x) {
  amount <- round(abs(x), 2)
  ## scientific = FALSE: without it, amounts with few significant digits
  ## (e.g. 1e6) come out as "1e+06" rather than "1,000,000.00".
  format(amount, digits = 10, nsmall = 2, decimal.mark = ".", big.mark = ",",
         scientific = FALSE)
}
## Print one report line: the label followed by the dollar-formatted amount.
report_amount <- function(label, amount) {
  invisible(print(paste0(label, printMoney(amount))))
}

## Calculate the amount of earned
report_amount("Total amount of profit still earned using the model : $", n1)
report_amount("Total amount of profit forfeitted using the model : $", n2)
report_amount("Total amount of loss that could have been prevented : $", n3)
report_amount("Total amount of loss that still would've accrued : $", n4)
## Calculate Net
## NOTE(review): printMoney() drops the sign, so a negative net prints as a
## positive amount; also confirm the signs in n1 - n2 + n3 - n4 match the
## intended definition of "profit by implementing model".
report_amount("Total profit by implementing model : $", n1 - n2 + n3 - n4)
## 0 commit comments