|
| 1 | +### functions for categorical data processing |
| 2 | +# 1. One-hot encoding categorical features |
| 3 | + |
| 4 | + |
| 5 | +## loading libraries |
| 6 | +library(dummies) |
| 7 | +library(plyr) |
| 8 | + |
| 9 | + |
## function for one-hot encoding categorical features
#
# Stacks X_train and X_test into one panel so both are encoded against the
# same set of category levels, one-hot encodes every character/factor column
# with dummies::dummy.data.frame(), and splits the panel back apart.
#
# Arguments:
#   X_train  data frame of training features.
#   X_test   optional data frame of test features with the same columns as
#            X_train; defaults to an empty data frame (no test set).
#
# Returns:
#   An unnamed list of two data frames: the encoded training rows and the
#   encoded test rows (zero rows when no test set was supplied).
#
# Note on column names: when at least one categorical column is encoded, ALL
# column names are cleaned -- punctuation is removed (this also strips the
# "_" separator inserted by dummy.data.frame), runs of whitespace are
# collapsed, and remaining spaces become underscores.
onehot_encode_categories <- function(X_train, X_test = data.frame()) {
  # creating panel
  cat("Creating panel\n")

  n_train <- nrow(X_train)

  if (nrow(X_test) > 0) {
    panel <- rbind(X_train, X_test)
  } else {
    panel <- X_train
  }

  # extracting categorical columns
  # is.factor() also matches multi-class columns such as ordered factors,
  # which would make a `class(x) %in% ...` test return a length-2 condition.
  is_categorical <- vapply(
    panel,
    function(column) is.character(column) || is.factor(column),
    logical(1)
  )
  categorical_columns <- names(panel)[is_categorical]

  # creating dummy variables
  cat("One-hot encoding the categorical variables")

  if (length(categorical_columns) > 0) {
    panel <- dummy.data.frame(panel, names = categorical_columns, sep = "_")
    colnames(panel) <- gsub("[[:punct:]]", "", colnames(panel))
    colnames(panel) <- gsub("[[:space:]]+", " ", colnames(panel))
    colnames(panel) <- gsub(" ", "_", colnames(panel))
  }

  # splitting the panel back into train and test
  # seq_len() yields an empty index when there are no test rows (a
  # `(n + 1):n` range would count backwards), and drop = FALSE keeps a
  # single-column panel as a data frame instead of collapsing to a vector.
  n_test <- nrow(panel) - n_train
  X_train <- panel[seq_len(n_train), , drop = FALSE]
  X_test <- panel[n_train + seq_len(n_test), , drop = FALSE]

  return(list(X_train, X_test))
}
| 51 | + |
0 commit comments