We use the data from handwritten ZIP codes on envelopes from U.S. postal mail. The images are 16×16 eight-bit grayscale maps of the digits 0, 1, …, 9, with each pixel ranging in intensity from 0 to 255. Some sample images are shown in Figure 4.1.
library(xgboost)
#library(lightgbm) # a faster alternative to xgboost
library(keras)
library(magrittr)
library(tidyverse)
library(DT)
library(ElemStatLearn)
The data set has 7291 observations. We randomly sampled 4000 of them as the test set and kept the remaining 3291 for training.
set.seed(777555333)
LXyTr <- ElemStatLearn::zip.train
colnames(LXyTr) <- c("Y",paste0("x",1:256))
te_idx <- sample(1:nrow(LXyTr),size = 4000,replace = FALSE)
LXyTe <- LXyTr[te_idx,]
LXyTr <- LXyTr[-te_idx,]
str(LXyTr)
## num [1:3291, 1:257] 6 4 3 1 0 1 1 7 4 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:257] "Y" "x1" "x2" "x3" ...
# Sample 6 examples for each number
findRows <- function(LX, n) {
  # Find n (random) rows of LX for each of the digits 0,1,2,...,9
  res <- vector(length = 10, mode = "list")
  names(res) <- 0:9
  ind <- LX[, 1]
  for (j in 0:9) {
    res[[j + 1]] <- sample(which(ind == j), n)
  }
  return(res)
}
# Making a plot like that for the ZIP data
rows <- findRows(LXyTr, 6)
# Another way
im <- rows %>%
  map_dfr(function(rows_j) {
    rows_j %>%
      map_dfc(function(x) {
        im <- LXyTr[x, ]
        # print(paste("digit ", im[1], " taken"))
        im <- im[-1]                              # drop the digit label
        im <- t(matrix(im, 16, 16, byrow = TRUE))
        im <- im[, 16:1]                          # flip columns so the digit is upright
        return(data.frame(im))
      })
  }) %>%
  as.matrix()
#image(im, col=gray(256:0/256), zlim=c(0,1), xlab="", ylab="" )
im <- data.frame(row = 1:nrow(im), im)
colnames(im)[-1] <- c(1:(ncol(im) - 1))
im <- gather(im, col, value, -row)
im$col <- as.integer(im$col)
ggplot(im, aes(x = row, y = col, fill = value)) +
  geom_raster() +
  scale_fill_gradient(low = "white", high = "black")
library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
D <- 16
# prepare model
fit_dnn <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = c(D * D)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")
# print the structure of the NN
fit_dnn
## Model
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense_1 (Dense) (None, 128) 32896
## ___________________________________________________________________________
## dense_2 (Dense) (None, 64) 8256
## ___________________________________________________________________________
## dense_3 (Dense) (None, 10) 650
## ===========================================================================
## Total params: 41,802
## Trainable params: 41,802
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_dnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)
# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)
train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D * D))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D * D))
tc_dnn <- proc.time()
# train
fit_dnn %>% fit(train_images,
                train_labels,
                epochs = 25,
                batch_size = 100)
tc_dnn <- proc.time()-tc_dnn
# evaluate
err_dnn <- 1-evaluate(fit_dnn,test_images, test_labels)$acc
cat("Test error: ",err_dnn,"\n",sep="")
## Test error: 0.05225
cat("Computation time: ",tc_dnn[3],"s",sep="")
## Computation time: 3.39s
# prediction
#pred_dnn <- predict_classes(fit_dnn,test_images)
Convolutional neural networks (convnets) are a more advanced network structure widely used in computer vision applications; they have been used heavily in the ImageNet competition.
library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
# prepare model
fit_cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(5, 5), activation = "relu",
                input_shape = c(D, D, 1)) %>%
  layer_max_pooling_2d(pool_size = c(3, 3)) %>%
  layer_flatten() %>%
  layer_dense(units = 10, activation = "softmax")
# print the structure of the NN
fit_cnn
## Model
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## conv2d_1 (Conv2D) (None, 12, 12, 32) 832
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D) (None, 4, 4, 32) 0
## ___________________________________________________________________________
## flatten_1 (Flatten) (None, 512) 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 10) 5130
## ===========================================================================
## Total params: 5,962
## Trainable params: 5,962
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_cnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)
# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)
train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D,D,1))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D,D,1))
# train
tc_cnn <- proc.time()
fit_cnn %>% fit(train_images,
                train_labels,
                epochs = 30,
                batch_size = 100)
tc_cnn <- proc.time()-tc_cnn
# evaluate
err_cnn <- 1-evaluate(fit_cnn,test_images, test_labels)$acc
cat("Test error: ",err_cnn,"\n",sep="")
## Test error: 0.01775
cat("Computation time: ",tc_cnn[3],"s",sep="")
## Computation time: 17.21s
# prediction
#pred_cnn <- predict_classes(fit_cnn,test_images)
library(e1071)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))
tc_svm <- proc.time()
fit_svm <- svm(Y~.,data = LXyTr_data)
tc_svm <- proc.time()-tc_svm
pred_svm <- predict(fit_svm,newdata = LXyTe_data)
err_svm <- mean(pred_svm!=LXyTe_data[,1])
cat("Test error: ",err_svm,"\n",sep="")
## Test error: 0.04125
cat("Computation time: ",tc_svm[3],"s",sep="")
## Computation time: 8.59s
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))
tc_rf <- proc.time()
fit_rf <- randomForest(Y~.,data = LXyTr_data)
tc_rf <- proc.time()-tc_rf
pred_rf <- predict(fit_rf,newdata = LXyTe_data)
err_rf <- mean(pred_rf!=LXyTe_data[,1])
cat("Test error: ",err_rf,"\n",sep="")
## Test error: 0.03975
cat("Computation time: ",tc_rf[3],"s",sep="")
## Computation time: 36.8s
Notice that I did not use a validation set to tune the hyperparameters here. The training error reported below shows that the XGBoost model overfits heavily; a cross-validation sketch for choosing the number of boosting rounds is given after the code.
library(xgboost)
params <- list(objective = "multi:softmax",
               num_class = 10,
               booster = "gbtree",
               eval_metric = "merror",
               nthread = 4,
               eta = 0.1,
               max_depth = 6,
               min_child_weight = 1,
               gamma = 0,
               subsample = 1,
               colsample_bytree = 0.8,
               colsample_bylevel = 0.85,
               alpha = 0.1,
               lambda = 0.1)
xtrain <- xgb.DMatrix(data = LXyTr[,-1], label = LXyTr[,1])
tc_xgb <- proc.time()
fit_xgb <- xgb.train(params, data = xtrain,
                     nrounds = 25, watchlist = list(train = xtrain),
                     verbose = 1, print_every_n = 5,
                     early_stopping_rounds = 5)
## [1] train-merror:0.044363
## Will train until train_merror hasn't improved in 5 rounds.
##
## [6] train-merror:0.008508
## [11] train-merror:0.004862
## [16] train-merror:0.002735
## [21] train-merror:0.000304
## [25] train-merror:0.000304
tc_xgb <- proc.time()-tc_xgb
pred_xgb <- predict(fit_xgb, LXyTe[,-1])
err_xgb <- mean(pred_xgb!=LXyTe[,1])
cat("Test error: ",err_xgb,"\n",sep="")
## Test error: 0.06625
cat("Computation time: ",tc_xgb[3],"s",sep="")
## Computation time: 5.53s
# Print variable importance
#xgb.importance(feature_names = paste0("x",1:D^2),model=fit_xgb)
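As noted above, the XGBoost hyperparameters were not tuned on a held-out set. Below is a minimal sketch of how the number of boosting rounds could instead be chosen by cross-validation with xgb.cv; the 5 folds, the 200-round cap, and the 10-round early-stopping patience are illustrative choices, not values used in the fit above.
# Sketch: choose nrounds by 5-fold cross-validation instead of training error.
# The fold count, round cap, and patience are illustrative, not tuned values.
cv_xgb <- xgb.cv(params, data = xtrain,
                 nrounds = 200, nfold = 5,
                 early_stopping_rounds = 10,
                 verbose = 0)
best_nrounds <- cv_xgb$best_iteration
# Refit on the full training set with the cross-validated number of rounds
fit_xgb_cv <- xgb.train(params, data = xtrain, nrounds = best_nrounds)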
library(knitr)
err_table <- data.frame(Method = c("Multi-layer perceptron", "Convnets", "SVM", "Random Forest", "XGBoost"),
                        TestErr = c(err_dnn, err_cnn, err_svm, err_rf, err_xgb),
                        Time = c(tc_dnn[3], tc_cnn[3], tc_svm[3], tc_rf[3], tc_xgb[3]))
knitr::kable(err_table,
             caption = "Summary of the test error for each method")
Table: Summary of the test error for each method

| Method                 | Test error | Time (s) |
|:-----------------------|-----------:|---------:|
| Multi-layer perceptron |    0.05225 |     3.39 |
| Convnets               |    0.01775 |    17.21 |
| SVM                    |    0.04125 |     8.59 |
| Random Forest          |    0.03975 |    36.80 |
| XGBoost                |    0.06625 |     5.53 |