1 Introduction

We use data from handwritten ZIP codes on envelopes from U.S. postal mail. The images are 16×16 eight-bit grayscale maps of the digits 0,1,…,9, with each pixel ranging in intensity from 0 to 255. Some sample images are shown in Figure 4.1.
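
Before fitting any models, it is worth a quick check of the dimensions and the stored value range, since the values shipped in ElemStatLearn::zip.train may not be on the raw 0-255 scale described above. A minimal sketch (output omitted):

library(ElemStatLearn)
dim(zip.train)          # 7291 rows: column 1 is the digit label, columns 2-257 are pixels
table(zip.train[, 1])   # number of training examples of each digit
range(zip.train[, -1])  # range of the stored pixel values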

2 Preparations

library(xgboost)
#library(lightgbm) # an alternative to xgboost, often faster
library(keras)
library(magrittr)
library(tidyverse)
library(DT)
library(ElemStatLearn)

The data set has 7291 observations. We randomly sampled 4000 of them for testing and kept the remaining 3291 for training.

set.seed(777555333)
LXyTr <- ElemStatLearn::zip.train                 # full data: 7291 rows, label + 256 pixels
colnames(LXyTr) <- c("Y",paste0("x",1:256))
te_idx <- sample(1:nrow(LXyTr),size = 4000,replace = FALSE)
LXyTe <- LXyTr[te_idx,]                           # 4000 test rows
LXyTr <- LXyTr[-te_idx,]                          # remaining 3291 training rows

3 Glance at Data

str(LXyTr)
##  num [1:3291, 1:257] 6 4 3 1 0 1 1 7 4 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:257] "Y" "x1" "x2" "x3" ...
 

4 Plot Images

# Sample 6 examples of each digit
findRows <- function(LX, n) {
  # Return, for each digit 0-9, n randomly chosen row indices of LX
  res <- vector(length = 10, mode = "list")
  names(res) <- 0:9
  ind <- LX[, 1]   # first column holds the digit label
  for (j in 0:9) {
    res[[j + 1]] <- sample(which(ind == j), n)
  }
  return(res)
}

# Build a plot of sample digits from the ZIP data, 6 examples per digit
rows <- findRows(LXyTr, 6)

# Assemble the sampled digits into one large matrix:
# one block of rows per digit, six examples per digit side by side
im  <- rows %>%
  map_dfr(function(rows_j){
    rows_j %>%
      map_dfc(function(x){
        im <- LXyTr[x, ]
        # print(paste("digit ", im[1], " taken"))
        im <- im[-1]                              # drop the label, keep the 256 pixels
        im <- t(matrix(im, 16, 16, byrow = TRUE)) # reshape to 16 x 16
        im <- im[, 16:1]                          # flip so the digit is upright
        return(data.frame(im))
      })
  }) %>%
  as.matrix()


#image(im, col=gray(256:0/256), zlim=c(0,1), xlab="", ylab="" )    
# convert the image matrix to long format for ggplot2
im <- data.frame(row=1:nrow(im),im)
colnames(im)[-1] <- c(1:(ncol(im)-1))
im <- gather(im,col,value,-row)
im$col <- as.integer(im$col)

ggplot(im, aes(x = row, y = col, fill = value)) +
    geom_raster() +
    scale_fill_gradient(low = "white", high = "black")

Figure 4.1: Examples of training cases from the zip code data
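
For a quick look at a single digit without building the whole grid, the base-graphics route hinted at in the commented image() call above also works. A minimal sketch, reusing the same reshape-and-flip steps as in the chunk above (the choice of the first training row is arbitrary):

# Display a single training digit with base graphics
v <- LXyTr[1, -1]                        # 256 pixel values, label dropped
m <- t(matrix(v, 16, 16, byrow = TRUE))  # reshape to 16 x 16
m <- m[, 16:1]                           # flip so the digit is upright
image(m, col = gray(256:0/256), zlim = range(v),
      axes = FALSE, xlab = "", ylab = "")
title(main = paste("Digit:", LXyTr[1, 1]))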

5 Train Models

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
D <- 16

# prepare model: a multi-layer perceptron on the flattened 16 x 16 images
fit_dnn <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = c(D * D)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

# print the structure of the NN
fit_dnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 128)                   32896       
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 64)                    8256        
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 10)                    650         
## ===========================================================================
## Total params: 41,802
## Trainable params: 41,802
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_dnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data: one-hot encode the labels and flatten the images to length-256 vectors
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D * D))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D * D))

tc_dnn <- proc.time()
# train
fit_dnn %>% fit(train_images, 
                train_labels, 
                epochs = 25, 
                batch_size = 100)
tc_dnn <- proc.time()-tc_dnn

# evaluate
err_dnn <- 1-evaluate(fit_dnn,test_images, test_labels)$acc

cat("Test error: ",err_dnn,"\n",sep="")
## Test error: 0.05225
cat("Computation time: ",tc_dnn[3],"s",sep="")
## Computation time: 3.39s
# prediction
#pred_dnn <- predict_classes(fit_dnn,test_images)

Convolutional neural networks (convnets) are a more advanced network structure widely used in computer vision applications. They have been used heavily in the ImageNet competition.

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
# prepare model
fit_cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(5, 5), activation = "relu",
                input_shape = c(D, D, 1)) %>%
  layer_max_pooling_2d(pool_size = c(3, 3)) %>%
  layer_flatten() %>%
  layer_dense(units = 10, activation = "softmax")


# print the structure of the NN
fit_cnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## conv2d_1 (Conv2D)                (None, 12, 12, 32)            832         
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)   (None, 4, 4, 32)              0           
## ___________________________________________________________________________
## flatten_1 (Flatten)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 10)                    5130        
## ===========================================================================
## Total params: 5,962
## Trainable params: 5,962
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_cnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D,D,1))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D,D,1))

# train
tc_cnn <- proc.time()
fit_cnn %>% fit(train_images, 
                train_labels, 
                epochs = 30, 
                batch_size = 100)
tc_cnn <- proc.time()-tc_cnn

# evaluate
err_cnn <- 1-evaluate(fit_cnn,test_images, test_labels)$acc

cat("Test error: ",err_cnn,"\n",sep="")
## Test error: 0.01775
cat("Computation time: ",tc_cnn[3],"s",sep="")
## Computation time: 17.21s
# prediction
#pred_cnn <- predict_classes(fit_cnn,test_images)

# Support vector machine (e1071), default radial-basis kernel
library(e1071)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_svm <- proc.time()
fit_svm <- svm(Y~.,data = LXyTr_data)
tc_svm <- proc.time()-tc_svm

pred_svm <- predict(fit_svm,newdata = LXyTe_data)
err_svm <- mean(pred_svm!=LXyTe_data[,1])

cat("Test error: ",err_svm,"\n",sep="")
## Test error: 0.04125
cat("Computation time: ",tc_svm[3],"s",sep="")
## Computation time: 8.59s

# Random forest (randomForest package, default settings)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_rf <- proc.time()
fit_rf <- randomForest(Y~.,data = LXyTr_data)
tc_rf <- proc.time()-tc_rf

pred_rf <- predict(fit_rf,newdata = LXyTe_data)
err_rf <- mean(pred_rf!=LXyTe_data[,1])

cat("Test error: ",err_rf,"\n",sep="")
## Test error: 0.03975
cat("Computation time: ",tc_rf[3],"s",sep="")
## Computation time: 36.8s

Notice that here I did not use a validation set to tune the parameters. The near-zero training error below, compared with the test error, shows that the XGBoost model overfits heavily; a sketch of adding a held-out validation set with early stopping follows the XGBoost results below.

library(xgboost)

params <- list(objective = "multi:softmax",   # predict the class label directly
               num_class = 10,
               booster = "gbtree",
               eval_metric = "merror",        # multiclass classification error rate
               nthread = 4,
               eta = 0.1,                     # learning rate
               max_depth = 6,
               min_child_weight = 1,
               gamma = 0,
               subsample = 1,
               colsample_bytree = 0.8,
               colsample_bylevel = 0.85,
               alpha = 0.1,                   # L1 regularization
               lambda = 0.1)                  # L2 regularization


xtrain <- xgb.DMatrix(data = LXyTr[,-1], label = LXyTr[,1])

tc_xgb <- proc.time()
# Note: the watchlist contains only the training set, so early stopping
# monitors the training error rather than a held-out validation error
fit_xgb <- xgb.train(params, data = xtrain, 
                     nrounds = 25, watchlist = list(train=xtrain), 
                     verbose=1, print_every_n = 5, 
                     early_stopping_rounds = 5)
## [1]  train-merror:0.044363 
## Will train until train_merror hasn't improved in 5 rounds.
## 
## [6]  train-merror:0.008508 
## [11] train-merror:0.004862 
## [16] train-merror:0.002735 
## [21] train-merror:0.000304 
## [25] train-merror:0.000304
tc_xgb <- proc.time()-tc_xgb

pred_xgb <- predict(fit_xgb, LXyTe[,-1])

err_xgb <- mean(pred_xgb!=LXyTe[,1])

cat("Test error: ",err_xgb,"\n",sep="")
## Test error: 0.06625
cat("Computation time: ",tc_xgb[3],"s",sep="")
## Computation time: 5.53s
# Print variable importance
#xgb.importance(feature_names = paste0("x",1:D^2),model=fit_xgb)
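
As noted above, no validation set was used to tune the boosting parameters. A minimal sketch of how one could hold out part of the training data and let xgb.train stop early on the validation error (the 20% split fraction and the larger nrounds are illustrative choices, not the settings used above):

# Hold out 20% of the training rows as a validation set (illustrative split)
val_idx <- sample(1:nrow(LXyTr), size = round(0.2 * nrow(LXyTr)))
xtr  <- xgb.DMatrix(data = LXyTr[-val_idx, -1], label = LXyTr[-val_idx, 1])
xval <- xgb.DMatrix(data = LXyTr[val_idx, -1],  label = LXyTr[val_idx, 1])

# Early stopping now monitors the last entry of the watchlist (the validation merror)
fit_xgb_val <- xgb.train(params, data = xtr,
                         nrounds = 200,
                         watchlist = list(train = xtr, valid = xval),
                         verbose = 1, print_every_n = 10,
                         early_stopping_rounds = 10)
fit_xgb_val$best_iteration   # boosting round with the lowest validation error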

6 Summary

library(knitr)
err_table <- data.frame(Method=c("Multi-layer perceptron","Convnets","SVM","Random Forest","XGBoost"),
                        TestErr=c(err_dnn, err_cnn, err_svm, err_rf, err_xgb),
                        Time=c(tc_dnn[3], tc_cnn[3], tc_svm[3], tc_rf[3], tc_xgb[3]))

knitr::kable(err_table,
             col.names = c("Method", "Test error", "Time (s)"),
             caption = "Summary of the test error for each method")
Table 6.1: Summary of the test error for each method

Method                   Test error   Time (s)
Multi-layer perceptron      0.05225       3.39
Convnets                    0.01775      17.21
SVM                         0.04125       8.59
Random Forest               0.03975      36.80
XGBoost                     0.06625       5.53