1 Introduction

We use data from handwritten ZIP codes on envelopes from U.S. postal mail. The images are 16×16 eight-bit grayscale maps of the digits 0, 1, …, 9, with each pixel ranging in intensity from 0 to 255. Some sample images are shown in Figure 4.1.
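
As a quick illustration, a single training case can be reshaped from its 256 pixel columns into a 16×16 matrix and drawn directly (a minimal sketch; the reshaping mirrors the plotting code in Section 4):

library(ElemStatLearn)
z <- zip.train[1, ]                                    # column 1: digit label; columns 2-257: pixels
img <- t(matrix(z[-1], 16, 16, byrow = TRUE))[, 16:1]  # reshape and flip so the digit is upright
image(img, col = gray(256:0/256), xlab = "", ylab = "", main = paste("digit:", z[1]))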

2 Preparations

2.1 Load libraries

library(xgboost)
#library(lightgbm) # an alternative to xgboost, often faster
library(keras)
library(magrittr)
library(tidyverse)
library(DT)
library(ElemStatLearn)

2.2 Read and split data

The data set has 7291 observations. We randomly sample 4000 of them as the test set and keep the remaining 3291 for training.

set.seed(777555333)
LXyTr <- ElemStatLearn::zip.train 
colnames(LXyTr) <- c("Y",paste0("x",1:256))
te_idx <- sample(1:nrow(LXyTr),size = 4000,replace = FALSE)
LXyTe <- LXyTr[te_idx,]
LXyTr <- LXyTr[-te_idx,]

3 Glance at Data

3.1 Data structure
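
A quick look at the structure of the training matrix (the exact call is an assumption; str() is consistent with the output shown below):

str(LXyTr)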

##  num [1:3291, 1:257] 6 4 3 1 0 1 1 7 4 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:257] "Y" "x1" "x2" "x3" ...

3.2 Detail
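
The training data can also be browsed as an interactive table. A minimal sketch with DT (the exact call is an assumption, not part of the original output):

DT::datatable(head(as.data.frame(LXyTr), 100),
              options = list(scrollX = TRUE, pageLength = 10))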

4 Plot Images

# Sample 6 examples of each digit
findRows <- function(LX, n) {
  # For each digit 0-9, pick n random rows of LX whose label (column 1) equals that digit
  res <- vector(length = 10, mode = "list")
  names(res) <- 0:9
  ind <- LX[, 1]
  for (j in 0:9) {
    res[[j + 1]] <- sample(which(ind == j), n)
  }
  return(res)
}

# Select the rows used for the plot of example images (Figure 4.1)
rows <- findRows(LXyTr, 6)

# Assemble the sampled rows into one large image matrix:
# each digit's 6 examples are bound side by side as columns, and the ten digits are stacked as rows
im <- rows %>%
  map_dfr(function(rows_j){
    rows_j %>%
      map_dfc(function(x){
        im <- LXyTr[x, ]
        # print(paste("digit ", im[1], " taken"))
        im <- im[-1]                               # drop the digit label
        im <- t(matrix(im, 16, 16, byrow = TRUE))  # 256 pixels -> 16 x 16 matrix
        im <- im[, 16:1]                           # flip so the digit is upright
        return(data.frame(im))
      })
  }) %>%
  as.matrix()


#image(im, col=gray(256:0/256), zlim=c(0,1), xlab="", ylab="" )    
im <- data.frame(row=1:nrow(im),im)
colnames(im)[-1] <- c(1:(ncol(im)-1))
im <- gather(im,col,value,-row)
im$col <- as.integer(im$col)

ggplot(im, aes(x = row, y = col, fill = value)) +
    geom_raster() +
    scale_fill_gradient(low = "white", high = "black")

Figure 4.1: Examples of training cases from the zip code data

5 Train Models

5.1 Multi-layer perceptron with keras

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
D <- 16

# prepare model
fit_dnn <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = c(D * D)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

# print the structure of the NN
fit_dnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 128)                   32896       
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 64)                    8256        
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 10)                    650         
## ===========================================================================
## Total params: 41,802
## Trainable params: 41,802
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the optimizer, loss, and evaluation metric
fit_dnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D * D))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D * D))

tc_dnn <- proc.time()
# train
fit_dnn %>% fit(train_images, 
                train_labels, 
                epochs = 25, 
                batch_size = 100)
tc_dnn <- proc.time()-tc_dnn

# evaluate
err_dnn <- 1-evaluate(fit_dnn,test_images, test_labels)$acc

cat("Test error: ",err_dnn,"\n",sep="")
## Test error: 0.05225
cat("Computation time: ",tc_dnn[3],"s",sep="")
## Computation time: 3.39s
# prediction
#pred_dnn <- predict_classes(fit_dnn,test_images)

5.2 Convolutional neural networks with keras

Convolutional neural networks (convnets) are a more advanced network structure that is widely used in computer vision applications; they featured heavily in the ImageNet competition.

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
# prepare model
fit_cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(5, 5), activation = "relu",
                input_shape = c(D, D, 1)) %>%
  layer_max_pooling_2d(pool_size = c(3, 3)) %>%
  layer_flatten() %>%
  layer_dense(units = 10, activation = "softmax")


# print the structure of the NN
fit_cnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## conv2d_1 (Conv2D)                (None, 12, 12, 32)            832         
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)   (None, 4, 4, 32)              0           
## ___________________________________________________________________________
## flatten_1 (Flatten)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 10)                    5130        
## ===========================================================================
## Total params: 5,962
## Trainable params: 5,962
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the optimizer, loss, and evaluation metric
fit_cnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D,D,1))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D,D,1))

# train
tc_cnn <- proc.time()
fit_cnn %>% fit(train_images, 
                train_labels, 
                epochs = 30, 
                batch_size = 100)
tc_cnn <- proc.time()-tc_cnn

# evaluate
err_cnn <- 1-evaluate(fit_cnn,test_images, test_labels)$acc

cat("Test error: ",err_cnn,"\n",sep="")
## Test error: 0.01775
cat("Computation time: ",tc_cnn[3],"s",sep="")
## Computation time: 17.21s
# prediction
#pred_cnn <- predict_classes(fit_cnn,test_images)

5.3 SVM

library(e1071)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_svm <- proc.time()
fit_svm <- svm(Y~.,data = LXyTr_data)
tc_svm <- proc.time()-tc_svm

pred_svm <- predict(fit_svm,newdata = LXyTe_data)
err_svm <- mean(pred_svm!=LXyTe_data[,1])

cat("Test error: ",err_svm,"\n",sep="")
## Test error: 0.04125
cat("Computation time: ",tc_svm[3],"s",sep="")
## Computation time: 8.59s

5.4 Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_rf <- proc.time()
fit_rf <- randomForest(Y~.,data = LXyTr_data)
tc_rf <- proc.time()-tc_rf

pred_rf <- predict(fit_rf,newdata = LXyTe_data)
err_rf <- mean(pred_rf!=LXyTe_data[,1])

cat("Test error: ",err_rf,"\n",sep="")
## Test error: 0.03975
cat("Computation time: ",tc_rf[3],"s",sep="")
## Computation time: 36.8s

5.5 XGBoost

Notice that here I did not use a validation set to tune the parameters. The near-perfect training accuracy shows that the XGBoost model overfits heavily; a sketch of how the parameters could instead be tuned by cross-validation is given at the end of this subsection.

library(xgboost)

params <- list(objective = "multi:softmax",
               num_class = 10,
               booster = "gbtree",
               eval_metric = "merror",
               nthread = 4,
               eta = 0.1,
               max_depth = 6,
               min_child_weight = 1,
               gamma = 0,
               subsample = 1,
               colsample_bytree = 0.8,
               colsample_bylevel = 0.85,
               alpha = 0.1,
               lambda = 0.1)


xtrain <- xgb.DMatrix(data = LXyTr[,-1], label = LXyTr[,1])

tc_xgb <- proc.time()
fit_xgb <- xgb.train(params, data = xtrain, 
                     nrounds = 25, watchlist = list(train=xtrain), 
                     verbose=1, print_every_n = 5, 
                     early_stopping_rounds = 5)
## [1]  train-merror:0.044363 
## Will train until train_merror hasn't improved in 5 rounds.
## 
## [6]  train-merror:0.008508 
## [11] train-merror:0.004862 
## [16] train-merror:0.002735 
## [21] train-merror:0.000304 
## [25] train-merror:0.000304
tc_xgb <- proc.time()-tc_xgb

pred_xgb <- predict(fit_xgb, LXyTe[,-1])

err_xgb <- mean(pred_xgb!=LXyTe[,1])

cat("Test error: ",err_xgb,"\n",sep="")
## Test error: 0.06625
cat("Computation time: ",tc_xgb[3],"s",sep="")
## Computation time: 5.53s
# Print variable importance
#xgb.importance(feature_names = paste0("x",1:D^2),model=fit_xgb)
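
For completeness, a minimal sketch of how the number of boosting rounds could instead be chosen by cross-validation on the training set (the fold count and round limit here are illustrative, not part of the original analysis):

# Illustrative only: pick nrounds by 5-fold CV with early stopping
cv_xgb <- xgb.cv(params, data = xtrain, nrounds = 200, nfold = 5,
                 early_stopping_rounds = 10, verbose = 0)
fit_xgb_cv <- xgb.train(params, data = xtrain, nrounds = cv_xgb$best_iteration)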

6 Summary

library(knitr)
err_table <- data.frame(Method=c("Multi-layer perceptron","Convnets","SVM","Random Forest","XGBoost"),
                        TestErr=c(err_dnn, err_cnn, err_svm, err_rf, err_xgb),
                        Time=c(tc_dnn[3], tc_cnn[3], tc_svm[3], tc_rf[3], tc_xgb[3]))

knitr::kable(err_table,
             caption = "Summary of the test error for each method")
Table 6.1: Summary of the test error for each method
Method                   TestErr   Time (s)
Multi-layer perceptron   0.05225       3.39
Convnets                 0.01775      17.21
SVM                      0.04125       8.59
Random Forest            0.03975      36.80
XGBoost                  0.06625       5.53