1 Introduction

We use data from handwritten ZIP codes on envelopes from U.S. postal mail. The images are 16×16 eight-bit grayscale maps of the digits 0, 1, …, 9, with each pixel ranging in intensity from 0 to 255. Some sample images are shown in Figure 4.1.
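
As a quick illustration, a single training case can be reshaped from its 256 pixel columns into a 16×16 matrix and drawn directly (a minimal sketch; the reshaping mirrors the plotting code in Section 4):

library(ElemStatLearn)
z <- zip.train[1, ]                                    # column 1: digit label; columns 2-257: pixels
img <- t(matrix(z[-1], 16, 16, byrow = TRUE))[, 16:1]  # reshape and flip so the digit is upright
image(img, col = gray(256:0/256), xlab = "", ylab = "", main = paste("digit:", z[1]))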

2 Preparations

2.1 Load libraries

library(xgboost)
#library(lightgbm) # an alternative to xgboost, often faster
library(keras)
library(magrittr)
library(tidyverse)
library(DT)
library(ElemStatLearn)

2.2 Read and split data

The data set has 7291 observations. We randomly sample 4000 of them as the test set and keep the remaining 3291 for training.

set.seed(777555333)
LXyTr <- ElemStatLearn::zip.train 
colnames(LXyTr) <- c("Y",paste0("x",1:256))
te_idx <- sample(1:nrow(LXyTr),size = 4000,replace = FALSE)
LXyTe <- LXyTr[te_idx,]
LXyTr <- LXyTr[-te_idx,]

3 Glance at Data

3.1 Data structure
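
A quick look at the structure of the training matrix (the exact call is an assumption; str() is consistent with the output shown below):

str(LXyTr)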

##  num [1:3291, 1:257] 6 4 3 1 0 1 1 7 4 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:257] "Y" "x1" "x2" "x3" ...

3.2 Detail
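
The training data can also be browsed as an interactive table. A minimal sketch with DT (the exact call is an assumption, not part of the original output):

DT::datatable(head(as.data.frame(LXyTr), 100),
              options = list(scrollX = TRUE, pageLength = 10))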

4 Plot Images

# Sample 6 examples of each digit
findRows <- function(LX, n) {
  # For each digit 0-9, pick n random rows of LX whose label (column 1) equals that digit
  res <- vector(length = 10, mode = "list")
  names(res) <- 0:9
  ind <- LX[, 1]
  for (j in 0:9) {
    res[[j + 1]] <- sample(which(ind == j), n)
  }
  return(res)
}

# Select the rows used for the plot of example images (Figure 4.1)
rows <- findRows(LXyTr, 6)

# Assemble the sampled rows into one large image matrix:
# each digit's 6 examples are bound side by side as columns, and the ten digits are stacked as rows
im <- rows %>%
  map_dfr(function(rows_j){
    rows_j %>%
      map_dfc(function(x){
        im <- LXyTr[x, ]
        # print(paste("digit ", im[1], " taken"))
        im <- im[-1]                               # drop the digit label
        im <- t(matrix(im, 16, 16, byrow = TRUE))  # 256 pixels -> 16 x 16 matrix
        im <- im[, 16:1]                           # flip so the digit is upright
        return(data.frame(im))
      })
  }) %>%
  as.matrix()


#image(im, col=gray(256:0/256), zlim=c(0,1), xlab="", ylab="" )    
im <- data.frame(row=1:nrow(im),im)
colnames(im)[-1] <- c(1:(ncol(im)-1))
im <- gather(im,col,value,-row)
im$col <- as.integer(im$col)

ggplot(im, aes(x = row, y = col, fill = value)) +
    geom_raster() +
    scale_fill_gradient(low = "white", high = "black")

Figure 4.1: Examples of training cases from the zip code data

5 Train Models

5.1 Multi-layer perceptron with keras

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
D <- 16

# prepare model
fit_dnn <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = c(D * D)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

# print the structure of the NN
fit_dnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 128)                   32896       
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 64)                    8256        
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 10)                    650         
## ===========================================================================
## Total params: 41,802
## Trainable params: 41,802
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the optimizer, loss, and evaluation metric
fit_dnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D * D))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D * D))

tc_dnn <- proc.time()
# train
fit_dnn %>% fit(train_images, 
                train_labels, 
                epochs = 25, 
                batch_size = 100)
tc_dnn <- proc.time()-tc_dnn

# evaluate
err_dnn <- 1-evaluate(fit_dnn,test_images, test_labels)$acc

cat("Test error: ",err_dnn,"\n",sep="")
## Test error: 0.05225
cat("Computation time: ",tc_dnn[3],"s",sep="")
## Computation time: 3.39s
# prediction
#pred_dnn <- predict_classes(fit_dnn,test_images)

5.2 Convolutional neural networks with keras

Convolutional neural networks (convnets) are a more advanced network structure that is widely used in computer vision applications; they featured heavily in the ImageNet competition.

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
# prepare model
fit_cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(5, 5), activation = "relu",
                input_shape = c(D, D, 1)) %>%
  layer_max_pooling_2d(pool_size = c(3, 3)) %>%
  layer_flatten() %>%
  layer_dense(units = 10, activation = "softmax")


# print the structure of the NN
fit_cnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## conv2d_1 (Conv2D)                (None, 12, 12, 32)            832         
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)   (None, 4, 4, 32)              0           
## ___________________________________________________________________________
## flatten_1 (Flatten)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 10)                    5130        
## ===========================================================================
## Total params: 5,962
## Trainable params: 5,962
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the optimizer, loss, and evaluation metric
fit_cnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D,D,1))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D,D,1))

# train
tc_cnn <- proc.time()
fit_cnn %>% fit(train_images, 
                train_labels, 
                epochs = 30, 
                batch_size = 100)
tc_cnn <- proc.time()-tc_cnn

# evaluate
err_cnn <- 1-evaluate(fit_cnn,test_images, test_labels)$acc

cat("Test error: ",err_cnn,"\n",sep="")
## Test error: 0.01775
cat("Computation time: ",tc_cnn[3],"s",sep="")
## Computation time: 17.21s
# prediction
#pred_cnn <- predict_classes(fit_cnn,test_images)

5.3 SVM

library(e1071)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_svm <- proc.time()
fit_svm <- svm(Y~.,data = LXyTr_data)
tc_svm <- proc.time()-tc_svm

pred_svm <- predict(fit_svm,newdata = LXyTe_data)
err_svm <- mean(pred_svm!=LXyTe_data[,1])

cat("Test error: ",err_svm,"\n",sep="")
## Test error: 0.04125
cat("Computation time: ",tc_svm[3],"s",sep="")
## Computation time: 8.59s

5.4 Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_rf <- proc.time()
fit_rf <- randomForest(Y~.,data = LXyTr_data)
tc_rf <- proc.time()-tc_rf

pred_rf <- predict(fit_rf,newdata = LXyTe_data)
err_rf <- mean(pred_rf!=LXyTe_data[,1])

cat("Test error: ",err_rf,"\n",sep="")
## Test error: 0.03975
cat("Computation time: ",tc_rf[3],"s",sep="")
## Computation time: 36.8s

5.5 XGBoost

Notice that here I did not use a validation set to tune the parameters. The near-perfect training accuracy shows that the XGBoost model overfits heavily; a sketch of how the parameters could instead be tuned by cross-validation is given at the end of this subsection.

library(xgboost)

params <- list(objective = "multi:softmax",
               num_class = 10,
               booster = "gbtree",
               eval_metric = "merror",
               nthread = 4,
               eta = 0.1,
               max_depth = 6,
               min_child_weight = 1,
               gamma = 0,
               subsample = 1,
               colsample_bytree = 0.8,
               colsample_bylevel = 0.85,
               alpha = 0.1,
               lambda = 0.1)


xtrain <- xgb.DMatrix(data = LXyTr[,-1], label = LXyTr[,1])

tc_xgb <- proc.time()
fit_xgb <- xgb.train(params, data = xtrain, 
                     nrounds = 25, watchlist = list(train=xtrain), 
                     verbose=1, print_every_n = 5, 
                     early_stopping_rounds = 5)
## [1]  train-merror:0.044363 
## Will train until train_merror hasn't improved in 5 rounds.
## 
## [6]  train-merror:0.008508 
## [11] train-merror:0.004862 
## [16] train-merror:0.002735 
## [21] train-merror:0.000304 
## [25] train-merror:0.000304
tc_xgb <- proc.time()-tc_xgb

pred_xgb <- predict(fit_xgb, LXyTe[,-1])

err_xgb <- mean(pred_xgb!=LXyTe[,1])

cat("Test error: ",err_xgb,"\n",sep="")
## Test error: 0.06625
cat("Computation time: ",tc_xgb[3],"s",sep="")
## Computation time: 5.53s
# Print variable importance
#xgb.importance(feature_names = paste0("x",1:D^2),model=fit_xgb)
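
For completeness, a minimal sketch of how the number of boosting rounds could instead be chosen by cross-validation on the training set (the fold count and round limit here are illustrative, not part of the original analysis):

# Illustrative only: pick nrounds by 5-fold CV with early stopping
cv_xgb <- xgb.cv(params, data = xtrain, nrounds = 200, nfold = 5,
                 early_stopping_rounds = 10, verbose = 0)
fit_xgb_cv <- xgb.train(params, data = xtrain, nrounds = cv_xgb$best_iteration)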

6 Summary

library(knitr)
err_table <- data.frame(Method=c("Multi-layer perceptron","Convnets","SVM","Random Forest","XGBoost"),
                        TestErr=c(err_dnn, err_cnn, err_svm, err_rf, err_xgb),
                        Time=c(tc_dnn[3], tc_cnn[3], tc_svm[3], tc_rf[3], tc_xgb[3]))

knitr::kable(err_table,
             caption = "Summary of the test error for each method")
Table 6.1: Summary of the test error for each method
Method                   TestErr   Time (s)
Multi-layer perceptron   0.05225       3.39
Convnets                 0.01775      17.21
SVM                      0.04125       8.59
Random Forest            0.03975      36.80
XGBoost                  0.06625       5.53