1 Introduction

BFOS (Breiman et al. 1984, §2.6.1, p.43) discussed a data generating model (DGM) that captures the some aspects of classification that are not especially amenable linear classifiers. A nonlinear classifier is needed and also there are \(K=10\) classes. The idea is electronic digits represented by line segments as shown below.

Each digit can be represented by from one to seven line segments. In the Figure below, the digit 8 is represented by 7 segments and these are encoded using the variables \(x_j,\,j=1,\dotsc,7\) with \(x_j=1\), indicating the line segment is on and \(x_j=0\) indicates it is off. For fun, the line segments are color coded.

One can use \(x_i\) as covariates/features to classify the digits of BFOS. But in this document, we discuss a different way to classify the digits of BFOS. We will first convert the representation of line segments to grayscale images. Then some machine learning methods including deep learning, SVM, RF, and XGBOOST will be applied to learn the digtis from the converted images.

Let’s prepare and have a look at the dataset.

2 Preparations

2.1 Load libraries

The package gencve provides us a function rdigitsBFOS() to simulate the line-segment representation of digtis.

library(xgboost)
#library(lightgbm) #an alternative of xgboost, faster
library(keras)#install.packages("keras")
library(magrittr)
library(tidyverse)
library(DT) #install.packages("DT")
library(gencve)#install.packages("gencve")

2.2 Simulate data

We simulated two data sets in the same way. One with \(2500\) observation for training and one with \(100000\) observations for testing. We set \(\alpha=0.1\), so the Bayesian rate will be \(26\%\).

set.seed(777555333)
XyTr <- rdigitsBFOS(n=round(0.25*10^3), alpha=0.1, silent=TRUE) #Training 
XyTe <- rdigitsBFOS(n=10^4, alpha=0.1, silent=TRUE) #Testing

3 Glance at Data

3.1 Data structure

## 'data.frame':    2500 obs. of  8 variables:
##  $ x1   : num  1 0 0 1 0 1 1 0 1 1 ...
##  $ x2   : num  1 0 0 1 1 1 1 0 1 1 ...
##  $ x3   : num  1 1 1 1 1 0 0 1 0 1 ...
##  $ x4   : num  0 0 1 1 1 1 1 0 1 1 ...
##  $ x5   : num  1 0 1 0 0 0 1 0 1 0 ...
##  $ x6   : num  1 1 0 1 1 1 1 0 1 1 ...
##  $ x7   : num  1 0 1 1 0 1 1 0 1 1 ...
##  $ digit: Factor w/ 10 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, "title")= chr "Electronic Digit Recognition Problem, alpha = 0.1 BayesRate = 0.25994"

3.2 Detail

4 Convert to Grayscale Image

Here we converted each observation of variables \(x_j,\,i=1,\dotsc,7\) to a vector of length \(49\). This long vector can be formatted as a \(7 \times 7\) grayscale image. The Figure 4.1 shows a random sample of six cases for each digit.

gen_segment_location <- function(D){
  if(D!=round(D)){stop("D should be an integer!")}
  if(D<7) stop("D should >= 7!")
  D <- as.integer(D)
  
  # location of 6 vertices, fix upper margin to be 1
  segment_length <- floor((D-1)/2)
  lmar <- floor((D-segment_length)/2)
  v1 <- lmar*D+2
  v2 <- v1+segment_length-1
  v3 <- v2+segment_length-1
  v4 <- v3+ifelse(D%%2==0,2,1)+D*(segment_length-2)+2 
  v5 <- v4+segment_length-1
  v6 <- v5+segment_length-1
  
  # use a 7-row matrix to store the location of each segment
  segment_location <- matrix(1,7,segment_length)
  segment_location[1,] <- seq(v1,v4,D)
  segment_location[2,] <- v1:v2
  segment_location[3,] <- v4:v5
  segment_location[4,] <- seq(v2,v5,D)
  segment_location[5,] <- v2:v3
  segment_location[6,] <- v5:v6
  segment_location[7,] <- seq(v3,v6,D)
  
  return(list(D=D,segment_location=segment_location))
}

digit2lv <- function(X,sl){
  D <- sl$D
  segment_location <- sl$segment_location
  
  # LX2 <- matrix(0,nrow(X),D*D)
  # for(i in 1:nrow(X)){
  #   LX2[i,segment_location[which(as.vector(X[i,1:7])==1),]] <- 1
  # }
  
  # An more efficient way
  LX_idx <- which(X[,1:7]==1,arr.ind = TRUE)
  LX_idx <- cbind(LX_idx[,1],segment_location[LX_idx[,2],])
  LX_idx <- LX_idx[,-1]+(LX_idx[,1]-1)*D^2
  LX <- matrix(0,D*D,nrow(X))
  LX[LX_idx] <- 1
  LX <- t(LX)
  
  # Add label
  LX <- cbind(as.numeric(X[,8])-1,LX)
  
  return(LX)
}


#Example
D <- 7
sl <- gen_segment_location(D)
LXyTr <- digit2lv(XyTr,sl)
LXyTe <- digit2lv(XyTe,sl)

# Sample 6 examples for each number
findRows <- function(LX, n) {
 # Find  n (random) rows with LX representing 0,1,2,...,9
 res <- vector(length=10, mode="list")
 names(res) <- 0:9
 ind <- LX[,1]
 for (j in 0:9) {
    res[[j+1]] <- sample( which(ind==j), n ) }
 return(res) 
}

# Making a plot like that for the ZIP data
rows <- findRows(LXyTr, 6)

# digits <- vector(length=10, mode="list")
# names(digits) <- 0:9
# for (j in 0:9) {
#   digits[[j+1]] <- do.call("cbind", lapply(as.list(rows[[j+1]]),
#                                            function(x){
#                                              im <- LXyTr[x, ]
#                                             # print(paste("digit ", im[1], " taken"))
#                                              im <- im[-1]
#                                              im <- (matrix(im, D, D, byrow = TRUE))
#                                              im <- im[, D:1]
#                                              return(im)
#                                            } ) )
# }
# im <- do.call("rbind", digits)


# Another way
im  <- rows %>%
  map_dfr(function(rows_j){
    rows_j %>%
      map_dfc(function(x){
        im <- LXyTr[x, ]
        # print(paste("digit ", im[1], " taken"))
        im <- im[-1]
        im <- (matrix(im, D, D, byrow = TRUE))
        im <- im[, D:1]
        return(data.frame(im))
      })
  }) %>%
  as.matrix()


#image(im, col=gray(1:0/1), zlim=c(0,1), xlab="", ylab="" )    
im <- data.frame(row=1:nrow(im),im)
colnames(im)[-1] <- c(1:(ncol(im)-1))
im <- gather(im,col,value,-row)
im$col <- as.integer(im$col)

ggplot(im, aes(x = row, y = col, fill = value)) +
    geom_raster() +
    scale_fill_gradient(low = "white", high = "black")
Examples of training cases from BFOS code data. Each image is a 8 × 8 1-bit grayscale representation of a digit

Figure 4.1: Examples of training cases from BFOS code data. Each image is a 8 × 8 1-bit grayscale representation of a digit

5 Can Bayesian Error Rate be Achieved?

The original seven-variable representation can be considered as a sparse representation for the long vector of length 49.

We are interested in whether some classification methods can still work in this case in approaching the Bayes error rates 26% (with a small training sample size).

In this section, we compare the performance of several popular classifiers including single/multi-layer perceptron by neuralnet or Kera, convolutional neural networks by `Kera’, SVM, Random Forest, and XGBoost. However, we did not do any parameter tuning for the following methods and the default setting were used.

Note: It may be time-consuming to train a model for a multi-class problem.

5.1 Single-layer perceptron with neuralnet

Too slow. It does not converge.

library(neuralnet)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

fit_neuralnet1 <- neuralnet(Y~.,data = LXyTr_data,hidden = 5,linear.output=FALSE)
pred_neuralnet1 <- predict(fit_neuralnet1,LXyTe_data)
err_neuralnet1 <- mean(pred_neuralnet1!=LXyTe_data[,1])
err_neuralnet1

5.2 Multi-layer perceptron with neuralnet

Too slow. It does not converge.

library(neuralnet) #install.packages("neuralnet")
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

fit_neuralnet2 <- neuralnet(Y~.,data = LXyTr_data,hidden = c(2,2),linear.output=FALSE)
pred_neuralnet2 <- predict(fit_neuralnet2,LXyTe_data)
err_neuralnet2 <- mean(pred_neuralnet2!=LXyTe_data[,1])
err_neuralnet2

5.3 Multi-layer perceptron with keras

The network structure is simple with two densely-connected layers where the second one is used to output the probability for each class. But the result is kind of amazing. The test error rate is closed to (even better than due to randomness) the Bayesian error rate \(26\%\).

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
#aim
use_condaenv("r-tensorflow")

# prepare model
fit_dnn <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = c(D * D)) %>%
  layer_dense(units = 10, activation = "softmax")

# print the structure of the NN
fit_dnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 128)                   6400        
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 10)                    1290        
## ===========================================================================
## Total params: 7,690
## Trainable params: 7,690
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_dnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D * D))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D * D))


tc_dnn <- proc.time()
# train
fit_dnn %>% fit(train_images, 
                train_labels, 
                epochs = 5, 
                batch_size = 100)
tc_dnn <- proc.time()-tc_dnn

# evaluate
err_dnn <- 1-evaluate(fit_dnn,test_images, test_labels)$acc

cat("Test error: ",err_dnn,"\n",sep="")
## Test error: 0.2653
cat("Computation time: ",tc_dnn[3],"s",sep="")
## Computation time: 0.57s
# prediction
#pred_dnn <- predict_classes(fit_dnn,test_images)

5.4 Convolutional neural networks with keras

The Convolutional neural networks (convnets) is a more advanced network struture that are widely used in computer vision application. It was heavily used in the ImageNet competition.

library(keras)
use_backend('tensorflow')
use_session_with_seed(112233,disable_gpu = TRUE,disable_parallel_cpu = FALSE)
## Set session seed to 112233 (disabled GPU)
# prepare model
fit_cnn <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu",
                input_shape = c(D, D, 1)) %>%
  layer_max_pooling_2d(pool_size = c(1, 1)) %>%
  layer_flatten() %>%
  layer_dense(units = 10, activation = "softmax")


# print the structure of the NN
fit_cnn
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## conv2d_1 (Conv2D)                (None, 5, 5, 32)              320         
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D)   (None, 5, 5, 32)              0           
## ___________________________________________________________________________
## flatten_1 (Flatten)              (None, 800)                   0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 10)                    8010        
## ===========================================================================
## Total params: 8,330
## Trainable params: 8,330
## Non-trainable params: 0
## ___________________________________________________________________________
# Set the estimation method
fit_cnn %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

# prepare data
train_labels <- to_categorical(LXyTr[,1],num_classes = 10)
test_labels <- to_categorical(LXyTe[,1],num_classes = 10)

train_images <- array_reshape(LXyTr[,-1], c(nrow(LXyTr), D,D,1))
test_images <- array_reshape(LXyTe[,-1],c(nrow(LXyTe), D,D,1))


# train
tc_cnn <- proc.time()
fit_cnn %>% fit(train_images, 
                train_labels, 
                epochs = 10, 
                batch_size = 100)
tc_cnn <- proc.time()-tc_cnn

# evaluate
err_cnn <- 1-evaluate(fit_cnn,test_images, test_labels)$acc

cat("Test error: ",err_cnn,"\n",sep="")
## Test error: 0.25948
cat("Computation time: ",tc_cnn[3],"s",sep="")
## Computation time: 1.22s
# prediction
#pred_cnn <- predict_classes(fit_cnn,test_images)

5.5 SVM

library(e1071)
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_svm <- proc.time()
fit_svm <- svm(Y~.,data = LXyTr_data)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'x1' and 'x2' and 'x3' and 'x4' and 'x5' and 'x6' and 'x7'
## and 'x8' and 'x9' and 'x10' and 'x11' and 'x12' and 'x13' and 'x14' and
## 'x15' and 'x21' and 'x22' and 'x24' and 'x26' and 'x28' and 'x29' and 'x35'
## and 'x36' and 'x37' and 'x38' and 'x39' and 'x40' and 'x41' and 'x42' and
## 'x43' and 'x44' and 'x45' and 'x46' and 'x47' and 'x48' and 'x49' constant.
## Cannot scale data.
tc_svm <- proc.time()-tc_svm

pred_svm <- predict(fit_svm,newdata = LXyTe_data)
err_svm <- mean(pred_svm!=LXyTe_data[,1])

cat("Test error: ",err_svm,"\n",sep="")
## Test error: 0.29434
cat("Computation time: ",tc_svm[3],"s",sep="")
## Computation time: 0.76s

5.6 Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
LXyTr_data <- data.frame(Y=factor(LXyTr[,1]),x=LXyTr[,-1])
LXyTe_data <- data.frame(Y=factor(LXyTe[,1]),x=LXyTe[,-1])
colnames(LXyTr_data) <- c("Y",paste0("x",1:D^2))
colnames(LXyTe_data) <- c("Y",paste0("x",1:D^2))

tc_rf <- proc.time()
fit_rf <- randomForest(Y~.,data = LXyTr_data)
tc_rf <- proc.time()-tc_rf

pred_rf <- predict(fit_rf,newdata = LXyTe_data)
err_rf <- mean(pred_rf!=LXyTe_data[,1])

cat("Test error: ",err_rf,"\n",sep="")
## Test error: 0.27016
cat("Computation time: ",tc_rf[3],"s",sep="")
## Computation time: 1.53s

5.7 XGBoost

Notice that here I did not use a validation set to tune the parameters.

library(xgboost)

params <- list(objective = "multi:softmax",
               num_class = 10,
               booster = "gbtree",
               eval_metric = "mlogloss",
               nthread = 4,
               eta = 0.1,
               max_depth = 6,
               min_child_weight = 1,
               gamma = 0,
               subsample = 1,
               colsample_bytree = 0.8,
               colsample_bylevel = 0.85,
               alpha = 0,
               lambda = 0.1)


xtrain <- xgb.DMatrix(data = LXyTr[,-1], label = LXyTr[,1])

tc_xgb <- proc.time()
fit_xgb <- xgb.train(params, data = xtrain, 
                     nrounds = 25, watchlist = list(train=xtrain), 
                     verbose=1, print_every_n = 5, 
                     early_stopping_rounds = 5)
## [1]  train-mlogloss:2.078792 
## Will train until train_mlogloss hasn't improved in 5 rounds.
## 
## [6]  train-mlogloss:1.442318 
## [11] train-mlogloss:1.157092 
## [16] train-mlogloss:0.991969 
## [21] train-mlogloss:0.894906 
## [25] train-mlogloss:0.840094
tc_xgb <- proc.time()-tc_xgb

pred_xgb <- predict(fit_xgb, LXyTe[,-1])

err_xgb <- mean(pred_xgb!=LXyTe[,1])

cat("Test error: ",err_xgb,"\n",sep="")
## Test error: 0.26152
cat("Computation time: ",tc_xgb[3],"s",sep="")
## Computation time: 0.67s
# Print variable importance
xgb.importance(feature_names = paste0("x",1:D^2),model=fit_xgb)
##     Feature         Gain       Cover   Frequency
##  1:     x19 0.1595346838 0.124100460 0.087486929
##  2:     x31 0.1373735257 0.119186924 0.112757058
##  3:     x16 0.1117691048 0.048299168 0.071976298
##  4:     x25 0.1096392778 0.125241670 0.122342280
##  5:     x17 0.1078339193 0.130417441 0.080167306
##  6:     x33 0.0925907102 0.095349595 0.127396305
##  7:     x23 0.0861323777 0.088199936 0.101951900
##  8:     x18 0.0824365969 0.049069532 0.041652144
##  9:     x27 0.0578763065 0.099597791 0.101603346
## 10:     x20 0.0458368155 0.075089264 0.075113280
## 11:     x30 0.0056231983 0.021240463 0.042349251
## 12:     x34 0.0027301204 0.019100623 0.025967236
## 13:     x32 0.0006233632 0.005107132 0.009236668

6 Summary

library(knitr)
err_table <- data.frame(Method=c("Multi-layer perceptron","Convnets","SVM","Random Forest","XGBoost"),
                        TestErr=c(err_dnn, err_cnn, err_svm, err_rf, err_xgb),
                        Time=c(tc_dnn[3], tc_cnn[3], tc_svm[3], tc_rf[3], tc_xgb[3]))

knitr::kable(err_table,
             caption = "Summary of the test error for each method")
Table 6.1: Summary of the test error for each method
Method TestErr Time
Multi-layer perceptron 0.26530 0.57
Convnets 0.25948 1.22
SVM 0.29434 0.76
Random Forest 0.27016 1.53
XGBoost 0.26152 0.67