1. Anatomy of a neural network

1.2 Layers

A layer is a data-processing module that takes one or more tensors as input and outputs one or more tensors. Different input tensor formats call for different layer types:

  • 2D tensors (samples, features): dense (fully connected) layers, layer_dense()

  • 3D tensors (samples, timesteps, features): recurrent layers, layer_lstm()

  • 4D tensors, e.g. images (samples, channels, height, width): 2D convolution layers, layer_conv_2d() (both sketched after the dense example below)

E.g. a dense layer expecting 2D input with 784 features:

layer <- layer_dense(units = 32, input_shape = c(784))
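For the 3D and 4D cases above, hedged sketches of the corresponding constructors (the unit and filter counts are arbitrary illustrations, not values from these notes):

# recurrent layer for 3D (samples, timesteps, features) input
layer <- layer_lstm(units = 32)

# 2D convolution layer for 4D image tensors
layer <- layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = "relu")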

1.3 Models: networks of layers

1.4 Loss functions and optimizers:

  • Loss function, aka objective function: the quantity that will be minimized during training

    • Two-class classification: binary crossentropy

    • Multiclass classification: categorical crossentropy

    • Regression: mse

    • Sequence learning: connectionist temporal classification

  • Optimizer: determines how the network will be updated based on the loss function. It implements a specific variant of SGD (see the compile() sketch below).
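A minimal sketch (hypothetical learning-rate and momentum values) of passing an explicit SGD variant to compile(), which is introduced in section 2.1:

# momentum SGD instead of a plain string shortcut such as "rmsprop"
model %>% compile(
  optimizer = optimizer_sgd(lr = 0.01, momentum = 0.9),
  loss = "binary_crossentropy",
  metrics = c("accuracy")
)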

2. Keras

Keras is a model-level library providing high-level building blocks for developing deep learning models. It relies on one of three backend implementations for its low-level tensor operations: TensorFlow, Theano, and Microsoft CNTK.

2.1 Keras workflow

    1. Define training data: input tensors and target tensors
    2. Define a model (a network of layers) that maps inputs to targets

either use keras_model_sequential() for linear stacks, or use the functional API to build arbitrary architectures.

# Sequential API: a linear stack of layers
model = keras_model_sequential() %>% 
  layer_dense(units = 32, input_shape = c(784)) %>%
  layer_dense(units = 10, activation = "softmax")

# Functional API: build arbitrary architectures from input/output tensors
input_tensor = layer_input(shape = c(784))
output_tensor = input_tensor %>% 
  layer_dense(units = 32, activation = "relu") %>% 
  layer_dense(units = 10, activation = "softmax")
model = keras_model(inputs = input_tensor, outputs = output_tensor)
    3. Configure the learning process by choosing a loss, an optimizer, and metrics to monitor
model %>% compile(
  optimizer = optimizer_rmsprop(lr = 0.0001),
  loss = "mse",
  metrics = c("accuracy")
)
    4. Iterate on the training data by calling fit()
model %>% fit(input_tensor, target_tensor, batch_size = 128, epochs = 10)

3. Binary classification

3.1 Load in data

3.2 Prepare the data

Each sample is a list of integers representing word indices, which needs to be converted into a 2D tensor. Two options:

  • Pad the lists so that they all have exactly the same length (a pad_sequences() sketch follows the vectorization code below)

  • One-hot encode, converting each list into a vector of 0s and 1s. E.g. [3, 5] becomes a 10,000-dimensional vector with 1s only at indices 3 and 5.

vectorize_sequences = function(sequences, dimension = 10000) {
  # one row per sample, one column per word index
  results = matrix(0, nrow = length(sequences), ncol = dimension) 
  for (i in 1:length(sequences)){
    results[i, sequences[[i]]] = 1   # set the columns of the word indices present in sample i to 1
  }
  results
}
x_train = vectorize_sequences(train_data) 
x_test = vectorize_sequences(test_data)
y_train = as.numeric(train_labels)
y_test = as.numeric(test_labels)
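The padding route mentioned above is not used in these notes; a minimal sketch with keras' pad_sequences() (maxlen = 256 is an arbitrary choice) would be:

# pad/truncate every review to the same length, giving a 2D integer tensor
x_train_padded = pad_sequences(train_data, maxlen = 256)
x_test_padded = pad_sequences(test_data, maxlen = 256)

Such padded integer sequences would normally feed an embedding or recurrent layer rather than the dense network used below.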

3.3 Build the network

Two things to consider:

  • How many layers

  • How many hidden units for each layer

Activation functions: used so the network is not limited to purely linear transformations.
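A tiny plain-R illustration (not keras code) of the relu activation used below: it zeroes out negative values, so stacked dense layers no longer compose into a single linear map.

relu = function(x) pmax(x, 0)
relu(c(-2, -0.5, 0, 1.5))   # 0.0 0.0 0.0 1.5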

For a binary classification problem, the best loss to use is binary_crossentropy (it measures the distance between the ground-truth distribution and the predicted distribution).
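A plain-R sketch of the quantity being measured, for a single sample with ground truth y (0 or 1) and predicted probability p:

binary_crossentropy = function(y, p) -(y * log(p) + (1 - y) * log(1 - p))
binary_crossentropy(y = 1, p = 0.9)   # small loss: confident and correct
binary_crossentropy(y = 1, p = 0.1)   # large loss: confident and wrong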

3.4 Validating and training

Epochs: number of passes over all samples in the training data. Batch size: number of samples used in each gradient update.

Training accuracy increases and training loss decreases as a result of SGD. That is not the case for validation: validation performance peaks after a few epochs and then degrades, which is a sign of overfitting.
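Once fit() has run (see the validation script below), the returned history object can be inspected to locate roughly where validation loss bottoms out; a minimal sketch, assuming validation loss is recorded as history$metrics$val_loss (the same history structure is used in the regression section later):

# epoch with the lowest validation loss, i.e. roughly where overfitting sets in
which.min(history$metrics$val_loss)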

library(keras)

## 1.1 Load in data
imdb = dataset_imdb(num_words = 10000)
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% imdb

# %<-% is the multi-assignment operator, used to unpack a list into a set of distinct variables

# ## Convert index back to english words ##
# word_index <- dataset_imdb_word_index()
# reverse_word_index <- names(word_index)
# names(reverse_word_index) <- word_index
# decoded_review <- sapply(train_data[[1]], function(index) {
#   word <- if (index >= 3) reverse_word_index[[as.character(index - 3)]]
#   if (!is.null(word)) word else "?" })

## 1.2 Encoding the integer sequences into a binary matrix
vectorize_sequences = function(sequences, dimension = 10000) {
  results = matrix(0, nrow = length(sequences), ncol = dimension) 
  for (i in 1:length(sequences)){
    results[i, sequences[[i]]] = 1 
  }
  results
}
x_train = vectorize_sequences(train_data) 
x_test = vectorize_sequences(test_data)
y_train = as.numeric(train_labels)
y_test = as.numeric(test_labels)

## 1.3 Define models
model = keras_model_sequential() %>%
  layer_dense(units = 16, activation = "relu", input_shape = c(10000)) %>% 
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

## 1.4 Compile the model
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("accuracy"))

# ## or passing them as function objects
# model %>% compile(
#   optimizer = optimizer_rmsprop(lr = 0.001),
#   loss = loss_binary_crossentropy,
#   metrics = metric_binary_accuracy
# )

## 1.5 Validation
val_indices = 1:10000
x_val = x_train[val_indices,] 
partial_x_train = x_train[-val_indices,]
y_val = y_train[val_indices]
partial_y_train = y_train[-val_indices]

history = model %>% fit(
  partial_x_train, partial_y_train,
  epochs = 20,
  batch_size = 512,
  validation_data = list(x_val, y_val)
)

plot(history)


## 1.6 Battle overfitting
## Due to overfitting, retrain a fresh model from scratch for fewer epochs.
model <- keras_model_sequential() %>%
  layer_dense(units = 16, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("accuracy")
)
model %>% fit(x_train, y_train, epochs = 4, batch_size = 512)
results = model %>% evaluate(x_test, y_test)


## 1.7 Prediction
model %>% predict(x_test[1:10,])
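## predict() returns sigmoid probabilities; a sketch of turning them into 0/1 class
## labels with the conventional 0.5 cut-off (preds is an illustrative name):
preds = model %>% predict(x_test[1:10, ])
as.integer(preds > 0.5)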

4. Multiclass classification problem

Note: there are two ways to handle the labels in a multiclass classification problem:

  • Encoding the labels via categorical encoding (also known as one-hot encoding) and using categorical_crossentropy as a loss function

  • Encoding the labels as integers and using the sparse_categorical_crossentropy loss function (a sketch follows this list)
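A sketch of that second route, which is not the one used in the script below; only the labels and the loss string change, and y_train_int is an illustrative name:

y_train_int = as.integer(train_labels)   # labels stay as plain integers 0..45

model %>% compile(
  optimizer = "rmsprop",
  loss = "sparse_categorical_crossentropy",
  metrics = c("accuracy")
)
# fit() is then called with y_train_int instead of one-hot encoded labels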

If you need to classify data into a large number of categories, you should avoid creating information bottlenecks in your network due to intermediate layers that are too small.
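For illustration, a hedged sketch of such a bottleneck for the 46-class problem below (the 4-unit middle layer is deliberately too small):

# 46 classes squeezed through a 4-dimensional intermediate layer:
# information needed to separate the topics is likely lost here
model = keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 4, activation = "relu") %>%
  layer_dense(units = 46, activation = "softmax")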

## 2.1 load the data
library(keras)
reuters = dataset_reuters(num_words = 10000)
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% reuters #labels 0 to 45

## 2.2 Data preparation
vectorize_sequences = function(sequences, dimension = 10000) {
  results = matrix(0, nrow = length(sequences), ncol = dimension) 
  for (i in 1:length(sequences))
    results[i, sequences[[i]]] <- 1 
  results
}
x_train = vectorize_sequences(train_data) 
x_test = vectorize_sequences(test_data)

one_hot_train_labels <- to_categorical(train_labels)
one_hot_test_labels <- to_categorical(test_labels)

### 2.3 Build model 

model = keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 46, activation = "softmax")
model %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)

### 2.4 Validation
val_indices = 1:1000
x_val = x_train[val_indices,] 
partial_x_train = x_train[-val_indices,]
y_val = one_hot_train_labels[val_indices,] 
partial_y_train = one_hot_train_labels[-val_indices,]

history <- model %>% fit(
  partial_x_train,
  partial_y_train,
  epochs = 20,
  batch_size = 512,
  validation_data = list(x_val, y_val)
)

plot(history)

### 2.5 Retrain with the chosen number of epochs
model = keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 46, activation = "softmax")
model %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)
history = model %>% fit(
  partial_x_train, partial_y_train,
  epochs = 9,
  batch_size = 512,
  validation_data = list(x_val, y_val)
)
results = model %>% evaluate(x_test, one_hot_test_labels)

### 2.6 Prediction
predictions = model %>% predict(x_test)
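## predict() returns a matrix of softmax probabilities (one row per sample, 46 columns);
## a sketch of recovering the predicted topic index (labels run 0..45, hence the -1):
predicted_classes = apply(predictions, 1, which.max) - 1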

5. Regression

The network for regression will end with a single unit and no activation (linear layer).

model %>% compile(
  optimizer = "rmsprop",
  loss = "mse",
  metrics = c("mae")
)

Here we use mean absolute error (MAE) as the metric and mean squared error (MSE) as the loss function.

Scale the data before training the network (feature-wise normalization, using statistics computed on the training data).

5.1 K-fold cross validation (CV)

Concept: with a small dataset, the validation score may have high variance depending on which points land in the validation split; averaging the score over K folds gives a more reliable estimate.

### 3.1 Load the data
library(keras)

dataset = dataset_boston_housing()
c(c(train_data, train_targets), c(test_data, test_targets)) %<-% dataset
# feature-wise normalization; reuse the training-set statistics for the test data
train_data = scale(train_data)
test_data = scale(test_data, center = attr(train_data, "scaled:center"),
                  scale = attr(train_data, "scaled:scale"))

### 3.2 Build the model
build_model = function(){
  # the same model gets instantiated fresh for every fold of the CV below
  model = keras_model_sequential() %>% 
    layer_dense(units = 64, activation = "relu", input_shape = dim(train_data)[[2]]) %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 1)
  # compile() modifies the model in place and returns it, so this is the return value
  model %>% compile(
    optimizer = "rmsprop",
    loss = "mse",
    metrics = c("mae")
  )
}

### 3.3 K fold cross validation

k = 4
indices = sample(1:nrow(train_data))
folds = cut(indices, breaks = k, labels = F)

num_epochs = 100
all_scores = c()

for(i in 1:k){
  cat("processing fold #", i, "\n")
  
  val_indices = which(folds == i, arr.ind = T)
  val_data = train_data[val_indices,]
  val_targets = train_targets[val_indices]
  
  partial_train_data = train_data[-val_indices,] 
  partial_train_targets = train_targets[-val_indices]
  
  model = build_model()
  
  model %>% fit(partial_train_data, partial_train_targets,
                epochs = num_epochs, batch_size = 1, verbose = 0)
  results = model %>% evaluate(val_data, val_targets, verbose = 0)
  all_scores = c(all_scores, results[2])   # second element of evaluate() output is the MAE
}

all_scores
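## per-fold scores vary; their mean is the more stable estimate K-fold CV is after
mean(all_scores)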

### 3.4 Another training run with more epochs, saving the per-epoch validation MAE

num_epochs = 500
all_mae_histories = NULL

for(i in 1:k){
  cat("processing fold #", i, "\n")
  
  val_indices = which(folds == i, arr.ind = T)
  val_data = train_data[val_indices,]
  val_targets = train_targets[val_indices]
  
  partial_train_data = train_data[-val_indices,] 
  partial_train_targets = train_targets[-val_indices]
  
  model = build_model()
  
  history = model %>% fit(
    partial_train_data, partial_train_targets,
    validation_data = list(val_data, val_targets),
    epochs = num_epochs, batch_size = 1, verbose = 0)
  
  mae_history = history$metrics$val_mean_absolute_error 
  all_mae_histories = rbind(all_mae_histories, mae_history)
}

library(ggplot2)   # needed so aes() and geom_line() are available
average_mae_history <- data.frame(
  epoch = seq_len(ncol(all_mae_histories)),
  validation_mae = apply(all_mae_histories, 2, mean)
)
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + geom_line()

### 3.5 Final model
model = build_model()
model %>% fit(train_data, train_targets,
              epochs = 80, batch_size = 16, verbose = 0)
result = model %>% evaluate(test_data, test_targets)