A Layer is a data processing module that takes as input one or more tensors and that outputs one or more tensors.
2D tensors (samples, features): dense (fully connected) layers, layer_dense()
3D tensors (samples, timesteps, features): recurrent layers, layer_lstm()
4D tensors, images (samples, channels, height, width): 2D convolution layers, layer_conv_2d()
E.g.
layer <- layer_dense(units = 32, input_shape = c(784))
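For illustration, hedged sketches of the other two layer types (the unit counts, filter sizes, and input shapes below are arbitrary; the conv example assumes the channels-last default):
layer <- layer_lstm(units = 32, input_shape = c(100, 64)) # 100 timesteps, 64 features per step
layer <- layer_conv_2d(filters = 32, kernel_size = c(3, 3), input_shape = c(28, 28, 1)) # 28x28 grayscale images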
Loss function, aka objective function: the quantity that will be minimized during training.
Two-class classification: binary crossentropy
Multiclass classification: categorical crossentropy
Regression: mse
Sequence learning: connectionist temporal classification
Optimizer: determines how the network will be updated based on the loss function. It implements a specific variant of SGD.
Keras is a model-level library providing high-level building blocks for developing deep learning models. It relies on three backend implementations for the low-level tensor operations: TensorFlow, Theano, and Microsoft CNTK.
Either use keras_model_sequential() for linear stacks of layers, or use the functional API to build arbitrary architectures.
model = keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(784)) %>%
layer_dense(units = 10, activation = "softmax")
input_tensor = layer_input(shape = c(784))
output_tensor = input_tensor %>%
layer_dense(units = 32, activation = "relu") %>%
layer_dense(units = 10, activation = "softmax")
model = keras_model(inputs = input_tensor, outputs = output_tensor)
model %>% compile(
optimizer = optimizer_rmsprop(lr = 0.0001),
loss = "mse",
metrics = c("accuracy")
)
Finally, train the model by calling fit():
model %>% fit(input_tensor, target_tensor, batch_size = 128, epochs = 10)
Each sample is a list of integers representing word indices, which needs to be converted to a 2D tensor. Two options:
Pad the lists so that they all have exactly the same length.
One-hot encode, converting each list into a vector of 0s and 1s. E.g. [3, 5] becomes a 10,000-dimensional vector that has 1s only at indices 3 and 5.
vectorize_sequences = function(sequences, dimension = 10000) {
results = matrix(0, nrow = length(sequences), ncol = dimension)
for (i in 1:length(sequences)){
results[i, sequences[[i]]] = 1
}
results
}
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = as.numeric(train_labels)
y_test = as.numeric(test_labels)
Two things to consider:
How many layers
How many hidden units for each layer
Activation function: needed to go beyond merely linear transformations; without one, stacked dense layers could only learn linear mappings.
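For intuition, the two activations used below can each be written in one line of base R (a sketch, not how Keras computes them):
relu <- function(x) pmax(0, x)              # clips negatives to 0
sigmoid <- function(x) 1 / (1 + exp(-x))    # squashes any value into (0, 1)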
For a binary classification problem, the best loss to use is binary_crossentropy (it measures the distance between the ground-truth distribution and the predicted distribution).
Epochs: number of iterations over all samples in the training data.
Batch size: number of samples processed in each iteration (gradient update).
Training accuracy increases and training loss decreases as a result of SGD. That is not the case for the validation metrics: validation performance peaks after a few epochs and then degrades, a sign of overfitting.
library(keras)
## 1.1 Load in data
imdb = dataset_imdb(num_words = 10000)
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% imdb
# %<-% is the multi-assignment operator: it unpacks a list into a set of distinct variables #
# ## Convert index back to english words ##
# word_index <- dataset_imdb_word_index()
# reverse_word_index <- names(word_index)
# names(reverse_word_index) <- word_index
# decoded_review <- sapply(train_data[[1]], function(index) {
# word <- if (index >= 3) reverse_word_index[[as.character(index - 3)]]
# if (!is.null(word)) word else "?" })
## 1.2 Encoding the integer sequences into a binary matrix
# Build an all-zero matrix and set entry [i, j] to 1 when word index j appears in review i
vectorize_sequences = function(sequences, dimension = 10000) {
results = matrix(0, nrow = length(sequences), ncol = dimension)
for (i in 1:length(sequences)){
results[i, sequences[[i]]] = 1
}
results
}
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = as.numeric(train_labels)
y_test = as.numeric(test_labels)
## 1.3 Define models
model = keras_model_sequential() %>%
layer_dense(units = 16, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 16, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
## 1.4 Compile the model
model %>% compile(
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = c("accuracy"))
# ## or passing them as function objects
# model %>% compile(
# optimizer = optimizer_rmsprop(lr = 0.001), loss = loss_binary_crossentropy,
# metrics = metric_binary_accuracy
# )
## 1.5 Validation
val_indices = 1:10000
x_val = x_train[val_indices,]
partial_x_train = x_train[-val_indices,]
y_val = y_train[val_indices]
partial_y_train = y_train[-val_indices]
history = model %>% fit(
partial_x_train, partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = list(x_val, y_val)
)
plot(history)
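history$metrics also keeps the raw per-epoch numbers, so the curves can be drawn manually if preferred (a minimal base R sketch):
plot(history$metrics$val_loss, type = "l", xlab = "epoch", ylab = "loss")
lines(history$metrics$loss, lty = 2)  # training loss for comparison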
## 1.6 Battle overfitting
## Due to overfitting, rebuild the model from scratch and train it for fewer epochs.
model <- keras_model_sequential() %>%
layer_dense(units = 16, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 16, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
model %>% compile(
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = c("accuracy")
)
model %>% fit(x_train, y_train, epochs = 4, batch_size = 512)
results = model %>% evaluate(x_test, y_test)
## 1.7 Prediction
model %>% predict(x_test[1:10,])
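predict() returns the sigmoid probabilities; a common follow-up (a sketch, with a 0.5 threshold assumed) is to turn them into hard 0/1 labels:
probs = model %>% predict(x_test[1:10,])
pred_class = ifelse(probs > 0.5, 1, 0)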
Note: There are two ways to handle the labels in a multiclass classification problem:
Encoding the labels via categorical encoding (also known as one-hot encoding) and using categorical_crossentropy as a loss function
Encoding the labels as integers and using the sparse_categorical_crossentropy loss function (see the sketch below)
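A minimal sketch of the second option, keeping the labels as integers (the rest of the model is unchanged):
model %>% compile(
optimizer = "rmsprop",
loss = "sparse_categorical_crossentropy",
metrics = c("accuracy")
)
# then fit with the raw integer labels (train_labels) instead of one_hot_train_labels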
If you need to classify data into a large number of categories, you should avoid creating information bottlenecks in your network due to intermediate layers that are too small.
## 2.1 Load the data
library(keras)
reuters = dataset_reuters(num_words = 10000)
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% reuters #labels 0 to 45
## 2.2 Data preparation
vectorize_sequences = function(sequences, dimension = 10000) {
results = matrix(0, nrow = length(sequences), ncol = dimension)
for (i in 1:length(sequences))
results[i, sequences[[i]]] <- 1
results
}
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
one_hot_train_labels <- to_categorical(train_labels)
one_hot_test_labels <- to_categorical(test_labels)
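to_categorical() simply expands each integer label into a one-hot row, e.g. (a quick sketch):
to_categorical(c(0, 2, 1))
# row 1 -> 1 0 0, row 2 -> 0 0 1, row 3 -> 0 1 0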
### 2.3 Build model
model = keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 46, activation = "softmax")
model %>% compile(
optimizer = "rmsprop",
loss = "categorical_crossentropy",
metrics = c("accuracy")
)
### 2.4 Validation
val_indices = 1:1000
x_val = x_train[val_indices,]
partial_x_train = x_train[-val_indices,]
y_val = one_hot_train_labels[val_indices,]
partial_y_train = one_hot_train_labels[-val_indices,]
history <- model %>% fit(
partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = list(x_val, y_val)
)
plot(history)
### 2.5 Retrain
model = keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 46, activation = "softmax")
model %>% compile(
optimizer = "rmsprop",
loss = "categorical_crossentropy",
metrics = c("accuracy")
)
history = model %>% fit( partial_x_train, partial_y_train,
epochs = 9,
batch_size = 512,
validation_data = list(x_val, y_val)
)
results = model %>% evaluate(x_test, one_hot_test_labels)
### 2.6 Prediction
predictions = model %>% predict(x_test)
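Each row of predictions is a probability distribution over the 46 topics; the predicted class is the index of the largest entry (a sketch; minus 1 because Reuters labels start at 0):
which.max(predictions[1,]) - 1   # predicted topic for the first test sample
sum(predictions[1,])             # each row sums to ~1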
The network for regression will end with a single unit and no activation (linear layer).
model %>% compile(
optimizer = "rmsprop",
loss = "mse",
metrics = c("mae")
)
Here we use mean absolute error (MAE) as the metric and mean squared error (MSE) as the loss function.
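MAE is simply the average absolute difference between predictions and targets; with hypothetical numbers:
mean(abs(c(21.5, 18.0) - c(20.0, 19.0)))  # = 1.25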
Scale the data before training the network.
Concept: with few samples, the validation score can have high variance depending on which points fall into the validation split, which is why k-fold cross-validation is used below.
### 3.1 Load the data
library(keras)
dataset = dataset_boston_housing()
c(c(train_data, train_targets), c(test_data, test_targets)) %<-% dataset
# Scale features using the training set's mean and sd (never statistics computed on the test set)
train_mean = apply(train_data, 2, mean)
train_sd = apply(train_data, 2, sd)
train_data = scale(train_data, center = train_mean, scale = train_sd)
test_data = scale(test_data, center = train_mean, scale = train_sd)
### 3.2 Build the model
build_model = function(){
model = keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu",input_shape = dim(train_data)[[2]]) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1)
model %>% compile(
optimizer = "rmsprop",
loss = "mse",
metrics = c("mae")
)
}
### 3.3 K fold cross validation
k = 4
indices = sample(1:nrow(train_data))
folds = cut(indices, breaks = k, labels = F)
num_epochs = 100
all_scores = c()
for(i in 1:k){
cat("processing fold #", i, "\n")
val_indices = which(folds == i, arr.ind = T)
val_data = train_data[val_indices,]
val_targets = train_targets[val_indices]
partial_train_data = train_data[-val_indices,]
partial_train_targets = train_targets[-val_indices]
model = build_model()
model %>% fit(partial_train_data, partial_train_targets,
epochs = num_epochs, batch_size = 1, verbose = 0)
results = model %>% evaluate(val_data, val_targets, verbose = 0)
all_scores = c(all_scores, results[2])
}
all_scores
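The per-fold scores vary, so the number of interest is their average:
mean(all_scores)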
### 3.4 Another training run with more epochs, saving per-fold validation history
num_epochs = 500
all_mae_histories = NULL
for(i in 1:k){
cat("processing fold #", i, "\n")
val_indices = which(folds == i, arr.ind = T)
val_data = train_data[val_indices,]
val_targets = train_targets[val_indices]
partial_train_data = train_data[-val_indices,]
partial_train_targets = train_targets[-val_indices]
model = build_model()
history = model %>% fit(
partial_train_data, partial_train_targets,
validation_data = list(val_data, val_targets),
epochs = num_epochs, batch_size = 1, verbose = 0)
mae_history = history$metrics$val_mean_absolute_error  # may be named val_mae in newer Keras versions
all_mae_histories = rbind(all_mae_histories, mae_history)
}
average_mae_history <- data.frame(
epoch = 1:ncol(all_mae_histories),
validation_mae = apply(all_mae_histories, 2, mean)
)
library(ggplot2)
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + geom_line()
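With batch_size = 1 the epoch-to-epoch curve is noisy; a smoothed version (a sketch using ggplot2's default smoother) makes the best epoch easier to spot:
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + geom_smooth()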
### 3.5 Final model
model = build_model()
model %>% fit(train_data, train_targets,
epochs = 80, batch_size = 16, verbose = 0)
result = model %>% evaluate(test_data, test_targets)