Streamlined Machine Learning Modeling in R
1. Basic modeling in tidyverse
Before even going to tidymodels, the tidyverse packages provide ways to streamline fitting common statistical models.
1.1 tidyverse modeling workflow
- Make a list column: nest()
- Work with list columns: map(), e.g. map(.x, .f = ~mean(.x)); combines well with broom, Metrics, rsample, etc.
- Simplify the list columns: unnest()
- Model presentation with broom::tidy() and friends:
  - tidy() for model coefficients
  - glance() for a one-row summary of the model, e.g. R^2
  - augment() adds prediction columns to the original data
A minimal sketch of this pattern follows the list.
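The sketch below (my own, not from the original notes) uses the built-in mtcars data: nest by cylinder group, map an lm() over the groups, and unnest the tidied coefficients.
library(tidyverse)
library(broom)
# Nest mtcars by cyl, fit lm(mpg ~ wt) per group, then unnest the tidy() output
mtcars %>%
  group_by(cyl) %>%
  nest() %>%
  mutate(model = map(data, ~lm(mpg ~ wt, data = .x)),
         coef = map(model, ~tidy(.x))) %>%
  unnest(coef)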
library(tidyverse)
library(tidymodels)
gapminder = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/gapminder.rds")
## Step one is to make a list column based on the grouping variable
nested = gapminder %>% group_by(country) %>% nest()

## Example 1: calculating the mean of a column
nested %>%
  mutate(pop_mean = map(data, ~mean(.x$population))) %>%
  unnest(pop_mean)
## # A tibble: 77 × 3
## # Groups: country [77]
## country data pop_mean
## <fct> <list> <dbl>
## 1 Algeria <tibble [52 × 6]> 23129438.
## 2 Argentina <tibble [52 × 6]> 30783053.
## 3 Australia <tibble [52 × 6]> 16074837.
## 4 Austria <tibble [52 × 6]> 7746272.
## 5 Bangladesh <tibble [52 × 6]> 97649407.
## 6 Belgium <tibble [52 × 6]> 9983596.
## 7 Benin <tibble [52 × 6]> 4995497.
## 8 Bolivia <tibble [52 × 6]> 6482842.
## 9 Botswana <tibble [52 × 6]> 1230400.
## 10 Brazil <tibble [52 × 6]> 137294617.
## # … with 67 more rows
## Example 2: building a model with map()
### 2.1 Coefficients
lm_reg_coef = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(coef = map(model, ~tidy(.x))) %>%
  unnest(coef)
### 2.2 Summary statistics
lm_reg_inference = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(specs = map(model, ~glance(.x))) %>%
  unnest(specs)
### 2.3 Augmented data frames with predicted values, which can be used directly for checking predictions
lm_reg_pred = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(augmented = map(model, ~augment(.x))) %>%
  unnest(augmented)

lm_reg_pred %>% filter(country == "Italy") %>%
  ggplot(aes(x = year, y = life_expectancy)) +
  geom_point() +
  geom_line(aes(y = .fitted), color = "red")
1.2 The sampling workflow
Test and train split
gap_split = initial_split(gapminder, prop = 0.75)
training_data = training(gap_split)
testing_data = testing(gap_split)
Cross-validation creation and performance
library(Metrics)
## Create the CV splits as columns of a tibble
cv_split = vfold_cv(training_data, v = 3)
cv_data = cv_split %>%
  mutate(train = map(splits, ~training(.x)),
         validate = map(splits, ~testing(.x)))
## Train a model on each training fold
cv_models_lm = cv_data %>%
  mutate(model = map(train, ~lm(formula = life_expectancy ~ ., data = .x)))
## Predictions for each validation fold
cv_prep_lm = cv_models_lm %>%
  mutate(validate_actual = map(validate, ~.x$life_expectancy),
         validate_predicted = map2(model, validate, ~predict(.x, .y)))
cv_eval_lm = cv_prep_lm %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted,
                                 ~mae(actual = .x, predicted = .y)))
cv_eval_lm
## # 3-fold cross-validation
## # A tibble: 3 × 8
## splits id train validate model validate_…¹ valid…² valid…³
## <list> <chr> <list> <list> <list> <list> <list> <dbl>
## 1 <split [2002/1001]> Fold1 <tibble> <tibble> <lm> <dbl> <dbl> 1.51
## 2 <split [2002/1001]> Fold2 <tibble> <tibble> <lm> <dbl> <dbl> 1.55
## 3 <split [2002/1001]> Fold3 <tibble> <tibble> <lm> <dbl> <dbl> 1.45
## # … with abbreviated variable names ¹validate_actual, ²validate_predicted,
## # ³validate_mae
1.3 A Random Forest model using tidyverse for regression
Fit an RF in ranger with no tuning
library(ranger)
cv_models_rf = cv_data %>%
  mutate(model = map(train, ~ranger(formula = life_expectancy ~ .,
                                    data = .x, seed = 42)))
cv_prep_rf = cv_models_rf %>%
  mutate(validate_predicted = map2(model, validate,
                                   ~predict(.x, .y)$predictions))
Fit an RF in ranger with hyperparameter tuning
cv_tune = cv_data %>%
  crossing(mtry = 1:5)
head(cv_tune)
## # A tibble: 6 × 5
## splits id train validate mtry
## <list> <chr> <list> <list> <int>
## 1 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
## 2 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 2
## 3 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 3
## 4 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 4
## 5 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 5
## 6 <split [2002/1001]> Fold2 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
cv_model_tunerf = cv_tune %>%
  mutate(model = map2(train, mtry, ~ranger(formula = life_expectancy ~ ., data = .x, mtry = .y))) %>%
  mutate(validate_predicted = map2(model, validate, ~predict(.x, .y)$predictions),
         validate_actual = map(validate, ~.x$life_expectancy)) %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
knitr::kable(cv_model_tunerf %>%
               group_by(mtry) %>%
               summarise(mean_mae = mean(validate_mae)))
mtry | mean_mae |
---|---|
1 | 0.9682711 |
2 | 0.8764128 |
3 | 0.8768114 |
4 | 0.8763115 |
5 | 0.8821423 |
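The table suggests mtry = 4 gives the lowest mean MAE. As a follow-up sketch (my own, not in the original notes), the tuned value can be refit on the full training data and scored on the held-out test set:
# Refit with the selected mtry on all training data, then score the test set
best_rf = ranger(formula = life_expectancy ~ .,
                 data = training_data, mtry = 4, seed = 42)
mae(actual = testing_data$life_expectancy,
    predicted = predict(best_rf, testing_data)$predictions)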
1.4 A logistic regression model using tidyverse for classification
attrition = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/attrition.rds")

attrition_split = initial_split(attrition, prop = 0.75)
training_data = training(attrition_split)
testing_data = testing(attrition_split)

cv_split = vfold_cv(training_data, v = 5)
cv_data = cv_split %>%
  mutate(train = map(splits, ~training(.x)),
         validate = map(splits, ~testing(.x)))
cv_models_lr = cv_data %>%
  mutate(model = map(train, ~glm(formula = Attrition ~ .,
                                 data = .x, family = "binomial")))
cv_models_lr_pred = cv_models_lr %>%
  mutate(
    # Prepare a binary vector of actual Attrition values in validate
    validate_actual = map(validate, ~.x$Attrition == "Yes"),
    # Prepare a binary vector of predicted Attrition values for validate
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response") > 0.5)
  ) %>%
  mutate(
    # Calculate accuracy and recall
    Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
    Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
  ) %>%
  unnest(Accuracy) %>%
  unnest(Recall)
knitr::kable(cv_models_lr_pred %>% select(id, Accuracy, Recall))
id | Accuracy | Recall |
---|---|---|
Fold1 | 0.8552036 | 0.3636364 |
Fold2 | 0.8778281 | 0.4473684 |
Fold3 | 0.8772727 | 0.3888889 |
Fold4 | 0.8681818 | 0.4285714 |
Fold5 | 0.8272727 | 0.3428571 |
1.5 A Random Forest model using tidyverse for classification
cv_tune = cv_data %>%
  crossing(mtry = c(2, 4, 8, 16))
cv_models_rf = cv_tune %>%
  mutate(model = map2(train, mtry, ~ranger(formula = Attrition ~ .,
                                           data = .x, mtry = .y,
                                           num.trees = 100, seed = 42)))
cv_models_rf_pred = cv_models_rf %>%
  mutate(
    # Prepare a binary vector of actual Attrition values in validate
    validate_actual = map(validate, ~.x$Attrition == "Yes"),
    # Prepare a binary vector of predicted Attrition values for validate
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions == "Yes")
  ) %>%
  mutate(
    # Calculate accuracy and recall
    Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
    Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
  ) %>%
  unnest(Accuracy) %>%
  unnest(Recall)
knitr::kable(cv_models_rf_pred %>% select(mtry, Accuracy, Recall) %>%
               group_by(mtry) %>% summarise_all(mean))
mtry | Accuracy | Recall |
---|---|---|
2 | 0.8493624 | 0.1127804 |
4 | 0.8520814 | 0.1519070 |
8 | 0.8529905 | 0.1865930 |
16 | 0.8538996 | 0.2032597 |
2. Supervised modeling framework in tidymodels
tidymodels roadmap (taken from DataCamp):
- rsample splits training and testing
- recipes for feature engineering
- parsnip fits the model with 3 components:
  - model type (e.g. linear regression)
  - engine (the underlying R package)
  - mode (regression or classification)
- yardstick evaluates model performance, i.e. metrics functions
A minimal sketch tying the four packages together is shown below.
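This sketch (my own, not from the notes) touches all four packages on the built-in mpg data:
library(tidymodels)
# rsample: split the data
mpg_split = initial_split(mpg, prop = 0.75)
# recipes: a trivial preprocessing recipe
mpg_rec = recipe(hwy ~ cty + displ, data = training(mpg_split))
# parsnip: model type + engine + mode
mpg_spec = linear_reg() %>% set_engine('lm') %>% set_mode('regression')
# Combine in a workflow, fit, and evaluate with a yardstick metric
wf_fit = workflow() %>% add_recipe(mpg_rec) %>% add_model(mpg_spec) %>%
  fit(data = training(mpg_split))
predict(wf_fit, testing(mpg_split)) %>%
  bind_cols(testing(mpg_split)) %>%
  rmse(truth = hwy, estimate = .pred)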
2.1 Linear regression
# Split training and testing
mpg_split = initial_split(mpg, prop = 0.75, strata = hwy)
mpg_training = mpg_split %>% training()
mpg_test = mpg_split %>% testing()
library(tidymodels)
# Set up the model
lm_model = linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression')
# Fit the model
lm_fit = lm_model %>%
  fit(hwy ~ cty, data = mpg_training)
# Model coefficients
tidy(lm_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.695 0.563 1.24 2.18e- 1
## 2 cty 1.35 0.0326 41.4 4.39e-91
# Prediction on test data
mpg_test_results = mpg_test %>%
  select(hwy, cty) %>%
  bind_cols(
    lm_fit %>% predict(new_data = mpg_test)
  )
# Performance
mpg_test_results %>% rsq(truth = hwy, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.923
# Visualization
ggplot(mpg_test_results, aes(x = hwy, y = .pred)) +
geom_point() +
geom_abline(color = 'blue', linetype = 2) +
coord_obs_pred() +
labs(title = 'R-Squared Plot',
y = 'Predicted Highway MPG',
x = 'Actual Highway MPG')
The above procedure can be streamlined with last_fit(), which takes the model spec, a formula, and the splitting object. THIS SHOULD BE USED if there is no need to inspect coefficients. Streamlined approach:
lm_last_fit = lm_model %>%
  last_fit(hwy ~ cty,
           split = mpg_split)
lm_last_fit %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.86 Preprocessor1_Model1
## 2 rsq standard 0.923 Preprocessor1_Model1
lm_last_fit %>% collect_predictions()
## # A tibble: 61 × 5
## id .pred .row hwy .config
## <chr> <dbl> <int> <int> <chr>
## 1 train/test split 29.1 4 30 Preprocessor1_Model1
## 2 train/test split 25.0 7 27 Preprocessor1_Model1
## 3 train/test split 22.3 9 25 Preprocessor1_Model1
## 4 train/test split 23.7 13 25 Preprocessor1_Model1
## 5 train/test split 23.7 14 25 Preprocessor1_Model1
## 6 train/test split 23.7 17 25 Preprocessor1_Model1
## 7 train/test split 22.3 18 23 Preprocessor1_Model1
## 8 train/test split 15.6 20 15 Preprocessor1_Model1
## 9 train/test split 18.3 22 17 Preprocessor1_Model1
## 10 train/test split 16.9 23 17 Preprocessor1_Model1
## # … with 51 more rows
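If coefficients are still needed after last_fit(), the fitted parsnip model can usually be pulled out of the result; a hedged sketch (extract_fit_parsnip() for last_fit objects is available in recent versions of tune):
# Pull the underlying fitted model out of the last_fit() result and tidy it
lm_last_fit %>%
  extract_fit_parsnip() %>%
  tidy()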
2.2 Classification
leads_df = readRDS("~/Dropbox/Coursework/Machine Learning in R/Modeling with tidymodels in R/datasets/leads_df.rds")
leads_split = initial_split(leads_df, prop = 0.75, strata = purchased)
leads_training = leads_split %>% training()
leads_test = leads_split %>% testing()
# Set up the model
logistic_model = logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')
# Fit the model
logistic_fit = logistic_model %>%
  fit(purchased ~ total_visits + total_time,
      data = leads_training)
# Prediction
class_preds = logistic_fit %>%
  predict(new_data = leads_test,
          type = 'class')
prob_preds = logistic_fit %>%
  predict(new_data = leads_test,
          type = 'prob')
leads_results = leads_test %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)
In tidymodels classification models, the outcome has to be a factor with the positive class as the first level.
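If the positive class is not already the first level, relevel the factor before splitting; a small sketch, assuming the levels are 'yes' and 'no' as in the outputs below:
# Make 'yes' the first (positive) level of the outcome factor
leads_df$purchased = factor(leads_df$purchased, levels = c('yes', 'no'))
levels(leads_df$purchased)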
# Get selected metrics
custom_metrics = metric_set(sens, spec)

custom_metrics(leads_results,
               truth = purchased,
               estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.7
## 2 spec binary 0.858
# Get all metrics from the confusion matrix
conf_mat(leads_results, truth = purchased,
estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
# Visualize the confusion matrix
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'heatmap')
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'mosaic')
# ROC
leads_results %>%
  roc_curve(truth = purchased, .pred_yes) %>%
  autoplot()
# AUC
roc_auc(leads_results,truth = purchased,.pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.852
Streamlined Approach:
logistic_last_fit = logistic_model %>%
  last_fit(purchased ~ total_visits + total_time,
           split = leads_split)
logistic_last_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.801 Preprocessor1_Model1
## 2 roc_auc binary 0.852 Preprocessor1_Model1
last_fit_results = logistic_last_fit %>%
  collect_predictions()
conf_mat(last_fit_results, truth = purchased,
         estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
2.3 Feature engineering
A recipe allows:
- Defining column roles (predictor or outcome)
- Declaring variable data types (numeric or categorical)
- Specifying preprocessing steps via step_*()
2.3.1 Overall procedure
# Build a recipe
leads_log_rec = recipe(purchased ~ ., data = leads_training) %>%
  step_log(total_time, base = 10)
leads_log_rec %>%
  summary()
## # A tibble: 7 × 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 total_visits numeric predictor original
## 2 total_time numeric predictor original
## 3 pages_per_visit numeric predictor original
## 4 total_clicks numeric predictor original
## 5 lead_source nominal predictor original
## 6 us_location nominal predictor original
## 7 purchased nominal outcome original
# Train based on the recipe
leads_log_rec_prep = leads_log_rec %>% prep(training = leads_training)

# Transform the training data
leads_log_rec_prep %>%
  bake(new_data = NULL)
## # A tibble: 996 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 7 2.68 2.33 21 organic… west no
## 2 3 2.85 3 28 organic… west no
## 3 3 2.22 3 32 direct_… southe… no
## 4 3 0.845 3 23 organic… west no
## 5 6 2.75 6 48 organic… southe… no
## 6 4 2.37 4 31 organic… southe… no
## 7 2 2.37 1 13 organic… southe… no
## 8 3 3.01 1.5 17 organic… southe… no
## 9 2 2.95 2 25 organic… southe… no
## 10 2 2.75 2 22 organic… southw… no
## # … with 986 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
# Transform the test data
leads_log_rec_prep %>%
  bake(new_data = leads_test)
## # A tibble: 332 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 8 2 2.67 24 direct_… west no
## 2 5 2.36 2.5 25 email southe… no
## 3 4 2.25 4 37 direct_… west no
## 4 2 3.10 2 26 email midwest no
## 5 2 2.29 2 20 organic… west no
## 6 2 2.87 2 26 affilia… southw… yes
## 7 4 3.11 4 37 direct_… northe… yes
## 8 2 1.20 1 12 email midwest no
## 9 9 3.01 1.5 12 direct_… west yes
## 10 2 2.24 2 20 affilia… southw… no
## # … with 322 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
2.3.2 Treatment of numeric variables
- Correlated variables
leads_training %>%
  select_if(is.numeric) %>%
  cor()
## total_visits total_time pages_per_visit total_clicks
## total_visits 1.00000000 0.026421560 0.48605494 0.463329369
## total_time 0.02642156 1.000000000 0.01400162 0.003530897
## pages_per_visit 0.48605494 0.014001621 1.00000000 0.963031706
## total_clicks 0.46332937 0.003530897 0.96303171 1.000000000
## For each pair of highly correlated variables, one will be removed
leads_cor_rec = recipe(purchased ~ .,
                       data = leads_training) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.9)
processed_train = leads_cor_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = NULL)
processed_test = leads_cor_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = leads_test)
- Normalization
leads_norm_rec = recipe(purchased ~ .,
                        data = leads_training) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.9) %>%
  step_normalize(all_numeric())
leads_norm_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = leads_test)
## # A tibble: 332 × 6
## total_visits total_time total_clicks lead_source us_location purchased
## <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 0.974 -0.993 -0.334 direct_traffic west no
## 2 0.147 -0.754 -0.260 email southeast no
## 3 -0.129 -0.849 0.622 direct_traffic west no
## 4 -0.680 1.20 -0.187 email midwest no
## 5 -0.680 -0.819 -0.628 organic_search west no
## 6 -0.680 0.202 -0.187 affiliate southwest yes
## 7 -0.129 1.22 0.622 direct_traffic northeast yes
## 8 -0.680 -1.15 -1.22 email midwest no
## 9 1.25 0.731 -1.22 direct_traffic west yes
## 10 -0.680 -0.859 -0.628 affiliate southwest no
## # … with 322 more rows
2.3.3 Treatment of nominal variables
- Create dummy variables
recipe(purchased ~ ., data = leads_training) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
prep(training = leads_training) %>%
bake(new_data = leads_test)
## # A tibble: 332 × 12
## total_visits total_…¹ pages…² total…³ purch…⁴ lead_…⁵ lead_…⁶ lead_…⁷ us_lo…⁸
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 8 100 2.67 24 no 0 0 1 0
## 2 5 228 2.5 25 no 1 0 0 1
## 3 4 177 4 37 no 0 0 1 0
## 4 2 1273 2 26 no 1 0 0 0
## 5 2 193 2 20 no 0 1 0 0
## 6 2 739 2 26 yes 0 0 0 0
## 7 4 1286 4 37 yes 0 0 1 0
## 8 2 16 1 12 no 1 0 0 0
## 9 9 1022 1.5 12 yes 0 0 1 0
## 10 2 172 2 20 no 0 0 0 0
## # … with 322 more rows, 3 more variables: us_location_midwest <dbl>,
## # us_location_southwest <dbl>, us_location_west <dbl>, and abbreviated
## # variable names ¹total_time, ²pages_per_visit, ³total_clicks, ⁴purchased,
## # ⁵lead_source_email, ⁶lead_source_organic_search,
## # ⁷lead_source_direct_traffic, ⁸us_location_southeast
2.4 Machine learning workflow
2.4.1 A full picture
Data resampling
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
Model specification
logistic_model = logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')
Feature engineering
# Create recipe
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
# Train recipe
leads_recipe_prep = leads_recipe %>%
  prep(training = leads_training)
leads_training_prep = leads_recipe_prep %>%
  bake(new_data = NULL)
leads_test_prep = leads_recipe_prep %>%
  bake(new_data = leads_test)
Model fitting and prediction
logistic_fit = logistic_model %>%
  fit(purchased ~ .,
      data = leads_training_prep)
class_preds = predict(logistic_fit,
                      new_data = leads_test_prep,
                      type = 'class')
prob_preds = predict(logistic_fit,
                     new_data = leads_test_prep,
                     type = 'prob')
leads_results = leads_test %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)
Model evaluation
leads_results %>%
  conf_mat(truth = purchased,
           estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.822
## 2 kap binary 0.601
## 3 sens binary 0.675
## 4 spec binary 0.906
## 5 ppv binary 0.802
## 6 npv binary 0.831
## 7 mcc binary 0.606
## 8 j_index binary 0.581
## 9 bal_accuracy binary 0.790
## 10 detection_prevalence binary 0.304
## 11 precision binary 0.802
## 12 recall binary 0.675
## 13 f_meas binary 0.733
roc_auc(leads_results,truth = purchased,.pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.857
2.4.2 A decision tree example to combine model and recipe
dt_model = decision_tree() %>%
  set_engine('rpart') %>%
  set_mode('classification')
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
Combine the model and the recipe
# Create the workflow
leads_wkfl = workflow() %>%
  add_model(dt_model) %>%
  add_recipe(leads_recipe)
leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Computational engine: rpart
# Fit the model using the streamlined approach
leads_wkfl_fit = leads_wkfl %>%
  last_fit(split = leads_split)
leads_wkfl_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.816 Preprocessor1_Model1
## 2 roc_auc binary 0.846 Preprocessor1_Model1
leads_wkfl_preds = leads_wkfl_fit %>%
  collect_predictions()
leads_wkfl_preds
## # A tibble: 332 × 7
## id .pred_yes .pred_no .row .pred_class purchased .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.132 0.868 3 no no Preprocessor…
## 2 train/test split 0.132 0.868 4 no no Preprocessor…
## 3 train/test split 0.132 0.868 8 no no Preprocessor…
## 4 train/test split 0.132 0.868 9 no no Preprocessor…
## 5 train/test split 0.132 0.868 10 no no Preprocessor…
## 6 train/test split 0.132 0.868 22 no no Preprocessor…
## 7 train/test split 0.757 0.243 24 yes yes Preprocessor…
## 8 train/test split 0.132 0.868 29 no no Preprocessor…
## 9 train/test split 0.757 0.243 37 yes yes Preprocessor…
## 10 train/test split 0.757 0.243 38 yes yes Preprocessor…
## # … with 322 more rows
# Evaluation
leads_metrics = metric_set(roc_auc, sens, spec)
leads_wkfl_preds %>%
  leads_metrics(truth = purchased,
                estimate = .pred_class,
                .pred_yes)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.683
## 2 spec binary 0.892
## 3 roc_auc binary 0.846
2.4.3 Cross validation
set.seed(214)
leads_folds = vfold_cv(leads_training,
                       v = 10,
                       strata = purchased)
leads_rs_fit = leads_wkfl %>%
  fit_resamples(resamples = leads_folds,
                metrics = leads_metrics)
## Or:
# leads_rs_fit = fit_resamples(
# dt_model,
# purchased ~ .,
# resamples = leads_folds,
# metrics = leads_metrics)
# Get the mean across CV folds
leads_rs_fit %>%
  collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 roc_auc binary 0.794 10 0.0123 Preprocessor1_Model1
## 2 sens binary 0.672 10 0.0175 Preprocessor1_Model1
## 3 spec binary 0.846 10 0.0127 Preprocessor1_Model1
# Get the metrics for each CV fold
rs_metrics = leads_rs_fit %>%
  collect_metrics(summarize = FALSE)
rs_metrics %>%
  group_by(.metric) %>%
  summarize(min = min(.estimate),
            median = median(.estimate),
            max = max(.estimate),
            mean = mean(.estimate),
            sd = sd(.estimate))
## # A tibble: 3 × 6
## .metric min median max mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 roc_auc 0.738 0.803 0.859 0.794 0.0390
## 2 sens 0.556 0.681 0.75 0.672 0.0552
## 3 spec 0.794 0.841 0.906 0.846 0.0402
2.4.4 Hyperparameter tuning in tidymodels
Set up the model to label the hyperparameters to be tuned:
dt_tune_model = decision_tree(cost_complexity = tune(),
                              tree_depth = tune(),
                              min_n = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')
dt_tune_model
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
leads_tune_wkfl = leads_wkfl %>%
  update_model(dt_tune_model)
leads_tune_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
Grid search - random
set.seed(214)
dt_grid = grid_random(parameters(dt_tune_model), size = 5)

dt_tuning = leads_tune_wkfl %>%
  tune_grid(resamples = leads_folds,
            grid = dt_grid,
            metrics = leads_metrics)
## Overall exploration of results
# dt_tuning %>%
#   collect_metrics()
dt_tuning %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == 'roc_auc') %>%
  group_by(id) %>%
  summarize(min_roc_auc = min(.estimate),
            median_roc_auc = median(.estimate),
            max_roc_auc = max(.estimate))
## # A tibble: 10 × 4
## id min_roc_auc median_roc_auc max_roc_auc
## <chr> <dbl> <dbl> <dbl>
## 1 Fold01 0.768 0.806 0.807
## 2 Fold02 0.736 0.806 0.838
## 3 Fold03 0.815 0.844 0.854
## 4 Fold04 0.799 0.807 0.837
## 5 Fold05 0.633 0.745 0.768
## 6 Fold06 0.817 0.861 0.874
## 7 Fold07 0.695 0.799 0.811
## 8 Fold08 0.759 0.799 0.860
## 9 Fold09 0.739 0.818 0.845
## 10 Fold10 0.727 0.738 0.775
Grid search - regular
dt_grid = grid_regular(parameters(dt_tune_model), levels = 3)
Show and select best model
# Display the best model from tuning
dt_tuning %>% show_best(metric = 'roc_auc', n = 5)
## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000758 14 39 roc_auc binary 0.820 10 0.0128 Prepro…
## 2 0.00380 5 36 roc_auc binary 0.805 10 0.0135 Prepro…
## 3 0.000000600 3 5 roc_auc binary 0.798 10 0.00935 Prepro…
## 4 0.0243 5 34 roc_auc binary 0.791 10 0.0124 Prepro…
## 5 0.00000443 11 8 roc_auc binary 0.767 10 0.0228 Prepro…
## # … with abbreviated variable name ¹.estimator
# Select the best model
best_dt_model = dt_tuning %>%
  select_best(metric = 'roc_auc')
best_dt_model
## # A tibble: 1 × 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000758 14 39 Preprocessor1_Model1
# Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
  finalize_workflow(best_dt_model)
final_leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 7.58290839567418e-08
## tree_depth = 14
## min_n = 39
##
## Computational engine: rpart
leads_final_fit = final_leads_wkfl %>%
  last_fit(split = leads_split)
leads_final_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.810 Preprocessor1_Model1
## 2 roc_auc binary 0.855 Preprocessor1_Model1
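To reuse the tuned model on genuinely new data, the trained workflow can be pulled out of the last_fit() result; a hedged sketch (extract_workflow() for last_fit objects is available in recent versions of tune):
# Extract the workflow trained on the full training set; its recipe is baked in,
# so predict() can be called on raw new data
final_trained_wkfl = leads_final_fit %>% extract_workflow()
predict(final_trained_wkfl, new_data = leads_test)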
2.5 Final Overall Procedure
# A full picture
rm(list = ls())  # clear the workspace; note leads_df must be re-read before running the steps below
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
# 2. Initial model set up, indicating parameters to be tuned
dt_tune_model = decision_tree(cost_complexity = tune(),
                              tree_depth = tune(),
                              min_n = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')
leads_metrics = metric_set(roc_auc, sens, spec)

# 3. Set up the recipe
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
# 4. Create the cross-validation folds
leads_folds = vfold_cv(leads_training,
                       v = 10,
                       strata = purchased)
# 5. Create the ML workflow
leads_tune_wkfl = workflow() %>%
  add_model(dt_tune_model) %>%
  add_recipe(leads_recipe)
# 6. Grid search tuning
dt_grid = grid_random(extract_parameter_set_dials(dt_tune_model), size = 5)

dt_tuning = leads_tune_wkfl %>%
  tune_grid(resamples = leads_folds,
            grid = dt_grid,
            metrics = leads_metrics)
# 7. Evaluate the grid search
dt_tuning %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == 'roc_auc') %>%
  group_by(id) %>%
  summarize(min_roc_auc = min(.estimate),
            median_roc_auc = median(.estimate),
            max_roc_auc = max(.estimate))
# 8. Select the best model
best_dt_model = dt_tuning %>%
  select_best(metric = 'roc_auc')
# 9. Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
  finalize_workflow(best_dt_model)
final_leads_wkfl
# 10. Apply the model to the test data and evaluate the test set performance
leads_final_fit = final_leads_wkfl %>%
  last_fit(split = leads_split)
conf_mat(leads_final_fit$.predictions[[1]], truth = purchased,
         estimate = .pred_class) %>% summary()
leads_final_fit %>% collect_metrics()
# A full picture with xgboost
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
# 2. Set up the tuning specs
boost_spec = boost_tree(
  trees = 500,
  learn_rate = tune(),
  tree_depth = tune(),
  sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
# 3. Create the tuning grid
tunegrid_boost = grid_regular(extract_parameter_set_dials(boost_spec),
                              levels = 2)
# 4. Tuning
tune_results = tune_grid(
  boost_spec,
  purchased ~ .,
  resamples = vfold_cv(leads_training, v = 6),
  grid = tunegrid_boost,
  metrics = metric_set(roc_auc))
# 5. Visualize the results
autoplot(tune_results)
# 6. Finalize the model
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec, best_params)
# # 7. Train the final model
# final_model = final_spec %>% fit(formula = purchased ~ .,
#                                  data = leads_training)
#
# # 8. Testing prediction
# predictions = leads_test %>% select(purchased) %>%
#   bind_cols(predict(final_model, leads_test, type = "class"),
#             predict(final_model, leads_test, type = "prob"))
# 7. The streamlined approach
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec, best_params)
final_fit = final_spec %>%
  last_fit(purchased ~ .,
           split = leads_split)
final_fit %>% collect_metrics()
final_fit %>% collect_predictions()
3. Machine learning with caret
3.1 ML setup in caret
library(caret)
set.seed(42)
breast_cancer_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/breast_cancer_data_orig.csv")
# Create partition index
index = createDataPartition(breast_cancer_data$diagnosis, p = .70,
                            list = FALSE)
# Subset `breast_cancer_data` with index
bc_train_data = breast_cancer_data[index, ]
bc_test_data = breast_cancer_data[-index, ]

# Set up CV
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
hyperparams = expand.grid(degree = 4, scale = 1, C = 1)
set.seed(42)
svm_model = train(diagnosis ~ .,
                  data = bc_train_data,
                  method = "svmPoly",
                  trControl = fitControl,
                  tuneGrid = hyperparams,
                  verbose = FALSE)
3.2 Data preprocessing
model = train(X, Y, method = "glm",
              preProcess = c("zv", "center", "scale", "medianImpute", "pca"))
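X and Y above are placeholders. A concrete hedged sketch (my own example) using bc_train_data from 3.1, assuming diagnosis can be coded as a two-level factor; "pca" is dropped so the coefficients stay interpretable:
# Ensure the outcome is a factor for classification
bc_train_data$diagnosis = as.factor(bc_train_data$diagnosis)
# Drop zero-variance columns, center/scale, and median-impute before a logistic fit
bc_glm = train(diagnosis ~ .,
               data = bc_train_data,
               method = "glm",
               family = "binomial",
               preProcess = c("zv", "center", "scale", "medianImpute"))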
3.3 Hyperparameter tuning in caret
The tunable hyperparameters for each method can be viewed at: https://topepo.github.io/caret/available-models.html
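They can also be looked up from within R via caret's modelLookup(); for example, for the gbm method used below:
# List the tunable parameters for the "gbm" method
modelLookup("gbm")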
voters_train_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/voters_train_data.csv")

# Set up CV
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
Custom tuning grid
man_grid = expand.grid(n.trees = c(100, 200, 250),
                       interaction.depth = c(1, 4, 6),
                       shrinkage = 0.1, n.minobsinnode = 10)
gbm_model_voters_grid = train(turnout16_2016 ~ ., data = voters_train_data,
                              method = "gbm",
                              trControl = fitControl,
                              verbose = FALSE,
                              tuneGrid = man_grid)
Grid search with ranges
big_grid = expand.grid(n.trees = seq(from = 10, to = 300, by = 50),
                       interaction.depth = seq(from = 1, to = 10,
                                               length.out = 6),
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "grid")
set.seed(42)
gbm_model_voters_big_grid = train(turnout16_2016 ~ .,
                                  data = voters_train_data,
                                  method = "gbm",
                                  trControl = fitControl,
                                  verbose = FALSE,
                                  tuneGrid = big_grid)
ggplot(gbm_model_voters_big_grid)
Random Search
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "random")
gbm_model_voters_random = train(turnout16_2016 ~ .,
                                data = voters_train_data,
                                method = "gbm",
                                trControl = fitControl,
                                verbose = FALSE,
                                tuneLength = 5)
3.4 Compare multiple models
set.seed(42)
myFolds = createFolds(bc_train_data$diagnosis, k = 5)
myControl = trainControl(
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  savePredictions = TRUE,
  index = myFolds
)
model_glmnet = train(diagnosis ~ .,
                     data = bc_train_data,
                     metric = "ROC",
                     method = "glmnet",
                     trControl = myControl)
## + Fold1: alpha=0.10, lambda=0.07724
## - Fold1: alpha=0.10, lambda=0.07724
## + Fold1: alpha=0.55, lambda=0.07724
## - Fold1: alpha=0.55, lambda=0.07724
## + Fold1: alpha=1.00, lambda=0.07724
## - Fold1: alpha=1.00, lambda=0.07724
## + Fold2: alpha=0.10, lambda=0.07724
## - Fold2: alpha=0.10, lambda=0.07724
## + Fold2: alpha=0.55, lambda=0.07724
## - Fold2: alpha=0.55, lambda=0.07724
## + Fold2: alpha=1.00, lambda=0.07724
## - Fold2: alpha=1.00, lambda=0.07724
## + Fold3: alpha=0.10, lambda=0.07724
## - Fold3: alpha=0.10, lambda=0.07724
## + Fold3: alpha=0.55, lambda=0.07724
## - Fold3: alpha=0.55, lambda=0.07724
## + Fold3: alpha=1.00, lambda=0.07724
## - Fold3: alpha=1.00, lambda=0.07724
## + Fold4: alpha=0.10, lambda=0.07724
## - Fold4: alpha=0.10, lambda=0.07724
## + Fold4: alpha=0.55, lambda=0.07724
## - Fold4: alpha=0.55, lambda=0.07724
## + Fold4: alpha=1.00, lambda=0.07724
## - Fold4: alpha=1.00, lambda=0.07724
## + Fold5: alpha=0.10, lambda=0.07724
## - Fold5: alpha=0.10, lambda=0.07724
## + Fold5: alpha=0.55, lambda=0.07724
## - Fold5: alpha=0.55, lambda=0.07724
## + Fold5: alpha=1.00, lambda=0.07724
## - Fold5: alpha=1.00, lambda=0.07724
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0772 on full training set
model_rf = train(diagnosis ~ .,
                 data = bc_train_data,
                 metric = "ROC",
                 method = "ranger",
                 trControl = myControl)
## + Fold1: mtry= 2, min.node.size=1, splitrule=gini
## - Fold1: mtry= 2, min.node.size=1, splitrule=gini
## + Fold1: mtry=16, min.node.size=1, splitrule=gini
## - Fold1: mtry=16, min.node.size=1, splitrule=gini
## + Fold1: mtry=30, min.node.size=1, splitrule=gini
## - Fold1: mtry=30, min.node.size=1, splitrule=gini
## + Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold2: mtry= 2, min.node.size=1, splitrule=gini
## - Fold2: mtry= 2, min.node.size=1, splitrule=gini
## + Fold2: mtry=16, min.node.size=1, splitrule=gini
## - Fold2: mtry=16, min.node.size=1, splitrule=gini
## + Fold2: mtry=30, min.node.size=1, splitrule=gini
## - Fold2: mtry=30, min.node.size=1, splitrule=gini
## + Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold2: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold2: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold2: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold2: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold3: mtry= 2, min.node.size=1, splitrule=gini
## - Fold3: mtry= 2, min.node.size=1, splitrule=gini
## + Fold3: mtry=16, min.node.size=1, splitrule=gini
## - Fold3: mtry=16, min.node.size=1, splitrule=gini
## + Fold3: mtry=30, min.node.size=1, splitrule=gini
## - Fold3: mtry=30, min.node.size=1, splitrule=gini
## + Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold3: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold3: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold3: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold3: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold4: mtry= 2, min.node.size=1, splitrule=gini
## - Fold4: mtry= 2, min.node.size=1, splitrule=gini
## + Fold4: mtry=16, min.node.size=1, splitrule=gini
## - Fold4: mtry=16, min.node.size=1, splitrule=gini
## + Fold4: mtry=30, min.node.size=1, splitrule=gini
## - Fold4: mtry=30, min.node.size=1, splitrule=gini
## + Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold4: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold4: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold4: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold4: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold5: mtry= 2, min.node.size=1, splitrule=gini
## - Fold5: mtry= 2, min.node.size=1, splitrule=gini
## + Fold5: mtry=16, min.node.size=1, splitrule=gini
## - Fold5: mtry=16, min.node.size=1, splitrule=gini
## + Fold5: mtry=30, min.node.size=1, splitrule=gini
## - Fold5: mtry=30, min.node.size=1, splitrule=gini
## + Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold5: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold5: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold5: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold5: mtry=30, min.node.size=1, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = extratrees, min.node.size = 1 on full training set
# Make a list
model_list = list(glmnet = model_glmnet, rf = model_rf)
resamps = resamples(model_list)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: glmnet, rf
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.9875630 0.9890336 0.9917647 0.9913304 0.9934167 0.9948739 0
## rf 0.9873529 0.9873739 0.9891597 0.9894744 0.9907773 0.9927083 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.975 0.98 0.995 0.989 0.995 1 0
## rf 0.950 0.96 0.980 0.977 0.995 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.8067227 0.8750000 0.9075630 0.8876050 0.9075630 0.9411765 0
## rf 0.8151261 0.8833333 0.8991597 0.8892717 0.8991597 0.9495798 0
bwplot(resamps, metric = "ROC")
densityplot(resamps, metric = "ROC")