Streamlined Machine Learning Modeling in R
1. Basic modeling in tidyverse
Before even turning to tidymodels, the tidyverse packages provide ways to streamline fitting common statistical models.
1.1 tidyverse modeling workflow
- Make a list column: nest()
- Work with list columns: map(), e.g. map(.x, .f = ~mean(.x)); this pairs well with broom, Metrics, rsample, etc.
- Simplify the list columns: unnest()
- Model presentations with broom:
  - tidy() for model coefficients
  - glance() for a one-row summary of the model, e.g. R^2
  - augment() adds prediction columns to the original data
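A minimal sketch of the whole pattern, using the built-in mtcars data so it runs on its own:
library(tidyverse)
library(broom)
mtcars %>%
  group_by(cyl) %>%
  nest() %>%                                            # one row per cylinder group, with a nested data column
  mutate(model = map(data, ~lm(mpg ~ wt, data = .x)),   # fit one lm per group
         coefs = map(model, tidy)) %>%                  # tidy each model
  unnest(coefs)                                         # flatten back to one row per coefficient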
library(tidyverse)
library(tidymodels)
gapminder = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/gapminder.rds")
## Step one is to make a list column based on the grouping variable
nested = gapminder %>% group_by(country) %>% nest()
## Example 1: calculating the mean of a column
nested %>%
mutate(pop_mean = map(data, ~mean(.x$population))) %>%
unnest(pop_mean)
## # A tibble: 77 × 3
## # Groups: country [77]
## country data pop_mean
## <fct> <list> <dbl>
## 1 Algeria <tibble [52 × 6]> 23129438.
## 2 Argentina <tibble [52 × 6]> 30783053.
## 3 Australia <tibble [52 × 6]> 16074837.
## 4 Austria <tibble [52 × 6]> 7746272.
## 5 Bangladesh <tibble [52 × 6]> 97649407.
## 6 Belgium <tibble [52 × 6]> 9983596.
## 7 Benin <tibble [52 × 6]> 4995497.
## 8 Bolivia <tibble [52 × 6]> 6482842.
## 9 Botswana <tibble [52 × 6]> 1230400.
## 10 Brazil <tibble [52 × 6]> 137294617.
## # … with 67 more rows
## Example 2: building a model with map()
### 2.1 Coefficients
lm_reg_coef = nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year,data = .x))) %>%
mutate(coef = map(model, ~tidy(.x))) %>%
unnest(coef)
### 2.2 Summary statistics
lm_reg_inference = nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year,data = .x))) %>%
mutate(specs = map(model, ~glance(.x))) %>%
unnest(specs)
### 2.3 Augmented data frames with predicted values, which can be used directly for checking predictions
lm_reg_pred = nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year,data = .x))) %>%
mutate(augmented = map(model, ~augment(.x))) %>%
unnest(augmented)
lm_reg_pred %>% filter(country == "Italy") %>%
ggplot(aes(x = year, y = life_expectancy)) +
geom_point() +
geom_line(aes(y = .fitted), color = "red")
1.2 The sampling workflow
Test and train split
gap_split = initial_split(gapminder, prop = 0.75)
training_data = training(gap_split)
testing_data = testing(gap_split)
Cross-validation creation and performance
library(Metrics)
## Create the CV splits as columns of a tibble
cv_split = vfold_cv(training_data, v = 3)
cv_data = cv_split %>%
mutate(train = map(splits, ~training(.x)),
validate = map(splits, ~testing(.x)))
## Train a model in the training sets
cv_models_lm = cv_data %>%
mutate(model = map(train, ~lm(formula = life_expectancy~., data = .x)))
## Prediction for the testing set
cv_prep_lm = cv_models_lm %>%
mutate(validate_actual = map(validate, ~.x$life_expectancy),
validate_predicted = map2(model, validate, ~predict(.x, .y)))
cv_eval_lm = cv_prep_lm %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted,
~mae(actual = .x, predicted = .y)))
cv_eval_lm
## # 3-fold cross-validation
## # A tibble: 3 × 8
## splits id train validate model validate_…¹ valid…² valid…³
## <list> <chr> <list> <list> <list> <list> <list> <dbl>
## 1 <split [2002/1001]> Fold1 <tibble> <tibble> <lm> <dbl> <dbl> 1.51
## 2 <split [2002/1001]> Fold2 <tibble> <tibble> <lm> <dbl> <dbl> 1.55
## 3 <split [2002/1001]> Fold3 <tibble> <tibble> <lm> <dbl> <dbl> 1.45
## # … with abbreviated variable names ¹validate_actual, ²validate_predicted,
## # ³validate_mae
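The fold-level errors are typically summarized by their mean; a quick sketch:
# Average the validation MAE across the three folds
mean(cv_eval_lm$validate_mae)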
1.3 A Random Forest model using tidyverse for regression
Fit an RF in ranger with no tuning
library(ranger)
cv_models_rf = cv_data %>%
mutate(model = map(train, ~ranger(formula = life_expectancy~.,
data = .x, seed = 42)))
cv_prep_rf = cv_models_rf %>%
mutate(validate_predicted = map2(model, validate,
~predict(.x, .y)$predictions))Fit an RF in ranger with hyper parameter
tuning
cv_tune = cv_data %>%
crossing(mtry = 1:5)
head(cv_tune)
## # A tibble: 6 × 5
## splits id train validate mtry
## <list> <chr> <list> <list> <int>
## 1 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
## 2 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 2
## 3 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 3
## 4 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 4
## 5 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 5
## 6 <split [2002/1001]> Fold2 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
cv_model_tunerf = cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = life_expectancy~.,data = .x, mtry = .y))) %>%
mutate(validate_predicted = map2(model, validate, ~predict(.x, .y)$predictions),
validate_actual = map(validate, ~.x$life_expectancy)) %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted,~mae(actual = .x, predicted = .y)))
knitr::kable(cv_model_tunerf %>%
group_by(mtry) %>%
summarise(mean_mae = mean(validate_mae)))
| mtry | mean_mae |
|---|---|
| 1 | 0.9682711 |
| 2 | 0.8764128 |
| 3 | 0.8768114 |
| 4 | 0.8763115 |
| 5 | 0.8821423 |
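With mtry chosen from the table above (mtry = 4 has the lowest mean MAE), a natural final step is to refit on the full training data and score the held-out test set. A sketch, assuming the gapminder training_data/testing_data split from 1.2 is still in scope:
# Refit with the chosen mtry, then compute test-set MAE
best_rf = ranger(formula = life_expectancy ~ ., data = training_data,
                 mtry = 4, seed = 42)
mae(actual = testing_data$life_expectancy,
    predicted = predict(best_rf, testing_data)$predictions)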
1.4 A logistic regression model using tidyverse for classification
attrition = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/attrition.rds")
attrition_split = initial_split(attrition, prop = 0.75)
training_data = training(attrition_split)
testing_data = testing(attrition_split)
cv_split = vfold_cv(training_data, v = 5)
cv_data = cv_split %>%
mutate(train = map(splits, ~training(.x)),
validate = map(splits, ~testing(.x)))
cv_models_lr = cv_data %>%
mutate(model = map(train, ~glm(formula = Attrition~.,
data = .x, family = "binomial")))
cv_models_lr_pred = cv_models_lr %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response") > 0.5)
) %>%
mutate(
# Calculate accuracy and recall
Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
) %>%
unnest(Accuracy) %>%
unnest(Recall)
knitr::kable(cv_models_lr_pred %>% select(id, Accuracy, Recall))
| id | Accuracy | Recall |
|---|---|---|
| Fold1 | 0.8552036 | 0.3636364 |
| Fold2 | 0.8778281 | 0.4473684 |
| Fold3 | 0.8772727 | 0.3888889 |
| Fold4 | 0.8681818 | 0.4285714 |
| Fold5 | 0.8272727 | 0.3428571 |
1.5 A Random Forest model using tidyverse for classification
cv_tune = cv_data %>%
crossing(mtry = c(2, 4, 8, 16))
cv_models_rf = cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = Attrition~.,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
cv_models_rf_pred = cv_models_rf %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions == "Yes")
) %>%
mutate(
# Calculate accuracy and recall
Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
) %>%
unnest(Accuracy) %>%
unnest(Recall)
knitr::kable(cv_models_rf_pred %>% select(mtry, Accuracy, Recall) %>%
group_by(mtry) %>% summarise_all(mean))
| mtry | Accuracy | Recall |
|---|---|---|
| 2 | 0.8493624 | 0.1127804 |
| 4 | 0.8520814 | 0.1519070 |
| 8 | 0.8529905 | 0.1865930 |
| 16 | 0.8538996 | 0.2032597 |
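As in the regression case, the winning mtry (16 has the best recall above) can be refit on the full training data and checked on the test set. A sketch, assuming the attrition training_data/testing_data split from 1.4 is still in scope:
final_rf = ranger(formula = Attrition ~ ., data = training_data,
                  mtry = 16, num.trees = 100, seed = 42)
test_actual = testing_data$Attrition == "Yes"
test_predicted = predict(final_rf, testing_data)$predictions == "Yes"
accuracy(test_actual, test_predicted)
recall(test_actual, test_predicted)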
2. Supervised modeling framework in tidymodels
tidymodels roadmap (taken from DataCamp):
- rsample: splits data into training and testing sets
- recipes: feature engineering
- parsnip: fits the model, specified by 3 components:
  - model type (e.g. linear regression)
  - engine (the underlying R package)
  - mode (regression or classification)
- yardstick: evaluates model performance, i.e. metric functions
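A minimal end-to-end sketch of how the four packages fit together, using the built-in mpg data (each step is expanded in the sections below):
mpg_split = initial_split(mpg, prop = 0.75)               # rsample: splitting
mpg_rec = recipe(hwy ~ cty, data = training(mpg_split))   # recipes: features
lm_spec = linear_reg() %>%                                # parsnip: model spec
  set_engine('lm') %>%
  set_mode('regression')
wkfl_fit = workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(mpg_rec) %>%
  fit(data = training(mpg_split))
predict(wkfl_fit, testing(mpg_split)) %>%
  bind_cols(testing(mpg_split)) %>%
  rmse(truth = hwy, estimate = .pred)                     # yardstick: metrics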
2.1 Linear regression
# split training and testing
mpg_split = initial_split(mpg, prop = 0.75, strata = hwy)
mpg_training = mpg_split %>% training()
mpg_test = mpg_split %>% testing()
library(tidymodels)
# Set up the model
lm_model = linear_reg() %>%
set_engine('lm') %>%
set_mode('regression')
# Fit the model
lm_fit = lm_model %>%
fit(hwy ~ cty, data = mpg_training)
# Model coefficients
tidy(lm_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.695 0.563 1.24 2.18e- 1
## 2 cty 1.35 0.0326 41.4 4.39e-91
# Prediction on test data
mpg_test_results = mpg_test %>%
select(hwy, cty) %>%
bind_cols(
lm_fit %>% predict(new_data = mpg_test)
)
# Performance
mpg_test_results %>% rsq(truth = hwy, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.923
# Visualization
ggplot(mpg_test_results, aes(x = hwy, y = .pred)) +
geom_point() +
geom_abline(color = 'blue', linetype = 2) +
coord_obs_pred() +
labs(title = 'R-Squared Plot',
y = 'Predicted Highway MPG',
x = 'Actual Highway MPG')
The above procedure can be streamlined with last_fit(), which needs only the model spec, a formula, and the split object. Prefer this route when there is no need to inspect model coefficients. Streamlined approach:
lm_last_fit = lm_model %>%
last_fit(hwy ~ cty,
split = mpg_split)
lm_last_fit %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.86 Preprocessor1_Model1
## 2 rsq standard 0.923 Preprocessor1_Model1
lm_last_fit %>% collect_predictions()
## # A tibble: 61 × 5
## id .pred .row hwy .config
## <chr> <dbl> <int> <int> <chr>
## 1 train/test split 29.1 4 30 Preprocessor1_Model1
## 2 train/test split 25.0 7 27 Preprocessor1_Model1
## 3 train/test split 22.3 9 25 Preprocessor1_Model1
## 4 train/test split 23.7 13 25 Preprocessor1_Model1
## 5 train/test split 23.7 14 25 Preprocessor1_Model1
## 6 train/test split 23.7 17 25 Preprocessor1_Model1
## 7 train/test split 22.3 18 23 Preprocessor1_Model1
## 8 train/test split 15.6 20 15 Preprocessor1_Model1
## 9 train/test split 18.3 22 17 Preprocessor1_Model1
## 10 train/test split 16.9 23 17 Preprocessor1_Model1
## # … with 51 more rows
2.2 Classification
leads_df = readRDS("~/Dropbox/Coursework/Machine Learning in R/Modeling with tidymodels in R/datasets/leads_df.rds")
leads_split = initial_split(leads_df, prop = 0.75, strata = purchased)
leads_training = leads_split %>% training()
leads_test = leads_split %>% testing()
# Set up the model
logistic_model = logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
# Fit the model
logistic_fit = logistic_model %>%
fit(purchased ~ total_visits + total_time,
data = leads_training)
# Prediction
class_preds = logistic_fit %>%
predict(new_data = leads_test,
type = 'class')
prob_preds = logistic_fit %>%
predict(new_data = leads_test,
type = 'prob')
leads_results = leads_test %>%
select(purchased) %>%
bind_cols(class_preds, prob_preds)
In tidymodels classification models, the outcome has to be a factor with the positive class as the first level.
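If the positive class is not already the first level, move it there before fitting; a sketch using forcats (loaded with the tidyverse), assuming purchased has levels "yes" and "no":
leads_df = leads_df %>%
  mutate(purchased = fct_relevel(purchased, "yes"))  # put the positive class first
levels(leads_df$purchased)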
# Get selected metrics
custom_metrics = metric_set(sens, spec)
custom_metrics(leads_results,
truth = purchased,
estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.7
## 2 spec binary 0.858
# Get all metrics derived from the confusion matrix
conf_mat(leads_results, truth = purchased,
estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
# Visualize the confusion matrix
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'heatmap')
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'mosaic')
# ROC curve
leads_results %>%
roc_curve(truth = purchased, .pred_yes) %>%
autoplot()
# AUC
roc_auc(leads_results, truth = purchased, .pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.852
Streamlined Approach:
logistic_last_fit = logistic_model %>%
last_fit(purchased ~ total_visits + total_time,
split = leads_split)
logistic_last_fit %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.801 Preprocessor1_Model1
## 2 roc_auc binary 0.852 Preprocessor1_Model1
last_fit_results = logistic_last_fit %>%
collect_predictions()
conf_mat(last_fit_results, truth = purchased,
estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
2.3 Feature engineering
A recipe allows you to:
- define column roles (predictor or outcome)
- declare variable data types (numeric or nominal)
- specify preprocessing steps via step_*()
2.3.1 Overall procedure
# Build a recipe
leads_log_rec = recipe(purchased ~ .,data = leads_training) %>%
step_log(total_time, base = 10)
leads_log_rec %>%
summary()
## # A tibble: 7 × 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 total_visits numeric predictor original
## 2 total_time numeric predictor original
## 3 pages_per_visit numeric predictor original
## 4 total_clicks numeric predictor original
## 5 lead_source nominal predictor original
## 6 us_location nominal predictor original
## 7 purchased nominal outcome original
# Train based on recipe
leads_log_rec_prep = leads_log_rec %>% prep(training = leads_training)
# Transform the training data
leads_log_rec_prep %>%
bake(new_data = NULL)
## # A tibble: 996 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 7 2.68 2.33 21 organic… west no
## 2 3 2.85 3 28 organic… west no
## 3 3 2.22 3 32 direct_… southe… no
## 4 3 0.845 3 23 organic… west no
## 5 6 2.75 6 48 organic… southe… no
## 6 4 2.37 4 31 organic… southe… no
## 7 2 2.37 1 13 organic… southe… no
## 8 3 3.01 1.5 17 organic… southe… no
## 9 2 2.95 2 25 organic… southe… no
## 10 2 2.75 2 22 organic… southw… no
## # … with 986 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
# Transform the test data
leads_log_rec_prep %>%
bake(new_data = leads_test)
## # A tibble: 332 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 8 2 2.67 24 direct_… west no
## 2 5 2.36 2.5 25 email southe… no
## 3 4 2.25 4 37 direct_… west no
## 4 2 3.10 2 26 email midwest no
## 5 2 2.29 2 20 organic… west no
## 6 2 2.87 2 26 affilia… southw… yes
## 7 4 3.11 4 37 direct_… northe… yes
## 8 2 1.20 1 12 email midwest no
## 9 9 3.01 1.5 12 direct_… west yes
## 10 2 2.24 2 20 affilia… southw… no
## # … with 322 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
2.3.2 Treatment of numeric variables
- Correlated variables
leads_training %>%
select_if(is.numeric) %>%
cor()
## total_visits total_time pages_per_visit total_clicks
## total_visits 1.00000000 0.026421560 0.48605494 0.463329369
## total_time 0.02642156 1.000000000 0.01400162 0.003530897
## pages_per_visit 0.48605494 0.014001621 1.00000000 0.963031706
## total_clicks 0.46332937 0.003530897 0.96303171 1.000000000
## For each pair of highly correlated variables, step_corr() drops one
leads_cor_rec = recipe(purchased ~ .,
data = leads_training) %>%
step_corr(all_numeric(), -all_outcomes(), threshold = 0.9)
processed_train = leads_cor_rec %>%
prep(training = leads_training) %>%
bake(new_data = NULL)
processed_test = leads_cor_rec %>%
prep(training = leads_training) %>%
bake(new_data = leads_test)
- Normalization (prep() learns the centering and scaling statistics from the training data; bake() applies those same statistics to new data)
leads_norm_rec = recipe(purchased ~ .,
data = leads_training) %>%
step_corr(all_numeric(), -all_outcomes(), threshold = 0.9) %>%
step_normalize(all_numeric())
leads_norm_rec %>%
prep(training = leads_training) %>%
bake(new_data = leads_test)
## # A tibble: 332 × 6
## total_visits total_time total_clicks lead_source us_location purchased
## <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 0.974 -0.993 -0.334 direct_traffic west no
## 2 0.147 -0.754 -0.260 email southeast no
## 3 -0.129 -0.849 0.622 direct_traffic west no
## 4 -0.680 1.20 -0.187 email midwest no
## 5 -0.680 -0.819 -0.628 organic_search west no
## 6 -0.680 0.202 -0.187 affiliate southwest yes
## 7 -0.129 1.22 0.622 direct_traffic northeast yes
## 8 -0.680 -1.15 -1.22 email midwest no
## 9 1.25 0.731 -1.22 direct_traffic west yes
## 10 -0.680 -0.859 -0.628 affiliate southwest no
## # … with 322 more rows
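A quick check that bake() reuses the training-set statistics, so baked test columns are close to, but not exactly, mean zero:
leads_norm_prep = leads_norm_rec %>% prep(training = leads_training)
leads_norm_prep %>%
  bake(new_data = leads_test) %>%
  summarise(mean_total_time = mean(total_time))  # near 0, not exactly 0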
2.3.3 Treatment of nominal variables
- Create dummy variables
recipe(purchased ~ ., data = leads_training) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
prep(training = leads_training) %>%
bake(new_data = leads_test)
## # A tibble: 332 × 12
## total_visits total_…¹ pages…² total…³ purch…⁴ lead_…⁵ lead_…⁶ lead_…⁷ us_lo…⁸
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 8 100 2.67 24 no 0 0 1 0
## 2 5 228 2.5 25 no 1 0 0 1
## 3 4 177 4 37 no 0 0 1 0
## 4 2 1273 2 26 no 1 0 0 0
## 5 2 193 2 20 no 0 1 0 0
## 6 2 739 2 26 yes 0 0 0 0
## 7 4 1286 4 37 yes 0 0 1 0
## 8 2 16 1 12 no 1 0 0 0
## 9 9 1022 1.5 12 yes 0 0 1 0
## 10 2 172 2 20 no 0 0 0 0
## # … with 322 more rows, 3 more variables: us_location_midwest <dbl>,
## # us_location_southwest <dbl>, us_location_west <dbl>, and abbreviated
## # variable names ¹total_time, ²pages_per_visit, ³total_clicks, ⁴purchased,
## # ⁵lead_source_email, ⁶lead_source_organic_search,
## # ⁷lead_source_direct_traffic, ⁸us_location_southeast
2.4 Machine learning workflow
2.4.1 A full picture
Data resampling
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
training()
leads_test = leads_split %>%
testing()
Model specification
logistic_model = logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
Feature engineering
# Create recipe
leads_recipe = recipe(purchased ~ .,
data = leads_training) %>%
step_corr(all_numeric(), threshold = 0.9) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
# Train recipe
leads_recipe_prep = leads_recipe %>%
prep(training = leads_training)
leads_training_prep = leads_recipe_prep %>%
bake(new_data = NULL)
leads_test_prep = leads_recipe_prep %>%
bake(new_data = leads_test)
Model fitting and prediction
logistic_fit = logistic_model %>%
fit(purchased ~ .,
data = leads_training_prep)
class_preds = predict(logistic_fit,
new_data = leads_test_prep,
type = 'class')
prob_preds = predict(logistic_fit,
new_data = leads_test_prep,
type = 'prob')
leads_results = leads_test %>%
select(purchased) %>%
bind_cols(class_preds, prob_preds)
Model evaluation
leads_results %>%
conf_mat(truth = purchased,
estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.822
## 2 kap binary 0.601
## 3 sens binary 0.675
## 4 spec binary 0.906
## 5 ppv binary 0.802
## 6 npv binary 0.831
## 7 mcc binary 0.606
## 8 j_index binary 0.581
## 9 bal_accuracy binary 0.790
## 10 detection_prevalence binary 0.304
## 11 precision binary 0.802
## 12 recall binary 0.675
## 13 f_meas binary 0.733
roc_auc(leads_results, truth = purchased, .pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.857
2.4.2 A decision tree example combining a model and a recipe
dt_model = decision_tree() %>%
set_engine('rpart') %>%
set_mode('classification')
leads_recipe = recipe(purchased ~ .,
data = leads_training) %>%
step_corr(all_numeric(), threshold = 0.9) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
Combine the model and the recipe
# Create the workflow
leads_wkfl = workflow() %>%
add_model(dt_model) %>%
add_recipe(leads_recipe)
leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Computational engine: rpart
# Fit the model the streamlined way
leads_wkfl_fit = leads_wkfl %>%
last_fit(split = leads_split)
leads_wkfl_fit %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.816 Preprocessor1_Model1
## 2 roc_auc binary 0.846 Preprocessor1_Model1
leads_wkfl_preds = leads_wkfl_fit %>%
collect_predictions()
leads_wkfl_preds
## # A tibble: 332 × 7
## id .pred_yes .pred_no .row .pred_class purchased .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.132 0.868 3 no no Preprocessor…
## 2 train/test split 0.132 0.868 4 no no Preprocessor…
## 3 train/test split 0.132 0.868 8 no no Preprocessor…
## 4 train/test split 0.132 0.868 9 no no Preprocessor…
## 5 train/test split 0.132 0.868 10 no no Preprocessor…
## 6 train/test split 0.132 0.868 22 no no Preprocessor…
## 7 train/test split 0.757 0.243 24 yes yes Preprocessor…
## 8 train/test split 0.132 0.868 29 no no Preprocessor…
## 9 train/test split 0.757 0.243 37 yes yes Preprocessor…
## 10 train/test split 0.757 0.243 38 yes yes Preprocessor…
## # … with 322 more rows
# Evaluation
leads_metrics = metric_set(roc_auc, sens, spec)
leads_wkfl_preds %>%
leads_metrics(truth = purchased,
estimate = .pred_class,
.pred_yes)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.683
## 2 spec binary 0.892
## 3 roc_auc binary 0.846
2.4.3 Cross validation
set.seed(214)
leads_folds = vfold_cv(leads_training,
v = 10,
strata = purchased)
leads_rs_fit = leads_wkfl %>%
fit_resamples(resamples = leads_folds,
metrics = leads_metrics)
## Or:
# leads_rs_fit = fit_resamples(
# dt_model,
# purchased ~ .,
# resamples = leads_folds,
# metrics = leads_metrics)
# Get the mean across CVs
leads_rs_fit %>%
collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 roc_auc binary 0.794 10 0.0123 Preprocessor1_Model1
## 2 sens binary 0.672 10 0.0175 Preprocessor1_Model1
## 3 spec binary 0.846 10 0.0127 Preprocessor1_Model1
# Get the metrics for each CV
rs_metrics = leads_rs_fit %>%
collect_metrics(summarize = FALSE)
rs_metrics %>%
group_by(.metric) %>%
summarize(min = min(.estimate),
median = median(.estimate),
max = max(.estimate),
mean = mean(.estimate),
sd = sd(.estimate))
## # A tibble: 3 × 6
## .metric min median max mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 roc_auc 0.738 0.803 0.859 0.794 0.0390
## 2 sens 0.556 0.681 0.75 0.672 0.0552
## 3 spec 0.794 0.841 0.906 0.846 0.0402
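Note that fit_resamples() only estimates performance; it does not keep the fitted models nor, by default, the out-of-fold predictions. To keep the predictions for inspection, pass a control object (a sketch):
leads_rs_fit = leads_wkfl %>%
  fit_resamples(resamples = leads_folds,
                metrics = leads_metrics,
                control = control_resamples(save_pred = TRUE))
leads_rs_fit %>% collect_predictions()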
2.4.4 Hyperparameter tuning in tidymodels
Set up the model, marking the hyperparameters to be tuned with tune()
dt_tune_model = decision_tree(cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()) %>%
set_engine('rpart') %>%
set_mode('classification')
dt_tune_model
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
leads_tune_wkfl = leads_wkfl %>%
update_model(dt_tune_model)
leads_tune_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
Grid search - random
set.seed(214)
dt_grid = grid_random(extract_parameter_set_dials(dt_tune_model), size = 5)
dt_tuning = leads_tune_wkfl %>%
tune_grid(resamples = leads_folds,
grid = dt_grid,
metrics = leads_metrics)
## Overall exploration of results
# dt_tuning %>%
# collect_metrics()
dt_tuning %>%
collect_metrics(summarize = FALSE) %>%
filter(.metric == 'roc_auc') %>%
group_by(id) %>%
summarize(min_roc_auc = min(.estimate),
median_roc_auc = median(.estimate),
max_roc_auc = max(.estimate))
## # A tibble: 10 × 4
## id min_roc_auc median_roc_auc max_roc_auc
## <chr> <dbl> <dbl> <dbl>
## 1 Fold01 0.768 0.806 0.807
## 2 Fold02 0.736 0.806 0.838
## 3 Fold03 0.815 0.844 0.854
## 4 Fold04 0.799 0.807 0.837
## 5 Fold05 0.633 0.745 0.768
## 6 Fold06 0.817 0.861 0.874
## 7 Fold07 0.695 0.799 0.811
## 8 Fold08 0.759 0.799 0.860
## 9 Fold09 0.739 0.818 0.845
## 10 Fold10 0.727 0.738 0.775
Grid search - regular
dt_grid = grid_regular(extract_parameter_set_dials(dt_tune_model), levels = 3)
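Unlike grid_random(), grid_regular() crosses a fixed number of values per parameter, so three tuned parameters at three levels each yield 3^3 = 27 candidates:
nrow(dt_grid)  # 27 candidate combinations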
Show and select best model
# Display the best models from tuning
dt_tuning %>% show_best(metric = 'roc_auc', n = 5)
## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000758 14 39 roc_auc binary 0.820 10 0.0128 Prepro…
## 2 0.00380 5 36 roc_auc binary 0.805 10 0.0135 Prepro…
## 3 0.000000600 3 5 roc_auc binary 0.798 10 0.00935 Prepro…
## 4 0.0243 5 34 roc_auc binary 0.791 10 0.0124 Prepro…
## 5 0.00000443 11 8 roc_auc binary 0.767 10 0.0228 Prepro…
## # … with abbreviated variable name ¹.estimator
# select the best model
best_dt_model = dt_tuning %>%
select_best(metric = 'roc_auc')
best_dt_model
## # A tibble: 1 × 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000758 14 39 Preprocessor1_Model1
# Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
finalize_workflow(best_dt_model)
final_leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 7.58290839567418e-08
## tree_depth = 14
## min_n = 39
##
## Computational engine: rpart
leads_final_fit = final_leads_wkfl %>%
last_fit(split = leads_split)
leads_final_fit %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.810 Preprocessor1_Model1
## 2 roc_auc binary 0.855 Preprocessor1_Model1
2.5 Final Overall Procedure
# A full picture
rm(list = ls())
# rm(list = ls()) also deletes leads_df, so reload the data first
leads_df = readRDS("~/Dropbox/Coursework/Machine Learning in R/Modeling with tidymodels in R/datasets/leads_df.rds")
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
training()
leads_test = leads_split %>%
testing()
# 2. Initial model set up and indicate parameters to be tuned
dt_tune_model = decision_tree(cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()) %>%
set_engine('rpart') %>%
set_mode('classification')
leads_metrics = metric_set(roc_auc, sens, spec)
# 3. Set up the recipe
leads_recipe = recipe(purchased ~ .,
data = leads_training) %>%
step_corr(all_numeric(), threshold = 0.9) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
# 4. Create the cross validation
leads_folds = vfold_cv(leads_training,
v = 10,
strata = purchased)
# 5. Create the ML workflow
leads_tune_wkfl = workflow() %>%
add_model(dt_tune_model) %>%
add_recipe(leads_recipe)
# 6. Grid search tuning
dt_grid = grid_random(extract_parameter_set_dials(dt_tune_model), size = 5)
dt_tuning = leads_tune_wkfl %>%
tune_grid(resamples = leads_folds,
grid = dt_grid,
metrics = leads_metrics)
# 7. Evaluate the grid search
dt_tuning %>%
collect_metrics(summarize = FALSE) %>%
filter(.metric == 'roc_auc') %>%
group_by(id) %>%
summarize(min_roc_auc = min(.estimate),
median_roc_auc = median(.estimate),
max_roc_auc = max(.estimate))
# 8. Select the best model
best_dt_model = dt_tuning %>%
select_best(metric = 'roc_auc')
# 9. Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
finalize_workflow(best_dt_model)
final_leads_wkfl
# 10. Apply the model to the test data and evaluate the test set performance
leads_final_fit = final_leads_wkfl %>%
last_fit(split = leads_split)
conf_mat(leads_final_fit$.predictions[[1]], truth = purchased,
estimate = .pred_class) %>% summary()
leads_final_fit %>% collect_metrics()
# A second full picture: the same workflow with a tuned boosted tree
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
training()
leads_test = leads_split %>%
testing()
# 2. Set up the tuning specs
boost_spec = boost_tree(
trees = 500,
learn_rate = tune(),
tree_depth = tune(),
sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
# 3. Create the tuning grid
tunegrid_boost = grid_regular(extract_parameter_set_dials(boost_spec),
levels = 2)
# 4. Tuning
tune_results = tune_grid(
boost_spec,
purchased ~ .,
resamples = vfold_cv(leads_training, v = 6),
grid = tunegrid_boost,
metrics = metric_set(roc_auc))
# 5. Visualize the results
autoplot(tune_results)
# 6. Finalize the model
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec,
best_params)
# # 7. Train the final model
# final_model = final_spec %>% fit(formula = purchased ~ .,
#                                  data = leads_training)
# # 8. Testing prediction
# predictions = leads_test %>% select(purchased) %>%
#   bind_cols(predict(final_model, leads_test, type = "class"),
#             predict(final_model, leads_test, type = "prob"))
# 7. The streamlined approach
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec,
best_params)
final_fit = final_spec %>%
last_fit(purchased ~ .,
split = leads_split)
final_fit %>% collect_metrics()
final_fit %>% collect_predictions()
3. Machine learning with caret
3.1 ML set up in caret
library(caret)
set.seed(42)
breast_cancer_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/breast_cancer_data_orig.csv")
# Create partition index
index = createDataPartition(breast_cancer_data$diagnosis, p = .70,
list = FALSE)
# Subset `breast_cancer_data` with index
bc_train_data = breast_cancer_data[index, ]
bc_test_data = breast_cancer_data[-index, ]
# Set up CV
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
hyperparams = expand.grid(degree = 4, scale = 1, C = 1)
set.seed(42)
svm_model = train(diagnosis ~ .,
data = bc_train_data,
method = "svmPoly",
trControl = fitControl,
tuneGrid = hyperparams,
verbose = FALSE)
3.2 Data preprocessing
# Generic template: X is the predictor data frame, Y the outcome vector
model = train(X, Y, method = "glm",
preProcess = c("zv", "center", "scale", "medianImpute", "pca")
)
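A runnable sketch of the template above, reusing bc_train_data from 3.1 (the factor conversion is an assumption: caret classification needs a factor outcome):
glm_model = train(diagnosis ~ .,
                  data = bc_train_data %>% mutate(diagnosis = as.factor(diagnosis)),
                  method = "glm",
                  preProcess = c("zv", "center", "scale", "medianImpute"),
                  trControl = trainControl(method = "cv", number = 3))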
3.3 Hyperparameter tuning in caret
Tunable hyperparameters for each method can be viewed at: https://topepo.github.io/caret/available-models.html
voters_train_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/voters_train_data.csv")
# set up cv
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
Custom tuning grid
man_grid = expand.grid(n.trees = c(100, 200, 250),
interaction.depth = c(1, 4, 6),
shrinkage = 0.1, n.minobsinnode = 10)
gbm_model_voters_grid = train(turnout16_2016 ~ ., data = voters_train_data,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = man_grid)
Grid search with a range of values
big_grid = expand.grid(n.trees = seq(from = 10, to = 300, by = 50),
interaction.depth = seq(from = 1, to = 10,
length.out = 6),
shrinkage = 0.1,
n.minobsinnode = 10)
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "grid")
set.seed(42)
gbm_model_voters_big_grid = train(turnout16_2016 ~ .,
data = voters_train_data,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = big_grid)
ggplot(gbm_model_voters_big_grid)
Random search
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "random")
gbm_model_voters_random = train(turnout16_2016 ~ .,
data = voters_train_data,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneLength = 5)
3.4 Compare multiple models
To compare models fairly, create one set of folds up front and pass it to trainControl() via index, so every model is trained and evaluated on identical resamples.
set.seed(42)
myFolds = createFolds(bc_train_data$diagnosis, k = 5)
myControl = trainControl(
summaryFunction = twoClassSummary,
classProbs = TRUE,
verboseIter = TRUE,
savePredictions = TRUE,
index = myFolds
)
model_glmnet = train(diagnosis ~ .,
bc_train_data,
metric = "ROC",
method = "glmnet",
trControl = myControl)
## + Fold1: alpha=0.10, lambda=0.07724
## - Fold1: alpha=0.10, lambda=0.07724
## + Fold1: alpha=0.55, lambda=0.07724
## - Fold1: alpha=0.55, lambda=0.07724
## + Fold1: alpha=1.00, lambda=0.07724
## - Fold1: alpha=1.00, lambda=0.07724
## (analogous messages repeat for Fold2 through Fold5)
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0772 on full training set
model_rf = train(diagnosis ~ .,
bc_train_data,
metric = "ROC",
method = "ranger",
trControl = myControl
)
## + Fold1: mtry= 2, min.node.size=1, splitrule=gini
## - Fold1: mtry= 2, min.node.size=1, splitrule=gini
## + Fold1: mtry=16, min.node.size=1, splitrule=gini
## - Fold1: mtry=16, min.node.size=1, splitrule=gini
## + Fold1: mtry=30, min.node.size=1, splitrule=gini
## - Fold1: mtry=30, min.node.size=1, splitrule=gini
## + Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## (analogous messages repeat for Fold2 through Fold5)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = extratrees, min.node.size = 1 on full training set
# Make a list
model_list = list(glmnet = model_glmnet,rf = model_rf)
resamps = resamples(model_list)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: glmnet, rf
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.9875630 0.9890336 0.9917647 0.9913304 0.9934167 0.9948739 0
## rf 0.9873529 0.9873739 0.9891597 0.9894744 0.9907773 0.9927083 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.975 0.98 0.995 0.989 0.995 1 0
## rf 0.950 0.96 0.980 0.977 0.995 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.8067227 0.8750000 0.9075630 0.8876050 0.9075630 0.9411765 0
## rf 0.8151261 0.8833333 0.8991597 0.8892717 0.8991597 0.9495798 0
bwplot(resamps, metric = "ROC")
densityplot(resamps, metric = "ROC")