Streamlined Machine Learning Modeling in R
1. Basic modeling in tidyverse
Before even going to tidymodels, the tidyverse packages provide ways to streamline fitting common statistical models.
1.1 tidyverse modeling workflow
- Make a list column: nest()
- Work with list columns: map(), e.g. map(.x, .f = ~mean(.x)); combines well with broom, Metrics, rsample, etc.
- Simplify the list columns: unnest()
- Model presentation with broom::tidy() and friends:
  - tidy() for model coefficients
  - glance() for a one-row summary of the model, e.g. R^2
  - augment() adds prediction columns to the original data
A minimal sketch of this pattern follows the list.
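The sketch below (my own, not from the original notes) uses the built-in mtcars data: nest by cylinder group, map an lm() over the groups, and unnest the tidied coefficients.
library(tidyverse)
library(broom)
# Nest mtcars by cyl, fit lm(mpg ~ wt) per group, then unnest the tidy() output
mtcars %>%
  group_by(cyl) %>%
  nest() %>%
  mutate(model = map(data, ~lm(mpg ~ wt, data = .x)),
         coef = map(model, ~tidy(.x))) %>%
  unnest(coef)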
library(tidyverse)
library(tidymodels)
gapminder = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/gapminder.rds")
## Step one is to make a list column based on the grouping variable
nested = gapminder %>% group_by(country) %>% nest()

## Example 1: calculating the mean of a column
nested %>%
  mutate(pop_mean = map(data, ~mean(.x$population))) %>%
  unnest(pop_mean)
## # A tibble: 77 × 3
## # Groups: country [77]
## country data pop_mean
## <fct> <list> <dbl>
## 1 Algeria <tibble [52 × 6]> 23129438.
## 2 Argentina <tibble [52 × 6]> 30783053.
## 3 Australia <tibble [52 × 6]> 16074837.
## 4 Austria <tibble [52 × 6]> 7746272.
## 5 Bangladesh <tibble [52 × 6]> 97649407.
## 6 Belgium <tibble [52 × 6]> 9983596.
## 7 Benin <tibble [52 × 6]> 4995497.
## 8 Bolivia <tibble [52 × 6]> 6482842.
## 9 Botswana <tibble [52 × 6]> 1230400.
## 10 Brazil <tibble [52 × 6]> 137294617.
## # … with 67 more rows
## Example 2: building a model with map()
### 2.1 Coefficients
lm_reg_coef = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(coef = map(model, ~tidy(.x))) %>%
  unnest(coef)
### 2.2 Summary statistics
lm_reg_inference = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(specs = map(model, ~glance(.x))) %>%
  unnest(specs)
### 2.3 Augmented data frames with predicted values, which can be used directly for checking predictions
lm_reg_pred = nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x))) %>%
  mutate(augmented = map(model, ~augment(.x))) %>%
  unnest(augmented)

lm_reg_pred %>% filter(country == "Italy") %>%
  ggplot(aes(x = year, y = life_expectancy)) +
  geom_point() +
  geom_line(aes(y = .fitted), color = "red")
1.2 The sampling workflow
Test and train split
gap_split = initial_split(gapminder, prop = 0.75)
training_data = training(gap_split)
testing_data = testing(gap_split)
Cross-validation creation and performance
library(Metrics)
## Create the CV splits as columns of a tibble
cv_split = vfold_cv(training_data, v = 3)
cv_data = cv_split %>%
  mutate(train = map(splits, ~training(.x)),
         validate = map(splits, ~testing(.x)))
## Train a model on each training fold
cv_models_lm = cv_data %>%
  mutate(model = map(train, ~lm(formula = life_expectancy ~ ., data = .x)))
## Predictions for each validation fold
cv_prep_lm = cv_models_lm %>%
  mutate(validate_actual = map(validate, ~.x$life_expectancy),
         validate_predicted = map2(model, validate, ~predict(.x, .y)))
cv_eval_lm = cv_prep_lm %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted,
                                 ~mae(actual = .x, predicted = .y)))
cv_eval_lm
## # 3-fold cross-validation
## # A tibble: 3 × 8
## splits id train validate model validate_…¹ valid…² valid…³
## <list> <chr> <list> <list> <list> <list> <list> <dbl>
## 1 <split [2002/1001]> Fold1 <tibble> <tibble> <lm> <dbl> <dbl> 1.51
## 2 <split [2002/1001]> Fold2 <tibble> <tibble> <lm> <dbl> <dbl> 1.55
## 3 <split [2002/1001]> Fold3 <tibble> <tibble> <lm> <dbl> <dbl> 1.45
## # … with abbreviated variable names ¹validate_actual, ²validate_predicted,
## # ³validate_mae
1.3 A Random Forest model using tidyverse for regression
Fit an RF in ranger with no tuning
library(ranger)
cv_models_rf = cv_data %>%
  mutate(model = map(train, ~ranger(formula = life_expectancy ~ .,
                                    data = .x, seed = 42)))
cv_prep_rf = cv_models_rf %>%
  mutate(validate_predicted = map2(model, validate,
                                   ~predict(.x, .y)$predictions))
Fit an RF in ranger with hyperparameter tuning
cv_tune = cv_data %>%
  crossing(mtry = 1:5)
head(cv_tune)
## # A tibble: 6 × 5
## splits id train validate mtry
## <list> <chr> <list> <list> <int>
## 1 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
## 2 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 2
## 3 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 3
## 4 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 4
## 5 <split [2002/1001]> Fold1 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 5
## 6 <split [2002/1001]> Fold2 <tibble [2,002 × 7]> <tibble [1,001 × 7]> 1
cv_model_tunerf = cv_tune %>%
  mutate(model = map2(train, mtry, ~ranger(formula = life_expectancy ~ ., data = .x, mtry = .y))) %>%
  mutate(validate_predicted = map2(model, validate, ~predict(.x, .y)$predictions),
         validate_actual = map(validate, ~.x$life_expectancy)) %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
knitr::kable(cv_model_tunerf %>%
               group_by(mtry) %>%
               summarise(mean_mae = mean(validate_mae)))
mtry | mean_mae |
---|---|
1 | 0.9682711 |
2 | 0.8764128 |
3 | 0.8768114 |
4 | 0.8763115 |
5 | 0.8821423 |
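The table suggests mtry = 4 gives the lowest mean MAE. As a follow-up sketch (my own, not in the original notes), the tuned value can be refit on the full training data and scored on the held-out test set:
# Refit with the selected mtry on all training data, then score the test set
best_rf = ranger(formula = life_expectancy ~ .,
                 data = training_data, mtry = 4, seed = 42)
mae(actual = testing_data$life_expectancy,
    predicted = predict(best_rf, testing_data)$predictions)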
1.4 A logistic regression model using tidyverse for classification
attrition = readRDS("~/Dropbox/Coursework/Machine Learning in R/Machine Learning in the Tidyverse/datasets/attrition.rds")

attrition_split = initial_split(attrition, prop = 0.75)
training_data = training(attrition_split)
testing_data = testing(attrition_split)

cv_split = vfold_cv(training_data, v = 5)
cv_data = cv_split %>%
  mutate(train = map(splits, ~training(.x)),
         validate = map(splits, ~testing(.x)))
cv_models_lr = cv_data %>%
  mutate(model = map(train, ~glm(formula = Attrition ~ .,
                                 data = .x, family = "binomial")))
cv_models_lr_pred = cv_models_lr %>%
  mutate(
    # Prepare a binary vector of actual Attrition values in validate
    validate_actual = map(validate, ~.x$Attrition == "Yes"),
    # Prepare a binary vector of predicted Attrition values for validate
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response") > 0.5)
  ) %>%
  mutate(
    # Calculate accuracy and recall
    Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
    Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
  ) %>%
  unnest(Accuracy) %>%
  unnest(Recall)
knitr::kable(cv_models_lr_pred %>% select(id, Accuracy, Recall))
id | Accuracy | Recall |
---|---|---|
Fold1 | 0.8552036 | 0.3636364 |
Fold2 | 0.8778281 | 0.4473684 |
Fold3 | 0.8772727 | 0.3888889 |
Fold4 | 0.8681818 | 0.4285714 |
Fold5 | 0.8272727 | 0.3428571 |
1.5 A Random Forest model using tidyverse for classification
cv_tune = cv_data %>%
  crossing(mtry = c(2, 4, 8, 16))
cv_models_rf = cv_tune %>%
  mutate(model = map2(train, mtry, ~ranger(formula = Attrition ~ .,
                                           data = .x, mtry = .y,
                                           num.trees = 100, seed = 42)))
cv_models_rf_pred = cv_models_rf %>%
  mutate(
    # Prepare a binary vector of actual Attrition values in validate
    validate_actual = map(validate, ~.x$Attrition == "Yes"),
    # Prepare a binary vector of predicted Attrition values for validate
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y)$predictions == "Yes")
  ) %>%
  mutate(
    # Calculate accuracy and recall
    Accuracy = map2(.x = validate_actual, .y = validate_predicted, ~accuracy(.x, .y)),
    Recall = map2(.x = validate_actual, .y = validate_predicted, ~recall(.x, .y))
  ) %>%
  unnest(Accuracy) %>%
  unnest(Recall)
knitr::kable(cv_models_rf_pred %>% select(mtry, Accuracy, Recall) %>%
               group_by(mtry) %>% summarise_all(mean))
mtry | Accuracy | Recall |
---|---|---|
2 | 0.8493624 | 0.1127804 |
4 | 0.8520814 | 0.1519070 |
8 | 0.8529905 | 0.1865930 |
16 | 0.8538996 | 0.2032597 |
2. Supervised modeling framework in tidymodels
tidymodels roadmap (taken from DataCamp):
- rsample splits training and testing
- recipes for feature engineering
- parsnip fits the model with 3 components:
  - model type (e.g. linear regression)
  - engine (the underlying R package)
  - mode (regression or classification)
- yardstick evaluates model performance, i.e. metrics functions
A minimal sketch tying the four packages together is shown below.
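This sketch (my own, not from the notes) touches all four packages on the built-in mpg data:
library(tidymodels)
# rsample: split the data
mpg_split = initial_split(mpg, prop = 0.75)
# recipes: a trivial preprocessing recipe
mpg_rec = recipe(hwy ~ cty + displ, data = training(mpg_split))
# parsnip: model type + engine + mode
mpg_spec = linear_reg() %>% set_engine('lm') %>% set_mode('regression')
# Combine in a workflow, fit, and evaluate with a yardstick metric
wf_fit = workflow() %>% add_recipe(mpg_rec) %>% add_model(mpg_spec) %>%
  fit(data = training(mpg_split))
predict(wf_fit, testing(mpg_split)) %>%
  bind_cols(testing(mpg_split)) %>%
  rmse(truth = hwy, estimate = .pred)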
2.1 Linear regression
# Split training and testing
mpg_split = initial_split(mpg, prop = 0.75, strata = hwy)
mpg_training = mpg_split %>% training()
mpg_test = mpg_split %>% testing()
library(tidymodels)
# Set up the model
lm_model = linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression')
# Fit the model
lm_fit = lm_model %>%
  fit(hwy ~ cty, data = mpg_training)
# Model coefficients
tidy(lm_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.695 0.563 1.24 2.18e- 1
## 2 cty 1.35 0.0326 41.4 4.39e-91
# Prediction on test data
mpg_test_results = mpg_test %>%
  select(hwy, cty) %>%
  bind_cols(
    lm_fit %>% predict(new_data = mpg_test)
  )
# Performance
mpg_test_results %>% rsq(truth = hwy, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.923
# Visualization
ggplot(mpg_test_results, aes(x = hwy, y = .pred)) +
geom_point() +
geom_abline(color = 'blue', linetype = 2) +
coord_obs_pred() +
labs(title = 'R-Squared Plot',
y = 'Predicted Highway MPG',
x = 'Actual Highway MPG')
The above procedure can be streamlined with last_fit(), which takes the model spec, a formula, and the splitting object. THIS SHOULD BE USED if there is no need to inspect coefficients. Streamlined approach:
lm_last_fit = lm_model %>%
  last_fit(hwy ~ cty,
           split = mpg_split)
lm_last_fit %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.86 Preprocessor1_Model1
## 2 rsq standard 0.923 Preprocessor1_Model1
lm_last_fit %>% collect_predictions()
## # A tibble: 61 × 5
## id .pred .row hwy .config
## <chr> <dbl> <int> <int> <chr>
## 1 train/test split 29.1 4 30 Preprocessor1_Model1
## 2 train/test split 25.0 7 27 Preprocessor1_Model1
## 3 train/test split 22.3 9 25 Preprocessor1_Model1
## 4 train/test split 23.7 13 25 Preprocessor1_Model1
## 5 train/test split 23.7 14 25 Preprocessor1_Model1
## 6 train/test split 23.7 17 25 Preprocessor1_Model1
## 7 train/test split 22.3 18 23 Preprocessor1_Model1
## 8 train/test split 15.6 20 15 Preprocessor1_Model1
## 9 train/test split 18.3 22 17 Preprocessor1_Model1
## 10 train/test split 16.9 23 17 Preprocessor1_Model1
## # … with 51 more rows
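If coefficients are still needed after last_fit(), the fitted parsnip model can usually be pulled out of the result; a hedged sketch (extract_fit_parsnip() for last_fit objects is available in recent versions of tune):
# Pull the underlying fitted model out of the last_fit() result and tidy it
lm_last_fit %>%
  extract_fit_parsnip() %>%
  tidy()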
2.2 Classification
leads_df = readRDS("~/Dropbox/Coursework/Machine Learning in R/Modeling with tidymodels in R/datasets/leads_df.rds")
leads_split = initial_split(leads_df, prop = 0.75, strata = purchased)
leads_training = leads_split %>% training()
leads_test = leads_split %>% testing()
# Set up the model
logistic_model = logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')
# Fit the model
logistic_fit = logistic_model %>%
  fit(purchased ~ total_visits + total_time,
      data = leads_training)
# Prediction
class_preds = logistic_fit %>%
  predict(new_data = leads_test,
          type = 'class')
prob_preds = logistic_fit %>%
  predict(new_data = leads_test,
          type = 'prob')
leads_results = leads_test %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)
In tidymodels classification models, the outcome has to be a factor with the positive class as the first level.
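If the positive class is not already the first level, relevel the factor before splitting; a small sketch, assuming the levels are 'yes' and 'no' as in the outputs below:
# Make 'yes' the first (positive) level of the outcome factor
leads_df$purchased = factor(leads_df$purchased, levels = c('yes', 'no'))
levels(leads_df$purchased)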
# Get selected metrics
custom_metrics = metric_set(sens, spec)

custom_metrics(leads_results,
               truth = purchased,
               estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.7
## 2 spec binary 0.858
# Get all metrics from the confusion matrix
conf_mat(leads_results, truth = purchased,
estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
# Visualize the confusion matrix
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'heatmap')
conf_mat(leads_results,
truth = purchased,
estimate = .pred_class) %>%
autoplot(type = 'mosaic')
# ROC
leads_results %>%
  roc_curve(truth = purchased, .pred_yes) %>%
  autoplot()
# AUC
roc_auc(leads_results,truth = purchased,.pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.852
Streamlined Approach:
logistic_last_fit = logistic_model %>%
  last_fit(purchased ~ total_visits + total_time,
           split = leads_split)
logistic_last_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.801 Preprocessor1_Model1
## 2 roc_auc binary 0.852 Preprocessor1_Model1
last_fit_results = logistic_last_fit %>%
  collect_predictions()
conf_mat(last_fit_results, truth = purchased,
         estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.801
## 2 kap binary 0.565
## 3 sens binary 0.7
## 4 spec binary 0.858
## 5 ppv binary 0.737
## 6 npv binary 0.835
## 7 mcc binary 0.565
## 8 j_index binary 0.558
## 9 bal_accuracy binary 0.779
## 10 detection_prevalence binary 0.343
## 11 precision binary 0.737
## 12 recall binary 0.7
## 13 f_meas binary 0.718
2.3 Feature engineering
A recipe allows:
- Defining column roles (predictor or outcome)
- Declaring variable data types (numeric or categorical)
- Specifying preprocessing steps via step_*()
2.3.1 Overall procedure
# Build a recipe
leads_log_rec = recipe(purchased ~ ., data = leads_training) %>%
  step_log(total_time, base = 10)
leads_log_rec %>%
  summary()
## # A tibble: 7 × 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 total_visits numeric predictor original
## 2 total_time numeric predictor original
## 3 pages_per_visit numeric predictor original
## 4 total_clicks numeric predictor original
## 5 lead_source nominal predictor original
## 6 us_location nominal predictor original
## 7 purchased nominal outcome original
# Train based on the recipe
leads_log_rec_prep = leads_log_rec %>% prep(training = leads_training)

# Transform the training data
leads_log_rec_prep %>%
  bake(new_data = NULL)
## # A tibble: 996 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 7 2.68 2.33 21 organic… west no
## 2 3 2.85 3 28 organic… west no
## 3 3 2.22 3 32 direct_… southe… no
## 4 3 0.845 3 23 organic… west no
## 5 6 2.75 6 48 organic… southe… no
## 6 4 2.37 4 31 organic… southe… no
## 7 2 2.37 1 13 organic… southe… no
## 8 3 3.01 1.5 17 organic… southe… no
## 9 2 2.95 2 25 organic… southe… no
## 10 2 2.75 2 22 organic… southw… no
## # … with 986 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
# Transform the test data
leads_log_rec_prep %>%
  bake(new_data = leads_test)
## # A tibble: 332 × 7
## total_visits total_time pages_per_visit total_clicks lead_s…¹ us_lo…² purch…³
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 8 2 2.67 24 direct_… west no
## 2 5 2.36 2.5 25 email southe… no
## 3 4 2.25 4 37 direct_… west no
## 4 2 3.10 2 26 email midwest no
## 5 2 2.29 2 20 organic… west no
## 6 2 2.87 2 26 affilia… southw… yes
## 7 4 3.11 4 37 direct_… northe… yes
## 8 2 1.20 1 12 email midwest no
## 9 9 3.01 1.5 12 direct_… west yes
## 10 2 2.24 2 20 affilia… southw… no
## # … with 322 more rows, and abbreviated variable names ¹lead_source,
## # ²us_location, ³purchased
2.3.2 Treatment of numeric variables
- Correlated variables
leads_training %>%
  select_if(is.numeric) %>%
  cor()
## total_visits total_time pages_per_visit total_clicks
## total_visits 1.00000000 0.026421560 0.48605494 0.463329369
## total_time 0.02642156 1.000000000 0.01400162 0.003530897
## pages_per_visit 0.48605494 0.014001621 1.00000000 0.963031706
## total_clicks 0.46332937 0.003530897 0.96303171 1.000000000
## For each pair of highly correlated variables, one will be removed
leads_cor_rec = recipe(purchased ~ .,
                       data = leads_training) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.9)
processed_train = leads_cor_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = NULL)
processed_test = leads_cor_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = leads_test)
- Normalization
leads_norm_rec = recipe(purchased ~ .,
                        data = leads_training) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.9) %>%
  step_normalize(all_numeric())
leads_norm_rec %>%
  prep(training = leads_training) %>%
  bake(new_data = leads_test)
## # A tibble: 332 × 6
## total_visits total_time total_clicks lead_source us_location purchased
## <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 0.974 -0.993 -0.334 direct_traffic west no
## 2 0.147 -0.754 -0.260 email southeast no
## 3 -0.129 -0.849 0.622 direct_traffic west no
## 4 -0.680 1.20 -0.187 email midwest no
## 5 -0.680 -0.819 -0.628 organic_search west no
## 6 -0.680 0.202 -0.187 affiliate southwest yes
## 7 -0.129 1.22 0.622 direct_traffic northeast yes
## 8 -0.680 -1.15 -1.22 email midwest no
## 9 1.25 0.731 -1.22 direct_traffic west yes
## 10 -0.680 -0.859 -0.628 affiliate southwest no
## # … with 322 more rows
2.3.3 Treatment of nominal variables
- Create dummy variables
recipe(purchased ~ ., data = leads_training) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
prep(training = leads_training) %>%
bake(new_data = leads_test)
## # A tibble: 332 × 12
## total_visits total_…¹ pages…² total…³ purch…⁴ lead_…⁵ lead_…⁶ lead_…⁷ us_lo…⁸
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 8 100 2.67 24 no 0 0 1 0
## 2 5 228 2.5 25 no 1 0 0 1
## 3 4 177 4 37 no 0 0 1 0
## 4 2 1273 2 26 no 1 0 0 0
## 5 2 193 2 20 no 0 1 0 0
## 6 2 739 2 26 yes 0 0 0 0
## 7 4 1286 4 37 yes 0 0 1 0
## 8 2 16 1 12 no 1 0 0 0
## 9 9 1022 1.5 12 yes 0 0 1 0
## 10 2 172 2 20 no 0 0 0 0
## # … with 322 more rows, 3 more variables: us_location_midwest <dbl>,
## # us_location_southwest <dbl>, us_location_west <dbl>, and abbreviated
## # variable names ¹total_time, ²pages_per_visit, ³total_clicks, ⁴purchased,
## # ⁵lead_source_email, ⁶lead_source_organic_search,
## # ⁷lead_source_direct_traffic, ⁸us_location_southeast
2.4 Machine learning workflow
2.4.1 A full picture
Data resampling
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
Model specification
logistic_model = logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')
Feature engineering
# Create recipe
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
# Train recipe
leads_recipe_prep = leads_recipe %>%
  prep(training = leads_training)
leads_training_prep = leads_recipe_prep %>%
  bake(new_data = NULL)
leads_test_prep = leads_recipe_prep %>%
  bake(new_data = leads_test)
Model fitting and prediction
logistic_fit = logistic_model %>%
  fit(purchased ~ .,
      data = leads_training_prep)
class_preds = predict(logistic_fit,
                      new_data = leads_test_prep,
                      type = 'class')
prob_preds = predict(logistic_fit,
                     new_data = leads_test_prep,
                     type = 'prob')
leads_results = leads_test %>%
  select(purchased) %>%
  bind_cols(class_preds, prob_preds)
Model evaluation
leads_results %>%
  conf_mat(truth = purchased,
           estimate = .pred_class) %>% summary()
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.822
## 2 kap binary 0.601
## 3 sens binary 0.675
## 4 spec binary 0.906
## 5 ppv binary 0.802
## 6 npv binary 0.831
## 7 mcc binary 0.606
## 8 j_index binary 0.581
## 9 bal_accuracy binary 0.790
## 10 detection_prevalence binary 0.304
## 11 precision binary 0.802
## 12 recall binary 0.675
## 13 f_meas binary 0.733
roc_auc(leads_results,truth = purchased,.pred_yes)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.857
2.4.2 A decision tree example to combine model and recipe
dt_model = decision_tree() %>%
  set_engine('rpart') %>%
  set_mode('classification')
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
Combine the model and the recipe
# Create the workflow
leads_wkfl = workflow() %>%
  add_model(dt_model) %>%
  add_recipe(leads_recipe)
leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Computational engine: rpart
# Fit the model using the streamlined approach
leads_wkfl_fit = leads_wkfl %>%
  last_fit(split = leads_split)
leads_wkfl_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.816 Preprocessor1_Model1
## 2 roc_auc binary 0.846 Preprocessor1_Model1
leads_wkfl_preds = leads_wkfl_fit %>%
  collect_predictions()
leads_wkfl_preds
## # A tibble: 332 × 7
## id .pred_yes .pred_no .row .pred_class purchased .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.132 0.868 3 no no Preprocessor…
## 2 train/test split 0.132 0.868 4 no no Preprocessor…
## 3 train/test split 0.132 0.868 8 no no Preprocessor…
## 4 train/test split 0.132 0.868 9 no no Preprocessor…
## 5 train/test split 0.132 0.868 10 no no Preprocessor…
## 6 train/test split 0.132 0.868 22 no no Preprocessor…
## 7 train/test split 0.757 0.243 24 yes yes Preprocessor…
## 8 train/test split 0.132 0.868 29 no no Preprocessor…
## 9 train/test split 0.757 0.243 37 yes yes Preprocessor…
## 10 train/test split 0.757 0.243 38 yes yes Preprocessor…
## # … with 322 more rows
# Evaluation
leads_metrics = metric_set(roc_auc, sens, spec)
leads_wkfl_preds %>%
  leads_metrics(truth = purchased,
                estimate = .pred_class,
                .pred_yes)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.683
## 2 spec binary 0.892
## 3 roc_auc binary 0.846
2.4.3 Cross validation
set.seed(214)
leads_folds = vfold_cv(leads_training,
                       v = 10,
                       strata = purchased)
leads_rs_fit = leads_wkfl %>%
  fit_resamples(resamples = leads_folds,
                metrics = leads_metrics)
## Or:
# leads_rs_fit = fit_resamples(
# dt_model,
# purchased ~ .,
# resamples = leads_folds,
# metrics = leads_metrics)
# Get the mean across CV folds
leads_rs_fit %>%
  collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 roc_auc binary 0.794 10 0.0123 Preprocessor1_Model1
## 2 sens binary 0.672 10 0.0175 Preprocessor1_Model1
## 3 spec binary 0.846 10 0.0127 Preprocessor1_Model1
# Get the metrics for each CV fold
rs_metrics = leads_rs_fit %>%
  collect_metrics(summarize = FALSE)
rs_metrics %>%
  group_by(.metric) %>%
  summarize(min = min(.estimate),
            median = median(.estimate),
            max = max(.estimate),
            mean = mean(.estimate),
            sd = sd(.estimate))
## # A tibble: 3 × 6
## .metric min median max mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 roc_auc 0.738 0.803 0.859 0.794 0.0390
## 2 sens 0.556 0.681 0.75 0.672 0.0552
## 3 spec 0.794 0.841 0.906 0.846 0.0402
2.4.4 Hyperparameter tuning in tidymodels
Set up the model to label the hyperparameters to be tuned:
dt_tune_model = decision_tree(cost_complexity = tune(),
                              tree_depth = tune(),
                              min_n = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')
dt_tune_model
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
leads_tune_wkfl = leads_wkfl %>%
  update_model(dt_tune_model)
leads_tune_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
## min_n = tune()
##
## Computational engine: rpart
Grid search - random
set.seed(214)
dt_grid = grid_random(parameters(dt_tune_model), size = 5)

dt_tuning = leads_tune_wkfl %>%
  tune_grid(resamples = leads_folds,
            grid = dt_grid,
            metrics = leads_metrics)
## Overall exploration of results
# dt_tuning %>%
#   collect_metrics()
dt_tuning %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == 'roc_auc') %>%
  group_by(id) %>%
  summarize(min_roc_auc = min(.estimate),
            median_roc_auc = median(.estimate),
            max_roc_auc = max(.estimate))
## # A tibble: 10 × 4
## id min_roc_auc median_roc_auc max_roc_auc
## <chr> <dbl> <dbl> <dbl>
## 1 Fold01 0.768 0.806 0.807
## 2 Fold02 0.736 0.806 0.838
## 3 Fold03 0.815 0.844 0.854
## 4 Fold04 0.799 0.807 0.837
## 5 Fold05 0.633 0.745 0.768
## 6 Fold06 0.817 0.861 0.874
## 7 Fold07 0.695 0.799 0.811
## 8 Fold08 0.759 0.799 0.860
## 9 Fold09 0.739 0.818 0.845
## 10 Fold10 0.727 0.738 0.775
Grid search - regular
dt_grid = grid_regular(parameters(dt_tune_model), levels = 3)
Show and select best model
# Display the best model from tuning
dt_tuning %>% show_best(metric = 'roc_auc', n = 5)
## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000758 14 39 roc_auc binary 0.820 10 0.0128 Prepro…
## 2 0.00380 5 36 roc_auc binary 0.805 10 0.0135 Prepro…
## 3 0.000000600 3 5 roc_auc binary 0.798 10 0.00935 Prepro…
## 4 0.0243 5 34 roc_auc binary 0.791 10 0.0124 Prepro…
## 5 0.00000443 11 8 roc_auc binary 0.767 10 0.0228 Prepro…
## # … with abbreviated variable name ¹.estimator
# Select the best model
best_dt_model = dt_tuning %>%
  select_best(metric = 'roc_auc')
best_dt_model
## # A tibble: 1 × 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000758 14 39 Preprocessor1_Model1
# Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
  finalize_workflow(best_dt_model)
final_leads_wkfl
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 7.58290839567418e-08
## tree_depth = 14
## min_n = 39
##
## Computational engine: rpart
leads_final_fit = final_leads_wkfl %>%
  last_fit(split = leads_split)
leads_final_fit %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.810 Preprocessor1_Model1
## 2 roc_auc binary 0.855 Preprocessor1_Model1
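To reuse the tuned model on genuinely new data, the trained workflow can be pulled out of the last_fit() result; a hedged sketch (extract_workflow() for last_fit objects is available in recent versions of tune):
# Extract the workflow trained on the full training set; its recipe is baked in,
# so predict() can be called on raw new data
final_trained_wkfl = leads_final_fit %>% extract_workflow()
predict(final_trained_wkfl, new_data = leads_test)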
2.5 Final Overall Procedure
# A full picture
rm(list = ls())  # clear the workspace; note leads_df must be re-read before running the steps below
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
# 2. Initial model set up, indicating parameters to be tuned
dt_tune_model = decision_tree(cost_complexity = tune(),
                              tree_depth = tune(),
                              min_n = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')
leads_metrics = metric_set(roc_auc, sens, spec)

# 3. Set up the recipe
leads_recipe = recipe(purchased ~ .,
                      data = leads_training) %>%
  step_corr(all_numeric(), threshold = 0.9) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
# 4. Create the cross-validation folds
leads_folds = vfold_cv(leads_training,
                       v = 10,
                       strata = purchased)
# 5. Create the ML workflow
leads_tune_wkfl = workflow() %>%
  add_model(dt_tune_model) %>%
  add_recipe(leads_recipe)
# 6. Grid search tuning
dt_grid = grid_random(extract_parameter_set_dials(dt_tune_model), size = 5)

dt_tuning = leads_tune_wkfl %>%
  tune_grid(resamples = leads_folds,
            grid = dt_grid,
            metrics = leads_metrics)
# 7. Evaluate the grid search
dt_tuning %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == 'roc_auc') %>%
  group_by(id) %>%
  summarize(min_roc_auc = min(.estimate),
            median_roc_auc = median(.estimate),
            max_roc_auc = max(.estimate))
# 8. Select the best model
best_dt_model = dt_tuning %>%
  select_best(metric = 'roc_auc')
# 9. Finalize the workflow
final_leads_wkfl = leads_tune_wkfl %>%
  finalize_workflow(best_dt_model)
final_leads_wkfl
# 10. Apply the model to the test data and evaluate the test set performance
leads_final_fit = final_leads_wkfl %>%
  last_fit(split = leads_split)
conf_mat(leads_final_fit$.predictions[[1]], truth = purchased,
         estimate = .pred_class) %>% summary()
leads_final_fit %>% collect_metrics()
# A full picture with xgboost
# 1. Split training and testing
leads_split = initial_split(leads_df, strata = purchased)
leads_training = leads_split %>%
  training()
leads_test = leads_split %>%
  testing()
# 2. Set up the tuning specs
boost_spec = boost_tree(
  trees = 500,
  learn_rate = tune(),
  tree_depth = tune(),
  sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
# 3. Create the tuning grid
tunegrid_boost = grid_regular(extract_parameter_set_dials(boost_spec),
                              levels = 2)
# 4. Tuning
tune_results = tune_grid(
  boost_spec,
  purchased ~ .,
  resamples = vfold_cv(leads_training, v = 6),
  grid = tunegrid_boost,
  metrics = metric_set(roc_auc))
# 5. Visualize the results
autoplot(tune_results)
# 6. Finalize the model
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec, best_params)
# # 7. Train the final model
# final_model = final_spec %>% fit(formula = purchased ~ .,
#                                  data = leads_training)
#
# # 8. Testing prediction
# predictions = leads_test %>% select(purchased) %>%
#   bind_cols(predict(final_model, leads_test, type = "class"),
#             predict(final_model, leads_test, type = "prob"))
# 7. The streamlined approach
best_params = select_best(tune_results)
final_spec = finalize_model(boost_spec, best_params)
final_fit = final_spec %>%
  last_fit(purchased ~ .,
           split = leads_split)
final_fit %>% collect_metrics()
final_fit %>% collect_predictions()
3. Machine learning with caret
3.1 ML setup in caret
library(caret)
set.seed(42)
breast_cancer_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/breast_cancer_data_orig.csv")
# Create partition index
index = createDataPartition(breast_cancer_data$diagnosis, p = .70,
                            list = FALSE)
# Subset `breast_cancer_data` with index
bc_train_data = breast_cancer_data[index, ]
bc_test_data = breast_cancer_data[-index, ]

# Set up CV
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
hyperparams = expand.grid(degree = 4, scale = 1, C = 1)
set.seed(42)
svm_model = train(diagnosis ~ .,
                  data = bc_train_data,
                  method = "svmPoly",
                  trControl = fitControl,
                  tuneGrid = hyperparams,
                  verbose = FALSE)
3.2 Data preprocessing
model = train(X, Y, method = "glm",
              preProcess = c("zv", "center", "scale", "medianImpute", "pca"))
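X and Y above are placeholders. A concrete hedged sketch (my own example) using bc_train_data from 3.1, assuming diagnosis can be coded as a two-level factor; "pca" is dropped so the coefficients stay interpretable:
# Ensure the outcome is a factor for classification
bc_train_data$diagnosis = as.factor(bc_train_data$diagnosis)
# Drop zero-variance columns, center/scale, and median-impute before a logistic fit
bc_glm = train(diagnosis ~ .,
               data = bc_train_data,
               method = "glm",
               family = "binomial",
               preProcess = c("zv", "center", "scale", "medianImpute"))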
3.3 Hyperparameter tuning in caret
The tunable hyperparameters for each method can be viewed at: https://topepo.github.io/caret/available-models.html
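They can also be looked up from within R via caret's modelLookup(); for example, for the gbm method used below:
# List the tunable parameters for the "gbm" method
modelLookup("gbm")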
voters_train_data = readr::read_csv("~/Dropbox/Coursework/Machine Learning in R/Hyperpearameter Tuning in R/datasets/voters_train_data.csv")

# Set up CV
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5)
Custom tuning grid
man_grid = expand.grid(n.trees = c(100, 200, 250),
                       interaction.depth = c(1, 4, 6),
                       shrinkage = 0.1, n.minobsinnode = 10)
gbm_model_voters_grid = train(turnout16_2016 ~ ., data = voters_train_data,
                              method = "gbm",
                              trControl = fitControl,
                              verbose = FALSE,
                              tuneGrid = man_grid)
Grid search with ranges
big_grid = expand.grid(n.trees = seq(from = 10, to = 300, by = 50),
                       interaction.depth = seq(from = 1, to = 10,
                                               length.out = 6),
                       shrinkage = 0.1,
                       n.minobsinnode = 10)
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "grid")
set.seed(42)
gbm_model_voters_big_grid = train(turnout16_2016 ~ .,
                                  data = voters_train_data,
                                  method = "gbm",
                                  trControl = fitControl,
                                  verbose = FALSE,
                                  tuneGrid = big_grid)
ggplot(gbm_model_voters_big_grid)
Random Search
fitControl = trainControl(method = "repeatedcv", number = 3, repeats = 5, search = "random")
gbm_model_voters_random = train(turnout16_2016 ~ .,
                                data = voters_train_data,
                                method = "gbm",
                                trControl = fitControl,
                                verbose = FALSE,
                                tuneLength = 5)
3.4 Compare multiple models
set.seed(42)
myFolds = createFolds(bc_train_data$diagnosis, k = 5)
myControl = trainControl(
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  savePredictions = TRUE,
  index = myFolds
)
model_glmnet = train(diagnosis ~ .,
                     data = bc_train_data,
                     metric = "ROC",
                     method = "glmnet",
                     trControl = myControl)
## + Fold1: alpha=0.10, lambda=0.07724
## - Fold1: alpha=0.10, lambda=0.07724
## + Fold1: alpha=0.55, lambda=0.07724
## - Fold1: alpha=0.55, lambda=0.07724
## + Fold1: alpha=1.00, lambda=0.07724
## - Fold1: alpha=1.00, lambda=0.07724
## + Fold2: alpha=0.10, lambda=0.07724
## - Fold2: alpha=0.10, lambda=0.07724
## + Fold2: alpha=0.55, lambda=0.07724
## - Fold2: alpha=0.55, lambda=0.07724
## + Fold2: alpha=1.00, lambda=0.07724
## - Fold2: alpha=1.00, lambda=0.07724
## + Fold3: alpha=0.10, lambda=0.07724
## - Fold3: alpha=0.10, lambda=0.07724
## + Fold3: alpha=0.55, lambda=0.07724
## - Fold3: alpha=0.55, lambda=0.07724
## + Fold3: alpha=1.00, lambda=0.07724
## - Fold3: alpha=1.00, lambda=0.07724
## + Fold4: alpha=0.10, lambda=0.07724
## - Fold4: alpha=0.10, lambda=0.07724
## + Fold4: alpha=0.55, lambda=0.07724
## - Fold4: alpha=0.55, lambda=0.07724
## + Fold4: alpha=1.00, lambda=0.07724
## - Fold4: alpha=1.00, lambda=0.07724
## + Fold5: alpha=0.10, lambda=0.07724
## - Fold5: alpha=0.10, lambda=0.07724
## + Fold5: alpha=0.55, lambda=0.07724
## - Fold5: alpha=0.55, lambda=0.07724
## + Fold5: alpha=1.00, lambda=0.07724
## - Fold5: alpha=1.00, lambda=0.07724
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0772 on full training set
model_rf = train(diagnosis ~ .,
                 data = bc_train_data,
                 metric = "ROC",
                 method = "ranger",
                 trControl = myControl)
## + Fold1: mtry= 2, min.node.size=1, splitrule=gini
## - Fold1: mtry= 2, min.node.size=1, splitrule=gini
## + Fold1: mtry=16, min.node.size=1, splitrule=gini
## - Fold1: mtry=16, min.node.size=1, splitrule=gini
## + Fold1: mtry=30, min.node.size=1, splitrule=gini
## - Fold1: mtry=30, min.node.size=1, splitrule=gini
## + Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold1: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold2: mtry= 2, min.node.size=1, splitrule=gini
## - Fold2: mtry= 2, min.node.size=1, splitrule=gini
## + Fold2: mtry=16, min.node.size=1, splitrule=gini
## - Fold2: mtry=16, min.node.size=1, splitrule=gini
## + Fold2: mtry=30, min.node.size=1, splitrule=gini
## - Fold2: mtry=30, min.node.size=1, splitrule=gini
## + Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold2: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold2: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold2: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold2: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold3: mtry= 2, min.node.size=1, splitrule=gini
## - Fold3: mtry= 2, min.node.size=1, splitrule=gini
## + Fold3: mtry=16, min.node.size=1, splitrule=gini
## - Fold3: mtry=16, min.node.size=1, splitrule=gini
## + Fold3: mtry=30, min.node.size=1, splitrule=gini
## - Fold3: mtry=30, min.node.size=1, splitrule=gini
## + Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold3: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold3: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold3: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold3: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold4: mtry= 2, min.node.size=1, splitrule=gini
## - Fold4: mtry= 2, min.node.size=1, splitrule=gini
## + Fold4: mtry=16, min.node.size=1, splitrule=gini
## - Fold4: mtry=16, min.node.size=1, splitrule=gini
## + Fold4: mtry=30, min.node.size=1, splitrule=gini
## - Fold4: mtry=30, min.node.size=1, splitrule=gini
## + Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold4: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold4: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold4: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold4: mtry=30, min.node.size=1, splitrule=extratrees
## + Fold5: mtry= 2, min.node.size=1, splitrule=gini
## - Fold5: mtry= 2, min.node.size=1, splitrule=gini
## + Fold5: mtry=16, min.node.size=1, splitrule=gini
## - Fold5: mtry=16, min.node.size=1, splitrule=gini
## + Fold5: mtry=30, min.node.size=1, splitrule=gini
## - Fold5: mtry=30, min.node.size=1, splitrule=gini
## + Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
## - Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
## + Fold5: mtry=16, min.node.size=1, splitrule=extratrees
## - Fold5: mtry=16, min.node.size=1, splitrule=extratrees
## + Fold5: mtry=30, min.node.size=1, splitrule=extratrees
## - Fold5: mtry=30, min.node.size=1, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = extratrees, min.node.size = 1 on full training set
# Make a list
model_list = list(glmnet = model_glmnet, rf = model_rf)
resamps = resamples(model_list)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: glmnet, rf
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.9875630 0.9890336 0.9917647 0.9913304 0.9934167 0.9948739 0
## rf 0.9873529 0.9873739 0.9891597 0.9894744 0.9907773 0.9927083 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.975 0.98 0.995 0.989 0.995 1 0
## rf 0.950 0.96 0.980 0.977 0.995 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glmnet 0.8067227 0.8750000 0.9075630 0.8876050 0.9075630 0.9411765 0
## rf 0.8151261 0.8833333 0.8991597 0.8892717 0.8991597 0.9495798 0
bwplot(resamps, metric = "ROC")
densityplot(resamps, metric = "ROC")