gee model #154

spsanderson · 2023-11-18T03:16:05Z

spsanderson
Nov 18, 2023
Maintainer

I got the following to work for the gee model, so I can turn it into a function, but this requires a bit of a paradigm shift in how the fast_regression/classification functions work:

library(tidyAML)
library(tidyverse)
library(tidymodels)
library(multilevelmod)

rec_obj <- recipe(mpg ~ ., data = mtcars)
splits_obj <- create_splits(mtcars)

my_formula <- formula(prep(rec_obj))
predictor_vars <- rec_obj$var_info |>
  filter(role == "predictor") |>
  pull(variable)
var_to_replace <- rec_obj$var_info |> 
  filter(role == "predictor") |> 
  slice(1) |> 
  pull(variable)
outcome_var <- rec_obj$var_info |>
  filter(role == "outcome") |>
  pull(variable)

new_terms <-  paste0("id_var(", var_to_replace, ")")
new_terms1 <- paste(new_terms, collapse = "+")
new_formula <- do.call(
  "substitute", 
  list(
    my_formula, 
    setNames(
      list(
        str2lang(new_terms1)
      ), 
      var_to_replace
    )
  )
)
new_formula <- as.formula(new_formula)

mod_tbl <- fast_regression_parsnip_spec_tbl(
  .parsnip_eng = c("gee"),
  .parsnip_fns = "linear_reg"
)

mod_spec <- mod_tbl[["model_spec"]][[1]]
mod_wflw_tbl <- mod_tbl |>
  mutate(wflw = list(workflow() |>
                       add_variables(
                         outcomes = outcome_var,
                         predictors = c(predictor_vars)
                       ) |>
                       add_model(mod_spec, formula = new_formula)))

mod_fitted_tbl <- mod_wflw_tbl |>
  mutate(fitted_wflw = list(fit(mod_wflw_tbl$wflw[[1]], data = training(splits_obj$splits))))

mod_pred_tbl <- mod_fitted_tbl |>
  mutate(pred_wflw = list(predict(mod_fitted_tbl[[7]][[1]], testing(splits_obj$splits))))

mod_pred_tbl[[8]][[1]]

# A tibble: 8 × 1
  .pred
  <dbl>
1 21.6 
2 18.5 
3 16.2 
4  8.27
5 26.7 
6 29.2 
7 24.3 
8 18.0

It's important to note that this is 'fast' so we just pick the first predictor variable.

Also maybe it is only necessary to make functions for those that fail the traditional workflow? thoughts?

spsanderson · 2023-11-21T14:57:29Z

spsanderson
Nov 21, 2023
Maintainer Author

I have made what I think can be a generic update to this that will add a class to the model_spec output #157

2 replies

spsanderson Nov 22, 2023
Maintainer Author

I merged this, and now have to work on the internal_make_wflw_tbl() function. This is where I think we are going to have to use some method dispatching. I have added a class to the make spec and it applies it to the model specification itself.

spsanderson Nov 22, 2023
Maintainer Author

Unfortunately adding a class to the model spec return object causes workflows::fit to fail, so I have added an attribute that strings together what the class would have been. Back to the drawing board.

spsanderson · 2023-11-28T03:50:10Z

spsanderson
Nov 28, 2023
Maintainer Author

I think I am starting to get a framework of sorts going. Here is what I ended up with tonight:

New internal_make_spec_tbl:

internal_make_spec_tblv2 <- function(.model_tbl){
  
  # Tidyeval ----
  model_tbl <- .model_tbl
  
  # Checks ----
  if (!inherits(model_tbl, "tidyaml_base_tbl")){
    rlang::abort(
      message = "The model tibble must come from the make base tbl function.",
      use_cli_format = TRUE
    )
  }
  
  # Manipulation
  model_factor_tbl <- model_tbl |>
    dplyr::mutate(.model_id = dplyr::row_number() |>
                    forcats::as_factor()) |>
    dplyr::select(.model_id, dplyr::everything())
  
  # Make a group split object list
  models_list <- model_factor_tbl |>
    dplyr::group_split(.model_id)
  
  # Make the Workflow Object using purrr imap
  model_spec <- models_list |>
    purrr::imap(
      .f = function(obj, id){
        
        # Pull the model column and then pluck the model
        pe <- obj |> dplyr::pull(2) |> purrr::pluck(1)
        pm <- obj |> dplyr::pull(3) |> purrr::pluck(1)
        pf <- obj |> dplyr::pull(4) |> purrr::pluck(1)
        
        ret <- match.fun(pf)(mode = pm, engine = pe)
        
        # Add parsnip engine and fns as class
        # class(ret) <- c(
        #   class(ret),
        #   paste0(base::tolower(pe), "_", base::tolower(pf))
        # )
        
        # Return the result
        attributes(ret)$.tidyaml_mod_class <- paste0(base::tolower(pe), "_", base::tolower(pf))
        return(ret)
      }
    )
  
  # Return
  # Make sure to return as a tibble
  model_spec_ret <- model_factor_tbl |>
    dplyr::mutate(model_spec = model_spec)  |>
    dplyr::mutate(.model_id = as.integer(.model_id))
  
  return(model_spec_ret)
}

New internal_make_wflw_gee_lin_reg specific to gee linear reg:

internal_make_wflw_gee_lin_reg <- function(.model_tbl, .rec_obj){
  
  # Tidyeval ----
  model_tbl <- .model_tbl
  rec_obj <- .rec_obj
  mod_atb <- attributes(model_tbl$model_spec[[1]])
  
  # Checks ----
  if (!inherits(model_tbl, "tidyaml_mod_spec_tbl")){
    rlang::abort(
      message = "'.model_tbl' must inherit a class of 'tidyaml_mod_spec_tbl",
      use_cli_format = TRUE
    )
  }

  if (!mod_atb$.tidyaml_mod_class == "gee_linear_reg"){
    rlang::abort(
      message = "The model class is not 'gee_linear_reg'.",
      use_cli_format = TRUE
    )
  }
  
  # Manipulation
  model_factor_tbl <- model_tbl |>
    dplyr::mutate(.model_id = forcats::as_factor(.model_id)) |>
    dplyr::mutate(rec_obj = list(rec_obj))
  
  # Make a group split object list
  models_list <- model_factor_tbl |>
    dplyr::group_split(.model_id)
  
  # Make the Workflow Object using purrr imap
  wflw_list <- models_list |>
    purrr::imap(
      .f = function(obj, id){
        
        # Pull the model column and then pluck the model
        mod <- obj |> dplyr::pull(5) |> purrr::pluck(1)
        
        # PUll the recipe column and then pluck the recipe
        rec_obj <- obj |> dplyr::pull(6) |> purrr::pluck(1)
        
        # Make New formula
        # Make a formula
        my_formula <- formula(recipes::prep(rec_obj))
        predictor_vars <- rec_obj$var_info |>
          dplyr::filter(role == "predictor") |>
          dplyr::pull(variable)
        var_to_replace <- rec_obj$var_info |> 
          dplyr::filter(role == "predictor") |> 
          dplyr::slice(1) |> 
          dplyr::pull(variable)
        outcome_var <- rec_obj$var_info |>
          dplyr::filter(role == "outcome") |>
          dplyr::pull(variable)
        
        new_terms <-  paste0("id_var(", var_to_replace, ")")
        new_terms1 <- paste(new_terms, collapse = "+")
        new_formula <- do.call(
          "substitute", 
          list(
            my_formula, 
            stats::setNames(
              list(
                str2lang(new_terms1)
              ), 
              var_to_replace
            )
          )
        )
        new_formula <- stats::as.formula(new_formula)
        
        # Create a safe add_model function
        safe_add_model <- purrr::safely(
          workflows::add_model,
          otherwise = NULL,
          quiet = TRUE
        )
        
        # Return the workflow object with recipe and model
        ret <- workflows::workflow() |>
          workflows::add_variables(
            outcomes = outcome_var,
            predictors = predictor_vars
            ) |>
          safe_add_model(mod, formula = new_formula)
        
        # Pluck the result
        res <- ret |> purrr::pluck("result")
        
        if (!is.null(ret$error)) message(stringr::str_glue("{ret$error}"))
        
        # Return the result
        return(res)
      }
    )
  
  # Return
  return(wflw_list)
}

A new function to make the workflow based upon the model attributes:

full_internal_make_wflw <- function(.model_tbl, .rec_obj){
  
  # Tidyeval ----
  model_tbl <- .model_tbl
  rec_obj <- .rec_obj
  model_tbl_class <- class(model_tbl)
  
  # Checks ----
  if (!inherits(model_tbl, "tidyaml_mod_spec_tbl")){
    rlang::abort(
      message = "'.model_tbl' must inherit a class of 'tidyaml_mod_spec_tbl",
      use_cli_format = TRUE
    )
  }
  
  # Manipulation
  model_factor_tbl <- model_tbl |>
    dplyr::mutate(.model_id = forcats::as_factor(.model_id)) 
    #dplyr::mutate(rec_obj = list(rec_obj))
  
  # Make a group split object list
  models_list <- model_factor_tbl |>
    dplyr::group_split(.model_id)
  
  # Make the Workflow Object using purrr imap
  wflw_list <- models_list |>
    purrr::imap(
      .f = function(obj, id){
        
        # Pull the model column and then pluck the model
        mod <- obj |> dplyr::pull(5) |> purrr::pluck(1)
        
        # PUll the recipe column and then pluck the recipe
        #rec_obj <- obj |> dplyr::pull(6) |> purrr::pluck(1)
        
        # Switch Statement
        # First get attributes of the model
        mod_attr <- attributes(mod)$.tidyaml_mod_class
        class(obj) <- c("tidyaml_mod_spec_tbl", class(obj))
        
        # Switch on the class of the model
        if (mod_attr == "gee_linear_reg"){
          ret <- internal_make_wflw_gee_lin_reg(obj, rec_obj)
        } 
        
        if (!mod_attr == "gee_linear_reg"){
         ret <- internal_make_wflw(obj, rec_obj)
        }
        
        # Return Result
        return(ret)
      }
    )
  
  # Return
  return(wflw_list[[1]])
}

Examples:

library(tidyAML)
library(dplyr)
library(recipes)
library(tidyverse)

> mod_tbl <- make_regression_base_tbl()
> mod_tbl <- mod_tbl |>
+   filter(
+     .parsnip_engine %in% c("lm", "glm", "gee") &
+       .parsnip_fns == "linear_reg"
+   )
> 
> class(mod_tbl) <- c("tidyaml_mod_spec_tbl", class(mod_tbl))
> 
> mod_spec_tbl <- internal_make_spec_tblv2(mod_tbl)
> internal_make_wflw(mod_spec_tbl, rec_obj)
[[1]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 

## We see that this model is wrong as there is no preprocessor
[[2]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: gee 


[[3]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: glm 


> 
> gee_mod_spec_tbl <- mod_spec_tbl |>
+   dplyr::slice(2)
> internal_make_wflw(gee_mod_spec_tbl, rec_obj)
## We see that the current function does not create the appropriate preprocessor
[[1]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: gee 


> internal_make_wflw_gee_lin_reg(gee_mod_spec_tbl, rec_obj)
## We see that there is now an appropriate preprocessor
[[1]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Variables
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
Outcomes: outcome_var
Predictors: predictor_vars

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: gee 

## The new function properly creates the workflow on the single object
> full_internal_make_wflw(gee_mod_spec_tbl, rec_obj)
[[1]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Variables
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
Outcomes: outcome_var
Predictors: predictor_vars

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: gee 

## The new function creates the workflows correctly on all objects
> full_internal_make_wflw(mod_spec_tbl, rec_obj)
[[1]]
══ Workflow ═══════════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ───────────────────────────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ──────────────────────────────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 


> 
> mod_wflw_tbl <- mod_spec_tbl |>
+   mutate(wflw = full_internal_make_wflw(mod_spec_tbl, rec_obj))
> mod_wflw_tbl
# A tibble: 3 × 6
  .model_id .parsnip_engine .parsnip_mode .parsnip_fns model_spec wflw      
      <int> <chr>           <chr>         <chr>        <list>     <list>    
1         1 lm              regression    linear_reg   <spec[+]>  <workflow>
2         2 gee             regression    linear_reg   <spec[+]>  <workflow>
3         3 glm             regression    linear_reg   <spec[+]>  <workflow>
> 
> internal_make_fitted_wflw(mod_wflw_tbl, splits_obj) |>
+   purrr::map(broom::glance)
[[1]]
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC deviance df.residual  nobs
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
1     0.902         0.826  2.54      11.9 0.0000531    10  -49.1  122.  136.     84.1          13    24

[[2]]
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC deviance df.residual  nobs
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
1     0.902         0.826  2.54      11.9 0.0000531    10  -49.1  122.  136.     84.1          13    24

[[3]]
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC deviance df.residual  nobs
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
1     0.902         0.826  2.54      11.9 0.0000531    10  -49.1  122.  136.     84.1          13    24

0 replies

spsanderson · 2023-11-29T03:49:29Z

spsanderson
Nov 29, 2023
Maintainer Author

this is now fixed!

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

gee model #154

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 3 comments 2 replies

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

Select a reply

gee model #154

spsanderson Nov 18, 2023 Maintainer

Replies: 3 comments · 2 replies

spsanderson Nov 21, 2023 Maintainer Author

spsanderson Nov 22, 2023 Maintainer Author

spsanderson Nov 22, 2023 Maintainer Author

spsanderson Nov 28, 2023 Maintainer Author

spsanderson Nov 29, 2023 Maintainer Author

spsanderson
Nov 18, 2023
Maintainer

Replies: 3 comments 2 replies

spsanderson
Nov 21, 2023
Maintainer Author

spsanderson Nov 22, 2023
Maintainer Author

spsanderson Nov 22, 2023
Maintainer Author

spsanderson
Nov 28, 2023
Maintainer Author

spsanderson
Nov 29, 2023
Maintainer Author