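Rename `min_data_points` to `min_observations` in `check_enough_data()`, check for `pre` in `names(x)` before printing a canned forecaster, and namespace-qualify `dplyr::between` in tests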

dsweber2 · dsweber2 · commit 84f991d6fc1e · 2025-03-28T18:06:11.000-05:00
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
@@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE)
+    check_enough_data(all_predictors(), min_observations = 1, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_data(
       all_predictors(),
       all_outcomes(),
-      min_data_points = args_list$check_enough_data_n,
+      min_observations = args_list$check_enough_data_n,
       epi_keys = args_list$check_enough_data_epi_keys,
       drop_na = FALSE
     )
diff --git a/R/canned-epipred.R b/R/canned-epipred.R
@@ -112,7 +112,7 @@ print.canned_epipred <- function(x, name, ...) {
     "At forecast date{?s}: {.val {fds}},",
     "For target date{?s}: {.val {tds}},"
   ))
-  if ("actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) {
+  if ("pre" %in% names(x) && "actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) {
     fit_recipe <- extract_recipe(x$epi_workflow)
     if (detect_step(fit_recipe, "adjust_latency")) {
       is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency"))
diff --git a/R/check_enough_data.R b/R/check_enough_data.R
@@ -8,7 +8,7 @@
 #' @param ... One or more selector functions to choose variables for this check.
 #'  See [selections()] for more details. You will usually want to use
 #'  [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
-#' @param min_data_points The minimum number of data points required for
+#' @param min_observations The minimum number of observations required for
 #'   training. If this is NULL, the total number of predictors will be used.
 #' @param epi_keys A character vector of column names on which to group the data
 #'   and check threshold within each group. Useful if your forecaster trains
@@ -44,7 +44,7 @@
 check_enough_data <-
   function(recipe,
            ...,
-           min_data_points = NULL,
+           min_observations = NULL,
            epi_keys = NULL,
            drop_na = TRUE,
            role = NA,
@@ -54,7 +54,7 @@ check_enough_data <-
     recipes::add_check(
       recipe,
       check_enough_data_new(
-        min_data_points = min_data_points,
+        min_observations = min_observations,
         epi_keys = epi_keys,
         drop_na = drop_na,
         terms = enquos(...),
@@ -68,12 +68,12 @@ check_enough_data <-
   }
 
 check_enough_data_new <-
-  function(min_data_points, epi_keys, drop_na, terms,
+  function(min_observations, epi_keys, drop_na, terms,
            role, trained, columns, skip, id) {
     recipes::check(
       subclass = "enough_data",
       prefix = "check_",
-      min_data_points = min_data_points,
+      min_observations = min_observations,
       epi_keys = epi_keys,
       drop_na = drop_na,
       terms = terms,
@@ -88,14 +88,14 @@ check_enough_data_new <-
 #' @export
 prep.check_enough_data <- function(x, training, info = NULL, ...) {
   col_names <- recipes::recipes_eval_select(x$terms, training, info)
-  if (is.null(x$min_data_points)) {
-    x$min_data_points <- length(col_names)
+  if (is.null(x$min_observations)) {
+    x$min_observations <- length(col_names)
   }
 
   check_enough_data_core(training, x, col_names, "train")
 
   check_enough_data_new(
-    min_data_points = x$min_data_points,
+    min_observations = x$min_observations,
     epi_keys = x$epi_keys,
     drop_na = x$drop_na,
     terms = x$terms,
@@ -116,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
 
 #' @export
 print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
-  title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
+  title <- paste0("Check enough data (n = ", x$min_observations, ") for ")
   recipes::print_step(x$columns, x$terms, x$trained, title, width)
   invisible(x)
 }
@@ -129,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) {
     res <- tibble(terms = recipes::sel2char(x$terms))
   }
   res$id <- x$id
-  res$min_data_points <- x$min_data_points
+  res$min_observations <- x$min_observations
   res$epi_keys <- x$epi_keys
   res$drop_na <- x$drop_na
   res
@@ -142,7 +142,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
     any_missing_data <- epi_df %>%
       mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
       # count the number of rows where they're all not na
-      summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop")
+      summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop")
     any_missing_data <- any_missing_data %>%
       summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
       any()
@@ -153,7 +153,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
       summarise(
         across(
           all_of(.env$col_names),
-          ~ sum(!is.na(.x)) < .env$step_obj$min_data_points
+          ~ sum(!is.na(.x)) < .env$step_obj$min_observations
         ),
         .groups = "drop"
       ) %>%
@@ -173,7 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
   } else {
     # if we're not dropping na values, just count
     cols_not_enough_data <- epi_df %>%
-      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points))
+      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations))
     any_missing_data <- cols_not_enough_data %>%
       summarize(across(all_of(.env$col_names), all)) %>%
       all()
diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
@@ -1,7 +1,7 @@
 # check_enough_data works on pooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n + 1,
       drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -10,7 +10,7 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 1,
       drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -19,7 +19,7 @@
 # check_enough_data works on unpooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = n + 1,
       epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -28,7 +28,7 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 3,
       epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -47,7 +47,7 @@
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df)
+        all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R
@@ -32,7 +32,7 @@ test_that("warns if there's not enough data to predict", {
   ) %>%
     mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>%
     # Oct to May (flu season, ish) only:
-    filter(!between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>%
+    filter(!dplyr::between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>%
     # and actually, pretend we're around mid-October 2022:
     filter(time_value <= as.Date("2022-10-12")) %>%
     as_epi_df(as_of = as.Date("2022-10-12"))
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
@@ -18,22 +18,22 @@ test_that("check_enough_data works on pooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = 2 * n, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   # Check both column don't have enough data
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -42,30 +42,30 @@ test_that("check_enough_data works on unpooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = n, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   # Check one column don't have enough data
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
 
 test_that("check_enough_data outputs the correct recipe values", {
   expect_no_error(
     p <- epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 2) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 2) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -90,15 +90,15 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, y, min_observations = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
   # Making sure `skip = TRUE` is working correctly in `predict`
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, min_observations = n - 2, epi_keys = "geo_value") %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df) %>%
       predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
@@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     forecaster <- epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      check_enough_data(x, min_observations = 1, epi_keys = "geo_value", skip = FALSE) %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df)
   )
@@ -125,15 +125,15 @@ test_that("check_enough_data works with all_predictors() downstream of construct
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>%
+      check_enough_data(all_predictors(), y, min_observations = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>%
+      check_enough_data(all_predictors(), y, min_observations = 2 * n - 4) %>%
       prep(toy_epi_df)
   )
 })