feat: review updates

dshemetov · dshemetov · commit d754409dbc36 · 2024-04-30T15:50:59.000-07:00
* check postprocessor for forecast_date in forecast.epi_workflow
* add test
diff --git a/NAMESPACE b/NAMESPACE
@@ -45,6 +45,7 @@ S3method(extrapolate_quantiles,distribution)
 S3method(fit,epi_workflow)
 S3method(flusight_hub_formatter,canned_epipred)
 S3method(flusight_hub_formatter,data.frame)
+S3method(forecast,epi_workflow)
 S3method(format,dist_quantiles)
 S3method(is.na,dist_quantiles)
 S3method(is.na,distribution)
@@ -220,6 +221,7 @@ importFrom(dplyr,ungroup)
 importFrom(epiprocess,growth_rate)
 importFrom(generics,augment)
 importFrom(generics,fit)
+importFrom(generics,forecast)
 importFrom(ggplot2,autoplot)
 importFrom(hardhat,refresh_blueprint)
 importFrom(hardhat,run_mold)
diff --git a/R/arx_classifier.R b/R/arx_classifier.R
@@ -51,9 +51,7 @@ arx_classifier <- function(
     cli::cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.")
   }
 
-  wf <- arx_class_epi_workflow(
-    epi_data, outcome, predictors, trainer, args_list
-  )
+  wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list)
   wf <- generics::fit(wf, epi_data)
 
   preds <- forecast(
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
@@ -38,24 +38,19 @@
 #'   trainer = quantile_reg(),
 #'   args_list = arx_args_list(quantile_levels = 1:9 / 10)
 #' )
-arx_forecaster <- function(epi_data,
-                           outcome,
-                           predictors = outcome,
-                           trainer = parsnip::linear_reg(),
-                           args_list = arx_args_list()) {
+arx_forecaster <- function(
+    epi_data,
+    outcome,
+    predictors = outcome,
+    trainer = parsnip::linear_reg(),
+    args_list = arx_args_list()) {
   if (!is_regression(trainer)) {
     cli::cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'regression'.")
   }
 
-  wf <- arx_fcast_epi_workflow(
-    epi_data, outcome, predictors, trainer, args_list
-  )
+  wf <- arx_fcast_epi_workflow(epi_data, outcome, predictors, trainer, args_list)
   wf <- generics::fit(wf, epi_data)
 
-  latest <- get_test_data(
-    hardhat::extract_preprocessor(wf), epi_data,
-  )
-
   preds <- forecast(
     wf,
     fill_locf = TRUE,
diff --git a/R/epi_workflow.R b/R/epi_workflow.R
@@ -334,7 +334,8 @@ print.epi_workflow <- function(x, ...) {
 
 #' Produce a forecast from an epi workflow
 #'
-#' @param epi_workflow An epi workflow
+#' @param object An epi workflow.
+#' @param ... Not used.
 #' @param fill_locf Logical. Should we use locf to fill in missing data?
 #' @param n_recent Integer or NULL. If filling missing data with locf = TRUE,
 #' how far back are we willing to tolerate missing data? Larger values allow
@@ -349,21 +350,34 @@ print.epi_workflow <- function(x, ...) {
 #' @return A forecast tibble.
 #'
 #' @export
-forecast <- function(epi_workflow, fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) {
-  if (!epi_workflow$trained) {
+forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) {
+  rlang::check_dots_empty()
+
+  if (!object$trained) {
     cli_abort(c(
       "You cannot `forecast()` a {.cls workflow} that has not been trained.",
       i = "Please use `fit()` before forecasting."
     ))
   }
 
+  frosting_fd <- NULL
+  if (has_postprocessor(object) && detect_layer(object, "layer_add_forecast_date")) {
+    frosting_fd <- extract_argument(object, "layer_add_forecast_date", "forecast_date")
+    if (!is.null(frosting_fd) && class(frosting_fd) != class(object$original_data$time_value)) {
+      cli_abort(c(
+        "Error with layer_add_forecast_date():",
+        i = "The type of `forecast_date` must match the type of the `time_value` column in the data."
+      ))
+    }
+  }
+
   test_data <- get_test_data(
-    hardhat::extract_preprocessor(epi_workflow),
-    epi_workflow$original_data,
+    hardhat::extract_preprocessor(object),
+    object$original_data,
     fill_locf = fill_locf,
     n_recent = n_recent %||% Inf,
-    forecast_date = forecast_date %||% max(epi_workflow$original_data$time_value)
+    forecast_date = forecast_date %||% frosting_fd %||% max(object$original_data$time_value)
   )
 
-  predict(epi_workflow, new_data = test_data)
+  predict(object, new_data = test_data)
 }
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
@@ -32,7 +32,7 @@
 #' # Use ahead + forecast date
 #' f <- frosting() %>%
 #'   layer_predict() %>%
-#'   layer_add_forecast_date(forecast_date = "2022-05-31") %>%
+#'   layer_add_forecast_date(forecast_date = as.Date("2022-05-31")) %>%
 #'   layer_add_target_date() %>%
 #'   layer_naomit(.pred)
 #' wf1 <- wf %>% add_frosting(f)
diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R
@@ -71,7 +71,7 @@
 #' eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline")
 #'
 #' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset)
-#' preds <- suppressWarnings(forecast(wf)) %>%
+#' preds <- forecast(wf) %>%
 #'   dplyr::select(-time_value) %>%
 #'   dplyr::mutate(forecast_date = forecast_date)
 #' preds
diff --git a/R/reexports-tidymodels.R b/R/reexports-tidymodels.R
@@ -2,6 +2,10 @@
 #' @export
 generics::fit
 
+#' @importFrom generics forecast
+#' @export
+generics::forecast
+
 #' @importFrom recipes prep
 #' @export
 recipes::prep
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -86,7 +86,7 @@ reference:
       - predict.epi_workflow
       - fit.epi_workflow
       - augment.epi_workflow
-      - forecast
+      - forecast.epi_workflow
   - title: Epi recipe preprocessing steps
     contents:
       - starts_with("step_")
diff --git a/man/forecast.epi_workflow.Rd b/man/forecast.epi_workflow.Rd
diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd
diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd
diff --git a/man/reexports.Rd b/man/reexports.Rd
diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R
@@ -71,15 +71,26 @@ test_that("forecast method works", {
     step_epi_ahead(death_rate, ahead = 7) %>%
     step_epi_naomit()
   wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu)
-
-  latest <- get_test_data(
-    hardhat::extract_preprocessor(wf),
-    jhu
+  expect_equal(
+    forecast(wf),
+    predict(wf, new_data = get_test_data(
+      hardhat::extract_preprocessor(wf),
+      jhu
+    ))
   )
 
+  args <- list(
+    fill_locf = TRUE,
+    n_recent = 360 * 3,
+    forecast_date = as.Date("2024-01-01")
+  )
   expect_equal(
-    forecast(wf),
-    predict(wf, new_data = latest)
+    forecast(wf, !!!args),
+    predict(wf, new_data = get_test_data(
+      hardhat::extract_preprocessor(wf),
+      jhu,
+      !!!args
+    ))
   )
 })
 
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
@@ -119,7 +119,7 @@ test_that("Postprocessing workflow works and values correct", {
     fit(jhu) %>%
     add_frosting(f)
 
-  suppressWarnings(p <- forecast(wf))
+  p <- forecast(wf)
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
   expect_equal(p$.pred_scaled, p$.pred * c(20000, 30000))
@@ -136,7 +136,7 @@ test_that("Postprocessing workflow works and values correct", {
   wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
-  suppressWarnings(p <- forecast(wf))
+  p <- forecast(wf)
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
   expect_equal(p$.pred_scaled, p$.pred * c(2, 3))
@@ -178,7 +178,7 @@ test_that("Postprocessing to get cases from case rate", {
     fit(jhu) %>%
     add_frosting(f)
 
-  suppressWarnings(p <- forecast(wf))
+  p <- forecast(wf)
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
   expect_equal(p$.pred_scaled, p$.pred * c(1 / 20000, 1 / 30000))
diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd
@@ -110,8 +110,6 @@ out <- arx_forecaster(
 )
 ```
 
-This call produces a warning, which we'll ignore for now. But essentially, it's telling us that our data comes from May 2022 but we're trying to do a forecast for January 2022. The result is likely not an accurate measure of real-time forecast performance, because the data have been revised over time.
-
 The `out` object has two components:
 
   1. The predictions, which are just another `epi_df`. It contains the predictions for each location along with additional columns. By default, these are a 90% predictive interval, the `forecast_date` (the date on which the forecast was putatively made) and the `target_date` (the date for which the forecast is being made).
@@ -123,9 +121,6 @@ out$predictions
 out$epi_workflow
   ```
 
-Note that the `time_value` in the predictions is not necessarily meaningful,
-but it is a required column in an `epi_df`, so it remains here.
-
 By default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, using 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), 1 week back and 2 weeks back. The predictors and outcome can be changed directly. The rest of the defaults are encapsulated into a list of arguments. This list is produced by `arx_args_list()`.
 
 ## Simple adjustments

Original file line number	Diff line number	Diff line change
`@@ -51,9 +51,7 @@ arx_classifier <- function(`
`51`	`51`	cli::cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.")
`52`	`52`	`}`
`53`	`53`
`54`		`- wf <- arx_class_epi_workflow(`
`55`		`- epi_data, outcome, predictors, trainer, args_list`
`56`		`- )`
	`54`	`+ wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list)`
`57`	`55`	`wf <- generics::fit(wf, epi_data)`
`58`	`56`
`59`	`57`	`preds <- forecast(`