CDCgov
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 9 additions & 4 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/check_authorized_users.R‎
Lines changed: 6 additions & 6 deletions b/‎R/check_authorized_users.R‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎R/check_changes_for_autoapproval.R‎
Lines changed: 78 additions & 0 deletions b/‎R/check_changes_for_autoapproval.R‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎R/generate_hub_ensemble.R‎
Lines changed: 12 additions & 6 deletions b/‎R/generate_hub_ensemble.R‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎R/generate_oracle_output.R‎
Lines changed: 112 additions & 0 deletions b/‎R/generate_oracle_output.R‎
Lines changed: 112 additions & 0 deletions
diff --git a/‎R/update_hub_target_data.R‎
Lines changed: 30 additions & 6 deletions b/‎R/update_hub_target_data.R‎
Lines changed: 30 additions & 6 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ repos:
 #####
 # Basic file cleanliness
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
     -   id: check-added-large-files
     -   id: check-yaml
@@ -13,7 +13,7 @@ repos:
 #####
 # Python
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.10
+  rev: v0.14.5
   hooks:
     # Sort imports
     - id: ruff
@@ -27,13 +27,13 @@ repos:
 #####
 # R
 -   repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.4.3.9009
+    rev: v0.4.3.9017
     hooks:
     -   id: lintr
 #####
 # Java
 - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
-  rev: v2.14.0
+  rev: v2.15.0
   hooks:
   - id: pretty-format-java
     args: [--aosp,--autofix]
@@ -53,3 +53,8 @@ repos:
     -   id: detect-secrets
         args: ['--baseline', '.secrets.baseline']
         exclude: package.lock.json
+-   repo: https://github.com/crate-ci/typos
+    rev: v1
+    hooks:
+    -   id: typos
+        args: ["--force-exclude"]
@@ -27,7 +27,8 @@ Imports:
     purrr,
     stringr,
     tidyr,
-    tidyselect
+    tidyselect,
+    hubUtils
 Remotes:
     forecasttools=github::cdcgov/forecasttools,
     hubUtils=github::hubverse-org/hubUtils,
 
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(check_authorized_users)
+export(check_changes_for_autoapproval)
 export(excluded_locations)
 export(generate_hub_baseline)
 export(generate_hub_ensemble)
 
@@ -2,12 +2,12 @@
 #' directories.
 #'
 #' This function verifies whether a GitHub
-#' user is authorized to modify specific directories
+#' user is authorized to modify specific model IDs
 #' in a Hub by checking the designated users in model
 #' metadata.
 #'
-#' @param changed_dirs Character vector. Names of directories
-#' whose contents have been modified.
+#' @param changed_model_ids Character vector. Model IDs that
+#' have been modified.
 #' @param gh_actor Character. GitHub username of the person
 #' making changes.
 #' @param base_hub_path Character. Path to the base hub
@@ -18,19 +18,19 @@
 #'
 #' @export
 check_authorized_users <- function(
-  changed_dirs,
+  changed_model_ids,
   gh_actor,
   base_hub_path
 ) {
-  checkmate::assert_character(changed_dirs, min.len = 1)
+  checkmate::assert_character(changed_model_ids, min.len = 1)
   checkmate::assert_string(gh_actor)
   checkmate::assert_string(base_hub_path)
 
   model_metadata <- hubData::load_model_metadata(base_hub_path) |>
     dplyr::mutate(is_model_dir = TRUE) |>
     dplyr::rename(dir = "model_id")
 
-  changed_dirs_tbl <- tibble::tibble(dir = changed_dirs)
+  changed_dirs_tbl <- tibble::tibble(dir = changed_model_ids)
 
   authorization_check <- changed_dirs_tbl |>
     dplyr::left_join(model_metadata, by = "dir", na_matches = "never") |>
 
@@ -0,0 +1,78 @@
+#' Check changed files for auto-approval eligibility.
+#'
+#' This function processes a list of changed files from a
+#' GitHub workflow, errors on any changes outside the
+#' model-output directory, and passes model IDs to
+#' [check_authorized_users()] for authorization validation.
+#'
+#' The model ID associated with a changed file is the first
+#' directory below `model-output` on the file's path. A file
+#' nested more deeply (e.g.
+#' `model-output/<model_id>/<subdir>/<file>`) is therefore
+#' checked against `<model_id>`, not against `<subdir>`.
+#'
+#' @param changed_files Character vector. List of changed
+#' file paths from the GitHub changed-files workflow output.
+#' Must not contain missing values.
+#' @param gh_actor Character. GitHub username of the
+#' person making changes.
+#' @param base_hub_path Character. Path to the base hub
+#' directory.
+#'
+#' @return `NULL`, invisibly, raising an error if no files
+#' were changed, if changes are outside model-output, or if
+#' the user is unauthorized.
+#'
+#' @export
+check_changes_for_autoapproval <- function(
+  changed_files,
+  gh_actor,
+  base_hub_path
+) {
+  checkmate::assert_string(gh_actor)
+  checkmate::assert_string(base_hub_path)
+  # A missing path cannot be classified as inside or outside
+  # `model-output`, so reject it up front.
+  checkmate::assert_character(changed_files, any.missing = FALSE)
+  if (length(changed_files) < 1) {
+    cli::cli_abort(
+      "Empty PRs cannot be autoapproved. At least one file must be changed in the pull request."
+    )
+  }
+
+  # Name of the first directory below `model-output` for each
+  # hub-relative path. Using the immediate parent directory would
+  # let `model-output/<A>/<B>/file` be authorized against `<B>`
+  # even though the change lives in `<A>`'s directory. A file
+  # sitting directly in `model-output` has no such directory; the
+  # only component of its parent path is returned instead.
+  top_level_model_dir <- function(rel_paths) {
+    vapply(
+      fs::path_split(fs::path_dir(rel_paths)),
+      \(parts) {
+        if (length(parts) == 0L) {
+          NA_character_
+        } else {
+          parts[[min(2L, length(parts))]]
+        }
+      },
+      character(1)
+    )
+  }
+
+  changed_files_tbl <- tibble::tibble(
+    full_path = changed_files
+  ) |>
+    dplyr::mutate(
+      path_rel_root = fs::path_rel(.data$full_path, start = !!base_hub_path),
+      in_model_output = fs::path_has_parent(
+        .data$path_rel_root,
+        "model-output"
+      ),
+      model_id = ifelse(
+        .data$in_model_output,
+        top_level_model_dir(.data$path_rel_root),
+        NA_character_
+      )
+    )
+  files_outside_model_output <- changed_files_tbl |>
+    dplyr::filter(!.data$in_model_output) |>
+    dplyr::pull(.data$full_path)
+
+  if (length(files_outside_model_output) > 0) {
+    cli::cli_abort(
+      c(
+        "Auto-approval failed: Changes detected outside 'model-output' directory.",
+        "The following files are outside 'model-output':",
+        files_outside_model_output
+      )
+    )
+  }
+  changed_model_ids <- changed_files_tbl |>
+    dplyr::filter(.data$in_model_output) |>
+    dplyr::pull(.data$model_id) |>
+    unique()
+
+  if (length(changed_model_ids) > 0) {
+    cli::cli_inform(
+      "Checking authorization for {length(changed_model_ids)} model director{?y/ies}: {.val {changed_model_ids}}"
+    )
+
+    check_authorized_users(
+      changed_model_ids = changed_model_ids,
+      gh_actor = gh_actor,
+      base_hub_path = base_hub_path
+    )
+  }
+
+  invisible(NULL)
+}
@@ -99,7 +99,8 @@ generate_hub_ensemble <- function(
     )
   }
 
-  ensemble_model_name <- glue::glue("{get_hub_name(disease)}-ensemble")
+  hub_name <- get_hub_name(disease)
+  ensemble_model_name <- glue::glue("{hub_name}-ensemble")
 
   output_dirpath <- fs::path(base_hub_path, "model-output", ensemble_model_name)
   output_filename <- glue::glue("{reference_date}-{hub_name}-ensemble")
@@ -110,8 +111,7 @@ generate_hub_ensemble <- function(
 
   weekly_forecasts <- hubData::connect_hub(base_hub_path) |>
     dplyr::filter(
-      .data$reference_date == !!reference_date,
-      !stringr::str_detect(.data$model_id, hub_name)
+      .data$reference_date == !!reference_date
     ) |>
     hubData::collect_hub()
 
@@ -129,12 +129,18 @@ generate_hub_ensemble <- function(
     ) |>
     dplyr::arrange(.data$target)
 
+  weekly_model_submissions_path <- fs::path(
+    base_hub_path,
+    "auxiliary-data",
+    "weekly-model-submissions"
+  )
+
+  fs::dir_create(weekly_model_submissions_path, recurse = TRUE)
+
   forecasttools::write_tabular(
     weekly_models,
     fs::path(
-      base_hub_path,
-      "auxiliary-data",
-      "weekly-model-submissions",
+      weekly_model_submissions_path,
       glue::glue("{reference_date}-models-submitted-to-hub"),
       ext = "csv"
     )
 
@@ -0,0 +1,112 @@
+#' Flatten one modeling task into a data frame
+#'
+#' Expands a single modeling task, given as a nested list, into
+#' the full cross product of its task ID values and its accepted
+#' output types.
+#'
+#' @param task Nested list representing a modeling task,
+#' as one entry of the output of [hubUtils::get_round_model_tasks()].
+#' Must contain `output_type` and `task_ids` entries, and
+#' `task_ids` must have a `target_end_date` specification.
+#' @return A [`tibble`][tibble::tibble()] of all potentially
+#' valid submittable outputs for the modeling task defined in `task`.
+#' Each row pairs one forecastable quantity
+#' (e.g. "`target` X on `target_end_date` Y in `location` Z")
+#' with one valid `output_type` for that quantity. A quantity
+#' that accepts several `output_type`s appears once per
+#' accepted `output_type`.
+flatten_task <- function(task) {
+  checkmate::assert_names(
+    names(task),
+    must.include = c("output_type", "task_ids")
+  )
+  checkmate::assert_names(
+    names(task$task_ids),
+    must.include = "target_end_date"
+  )
+
+  ## `horizon` and `reference_date` are dropped because they are
+  ## redundant with `target_end_date`
+  id_values <- task$task_ids |>
+    purrr::map(\(spec) c(spec$required, spec$optional)) |>
+    purrr::discard_at(c("horizon", "reference_date"))
+
+  crossing_args <- c(
+    id_values,
+    list(output_type = names(task$output_type))
+  )
+
+  do.call(tidyr::crossing, crossing_args)
+}
+
+
+#' Flatten a list of modeling tasks into a single data frame.
+#'
+#' Applies [flatten_task()] to every entry of `task_list` and
+#' row-binds the results.
+#'
+#' @param task_list List of tasks. Each entry should itself be
+#' a nested list that can be passed to [flatten_task()].
+#' @param .deduplicate Logical. Drop repeated rows when the same
+#' flat configuration is produced more than once while flattening
+#' the task list? Default `TRUE`.
+#'
+#' @return A [`tibble`][tibble::tibble()] of all potentially
+#' valid submittable outputs for all the modeling tasks defined in
+#' `task_list`. Each row pairs one forecastable quantity
+#' (e.g. "`target` X on `target_end_date` Y in `location` Z")
+#' with one valid `output_type` for that quantity. A quantity
+#' that accepts several `output_type`s appears once per
+#' accepted `output_type`.
+#'
+flatten_task_list <- function(task_list, .deduplicate = TRUE) {
+  flattened <- task_list |>
+    purrr::map(flatten_task) |>
+    dplyr::bind_rows()
+
+  if (!.deduplicate) {
+    return(flattened)
+  }
+
+  dplyr::distinct(flattened)
+}
+
+#' Generate and save oracle output for the Hub
+#'
+#' Collects the unique flattened modeling tasks across all round
+#' IDs in the hub's tasks config, inner-joins them to the hub's
+#' target time series (as returned by
+#' [forecasttools::hub_target_data_as_of()] with `"latest"`) on
+#' the columns the two tables share, and writes the result to
+#' `target-data/oracle-output.parquet` under `hub_path`. In the
+#' output, `observation` is renamed to `oracle_value` and an
+#' all-`NA` `output_type_id` column is added.
+#'
+#' @param hub_path Path to the hub root.
+#'
+#' @return nothing, invisibly, on success.
+#' @export
+generate_oracle_output <- function(hub_path) {
+  output_dirpath <- fs::path(hub_path, "target-data")
+  fs::dir_create(output_dirpath)
+  target_ts <- hubData::connect_target_timeseries(hub_path)
+
+  config_tasks <- hubUtils::read_config(hub_path, "tasks")
+  round_ids <- hubUtils::get_round_ids(config_tasks)
+
+  ## this involves duplication given how hubUtils::get_round_model_tasks
+  ## behaves by default with round ids created from reference dates,
+  ## but to support hubs with round_ids created in other ways, we
+  ## do it this way and then deduplicate as needed.
+  list_of_task_lists <- purrr::map(round_ids, \(id) {
+    hubUtils::get_round_model_tasks(config_tasks, id)
+  })
+
+  unique_tasks <- list_of_task_lists |>
+    purrr::map(flatten_task_list) |>
+    dplyr::bind_rows() |>
+    dplyr::distinct() |>
+    dplyr::mutate(target_end_date = as.Date(.data$target_end_date))
+
+  target_data <- target_ts |>
+    forecasttools::hub_target_data_as_of("latest", .drop = TRUE) |>
+    dplyr::collect() |>
+    dplyr::rename(target_end_date = "date")
+
+  ## join on every column the task table and the target data share
+  join_key <- intersect(
+    colnames(unique_tasks),
+    colnames(target_data)
+  )
+
+  ## NOTE(review): plain `NA` makes `output_type_id` a logical
+  ## column; confirm downstream readers of the parquet file accept
+  ## that type.
+  oracle_data <- dplyr::inner_join(unique_tasks, target_data, by = join_key) |>
+    dplyr::mutate(output_type_id = NA) |>
+    dplyr::rename(
+      oracle_value = "observation"
+    )
+
+  output_file <- fs::path(output_dirpath, "oracle-output", ext = "parquet")
+  ## same writer as the rest of the package (`write_tabular()`)
+  forecasttools::write_tabular(oracle_data, output_file)
+  invisible(NULL)
+}
@@ -24,6 +24,9 @@ nssp_col_names <- list(
 #' is "2024-11-09".
 #' @param legacy_file Logical. Whether to write legacy
 #' CSV output (default: FALSE).
+#' @param nssp_update_local Logical. Whether to read NSSP data from
+#' the local file `auxiliary-data/nssp-raw-data/latest.csv` under
+#' `base_hub_path` (default: FALSE).
 #'
 #' @return Writes `time-series.parquet` and optionally
 #' legacy CSV target data files to the target-data
@@ -35,7 +38,8 @@ update_hub_target_data <- function(
   as_of = lubridate::today(),
   nhsn_first_weekending_date = lubridate::as_date("2024-11-09"),
   included_locations = hubhelpr::included_locations,
-  legacy_file = FALSE
+  legacy_file = FALSE,
+  nssp_update_local = FALSE
 ) {
   if (!disease %in% c("covid", "rsv")) {
     stop("'disease' must be either 'covid' or 'rsv'")
@@ -90,11 +94,31 @@ update_hub_target_data <- function(
       )
   }
 
-  hubverse_format_nssp_data <- forecasttools::pull_data_cdc_gov_dataset(
-    dataset = "nssp_prop_ed_visits",
-    columns = c(nssp_col_name, "geography"),
-    locations = "All"
-  ) |>
+  if (nssp_update_local) {
+    raw_nssp_data <- forecasttools::read_tabular(
+      fs::path(
+        base_hub_path,
+        "auxiliary-data",
+        "nssp-raw-data",
+        "latest",
+        ext = "csv"
+      )
+    ) |>
+      dplyr::filter(county == "All") |>
+      dplyr::select(
+        week_end,
+        geography,
+        dplyr::all_of(nssp_col_name)
+      )
+  } else {
+    raw_nssp_data <- forecasttools::pull_data_cdc_gov_dataset(
+      dataset = "nssp_prop_ed_visits",
+      columns = c(nssp_col_name, "geography"),
+      locations = "All"
+    )
+  }
+
+  hubverse_format_nssp_data <- raw_nssp_data |>
     dplyr::mutate(
       date = lubridate::as_date(.data$week_end),
       observation = as.numeric(.data[[nssp_col_name]]) / 100,
 
@@ -6,7 +6,7 @@
 
 ## Overview
 
-In-progress R package providing functions for CFA Hubs maintainence.
+In-progress R package providing functions for CFA Hubs maintenance.
 
 ## Getting started