Skip to content

Commit 165a3b2

Browse files
committed
Merge remote-tracking branch 'origin/main' into 36-add-get_forecast_datar
2 parents 2297204 + f7a5604 commit 165a3b2

16 files changed

+386
-29
lines changed

.pre-commit-config.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ repos:
22
#####
33
# Basic file cleanliness
44
- repo: https://github.com/pre-commit/pre-commit-hooks
5-
rev: v5.0.0
5+
rev: v6.0.0
66
hooks:
77
- id: check-added-large-files
88
- id: check-yaml
@@ -13,7 +13,7 @@ repos:
1313
#####
1414
# Python
1515
- repo: https://github.com/astral-sh/ruff-pre-commit
16-
rev: v0.11.10
16+
rev: v0.14.5
1717
hooks:
1818
# Sort imports
1919
- id: ruff
@@ -27,13 +27,13 @@ repos:
2727
#####
2828
# R
2929
- repo: https://github.com/lorenzwalthert/precommit
30-
rev: v0.4.3.9009
30+
rev: v0.4.3.9017
3131
hooks:
3232
- id: lintr
3333
#####
3434
# Java
3535
- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
36-
rev: v2.14.0
36+
rev: v2.15.0
3737
hooks:
3838
- id: pretty-format-java
3939
args: [--aosp,--autofix]
@@ -53,3 +53,8 @@ repos:
5353
- id: detect-secrets
5454
args: ['--baseline', '.secrets.baseline']
5555
exclude: package.lock.json
56+
- repo: https://github.com/crate-ci/typos
57+
rev: v1
58+
hooks:
59+
- id: typos
60+
args: ["--force-exclude"]

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ Imports:
2727
purrr,
2828
stringr,
2929
tidyr,
30-
tidyselect
30+
tidyselect,
31+
hubUtils
3132
Remotes:
3233
forecasttools=github::cdcgov/forecasttools,
3334
hubUtils=github::hubverse-org/hubUtils,

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(check_authorized_users)
4+
export(check_changes_for_autoapproval)
45
export(excluded_locations)
56
export(generate_hub_baseline)
67
export(generate_hub_ensemble)

R/check_authorized_users.R

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
#' directories.
33
#'
44
#' This function verifies whether a GitHub
5-
#' user is authorized to modify specific directories
5+
#' user is authorized to modify specific model IDs
66
#' in a Hub by checking the designated users in model
77
#' metadata.
88
#'
9-
#' @param changed_dirs Character vector. Names of directories
10-
#' whose contents have been modified.
9+
#' @param changed_model_ids Character vector. Model IDs that
10+
#' have been modified.
1111
#' @param gh_actor Character. GitHub username of the person
1212
#' making changes.
1313
#' @param base_hub_path Character. Path to the base hub
@@ -18,19 +18,19 @@
1818
#'
1919
#' @export
2020
check_authorized_users <- function(
21-
changed_dirs,
21+
changed_model_ids,
2222
gh_actor,
2323
base_hub_path
2424
) {
25-
checkmate::assert_character(changed_dirs, min.len = 1)
25+
checkmate::assert_character(changed_model_ids, min.len = 1)
2626
checkmate::assert_string(gh_actor)
2727
checkmate::assert_string(base_hub_path)
2828

2929
model_metadata <- hubData::load_model_metadata(base_hub_path) |>
3030
dplyr::mutate(is_model_dir = TRUE) |>
3131
dplyr::rename(dir = "model_id")
3232

33-
changed_dirs_tbl <- tibble::tibble(dir = changed_dirs)
33+
changed_dirs_tbl <- tibble::tibble(dir = changed_model_ids)
3434

3535
authorization_check <- changed_dirs_tbl |>
3636
dplyr::left_join(model_metadata, by = "dir", na_matches = "never") |>

R/check_changes_for_autoapproval.R

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#' Check changed files for auto-approval eligibility.
2+
#'
3+
#' This function processes a list of changed files from a
4+
#' GitHub workflow, errors on any changes outside the
5+
#' `model-output` directory, and passes model IDs to
6+
#' [check_authorized_users()] for authorization validation.
7+
#'
#' Paths are interpreted relative to `base_hub_path`. The model
#' ID of a file under `model-output` is taken to be the name of
#' the directory that directly contains that file.
#'
8+
#' @param changed_files Character vector. List of changed
9+
#' file paths from the GitHub changed-files workflow output.
#' Must contain at least one path; an empty vector is an error.
10+
#' @param gh_actor Character. GitHub username of the
11+
#' person making changes.
12+
#' @param base_hub_path Character. Path to the base hub
13+
#' directory.
14+
#'
15+
#' @return `NULL`, invisibly, raising an error if no files were
#' changed, if changes
16+
#' are outside model-output or if the user is unauthorized.
17+
#'
18+
#' @export
19+
check_changes_for_autoapproval <- function(
20+
changed_files,
21+
gh_actor,
22+
base_hub_path
23+
) {
24+
checkmate::assert_string(gh_actor)
25+
checkmate::assert_string(base_hub_path)
26+
checkmate::assert_character(changed_files)
# The type assertion above has no minimum length, so an empty
# vector is rejected explicitly here with a dedicated message.
27+
if (length(changed_files) < 1) {
28+
cli::cli_abort(
29+
"Empty PRs cannot be autoapproved. At least one file must be changed in the pull request."
30+
)
31+
}
# Build one row per changed file: its path relative to the hub
# root, whether that path sits under `model-output`, and (only
# for files under `model-output`) the name of its containing
# directory, used as the model ID.
32+
changed_files_tbl <- tibble::tibble(
33+
full_path = changed_files
34+
) |>
35+
dplyr::mutate(
36+
path_rel_root = fs::path_rel(.data$full_path, start = !!base_hub_path),
37+
in_model_output = fs::path_has_parent(
38+
.data$path_rel_root,
39+
"model-output"
40+
),
41+
model_id = ifelse(
42+
.data$in_model_output,
43+
fs::path_dir(.data$path_rel_root) |> fs::path_file(),
44+
NA_character_
45+
)
46+
)
# Any file outside `model-output` makes the change set ineligible;
# the offending paths are listed in the error message.
47+
files_outside_model_output <- changed_files_tbl |>
48+
dplyr::filter(!.data$in_model_output) |>
49+
dplyr::pull(.data$full_path)
50+
51+
if (length(files_outside_model_output) > 0) {
52+
cli::cli_abort(
53+
c(
54+
"Auto-approval failed: Changes detected outside 'model-output' directory.",
55+
"The following files are outside 'model-output':",
56+
files_outside_model_output
57+
)
58+
)
59+
}
# Collapse to the distinct model IDs touched by this change set.
60+
changed_model_ids <- changed_files_tbl |>
61+
dplyr::filter(.data$in_model_output) |>
62+
dplyr::pull(.data$model_id) |>
63+
unique()
64+
65+
if (length(changed_model_ids) > 0) {
66+
cli::cli_inform(
67+
"Checking authorization for {length(changed_model_ids)} model director{?y/ies}: {.val {changed_model_ids}}"
68+
)
69+
70+
# Authorization itself is delegated to check_authorized_users().
check_authorized_users(
71+
changed_model_ids = changed_model_ids,
72+
gh_actor = gh_actor,
73+
base_hub_path = base_hub_path
74+
)
75+
}
76+
77+
invisible()
78+
}

R/generate_hub_ensemble.R

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ generate_hub_ensemble <- function(
9999
)
100100
}
101101

102-
ensemble_model_name <- glue::glue("{get_hub_name(disease)}-ensemble")
102+
hub_name <- get_hub_name(disease)
103+
ensemble_model_name <- glue::glue("{hub_name}-ensemble")
103104

104105
output_dirpath <- fs::path(base_hub_path, "model-output", ensemble_model_name)
105106
output_filename <- glue::glue("{reference_date}-{hub_name}-ensemble")
@@ -110,8 +111,7 @@ generate_hub_ensemble <- function(
110111

111112
weekly_forecasts <- hubData::connect_hub(base_hub_path) |>
112113
dplyr::filter(
113-
.data$reference_date == !!reference_date,
114-
!stringr::str_detect(.data$model_id, hub_name)
114+
.data$reference_date == !!reference_date
115115
) |>
116116
hubData::collect_hub()
117117

@@ -129,12 +129,18 @@ generate_hub_ensemble <- function(
129129
) |>
130130
dplyr::arrange(.data$target)
131131

132+
weekly_model_submissions_path <- fs::path(
133+
base_hub_path,
134+
"auxiliary-data",
135+
"weekly-model-submissions"
136+
)
137+
138+
fs::dir_create(weekly_model_submissions_path, recurse = TRUE)
139+
132140
forecasttools::write_tabular(
133141
weekly_models,
134142
fs::path(
135-
base_hub_path,
136-
"auxiliary-data",
137-
"weekly-model-submissions",
143+
weekly_model_submissions_path,
138144
glue::glue("{reference_date}-models-submitted-to-hub"),
139145
ext = "csv"
140146
)

R/generate_oracle_output.R

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#' Transform a modeling task represented as a nested list
2+
#' to a single data frame
3+
#'
#' For every task ID, the `required` and `optional` values are
#' combined into one vector. The `horizon` and `reference_date`
#' task IDs are dropped, and the remaining values are crossed with
#' the names of the entries in `task$output_type`.
#'
4+
#' @param task Nested list representing a modeling task,
5+
#' as one entry of the output of [hubUtils::get_round_model_tasks()].
6+
#' Must have a `target_end_date` specification.
7+
#' @return A [`tibble`][tibble::tibble()] of all potentially
8+
#' valid submittable outputs for the modeling task defined in `task`.
9+
#' Each row of the table represents a single valid forecastable quantity
10+
#' (e.g. "`target` X on `target_end_date` Y in `location` Z"),
11+
#' plus a valid submittable output_type for forecasting that quantity.
12+
#' If multiple `output_type`s are accepted for a given valid forecastable
13+
#' quantity, that quantity will be represented multiple times, with
14+
#' one row for each valid associated `output_type`.
15+
flatten_task <- function(task) {
16+
checkmate::assert_names(
17+
names(task),
18+
must.include = c("output_type", "task_ids")
19+
)
20+
checkmate::assert_names(
21+
names(task$task_ids),
22+
must.include = "target_end_date"
23+
)
## the accepted output types are the names of the `output_type` entries
24+
output_types <- names(task$output_type)
25+
26+
task_params <- purrr::map(task$task_ids, \(x) c(x$required, x$optional)) |>
27+
purrr::discard_at(c("horizon", "reference_date"))
28+
## `horizon` and `reference_date` are discarded above because they
## are redundant with `target_end_date`
29+
30+
## one row per combination of the remaining task ID values and
## output type
return(do.call(
31+
tidyr::crossing,
32+
c(task_params, list(output_type = output_types))
33+
))
34+
}
35+
36+
37+
#' Transform a group of modeling tasks represented as a list of
38+
#' nested lists into a single data frame.
39+
#'
40+
#' Calls [flatten_task()] on each entry of the task list.
41+
#'
42+
#' @param task_list List of tasks. Each entry should itself
43+
#' be a nested list that can be passed to [flatten_task()].
44+
#' @param .deduplicate Logical. Deduplicate the output if the same flat
45+
#' configuration is found multiple times while flattening the task list?
46+
#' Default `TRUE`.
47+
#'
48+
#' @return A [`tibble`][tibble::tibble()] of all potentially
49+
#' valid submittable outputs for all the modeling tasks defined in `task_list`.
50+
#' Each row of the table represents a single valid forecastable quantity
51+
#' (e.g. "`target` X on `target_end_date` Y in `location` Z"),
52+
#' plus a valid submittable output_type for forecasting that quantity.
53+
#' If multiple `output_type`s are accepted for a given valid forecastable
54+
#' quantity, that quantity will be represented multiple times, with
55+
#' one row for each valid associated `output_type`.
56+
#'
57+
flatten_task_list <- function(task_list, .deduplicate = TRUE) {
## flatten each task and row-bind the results into one table
58+
flat_tasks <- purrr::map_df(task_list, flatten_task)
59+
60+
if (.deduplicate) {
61+
flat_tasks <- dplyr::distinct(flat_tasks)
62+
}
63+
64+
return(flat_tasks)
65+
}
66+
67+
#' Generate and save oracle output for the Hub
68+
#'
#' Flattens the modeling tasks of every round in the hub's tasks
#' config into a table of forecastable quantities, inner-joins that
#' table to the hub's target time series on their shared columns,
#' and writes the result to `target-data/oracle-output.parquet`
#' under `hub_path`. The `target-data` directory is created if
#' needed.
#'
69+
#' @param hub_path Path to the hub root.
70+
#'
71+
#' @return `NULL`, invisibly, on success.
72+
#' @export
73+
generate_oracle_output <- function(hub_path) {
74+
output_dirpath <- fs::path(hub_path, "target-data")
75+
fs::dir_create(output_dirpath)
76+
target_ts <- hubData::connect_target_timeseries(hub_path)
77+
78+
config_tasks <- hubUtils::read_config(hub_path, "tasks")
79+
round_ids <- hubUtils::get_round_ids(config_tasks)
80+
81+
## this involves duplication given how hubUtils::get_round_model_tasks
82+
## behaves by default with round ids created from reference dates,
83+
## but to support hubs with round_ids created in other ways, we
84+
## do it this way and then deduplicate as needed.
85+
list_of_task_lists <- purrr::map(round_ids, \(id) {
86+
hubUtils::get_round_model_tasks(config_tasks, id)
87+
})
88+
89+
## flatten every round's task list, drop rows repeated across rounds,
## and convert `target_end_date` with as.Date()
unique_tasks <- purrr::map_df(list_of_task_lists, flatten_task_list) |>
90+
dplyr::distinct() |>
91+
dplyr::mutate(target_end_date = as.Date(.data$target_end_date))
92+
93+
## NOTE(review): presumably this keeps only the most recent version of
## each observation -- confirm against forecasttools. The `date` column
## is renamed so it can be matched against `target_end_date`.
target_data <- target_ts |>
94+
forecasttools::hub_target_data_as_of("latest", .drop = TRUE) |>
95+
dplyr::collect() |>
96+
dplyr::rename(target_end_date = "date")
97+
98+
## join on every column the two tables have in common
join_key <- intersect(
99+
colnames(unique_tasks),
100+
colnames(target_data)
101+
)
102+
103+
## the inner join keeps only task rows with a matching observation;
## `output_type_id` is set to NA for every row
oracle_data <- dplyr::inner_join(unique_tasks, target_data, by = join_key) |>
104+
dplyr::mutate(output_type_id = NA) |>
105+
dplyr::rename(
106+
oracle_value = "observation"
107+
)
108+
109+
output_file <- fs::path(output_dirpath, "oracle-output", ext = "parquet")
110+
forecasttools::write_tabular_file(oracle_data, output_file)
111+
invisible()
112+
}

R/update_hub_target_data.R

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ nssp_col_names <- list(
2424
#' is "2024-11-09".
2525
#' @param legacy_file Logical. Whether to write legacy
2626
#' CSV output (default: FALSE).
27+
#' @param nssp_update_local Logical. Whether to update NSSP
28+
#' data from local file `auxiliary-data/nssp-raw-data/latest.csv`
29+
#' (default: FALSE).
2730
#'
2831
#' @return Writes `time-series.parquet` and optionally
2932
#' legacy CSV target data files to the target-data
@@ -35,7 +38,8 @@ update_hub_target_data <- function(
3538
as_of = lubridate::today(),
3639
nhsn_first_weekending_date = lubridate::as_date("2024-11-09"),
3740
included_locations = hubhelpr::included_locations,
38-
legacy_file = FALSE
41+
legacy_file = FALSE,
42+
nssp_update_local = FALSE
3943
) {
4044
if (!disease %in% c("covid", "rsv")) {
4145
stop("'disease' must be either 'covid' or 'rsv'")
@@ -90,11 +94,31 @@ update_hub_target_data <- function(
9094
)
9195
}
9296

93-
hubverse_format_nssp_data <- forecasttools::pull_data_cdc_gov_dataset(
94-
dataset = "nssp_prop_ed_visits",
95-
columns = c(nssp_col_name, "geography"),
96-
locations = "All"
97-
) |>
97+
if (nssp_update_local) {
98+
raw_nssp_data <- forecasttools::read_tabular(
99+
fs::path(
100+
base_hub_path,
101+
"auxiliary-data",
102+
"nssp-raw-data",
103+
"latest",
104+
ext = "csv"
105+
)
106+
) |>
107+
dplyr::filter(county == "All") |>
108+
dplyr::select(
109+
week_end,
110+
geography,
111+
dplyr::all_of(nssp_col_name)
112+
)
113+
} else {
114+
raw_nssp_data <- forecasttools::pull_data_cdc_gov_dataset(
115+
dataset = "nssp_prop_ed_visits",
116+
columns = c(nssp_col_name, "geography"),
117+
locations = "All"
118+
)
119+
}
120+
121+
hubverse_format_nssp_data <- raw_nssp_data |>
98122
dplyr::mutate(
99123
date = lubridate::as_date(.data$week_end),
100124
observation = as.numeric(.data[[nssp_col_name]]) / 100,

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
## Overview
88

9-
In-progress R package providing functions for CFA Hubs maintainence.
9+
In-progress R package providing functions for CFA Hubs maintenance.
1010

1111
## Getting started
1212

0 commit comments

Comments
 (0)