Skip to content
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Description: This package provides functions to help with the maintenance of CDC
License: Apache License (>= 2)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Imports:
checkmate,
cli,
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ export(get_forecast_data)
export(get_hub_name)
export(get_map_data)
export(included_locations)
export(summarize_ref_date_forecasts)
export(update_authorized_users)
export(update_hub_target_data)
export(write_ref_date_summary)
export(write_ref_date_summary_all)
export(write_ref_date_summary_ensemble)
importFrom(rlang,":=")
importFrom(rlang,.data)
140 changes: 10 additions & 130 deletions R/get_forecast_data.R
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the thought to retain this for backward compatibility? I think it is not used enough yet to make this worthwhile. I would just delete and make sure we port over the (currently one) hub that uses it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally wrote a deprecation comment on both get_*.R files, but I left them with minimal comment, in hopes that you might afford some remarks. I think deleting them makes the most sense (given there is not really a user base who've become accustomed to them) but didn't want to make this decision myself.

Original file line number Diff line number Diff line change
@@ -1,25 +1,10 @@
#' Generate forecast data file containing all forecast hub
#' model submissions.
#' Generate forecast data file containing all forecast hub model submissions
#'
#' This function fetches all forecast submissions from a
#' forecast hub based on the reference date. The forecast
#' data is then pivoted to create a wide format with
#' quantile levels as columns.
#'
#' The resulting file contains the following columns:
#' - `location_name`: full state name (including "US" for
#' the US state)
#' - `abbreviation`: state abbreviation
#' - `horizon`: forecast horizon
#' - `forecast_date`: date the forecast was generated
#' - `target_end_date`: target date for the forecast
#' - `model`: model name
#' - `quantile_*`: forecast values for various quantiles
#' (e.g., 0.025, 0.5, 0.975)
#' - `forecast_teams`: name of the team that generated the
#' model
#' - `forecast_fullnames`: full model name
#'
#' @param reference_date character, the reference date for
#' the forecast in YYYY-MM-DD format (ISO-8601).
#' @param base_hub_path character, path to the forecast
Expand Down Expand Up @@ -51,119 +36,14 @@ get_forecast_data <- function(
output_format = "csv",
targets = NULL
) {
checkmate::assert_choice(disease, choices = c("covid", "rsv"))
checkmate::assert_subset(horizons_to_include, choices = c(-1, 0, 1, 2, 3))
checkmate::assert_character(excluded_locations)
checkmate::assert_choice(output_format, choices = c("csv", "tsv", "parquet"))
checkmate::assert_character(targets, null.ok = TRUE)

reference_date <- lubridate::as_date(reference_date)

model_metadata <- hubData::load_model_metadata(
base_hub_path,
model_ids = NULL
)

hub_content <- hubData::connect_hub(base_hub_path)

current_forecasts <- hub_content |>
dplyr::filter(
.data$reference_date == !!reference_date,
!(.data$location %in% !!excluded_locations),
.data$horizon %in% !!horizons_to_include
) |>
hubData::collect_hub() |>
dplyr::filter(forecasttools::nullable_comparison(
.data$target,
"%in%",
!!targets
))

all_forecasts_data <- forecasttools::pivot_hubverse_quantiles_wider(
hubverse_table = current_forecasts,
pivot_quantiles = c(
"quantile_0.025" = 0.025,
"quantile_0.25" = 0.25,
"quantile_0.5" = 0.5,
"quantile_0.75" = 0.75,
"quantile_0.975" = 0.975
)
) |>
dplyr::mutate(
location_name = forecasttools::us_location_recode(
.data$location,
"hub",
"name"
),
abbreviation = forecasttools::us_location_recode(
.data$location,
"hub",
"abbr"
),
dplyr::across(
tidyselect::starts_with("quantile_"),
round,
.names = "{.col}_rounded"
),
forecast_due_date = as.Date(!!reference_date) - 3,
location_sort_order = ifelse(.data$location_name == "United States", 0, 1)
) |>
dplyr::mutate(
location_name = dplyr::case_match(
.data$location_name,
"United States" ~ "US",
.default = .data$location_name
)
) |>
dplyr::arrange(.data$location_sort_order, .data$location_name) |>
dplyr::left_join(
dplyr::distinct(
model_metadata,
.data$model_id,
.keep_all = TRUE
),
by = "model_id"
) |>
dplyr::select(
"location_name",
"abbreviation",
"horizon",
forecast_date = "reference_date",
"target_end_date",
model = "model_id",
"quantile_0.025",
"quantile_0.25",
"quantile_0.5",
"quantile_0.75",
"quantile_0.975",
"quantile_0.025_rounded",
"quantile_0.25_rounded",
"quantile_0.5_rounded",
"quantile_0.75_rounded",
"quantile_0.975_rounded",
forecast_team = "team_name",
"forecast_due_date",
model_full_name = "model_name"
)

output_folder_path <- fs::path(
hub_reports_path,
"weekly-summaries",
reference_date
)
output_filename <- glue::glue("{reference_date}_{disease}_forecasts_data")
output_filepath <- fs::path(
output_folder_path,
output_filename,
ext = output_format
write_ref_date_summary_all(
reference_date = reference_date,
base_hub_path = base_hub_path,
hub_reports_path = hub_reports_path,
disease = disease,
horizons_to_include = horizons_to_include,
excluded_locations = excluded_locations,
output_format = output_format,
targets = targets
)

fs::dir_create(output_folder_path)

if (!fs::file_exists(output_filepath)) {
forecasttools::write_tabular(all_forecasts_data, output_filepath)
cli::cli_inform("File saved as: {output_filepath}")
} else {
cli::cli_abort("File already exists: {output_filepath}")
}
}
181 changes: 10 additions & 171 deletions R/get_map_data.R
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as get_forecast_data.R

Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#' Generate map data file containing ensemble forecast
#' data.
#' Generate map data file containing ensemble forecast data
#'
#' This function loads the latest ensemble forecast data
#' from the forecast hub and processes it into the required
Expand All @@ -8,37 +7,6 @@
#' various forecast horizons, and quantiles (0.025, 0.5,
#' and 0.975).
#'
#' The ensemble data is expected to contain the following
#' columns:
#' - `reference_date`: the date of the forecast
#' - `location`: state abbreviation
#' - `horizon`: forecast horizon
#' - `target`: forecast target (e.g., "wk inc covid hosp")
#' - `target_end_date`: the forecast target date
#' - `output_type`: type of output (e.g., "quantile")
#' - `output_type_id`: quantile value (e.g., 0.025, 0.5,
#' 0.975)
#' - `value`: forecast value
#'
#' The resulting map file will have the following columns:
#' - `location_name`: full state name (including "US" for
#' the US state)
#' - `quantile_*`: the quantile forecast values (rounded
#' to two decimal places)
#' - `horizon`: forecast horizon
#' - `target`: forecast target (e.g., "7 day ahead inc
#' hosp")
#' - `target_end_date`: target date for the forecast (Ex:
#' 2024-11-30)
#' - `reference_date`: date that the forecast was generated
#' (Ex: 2024-11-23)
#' - `target_end_date_formatted`: target date for the
#' forecast, prettily re-formatted as a string (Ex:
#' "November 30, 2024")
#' - `reference_date_formatted`: date that the forecast
#' was generated, prettily re-formatted as a string
#' (Ex: "November 23, 2024")
#'
#' @param reference_date character, the reference date for
#' the forecast in YYYY-MM-DD format (ISO-8601).
#' @param base_hub_path character, path to the forecast
Expand Down Expand Up @@ -67,143 +35,14 @@ get_map_data <- function(
excluded_locations = character(0),
output_format = "csv"
) {
checkmate::assert_choice(disease, choices = c("covid", "rsv"))
checkmate::assert_subset(horizons_to_include, choices = c(-1, 0, 1, 2, 3))
checkmate::assert_data_frame(population_data)
checkmate::assert_names(
colnames(population_data),
must.include = c("location", "population")
)
checkmate::assert_character(excluded_locations)
checkmate::assert_choice(output_format, choices = c("csv", "tsv", "parquet"))

reference_date <- lubridate::as_date(reference_date)

hub_name <- get_hub_name(disease)
ensemble_model_name <- glue::glue("{hub_name}-ensemble")

ensemble_data <- hubData::connect_hub(base_hub_path) |>
dplyr::filter(
.data$reference_date == !!reference_date,
.data$model_id == !!ensemble_model_name
) |>
hubData::collect_hub()

if (nrow(ensemble_data) == 0) {
cli::cli_abort(
glue::glue(
"No ensemble data found for reference date {reference_date} ",
"and model {ensemble_model_name}"
)
)
}

# process ensemble data into the required format for Map file
map_data <- forecasttools::pivot_hubverse_quantiles_wider(
hubverse_table = ensemble_data,
pivot_quantiles = c(
"quantile_0.025" = 0.025,
"quantile_0.25" = 0.25,
"quantile_0.5" = 0.5,
"quantile_0.75" = 0.75,
"quantile_0.975" = 0.975
)
) |>
dplyr::filter(.data$horizon %in% !!horizons_to_include) |>
dplyr::filter(!(.data$location %in% !!excluded_locations)) |>
dplyr::mutate(
reference_date = as.Date(.data$reference_date),
target_end_date = as.Date(.data$target_end_date),
model = !!ensemble_model_name
) |>
# convert location column codes to full location names
dplyr::mutate(
location = forecasttools::us_location_recode(
.data$location,
"hub",
"name"
)
) |>
# long name "United States" to "US"
dplyr::mutate(
location = dplyr::case_match(
.data$location,
"United States" ~ "US",
.default = .data$location
),
# sort locations alphabetically, except for US
location_sort_order = ifelse(.data$location == "US", 0, 1)
) |>
dplyr::arrange(.data$location_sort_order, .data$location) |>
dplyr::left_join(
population_data,
by = "location"
) |>
dplyr::mutate(
population = as.numeric(.data$population),
quantile_0.025_per100k = .data$quantile_0.025 / .data$population * 100000,
quantile_0.5_per100k = .data$quantile_0.5 / .data$population * 100000,
quantile_0.975_per100k = .data$quantile_0.975 / .data$population * 100000,
quantile_0.025_count = .data$quantile_0.025,
quantile_0.5_count = .data$quantile_0.5,
quantile_0.975_count = .data$quantile_0.975,
quantile_0.025_per100k_rounded = round(.data$quantile_0.025_per100k, 2),
quantile_0.5_per100k_rounded = round(.data$quantile_0.5_per100k, 2),
quantile_0.975_per100k_rounded = round(.data$quantile_0.975_per100k, 2),
quantile_0.025_count_rounded = round(.data$quantile_0.025_count),
quantile_0.5_count_rounded = round(.data$quantile_0.5_count),
quantile_0.975_count_rounded = round(.data$quantile_0.975_count),
target_end_date_formatted = format(.data$target_end_date, "%B %d, %Y"),
reference_date_formatted = format(.data$reference_date, "%B %d, %Y"),
forecast_due_date = as.Date(!!reference_date) - 3,
forecast_due_date_formatted = format(
.data$forecast_due_date,
"%B %d, %Y"
),
) |>
dplyr::select(
location_name = "location",
"horizon",
"quantile_0.025_per100k",
"quantile_0.5_per100k",
"quantile_0.975_per100k",
"quantile_0.025_count",
"quantile_0.5_count",
"quantile_0.975_count",
"quantile_0.025_per100k_rounded",
"quantile_0.5_per100k_rounded",
"quantile_0.975_per100k_rounded",
"quantile_0.025_count_rounded",
"quantile_0.5_count_rounded",
"quantile_0.975_count_rounded",
"target",
"target_end_date",
"reference_date",
"forecast_due_date",
"target_end_date_formatted",
"forecast_due_date_formatted",
"reference_date_formatted",
"model",
)

output_folder_path <- fs::path(
hub_reports_path,
"weekly-summaries",
reference_date
)
output_filename <- glue::glue("{reference_date}_{disease}_map_data")
output_filepath <- fs::path(
output_folder_path,
output_filename,
ext = output_format
write_ref_date_summary_ensemble(
reference_date = reference_date,
base_hub_path = base_hub_path,
hub_reports_path = hub_reports_path,
disease = disease,
horizons_to_include = horizons_to_include,
population_data = population_data,
excluded_locations = excluded_locations,
output_format = output_format
)

fs::dir_create(output_folder_path)

if (!fs::file_exists(output_filepath)) {
forecasttools::write_tabular(map_data, output_filepath)
cli::cli_inform("File saved as: {output_filepath}")
} else {
cli::cli_abort("File already exists: {output_filepath}")
}
}
Loading