1
1
# ' Check the dataset contains enough data points.
2
2
# '
3
- # ' `check_enough_train_data ` creates a *specification* of a recipe
3
+ # ' `check_enough_data ` creates a *specification* of a recipe
4
4
# ' operation that will check if variables contain enough data.
5
5
# '
6
6
# ' @param recipe A recipe object. The check will be added to the
7
7
# ' sequence of operations for this recipe.
8
8
# ' @param ... One or more selector functions to choose variables for this check.
9
9
# ' See [selections()] for more details. You will usually want to use
10
- # ' [recipes::all_predictors()] here.
10
+ # ' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
11
11
# ' @param n The minimum number of data points required for training. If this is
12
12
# ' `NULL`, the number of columns selected by `...` will be used.
13
13
# ' @param epi_keys A character vector of column names on which to group the data
21
21
# ' @param columns An internal argument that tracks which columns are evaluated
22
22
# ' for this check. Should not be used by the user.
23
23
# ' @param id A character string that is unique to this check to identify it.
24
- # ' @param skip A logical. Should the check be skipped when the
25
- # ' recipe is baked by [bake()]? While all operations are baked
26
- # ' when [prep()] is run, some operations may not be able to be
27
- # ' conducted on new data (e.g. processing the outcome variable(s)).
28
- # ' Care should be taken when using `skip = TRUE` as it may affect
29
- # ' the computations for subsequent operations.
24
+ # ' @param skip A logical. If `TRUE`, only training data is checked, while if
25
+ # ' `FALSE`, both training and prediction data are checked. Technically, this
26
+ # ' answers the question "should the check be skipped when the recipe is baked
27
+ # ' by [bake()]?" While all operations are baked when [prep()] is run, some
28
+ # ' operations may not be able to be conducted on new data (e.g. processing the
29
+ # ' outcome variable(s)). Care should be taken when using `skip = TRUE` as it
30
+ # ' may affect the computations for subsequent operations.
30
31
# ' @family checks
31
32
# ' @export
32
- # ' @details This check will break the `bake` function if any of the checked
33
- # ' columns have not enough non-NA values. If the check passes, nothing is
34
- # ' changed to the data.
33
+ # ' @details This check will break the `prep` and/or `bake` function if any of the
34
+ # ' checked columns do not have enough non-NA values. If the check passes, nothing
35
+ # ' is changed in the data. It is best used after every other step.
36
+ # '
37
+ # ' For checking training data, it is best to set `...` to be
38
+ # ' `all_predictors(), all_outcomes()`, while for checking prediction data, it
39
+ # ' is best to set `...` to be `all_predictors()` only, with `n = 1`.
35
40
# '
36
41
# ' # tidy() results
37
42
# '
38
43
# ' When you [`tidy()`][tidy.recipe()] this check, a tibble with column
39
44
# ' `terms` (the selectors or variables selected) is returned.
40
45
# '
41
- check_enough_train_data <-
46
+ check_enough_data <-
42
47
function (recipe ,
43
48
... ,
44
49
n = NULL ,
@@ -47,11 +52,11 @@ check_enough_train_data <-
47
52
role = NA ,
48
53
trained = FALSE ,
49
54
columns = NULL ,
50
- skip = FALSE ,
51
- id = rand_id(" enough_train_data " )) {
55
+ skip = TRUE ,
56
+ id = rand_id(" enough_data " )) {
52
57
recipes :: add_check(
53
58
recipe ,
54
- check_enough_train_data_new (
59
+ check_enough_data_new (
55
60
n = n ,
56
61
epi_keys = epi_keys ,
57
62
drop_na = drop_na ,
@@ -65,10 +70,10 @@ check_enough_train_data <-
65
70
)
66
71
}
67
72
68
- check_enough_train_data_new <-
73
+ check_enough_data_new <-
69
74
function (n , epi_keys , drop_na , terms , role , trained , columns , skip , id ) {
70
75
recipes :: check(
71
- subclass = " enough_train_data " ,
76
+ subclass = " enough_data " ,
72
77
prefix = " check_" ,
73
78
n = n ,
74
79
epi_keys = epi_keys ,
@@ -83,7 +88,7 @@ check_enough_train_data_new <-
83
88
}
84
89
85
90
# ' @export
86
- prep.check_enough_train_data <- function (x , training , info = NULL , ... ) {
91
+ prep.check_enough_data <- function (x , training , info = NULL , ... ) {
87
92
col_names <- recipes :: recipes_eval_select(x $ terms , training , info )
88
93
if (is.null(x $ n )) {
89
94
x $ n <- length(col_names )
@@ -102,11 +107,11 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
102
107
if (length(cols_not_enough_data ) > 0 ) {
103
108
cli_abort(
104
109
" The following columns don't have enough data to predict: {cols_not_enough_data}." ,
105
- class = " epipredict__not_enough_train_data "
110
+ class = " epipredict__not_enough_data "
106
111
)
107
112
}
108
113
109
- check_enough_train_data_new (
114
+ check_enough_data_new (
110
115
n = x $ n ,
111
116
epi_keys = x $ epi_keys ,
112
117
drop_na = x $ drop_na ,
@@ -120,7 +125,7 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
120
125
}
121
126
122
127
# ' @export
123
- bake.check_enough_train_data <- function (object , new_data , ... ) {
128
+ bake.check_enough_data <- function (object , new_data , ... ) {
124
129
col_names <- object $ columns
125
130
if (object $ drop_na ) {
126
131
non_na_data <- tidyr :: drop_na(new_data , any_of(unname(col_names )))
@@ -137,21 +142,21 @@ bake.check_enough_train_data <- function(object, new_data, ...) {
137
142
if (length(cols_not_enough_data ) > 0 ) {
138
143
cli_abort(
139
144
" The following columns don't have enough data to predict: {cols_not_enough_data}." ,
140
- class = " epipredict__not_enough_train_data "
145
+ class = " epipredict__not_enough_data "
141
146
)
142
147
}
143
148
new_data
144
149
}
145
150
146
151
# ' @export
147
- print.check_enough_train_data <- function (x , width = max(20 , options()$ width - 30 ), ... ) {
152
+ print.check_enough_data <- function (x , width = max(20 , options()$ width - 30 ), ... ) {
148
153
title <- paste0(" Check enough data (n = " , x $ n , " ) for " )
149
154
recipes :: print_step(x $ columns , x $ terms , x $ trained , title , width )
150
155
invisible (x )
151
156
}
152
157
153
158
# ' @export
154
- tidy.check_enough_train_data <- function (x , ... ) {
159
+ tidy.check_enough_data <- function (x , ... ) {
155
160
if (recipes :: is_trained(x )) {
156
161
res <- tibble(terms = unname(x $ columns ))
157
162
} else {
0 commit comments