diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py index a305c8a93..19fd90c7e 100644 --- a/feature_engine/discretisation/__init__.py +++ b/feature_engine/discretisation/__init__.py @@ -7,10 +7,12 @@ from .decision_tree import DecisionTreeDiscretiser from .equal_frequency import EqualFrequencyDiscretiser from .equal_width import EqualWidthDiscretiser +from .chi_merge import ChiMergeDiscretiser __all__ = [ "DecisionTreeDiscretiser", "EqualFrequencyDiscretiser", "EqualWidthDiscretiser", "ArbitraryDiscretiser", + "ChiMergeDiscretiser", ] diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py new file mode 100644 index 000000000..6ef5b2d69 --- /dev/null +++ b/feature_engine/discretisation/chi_merge.py @@ -0,0 +1,309 @@ + +from typing import List, Union + +import numpy as np +import pandas as pd +from numpy.typing import NDArray +from sklearn.utils.validation import check_is_fitted + +from feature_engine._docstrings.class_inputs import ( + _variables_numerical_docstring, + _drop_original_docstring, +) +from feature_engine._docstrings.fit_attributes import ( + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.methods import _fit_transform_docstring +from feature_engine._docstrings.substitute import Substitution +from feature_engine.dataframe_checks import ( + _check_contains_inf, + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.discretisation.base_discretiser import BaseDiscretiser +from feature_engine.variable_manipulation import ( + _check_input_parameter_variables, + _find_or_check_numerical_variables, +) + +@Substitution( + variables=_variables_numerical_docstring, + drop_original=_drop_original_docstring, + fit_transform=_fit_transform_docstring, + return_objects=BaseDiscretiser._return_object_docstring, + return_boundaries=BaseDiscretiser._return_boundaries_docstring, + binner_dict_=BaseDiscretiser._binner_dict_docstring, + fit=BaseDiscretiser._fit_docstring, + transform=BaseDiscretiser._transform_docstring +) +class ChiMergeDiscretiser(BaseDiscretiser): + """" + + Chi-Squared test is a statistical hypothesis test that assumes (the null hypothesis) + that the observed frequencies for a categorical variable match the expected frequencies + for the categorical variable. + + + Parameters + --------- + {variables} + + threshold: float, default=4.6 + The transformer will merge the frequency distributions until + all chi-scores are greater than the threshold. + + min_intervals: int, default=2 + An additional constraint for the transformer. The transformer + stops merging the distributions once the number of frequency matrix + intervals equals the min_intervals. + + max_intervals: int, default=2 + # TODO: Does not exist. Do we need this param? + + {drop_original} + + + Attributes + ---------- + frequency_matrix_intervals_: + The variable values that are used as the upper- and lower-bounds + of the frequency matrix. + + frequency_matrix_: + The frequency distributions for every interval. + + chi_test_: + The chi-scores for all adjacent frequency distributions. + + {binner_dict_} + + { + + Methods: + -------- + {fit} + + + + """ + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + threshold: Union[float, int] = 1.4, + min_intervals: int = 2, + max_intervals: int = 10, + return_object: bool = False, + return_boundaries: bool = False, + ) -> None: + + if not isinstance(threshold, (int, float)) or threshold < 0: + raise ValueError( + "threshold must be a positive integer or a float. " + f"Got {threshold} instead." + ) + + if not isinstance(min_intervals, int) or min_intervals < 2: + raise ValueError( + "min_intervals must be an integer that is greater than or " + f"equal to 2. Got {min_intervals} instead." + ) + + # TODO: Should we limit max_intervals? If so, how much? + if not isinstance(max_intervals, int) or max_intervals > 15: + raise ValueError( + "max_intervals must be an integer that is less than or " + f"equal to 15. Got {max_intervals} instead." + ) + super().__init__(return_object, return_boundaries) + + self.variables = _check_input_parameter_variables(variables) + self.threshold = threshold + self.min_intervals = min_intervals + self.max_intervals = max_intervals + + def fit(self, X: pd.DataFrame, y: pd.Series): + """ + Learn the limits of the intervals using the chi-square test. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training dataset. Can be the entire dataframe, not just the variables + to be transformed. + + y: pd.Series + y is the predicted variables. + + """ + # check input dataframe + X = check_X(X) + _check_contains_na(X, self.variables) + _check_contains_inf(X, self.variables) + + # find or check for numerical variables + # self.variables = _find_or_check_numerical_variables(X, self.variables) + + self.frequency_matrix_intervals_, self.frequency_matrix_ = ( + self._create_frequency_matrix(X, y, self.variables) + ) + self.chi_test_ = self._perform_chi_merge() + + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Sort the variable values into the intervals. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: pandas dataframe of shape = [n_samples, n_features] + The transformed data with the discrete variables. + """ + # check that fit method has been called + check_is_fitted(self) + + # check if X is a dataframe + X = check_X(X) + + # TODO: How to type hint 2 numpy arrays + def _create_frequency_matrix(self, X: pd.DataFrame, y: pd.Series, variable: str) -> [NDArray, NDArray]: + """ + Generates a frequency table in which the labels organized into bins. + + Parameters + ---------- + X: pandas series = [n_samples, ] + The data to discretised. + + y: pandas series = [n_samples, ] + The categorical data that will be arranged in the bins. + + variable: str + The variable used to count the frequency of the class labels. + + Returns + ------- + contingency_table: dict + A frequency table of the tables for each unvariable feature value. + + + """ + frequency_matrix_intervals = np.sort(np.unique(X[variable])) + unique_class_values = np.sort(np.unique(y)) + frequency_matrix = np.zeros( + (len(frequency_matrix_intervals), len(unique_class_values)) + ) + + for value, label in zip(X[variable], y): + row_idx = np.where(frequency_matrix_intervals == value)[0][0] + col_idx = np.where(unique_class_values == label)[0][0] + frequency_matrix[row_idx][col_idx] += 1 + + return frequency_matrix_intervals, frequency_matrix + + + def _calc_chi_square(self, array: np.array) -> float: + """ + Calculates chi-squared. Using the following equation: + + # TODO: Add chi2 formula docstring + + Parameters + ---------- + X: np.array = [2, n_features] + Two sequential rows from the contingency table. + + Returns + ------- + chi2: float + Determines whether two sets of measurements are related. + """ + + shape = array.shape + num_obs = float(array.sum()) + rows_sums = {} + cols_sums = {} + chi2 = 0 + + # calculate row-wise summations + for row_idx in range(shape[0]): + rows_sums[row_idx] = array[row_idx, :].sum() + + # calculate column-wise summations + for col_idx in range(shape[1]): + cols_sums[col_idx] = array[:, col_idx].sum() + + # iterate through all expect and actual value pairs. + for row_idx in range(shape[0]): + for col_idx in range(shape[1]): + expected_val = rows_sums[row_idx] * cols_sums[col_idx] / num_obs + actual_val = array[row_idx, col_idx] + + if expected_val == 0: + # prevents NaN error + chi2 += 0 + else: + chi2 += (actual_val - expected_val) ** 2 / float(expected_val) + + return chi2 + + def _perform_chi_merge(self) -> None: + """ + Merge adjacent distributions until the the minimum chi-square is greater than + the threshold or the number of frequency-matrix intervals is equal to the + limit of the minimum number of intervals. + + Parameters + ---------- + None + + Returns + ------- + None + + """ + + while self.frequency_matrix_.shape[0] > self.min_intervals: + + chi_test = {} + shape = self.frequency_matrix_.shape + + for row_idx in range(0, shape[0] - 1): + row_idx_2 = row_idx + 1 + chi2 = self._calc_chi_square( + self.frequency_matrix_[row_idx: row_idx_2 + 1] + ) + + if chi2 not in chi_test: + chi_test[chi2] = [] + + chi_test[chi2].append((row_idx, row_idx_2)) + + # use variable to merge the frequency-matrix intervals that + # have the lowest confidence that the frequency distributions are different + min_chi_score = min(chi_test.keys()) + + if min_chi_score < self.threshold: + + # reverse list allows code to remove the upperbound as it is updating the frequency matrix + for lower_bound, upper_bound in list(reversed(chi_test[min_chi_score])): + for col_idx in range(shape[1]): + # merge upper-bound distribution into lower-bound distribution + self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx] + + # delete upperbound and its distribution from the frequeny matrix + self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0) + self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0) + + # stop merge when minimum chi-score is greater than or equal to the threshold + else: + break + + return chi_test \ No newline at end of file diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py new file mode 100644 index 000000000..6d5c4a1a2 --- /dev/null +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -0,0 +1,142 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn import datasets + +from feature_engine.discretisation import ChiMergeDiscretiser + +# TODO: Should we create the df here on in conftest? + +# create dataset for unit tests +col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"] +iris_data = datasets.load_iris().data +iris = pd.DataFrame(iris_data, columns=col_names) +iris["flower"] = datasets.load_iris().target + + +def test_create_frequency_matrix(): + transformer = ChiMergeDiscretiser( + variables="sepal_length", + threshold=1.4, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + frequency_matrix = transformer._create_frequency_matrix( + X=iris[["sepal_length", "sepal_width", "petal_length"]], + y=iris["flower"], + variable="sepal_length" + ) + lengths = list(frequency_matrix[0]) + distributions = list(frequency_matrix[1]) + freq_matrix_dict = {l: list(d) for l, d in zip(lengths, distributions)} + + # number of flowers accounted for in frequency matrix + num_flowers = np.sum(distributions) + + # expected results + expected_frequency_matrix = { + 4.3: [1, 0, 0], + 4.4: [3, 0, 0], + 4.5: [1, 0, 0], + 4.6: [4, 0, 0], + 4.7: [2, 0, 0], + 4.8: [5, 0, 0], + 4.9: [4, 1, 1], + 5.0: [8, 2, 0], + 5.1: [8, 1, 0], + 5.2: [3, 1, 0], + 5.3: [1, 0, 0], + 5.4: [5, 1, 0], + 5.5: [2, 5, 0], + 5.6: [0, 5, 1], + 5.7: [2, 5, 1], + 5.8: [1, 3, 3], + 5.9: [0, 2, 1], + 6.0: [0, 4, 2], + 6.1: [0, 4, 2], + 6.2: [0, 2, 2], + 6.3: [0, 3, 6], + 6.4: [0, 2, 5], + 6.5: [0, 1, 4], + 6.6: [0, 2, 0], + 6.7: [0, 3, 5], + 6.8: [0, 1, 2], + 6.9: [0, 1, 3], + 7.0: [0, 1, 0], + 7.1: [0, 0, 1], + 7.2: [0, 0, 3], + 7.3: [0, 0, 1], + 7.4: [0, 0, 1], + 7.6: [0, 0, 1], + 7.7: [0, 0, 4], + 7.9: [0, 0, 1] + } + expected_num_flowers = iris.shape[0] + + # check results + assert freq_matrix_dict == expected_frequency_matrix + # confirm all flowers are included + assert num_flowers == expected_num_flowers + + +def test_chi_merge(): + # Test 1 - threshold is 0.5 significance level + transformer = ChiMergeDiscretiser( + variables="sepal_length", + threshold=1.4, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + transformer.fit( + iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"] + ) + + chi_scores = transformer.chi_test_.keys() + chi_scores_round = pd.Series(chi_scores).round(1) + + frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_) + + # expected results + expected_chi_scores = pd.Series( + [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6] + ) + expected_frequency_matrix_intervals = [ + 4.3, 4.9, 5.0, 5.5, 5.6, 5.7, 5.8, 5.9, 6.3, 6.6, 6.7, 7.0, 7.1 + ] + + # tests - 0.5 significance level + assert frequency_matrix_intervals == expected_frequency_matrix_intervals + assert (chi_scores_round == expected_chi_scores).all() + + # Test 2 - threshold is 0.9 significance level + transformer = ChiMergeDiscretiser( + variables="sepal_length", + threshold=4.6, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + transformer.fit( + iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"] + ) + + chi_scores = transformer.chi_test_.keys() + chi_scores_round = pd.Series(chi_scores).round(1) + + frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_) + + # expected results + expected_chi_scores = pd.Series([30.9, 6.7, 4.9, 5.9]) + expected_frequency_matrix_intervals = [4.3, 5.5, 5.8, 6.3, 7.1] + + # tests - 0.9 significance level + assert frequency_matrix_intervals == expected_frequency_matrix_intervals + assert (chi_scores_round == expected_chi_scores).all() \ No newline at end of file