From e79e8b642a044c9a2ec4ebf9f5b666489fe2ce68 Mon Sep 17 00:00:00 2001 From: sikai zhang Date: Tue, 13 Jan 2026 16:30:07 +0800 Subject: [PATCH 1/2] FEAT add minimize solver for NARX --- fastcan/narx/_base.py | 616 ++++++++++++++++++++------ fastcan/narx/_narx_fast.pyi | 56 ++- fastcan/narx/_narx_fast.pyx | 462 +++++++++++++------ fastcan/narx/tests/test_minimize.py | 94 ++++ fastcan/narx/tests/test_narx_hess.py | 79 +++- fastcan/narx/tests/test_narx_hessp.py | 42 +- fastcan/narx/tests/test_narx_jac.py | 15 +- 7 files changed, 1048 insertions(+), 316 deletions(-) create mode 100644 fastcan/narx/tests/test_minimize.py diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py index 3a6599b..8d6785e 100644 --- a/fastcan/narx/_base.py +++ b/fastcan/narx/_base.py @@ -8,7 +8,7 @@ from numbers import Integral import numpy as np -from scipy.optimize import least_squares +from scipy.optimize import least_squares, minimize from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin from sklearn.linear_model import LinearRegression from sklearn.utils import check_array, check_consistent_length, column_or_1d @@ -27,10 +27,16 @@ make_poly_ids, make_time_shift_features, ) -from ._narx_fast import _predict, _update_der +from ._narx_fast import ( + _compute_d2ydx2, + _compute_d2ydx2p, + _compute_dydx, + _compute_term_lib, + _predict, +) -class _MemoizeOpt: +class _OptMemoize: """Cache of residual, jac, and hess during optimization. x: parameters func: function to compute residual, jac, hess @@ -39,62 +45,147 @@ class _MemoizeOpt: 1: compute loss, grad, and hess """ - def __init__(self, func, mode): - self.func = func - self.mode = mode + def __init__( + self, + opt_residual, # residual, y_hat, mask_valid, sample_weight_sqrt_masked + opt_jac, # jac, term_lib, jc, dydx + opt_hess, # hess + opt_hessp, # hessp + sample_weight_sqrt, + ): + self.opt_residual = opt_residual + self.opt_jac = opt_jac + self.opt_hess = opt_hess + self.opt_hessp = opt_hessp + self.sample_weight_sqrt = sample_weight_sqrt + self._residual = None + self._y_hat = None + self._mask_valid = None + self._sample_weight_sqrt_masked = None + self._jac = None + self._term_lib = None + self._jc = None + self._dydx = None + self._hess = None + self._hessp = None - self.x = None - def _compute_if_needed(self, x, *args, p=None): + self.x_residual = None # Order 1 + self.x_jac = None # Order 2 + self.x_hess = None # Order 3 + self.x_hessp = None # Order 3 + + def _if_compute_residual(self, x, *args): if ( - not np.all(x == self.x) + not np.all(x == self.x_residual) or self._residual is None + or self._y_hat is None + or self._mask_valid is None + or self._sample_weight_sqrt_masked is None + ): + self.x_residual = np.asarray(x).copy() + ( + self._residual, + self._y_hat, + self._mask_valid, + self._sample_weight_sqrt_masked, + ) = self.opt_residual(x, *args) + + def _if_compute_jac(self, x, *args): + self._if_compute_residual(x, *args) + if ( + not np.all(x == self.x_jac) or self._jac is None - or (self._hess is None and self.mode == 1) - or (self._hessp is None and self.mode == 2) + or self._term_lib is None + or self._jc is None + or self._dydx is None ): - self.x = np.asarray(x).copy() - self._residual, self._jac, self._hess, self._hessp = self.func( - x, self.mode, *args, p=p + self.x_jac = np.asarray(x).copy() + ( + self._jac, + self._term_lib, + self._jc, + self._dydx, + ) = self.opt_jac( + x, + *args, + y_hat=self._y_hat, + mask_valid=self._mask_valid, + sample_weight_sqrt_masked=self._sample_weight_sqrt_masked, + ) + + def _if_compute_hess(self, x, *args): + self._if_compute_jac(x, *args) + if not np.all(x == self.x_hess) or self._hess is None: + self.x_hess = np.asarray(x).copy() + self._hess = self.opt_hess( + x, + *args, + residual=self._residual, + y_hat=self._y_hat, + mask_valid=self._mask_valid, + sample_weight_sqrt_masked=self._sample_weight_sqrt_masked, + jac=self._jac, + term_lib=self._term_lib, + jc=self._jc, + dydx=self._dydx, + ) + + def _if_compute_hessp(self, x, p, *args): + self._if_compute_jac(x, *args) + if not np.all(x == self.x_hessp) or self._hessp is None: + self.x_hessp = np.asarray(x).copy() + self._hessp = self.opt_hessp( + x, + *args, + residual=self._residual, + y_hat=self._y_hat, + mask_valid=self._mask_valid, + sample_weight_sqrt_masked=self._sample_weight_sqrt_masked, + jac=self._jac, + term_lib=self._term_lib, + jc=self._jc, + dydx=self._dydx, + p=p, ) def residual(self, x, *args): """R = sqrt(sw) * (y - y_hat)""" - self._compute_if_needed(x, *args) + self._if_compute_residual(x, *args) return self._residual def jac(self, x, *args): """J = sqrt(sw) * dydx""" - self._compute_if_needed(x, *args) + self._if_compute_jac(x, *args) return self._jac def hess(self, x, *args): """Hessian of least squares loss. H = J^T @ J + sum(R * sqrt(sw) * d2ydx2) H: (n_samples * n_outputs, n_x, n_x)""" - self._compute_if_needed(x, *args) + self._if_compute_hess(x, *args) return self._hess def hessp(self, x, p, *args): """Hessian-vector product of least squares loss. H * p = J^T @ (J @ p) + sum(R * sqrt(sw) * (d2ydx2 @ p)) Hp: (n_samples * n_outputs, n_x)""" - self._compute_if_needed(x, *args, p=p) + self._if_compute_hessp(x, p, *args) return self._hessp def loss(self, x, *args): """Least squares loss: 0.5 * sum(R^2)""" - self._compute_if_needed(x, *args) + self._if_compute_residual(x, *args) return 0.5 * np.sum(np.square(self._residual)) def grad(self, x, *args): """Gradient of least squares loss. G = sw * R * dydx = J^T @ R""" - self._compute_if_needed(x, *args) - return np.transpose(self._jac).T @ self._residual + self._if_compute_jac(x, *args) + return np.transpose(self._jac) @ self._residual class NARX(MultiOutputMixin, RegressorMixin, BaseEstimator): @@ -230,11 +321,19 @@ def __init__( "coef_init": [None, StrOptions({"one_step_ahead"}), "array-like"], "sample_weight": [None, "array-like"], "session_sizes": [None, "array-like"], + "solver": [StrOptions({"least_squares", "minimize"})], }, prefer_skip_nested_validation=True, ) def fit( - self, X, y, sample_weight=None, coef_init=None, session_sizes=None, **params + self, + X, + y, + sample_weight=None, + coef_init=None, + session_sizes=None, + solver="least_squares", + **params, ): """ Fit narx model. @@ -270,15 +369,50 @@ def fit( .. versionadded:: 0.5 + solver : {'least_squares', 'minimize'}, default='least_squares' + The SciPy solver for optimization. + + .. versionadded:: 0.5.1 + **params : dict Keyword arguments passed to - `scipy.optimize.least_squares`. + `scipy.optimize.least_squares` or `scipy.optimize.minimize`. Returns ------- self : object Fitted Estimator. """ + if solver == "least_squares": + mode = 0 # jac + else: # solver == "minimize" + mode = 1 # loss and grad + if "method" in params: + method = params["method"] + if method in ["dogleg", "trust-exact"]: + mode = 2 # hess + elif method in [ + "Newton-CG", + "trust-ncg", + "trust-krylov", + "trust-constr", + ]: + mode = 3 # hessp + elif method not in [ + None, + "CG", + "BFGS", + "Newton-CG", + "L-BFGS-B", + "TNC", + "SLSQP", + "dogleg", + "trust-ncg", + "trust-krylov", + "trust-exact", + "trust-constr", + ]: + mode = 4 # loss only none_inputs = False if X is None: # Auto-regressive model X = np.empty((1, 0), dtype=float, order="C") # Skip validation @@ -427,7 +561,6 @@ def fit( f"({(n_coef_intercept,)}), but got {coef_init.shape}." ) - mode = 1 jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids = NARX._get_jc_ids( self.feat_ids_, self.delay_ids_, self.output_ids_, n_features ) @@ -455,35 +588,80 @@ def fit( y_ids = np.r_[self.output_ids_, np.arange(self.n_outputs_, dtype=np.int32)] else: y_ids = self.output_ids_ - memoize_opt = _MemoizeOpt(NARX._func, mode=mode) - res = least_squares( - memoize_opt.residual, - x0=coef_init, - jac=memoize_opt.jac, - args=( - X, - y, - self.feat_ids_, - self.delay_ids_, - self.output_ids_, - self.fit_intercept, - sample_weight_sqrt, - session_sizes_cumsum, - self.max_delay_, - y_ids, - unique_feat_ids, - unique_delay_ids, - const_term_ids, - jac_yyd_ids, - jac_coef_ids, - jac_term_ids, - hess_yyd_ids, - hess_coef_ids, - hess_term_ids, - hess_yd_ids, - ), - **params, + memoize_opt = _OptMemoize( + self._opt_residual, + self._opt_jac, + self._opt_hess, + self._opt_hessp, + sample_weight_sqrt, ) + if mode == 0: + res = least_squares( + fun=memoize_opt.residual, + x0=coef_init, + jac=memoize_opt.jac, + args=( + X, + y, + self.feat_ids_, + self.delay_ids_, + self.output_ids_, + self.fit_intercept, + sample_weight_sqrt, + session_sizes_cumsum, + self.max_delay_, + y_ids, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + ), + **params, + ) + else: # solver == "minimize" + jac_hess_kw = {} + if mode == 1: + jac_hess_kw["jac"] = memoize_opt.grad + elif mode == 2: + jac_hess_kw["jac"] = memoize_opt.grad + jac_hess_kw["hess"] = memoize_opt.hess + elif mode == 3: + jac_hess_kw["jac"] = memoize_opt.grad + jac_hess_kw["hessp"] = memoize_opt.hessp + res = minimize( + fun=memoize_opt.loss, + x0=coef_init, + args=( + X, + y, + self.feat_ids_, + self.delay_ids_, + self.output_ids_, + self.fit_intercept, + sample_weight_sqrt, + session_sizes_cumsum, + self.max_delay_, + y_ids, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + ), + **jac_hess_kw, + **params, + ) if self.fit_intercept: self.coef_ = res.x[: -self.n_outputs_] self.intercept_ = res.x[-self.n_outputs_ :] @@ -536,7 +714,7 @@ def _get_hc_ids( hess_feat_ids = np.zeros((0, n_degree), dtype=np.int32) hess_delay_ids = np.zeros((0, n_degree), dtype=np.int32) - if mode < 1: + if mode < 2: return ( hess_yyd_ids, hess_yd_ids, @@ -646,9 +824,24 @@ def _get_jc_ids(feat_ids, delay_ids, output_ids, n_features_in): ) @staticmethod - def _func( + def _split_coef_intercept(coef_intercept, fit_intercept, y): + """ + Split coef_intercept into coef and intercept. + """ + n_samples, n_outputs = y.shape + n_x = coef_intercept.shape[0] + + if fit_intercept: + coef = coef_intercept[:-n_outputs] + intercept = coef_intercept[-n_outputs:] + else: + coef = coef_intercept + intercept = np.zeros(n_outputs, dtype=float) + return coef, intercept, n_samples, n_outputs, n_x + + @staticmethod + def _opt_residual( coef_intercept, - mode, X, y, feat_ids, @@ -669,32 +862,17 @@ def _func( hess_coef_ids, hess_term_ids, hess_yd_ids, - p=None, ): """ - Compute residual, Jacobian, and optionally Hessian. - - Parameters - ---------- - mode : int - 0: compute residual and Jacobian - 1: compute residual, Jacobian, and Hessian + Compute residual. Returns ------- residual : array of shape (n_samples,) - jac : array of shape (n_samples, n_x) - hess : None or array - None when mode=0, hessian array when mode=1 """ - n_samples, n_outputs = y.shape - n_x = coef_intercept.shape[0] - if fit_intercept: - coef = coef_intercept[:-n_outputs] - intercept = coef_intercept[-n_outputs:] - else: - coef = coef_intercept - intercept = np.zeros(n_outputs, dtype=float) + coef, intercept, n_samples, n_outputs, _ = NARX._split_coef_intercept( + coef_intercept, fit_intercept, y + ) # Compute prediction y_hat = np.zeros((n_samples, n_outputs), dtype=float) @@ -711,28 +889,138 @@ def _func( y_hat, ) - # Compute Jacobian + # Mask missing values + mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True) + y_masked = y[mask_valid] + y_hat_masked = y_hat[mask_valid] + sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid] + residual = (sample_weight_sqrt_masked * (y_hat_masked - y_masked)).flatten() + return residual, y_hat, mask_valid, sample_weight_sqrt_masked + + @staticmethod + def _opt_jac( + coef_intercept, + X, + y, + feat_ids, + delay_ids, + output_ids, + fit_intercept, + sample_weight_sqrt, + session_sizes_cumsum, + max_delay, + y_ids, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + y_hat, + mask_valid, + sample_weight_sqrt_masked, + ): + """ + Compute Jacobian. + + Returns + ------- + jac : array of shape (n_samples, n_x) + dydx : array of shape (n_samples, n_outputs, n_x) + term_lib : array of shape (n_samples, n_unique_terms) + """ + coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept( + coef_intercept, fit_intercept, y + ) + dydx = np.zeros((n_samples, n_outputs, n_x), dtype=float) jc = np.zeros((max_delay, n_outputs, n_outputs), dtype=float) + term_lib = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float) - term_libs = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float) - if mode == 0: - hc = np.zeros((1, 1, 1, 1), dtype=float) # dummy - d2ydx2 = np.zeros((1, 1, 1, 1), dtype=float) # dummy - d2ydx2p = np.zeros((1, 1, 1), dtype=float) # dummy - p = np.zeros(1, dtype=float) # dummy - elif mode == 1: - hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float) - d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float) - d2ydx2p = np.zeros((1, 1, 1), dtype=float) # dummy - p = np.zeros(1, dtype=float) # dummy - elif mode == 2: - hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float) - d2ydx2 = np.zeros((max_delay + 1, n_x, n_outputs, n_x), dtype=float) - d2ydx2p = np.zeros((n_samples, n_outputs, n_x), dtype=float) - - _update_der( - mode, + _compute_term_lib( + X, + y_hat, + max_delay, + session_sizes_cumsum, + unique_feat_ids, + unique_delay_ids, + term_lib, + ) + + _compute_dydx( + X, + y_hat, + max_delay, + session_sizes_cumsum, + y_ids, + coef, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + term_lib, + jc, + dydx, + ) + # jac (n_samples, n_outputs (out), n_x) to (n_samples * n_outputs, n_x) + dydx_masked = dydx[mask_valid] + jac = np.reshape( + dydx_masked * sample_weight_sqrt_masked[..., np.newaxis], (-1, n_x) + ) + return jac, term_lib, jc, dydx + + @staticmethod + def _opt_hess( + coef_intercept, + X, + y, + feat_ids, + delay_ids, + output_ids, + fit_intercept, + sample_weight_sqrt, + session_sizes_cumsum, + max_delay, + y_ids, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + residual, + y_hat, + mask_valid, + sample_weight_sqrt_masked, + jac, + term_lib, + jc, + dydx, + ): + """ + Compute Hessian matrix. + + Returns + ------- + hess : array of shape (n_x, n_x) + """ + + coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept( + coef_intercept, fit_intercept, y + ) + hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float) + d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float) + _compute_d2ydx2( X, y_hat, max_delay, @@ -749,65 +1037,109 @@ def _func( hess_coef_ids, hess_term_ids, hess_yd_ids, - p, - term_libs, + term_lib, + dydx, jc, hc, - dydx, d2ydx2, - d2ydx2p, ) + # d2ydx2 has shape in (n_samples, n_x, n_outputs (out), n_x) + d2ydx2_masked = ( + d2ydx2[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis, np.newaxis] + ) + # reshape to (n_samples, n_outputs (out), n_x, n_x) + d2ydx2_masked = d2ydx2_masked.swapaxes(1, 2) + # reshape to (n_samples * n_outputs, n_x, n_x) + d2ydx2_masked = np.reshape(d2ydx2_masked, (-1, n_x, n_x)) + hess = jac.T @ jac + np.tensordot(residual, d2ydx2_masked, axes=1) + hess = 0.5 * (hess + hess.T) # force symmetric + return hess - # Mask missing values - mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True) - y_masked = y[mask_valid] - y_hat_masked = y_hat[mask_valid] - sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid] - dydx_masked = dydx[mask_valid] + @staticmethod + def _opt_hessp( + coef_intercept, + X, + y, + feat_ids, + delay_ids, + output_ids, + fit_intercept, + sample_weight_sqrt, + session_sizes_cumsum, + max_delay, + y_ids, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + residual, + y_hat, + mask_valid, + sample_weight_sqrt_masked, + jac, + term_lib, + jc, + dydx, + p, + ): + """ + Compute Hessian-vector product. - # Compute residuals (n_samples, n_outputs) to (n_samples * n_outputs,) - residual = (sample_weight_sqrt_masked * (y_hat_masked - y_masked)).flatten() + Returns + ------- + hessp : array of shape (n_x,) + """ - # Compute Jacobian - # jac (n_samples, n_outputs (out), n_x) to (n_samples * n_outputs, n_x) - jac = np.reshape( - dydx_masked * sample_weight_sqrt_masked[..., np.newaxis], (-1, n_x) + coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept( + coef_intercept, fit_intercept, y ) + hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float) + d2ydx2 = np.zeros((max_delay + 1, n_x, n_outputs, n_x), dtype=float) + d2ydx2p = np.zeros((n_samples, n_outputs, n_x), dtype=float) + _compute_d2ydx2p( + X, + y_hat, + max_delay, + session_sizes_cumsum, + y_ids, + coef, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + hess_yd_ids, + term_lib, + dydx, + p, + jc, + hc, + d2ydx2, + d2ydx2p, + ) + # d2ydx2p has shape in (n_samples, n_outputs (out), n_x) + d2ydx2p_masked = ( + d2ydx2p[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis] + ) + # reshape to (n_samples * n_outputs, n_x) + d2ydx2p_masked = np.reshape(d2ydx2p_masked, (-1, n_x)) - # Compute Hessian (or Hessian-vector product) if needed - hess = None - hessp = None - if mode == 1: - # d2ydx2 has shape in (n_samples, n_x, n_outputs (out), n_x) - d2ydx2_masked = ( - d2ydx2[mask_valid] - * sample_weight_sqrt_masked[..., np.newaxis, np.newaxis] - ) - # reshape to (n_samples, n_outputs (out), n_x, n_x) - d2ydx2_masked = d2ydx2_masked.swapaxes(1, 2) - # reshape to (n_samples * n_outputs, n_x, n_x) - d2ydx2_masked = np.reshape(d2ydx2_masked, (-1, n_x, n_x)) - hess = jac.T @ jac + np.tensordot(residual, d2ydx2_masked, axes=1) - hess = 0.5 * (hess + hess.T) # force symmetric - elif mode == 2: - # d2ydx2p has shape in (n_samples, n_outputs (out), n_x) - d2ydx2p_masked = ( - d2ydx2p[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis] - ) - # reshape to (n_samples * n_outputs, n_x) - d2ydx2p_masked = np.reshape(d2ydx2p_masked, (-1, n_x)) - - # H @ p = (J^T @ J) @ p + sum(residual * (d2ydx2 @ p)) - # d2ydx2p corresponds to d2ydx2 @ p - - # (J^T @ J) @ p = J^T @ (J @ p) - hessp = jac.T @ (jac @ p) + residual @ d2ydx2p_masked + # H @ p = (J^T @ J) @ p + sum(residual * (d2ydx2 @ p)) + # d2ydx2p corresponds to d2ydx2 @ p - # residual: (n_samples * n_outputs,) - # jac: (n_samples * n_outputs, n_x) - # hess: (n_x, n_x) if mode == 1 - # hessp: (n_x,) if mode == 2 - return residual, jac, hess, hessp + # (J^T @ J) @ p = J^T @ (J @ p) + hessp = jac.T @ (jac @ p) + residual @ d2ydx2p_masked + return hessp @validate_params( { diff --git a/fastcan/narx/_narx_fast.pyi b/fastcan/narx/_narx_fast.pyi index e6f7d9a..dc835d6 100644 --- a/fastcan/narx/_narx_fast.pyi +++ b/fastcan/narx/_narx_fast.pyi @@ -16,8 +16,33 @@ def _predict( max_delay: int, y_hat: npt.NDArray[np.float64], ) -> None: ... -def _update_der( - mode: int, +def _compute_term_lib( + X: npt.NDArray[np.float64], + y_hat: npt.NDArray[np.float64], + max_delay: int, + session_sizes_cumsum: npt.NDArray[np.intc], + unique_feat_ids: npt.NDArray[np.intc], + unique_delay_ids: npt.NDArray[np.intc], + term_lib: npt.NDArray[np.float64], +) -> None: ... +def _compute_dydx( + X: npt.NDArray[np.float64], + y_hat: npt.NDArray[np.float64], + max_delay: int, + session_sizes_cumsum: npt.NDArray[np.intc], + y_ids: npt.NDArray[np.intc], + coefs: npt.NDArray[np.float64], + unique_feat_ids: npt.NDArray[np.intc], + unique_delay_ids: npt.NDArray[np.intc], + const_term_ids: npt.NDArray[np.intc], + jac_yyd_ids: npt.NDArray[np.intc], + jac_coef_ids: npt.NDArray[np.intc], + jac_term_ids: npt.NDArray[np.intc], + term_lib: npt.NDArray[np.float64], + jc: npt.NDArray[np.float64], + dydx: npt.NDArray[np.float64], +) -> None: ... +def _compute_d2ydx2( X: npt.NDArray[np.float64], y_hat: npt.NDArray[np.float64], max_delay: int, @@ -34,11 +59,34 @@ def _update_der( hess_coef_ids: npt.NDArray[np.intc], hess_term_ids: npt.NDArray[np.intc], hess_yd_ids: npt.NDArray[np.intc], - p: npt.NDArray[np.float64], - term_libs: npt.NDArray[np.float64], + term_lib: npt.NDArray[np.float64], + dydx: npt.NDArray[np.float64], jc: npt.NDArray[np.float64], hc: npt.NDArray[np.float64], + d2ydx2: npt.NDArray[np.float64], +) -> None: ... +def _compute_d2ydx2p( + X: npt.NDArray[np.float64], + y_hat: npt.NDArray[np.float64], + max_delay: int, + session_sizes_cumsum: npt.NDArray[np.intc], + y_ids: npt.NDArray[np.intc], + coefs: npt.NDArray[np.float64], + unique_feat_ids: npt.NDArray[np.intc], + unique_delay_ids: npt.NDArray[np.intc], + const_term_ids: npt.NDArray[np.intc], + jac_yyd_ids: npt.NDArray[np.intc], + jac_coef_ids: npt.NDArray[np.intc], + jac_term_ids: npt.NDArray[np.intc], + hess_yyd_ids: npt.NDArray[np.intc], + hess_coef_ids: npt.NDArray[np.intc], + hess_term_ids: npt.NDArray[np.intc], + hess_yd_ids: npt.NDArray[np.intc], + term_lib: npt.NDArray[np.float64], dydx: npt.NDArray[np.float64], + p: npt.NDArray[np.float64], + jc: npt.NDArray[np.float64], + hc: npt.NDArray[np.float64], d2ydx2: npt.NDArray[np.float64], d2ydx2p: npt.NDArray[np.float64], ) -> None: ... diff --git a/fastcan/narx/_narx_fast.pyx b/fastcan/narx/_narx_fast.pyx index f49bf98..8366c78 100644 --- a/fastcan/narx/_narx_fast.pyx +++ b/fastcan/narx/_narx_fast.pyx @@ -293,8 +293,86 @@ cdef inline void _update_d2ydx2( @final -cpdef void _update_der( - const int mode, +cdef inline void _check_at_init( + const double[:, ::1] X, + const double[:, ::1] y_hat, + const int max_delay, + const int[::1] session_sizes_cumsum, + const Py_ssize_t k, + Py_ssize_t *s, # IN/OUT pointer + Py_ssize_t *init_k, # IN/OUT pointer + bint *at_init, # IN/OUT pointer +) noexcept nogil: + """Check if at init""" + cdef bint is_finite + + # Check boundaries + if s[0] < session_sizes_cumsum.shape[0] and k == session_sizes_cumsum[s[0]]: + s[0] += 1 + init_k[0] = k + at_init[0] = True + + with gil: + is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k])) + + if not is_finite: + init_k[0] = k + 1 + at_init[0] = True + + if k - init_k[0] == max_delay: + at_init[0] = False + + +@final +cpdef void _compute_term_lib( + const double[:, ::1] X, + const double[:, ::1] y_hat, + const int max_delay, + const int[::1] session_sizes_cumsum, + const int[:, ::1] unique_feat_ids, # IN + const int[:, ::1] unique_delay_ids, # IN + double[:, ::1] term_lib, # OUT +) noexcept nogil: + """ + Pre-compute all terms for all time steps and store them in term_libs. + """ + cdef: + Py_ssize_t n_samples = X.shape[0] + Py_ssize_t k, s = 0 + Py_ssize_t init_k = 0 + bint at_init = True + + for k in range(n_samples): + # Check if at init + _check_at_init( + X, + y_hat, + max_delay, + session_sizes_cumsum, + k, + &s, + &init_k, + &at_init, + ) + if at_init: + continue + + # Compute terms for time step k + _update_terms( + X, + y_hat, + term_lib[k], + unique_feat_ids, + unique_delay_ids, + k, + ) + with gil: + if np.max(np.abs(term_lib[k])) > 1e20: + break + + +@final +cpdef void _compute_dydx( const double[:, ::1] X, const double[:, ::1] y_hat, const int max_delay, @@ -307,33 +385,16 @@ cpdef void _update_der( const int[:, ::1] jac_yyd_ids, const int[::1] jac_coef_ids, const int[::1] jac_term_ids, - const int[:, ::1] hess_yyd_ids, - const int[::1] hess_coef_ids, - const int[::1] hess_term_ids, - const int[:, ::1] hess_yd_ids, - const double[::1] p, # IN arbitrary vector - double[:, ::1] term_libs, # initialized with 1.0 + const double[:, ::1] term_lib, # initialized with 1.0 double[:, :, ::1] jc, - double[:, :, :, ::1] hc, double[:, :, ::1] dydx, # OUT initialized with 0.0 - double[:, :, :, ::1] d2ydx2, # OUT initialized with 0.0 - double[:, :, ::1] d2ydx2p, # OUT initialized with 0.0 ) noexcept nogil: """ - Computation of dydx and d2ydx2 matrix. - mode: - 0 - only dydx - 1 - both dydx and d2ydx2 - + Computation of dydx matrix. Returns ------- dydx : ndarray of shape (n_samples, n_outputs, n_x) First derivative matrix of the outputs with respect to coefficients and intercepts. - d2ydx2 : ndarray of shape (n_samples, n_x, n_outputs (out), n_x) - Second derivative matrix of the outputs with respect to coefficients and intercepts - d2ydx2p : ndarray of shape (n_samples, n_outputs, n_x) - When mode == 2, d2ydx2 is of shape (max_delay + 1, n_x, n_outputs (out), n_x) - Second derivative matrix times an arbitrary vector p. """ cdef: Py_ssize_t n_samples = y_hat.shape[0] @@ -342,50 +403,29 @@ cpdef void _update_der( Py_ssize_t N = dydx.shape[2] # n_x Py_ssize_t n_const = const_term_ids.shape[0] Py_ssize_t n_jac = jac_term_ids.shape[0] - Py_ssize_t n_hess = hess_term_ids.shape[0] Py_ssize_t init_k = 0 bint at_init = True - bint is_finite - bint jac_not_empty, hess_not_empty + bint jac_not_empty with gil: jac_not_empty = max_delay > 0 and n_jac > 0 - hess_not_empty = max_delay > 0 and n_hess > 0 for k in range(n_samples): # Check if at init - if k == session_sizes_cumsum[s]: - s += 1 - at_init = True - init_k = k - continue - with gil: - is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k])) - if not is_finite: - at_init = True - init_k = k + 1 - continue - if k - init_k == max_delay: - at_init = False - - if at_init: - continue - # Compute terms for time step k - _update_terms( + _check_at_init( X, y_hat, - term_libs[k], - unique_feat_ids, - unique_delay_ids, + max_delay, + session_sizes_cumsum, k, + &s, + &init_k, + &at_init, ) - with gil: - if np.max(np.abs(term_libs[k])) > 1e20: - break - - # Update constant terms of Jacobian + if at_init: + continue for i in range(n_const): - dydx[k, y_ids[i], i] = term_libs[k, const_term_ids[i]] + dydx[k, y_ids[i], i] = term_lib[k, const_term_ids[i]] # Update intercepts if any for i in range(n_const, N): @@ -398,7 +438,7 @@ cpdef void _update_der( jac_yyd_ids, jac_coef_ids, jac_term_ids, - term_libs[k], + term_lib[k], jc, ) for d in range(max_delay): @@ -410,90 +450,242 @@ cpdef void _update_der( 1.0, &jc[d, 0, 0], M, &dydx[k-d-1, 0, 0], N, 1.0, &dydx[k, 0, 0], N, ) - if mode == 1 and hess_not_empty: - # Update dynamic terms of Hessian - _update_d2ydx2( - coefs, - hess_yyd_ids, - hess_coef_ids, - hess_term_ids, - term_libs[k], - hess_yd_ids, - dydx, - jc, - M, - N, - max_delay, - k, - k, - hc, - d2ydx2, - ) - - # Handle divergence - with gil: - if ( - not np.all(np.isfinite(d2ydx2[k])) or - np.max(np.abs(d2ydx2[k])) > 1e20 - ): - break - - if mode == 2 and hess_not_empty: - # Update d2ydx2p - # d2ydx2 values move forward by 1 step d2ydx2[k] -> d2ydx2[k-1] - memmove( - &d2ydx2[0, 0, 0, 0], - &d2ydx2[1, 0, 0, 0], - max_delay * N * M * N * sizeof(double) - ) - memset( - &d2ydx2[max_delay, 0, 0, 0], - 0, - N * M * N * sizeof(double) - ) - - # Update dynamic terms of Hessian - _update_d2ydx2( - coefs, - hess_yyd_ids, - hess_coef_ids, - hess_term_ids, - term_libs[k], - hess_yd_ids, - dydx, - jc, - M, - N, - max_delay, - k, - max_delay, - hc, - d2ydx2, - ) - - # Compute d2ydx2p[k] = d2ydx2[k].T @ p - # d2ydx2[k] shape (N, M, N), p shape (N) - # target d2ydx2p[k] shape (M, N) - # Treat d2ydx2[k] as (N, M*N) matrix - _gemv( - RowMajor, Trans, - N, M * N, - 1.0, &d2ydx2[max_delay, 0, 0, 0], M * N, - &p[0], 1, - 0.0, &d2ydx2p[k, 0, 0], 1 - ) - - # Handle divergence - with gil: - if ( - not np.all(np.isfinite(d2ydx2p[k])) or - not np.all(np.isfinite(d2ydx2[max_delay])) or - (np.max(np.abs(d2ydx2p[k])) > 1e20) or - (np.max(np.abs(d2ydx2[max_delay])) > 1e20) - ): - break # Handle divergence with gil: if not np.all(np.isfinite(dydx[k])) or np.max(np.abs(dydx[k])) > 1e20: break + + +@final +cpdef void _compute_d2ydx2( + const double[:, ::1] X, + const double[:, ::1] y_hat, + const int max_delay, + const int[::1] session_sizes_cumsum, + const int[::1] y_ids, + const double[::1] coefs, + const int[:, ::1] unique_feat_ids, # IN + const int[:, ::1] unique_delay_ids, # IN + const int[::1] const_term_ids, + const int[:, ::1] jac_yyd_ids, + const int[::1] jac_coef_ids, + const int[::1] jac_term_ids, + const int[:, ::1] hess_yyd_ids, + const int[::1] hess_coef_ids, + const int[::1] hess_term_ids, + const int[:, ::1] hess_yd_ids, + const double[:, ::1] term_lib, + const double[:, :, ::1] dydx, + double[:, :, ::1] jc, + double[:, :, :, ::1] hc, + double[:, :, :, ::1] d2ydx2, # OUT initialized with 0.0 +) noexcept nogil: + """ + Computation of d2ydx2 matrix. + Returns + ------- + d2ydx2 : ndarray of shape (n_samples, n_x, n_outputs (out), n_x) + Second derivative matrix of the outputs with respect to coefficients and intercepts + """ + cdef: + Py_ssize_t n_samples = y_hat.shape[0] + Py_ssize_t k, s = 0 + Py_ssize_t M = jc.shape[1] # n_outputs + Py_ssize_t N = dydx.shape[2] # n_x + Py_ssize_t n_jac = jac_term_ids.shape[0] + Py_ssize_t n_hess = hess_term_ids.shape[0] + Py_ssize_t init_k = 0 + bint at_init = True + bint jac_not_empty + bint hess_not_empty + + with gil: + jac_not_empty = max_delay > 0 and n_jac > 0 + hess_not_empty = max_delay > 0 and n_hess > 0 + + for k in range(n_samples): + # Check if at init + _check_at_init( + X, + y_hat, + max_delay, + session_sizes_cumsum, + k, + &s, + &init_k, + &at_init, + ) + if at_init: + continue + + if jac_not_empty: + _update_jc( + coefs, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + term_lib[k], + jc, + ) + + if hess_not_empty: + # Update dynamic terms of Hessian + _update_d2ydx2( + coefs, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + term_lib[k], + hess_yd_ids, + dydx, + jc, + M, + N, + max_delay, + k, + k, + hc, + d2ydx2, + ) + + # Handle divergence + with gil: + if ( + not np.all(np.isfinite(d2ydx2[k])) or + np.max(np.abs(d2ydx2[k])) > 1e20 + ): + break + + +@final +cpdef void _compute_d2ydx2p( + const double[:, ::1] X, + const double[:, ::1] y_hat, + const int max_delay, + const int[::1] session_sizes_cumsum, + const int[::1] y_ids, + const double[::1] coefs, + const int[:, ::1] unique_feat_ids, # IN + const int[:, ::1] unique_delay_ids, # IN + const int[::1] const_term_ids, + const int[:, ::1] jac_yyd_ids, + const int[::1] jac_coef_ids, + const int[::1] jac_term_ids, + const int[:, ::1] hess_yyd_ids, + const int[::1] hess_coef_ids, + const int[::1] hess_term_ids, + const int[:, ::1] hess_yd_ids, + const double[:, ::1] term_lib, + const double[:, :, ::1] dydx, + const double[::1] p, # IN arbitrary vector + double[:, :, ::1] jc, + double[:, :, :, ::1] hc, + double[:, :, :, ::1] d2ydx2, # OUT initialized with 0.0 + double[:, :, ::1] d2ydx2p, # OUT initialized with 0.0 +) noexcept nogil: + """ + Computation of d2ydx2-vector product matrix. + + Returns + ------- + d2ydx2 : ndarray of shape (max_delay + 1, n_x, n_outputs (out), n_x) + Second derivative matrix of the outputs with respect to coefficients and intercepts + d2ydx2p : ndarray of shape (n_samples, n_outputs, n_x) + Second derivative matrix times an arbitrary vector p. + """ + cdef: + Py_ssize_t n_samples = y_hat.shape[0] + Py_ssize_t k, s = 0 + Py_ssize_t M = jc.shape[1] # n_outputs + Py_ssize_t N = dydx.shape[2] # n_x + Py_ssize_t n_jac = jac_term_ids.shape[0] + Py_ssize_t n_hess = hess_term_ids.shape[0] + Py_ssize_t init_k = 0 + bint at_init = True + bint jac_not_empty + bint hess_not_empty + + with gil: + jac_not_empty = max_delay > 0 and n_jac > 0 + hess_not_empty = max_delay > 0 and n_hess > 0 + + for k in range(n_samples): + # Check if at init + _check_at_init( + X, + y_hat, + max_delay, + session_sizes_cumsum, + k, + &s, + &init_k, + &at_init, + ) + if at_init: + continue + + if jac_not_empty: + _update_jc( + coefs, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + term_lib[k], + jc, + ) + + if hess_not_empty: + # Update d2ydx2p + # d2ydx2 values move forward by 1 step d2ydx2[k] -> d2ydx2[k-1] + memmove( + &d2ydx2[0, 0, 0, 0], + &d2ydx2[1, 0, 0, 0], + max_delay * N * M * N * sizeof(double) + ) + memset( + &d2ydx2[max_delay, 0, 0, 0], + 0, + N * M * N * sizeof(double) + ) + + # Update dynamic terms of Hessian + _update_d2ydx2( + coefs, + hess_yyd_ids, + hess_coef_ids, + hess_term_ids, + term_lib[k], + hess_yd_ids, + dydx, + jc, + M, + N, + max_delay, + k, + max_delay, + hc, + d2ydx2, + ) + + # Compute d2ydx2p[k] = d2ydx2[k].T @ p + # d2ydx2[k] shape (N, M, N), p shape (N) + # target d2ydx2p[k] shape (M, N) + # Treat d2ydx2[k] as (N, M*N) matrix + _gemv( + RowMajor, Trans, + N, M * N, + 1.0, &d2ydx2[max_delay, 0, 0, 0], M * N, + &p[0], 1, + 0.0, &d2ydx2p[k, 0, 0], 1 + ) + + # Handle divergence + with gil: + if ( + not np.all(np.isfinite(d2ydx2p[k])) or + not np.all(np.isfinite(d2ydx2[max_delay])) or + (np.max(np.abs(d2ydx2p[k])) > 1e20) or + (np.max(np.abs(d2ydx2[max_delay])) > 1e20) + ): + break diff --git a/fastcan/narx/tests/test_minimize.py b/fastcan/narx/tests/test_minimize.py new file mode 100644 index 0000000..019105a --- /dev/null +++ b/fastcan/narx/tests/test_minimize.py @@ -0,0 +1,94 @@ +"""Test NARX with minimize solver""" + +import numpy as np +import pytest +from sklearn.metrics import r2_score + +from fastcan.narx import make_narx +from fastcan.utils import mask_missing_values + + +@pytest.mark.parametrize("multi_output", [False, True]) +@pytest.mark.parametrize("nan", [False, True]) +@pytest.mark.parametrize( + "method", ["default", None, "L-BFGS-B", "dogleg", "trust-ncg", "CG", "Nelder-Mead"] +) +def test_minimize(nan, multi_output, method): + """Test NARX with minimize solver on synthetic data.""" + + def make_data(multi_output, nan, rng): + if multi_output: + n_samples = 1000 + max_delay = 3 + e0 = rng.normal(0, 0.1, n_samples) + e1 = rng.normal(0, 0.02, n_samples) + u0 = rng.uniform(0, 1, n_samples + max_delay) + u1 = rng.normal(0, 0.1, n_samples + max_delay) + y0 = np.zeros(n_samples + max_delay) + y1 = np.zeros(n_samples + max_delay) + for i in range(max_delay, n_samples + max_delay): + y0[i] = ( + 0.5 * y0[i - 1] + + 0.8 * y1[i - 1] + + 0.3 * u0[i] ** 2 + + 2 * u0[i - 1] * u0[i - 3] + + 1.5 * u0[i - 2] * u1[i - 3] + + 1 + ) + y1[i] = ( + 0.6 * y1[i - 1] + - 0.2 * y0[i - 1] * y1[i - 2] + + 0.3 * u1[i] ** 2 + + 1.5 * u1[i - 2] * u0[i - 3] + + 0.5 + ) + y = np.c_[y0[max_delay:] + e0, y1[max_delay:] + e1] + X = np.c_[u0[max_delay:], u1[max_delay:]] + n_outputs = 2 + else: + rng = np.random.default_rng(12345) + n_samples = 1000 + max_delay = 3 + e = rng.normal(0, 0.1, n_samples) + u0 = rng.uniform(0, 1, n_samples + max_delay) + u1 = rng.normal(0, 0.1, n_samples) + y = np.zeros(n_samples + max_delay) + for i in range(max_delay, n_samples + max_delay): + y[i] = ( + 0.5 * y[i - 1] + + 0.3 * u0[i] ** 2 + + 2 * u0[i - 1] * u0[i - 3] + + 1.5 * u0[i - 2] * u1[i - max_delay] + + 1 + ) + y = y[max_delay:] + e + X = np.c_[u0[max_delay:], u1] + n_outputs = 1 + + if nan: + X_nan_ids = rng.choice(n_samples, 20, replace=False) + y_nan_ids = rng.choice(n_samples, 10, replace=False) + X[X_nan_ids] = np.nan + y[y_nan_ids] = np.nan + + X = np.asfortranarray(X) + y = np.asfortranarray(y) + return X, y, n_outputs + + rng = np.random.default_rng(12345) + X, y, _ = make_data(multi_output, nan, rng) + + fit_kwargs = {"solver": "minimize"} + if method != "default": + fit_kwargs["method"] = method + + narx_score = make_narx( + X, + y, + n_terms_to_select=[5, 4] if multi_output else 4, + max_delay=3, + poly_degree=2, + verbose=0, + ).fit(X, y, coef_init="one_step_ahead", **fit_kwargs) + + assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.5 diff --git a/fastcan/narx/tests/test_narx_hess.py b/fastcan/narx/tests/test_narx_hess.py index 85d4329..d6b7d83 100644 --- a/fastcan/narx/tests/test_narx_hess.py +++ b/fastcan/narx/tests/test_narx_hess.py @@ -10,7 +10,13 @@ from scipy.optimize import approx_fprime from fastcan.narx import NARX -from fastcan.narx._narx_fast import _predict, _update_der +from fastcan.narx._base import _OptMemoize +from fastcan.narx._narx_fast import ( + _compute_d2ydx2, + _compute_dydx, + _compute_term_lib, + _predict, +) def _hessian_wrapper( @@ -29,10 +35,15 @@ def _hessian_wrapper( jac_delay_ids, return_hess=True, ): - mode = 1 + mode = 2 (hess_yyd_ids, hess_yd_ids, hess_coef_ids, hess_feat_ids, hess_delay_ids) = ( NARX._get_hc_ids( - jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids, X.shape[1], mode=1 + jac_yyd_ids, + jac_coef_ids, + jac_feat_ids, + jac_delay_ids, + X.shape[1], + mode=mode, ) ) @@ -56,9 +67,15 @@ def _hessian_wrapper( y_ids = np.asarray(output_ids, dtype=np.int32) if return_hess: - res, jac, hess, _ = NARX._func( - coef_intercept, - mode, + memoize = _OptMemoize( + NARX._opt_residual, + NARX._opt_jac, + NARX._opt_hess, + NARX._opt_hessp, + sample_weight_sqrt, + ) + + args = ( X, y, feat_ids, @@ -80,6 +97,11 @@ def _hessian_wrapper( hess_term_ids, hess_yd_ids, ) + + hess = memoize.hess(coef_intercept, *args) + jac = memoize.jac(coef_intercept, *args) + res = memoize.residual(coef_intercept, *args) + return res, jac, hess else: # Compute prediction @@ -106,19 +128,43 @@ def _hessian_wrapper( y_hat, ) + term_lib = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float) + _compute_term_lib( + X, + y_hat, + max_delay, + session_sizes_cumsum, + unique_feat_ids, + unique_delay_ids, + term_lib, + ) + # Compute Jacobian dydx = np.zeros((n_samples, n_outputs, n_x), dtype=float) jc = np.zeros((max_delay, n_outputs, n_outputs), dtype=float) - term_libs = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float) + _compute_dydx( + X, + y_hat, + max_delay, + session_sizes_cumsum, + y_ids, + coef, + unique_feat_ids, + unique_delay_ids, + const_term_ids, + jac_yyd_ids, + jac_coef_ids, + jac_term_ids, + term_lib, + jc, + dydx, + ) + hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float) d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float) - p = np.zeros(1, dtype=float) - d2ydx2p = np.zeros((1, 1, 1), dtype=float) - - _update_der( - mode, + _compute_d2ydx2( X, y_hat, max_delay, @@ -135,14 +181,13 @@ def _hessian_wrapper( hess_coef_ids, hess_term_ids, hess_yd_ids, - p, - term_libs, + term_lib, + dydx, jc, hc, - dydx, d2ydx2, - d2ydx2p, ) + return y_hat, d2ydx2 @@ -511,7 +556,7 @@ def get_narx_hess(feat_ids, delay_ids, output_ids): n_features = feat_ids.max() - n_outputs + 1 n_x = n_terms + n_outputs - mode = 1 + mode = 2 jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids = NARX._get_jc_ids( feat_ids, delay_ids, output_ids, n_features diff --git a/fastcan/narx/tests/test_narx_hessp.py b/fastcan/narx/tests/test_narx_hessp.py index f5d33f5..a731b6f 100644 --- a/fastcan/narx/tests/test_narx_hessp.py +++ b/fastcan/narx/tests/test_narx_hessp.py @@ -5,7 +5,7 @@ from numpy.testing import assert_allclose from fastcan.narx import NARX -from fastcan.narx._base import _MemoizeOpt +from fastcan.narx._base import _OptMemoize @pytest.mark.parametrize("seed", [10, 42, 123, 999, 2024]) @@ -66,15 +66,21 @@ def test_random(seed): # call arguments: (fun, x0, ..., args=args_tuple, ...) call_kwargs = mock_ls.call_args.kwargs args_tuple = call_kwargs["args"] + sample_weight_sqrt = args_tuple[6] + + # 1. Compute Hessian matrix using mode=2 + memoize = _OptMemoize( + NARX._opt_residual, + NARX._opt_jac, + NARX._opt_hess, + NARX._opt_hessp, + sample_weight_sqrt, + ) + H = memoize.hess(coef_init, *args_tuple) - # 1. Compute Hessian matrix using mode=1 - opt_hess = _MemoizeOpt(NARX._func, mode=1) - H = opt_hess.hess(coef_init, *args_tuple) - - # 2. Compute Hessp product using mode=2 with random vector p + # 2. Compute Hessp product using mode=3 with random vector p p = rng.standard_normal(len(coef_init)) * 0.01 - opt_hessp = _MemoizeOpt(NARX._func, mode=2) - Hp = opt_hessp.hessp(coef_init, p, *args_tuple) + Hp = memoize.hessp(coef_init, p, *args_tuple) # 3. Verify Hp == H @ p Hp_expected = H @ p @@ -184,15 +190,21 @@ def test_complex(): # Capture arguments passed to least_squares call_kwargs = mock_ls.call_args.kwargs args_tuple = call_kwargs["args"] + sample_weight_sqrt = args_tuple[6] + + # 1. Compute Hessian matrix using mode=2 + memoize = _OptMemoize( + NARX._opt_residual, + NARX._opt_jac, + NARX._opt_hess, + NARX._opt_hessp, + sample_weight_sqrt, + ) + H = memoize.hess(coef_init, *args_tuple) - # 1. Compute Hessian matrix using mode=1 - opt_hess = _MemoizeOpt(NARX._func, mode=1) - H = opt_hess.hess(coef_init, *args_tuple) - - # 2. Compute Hessp product using mode=2 with random vector p + # 2. Compute Hessp product using mode=3 with random vector p p = rng.standard_normal(len(coef_init)) - opt_hessp = _MemoizeOpt(NARX._func, mode=2) - Hp = opt_hessp.hessp(coef_init, p, *args_tuple) + Hp = memoize.hessp(coef_init, p, *args_tuple) # 3. Verify Hp == H @ p Hp_expected = H @ p diff --git a/fastcan/narx/tests/test_narx_jac.py b/fastcan/narx/tests/test_narx_jac.py index 9009947..165c0e1 100644 --- a/fastcan/narx/tests/test_narx_jac.py +++ b/fastcan/narx/tests/test_narx_jac.py @@ -6,6 +6,7 @@ from sklearn.metrics import r2_score from fastcan.narx import NARX, make_narx +from fastcan.narx._base import _OptMemoize from fastcan.narx._narx_fast import _predict @@ -50,9 +51,15 @@ def _derivative_wrapper( else: y_ids = np.asarray(output_ids, dtype=np.int32) - _, jac, _, _ = NARX._func( - coef_intercept, - 0, + memoize = _OptMemoize( + NARX._opt_residual, + NARX._opt_jac, + NARX._opt_hess, + NARX._opt_hessp, + sample_weight_sqrt, + ) + + args = ( X, y, feat_ids, @@ -74,6 +81,8 @@ def _derivative_wrapper( hess_term_ids, hess_yd_ids, ) + + jac = memoize.jac(coef_intercept, *args) return jac From 941ca3e63d2f8081014b8e36808fa71c06e25706 Mon Sep 17 00:00:00 2001 From: sikai zhang Date: Wed, 14 Jan 2026 09:21:19 +0800 Subject: [PATCH 2/2] fix versionadded and test_minimize threshold --- fastcan/_fastcan.py | 2 +- fastcan/narx/_base.py | 2 +- fastcan/narx/_utils.py | 2 +- fastcan/narx/tests/test_minimize.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py index e556c01..8d7c116 100644 --- a/fastcan/_fastcan.py +++ b/fastcan/_fastcan.py @@ -53,7 +53,7 @@ class FastCan(SelectorMixin, BaseEstimator): When `beam_width` = 1, use greedy search. When `beam_width` > 1, use beam search. - .. versionadded:: 0.5 + .. versionadded:: 0.5.0 verbose : int, default=1 The verbosity level. diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py index 8d6785e..41b8396 100644 --- a/fastcan/narx/_base.py +++ b/fastcan/narx/_base.py @@ -367,7 +367,7 @@ def fit( The sum of session_sizes should be equal to n_samples. If None, the whole data is treated as one session. - .. versionadded:: 0.5 + .. versionadded:: 0.5.0 solver : {'least_squares', 'minimize'}, default='least_squares' The SciPy solver for optimization. diff --git a/fastcan/narx/_utils.py b/fastcan/narx/_utils.py index 9f86ba9..3ce61d1 100644 --- a/fastcan/narx/_utils.py +++ b/fastcan/narx/_utils.py @@ -199,7 +199,7 @@ def make_narx( The sum of session_sizes should be equal to n_samples. If None, the whole data is treated as one session. - .. versionadded:: 0.5 + .. versionadded:: 0.5.0 max_candidates : int, default=None Maximum number of candidate polynomial terms retained before selection. diff --git a/fastcan/narx/tests/test_minimize.py b/fastcan/narx/tests/test_minimize.py index 019105a..3878cdc 100644 --- a/fastcan/narx/tests/test_minimize.py +++ b/fastcan/narx/tests/test_minimize.py @@ -91,4 +91,4 @@ def make_data(multi_output, nan, rng): verbose=0, ).fit(X, y, coef_init="one_step_ahead", **fit_kwargs) - assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.5 + assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.97