From e79e8b642a044c9a2ec4ebf9f5b666489fe2ce68 Mon Sep 17 00:00:00 2001
From: sikai zhang <matthew.szhang91@gmail.com>
Date: Tue, 13 Jan 2026 16:30:07 +0800
Subject: [PATCH 1/2] FEAT add minimize solver for NARX

---
 fastcan/narx/_base.py                 | 616 ++++++++++++++++++++------
 fastcan/narx/_narx_fast.pyi           |  56 ++-
 fastcan/narx/_narx_fast.pyx           | 462 +++++++++++++------
 fastcan/narx/tests/test_minimize.py   |  94 ++++
 fastcan/narx/tests/test_narx_hess.py  |  79 +++-
 fastcan/narx/tests/test_narx_hessp.py |  42 +-
 fastcan/narx/tests/test_narx_jac.py   |  15 +-
 7 files changed, 1048 insertions(+), 316 deletions(-)
 create mode 100644 fastcan/narx/tests/test_minimize.py

diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py
index 3a6599b..8d6785e 100644
--- a/fastcan/narx/_base.py
+++ b/fastcan/narx/_base.py
@@ -8,7 +8,7 @@
 from numbers import Integral
 
 import numpy as np
-from scipy.optimize import least_squares
+from scipy.optimize import least_squares, minimize
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.linear_model import LinearRegression
 from sklearn.utils import check_array, check_consistent_length, column_or_1d
@@ -27,10 +27,16 @@
     make_poly_ids,
     make_time_shift_features,
 )
-from ._narx_fast import _predict, _update_der
+from ._narx_fast import (
+    _compute_d2ydx2,
+    _compute_d2ydx2p,
+    _compute_dydx,
+    _compute_term_lib,
+    _predict,
+)
 
 
-class _MemoizeOpt:
+class _OptMemoize:
     """Cache of residual, jac, and hess during optimization.
     x: parameters
     func: function to compute residual, jac, hess
@@ -39,62 +45,147 @@ class _MemoizeOpt:
         1: compute loss, grad, and hess
     """
 
-    def __init__(self, func, mode):
-        self.func = func
-        self.mode = mode
+    def __init__(
+        self,
+        opt_residual,  # residual, y_hat, mask_valid, sample_weight_sqrt_masked
+        opt_jac,  # jac, term_lib, jc, dydx
+        opt_hess,  # hess
+        opt_hessp,  # hessp
+        sample_weight_sqrt,
+    ):
+        self.opt_residual = opt_residual
+        self.opt_jac = opt_jac
+        self.opt_hess = opt_hess
+        self.opt_hessp = opt_hessp
+        self.sample_weight_sqrt = sample_weight_sqrt
+
         self._residual = None
+        self._y_hat = None
+        self._mask_valid = None
+        self._sample_weight_sqrt_masked = None
+
         self._jac = None
+        self._term_lib = None
+        self._jc = None
+        self._dydx = None
+
         self._hess = None
+
         self._hessp = None
-        self.x = None
 
-    def _compute_if_needed(self, x, *args, p=None):
+        self.x_residual = None  # Order 1
+        self.x_jac = None  # Order 2
+        self.x_hess = None  # Order 3
+        self.x_hessp = None  # Order 3
+
+    def _if_compute_residual(self, x, *args):
         if (
-            not np.all(x == self.x)
+            not np.all(x == self.x_residual)
             or self._residual is None
+            or self._y_hat is None
+            or self._mask_valid is None
+            or self._sample_weight_sqrt_masked is None
+        ):
+            self.x_residual = np.asarray(x).copy()
+            (
+                self._residual,
+                self._y_hat,
+                self._mask_valid,
+                self._sample_weight_sqrt_masked,
+            ) = self.opt_residual(x, *args)
+
+    def _if_compute_jac(self, x, *args):
+        self._if_compute_residual(x, *args)
+        if (
+            not np.all(x == self.x_jac)
             or self._jac is None
-            or (self._hess is None and self.mode == 1)
-            or (self._hessp is None and self.mode == 2)
+            or self._term_lib is None
+            or self._jc is None
+            or self._dydx is None
         ):
-            self.x = np.asarray(x).copy()
-            self._residual, self._jac, self._hess, self._hessp = self.func(
-                x, self.mode, *args, p=p
+            self.x_jac = np.asarray(x).copy()
+            (
+                self._jac,
+                self._term_lib,
+                self._jc,
+                self._dydx,
+            ) = self.opt_jac(
+                x,
+                *args,
+                y_hat=self._y_hat,
+                mask_valid=self._mask_valid,
+                sample_weight_sqrt_masked=self._sample_weight_sqrt_masked,
+            )
+
+    def _if_compute_hess(self, x, *args):
+        self._if_compute_jac(x, *args)
+        if not np.all(x == self.x_hess) or self._hess is None:
+            self.x_hess = np.asarray(x).copy()
+            self._hess = self.opt_hess(
+                x,
+                *args,
+                residual=self._residual,
+                y_hat=self._y_hat,
+                mask_valid=self._mask_valid,
+                sample_weight_sqrt_masked=self._sample_weight_sqrt_masked,
+                jac=self._jac,
+                term_lib=self._term_lib,
+                jc=self._jc,
+                dydx=self._dydx,
+            )
+
+    def _if_compute_hessp(self, x, p, *args):
+        self._if_compute_jac(x, *args)
+        if not np.all(x == self.x_hessp) or self._hessp is None:
+            self.x_hessp = np.asarray(x).copy()
+            self._hessp = self.opt_hessp(
+                x,
+                *args,
+                residual=self._residual,
+                y_hat=self._y_hat,
+                mask_valid=self._mask_valid,
+                sample_weight_sqrt_masked=self._sample_weight_sqrt_masked,
+                jac=self._jac,
+                term_lib=self._term_lib,
+                jc=self._jc,
+                dydx=self._dydx,
+                p=p,
             )
 
     def residual(self, x, *args):
         """R = sqrt(sw) * (y - y_hat)"""
-        self._compute_if_needed(x, *args)
+        self._if_compute_residual(x, *args)
         return self._residual
 
     def jac(self, x, *args):
         """J = sqrt(sw) * dydx"""
-        self._compute_if_needed(x, *args)
+        self._if_compute_jac(x, *args)
         return self._jac
 
     def hess(self, x, *args):
         """Hessian of least squares loss.
         H = J^T @ J + sum(R * sqrt(sw) * d2ydx2)
         H: (n_samples * n_outputs, n_x, n_x)"""
-        self._compute_if_needed(x, *args)
+        self._if_compute_hess(x, *args)
         return self._hess
 
     def hessp(self, x, p, *args):
         """Hessian-vector product of least squares loss.
         H * p = J^T @ (J @ p) + sum(R * sqrt(sw) * (d2ydx2 @ p))
         Hp: (n_samples * n_outputs, n_x)"""
-        self._compute_if_needed(x, *args, p=p)
+        self._if_compute_hessp(x, p, *args)
         return self._hessp
 
     def loss(self, x, *args):
         """Least squares loss: 0.5 * sum(R^2)"""
-        self._compute_if_needed(x, *args)
+        self._if_compute_residual(x, *args)
         return 0.5 * np.sum(np.square(self._residual))
 
     def grad(self, x, *args):
         """Gradient of least squares loss.
         G = sw * R * dydx =  J^T @ R"""
-        self._compute_if_needed(x, *args)
-        return np.transpose(self._jac).T @ self._residual
+        self._if_compute_jac(x, *args)
+        return np.transpose(self._jac) @ self._residual
 
 
 class NARX(MultiOutputMixin, RegressorMixin, BaseEstimator):
@@ -230,11 +321,19 @@ def __init__(
             "coef_init": [None, StrOptions({"one_step_ahead"}), "array-like"],
             "sample_weight": [None, "array-like"],
             "session_sizes": [None, "array-like"],
+            "solver": [StrOptions({"least_squares", "minimize"})],
         },
         prefer_skip_nested_validation=True,
     )
     def fit(
-        self, X, y, sample_weight=None, coef_init=None, session_sizes=None, **params
+        self,
+        X,
+        y,
+        sample_weight=None,
+        coef_init=None,
+        session_sizes=None,
+        solver="least_squares",
+        **params,
     ):
         """
         Fit narx model.
@@ -270,15 +369,50 @@ def fit(
 
             .. versionadded:: 0.5
 
+        solver : {'least_squares', 'minimize'}, default='least_squares'
+            The SciPy solver for optimization.
+
+            .. versionadded:: 0.5.1
+
         **params : dict
             Keyword arguments passed to
-            `scipy.optimize.least_squares`.
+            `scipy.optimize.least_squares` or `scipy.optimize.minimize`.
 
         Returns
         -------
         self : object
             Fitted Estimator.
         """
+        if solver == "least_squares":
+            mode = 0  # jac
+        else:  # solver == "minimize"
+            mode = 1  # loss and grad
+            if "method" in params:
+                method = params["method"]
+                if method in ["dogleg", "trust-exact"]:
+                    mode = 2  # hess
+                elif method in [
+                    "Newton-CG",
+                    "trust-ncg",
+                    "trust-krylov",
+                    "trust-constr",
+                ]:
+                    mode = 3  # hessp
+                elif method not in [
+                    None,
+                    "CG",
+                    "BFGS",
+                    "Newton-CG",
+                    "L-BFGS-B",
+                    "TNC",
+                    "SLSQP",
+                    "dogleg",
+                    "trust-ncg",
+                    "trust-krylov",
+                    "trust-exact",
+                    "trust-constr",
+                ]:
+                    mode = 4  # loss only
         none_inputs = False
         if X is None:  # Auto-regressive model
             X = np.empty((1, 0), dtype=float, order="C")  # Skip validation
@@ -427,7 +561,6 @@ def fit(
                     f"({(n_coef_intercept,)}), but got {coef_init.shape}."
                 )
 
-        mode = 1
         jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids = NARX._get_jc_ids(
             self.feat_ids_, self.delay_ids_, self.output_ids_, n_features
         )
@@ -455,35 +588,80 @@ def fit(
             y_ids = np.r_[self.output_ids_, np.arange(self.n_outputs_, dtype=np.int32)]
         else:
             y_ids = self.output_ids_
-        memoize_opt = _MemoizeOpt(NARX._func, mode=mode)
-        res = least_squares(
-            memoize_opt.residual,
-            x0=coef_init,
-            jac=memoize_opt.jac,
-            args=(
-                X,
-                y,
-                self.feat_ids_,
-                self.delay_ids_,
-                self.output_ids_,
-                self.fit_intercept,
-                sample_weight_sqrt,
-                session_sizes_cumsum,
-                self.max_delay_,
-                y_ids,
-                unique_feat_ids,
-                unique_delay_ids,
-                const_term_ids,
-                jac_yyd_ids,
-                jac_coef_ids,
-                jac_term_ids,
-                hess_yyd_ids,
-                hess_coef_ids,
-                hess_term_ids,
-                hess_yd_ids,
-            ),
-            **params,
+        memoize_opt = _OptMemoize(
+            self._opt_residual,
+            self._opt_jac,
+            self._opt_hess,
+            self._opt_hessp,
+            sample_weight_sqrt,
         )
+        if mode == 0:
+            res = least_squares(
+                fun=memoize_opt.residual,
+                x0=coef_init,
+                jac=memoize_opt.jac,
+                args=(
+                    X,
+                    y,
+                    self.feat_ids_,
+                    self.delay_ids_,
+                    self.output_ids_,
+                    self.fit_intercept,
+                    sample_weight_sqrt,
+                    session_sizes_cumsum,
+                    self.max_delay_,
+                    y_ids,
+                    unique_feat_ids,
+                    unique_delay_ids,
+                    const_term_ids,
+                    jac_yyd_ids,
+                    jac_coef_ids,
+                    jac_term_ids,
+                    hess_yyd_ids,
+                    hess_coef_ids,
+                    hess_term_ids,
+                    hess_yd_ids,
+                ),
+                **params,
+            )
+        else:  # solver == "minimize"
+            jac_hess_kw = {}
+            if mode == 1:
+                jac_hess_kw["jac"] = memoize_opt.grad
+            elif mode == 2:
+                jac_hess_kw["jac"] = memoize_opt.grad
+                jac_hess_kw["hess"] = memoize_opt.hess
+            elif mode == 3:
+                jac_hess_kw["jac"] = memoize_opt.grad
+                jac_hess_kw["hessp"] = memoize_opt.hessp
+            res = minimize(
+                fun=memoize_opt.loss,
+                x0=coef_init,
+                args=(
+                    X,
+                    y,
+                    self.feat_ids_,
+                    self.delay_ids_,
+                    self.output_ids_,
+                    self.fit_intercept,
+                    sample_weight_sqrt,
+                    session_sizes_cumsum,
+                    self.max_delay_,
+                    y_ids,
+                    unique_feat_ids,
+                    unique_delay_ids,
+                    const_term_ids,
+                    jac_yyd_ids,
+                    jac_coef_ids,
+                    jac_term_ids,
+                    hess_yyd_ids,
+                    hess_coef_ids,
+                    hess_term_ids,
+                    hess_yd_ids,
+                ),
+                **jac_hess_kw,
+                **params,
+            )
         if self.fit_intercept:
             self.coef_ = res.x[: -self.n_outputs_]
             self.intercept_ = res.x[-self.n_outputs_ :]
@@ -536,7 +714,7 @@ def _get_hc_ids(
         hess_feat_ids = np.zeros((0, n_degree), dtype=np.int32)
         hess_delay_ids = np.zeros((0, n_degree), dtype=np.int32)
 
-        if mode < 1:
+        if mode < 2:
             return (
                 hess_yyd_ids,
                 hess_yd_ids,
@@ -646,9 +824,24 @@ def _get_jc_ids(feat_ids, delay_ids, output_ids, n_features_in):
         )
 
     @staticmethod
-    def _func(
+    def _split_coef_intercept(coef_intercept, fit_intercept, y):
+        """
+        Split coef_intercept into coef and intercept.
+        """
+        n_samples, n_outputs = y.shape
+        n_x = coef_intercept.shape[0]
+
+        if fit_intercept:
+            coef = coef_intercept[:-n_outputs]
+            intercept = coef_intercept[-n_outputs:]
+        else:
+            coef = coef_intercept
+            intercept = np.zeros(n_outputs, dtype=float)
+        return coef, intercept, n_samples, n_outputs, n_x
+
+    @staticmethod
+    def _opt_residual(
         coef_intercept,
-        mode,
         X,
         y,
         feat_ids,
@@ -669,32 +862,17 @@ def _func(
         hess_coef_ids,
         hess_term_ids,
         hess_yd_ids,
-        p=None,
     ):
         """
-        Compute residual, Jacobian, and optionally Hessian.
-
-        Parameters
-        ----------
-        mode : int
-            0: compute residual and Jacobian
-            1: compute residual, Jacobian, and Hessian
+        Compute residual.
 
         Returns
         -------
         residual : array of shape (n_samples,)
-        jac : array of shape (n_samples, n_x)
-        hess : None or array
-            None when mode=0, hessian array when mode=1
         """
-        n_samples, n_outputs = y.shape
-        n_x = coef_intercept.shape[0]
-        if fit_intercept:
-            coef = coef_intercept[:-n_outputs]
-            intercept = coef_intercept[-n_outputs:]
-        else:
-            coef = coef_intercept
-            intercept = np.zeros(n_outputs, dtype=float)
+        coef, intercept, n_samples, n_outputs, _ = NARX._split_coef_intercept(
+            coef_intercept, fit_intercept, y
+        )
 
         # Compute prediction
         y_hat = np.zeros((n_samples, n_outputs), dtype=float)
@@ -711,28 +889,138 @@ def _func(
             y_hat,
         )
 
-        # Compute Jacobian
+        # Mask missing values
+        mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)
+        y_masked = y[mask_valid]
+        y_hat_masked = y_hat[mask_valid]
+        sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid]
+        residual = (sample_weight_sqrt_masked * (y_hat_masked - y_masked)).flatten()
+        return residual, y_hat, mask_valid, sample_weight_sqrt_masked
+
+    @staticmethod
+    def _opt_jac(
+        coef_intercept,
+        X,
+        y,
+        feat_ids,
+        delay_ids,
+        output_ids,
+        fit_intercept,
+        sample_weight_sqrt,
+        session_sizes_cumsum,
+        max_delay,
+        y_ids,
+        unique_feat_ids,
+        unique_delay_ids,
+        const_term_ids,
+        jac_yyd_ids,
+        jac_coef_ids,
+        jac_term_ids,
+        hess_yyd_ids,
+        hess_coef_ids,
+        hess_term_ids,
+        hess_yd_ids,
+        y_hat,
+        mask_valid,
+        sample_weight_sqrt_masked,
+    ):
+        """
+        Compute Jacobian.
+
+        Returns
+        -------
+        jac : array of shape (n_samples, n_x)
+        dydx : array of shape (n_samples, n_outputs, n_x)
+        term_lib : array of shape (n_samples, n_unique_terms)
+        """
+        coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept(
+            coef_intercept, fit_intercept, y
+        )
+
         dydx = np.zeros((n_samples, n_outputs, n_x), dtype=float)
         jc = np.zeros((max_delay, n_outputs, n_outputs), dtype=float)
+        term_lib = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float)
 
-        term_libs = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float)
-        if mode == 0:
-            hc = np.zeros((1, 1, 1, 1), dtype=float)  # dummy
-            d2ydx2 = np.zeros((1, 1, 1, 1), dtype=float)  # dummy
-            d2ydx2p = np.zeros((1, 1, 1), dtype=float)  # dummy
-            p = np.zeros(1, dtype=float)  # dummy
-        elif mode == 1:
-            hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float)
-            d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float)
-            d2ydx2p = np.zeros((1, 1, 1), dtype=float)  # dummy
-            p = np.zeros(1, dtype=float)  # dummy
-        elif mode == 2:
-            hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float)
-            d2ydx2 = np.zeros((max_delay + 1, n_x, n_outputs, n_x), dtype=float)
-            d2ydx2p = np.zeros((n_samples, n_outputs, n_x), dtype=float)
-
-        _update_der(
-            mode,
+        _compute_term_lib(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            unique_feat_ids,
+            unique_delay_ids,
+            term_lib,
+        )
+
+        _compute_dydx(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            y_ids,
+            coef,
+            unique_feat_ids,
+            unique_delay_ids,
+            const_term_ids,
+            jac_yyd_ids,
+            jac_coef_ids,
+            jac_term_ids,
+            term_lib,
+            jc,
+            dydx,
+        )
+        # jac (n_samples, n_outputs (out), n_x) to (n_samples * n_outputs, n_x)
+        dydx_masked = dydx[mask_valid]
+        jac = np.reshape(
+            dydx_masked * sample_weight_sqrt_masked[..., np.newaxis], (-1, n_x)
+        )
+        return jac, term_lib, jc, dydx
+
+    @staticmethod
+    def _opt_hess(
+        coef_intercept,
+        X,
+        y,
+        feat_ids,
+        delay_ids,
+        output_ids,
+        fit_intercept,
+        sample_weight_sqrt,
+        session_sizes_cumsum,
+        max_delay,
+        y_ids,
+        unique_feat_ids,
+        unique_delay_ids,
+        const_term_ids,
+        jac_yyd_ids,
+        jac_coef_ids,
+        jac_term_ids,
+        hess_yyd_ids,
+        hess_coef_ids,
+        hess_term_ids,
+        hess_yd_ids,
+        residual,
+        y_hat,
+        mask_valid,
+        sample_weight_sqrt_masked,
+        jac,
+        term_lib,
+        jc,
+        dydx,
+    ):
+        """
+        Compute Hessian matrix.
+
+        Returns
+        -------
+        hess : array of shape (n_x, n_x)
+        """
+
+        coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept(
+            coef_intercept, fit_intercept, y
+        )
+        hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float)
+        d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float)
+        _compute_d2ydx2(
             X,
             y_hat,
             max_delay,
@@ -749,65 +1037,109 @@ def _func(
             hess_coef_ids,
             hess_term_ids,
             hess_yd_ids,
-            p,
-            term_libs,
+            term_lib,
+            dydx,
             jc,
             hc,
-            dydx,
             d2ydx2,
-            d2ydx2p,
         )
+        # d2ydx2 has shape in (n_samples, n_x, n_outputs (out), n_x)
+        d2ydx2_masked = (
+            d2ydx2[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis, np.newaxis]
+        )
+        # reshape to (n_samples, n_outputs (out), n_x, n_x)
+        d2ydx2_masked = d2ydx2_masked.swapaxes(1, 2)
+        # reshape to (n_samples * n_outputs, n_x, n_x)
+        d2ydx2_masked = np.reshape(d2ydx2_masked, (-1, n_x, n_x))
+        hess = jac.T @ jac + np.tensordot(residual, d2ydx2_masked, axes=1)
+        hess = 0.5 * (hess + hess.T)  # force symmetric
+        return hess
 
-        # Mask missing values
-        mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)
-        y_masked = y[mask_valid]
-        y_hat_masked = y_hat[mask_valid]
-        sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid]
-        dydx_masked = dydx[mask_valid]
+    @staticmethod
+    def _opt_hessp(
+        coef_intercept,
+        X,
+        y,
+        feat_ids,
+        delay_ids,
+        output_ids,
+        fit_intercept,
+        sample_weight_sqrt,
+        session_sizes_cumsum,
+        max_delay,
+        y_ids,
+        unique_feat_ids,
+        unique_delay_ids,
+        const_term_ids,
+        jac_yyd_ids,
+        jac_coef_ids,
+        jac_term_ids,
+        hess_yyd_ids,
+        hess_coef_ids,
+        hess_term_ids,
+        hess_yd_ids,
+        residual,
+        y_hat,
+        mask_valid,
+        sample_weight_sqrt_masked,
+        jac,
+        term_lib,
+        jc,
+        dydx,
+        p,
+    ):
+        """
+        Compute Hessian-vector product.
 
-        # Compute residuals (n_samples, n_outputs) to (n_samples * n_outputs,)
-        residual = (sample_weight_sqrt_masked * (y_hat_masked - y_masked)).flatten()
+        Returns
+        -------
+        hessp : array of shape (n_x,)
+        """
 
-        # Compute Jacobian
-        # jac (n_samples, n_outputs (out), n_x) to (n_samples * n_outputs, n_x)
-        jac = np.reshape(
-            dydx_masked * sample_weight_sqrt_masked[..., np.newaxis], (-1, n_x)
+        coef, _, n_samples, n_outputs, n_x = NARX._split_coef_intercept(
+            coef_intercept, fit_intercept, y
         )
+        hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float)
+        d2ydx2 = np.zeros((max_delay + 1, n_x, n_outputs, n_x), dtype=float)
+        d2ydx2p = np.zeros((n_samples, n_outputs, n_x), dtype=float)
+        _compute_d2ydx2p(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            y_ids,
+            coef,
+            unique_feat_ids,
+            unique_delay_ids,
+            const_term_ids,
+            jac_yyd_ids,
+            jac_coef_ids,
+            jac_term_ids,
+            hess_yyd_ids,
+            hess_coef_ids,
+            hess_term_ids,
+            hess_yd_ids,
+            term_lib,
+            dydx,
+            p,
+            jc,
+            hc,
+            d2ydx2,
+            d2ydx2p,
+        )
+        # d2ydx2p has shape in (n_samples, n_outputs (out), n_x)
+        d2ydx2p_masked = (
+            d2ydx2p[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis]
+        )
+        # reshape to (n_samples * n_outputs, n_x)
+        d2ydx2p_masked = np.reshape(d2ydx2p_masked, (-1, n_x))
 
-        # Compute Hessian (or Hessian-vector product) if needed
-        hess = None
-        hessp = None
-        if mode == 1:
-            # d2ydx2 has shape in (n_samples, n_x, n_outputs (out), n_x)
-            d2ydx2_masked = (
-                d2ydx2[mask_valid]
-                * sample_weight_sqrt_masked[..., np.newaxis, np.newaxis]
-            )
-            # reshape to (n_samples, n_outputs (out), n_x, n_x)
-            d2ydx2_masked = d2ydx2_masked.swapaxes(1, 2)
-            # reshape to (n_samples * n_outputs, n_x, n_x)
-            d2ydx2_masked = np.reshape(d2ydx2_masked, (-1, n_x, n_x))
-            hess = jac.T @ jac + np.tensordot(residual, d2ydx2_masked, axes=1)
-            hess = 0.5 * (hess + hess.T)  # force symmetric
-        elif mode == 2:
-            # d2ydx2p has shape in (n_samples, n_outputs (out), n_x)
-            d2ydx2p_masked = (
-                d2ydx2p[mask_valid] * sample_weight_sqrt_masked[..., np.newaxis]
-            )
-            # reshape to (n_samples * n_outputs, n_x)
-            d2ydx2p_masked = np.reshape(d2ydx2p_masked, (-1, n_x))
-
-            # H @ p = (J^T @ J) @ p + sum(residual * (d2ydx2 @ p))
-            # d2ydx2p corresponds to d2ydx2 @ p
-
-            # (J^T @ J) @ p = J^T @ (J @ p)
-            hessp = jac.T @ (jac @ p) + residual @ d2ydx2p_masked
+        # H @ p = (J^T @ J) @ p + sum(residual * (d2ydx2 @ p))
+        # d2ydx2p corresponds to d2ydx2 @ p
 
-        # residual: (n_samples * n_outputs,)
-        # jac: (n_samples * n_outputs, n_x)
-        # hess: (n_x, n_x) if mode == 1
-        # hessp: (n_x,) if mode == 2
-        return residual, jac, hess, hessp
+        # (J^T @ J) @ p = J^T @ (J @ p)
+        hessp = jac.T @ (jac @ p) + residual @ d2ydx2p_masked
+        return hessp
 
     @validate_params(
         {
diff --git a/fastcan/narx/_narx_fast.pyi b/fastcan/narx/_narx_fast.pyi
index e6f7d9a..dc835d6 100644
--- a/fastcan/narx/_narx_fast.pyi
+++ b/fastcan/narx/_narx_fast.pyi
@@ -16,8 +16,33 @@ def _predict(
     max_delay: int,
     y_hat: npt.NDArray[np.float64],
 ) -> None: ...
-def _update_der(
-    mode: int,
+def _compute_term_lib(
+    X: npt.NDArray[np.float64],
+    y_hat: npt.NDArray[np.float64],
+    max_delay: int,
+    session_sizes_cumsum: npt.NDArray[np.intc],
+    unique_feat_ids: npt.NDArray[np.intc],
+    unique_delay_ids: npt.NDArray[np.intc],
+    term_lib: npt.NDArray[np.float64],
+) -> None: ...
+def _compute_dydx(
+    X: npt.NDArray[np.float64],
+    y_hat: npt.NDArray[np.float64],
+    max_delay: int,
+    session_sizes_cumsum: npt.NDArray[np.intc],
+    y_ids: npt.NDArray[np.intc],
+    coefs: npt.NDArray[np.float64],
+    unique_feat_ids: npt.NDArray[np.intc],
+    unique_delay_ids: npt.NDArray[np.intc],
+    const_term_ids: npt.NDArray[np.intc],
+    jac_yyd_ids: npt.NDArray[np.intc],
+    jac_coef_ids: npt.NDArray[np.intc],
+    jac_term_ids: npt.NDArray[np.intc],
+    term_lib: npt.NDArray[np.float64],
+    jc: npt.NDArray[np.float64],
+    dydx: npt.NDArray[np.float64],
+) -> None: ...
+def _compute_d2ydx2(
     X: npt.NDArray[np.float64],
     y_hat: npt.NDArray[np.float64],
     max_delay: int,
@@ -34,11 +59,34 @@ def _update_der(
     hess_coef_ids: npt.NDArray[np.intc],
     hess_term_ids: npt.NDArray[np.intc],
     hess_yd_ids: npt.NDArray[np.intc],
-    p: npt.NDArray[np.float64],
-    term_libs: npt.NDArray[np.float64],
+    term_lib: npt.NDArray[np.float64],
+    dydx: npt.NDArray[np.float64],
     jc: npt.NDArray[np.float64],
     hc: npt.NDArray[np.float64],
+    d2ydx2: npt.NDArray[np.float64],
+) -> None: ...
+def _compute_d2ydx2p(
+    X: npt.NDArray[np.float64],
+    y_hat: npt.NDArray[np.float64],
+    max_delay: int,
+    session_sizes_cumsum: npt.NDArray[np.intc],
+    y_ids: npt.NDArray[np.intc],
+    coefs: npt.NDArray[np.float64],
+    unique_feat_ids: npt.NDArray[np.intc],
+    unique_delay_ids: npt.NDArray[np.intc],
+    const_term_ids: npt.NDArray[np.intc],
+    jac_yyd_ids: npt.NDArray[np.intc],
+    jac_coef_ids: npt.NDArray[np.intc],
+    jac_term_ids: npt.NDArray[np.intc],
+    hess_yyd_ids: npt.NDArray[np.intc],
+    hess_coef_ids: npt.NDArray[np.intc],
+    hess_term_ids: npt.NDArray[np.intc],
+    hess_yd_ids: npt.NDArray[np.intc],
+    term_lib: npt.NDArray[np.float64],
     dydx: npt.NDArray[np.float64],
+    p: npt.NDArray[np.float64],
+    jc: npt.NDArray[np.float64],
+    hc: npt.NDArray[np.float64],
     d2ydx2: npt.NDArray[np.float64],
     d2ydx2p: npt.NDArray[np.float64],
 ) -> None: ...
diff --git a/fastcan/narx/_narx_fast.pyx b/fastcan/narx/_narx_fast.pyx
index f49bf98..8366c78 100644
--- a/fastcan/narx/_narx_fast.pyx
+++ b/fastcan/narx/_narx_fast.pyx
@@ -293,8 +293,86 @@ cdef inline void _update_d2ydx2(
 
 
 @final
-cpdef void _update_der(
-    const int mode,
+cdef inline void _check_at_init(
+    const double[:, ::1] X,
+    const double[:, ::1] y_hat,
+    const int max_delay,
+    const int[::1] session_sizes_cumsum,
+    const Py_ssize_t k,
+    Py_ssize_t *s,                          # IN/OUT pointer
+    Py_ssize_t *init_k,                     # IN/OUT pointer
+    bint *at_init,                          # IN/OUT pointer
+) noexcept nogil:
+    """Check if at init"""
+    cdef bint is_finite
+
+    # Check boundaries
+    if s[0] < session_sizes_cumsum.shape[0] and k == session_sizes_cumsum[s[0]]:
+        s[0] += 1
+        init_k[0] = k
+        at_init[0] = True
+
+    with gil:
+        is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k]))
+
+    if not is_finite:
+        init_k[0] = k + 1
+        at_init[0] = True
+
+    if k - init_k[0] == max_delay:
+        at_init[0] = False
+
+
+@final
+cpdef void _compute_term_lib(
+    const double[:, ::1] X,
+    const double[:, ::1] y_hat,
+    const int max_delay,
+    const int[::1] session_sizes_cumsum,
+    const int[:, ::1] unique_feat_ids,      # IN
+    const int[:, ::1] unique_delay_ids,     # IN
+    double[:, ::1] term_lib,                # OUT
+) noexcept nogil:
+    """
+    Pre-compute all terms for all time steps and store them in term_libs.
+    """
+    cdef:
+        Py_ssize_t n_samples = X.shape[0]
+        Py_ssize_t k, s = 0
+        Py_ssize_t init_k = 0
+        bint at_init = True
+
+    for k in range(n_samples):
+        # Check if at init
+        _check_at_init(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            k,
+            &s,
+            &init_k,
+            &at_init,
+        )
+        if at_init:
+            continue
+
+        # Compute terms for time step k
+        _update_terms(
+            X,
+            y_hat,
+            term_lib[k],
+            unique_feat_ids,
+            unique_delay_ids,
+            k,
+        )
+        with gil:
+            if np.max(np.abs(term_lib[k])) > 1e20:
+                break
+
+
+@final
+cpdef void _compute_dydx(
     const double[:, ::1] X,
     const double[:, ::1] y_hat,
     const int max_delay,
@@ -307,33 +385,16 @@ cpdef void _update_der(
     const int[:, ::1] jac_yyd_ids,
     const int[::1] jac_coef_ids,
     const int[::1] jac_term_ids,
-    const int[:, ::1] hess_yyd_ids,
-    const int[::1] hess_coef_ids,
-    const int[::1] hess_term_ids,
-    const int[:, ::1] hess_yd_ids,
-    const double[::1] p,                          # IN arbitrary vector
-    double[:, ::1] term_libs,               # initialized with 1.0
+    const double[:, ::1] term_lib,               # initialized with 1.0
     double[:, :, ::1] jc,
-    double[:, :, :, ::1] hc,
     double[:, :, ::1] dydx,                 # OUT initialized with 0.0
-    double[:, :, :, ::1] d2ydx2,            # OUT initialized with 0.0
-    double[:, :, ::1] d2ydx2p,              # OUT initialized with 0.0
 ) noexcept nogil:
     """
-    Computation of dydx and d2ydx2 matrix.
-    mode:
-        0 - only dydx
-        1 - both dydx and d2ydx2
-
+    Computation of dydx matrix.
     Returns
     -------
     dydx : ndarray of shape (n_samples, n_outputs, n_x)
         First derivative matrix of the outputs with respect to coefficients and intercepts.
-    d2ydx2 : ndarray of shape (n_samples, n_x, n_outputs (out), n_x)
-        Second derivative matrix of the outputs with respect to coefficients and intercepts
-    d2ydx2p : ndarray of shape (n_samples, n_outputs, n_x)
-        When mode == 2, d2ydx2 is of shape (max_delay + 1, n_x, n_outputs (out), n_x)
-        Second derivative matrix times an arbitrary vector p.
     """
     cdef:
         Py_ssize_t n_samples = y_hat.shape[0]
@@ -342,50 +403,29 @@ cpdef void _update_der(
         Py_ssize_t N = dydx.shape[2]     # n_x
         Py_ssize_t n_const = const_term_ids.shape[0]
         Py_ssize_t n_jac = jac_term_ids.shape[0]
-        Py_ssize_t n_hess = hess_term_ids.shape[0]
         Py_ssize_t init_k = 0
         bint at_init = True
-        bint is_finite
-        bint jac_not_empty, hess_not_empty
+        bint jac_not_empty
 
     with gil:
         jac_not_empty = max_delay > 0 and n_jac > 0
-        hess_not_empty = max_delay > 0 and n_hess > 0
 
     for k in range(n_samples):
         # Check if at init
-        if k == session_sizes_cumsum[s]:
-            s += 1
-            at_init = True
-            init_k = k
-            continue
-        with gil:
-            is_finite = np.all(np.isfinite(X[k])) and np.all(np.isfinite(y_hat[k]))
-        if not is_finite:
-            at_init = True
-            init_k = k + 1
-            continue
-        if k - init_k == max_delay:
-            at_init = False
-
-        if at_init:
-            continue
-        # Compute terms for time step k
-        _update_terms(
+        _check_at_init(
             X,
             y_hat,
-            term_libs[k],
-            unique_feat_ids,
-            unique_delay_ids,
+            max_delay,
+            session_sizes_cumsum,
             k,
+            &s,
+            &init_k,
+            &at_init,
         )
-        with gil:
-            if np.max(np.abs(term_libs[k])) > 1e20:
-                break
-
-        # Update constant terms of Jacobian
+        if at_init:
+            continue
         for i in range(n_const):
-            dydx[k, y_ids[i], i] = term_libs[k, const_term_ids[i]]
+            dydx[k, y_ids[i], i] = term_lib[k, const_term_ids[i]]
 
         # Update intercepts if any
         for i in range(n_const, N):
@@ -398,7 +438,7 @@ cpdef void _update_der(
                 jac_yyd_ids,
                 jac_coef_ids,
                 jac_term_ids,
-                term_libs[k],
+                term_lib[k],
                 jc,
             )
             for d in range(max_delay):
@@ -410,90 +450,242 @@ cpdef void _update_der(
                     1.0, &jc[d, 0, 0], M, &dydx[k-d-1, 0, 0], N,
                     1.0, &dydx[k, 0, 0], N,
                 )
-            if mode == 1 and hess_not_empty:
-                # Update dynamic terms of Hessian
-                _update_d2ydx2(
-                    coefs,
-                    hess_yyd_ids,
-                    hess_coef_ids,
-                    hess_term_ids,
-                    term_libs[k],
-                    hess_yd_ids,
-                    dydx,
-                    jc,
-                    M,
-                    N,
-                    max_delay,
-                    k,
-                    k,
-                    hc,
-                    d2ydx2,
-                )
-
-                # Handle divergence
-                with gil:
-                    if (
-                        not np.all(np.isfinite(d2ydx2[k])) or
-                        np.max(np.abs(d2ydx2[k])) > 1e20
-                    ):
-                        break
-
-            if mode == 2 and hess_not_empty:
-                # Update d2ydx2p
-                # d2ydx2 values move forward by 1 step d2ydx2[k] -> d2ydx2[k-1]
-                memmove(
-                    &d2ydx2[0, 0, 0, 0],
-                    &d2ydx2[1, 0, 0, 0],
-                    max_delay * N * M * N * sizeof(double)
-                )
-                memset(
-                    &d2ydx2[max_delay, 0, 0, 0],
-                    0,
-                    N * M * N * sizeof(double)
-                )
-
-                # Update dynamic terms of Hessian
-                _update_d2ydx2(
-                    coefs,
-                    hess_yyd_ids,
-                    hess_coef_ids,
-                    hess_term_ids,
-                    term_libs[k],
-                    hess_yd_ids,
-                    dydx,
-                    jc,
-                    M,
-                    N,
-                    max_delay,
-                    k,
-                    max_delay,
-                    hc,
-                    d2ydx2,
-                )
-
-                # Compute d2ydx2p[k] = d2ydx2[k].T @ p
-                # d2ydx2[k] shape (N, M, N), p shape (N)
-                # target d2ydx2p[k] shape (M, N)
-                # Treat d2ydx2[k] as (N, M*N) matrix
-                _gemv(
-                    RowMajor, Trans,
-                    N, M * N,
-                    1.0, &d2ydx2[max_delay, 0, 0, 0], M * N,
-                    &p[0], 1,
-                    0.0, &d2ydx2p[k, 0, 0], 1
-                )
-
-                # Handle divergence
-                with gil:
-                    if (
-                        not np.all(np.isfinite(d2ydx2p[k])) or
-                        not np.all(np.isfinite(d2ydx2[max_delay])) or
-                        (np.max(np.abs(d2ydx2p[k])) > 1e20) or
-                        (np.max(np.abs(d2ydx2[max_delay])) > 1e20)
-                    ):
-                        break
 
         # Handle divergence
         with gil:
             if not np.all(np.isfinite(dydx[k])) or np.max(np.abs(dydx[k])) > 1e20:
                 break
+
+
+@final
+cpdef void _compute_d2ydx2(
+    const double[:, ::1] X,
+    const double[:, ::1] y_hat,
+    const int max_delay,
+    const int[::1] session_sizes_cumsum,
+    const int[::1] y_ids,
+    const double[::1] coefs,
+    const int[:, ::1] unique_feat_ids,      # IN
+    const int[:, ::1] unique_delay_ids,     # IN
+    const int[::1] const_term_ids,
+    const int[:, ::1] jac_yyd_ids,
+    const int[::1] jac_coef_ids,
+    const int[::1] jac_term_ids,
+    const int[:, ::1] hess_yyd_ids,
+    const int[::1] hess_coef_ids,
+    const int[::1] hess_term_ids,
+    const int[:, ::1] hess_yd_ids,
+    const double[:, ::1] term_lib,
+    const double[:, :, ::1] dydx,
+    double[:, :, ::1] jc,
+    double[:, :, :, ::1] hc,
+    double[:, :, :, ::1] d2ydx2,            # OUT initialized with 0.0
+) noexcept nogil:
+    """
+    Computation of d2ydx2 matrix.
+    Returns
+    -------
+    d2ydx2 : ndarray of shape (n_samples, n_x, n_outputs (out), n_x)
+        Second derivative matrix of the outputs with respect to coefficients and intercepts
+    """
+    cdef:
+        Py_ssize_t n_samples = y_hat.shape[0]
+        Py_ssize_t k, s = 0
+        Py_ssize_t M = jc.shape[1]      # n_outputs
+        Py_ssize_t N = dydx.shape[2]     # n_x
+        Py_ssize_t n_jac = jac_term_ids.shape[0]
+        Py_ssize_t n_hess = hess_term_ids.shape[0]
+        Py_ssize_t init_k = 0
+        bint at_init = True
+        bint jac_not_empty
+        bint hess_not_empty
+
+    with gil:
+        jac_not_empty = max_delay > 0 and n_jac > 0
+        hess_not_empty = max_delay > 0 and n_hess > 0
+
+    for k in range(n_samples):
+        # Check if at init
+        _check_at_init(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            k,
+            &s,
+            &init_k,
+            &at_init,
+        )
+        if at_init:
+            continue
+
+        if jac_not_empty:
+            _update_jc(
+                coefs,
+                jac_yyd_ids,
+                jac_coef_ids,
+                jac_term_ids,
+                term_lib[k],
+                jc,
+            )
+
+        if hess_not_empty:
+            # Update dynamic terms of Hessian
+            _update_d2ydx2(
+                coefs,
+                hess_yyd_ids,
+                hess_coef_ids,
+                hess_term_ids,
+                term_lib[k],
+                hess_yd_ids,
+                dydx,
+                jc,
+                M,
+                N,
+                max_delay,
+                k,
+                k,
+                hc,
+                d2ydx2,
+            )
+
+            # Handle divergence
+            with gil:
+                if (
+                    not np.all(np.isfinite(d2ydx2[k])) or
+                    np.max(np.abs(d2ydx2[k])) > 1e20
+                ):
+                    break
+
+
+@final
+cpdef void _compute_d2ydx2p(
+    const double[:, ::1] X,
+    const double[:, ::1] y_hat,
+    const int max_delay,
+    const int[::1] session_sizes_cumsum,
+    const int[::1] y_ids,
+    const double[::1] coefs,
+    const int[:, ::1] unique_feat_ids,      # IN
+    const int[:, ::1] unique_delay_ids,     # IN
+    const int[::1] const_term_ids,
+    const int[:, ::1] jac_yyd_ids,
+    const int[::1] jac_coef_ids,
+    const int[::1] jac_term_ids,
+    const int[:, ::1] hess_yyd_ids,
+    const int[::1] hess_coef_ids,
+    const int[::1] hess_term_ids,
+    const int[:, ::1] hess_yd_ids,
+    const double[:, ::1] term_lib,
+    const double[:, :, ::1] dydx,
+    const double[::1] p,                          # IN arbitrary vector
+    double[:, :, ::1] jc,
+    double[:, :, :, ::1] hc,
+    double[:, :, :, ::1] d2ydx2,            # OUT initialized with 0.0
+    double[:, :, ::1] d2ydx2p,              # OUT initialized with 0.0
+) noexcept nogil:
+    """
+    Computation of d2ydx2-vector product matrix.
+
+    Returns
+    -------
+    d2ydx2 : ndarray of shape (max_delay + 1, n_x, n_outputs (out), n_x)
+        Second derivative matrix of the outputs with respect to coefficients and intercepts
+    d2ydx2p : ndarray of shape (n_samples, n_outputs, n_x)
+        Second derivative matrix times an arbitrary vector p.
+    """
+    cdef:
+        Py_ssize_t n_samples = y_hat.shape[0]
+        Py_ssize_t k, s = 0
+        Py_ssize_t M = jc.shape[1]      # n_outputs
+        Py_ssize_t N = dydx.shape[2]     # n_x
+        Py_ssize_t n_jac = jac_term_ids.shape[0]
+        Py_ssize_t n_hess = hess_term_ids.shape[0]
+        Py_ssize_t init_k = 0
+        bint at_init = True
+        bint jac_not_empty
+        bint hess_not_empty
+
+    with gil:
+        jac_not_empty = max_delay > 0 and n_jac > 0
+        hess_not_empty = max_delay > 0 and n_hess > 0
+
+    for k in range(n_samples):
+        # Check if at init
+        _check_at_init(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            k,
+            &s,
+            &init_k,
+            &at_init,
+        )
+        if at_init:
+            continue
+
+        if jac_not_empty:
+            _update_jc(
+                coefs,
+                jac_yyd_ids,
+                jac_coef_ids,
+                jac_term_ids,
+                term_lib[k],
+                jc,
+            )
+
+        if hess_not_empty:
+            # Update d2ydx2p
+            # d2ydx2 values move forward by 1 step d2ydx2[k] -> d2ydx2[k-1]
+            memmove(
+                &d2ydx2[0, 0, 0, 0],
+                &d2ydx2[1, 0, 0, 0],
+                max_delay * N * M * N * sizeof(double)
+            )
+            memset(
+                &d2ydx2[max_delay, 0, 0, 0],
+                0,
+                N * M * N * sizeof(double)
+            )
+
+            # Update dynamic terms of Hessian
+            _update_d2ydx2(
+                coefs,
+                hess_yyd_ids,
+                hess_coef_ids,
+                hess_term_ids,
+                term_lib[k],
+                hess_yd_ids,
+                dydx,
+                jc,
+                M,
+                N,
+                max_delay,
+                k,
+                max_delay,
+                hc,
+                d2ydx2,
+            )
+
+            # Compute d2ydx2p[k] = d2ydx2[k].T @ p
+            # d2ydx2[k] shape (N, M, N), p shape (N)
+            # target d2ydx2p[k] shape (M, N)
+            # Treat d2ydx2[k] as (N, M*N) matrix
+            _gemv(
+                RowMajor, Trans,
+                N, M * N,
+                1.0, &d2ydx2[max_delay, 0, 0, 0], M * N,
+                &p[0], 1,
+                0.0, &d2ydx2p[k, 0, 0], 1
+            )
+
+            # Handle divergence
+            with gil:
+                if (
+                    not np.all(np.isfinite(d2ydx2p[k])) or
+                    not np.all(np.isfinite(d2ydx2[max_delay])) or
+                    (np.max(np.abs(d2ydx2p[k])) > 1e20) or
+                    (np.max(np.abs(d2ydx2[max_delay])) > 1e20)
+                ):
+                    break
diff --git a/fastcan/narx/tests/test_minimize.py b/fastcan/narx/tests/test_minimize.py
new file mode 100644
index 0000000..019105a
--- /dev/null
+++ b/fastcan/narx/tests/test_minimize.py
@@ -0,0 +1,94 @@
+"""Test NARX with minimize solver"""
+
+import numpy as np
+import pytest
+from sklearn.metrics import r2_score
+
+from fastcan.narx import make_narx
+from fastcan.utils import mask_missing_values
+
+
+@pytest.mark.parametrize("multi_output", [False, True])
+@pytest.mark.parametrize("nan", [False, True])
+@pytest.mark.parametrize(
+    "method", ["default", None, "L-BFGS-B", "dogleg", "trust-ncg", "CG", "Nelder-Mead"]
+)
+def test_minimize(nan, multi_output, method):
+    """Test NARX with minimize solver on synthetic data."""
+
+    def make_data(multi_output, nan, rng):
+        if multi_output:
+            n_samples = 1000
+            max_delay = 3
+            e0 = rng.normal(0, 0.1, n_samples)
+            e1 = rng.normal(0, 0.02, n_samples)
+            u0 = rng.uniform(0, 1, n_samples + max_delay)
+            u1 = rng.normal(0, 0.1, n_samples + max_delay)
+            y0 = np.zeros(n_samples + max_delay)
+            y1 = np.zeros(n_samples + max_delay)
+            for i in range(max_delay, n_samples + max_delay):
+                y0[i] = (
+                    0.5 * y0[i - 1]
+                    + 0.8 * y1[i - 1]
+                    + 0.3 * u0[i] ** 2
+                    + 2 * u0[i - 1] * u0[i - 3]
+                    + 1.5 * u0[i - 2] * u1[i - 3]
+                    + 1
+                )
+                y1[i] = (
+                    0.6 * y1[i - 1]
+                    - 0.2 * y0[i - 1] * y1[i - 2]
+                    + 0.3 * u1[i] ** 2
+                    + 1.5 * u1[i - 2] * u0[i - 3]
+                    + 0.5
+                )
+            y = np.c_[y0[max_delay:] + e0, y1[max_delay:] + e1]
+            X = np.c_[u0[max_delay:], u1[max_delay:]]
+            n_outputs = 2
+        else:
+            rng = np.random.default_rng(12345)
+            n_samples = 1000
+            max_delay = 3
+            e = rng.normal(0, 0.1, n_samples)
+            u0 = rng.uniform(0, 1, n_samples + max_delay)
+            u1 = rng.normal(0, 0.1, n_samples)
+            y = np.zeros(n_samples + max_delay)
+            for i in range(max_delay, n_samples + max_delay):
+                y[i] = (
+                    0.5 * y[i - 1]
+                    + 0.3 * u0[i] ** 2
+                    + 2 * u0[i - 1] * u0[i - 3]
+                    + 1.5 * u0[i - 2] * u1[i - max_delay]
+                    + 1
+                )
+            y = y[max_delay:] + e
+            X = np.c_[u0[max_delay:], u1]
+            n_outputs = 1
+
+        if nan:
+            X_nan_ids = rng.choice(n_samples, 20, replace=False)
+            y_nan_ids = rng.choice(n_samples, 10, replace=False)
+            X[X_nan_ids] = np.nan
+            y[y_nan_ids] = np.nan
+
+        X = np.asfortranarray(X)
+        y = np.asfortranarray(y)
+        return X, y, n_outputs
+
+    rng = np.random.default_rng(12345)
+    X, y, _ = make_data(multi_output, nan, rng)
+
+    fit_kwargs = {"solver": "minimize"}
+    if method != "default":
+        fit_kwargs["method"] = method
+
+    narx_score = make_narx(
+        X,
+        y,
+        n_terms_to_select=[5, 4] if multi_output else 4,
+        max_delay=3,
+        poly_degree=2,
+        verbose=0,
+    ).fit(X, y, coef_init="one_step_ahead", **fit_kwargs)
+
+    assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.5
diff --git a/fastcan/narx/tests/test_narx_hess.py b/fastcan/narx/tests/test_narx_hess.py
index 85d4329..d6b7d83 100644
--- a/fastcan/narx/tests/test_narx_hess.py
+++ b/fastcan/narx/tests/test_narx_hess.py
@@ -10,7 +10,13 @@
 from scipy.optimize import approx_fprime
 
 from fastcan.narx import NARX
-from fastcan.narx._narx_fast import _predict, _update_der
+from fastcan.narx._base import _OptMemoize
+from fastcan.narx._narx_fast import (
+    _compute_d2ydx2,
+    _compute_dydx,
+    _compute_term_lib,
+    _predict,
+)
 
 
 def _hessian_wrapper(
@@ -29,10 +35,15 @@ def _hessian_wrapper(
     jac_delay_ids,
     return_hess=True,
 ):
-    mode = 1
+    mode = 2
     (hess_yyd_ids, hess_yd_ids, hess_coef_ids, hess_feat_ids, hess_delay_ids) = (
         NARX._get_hc_ids(
-            jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids, X.shape[1], mode=1
+            jac_yyd_ids,
+            jac_coef_ids,
+            jac_feat_ids,
+            jac_delay_ids,
+            X.shape[1],
+            mode=mode,
         )
     )
 
@@ -56,9 +67,15 @@ def _hessian_wrapper(
         y_ids = np.asarray(output_ids, dtype=np.int32)
 
     if return_hess:
-        res, jac, hess, _ = NARX._func(
-            coef_intercept,
-            mode,
+        memoize = _OptMemoize(
+            NARX._opt_residual,
+            NARX._opt_jac,
+            NARX._opt_hess,
+            NARX._opt_hessp,
+            sample_weight_sqrt,
+        )
+
+        args = (
             X,
             y,
             feat_ids,
@@ -80,6 +97,11 @@ def _hessian_wrapper(
             hess_term_ids,
             hess_yd_ids,
         )
+
+        hess = memoize.hess(coef_intercept, *args)
+        jac = memoize.jac(coef_intercept, *args)
+        res = memoize.residual(coef_intercept, *args)
+
         return res, jac, hess
     else:
         # Compute prediction
@@ -106,19 +128,43 @@ def _hessian_wrapper(
             y_hat,
         )
 
+        term_lib = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float)
+        _compute_term_lib(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            unique_feat_ids,
+            unique_delay_ids,
+            term_lib,
+        )
+
         # Compute Jacobian
         dydx = np.zeros((n_samples, n_outputs, n_x), dtype=float)
         jc = np.zeros((max_delay, n_outputs, n_outputs), dtype=float)
 
-        term_libs = np.ones((n_samples, unique_feat_ids.shape[0]), dtype=float)
+        _compute_dydx(
+            X,
+            y_hat,
+            max_delay,
+            session_sizes_cumsum,
+            y_ids,
+            coef,
+            unique_feat_ids,
+            unique_delay_ids,
+            const_term_ids,
+            jac_yyd_ids,
+            jac_coef_ids,
+            jac_term_ids,
+            term_lib,
+            jc,
+            dydx,
+        )
+
         hc = np.zeros((n_x, max_delay, n_outputs, n_outputs), dtype=float)
         d2ydx2 = np.zeros((n_samples, n_x, n_outputs, n_x), dtype=float)
 
-        p = np.zeros(1, dtype=float)
-        d2ydx2p = np.zeros((1, 1, 1), dtype=float)
-
-        _update_der(
-            mode,
+        _compute_d2ydx2(
             X,
             y_hat,
             max_delay,
@@ -135,14 +181,13 @@ def _hessian_wrapper(
             hess_coef_ids,
             hess_term_ids,
             hess_yd_ids,
-            p,
-            term_libs,
+            term_lib,
+            dydx,
             jc,
             hc,
-            dydx,
             d2ydx2,
-            d2ydx2p,
         )
+
         return y_hat, d2ydx2
 
 
@@ -511,7 +556,7 @@ def get_narx_hess(feat_ids, delay_ids, output_ids):
         n_features = feat_ids.max() - n_outputs + 1
         n_x = n_terms + n_outputs
 
-        mode = 1
+        mode = 2
 
         jac_yyd_ids, jac_coef_ids, jac_feat_ids, jac_delay_ids = NARX._get_jc_ids(
             feat_ids, delay_ids, output_ids, n_features
diff --git a/fastcan/narx/tests/test_narx_hessp.py b/fastcan/narx/tests/test_narx_hessp.py
index f5d33f5..a731b6f 100644
--- a/fastcan/narx/tests/test_narx_hessp.py
+++ b/fastcan/narx/tests/test_narx_hessp.py
@@ -5,7 +5,7 @@
 from numpy.testing import assert_allclose
 
 from fastcan.narx import NARX
-from fastcan.narx._base import _MemoizeOpt
+from fastcan.narx._base import _OptMemoize
 
 
 @pytest.mark.parametrize("seed", [10, 42, 123, 999, 2024])
@@ -66,15 +66,21 @@ def test_random(seed):
         # call arguments: (fun, x0, ..., args=args_tuple, ...)
         call_kwargs = mock_ls.call_args.kwargs
         args_tuple = call_kwargs["args"]
+        sample_weight_sqrt = args_tuple[6]
+
+    # 1. Compute Hessian matrix using mode=2
+    memoize = _OptMemoize(
+        NARX._opt_residual,
+        NARX._opt_jac,
+        NARX._opt_hess,
+        NARX._opt_hessp,
+        sample_weight_sqrt,
+    )
+    H = memoize.hess(coef_init, *args_tuple)
 
-    # 1. Compute Hessian matrix using mode=1
-    opt_hess = _MemoizeOpt(NARX._func, mode=1)
-    H = opt_hess.hess(coef_init, *args_tuple)
-
-    # 2. Compute Hessp product using mode=2 with random vector p
+    # 2. Compute Hessp product using mode=3 with random vector p
     p = rng.standard_normal(len(coef_init)) * 0.01
-    opt_hessp = _MemoizeOpt(NARX._func, mode=2)
-    Hp = opt_hessp.hessp(coef_init, p, *args_tuple)
+    Hp = memoize.hessp(coef_init, p, *args_tuple)
 
     # 3. Verify Hp == H @ p
     Hp_expected = H @ p
@@ -184,15 +190,21 @@ def test_complex():
         # Capture arguments passed to least_squares
         call_kwargs = mock_ls.call_args.kwargs
         args_tuple = call_kwargs["args"]
+        sample_weight_sqrt = args_tuple[6]
+
+    # 1. Compute Hessian matrix using mode=2
+    memoize = _OptMemoize(
+        NARX._opt_residual,
+        NARX._opt_jac,
+        NARX._opt_hess,
+        NARX._opt_hessp,
+        sample_weight_sqrt,
+    )
+    H = memoize.hess(coef_init, *args_tuple)
 
-    # 1. Compute Hessian matrix using mode=1
-    opt_hess = _MemoizeOpt(NARX._func, mode=1)
-    H = opt_hess.hess(coef_init, *args_tuple)
-
-    # 2. Compute Hessp product using mode=2 with random vector p
+    # 2. Compute Hessp product using mode=3 with random vector p
     p = rng.standard_normal(len(coef_init))
-    opt_hessp = _MemoizeOpt(NARX._func, mode=2)
-    Hp = opt_hessp.hessp(coef_init, p, *args_tuple)
+    Hp = memoize.hessp(coef_init, p, *args_tuple)
 
     # 3. Verify Hp == H @ p
     Hp_expected = H @ p
diff --git a/fastcan/narx/tests/test_narx_jac.py b/fastcan/narx/tests/test_narx_jac.py
index 9009947..165c0e1 100644
--- a/fastcan/narx/tests/test_narx_jac.py
+++ b/fastcan/narx/tests/test_narx_jac.py
@@ -6,6 +6,7 @@
 from sklearn.metrics import r2_score
 
 from fastcan.narx import NARX, make_narx
+from fastcan.narx._base import _OptMemoize
 from fastcan.narx._narx_fast import _predict
 
 
@@ -50,9 +51,15 @@ def _derivative_wrapper(
     else:
         y_ids = np.asarray(output_ids, dtype=np.int32)
 
-    _, jac, _, _ = NARX._func(
-        coef_intercept,
-        0,
+    memoize = _OptMemoize(
+        NARX._opt_residual,
+        NARX._opt_jac,
+        NARX._opt_hess,
+        NARX._opt_hessp,
+        sample_weight_sqrt,
+    )
+
+    args = (
         X,
         y,
         feat_ids,
@@ -74,6 +81,8 @@ def _derivative_wrapper(
         hess_term_ids,
         hess_yd_ids,
     )
+
+    jac = memoize.jac(coef_intercept, *args)
     return jac
 
 

From 941ca3e63d2f8081014b8e36808fa71c06e25706 Mon Sep 17 00:00:00 2001
From: sikai zhang <matthew.szhang91@gmail.com>
Date: Wed, 14 Jan 2026 09:21:19 +0800
Subject: [PATCH 2/2] fix versionadded and test_minimize threshold

---
 fastcan/_fastcan.py                 | 2 +-
 fastcan/narx/_base.py               | 2 +-
 fastcan/narx/_utils.py              | 2 +-
 fastcan/narx/tests/test_minimize.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py
index e556c01..8d7c116 100644
--- a/fastcan/_fastcan.py
+++ b/fastcan/_fastcan.py
@@ -53,7 +53,7 @@ class FastCan(SelectorMixin, BaseEstimator):
         When `beam_width` = 1, use greedy search.
         When `beam_width` > 1, use beam search.
 
-        .. versionadded:: 0.5
+        .. versionadded:: 0.5.0
 
     verbose : int, default=1
         The verbosity level.
diff --git a/fastcan/narx/_base.py b/fastcan/narx/_base.py
index 8d6785e..41b8396 100644
--- a/fastcan/narx/_base.py
+++ b/fastcan/narx/_base.py
@@ -367,7 +367,7 @@ def fit(
             The sum of session_sizes should be equal to n_samples.
             If None, the whole data is treated as one session.
 
-            .. versionadded:: 0.5
+            .. versionadded:: 0.5.0
 
         solver : {'least_squares', 'minimize'}, default='least_squares'
             The SciPy solver for optimization.
diff --git a/fastcan/narx/_utils.py b/fastcan/narx/_utils.py
index 9f86ba9..3ce61d1 100644
--- a/fastcan/narx/_utils.py
+++ b/fastcan/narx/_utils.py
@@ -199,7 +199,7 @@ def make_narx(
         The sum of session_sizes should be equal to n_samples.
         If None, the whole data is treated as one session.
 
-        .. versionadded:: 0.5
+        .. versionadded:: 0.5.0
 
     max_candidates : int, default=None
         Maximum number of candidate polynomial terms retained before selection.
diff --git a/fastcan/narx/tests/test_minimize.py b/fastcan/narx/tests/test_minimize.py
index 019105a..3878cdc 100644
--- a/fastcan/narx/tests/test_minimize.py
+++ b/fastcan/narx/tests/test_minimize.py
@@ -91,4 +91,4 @@ def make_data(multi_output, nan, rng):
         verbose=0,
     ).fit(X, y, coef_init="one_step_ahead", **fit_kwargs)
 
-    assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.5
+    assert r2_score(*mask_missing_values(y, narx_score.predict(X, y_init=y))) > 0.97