diff --git a/autoemulate/experimental/exploratory/hm_refactor.ipynb b/autoemulate/experimental/exploratory/hm_refactor.ipynb
new file mode 100644
index 000000000..2f4dc22a6
--- /dev/null
+++ b/autoemulate/experimental/exploratory/hm_refactor.ipynb
@@ -0,0 +1,454 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Refactored HistoryMatching workflow example\n",
+ "\n",
+ "## 1. Set up"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "# imports from main\n",
+ "from autoemulate.history_matching_dashboard import HistoryMatchingDashboard\n",
+ "\n",
+ "# imports from experimental\n",
+ "from autoemulate.experimental.emulators.gaussian_process.exact import (\n",
+ " GaussianProcessExact,\n",
+ ")\n",
+ "from autoemulate.experimental.simulations.epidemic import Epidemic\n",
+ "from autoemulate.experimental.history_matching import HistoryMatching, HistoryMatchingWorkflow"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Simulate data & train a GP\n",
+ "\n",
+ "Set up a `Simulator` and generate data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Running simulations: 100%|██████████| 10/10 [00:00<00:00, 553.21it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Successfully completed 10/10 simulations (100.0%)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "simulator = Epidemic()\n",
+ "x = simulator.sample_inputs(10)\n",
+ "y = simulator.forward_batch(x)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "In a full workflow, the next step would be done with `AutoEmulate.compare()`; here we fit a GP directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gp = GaussianProcessExact(x, y)\n",
+ "gp.fit(x, y)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. HistoryMatching\n",
+ "\n",
+    "First, one can instantiate `HistoryMatching` without a simulator or an emulator; it can then be used to calculate implausibility for a given set of predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+   "outputs": [],
+ "source": [
+ "# generate predictions for new x inputs\n",
+ "x_new = simulator.sample_inputs(10)\n",
+    "output = gp.predict(x_new.to(torch.float32))\n",
+ "pred_means, pred_vars = (\n",
+ " output.mean.float().detach(),\n",
+ " output.variance.float().detach(),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define observed data with means and variances\n",
+ "observations = {\"infection_rate\": (0.3, 0.05)}\n",
+ "\n",
+ "# Create history matcher\n",
+ "hm = HistoryMatching(\n",
+ " observations=observations,\n",
+ " threshold=3.0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "implausibility = hm.calculate_implausibility(pred_means, pred_vars)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Once implausibility has been calculated, it can be used to identify the indices of NROY parameters:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "hm.get_nroy(implausibility)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Or to filter parameters at those NROY indices:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[0.1709, 0.0508],\n",
+ " [0.4401, 0.0828],\n",
+ " [0.3762, 0.0232],\n",
+ " [0.1998, 0.1910],\n",
+ " [0.3041, 0.1451],\n",
+ " [0.2680, 0.0296],\n",
+ " [0.4938, 0.1741],\n",
+ " [0.2367, 0.0964],\n",
+ " [0.1025, 0.1294],\n",
+ " [0.3978, 0.1181]])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "hm.get_nroy(implausibility, x_new)"
+ ]
+ },
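+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The complementary `get_ro` method returns the ruled out (RO) indices or, if parameters are passed, the parameter values at those indices. An illustrative (not executed) sketch:\n",
+    "\n",
+    "```python\n",
+    "# indices of samples whose implausibility exceeds the threshold\n",
+    "ro_idx = hm.get_ro(implausibility)\n",
+    "\n",
+    "# parameter values at the ruled out indices\n",
+    "ro_params = hm.get_ro(implausibility, x_new)\n",
+    "```"
+   ]
+  },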
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, `HistoryMatching` can be instantiated with an emulator to make extracting prediction means and variances easier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[0.0526],\n",
+ " [0.0409],\n",
+ " [0.0536],\n",
+ " [0.0313],\n",
+ " [0.0283],\n",
+ " [0.0547],\n",
+ " [0.0281],\n",
+ " [0.0363],\n",
+ " [0.0300],\n",
+ " [0.0315]])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hm_with_emul = HistoryMatching(\n",
+ " observations=observations,\n",
+ " threshold=3.0,\n",
+ " emulator=gp\n",
+ ")\n",
+ "\n",
+ "pred_means, pred_vars = hm_with_emul.emulator_predict(x_new)\n",
+ "hm_with_emul.calculate_implausibility(pred_means, pred_vars)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Iterative HistoryMatchingWorkflow\n",
+ "\n",
+ "We also have a separate class that implements an iterative sample-predict-refit workflow:\n",
+    "- sample `n_test_samples` parameters to test from the NROY space\n",
+    "- use the emulator to filter out implausible samples and update the NROY space\n",
+    "- run the simulator for (at most) `n_simulations` parameters sampled from the NROY space\n",
+ "- refit the emulator using the simulated data\n",
+ "\n",
+    "The object maintains and updates its internal state each time `run()` is called, so this can be repeated as many times as the user wants."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Running simulations: 100%|██████████| 20/20 [00:00<00:00, 1140.05it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Successfully completed 20/20 simulations (100.0%)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "hmw = HistoryMatchingWorkflow(\n",
+ " simulator=simulator,\n",
+ " emulator=gp,\n",
+ " observations=observations,\n",
+ " threshold=3.0,\n",
+ " train_x=x,\n",
+ " train_y=y\n",
+ ")\n",
+ "\n",
+ "test_parameters, impl_scores = hmw.run(n_simulations=20, n_test_samples=100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(torch.Size([100, 2]), torch.Size([100, 1]))"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_parameters.shape, impl_scores.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "We can call `run()` as many times as we want; the class stores state from previous runs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Running simulations: 100%|██████████| 20/20 [00:00<00:00, 1008.67it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Successfully completed 20/20 simulations (100.0%)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "test_parameters, impl_scores = hmw.run(n_simulations=20, n_test_samples=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Integration with dashboard"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dashboard = HistoryMatchingDashboard(\n",
+ " samples=test_parameters,\n",
+ " impl_scores=impl_scores,\n",
+    "    param_names=simulator.param_names,\n",
+    "    output_names=simulator.output_names,\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c0a1f4ff869a4acf86ed20bf20689055",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "HTML(value='History Matching Dashboard')"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f83c37cf211246d6a8753961a7c02639",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HBox(children=(Dropdown(description='Plot Type:', options=('Parameter vs Implausibility', 'Pair…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "dashboard.display()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/autoemulate/experimental/history_matching.py b/autoemulate/experimental/history_matching.py
new file mode 100644
index 000000000..5c067f3bc
--- /dev/null
+++ b/autoemulate/experimental/history_matching.py
@@ -0,0 +1,487 @@
+import warnings
+
+import torch
+
+from autoemulate.experimental.device import TorchDeviceMixin
+from autoemulate.experimental.emulators import GaussianProcessExact
+from autoemulate.experimental.simulations.base import Simulator
+from autoemulate.experimental.types import DeviceLike, GaussianLike, TensorLike
+
+
+class HistoryMatching(TorchDeviceMixin):
+    r"""
+ History matching is a model calibration method, which uses observed data to
+ rule out ``implausible`` parameter values. The implausibility metric is:
+
+ .. math::
+        I_i(\bar{x_0}) = \frac{|z_i - \mathbb{E}(f_i(\bar{x_0}))|}
+        {\sqrt{\text{Var}[z_i - \mathbb{E}(f_i(\bar{x_0}))]}}
+
+ Queried parameters above a given implausibility threshold are ruled out (RO)
+ whereas all other parameters are marked as not ruled out yet (NROY).
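+
+    Examples
+    --------
+    A minimal illustrative sketch (the observation name, values and predictions
+    below are arbitrary; predictions have shape [n_samples, n_outputs])::
+
+        import torch
+
+        hm = HistoryMatching(observations={"y": (0.3, 0.05)}, threshold=3.0)
+        pred_means = torch.tensor([[0.31], [0.90]])
+        pred_vars = torch.tensor([[0.01], [0.01]])
+        impl = hm.calculate_implausibility(pred_means, pred_vars)
+        nroy_idx = hm.get_nroy(impl)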
+ """
+
+ def __init__( # noqa: PLR0913 allow too many arguments since all currently required
+ self,
+ observations: dict[str, tuple[float, float]] | dict[str, float],
+ threshold: float = 3.0,
+ model_discrepancy: float = 0.0,
+ rank: int = 1,
+ emulator: GaussianProcessExact | None = None,
+ device: DeviceLike | None = None,
+ ):
+ """
+ Initialize the history matching object.
+
+ Parameters
+ ----------
+        observations: dict[str, tuple[float, float]] | dict[str, float]
+            For each output variable, specifies the observed (mean, variance). If
+            there is no uncertainty in the observations, provide just the observed
+            value.
+ threshold: float
+ Implausibility threshold (query points with implausibility scores that
+ exceed this value are ruled out). Defaults to 3, which is considered
+ a good value for simulations with a single output.
+ model_discrepancy: float
+ Additional variance to include in the implausibility calculation.
+ rank: int
+ Scoring method for multi-output problems. Must be 1 <= rank <= n_outputs.
+ When the implausibility scores are ordered across outputs, it indicates
+ which rank to use when determining whether the query point is NROY. The
+ default of ``1`` indicates that the largest implausibility will be used.
+ emulator: GaussianProcessExact
+ TODO: make this EmulatorWithUncertainty once implemented (see #542)
+ An optional trained Gaussian Process emulator.
+ device: DeviceLike | None
+            The device to use. If None, the default torch device is used.
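+
+        Examples
+        --------
+        Both observation formats are accepted (the output name and values are
+        illustrative)::
+
+            observations = {"infection_rate": (0.3, 0.05)}  # (mean, variance)
+            observations = {"infection_rate": 0.3}  # mean only, no variance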
+ """
+ TorchDeviceMixin.__init__(self, device=device)
+
+ self.threshold = threshold
+ self.discrepancy = model_discrepancy
+ self.out_dim = len(observations)
+ self.emulator = emulator
+
+ # TODO: make this EmulatorWithUncertainty once implemented (see #542)
+ if isinstance(self.emulator, GaussianProcessExact):
+ self.emulator.device = self.device
+
+ if rank > self.out_dim or rank < 1:
+ raise ValueError(
+ f"Rank ({rank}) is outside valid range between 1 and output dimension "
+ f"of simulator ({self.out_dim})",
+ )
+ self.rank = rank
+
+ # Save mean and variance of observations, shape: [1, n_outputs]
+ self.obs_means, self.obs_vars = self._process_observations(observations)
+
+ def _process_observations(
+ self,
+ observations: dict[str, tuple[float, float]] | dict[str, float],
+ ) -> tuple[TensorLike, TensorLike]:
+ """
+        Turn observations into tensors of shape [1, n_outputs].
+
+ Parameters
+ ----------
+        observations: dict[str, tuple[float, float]] | dict[str, float]
+            For each output variable, specifies the observed (mean, variance). If
+            there is no uncertainty in the observations, provide just the observed
+            value.
+
+ Returns
+ -------
+ tuple[TensorLike, TensorLike]
+            Tensors of the observed means and the associated variances (which can
+            be 0).
+ """
+ values = torch.tensor(list(observations.values()), device=self.device)
+
+ # No variance
+ if values.ndim == 1:
+ means = values
+ variances = torch.zeros_like(means, device=self.device)
+ # Values are (mean, variance)
+ elif values.ndim == 2:
+ means = values[:, 0]
+ variances = values[:, 1]
+ else:
+ msg = "Observations must be either float or tuple of two floats."
+ raise ValueError(msg)
+
+ # Reshape observation tensors for broadcasting
+ return means.view(1, -1), variances.view(1, -1)
+
+ def _create_nroy_mask(self, implausibility: TensorLike) -> TensorLike:
+ """
+ Create mask for NROY points based on rank.
+
+ Parameters
+ ----------
+ implausibility: TensorLike
+ Tensor of implausibility scores for tested parameters.
+
+ Returns
+ -------
+ TensorLike
+            Boolean tensor indicating whether each sample is NROY given the
+            self.rank and self.threshold values.
+ """
+ # Sort implausibilities for each sample (descending)
+ I_sorted, _ = torch.sort(implausibility, dim=1, descending=True)
+ # The rank-th highest output implausibility must be <= threshold
+ return I_sorted[:, self.rank - 1] <= self.threshold
+
+ def emulator_predict(self, x: TensorLike) -> tuple[TensorLike, TensorLike]:
+ """
+ Return emulator predicted mean and variance for input parameters `x`.
+
+ Parameters
+ ----------
+ x: TensorLike
+ Tensor of input parameters to make predictions for.
+
+ Returns
+ -------
+ tuple[TensorLike, TensorLike]
+ The emulator predicted mean and variance for `x`.
+ """
+ if self.emulator is not None:
+ output = self.emulator.predict(x)
+ assert isinstance(output, GaussianLike)
+ assert output.variance.ndim == 2
+ return (
+ output.mean.float().detach(),
+ output.variance.float().detach(),
+ )
+ msg = "Need an emulator to make predictions."
+ raise ValueError(msg)
+
+ def get_nroy(
+ self, implausibility: TensorLike, x: TensorLike | None = None
+ ) -> TensorLike:
+ """
+ Get indices of NROY points from implausibility scores. If `x`
+ is provided, returns parameter values at NROY indices.
+
+ Parameters
+ ----------
+ implausibility: TensorLike
+ Tensor of implausibility scores for tested input parameters.
+        x: TensorLike | None
+ Optional tensor of scored input parameters.
+
+ Returns
+ -------
+ TensorLike
+ Indices of NROY points or `x` parameters at NROY indices.
+ """
+ nroy_mask = self._create_nroy_mask(implausibility)
+ idx = torch.where(nroy_mask)[0]
+ if x is None:
+ return idx
+ return x[idx]
+
+ def get_ro(
+ self, implausibility: TensorLike, x: TensorLike | None = None
+ ) -> TensorLike:
+ """
+ Get indices of RO points from implausibility scores. If `x`
+ is provided, returns parameter values at RO indices.
+
+ Parameters
+ ----------
+ implausibility: TensorLike
+ Tensor of implausibility scores for tested input parameters.
+        x: TensorLike | None
+            Optional tensor of scored input parameters.
+
+ Returns
+ -------
+ TensorLike
+ Indices of RO points or `x` parameters at RO indices.
+ """
+ nroy_mask = self._create_nroy_mask(implausibility)
+ idx = torch.where(~nroy_mask)[0]
+ if x is None:
+ return idx
+ return x[idx]
+
+ def calculate_implausibility(
+ self,
+ pred_means: TensorLike, # [n_samples, n_outputs]
+ pred_vars: TensorLike, # [n_samples, n_outputs]
+ ) -> TensorLike:
+ """
+ Calculate implausibility scores.
+
+ Parameters
+ ----------
+ pred_means: TensorLike
+ Tensor of prediction means [n_samples, n_outputs]
+ pred_vars: TensorLike
+ Tensor of prediction variances [n_samples, n_outputs].
+
+ Returns
+ -------
+ TensorLike
+ Tensor of implausibility scores.
+ """
+ # Additional variance due to model discrepancy (defaults to 0)
+ discrepancy = torch.full_like(
+ self.obs_vars, self.discrepancy, device=self.device
+ )
+
+ # Calculate total variance
+ Vs = pred_vars + discrepancy + self.obs_vars
+
+ # Calculate implausibility
+ return torch.abs(self.obs_means - pred_means) / torch.sqrt(Vs)
+
+
+class HistoryMatchingWorkflow(HistoryMatching):
+ """
+ Run history matching workflow:
+ - sample parameter values to test from the current NROY parameter space
+    - use the emulator to rule out implausible parameters and update the NROY space
+    - make predictions for a subset of the NROY parameters using the simulator
+ - refit the emulator using the simulated data
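+
+    Examples
+    --------
+    An illustrative sketch, assuming a trained ``GaussianProcessExact`` called
+    ``gp`` and its training data ``x``, ``y`` from the ``Epidemic`` simulator::
+
+        from autoemulate.experimental.simulations.epidemic import Epidemic
+
+        hmw = HistoryMatchingWorkflow(
+            simulator=Epidemic(),
+            emulator=gp,
+            observations={"infection_rate": (0.3, 0.05)},
+            threshold=3.0,
+            train_x=x,
+            train_y=y,
+        )
+        test_x, impl_scores = hmw.run(n_simulations=20, n_test_samples=100)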
+ """
+
+ def __init__( # noqa: PLR0913 allow too many arguments since all currently required
+ self,
+ simulator: Simulator,
+ # TODO: make this EmulatorWithUncertainty once implemented (see #542)
+ emulator: GaussianProcessExact,
+ observations: dict[str, tuple[float, float]] | dict[str, float],
+ threshold: float = 3.0,
+ model_discrepancy: float = 0.0,
+ rank: int = 1,
+ train_x: TensorLike | None = None,
+ train_y: TensorLike | None = None,
+ device: DeviceLike | None = None,
+ ):
+ """
+ Initialize the history matching workflow object.
+
+ TODO:
+ - add random seed for reproducibility (once #465 is complete)
+
+ Parameters
+ ----------
+ simulator: Simulator
+ A simulator.
+ emulator: GaussianProcessExact
+ TODO: make this EmulatorWithUncertainty once implemented (see #542)
+ A Gaussian Process emulator pre-trained on `simulator` data.
+        observations: dict[str, tuple[float, float]] | dict[str, float]
+            For each output variable, specifies the observed (mean, variance). If
+            there is no uncertainty in the observations, provide just the observed
+            value.
+ threshold: float
+ Implausibility threshold (query points with implausibility scores that
+ exceed this value are ruled out). Defaults to 3, which is considered
+ a good value for simulations with a single output.
+ model_discrepancy: float
+ Additional variance to include in the implausibility calculation.
+ rank: int
+ Scoring method for multi-output problems. Must be 1 <= rank <= n_outputs.
+ When the implausibility scores are ordered across outputs, it indicates
+ which rank to use when determining whether the query point is NROY. The
+            default of ``1`` indicates that the largest implausibility will be used.
+ train_x: TensorLike | None
+ Optional tensor of input data the emulator was trained on.
+ train_y: TensorLike | None
+ Optional tensor of output data the emulator was trained on.
+ device: DeviceLike | None
+            The device to use. If None, the default torch device is used.
+ """
+ super().__init__(
+ observations, threshold, model_discrepancy, rank, emulator, device
+ )
+ self.simulator = simulator
+
+ # These get updated when run() is called and used to refit the emulator
+ if train_x is not None and train_y is not None:
+ self.train_x = train_x.to(self.device)
+ self.train_y = train_y.to(self.device)
+ else:
+ self.train_x = torch.empty((0, self.simulator.in_dim), device=self.device)
+ self.train_y = torch.empty((0, self.simulator.out_dim), device=self.device)
+
+ def generate_samples(self, n: int) -> tuple[TensorLike, TensorLike]:
+ """
+ Draw `n` samples from the simulator min/max parameter bounds and
+        evaluate implausibility given emulator predictions.
+
+ Parameters
+ ----------
+ n: int
+ The number of parameter samples to generate.
+
+ Returns
+ -------
+ tuple[TensorLike, TensorLike]
+            A tensor of tested input parameters and their implausibility scores.
+ """
+ # Generate `n` parameter samples from within NROY bounds
+ test_x = self.simulator.sample_inputs(n).to(self.device)
+
+ # Rule out implausible parameters from samples using an emulator
+ pred_means, pred_vars = self.emulator_predict(test_x)
+ impl_scores = self.calculate_implausibility(pred_means, pred_vars)
+
+ return test_x, impl_scores
+
+ def update_simulator_bounds(self, nroy_x: TensorLike):
+ """
+ Update simulator parameter bounds to min/max of NROY parameter samples.
+
+ Parameters
+ ----------
+ nroy_x: TensorLike
+ A tensor of NROY parameter samples [n_samples, n_inputs]
+ """
+        # Need more than one sample to define min/max bounds
+ if nroy_x.shape[0] > 1:
+ min_nroy_values = torch.min(nroy_x, dim=0).values
+ max_nroy_values = torch.max(nroy_x, dim=0).values
+ self.simulator._param_bounds = list(
+ zip(
+ min_nroy_values.cpu().tolist(),
+ max_nroy_values.cpu().tolist(),
+ strict=False,
+ )
+ )
+ else:
+ warnings.warn(
+ (
+                    f"Could not update simulator parameter bounds: only "
+ f"{nroy_x.shape[0]} samples were provided."
+ ),
+ stacklevel=2,
+ )
+
+ def sample_tensor(self, n: int, x: TensorLike) -> TensorLike:
+ """
+ Randomly sample `n` rows from `x`.
+
+ Parameters
+ ----------
+ n: int
+ The number of samples to draw.
+ x: TensorLike
+ The tensor to sample from.
+
+ Returns
+ -------
+ TensorLike
+ A tensor of samples with `n` rows.
+ """
+ if x.shape[0] < n:
+ warnings.warn(
+                f"Tensor has {x.shape[0]} rows, fewer than the {n} requested samples.",
+ stacklevel=2,
+ )
+ idx = torch.randperm(x.shape[0], device=self.device)[:n]
+ return x[idx]
+
+ def simulate(self, x: TensorLike) -> tuple[TensorLike, TensorLike]:
+ """
+ Simulate `x` parameter inputs and filter out failed simulations.
+
+ Parameters
+ ----------
+ x: TensorLike
+ A tensor of parameters to simulate [n_samples, n_inputs].
+
+ Returns
+ -------
+ tuple[TensorLike, TensorLike]
+            Tensors of successfully simulated input parameters and predictions.
+ """
+ y = self.simulator.forward_batch(x).to(self.device)
+
+ # Filter out runs that simulator failed to return predictions for
+ # TODO: this assumes that simulator returns None if it fails (see #438)
+ valid_indices = [i for i, res in enumerate(y) if res is not None]
+ valid_x = x[valid_indices]
+ valid_y = y[valid_indices]
+
+ self.train_y = torch.cat([self.train_y, valid_y], dim=0)
+ self.train_x = torch.cat([self.train_x, valid_x], dim=0)
+
+ return valid_x, valid_y
+
+ def run(
+ self,
+ n_simulations: int = 100,
+ n_test_samples: int = 10000,
+ max_retries: int = 3,
+ ) -> tuple[TensorLike, TensorLike]:
+ """
+ Run a wave of the history matching workflow.
+
+ Parameters
+ ----------
+ n_simulations: int
+ The number of simulations to run.
+ n_test_samples: int
+ Number of input parameters to test for implausibility with the emulator.
+ Parameters to simulate are sampled from this NROY subset.
+ max_retries: int
+ Maximum number of times to try to generate `n_simulations` NROY parameters.
+            That is, the maximum number of times to repeat the following steps:
+ - draw `n_test_samples` parameters
+ - use emulator to make predictions for those parameters
+            - score implausibility of parameters given predictions
+ - identify NROY parameters within this set
+
+ Returns
+ -------
+ tuple[TensorLike, TensorLike]
+ A tensor of tested input parameters and their implausibility scores from
+ which simulation samples were then selected.
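+
+        Examples
+        --------
+        Illustrative usage, assuming ``hmw`` is an already constructed
+        ``HistoryMatchingWorkflow``::
+
+            test_x, impl_scores = hmw.run(n_simulations=20, n_test_samples=1000)
+            # a second wave samples from the NROY bounds found in the first
+            test_x, impl_scores = hmw.run(n_simulations=20, n_test_samples=1000)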
+ """
+
+ test_parameters_list, impl_scores_list, nroy_parameters_list = (
+ [],
+ [],
+ [torch.empty((0, self.simulator.in_dim), device=self.device)],
+ )
+
+ retries = 0
+ while torch.cat(nroy_parameters_list, 0).shape[0] < n_simulations:
+ if retries == max_retries:
+ msg = (
+ f"Could not generate n_simulations ({n_simulations}) samples "
+ f"that are NROY after {max_retries} retries."
+ )
+ raise RuntimeError(msg)
+
+            # Generate `n_test_samples` with implausibility scores, identify NROY
+ test_parameters, impl_scores = self.generate_samples(n_test_samples)
+ nroy_parameters = self.get_nroy(impl_scores, test_parameters)
+
+ # Store results
+ nroy_parameters_list.append(nroy_parameters)
+ test_parameters_list.append(test_parameters)
+ impl_scores_list.append(impl_scores)
+
+ retries += 1
+
+ # Update simulator parameter bounds to NROY region
+        # The next time run() is called, sampling happens from within this region
+ nroy_parameters_all = torch.cat(nroy_parameters_list, 0)
+ self.update_simulator_bounds(nroy_parameters_all)
+
+ # Randomly pick at most `n_simulations` parameters from NROY to simulate
+ nroy_simulation_samples = self.sample_tensor(n_simulations, nroy_parameters_all)
+
+        # Run the simulator (this updates self.train_x and self.train_y)
+ _, _ = self.simulate(nroy_simulation_samples)
+
+ # Refit emulator using all available data
+ assert self.emulator is not None
+ self.emulator.fit(self.train_x, self.train_y)
+
+ # Return test parameters and impl scores for this run/wave
+ return torch.cat(test_parameters_list, 0), torch.cat(impl_scores_list, 0)
diff --git a/autoemulate/experimental/simulations/epidemic.py b/autoemulate/experimental/simulations/epidemic.py
index 8235f6e94..7c15b4fc0 100644
--- a/autoemulate/experimental/simulations/epidemic.py
+++ b/autoemulate/experimental/simulations/epidemic.py
@@ -35,5 +35,6 @@ def _forward(self, x: TensorLike) -> TensorLike:
TensorLike
Peak infection rate.
"""
- y = simulate_epidemic(x.numpy()[0])
- return torch.tensor([y]).view(-1, 1)
+ y = simulate_epidemic(x.cpu().numpy()[0])
+ # TODO (#537): update with default dtype
+ return torch.tensor([y], dtype=torch.float32).view(-1, 1)
diff --git a/autoemulate/history_matching_dashboard.py b/autoemulate/history_matching_dashboard.py
index d6dcdd7aa..d8c41bd44 100644
--- a/autoemulate/history_matching_dashboard.py
+++ b/autoemulate/history_matching_dashboard.py
@@ -8,6 +8,8 @@
from IPython.display import display
from sklearn.decomposition import PCA
+from autoemulate.experimental.types import TensorLike
+
class HistoryMatchingDashboard:
"""
@@ -21,9 +23,11 @@ def __init__(self, samples, impl_scores, param_names, output_names, threshold=3.
"""
Initialize the dashboard
+ TODO: shouldn't this include rank as input?
+
Args:
- samples: DataFrame or numpy array with parameter samples
- impl_scores: Array of implausibility scores
+ samples: DataFrame or numpy array or tensor with parameter samples
+ impl_scores: Array or tensor of implausibility scores
param_names: List of parameter names
output_names: List of output names
threshold: Implausibility threshold
@@ -31,6 +35,8 @@ def __init__(self, samples, impl_scores, param_names, output_names, threshold=3.
# Convert samples to DataFrame if it's an array
if isinstance(samples, np.ndarray):
self.samples_df = pd.DataFrame(samples, columns=param_names)
+ elif isinstance(samples, TensorLike):
+            self.samples_df = pd.DataFrame(samples.cpu().numpy(), columns=param_names)
else:
self.samples_df = samples.copy()
@@ -45,18 +51,21 @@ def __init__(self, samples, impl_scores, param_names, output_names, threshold=3.
self.max_waves = min(10, n_samples // 100) if n_samples > 100 else 5
# Store other data
- self.impl_scores = impl_scores
+ if isinstance(impl_scores, TensorLike):
+            self.impl_scores = impl_scores.cpu().numpy()
+ else:
+ self.impl_scores = impl_scores
self.param_names = param_names
self.output_names = output_names
self.threshold = threshold
# Calculate minimum implausibility for each sample
- if len(impl_scores.shape) > 1:
- self.min_impl = np.min(impl_scores, axis=1)
- self.max_impl = np.max(impl_scores, axis=1)
+ if len(self.impl_scores.shape) > 1:
+ self.min_impl = np.min(self.impl_scores, axis=1)
+ self.max_impl = np.max(self.impl_scores, axis=1)
else:
- self.min_impl = impl_scores
- self.max_impl = impl_scores
+ self.min_impl = self.impl_scores
+ self.max_impl = self.impl_scores
# Add implausibility to DataFrame
self.samples_df["min_implausibility"] = self.min_impl
diff --git a/tests/experimental/__init__.py b/tests/experimental/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/experimental/test_experimental_history_matching.py b/tests/experimental/test_experimental_history_matching.py
new file mode 100644
index 000000000..7dc5d3a00
--- /dev/null
+++ b/tests/experimental/test_experimental_history_matching.py
@@ -0,0 +1,166 @@
+import pytest
+import torch
+from autoemulate.experimental.device import (
+ SUPPORTED_DEVICES,
+ check_torch_device_is_available,
+)
+from autoemulate.experimental.emulators.gaussian_process.exact import (
+ GaussianProcessExact,
+)
+from autoemulate.experimental.history_matching import (
+ HistoryMatching,
+ HistoryMatchingWorkflow,
+)
+from autoemulate.experimental.simulations.epidemic import Epidemic
+from autoemulate.experimental.types import TensorLike
+
+from .test_experimental_base_simulator import MockSimulator
+
+
+@pytest.fixture
+def mock_simulator():
+    """Fixture for the mock simulator from test_experimental_base_simulator."""
+ param_ranges = {"param1": (0.0, 1.0), "param2": (-10.0, 10.0)}
+ return MockSimulator(param_ranges, output_names=["output1", "output2"])
+
+
+@pytest.fixture
+def observations():
+ """Fixture for observations data matching mock simulator outputs"""
+ return {"output1": (0.5, 0.1), "output2": (0.6, 0.2)} # (mean, variance)
+
+
+@pytest.fixture
+def history_matcher(observations):
+ """Fixture for a basic HistoryMatching instance."""
+ return HistoryMatching(
+ observations=observations,
+ threshold=3.0,
+ model_discrepancy=0.1,
+ rank=1,
+ )
+
+
+def test_history_matcher_init(history_matcher):
+ """Test initialization of HistoryMatching."""
+ assert history_matcher.threshold == 3.0
+ assert history_matcher.discrepancy == 0.1
+ assert history_matcher.rank == 1
+ assert history_matcher.obs_means.shape == (1, 2)
+ assert history_matcher.obs_vars.shape == (1, 2)
+
+
+def test_calculate_implausibility(history_matcher, observations):
+ """Test implausibility calculation."""
+
+ # have 1 sample of 2 outputs arranged as [n_samples, n_outputs]
+ pred_means = torch.Tensor([[0.4, 0.7]])
+ pred_vars = torch.Tensor([[0.05, 0.1]])
+
+ impl_scores = history_matcher.calculate_implausibility(pred_means, pred_vars)
+ assert impl_scores.shape == (1, 2)
+
+ assert impl_scores[0][0] == (
+ abs(pred_means[0][0] - observations["output1"][0])
+ # have an extra term in the denominator for model discrepancy
+ / (pred_vars[0][0] + observations["output1"][1] + 0.1) ** 0.5
+ )
+
+ assert impl_scores[0][1] == (
+ abs(pred_means[0][1] - observations["output2"][0])
+ # have an extra term in the denominator for model discrepancy
+ / (pred_vars[0][1] + observations["output2"][1] + 0.1) ** 0.5
+ )
+
+ # the output shape is the same irrespective of rank
+ history_matcher.rank = 2
+ impl_scores = history_matcher.calculate_implausibility(pred_means, pred_vars)
+ assert impl_scores.shape == (1, 2)
+
+
+def test_get_indices(history_matcher):
+ """Test NROY and RO indices vary with rank."""
+ impl_scores = torch.tensor([[1, 5], [1, 2], [4, 2]])
+
+ # rank = 1
+ nroy = history_matcher.get_nroy(impl_scores)
+ assert len(nroy) == 1
+ assert nroy[0] == 1
+
+ ro = history_matcher.get_ro(impl_scores)
+ assert len(ro) == 2
+ assert ro[0] == 0
+ assert ro[1] == 2
+
+ # rank = n
+ history_matcher.rank = 2
+ assert len(history_matcher.get_nroy(impl_scores)) == 3
+ assert len(history_matcher.get_ro(impl_scores)) == 0
+
+
+@pytest.mark.parametrize("device", SUPPORTED_DEVICES)
+def test_run(device):
+ """Test the full history matching workflow with Epidemic simulator."""
+ if not check_torch_device_is_available(device):
+ pytest.skip(f"Device ({device}) is not available.")
+
+ simulator = Epidemic()
+ x = simulator.sample_inputs(10)
+ y = simulator.forward_batch(x)
+
+ # Run history matching
+ gp = GaussianProcessExact(x, y, device=device)
+ gp.fit(x, y)
+
+ observations = {"infection_rate": (0.3, 0.05)}
+
+ hm = HistoryMatchingWorkflow(
+ simulator=simulator,
+ emulator=gp,
+ observations=observations,
+ threshold=3.0,
+ model_discrepancy=0.1,
+ rank=1,
+ device=device,
+ )
+
+ # call run first time
+ hm.run(n_simulations=5)
+
+ # Check basic structure of results
+ assert isinstance(hm.train_x, TensorLike)
+ assert isinstance(hm.emulator, GaussianProcessExact)
+
+ assert len(hm.train_x) == 5
+
+ # can run again
+ hm.run(n_simulations=5)
+
+    # Training data should now include samples from both runs
+ assert len(hm.train_x) == 5 * 2
+
+
+def test_run_max_tries():
+ """Run history matching with observations that return no NROY params."""
+ simulator = Epidemic()
+ x = simulator.sample_inputs(10)
+ y = simulator.forward_batch(x)
+
+ # Run history matching
+ gp = GaussianProcessExact(x, y)
+ gp.fit(x, y)
+
+ # Extreme values outside the range of what the simulator returns
+ observations = {"infection_rate": (100.0, 1.0)}
+
+ hm = HistoryMatchingWorkflow(
+ simulator=simulator,
+ emulator=gp,
+ observations=observations,
+ threshold=3.0,
+ model_discrepancy=0.1,
+ rank=1,
+ )
+
+ with pytest.raises(RuntimeError):
+ hm.run(n_simulations=5)