refactor(stats): move wilson_interval + cohens_kappa to ami/utils/stats

mihow · claude · mihow · commit 336c1fe6a89c · 2026-05-27T06:27:48.000-07:00
Both are generic statistical helpers — they don't depend on Django or any
domain model. Lifting them out of ami/main/models_future/occurrence.py so
other endpoints/jobs that need binomial CIs or chance-corrected agreement
can import them without dragging in the occurrence module.

Same implementations, just relocated. The cohens_kappa signature is
unchanged (it still takes `pairs`); its docstring and local variable names
now say (rater_a, rater_b) instead of (human, model) so the helper reads
as generic rather than human-vs-model specific.

Tests import the helpers inside each test method, so all 9 import sites in
ami/main/tests.py now use `from ami.utils.stats import …`.

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py
@@ -11,12 +11,12 @@
 from __future__ import annotations
 
 import collections
-import math
 from typing import TYPE_CHECKING
 
 from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery
 
 from ami.main.models import Project, TaxonRank, User
+from ami.utils.stats import cohens_kappa, wilson_interval
 
 if TYPE_CHECKING:
     from ami.main.models import Classification, Identification, Occurrence
@@ -52,64 +52,6 @@ def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None:
     return deepest
 
 
-# z-score for a 95% two-sided confidence interval (Wilson score).
-WILSON_Z_95 = 1.959963984540054
-
-
-def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None:
-    """Wilson score confidence interval for a binomial proportion.
-
-    Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or
-    ``None`` when ``total`` is 0. Defaults to a 95% interval.
-
-    The Wilson score interval is used instead of the normal approximation
-    because the verified set is often tiny (single-digit counts), where the
-    normal approximation produces bounds outside [0, 1] and understates the
-    uncertainty. Wilson stays well-behaved at small n and at proportions
-    near 0 or 1.
-    """
-    if total <= 0:
-        return None
-    phat = successes / total
-    z2 = z * z
-    denom = 1 + z2 / total
-    center = (phat + z2 / (2 * total)) / denom
-    margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total))
-    low = max(0.0, center - margin)
-    high = min(1.0, center + margin)
-    return (round(low, 4), round(high, 4))
-
-
-def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None:
-    """Cohen's kappa for exact-taxon agreement between human and model.
-
-    ``pairs`` is one ``(human_taxon_id, model_taxon_id)`` per occurrence that
-    both a human and the model assigned a taxon to. Returns kappa rounded to
-    4 dp in ``[-1, 1]`` (negative = worse than chance), or ``None`` when
-    there are no pairs or expected agreement is 1.0 (kappa undefined — a
-    single category leaves no chance-agreement to correct for).
-
-    Plain agreement rate rewards luck: in a project dominated by one common
-    species, human and model agree most of the time just by both naming the
-    common one. Kappa subtracts that chance agreement, so it answers "how
-    much better than guessing is the model" rather than "how often do they
-    happen to match".
-    """
-    n = len(pairs)
-    if n == 0:
-        return None
-    observed_agree = sum(1 for h, m in pairs if h == m) / n
-    human_counts: collections.Counter = collections.Counter(h for h, _ in pairs)
-    model_counts: collections.Counter = collections.Counter(m for _, m in pairs)
-    expected_agree = sum(
-        (human_counts[taxon_id] / n) * (model_counts[taxon_id] / n)
-        for taxon_id in set(human_counts) | set(model_counts)
-    )
-    if expected_agree >= 1.0:
-        return None
-    return round((observed_agree - expected_agree) / (1 - expected_agree), 4)
-
-
 def _detections_prefetch(*, ordering: tuple[str, ...], with_source_image: bool) -> Prefetch:
     from ami.main.models import Classification, Detection
 
diff --git a/ami/main/tests.py b/ami/main/tests.py
@@ -4804,29 +4804,29 @@ class TestWilsonInterval(TestCase):
     """Pure-Python Wilson score confidence interval."""
 
     def test_zero_total_returns_none(self):
-        from ami.main.models_future.occurrence import wilson_interval
+        from ami.utils.stats import wilson_interval
 
         self.assertIsNone(wilson_interval(0, 0))
 
     def test_known_value_8_of_10(self):
         """Textbook Wilson 95% CI for 8/10 ≈ [0.490, 0.943]."""
-        from ami.main.models_future.occurrence import wilson_interval
+        from ami.utils.stats import wilson_interval
 
         low, high = wilson_interval(8, 10)
         self.assertAlmostEqual(low, 0.4902, places=3)
         self.assertAlmostEqual(high, 0.9433, places=3)
 
     def test_bounds_stay_within_unit_interval(self):
         """At p̂ = 1.0 the normal approximation would exceed 1; Wilson must not."""
-        from ami.main.models_future.occurrence import wilson_interval
+        from ami.utils.stats import wilson_interval
 
         low, high = wilson_interval(1, 1)
         self.assertGreaterEqual(low, 0.0)
         self.assertLessEqual(high, 1.0)
         self.assertLess(low, high)
 
     def test_interval_tightens_as_n_grows(self):
-        from ami.main.models_future.occurrence import wilson_interval
+        from ami.utils.stats import wilson_interval
 
         narrow = wilson_interval(90, 100)
         wide = wilson_interval(9, 10)
@@ -4837,18 +4837,18 @@ class TestCohensKappa(TestCase):
     """Pure-Python Cohen's kappa over (human_taxon, model_taxon) pairs."""
 
     def test_empty_returns_none(self):
-        from ami.main.models_future.occurrence import cohens_kappa
+        from ami.utils.stats import cohens_kappa
 
         self.assertIsNone(cohens_kappa([]))
 
     def test_single_category_is_undefined(self):
         """Everyone picks the same taxon → expected agreement 1.0 → kappa undefined."""
-        from ami.main.models_future.occurrence import cohens_kappa
+        from ami.utils.stats import cohens_kappa
 
         self.assertIsNone(cohens_kappa([(1, 1), (1, 1), (1, 1)]))
 
     def test_perfect_agreement_two_categories(self):
-        from ami.main.models_future.occurrence import cohens_kappa
+        from ami.utils.stats import cohens_kappa
 
         self.assertEqual(cohens_kappa([(1, 1), (2, 2)]), 1.0)
 
@@ -4857,13 +4857,13 @@ def test_known_2x2_value(self):
 
         pairs: 3× human=1, 1× human=2; model 1 twice, 2 twice; 3 of 4 match.
         """
-        from ami.main.models_future.occurrence import cohens_kappa
+        from ami.utils.stats import cohens_kappa
 
         self.assertEqual(cohens_kappa([(1, 1), (1, 1), (2, 2), (1, 2)]), 0.5)
 
     def test_can_be_negative(self):
         """Systematic disagreement → worse than chance → negative kappa."""
-        from ami.main.models_future.occurrence import cohens_kappa
+        from ami.utils.stats import cohens_kappa
 
         kappa = cohens_kappa([(1, 2), (2, 1), (1, 2), (2, 1)])
         self.assertLess(kappa, 0.0)
diff --git a/ami/utils/stats.py b/ami/utils/stats.py
@@ -0,0 +1,65 @@
+"""Generic statistical helpers reusable across apps.
+
+Kept independent of Django and any domain models so they can be unit-tested
+in isolation and reused by other endpoints/jobs that need to express
+uncertainty (Wilson CI) or correct an agreement rate for chance (kappa).
+"""
+
+from __future__ import annotations
+
+import collections
+import math
+
+# z-score for a 95% two-sided confidence interval (Wilson score).
+WILSON_Z_95 = 1.959963984540054
+
+
+def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None:
+    """Wilson score confidence interval for a binomial proportion.
+
+    Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or
+    ``None`` when ``total`` is 0. Defaults to a 95% interval.
+
+    The Wilson score interval is used instead of the normal approximation
+    because the verified set is often tiny (single-digit counts), where the
+    normal approximation produces bounds outside [0, 1] and understates the
+    uncertainty. Wilson stays well-behaved at small n and at proportions
+    near 0 or 1.
+    """
+    if total <= 0:
+        return None
+    phat = successes / total
+    z2 = z * z
+    denom = 1 + z2 / total
+    center = (phat + z2 / (2 * total)) / denom
+    margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total))
+    low = max(0.0, center - margin)
+    high = min(1.0, center + margin)
+    return (round(low, 4), round(high, 4))
+
+
+def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None:
+    """Cohen's kappa for exact agreement between two raters.
+
+    ``pairs`` is one ``(rater_a, rater_b)`` per item that both raters
+    classified. Returns kappa rounded to 4 dp in ``[-1, 1]`` (negative =
+    worse than chance), or ``None`` when there are no pairs or expected
+    agreement is 1.0 (kappa undefined — a single category leaves no
+    chance-agreement to correct for).
+
+    Plain agreement rate rewards luck: in a project dominated by one common
+    category, both raters agree most of the time just by both naming the
+    common one. Kappa subtracts that chance agreement, so it answers "how
+    much better than guessing do they agree" rather than "how often do they
+    happen to match".
+    """
+    n = len(pairs)
+    if n == 0:
+        return None
+    observed_agree = sum(1 for a, b in pairs if a == b) / n
+    a_counts: collections.Counter = collections.Counter(a for a, _ in pairs)
+    b_counts: collections.Counter = collections.Counter(b for _, b in pairs)
+    expected_agree = sum((a_counts[key] / n) * (b_counts[key] / n) for key in set(a_counts) | set(b_counts))
+    if expected_agree >= 1.0:
+        return None
+    return round((observed_agree - expected_agree) / (1 - expected_agree), 4)