Skip to content

Commit 336c1fe

Browse files
mihow and claude committed
refactor(stats): move wilson_interval + cohens_kappa to ami/utils/stats
Both are generic statistical helpers — they don't depend on Django or any domain model. Lifting them out of ami/main/models_future/occurrence.py so other endpoints/jobs that need binomial CIs or chance-corrected agreement can import them without dragging in the occurrence module. Same implementations, just relocated. The signature of cohens_kappa is unchanged (it still takes a single `pairs` argument); the pair-element names in its docstring and its local variables were renamed from (human, model) to (rater_a, rater_b) so the helper reads as generic rather than human-vs-model specific. Tests import the helpers inside each test method; all 9 of those import sites in ami/main/tests.py were updated to `from ami.utils.stats import …`. Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 2c65cce commit 336c1fe

3 files changed

Lines changed: 75 additions & 68 deletions

File tree

ami/main/models_future/occurrence.py

Lines changed: 1 addition & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
from __future__ import annotations
1212

1313
import collections
14-
import math
1514
from typing import TYPE_CHECKING
1615

1716
from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery
1817

1918
from ami.main.models import Project, TaxonRank, User
19+
from ami.utils.stats import cohens_kappa, wilson_interval
2020

2121
if TYPE_CHECKING:
2222
from ami.main.models import Classification, Identification, Occurrence
@@ -52,64 +52,6 @@ def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None:
5252
return deepest
5353

5454

55-
# z-score for a 95% two-sided confidence interval (Wilson score).
56-
WILSON_Z_95 = 1.959963984540054
57-
58-
59-
def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None:
60-
"""Wilson score confidence interval for a binomial proportion.
61-
62-
Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or
63-
``None`` when ``total`` is 0. Defaults to a 95% interval.
64-
65-
The Wilson score interval is used instead of the normal approximation
66-
because the verified set is often tiny (single-digit counts), where the
67-
normal approximation produces bounds outside [0, 1] and understates the
68-
uncertainty. Wilson stays well-behaved at small n and at proportions
69-
near 0 or 1.
70-
"""
71-
if total <= 0:
72-
return None
73-
phat = successes / total
74-
z2 = z * z
75-
denom = 1 + z2 / total
76-
center = (phat + z2 / (2 * total)) / denom
77-
margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total))
78-
low = max(0.0, center - margin)
79-
high = min(1.0, center + margin)
80-
return (round(low, 4), round(high, 4))
81-
82-
83-
def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None:
84-
"""Cohen's kappa for exact-taxon agreement between human and model.
85-
86-
``pairs`` is one ``(human_taxon_id, model_taxon_id)`` per occurrence that
87-
both a human and the model assigned a taxon to. Returns kappa rounded to
88-
4 dp in ``[-1, 1]`` (negative = worse than chance), or ``None`` when
89-
there are no pairs or expected agreement is 1.0 (kappa undefined — a
90-
single category leaves no chance-agreement to correct for).
91-
92-
Plain agreement rate rewards luck: in a project dominated by one common
93-
species, human and model agree most of the time just by both naming the
94-
common one. Kappa subtracts that chance agreement, so it answers "how
95-
much better than guessing is the model" rather than "how often do they
96-
happen to match".
97-
"""
98-
n = len(pairs)
99-
if n == 0:
100-
return None
101-
observed_agree = sum(1 for h, m in pairs if h == m) / n
102-
human_counts: collections.Counter = collections.Counter(h for h, _ in pairs)
103-
model_counts: collections.Counter = collections.Counter(m for _, m in pairs)
104-
expected_agree = sum(
105-
(human_counts[taxon_id] / n) * (model_counts[taxon_id] / n)
106-
for taxon_id in set(human_counts) | set(model_counts)
107-
)
108-
if expected_agree >= 1.0:
109-
return None
110-
return round((observed_agree - expected_agree) / (1 - expected_agree), 4)
111-
112-
11355
def _detections_prefetch(*, ordering: tuple[str, ...], with_source_image: bool) -> Prefetch:
11456
from ami.main.models import Classification, Detection
11557

ami/main/tests.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4804,29 +4804,29 @@ class TestWilsonInterval(TestCase):
48044804
"""Pure-Python Wilson score confidence interval."""
48054805

48064806
def test_zero_total_returns_none(self):
4807-
from ami.main.models_future.occurrence import wilson_interval
4807+
from ami.utils.stats import wilson_interval
48084808

48094809
self.assertIsNone(wilson_interval(0, 0))
48104810

48114811
def test_known_value_8_of_10(self):
48124812
"""Textbook Wilson 95% CI for 8/10 ≈ [0.490, 0.943]."""
4813-
from ami.main.models_future.occurrence import wilson_interval
4813+
from ami.utils.stats import wilson_interval
48144814

48154815
low, high = wilson_interval(8, 10)
48164816
self.assertAlmostEqual(low, 0.4902, places=3)
48174817
self.assertAlmostEqual(high, 0.9433, places=3)
48184818

48194819
def test_bounds_stay_within_unit_interval(self):
48204820
"""At p̂ = 1.0 the normal approximation would exceed 1; Wilson must not."""
4821-
from ami.main.models_future.occurrence import wilson_interval
4821+
from ami.utils.stats import wilson_interval
48224822

48234823
low, high = wilson_interval(1, 1)
48244824
self.assertGreaterEqual(low, 0.0)
48254825
self.assertLessEqual(high, 1.0)
48264826
self.assertLess(low, high)
48274827

48284828
def test_interval_tightens_as_n_grows(self):
4829-
from ami.main.models_future.occurrence import wilson_interval
4829+
from ami.utils.stats import wilson_interval
48304830

48314831
narrow = wilson_interval(90, 100)
48324832
wide = wilson_interval(9, 10)
@@ -4837,18 +4837,18 @@ class TestCohensKappa(TestCase):
48374837
"""Pure-Python Cohen's kappa over (human_taxon, model_taxon) pairs."""
48384838

48394839
def test_empty_returns_none(self):
4840-
from ami.main.models_future.occurrence import cohens_kappa
4840+
from ami.utils.stats import cohens_kappa
48414841

48424842
self.assertIsNone(cohens_kappa([]))
48434843

48444844
def test_single_category_is_undefined(self):
48454845
"""Everyone picks the same taxon → expected agreement 1.0 → kappa undefined."""
4846-
from ami.main.models_future.occurrence import cohens_kappa
4846+
from ami.utils.stats import cohens_kappa
48474847

48484848
self.assertIsNone(cohens_kappa([(1, 1), (1, 1), (1, 1)]))
48494849

48504850
def test_perfect_agreement_two_categories(self):
4851-
from ami.main.models_future.occurrence import cohens_kappa
4851+
from ami.utils.stats import cohens_kappa
48524852

48534853
self.assertEqual(cohens_kappa([(1, 1), (2, 2)]), 1.0)
48544854

@@ -4857,13 +4857,13 @@ def test_known_2x2_value(self):
48574857
48584858
pairs: 3× human=1, 1× human=2; model 1 twice, 2 twice; 3 of 4 match.
48594859
"""
4860-
from ami.main.models_future.occurrence import cohens_kappa
4860+
from ami.utils.stats import cohens_kappa
48614861

48624862
self.assertEqual(cohens_kappa([(1, 1), (1, 1), (2, 2), (1, 2)]), 0.5)
48634863

48644864
def test_can_be_negative(self):
48654865
"""Systematic disagreement → worse than chance → negative kappa."""
4866-
from ami.main.models_future.occurrence import cohens_kappa
4866+
from ami.utils.stats import cohens_kappa
48674867

48684868
kappa = cohens_kappa([(1, 2), (2, 1), (1, 2), (2, 1)])
48694869
self.assertLess(kappa, 0.0)

ami/utils/stats.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Generic statistical helpers reusable across apps.
2+
3+
Kept independent of Django and any domain models so they can be unit-tested
4+
in isolation and reused by other endpoints/jobs that need to express
5+
uncertainty (Wilson CI) or correct an agreement rate for chance (kappa).
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import collections
11+
import math
12+
13+
# Standard-normal quantile for a two-sided 95% confidence level; used as the
# default ``z`` of ``wilson_interval``.
WILSON_Z_95 = 1.959963984540054


def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None:
    """Wilson score confidence interval for a binomial proportion.

    ``successes`` is the number of successful trials out of ``total`` and
    must satisfy ``0 <= successes <= total``. ``z`` is the standard-normal
    quantile that sets the confidence level; only its magnitude is used and
    it defaults to the two-sided 95% value.

    Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or
    ``None`` when ``total`` is zero or negative (no trials, so there is no
    proportion to bound).

    Raises ``ValueError`` when ``successes`` is negative or greater than
    ``total``.

    The Wilson score interval is used instead of the normal approximation
    because at small sample sizes (single-digit counts) the normal
    approximation produces bounds outside [0, 1] and understates the
    uncertainty. Wilson stays well-behaved at small n and at proportions
    near 0 or 1.
    """
    if total <= 0:
        return None
    # An impossible count makes phat * (1 - phat) negative: depending on z
    # that either fails inside math.sqrt with an opaque "math domain error"
    # or silently yields a meaningless interval. Fail loudly and clearly.
    if not 0 <= successes <= total:
        raise ValueError(f"successes must be between 0 and total ({total}), got {successes}")
    # The interval is symmetric in z; a negative quantile would otherwise
    # flip the sign of the margin and return low > high.
    z = abs(z)
    phat = successes / total
    z2 = z * z
    denom = 1 + z2 / total
    center = (phat + z2 / (2 * total)) / denom
    margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total))
    # Clamp: floating-point error can push a bound marginally outside [0, 1].
    low = max(0.0, center - margin)
    high = min(1.0, center + margin)
    return (round(low, 4), round(high, 4))
39+
40+
41+
def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None:
    """Cohen's kappa for exact agreement between two raters.

    ``pairs`` is one ``(rater_a, rater_b)`` per item that both raters
    classified. It is consumed in a single pass, so at runtime any iterable
    of 2-tuples of hashable labels works, including a generator.

    Returns kappa rounded to 4 dp in ``[-1, 1]`` (negative = worse than
    chance), or ``None`` when there are no pairs or expected agreement is
    1.0 (kappa undefined — a single category leaves no chance-agreement to
    correct for).

    Plain agreement rate rewards luck: when one category dominates, both
    raters agree most of the time just by both naming the common one. Kappa
    subtracts that chance agreement, so it answers "how much better than
    guessing do they agree" rather than "how often do they happen to
    match".
    """
    n = 0
    agreements = 0
    a_counts: collections.Counter = collections.Counter()
    b_counts: collections.Counter = collections.Counter()
    for a, b in pairs:
        n += 1
        if a == b:
            agreements += 1
        a_counts[a] += 1
        b_counts[b] += 1
    if n == 0:
        return None
    # Expected agreement is chance / n**2. Keeping it as an integer numerator
    # avoids float accumulation error and makes the "undefined" test exact.
    # A label used by only one rater contributes 0 (Counter returns 0 for
    # missing keys), so iterating over rater A's labels is sufficient.
    chance = sum(count * b_counts[label] for label, count in a_counts.items())
    n_squared = n * n
    if chance >= n_squared:
        return None
    # (observed - expected) / (1 - expected), with both sides scaled by n**2.
    return round((agreements * n - chance) / (n_squared - chance), 4)

0 commit comments

Comments
 (0)