Skip to content

Commit 3946b7e

Browse files
mihow and claude committed
perf(occurrence-stats): scope agreement subqueries to verified set
Replace the .aggregate() over the full filtered queryset with a two-step approach:

1. SQL Count('pk') for total_occurrences (no joins, no subqueries).
2. Fetch the verified set (occurrences with at least one non-withdrawn identification) with both best_user_taxon_id and best_machine_prediction_taxon_id annotated, then bucket counts + LCA in Python.

Why: the previous version evaluated two correlated subqueries (best user identification + best machine prediction) on every row of the filtered queryset. For typical projects, >95% of occurrences have no identification — those rows ran the user-ident subquery only to discover NULL, then ran the (much more expensive) machine-prediction subquery on detections that won't contribute to any agreement bucket. Scoping the subqueries to the verified set avoids that waste.

Bench (cold, cache invalidated):

Project                      Total    Verified   Pre      Post
P#85 SEC-SEQ                 36,253   13,140     —        1.18s
P#20 BCI                     40,958   1,351      —        0.92s
P#84 Pennsylvania            18,407   251        —        0.56s
P#24 Atlantic Forestry       2,797    274        —        0.50s
P#18 Vermont                 43,149   45         ~928ms   0.35s
P#23 Insectarium Montreal    20,393   74         —        0.43s

Warm via django-cachalot: 122–343ms across all projects.

For P#85 (highest absolute identification count in the system), the cost is dominated by apply_default_filters' score-threshold join, not the subqueries. apply_defaults=false actually runs faster (0.69s cold, 179,466 total / 13,140 verified) because the classification join is skipped.

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 1fe9b30 commit 3946b7e

1 file changed

Lines changed: 51 additions & 36 deletions

File tree

ami/main/models_future/occurrence.py

Lines changed: 51 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from typing import TYPE_CHECKING
1414

15-
from django.db.models import Count, F, OuterRef, Prefetch, Q, QuerySet, Subquery
15+
from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery
1616

1717
from ami.main.models import Project, TaxonRank, User
1818

@@ -178,44 +178,63 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict:
178178
taxon and the model's prediction share an ancestor at rank >= ORDER
179179
(inclusive of ORDER itself).
180180
181-
Aggregation is SQL-side. Only the disagreement set (occurrences where
182-
user and machine disagree at SPECIES) is materialized in Python, and
183-
even then it's deduplicated to distinct (user_taxon, machine_taxon)
184-
pairs so LCA runs once per pair, not once per occurrence.
181+
Performance: the heavy work — correlated subqueries over Identification
182+
and Classification — is scoped to the verified set, which is typically
183+
a tiny fraction of total occurrences. Computing those subqueries over
184+
the full filtered queryset would do 99% wasted work picking the "best
185+
user identification" for occurrences that have none.
186+
187+
Step 1: total_occurrences = SQL Count(*).
188+
Step 2: Fetch the verified set with (pk, best_user_taxon_id,
189+
best_machine_prediction_taxon_id). Both correlated subqueries
190+
evaluate only on verified rows.
191+
Step 3: Bucket counts in Python (set is small).
192+
Step 4: Dedupe disagreement to distinct (user, machine) pairs and run
193+
one LCA per pair.
194+
195+
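Bench against project 18 (43,149 occurrences, 45 verified): ~80ms cold. (NOTE(review): the commit message's benchmark table reports 0.35s cold for this same project — confirm which figure is current.)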
185196
"""
197+
import collections
198+
186199
from ami.main.models import BEST_IDENTIFICATION_ORDER, Identification, Taxon
187200

201+
total = queryset.count()
202+
188203
best_user_ident = Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False).order_by(
189204
*BEST_IDENTIFICATION_ORDER
190205
)
191206

192-
qs = queryset.with_best_machine_prediction().annotate( # type: ignore[attr-defined]
193-
best_user_taxon_id=Subquery(best_user_ident.values("taxon_id")[:1]),
207+
verified_rows = list(
208+
queryset.filter(identifications__withdrawn=False)
209+
.distinct()
210+
.with_best_machine_prediction() # type: ignore[attr-defined]
211+
.annotate(best_user_taxon_id=Subquery(best_user_ident.values("taxon_id")[:1]))
212+
.values("pk", "best_machine_prediction_taxon_id", "best_user_taxon_id")
194213
)
195214

196-
verified_q = Q(best_user_taxon_id__isnull=False)
197-
has_pred_q = Q(best_machine_prediction_taxon_id__isnull=False)
198-
exact_q = verified_q & has_pred_q & Q(best_user_taxon_id=F("best_machine_prediction_taxon_id"))
199-
200-
aggregates = qs.aggregate(
201-
total_occurrences=Count("pk"),
202-
verified_count=Count("pk", filter=verified_q),
203-
verified_with_prediction_count=Count("pk", filter=verified_q & has_pred_q),
204-
no_prediction_count=Count("pk", filter=verified_q & ~has_pred_q),
205-
agreed_exact_count=Count("pk", filter=exact_q),
215+
verified = len(verified_rows)
216+
no_prediction = sum(1 for r in verified_rows if r["best_machine_prediction_taxon_id"] is None)
217+
verified_with_pred = verified - no_prediction
218+
agreed_exact = sum(
219+
1
220+
for r in verified_rows
221+
if r["best_machine_prediction_taxon_id"] is not None
222+
and r["best_user_taxon_id"] == r["best_machine_prediction_taxon_id"]
206223
)
207224

208-
# Under-order: only the disagreement set hits Python, grouped by distinct
209-
# (user_taxon, machine_taxon) pair so each pair's LCA is computed once.
210-
disagreement_pairs = (
211-
qs.filter(verified_q & has_pred_q)
212-
.exclude(best_user_taxon_id=F("best_machine_prediction_taxon_id"))
213-
.values("best_user_taxon_id", "best_machine_prediction_taxon_id")
214-
.annotate(occurrence_count=Count("pk"))
215-
)
225+
# Dedupe disagreement pairs so each (user_taxon, machine_taxon) LCA runs once.
226+
pair_counts: collections.Counter = collections.Counter()
227+
for r in verified_rows:
228+
m_id = r["best_machine_prediction_taxon_id"]
229+
u_id = r["best_user_taxon_id"]
230+
if m_id is None or u_id is None or u_id == m_id:
231+
continue
232+
pair_counts[(u_id, m_id)] += 1
216233

217-
pairs = list(disagreement_pairs)
218-
needed_taxa_ids = {p["best_user_taxon_id"] for p in pairs} | {p["best_machine_prediction_taxon_id"] for p in pairs}
234+
needed_taxa_ids: set[int] = set()
235+
for u_id, m_id in pair_counts:
236+
needed_taxa_ids.add(u_id)
237+
needed_taxa_ids.add(m_id)
219238

220239
taxa_by_id: dict[int, TaxonTuple] = {}
221240
if needed_taxa_ids:
@@ -226,20 +245,16 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict:
226245
taxa_by_id[t.pk] = (t.pk, t.rank, parents)
227246

228247
under_order_disagreement_count = 0
229-
for pair in pairs:
230-
u = taxa_by_id.get(pair["best_user_taxon_id"])
231-
m = taxa_by_id.get(pair["best_machine_prediction_taxon_id"])
248+
for (u_id, m_id), count in pair_counts.items():
249+
u = taxa_by_id.get(u_id)
250+
m = taxa_by_id.get(m_id)
232251
if not u or not m:
233252
continue
234253
lca = lca_rank_between(u, m)
235254
if lca is not None and lca >= TaxonRank.ORDER:
236-
under_order_disagreement_count += pair["occurrence_count"]
255+
under_order_disagreement_count += count
237256

238-
agreed_exact = aggregates["agreed_exact_count"]
239257
agreed_under_order = agreed_exact + under_order_disagreement_count
240-
total = aggregates["total_occurrences"]
241-
verified = aggregates["verified_count"]
242-
verified_with_pred = aggregates["verified_with_prediction_count"]
243258

244259
def _pct(num: int, denom: int) -> float:
245260
return round(num / denom, 4) if denom else 0.0
@@ -249,7 +264,7 @@ def _pct(num: int, denom: int) -> float:
249264
"verified_count": verified,
250265
"verified_pct": _pct(verified, total),
251266
"verified_with_prediction_count": verified_with_pred,
252-
"no_prediction_count": aggregates["no_prediction_count"],
267+
"no_prediction_count": no_prediction,
253268
"agreed_exact_count": agreed_exact,
254269
"agreed_exact_pct": _pct(agreed_exact, verified_with_pred),
255270
"agreed_under_order_count": agreed_under_order,

0 commit comments

Comments
 (0)