Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ea9f45d
feat(taxa): per-taxon verification + agreement counts and verified fi…
mihow May 22, 2026
16b1468
perf(taxa): compute verification rollup in one pass, not per-taxon su…
mihow May 22, 2026
5d929ce
docs(taxa): clarify GIN index purpose + add rollup query-performance …
mihow May 26, 2026
10c72cb
fix(taxa): dedupe occurrences in verification rollup under collection…
mihow May 26, 2026
7dcf325
Merge remote-tracking branch 'origin/main' into worktree-taxa-verific…
mihow May 26, 2026
30955e3
chore(migrations): renumber parents_json GIN index 0085 -> 0087 after…
mihow May 26, 2026
b92b2b0
fix(taxa): make collection-filtered taxa list COUNT scale
mihow May 26, 2026
29bec78
fix(taxa): materialize observed-taxon id set instead of IN-subquery
mihow May 26, 2026
838f9d7
refactor(taxa): centralize per-taxon counts into one filtered-occurre…
mihow May 26, 2026
7f571be
fix(taxa): use conditional aggregation for dense per-taxon counts
mihow May 26, 2026
f20a05d
fix(taxa): drop redundant taxa filter from occurrences_count aggregate
mihow May 26, 2026
cf86550
fix(taxa): remove redundant TaxonCollectionFilter backend
mihow May 26, 2026
4f45681
docs(taxa): document sparse vs dense — when CASE breaks, when to use …
mihow May 26, 2026
e014a73
docs(taxa): next-session handoff — hybrid direct-aggregates + move to…
mihow May 26, 2026
04a62c6
refactor(taxa): move count logic to TaxonQuerySet, hybrid subquery/ag…
mihow May 26, 2026
20683ee
refactor(taxa): drop model-agreement counts, keep verification only
mihow May 26, 2026
02f9dc3
docs(taxa): consolidate PR #1317 findings into single reference, drop…
mihow May 27, 2026
c4132e9
Merge branch 'main' into worktree-taxa-verification-counts [skip ci]
mihow May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions ami/main/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,13 +588,27 @@ def get_taxa(self, obj):
return [{"id": taxon.id, "name": taxon.name} for taxon in obj.taxa.all()]


def agreement_requested(request: Request | None) -> bool:
    """
    Report whether the caller opted in to the heavier ``agreed_exact_count`` field.

    The opt-in is the ``with_agreement`` query parameter. It counts as enabled when its
    value, ignoring case and surrounding whitespace, is one of ``true``, ``1``, ``yes``
    or ``on``. A missing request, or a missing/empty parameter, means "not requested".

    Args:
        request: The request whose ``query_params`` are inspected, or ``None`` when no
            request is available.

    Returns:
        ``True`` if ``with_agreement`` carries a truthy value, otherwise ``False``.
    """
    if request is None:
        return False
    raw_value = request.query_params.get("with_agreement", "")
    # Normalise before comparing so " True " is treated the same as "true".
    return str(raw_value).strip().lower() in ("true", "1", "yes", "on")


class TaxonListSerializer(DefaultSerializer):
# latest_detection = DetectionNestedSerializer(read_only=True)
occurrences = serializers.SerializerMethodField()
parents = TaxonParentSerializer(many=True, read_only=True, source="parents_json")
parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent")
tags = serializers.SerializerMethodField()

def __init__(self, *args, **kwargs):
    """Build the serializer, dropping ``agreed_exact_count`` unless it was requested."""
    super().__init__(*args, **kwargs)
    # agreed_exact_count is a gated annotation: it stays only when with_agreement=true.
    request = self.context.get("request")
    if agreement_requested(request):
        return
    self.fields.pop("agreed_exact_count", None)

def get_tags(self, obj):
    """Return the serialized tags read from ``obj.prefetched_tags`` (empty list if unset)."""
    prefetched = getattr(obj, "prefetched_tags", [])
    serializer = TagSerializer(prefetched, many=True, context=self.context)
    return serializer.data
Expand All @@ -609,6 +623,9 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"agreed_with_prediction_count",
"agreed_exact_count",
"occurrences",
"tags",
"last_detected",
Expand Down Expand Up @@ -886,6 +903,9 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"agreed_with_prediction_count",
Comment thread
mihow marked this conversation as resolved.
Outdated
"agreed_exact_count",
"events_count",
"occurrences",
"gbif_taxon_key",
Expand Down
266 changes: 201 additions & 65 deletions ami/main/api/views.py

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions ami/main/migrations/0087_taxon_parents_json_gin_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from django.db import migrations


class Migration(migrations.Migration):
    """
    Create a GIN index on ``Taxon.parents_json``.

    The index serves the hierarchical (descendant) taxon filters that issue a literal
    ``parents_json @> [{"id": <id>}]`` containment, i.e. where the right-hand side is a
    constant:

    - the occurrence-list ``taxon=<id>`` filter (CustomOccurrenceDeterminationFilter)
    - the project default-taxa filter (build_occurrence_default_filters_q)

    It does NOT back the #1316 per-taxon verification / agreement rollup. A containment
    whose right-hand side is an OuterRef can't use the index, so that rollup is computed
    in a single Python pass over the (sparse) verified-occurrence set instead of a
    correlated subquery. See TaxonViewSet._annotate_verification_counts.

    The migration is non-atomic because CREATE INDEX CONCURRENTLY can't run inside a
    transaction. IF NOT EXISTS keeps it safe to co-exist with the same index if it lands
    separately via the #1307 follow-up.
    """

    # Required for the CONCURRENTLY statements below.
    atomic = False

    dependencies = [("main", "0086_sourceimage_recent_capture_index")]

    operations = [
        migrations.RunSQL(
            (
                "CREATE INDEX CONCURRENTLY IF NOT EXISTS "
                "main_taxon_parents_json_gin_idx "
                "ON main_taxon "
                "USING gin (parents_json jsonb_path_ops);"
            ),
            reverse_sql=(
                "DROP INDEX CONCURRENTLY IF EXISTS "
                "main_taxon_parents_json_gin_idx;"
            ),
        ),
    ]
12 changes: 12 additions & 0 deletions ami/main/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3817,6 +3817,18 @@ def best_determination_score(self) -> float | None:
# This is handled by an annotation if we are filtering by project, deployment or event
return None

def verified_count(self) -> int | None:
    """
    Return ``None`` by default.

    The real value is handled by an annotation when filtering by project
    (TaxonViewSet.annotate_taxon_counts).
    """
    return None

def agreed_with_prediction_count(self) -> int | None:
    """
    Return ``None`` by default.

    The real value is handled by an annotation when filtering by project
    (TaxonViewSet.annotate_taxon_counts).
    """
    return None

def agreed_exact_count(self) -> int | None:
    """
    Return ``None`` by default.

    The real value is handled by an annotation only when with_agreement is requested
    or on the detail view.
    """
    return None

def occurrence_images(self, limit: int | None = 10) -> list[str]:
    """
    Return an empty list by default; ``limit`` is not used here.

    The real value is handled by an annotation if we are filtering by project,
    deployment or event.
    """
    return []
Expand Down
152 changes: 152 additions & 0 deletions ami/main/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4761,3 +4761,155 @@ def test_registration_order_preserves_occurrence_retrieve(self):
retrieve_response = self.client.get(f"/api/v2/occurrences/{occurrence.pk}/?project_id={self.project.pk}")
self.assertEqual(stats_response.status_code, 200, "stats URL must resolve")
self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work")


class TestTaxaVerification(APITestCase):
    """
    Per-taxon verification + human/model agreement annotations and the verified filter (#1316).

    Every test runs against the fixture built in ``setUp``: four ML-determined occurrences
    (three "Vanessa cardui", one "Vanessa itea") and three human identifications, one per
    cardui occurrence. The tests call the ``/api/v2/taxa/`` list and detail endpoints,
    always scoped with ``project_id``.
    """

    def setUp(self):
        """Create a fresh project, its occurrences, a superuser and three identifications."""
        self.project, self.deployment = setup_test_project(reuse=False)
        self.taxa_list = create_taxa(self.project)
        # The tests below treat order/family/genus as ancestors of the three Vanessa species.
        self.order = Taxon.objects.get(name="Lepidoptera")
        self.family = Taxon.objects.get(name="Nymphalidae")
        self.genus = Taxon.objects.get(name="Vanessa")
        self.cardui = Taxon.objects.get(name="Vanessa cardui")
        self.atalanta = Taxon.objects.get(name="Vanessa atalanta")
        self.itea = Taxon.objects.get(name="Vanessa itea")

        create_captures(deployment=self.deployment, num_nights=1, images_per_night=3)
        # 3 occurrences ML-determined to cardui, 1 to itea (left unverified)
        create_occurrences(deployment=self.deployment, num=3, taxon=self.cardui, determination_score=0.9)
        create_occurrences(deployment=self.deployment, num=1, taxon=self.itea, determination_score=0.9)

        self.user = User.objects.create_user(email="verifier@insectai.org", is_staff=True, is_superuser=True)
        self.client.force_authenticate(user=self.user)

        # Ordered by pk so the three roles below are assigned deterministically.
        cardui_occ = list(Occurrence.objects.filter(project=self.project, determination=self.cardui).order_by("pk"))
        self.assertEqual(len(cardui_occ), 3)
        self.occ_pred, self.occ_exact, self.occ_disagree = cardui_occ

        # occ_pred: user agrees with the model prediction (cardui), agreed_with_prediction set
        Identification.objects.create(
            occurrence=self.occ_pred,
            taxon=self.cardui,
            user=self.user,
            agreed_with_prediction=self.occ_pred.best_prediction,
        )
        # occ_exact: same taxon as the model, but not via the "agree" workflow
        Identification.objects.create(occurrence=self.occ_exact, taxon=self.cardui, user=self.user)
        # occ_disagree: user overrides to a different taxon (atalanta) than the model (cardui)
        Identification.objects.create(occurrence=self.occ_disagree, taxon=self.atalanta, user=self.user)

        self.itea_occ = Occurrence.objects.get(project=self.project, determination=self.itea)
        # limit=1000 is presumably large enough to return every taxon on one page — TODO confirm.
        self.list_url = f"/api/v2/taxa/?project_id={self.project.pk}&limit=1000"

    def _detail(self, taxon):
        """GET the project-scoped detail endpoint for ``taxon``, assert 200, return the JSON body."""
        res = self.client.get(f"/api/v2/taxa/{taxon.pk}/?project_id={self.project.pk}")
        self.assertEqual(res.status_code, status.HTTP_200_OK)
        return res.json()

    def _list_by_name(self, url=None):
        """GET ``url`` (default: ``self.list_url``), assert 200, return the result rows keyed by ``name``."""
        res = self.client.get(url or self.list_url)
        self.assertEqual(res.status_code, status.HTTP_200_OK)
        return {row["name"]: row for row in res.json()["results"]}

    # --- verified_count (hierarchical rollup) ---

    def test_verified_count_species(self):
        # cardui: occ_pred + occ_exact; atalanta: occ_disagree; itea: never identified.
        self.assertEqual(self._detail(self.cardui)["verified_count"], 2)
        self.assertEqual(self._detail(self.atalanta)["verified_count"], 1)
        self.assertEqual(self._detail(self.itea)["verified_count"], 0)

    def test_verified_count_rolls_up_to_ancestors(self):
        # Verifying species marks genus/family/order verified, occurrence-weighted by descendants.
        for ancestor in (self.genus, self.family, self.order):
            self.assertEqual(self._detail(ancestor)["verified_count"], 3, ancestor.name)

    # --- agreed_with_prediction_count (chosen identification only) ---

    def test_agreed_with_prediction_counts_only_chosen_identification(self):
        # Only occ_pred's identification was created with agreed_with_prediction set.
        self.assertEqual(self._detail(self.cardui)["agreed_with_prediction_count"], 1)
        self.assertEqual(self._detail(self.atalanta)["agreed_with_prediction_count"], 0)
        # Rolls up: only occ_pred contributes under the genus.
        self.assertEqual(self._detail(self.genus)["agreed_with_prediction_count"], 1)

    # --- agreed_exact_count (gated) ---

    def test_agreed_exact_count_on_detail(self):
        # The detail view is expected to include agreed_exact_count without with_agreement.
        # occ_pred + occ_exact: user determination == top machine prediction (cardui).
        self.assertEqual(self._detail(self.cardui)["agreed_exact_count"], 2)
        # occ_disagree: user picked atalanta, model said cardui → not exact.
        self.assertEqual(self._detail(self.atalanta)["agreed_exact_count"], 0)
        self.assertEqual(self._detail(self.genus)["agreed_exact_count"], 2)

    def test_agreed_exact_count_gated_on_list(self):
        # Without with_agreement the list omits agreed_exact_count but keeps the other two counts.
        rows = self._list_by_name()
        self.assertIn("verified_count", rows["Vanessa cardui"])
        self.assertIn("agreed_with_prediction_count", rows["Vanessa cardui"])
        self.assertNotIn("agreed_exact_count", rows["Vanessa cardui"])

        # With with_agreement=true the field is present and populated.
        rows = self._list_by_name(self.list_url + "&with_agreement=true")
        self.assertIn("agreed_exact_count", rows["Vanessa cardui"])
        self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2)

    # --- list field values ---

    def test_list_field_values(self):
        rows = self._list_by_name()
        # 3 cardui occurrences were created but 2 are expected here — presumably because
        # occ_disagree is now determined as atalanta after the user's identification; confirm
        # against the Identification save logic.
        self.assertEqual(rows["Vanessa cardui"]["occurrences_count"], 2)
        self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2)
        self.assertEqual(rows["Vanessa cardui"]["agreed_with_prediction_count"], 1)
        self.assertEqual(rows["Vanessa atalanta"]["verified_count"], 1)
        self.assertEqual(rows["Vanessa itea"]["verified_count"], 0)

    # --- verified=true|false filter ---

    def test_verified_filter_true_false_complement(self):
        all_names = set(self._list_by_name().keys())
        verified = set(self._list_by_name(self.list_url + "&verified=true").keys())
        unverified = set(self._list_by_name(self.list_url + "&verified=false").keys())
        self.assertEqual(verified, {"Vanessa cardui", "Vanessa atalanta"})
        self.assertEqual(unverified, {"Vanessa itea"})
        # verified=false is the strict complement of verified=true on the filtered set.
        self.assertEqual(verified | unverified, all_names)
        self.assertEqual(verified & unverified, set())

    def test_ordering_by_verified_count(self):
        # ordering=verified_count must return rows in ascending verified_count order.
        res = self.client.get(self.list_url + "&ordering=verified_count")
        self.assertEqual(res.status_code, status.HTTP_200_OK)
        counts = [row["verified_count"] for row in res.json()["results"]]
        self.assertEqual(counts, sorted(counts))

    # --- apply_defaults handling ---

    def test_verified_filter_respects_apply_defaults(self):
        # Excluding atalanta through the project's default filters hides it from verified=true ...
        self.project.default_filters_exclude_taxa.add(self.atalanta)

        verified_default = set(self._list_by_name(self.list_url + "&verified=true").keys())
        self.assertEqual(verified_default, {"Vanessa cardui"})

        # ... unless the caller bypasses the defaults with apply_defaults=false.
        verified_bypassed = set(self._list_by_name(self.list_url + "&verified=true&apply_defaults=false").keys())
        self.assertEqual(verified_bypassed, {"Vanessa cardui", "Vanessa atalanta"})

    # --- collection filter must not inflate counts via the detections join ---

    def test_verified_count_not_inflated_by_collection_join(self):
        # A second detection on a verified occurrence means the ?collection= INNER JOIN to
        # detections yields two rows for that occurrence; the rollup must still count it once.
        extra_detection = Detection.objects.create(
            source_image=self.occ_exact.best_detection.source_image,
            occurrence=self.occ_exact,
            timestamp=self.occ_exact.best_detection.timestamp,
            bbox=[0.5, 0.5, 0.6, 0.6],
            path="detections/test_detection_dup.jpg",
        )
        extra_detection.classifications.create(taxon=self.cardui, score=0.9, timestamp=datetime.datetime.now())
        # Precondition: occ_exact now has exactly two detections.
        self.assertEqual(self.occ_exact.detections.count(), 2)

        # The collection holds every source image of the deployment.
        collection = SourceImageCollection.objects.create(project=self.project, name="verif-dedup")
        collection.images.set(SourceImage.objects.filter(deployment=self.deployment))

        rows = self._list_by_name(f"{self.list_url}&collection={collection.pk}&with_agreement=true")
        # 2 verified cardui occurrences, not 3 — the duplicate detection must not double-count.
        self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2)
        self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2)
Loading
Loading