diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index a34e7d561..6d70906e1 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -609,6 +609,7 @@ class Meta: "parents", "details", "occurrences_count", + "verified_count", "occurrences", "tags", "last_detected", @@ -886,6 +887,7 @@ class Meta: "parents", "details", "occurrences_count", + "verified_count", "events_count", "occurrences", "gbif_taxon_key", diff --git a/ami/main/api/views.py b/ami/main/api/views.py index d79779074..5d21b9b20 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -6,7 +6,6 @@ from django.core import exceptions from django.db import models from django.db.models import OuterRef, Prefetch, Q, Subquery -from django.db.models.functions import Coalesce from django.db.models.query import QuerySet from django.forms import BooleanField, CharField, IntegerField from django.shortcuts import get_object_or_404 @@ -1026,7 +1025,9 @@ class CustomOccurrenceDeterminationFilter(CustomTaxonFilter): def filter_queryset(self, request, queryset, view): taxon = self.get_filter_taxon(request, query_params=self.query_params) if taxon: - # Here the queryset is the Occurrence queryset + # Here the queryset is the Occurrence queryset. + # The literal parents_json containment (constant RHS) is what the GIN index from + # migration 0087 serves — this hierarchical taxon filter is the index's main consumer. return queryset.filter( models.Q(determination=taxon) | models.Q(determination__parents_json__contains=[{"id": taxon.pk}]) ) @@ -1201,22 +1202,6 @@ def filter_queryset(self, request, queryset, view): return queryset -class TaxonCollectionFilter(filters.BaseFilterBackend): - """ - Filter taxa by the capture set their occurrences belong to. - """ - - query_param = "collection" - - def filter_queryset(self, request, queryset, view): - collection_id = IntegerField(required=False).clean(request.query_params.get(self.query_param)) - if collection_id: - # Here the queryset is the Taxon queryset - return queryset.filter(occurrences__detections__source_image__collections=collection_id) - else: - return queryset - - class OccurrenceViewSet(DefaultViewSet, ProjectMixin): """ API endpoint that allows occurrences to be viewed or edited. @@ -1456,9 +1441,13 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin): queryset = Taxon.objects.all().defer("notes") serializer_class = TaxonSerializer + # ``?collection=`` is handled inside get_taxa_observed (via get_occurrence_filters + # + TaxonQuerySet.with_observation_counts_aggregated + HAVING). A dedicated + # filter_backends entry that re-applied the collection filter on the main queryset + # would add a redundant JOIN that the planner cannot reconcile with the + # conditional-aggregate GROUP BY, turning the page into a multi-minute scan. filter_backends = DefaultViewSetMixin.filter_backends + [ CustomTaxonFilter, - TaxonCollectionFilter, TaxonTaxaListFilter, TaxonBestScoreFilter, TaxonTagFilter, @@ -1477,6 +1466,7 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin): "created_at", "updated_at", "occurrences_count", + "verified_count", "last_detected", "best_determination_score", "name", @@ -1533,12 +1523,17 @@ def get_serializer_class(self): else: return TaxonSerializer - def get_occurrence_filters(self, project: Project) -> models.Q: + def get_occurrence_filters(self, project: Project, accessor: str = "") -> models.Q: """ - Filter taxa by when/where it has occurred. + Filter by when/where a taxon has occurred. Supports querying by occurrence, project, deployment, or event. + ``accessor`` is the relation path to the Occurrence model. Pass "" to filter the + Occurrence model directly, or "occurrences" to filter the Taxon model via its + reverse relation (for conditional aggregation in + :meth:`TaxonQuerySet.with_observation_counts_aggregated`). + @TODO Consider using a custom filter class for this (see get_filter_name) @TODO Move this to a custom QuerySet manager on the Taxon model """ @@ -1550,12 +1545,12 @@ def get_occurrence_filters(self, project: Project) -> models.Q: event_id = self.request.query_params.get("event") or self.request.query_params.get("occurrences__event") collection_id = self.request.query_params.get("collection") - # filter_active = any([occurrence_id, project, deployment_id, event_id, collection_id]) + prefix = f"{accessor}__" if accessor else "" - filters = models.Q( - project=project, - event__isnull=False, - ) + def field(path: str) -> str: + return f"{prefix}{path}" + + filters = models.Q(**{field("project"): project, field("event__isnull"): False}) try: """ Ensure that the related objects exist before filtering by them. @@ -1564,16 +1559,16 @@ def get_occurrence_filters(self, project: Project) -> models.Q: if occurrence_id: Occurrence.objects.get(id=occurrence_id) # This query does not need the same filtering as the others - filters &= models.Q(id=occurrence_id) + filters &= models.Q(**{field("id"): occurrence_id}) if deployment_id: Deployment.objects.get(id=deployment_id) - filters &= models.Q(deployment=deployment_id) + filters &= models.Q(**{field("deployment"): deployment_id}) if event_id: Event.objects.get(id=event_id) - filters &= models.Q(event=event_id) + filters &= models.Q(**{field("event"): event_id}) if collection_id: SourceImageCollection.objects.get(id=collection_id) - filters &= models.Q(detections__source_image__collections=collection_id) + filters &= models.Q(**{field("detections__source_image__collections"): collection_id}) except exceptions.ObjectDoesNotExist as e: # Raise a 404 if any of the related objects don't exist raise NotFound(detail=str(e)) @@ -1630,81 +1625,67 @@ def get_taxa_observed( apply_default_score_filter=True, apply_default_taxa_filter=True, ) -> QuerySet: - """ - If a project is passed, only return taxa that have been observed. - Also add the number of occurrences and the last time it was detected. - - Uses efficient subqueries with default filters applied directly via Q objects - to leverage composite indexes on (determination_id, project_id, event_id, determination_score). - This avoids the N+1 query problem by building a single Q filter that can be reused - across all subqueries. - """ - occurrence_filters = self.get_occurrence_filters(project) + """Annotate per-(project, taxon) counts and optionally restrict to observed taxa. + + Two SQL shapes for the direct aggregates (``occurrences_count`` / + ``best_determination_score`` / ``last_detected``): + + - **Default / event / deployment / verified paths** — correlated ``Subquery`` + annotations, index-served by the composite + ``(determination_id, project_id, event_id, determination_score)`` index on + Occurrence. Membership via materialised ``id__in``. + - **``?collection=``** — conditional aggregation over the Taxon→occurrences + reverse relation. The detections join would turn each correlated subquery into + a per-row scan, so we switch to one GROUP BY. Membership via HAVING. + + The sparse verification rollup (``verified_count`` / ``agreed_*``) is the same on + either path — a Python pass over the verified subset applied as ``CASE`` + annotations, see :meth:`TaxonQuerySet.with_verification_counts`. + """ + request = self.request + use_aggregation = "collection" in request.query_params + direct_filters = self.get_occurrence_filters(project) + + if use_aggregation: + relation_filters = self.get_occurrence_filters(project, accessor="occurrences") + qs = qs.with_observation_counts_aggregated( + project, + request, + relation_occurrence_filters=relation_filters, + apply_default_score_filter=apply_default_score_filter, + ) + if not include_unobserved: + qs = qs.filter(occurrences_count__gt=0) + else: + qs = qs.with_observation_counts_subqueries( + project, + request, + occurrence_filters=direct_filters, + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + if not include_unobserved: + qs = qs.observed_in_project_subqueries( + project, + request, + occurrence_filters=direct_filters, + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) - # Build a single Q filter for default filters (score threshold + taxa filters) - # This creates an efficient filter that works with composite indexes - # Respects apply_defaults flag: build_occurrence_default_filters_q checks it internally - from ami.main.models_future.filters import build_occurrence_default_filters_q + verified_param: bool | None = None + if self.action == "list" and "verified" in request.query_params: + verified_param = BooleanField(required=False).clean(request.query_params.get("verified")) - default_filters_q = build_occurrence_default_filters_q( + return qs.with_verification_counts( project, - self.request, - occurrence_accessor="", + request, + occurrence_filters=direct_filters, apply_default_score_filter=apply_default_score_filter, apply_default_taxa_filter=apply_default_taxa_filter, + verified=verified_param, ) - # Combine base occurrence filters with default filters - base_filter = models.Q( - occurrence_filters, - determination_id=models.OuterRef("id"), - ) - - base_filter = base_filter & default_filters_q - - # Count occurrences - uses composite index (determination_id, project_id, event_id, determination_score) - occurrences_count_subquery = models.Subquery( - Occurrence.objects.filter(base_filter) - .values("determination_id") - .annotate(count=models.Count("id")) - .values("count")[:1], - output_field=models.IntegerField(), - ) - - # Get best score - uses same composite index - best_score_subquery = models.Subquery( - Occurrence.objects.filter(base_filter) - .values("determination_id") - .annotate(max_score=models.Max("determination_score")) - .values("max_score")[:1], - output_field=models.FloatField(), - ) - - # Get last detected timestamp - requires join with detections - last_detected_subquery = models.Subquery( - Occurrence.objects.filter( - base_filter, - detections__timestamp__isnull=False, - ) - .values("determination_id") - .annotate(last_detected=models.Max("detections__timestamp")) - .values("last_detected")[:1], - output_field=models.DateTimeField(), - ) - - # Apply annotations - qs = qs.annotate( - occurrences_count=Coalesce(occurrences_count_subquery, 0), - best_determination_score=best_score_subquery, - last_detected=last_detected_subquery, - ) - - if not include_unobserved: - # Efficient EXISTS check that uses the composite index - qs = qs.filter(models.Exists(Occurrence.objects.filter(base_filter))) - - return qs - def attach_tags_by_project(self, qs: QuerySet, project: Project) -> QuerySet: """ Prefetch and override the `.tags` attribute on each Taxon diff --git a/ami/main/migrations/0087_taxon_parents_json_gin_index.py b/ami/main/migrations/0087_taxon_parents_json_gin_index.py new file mode 100644 index 000000000..6ffc93127 --- /dev/null +++ b/ami/main/migrations/0087_taxon_parents_json_gin_index.py @@ -0,0 +1,36 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + """ + GIN index on Taxon.parents_json to support the hierarchical (descendant) taxon + filters that issue a literal `parents_json @> [{"id": }]` containment: the + occurrence-list `taxon=` filter (CustomOccurrenceDeterminationFilter) and the + project default-taxa filter (build_occurrence_default_filters_q). The index applies + to these because the right-hand side is a constant. + + Note it does NOT back the #1316 per-taxon verification / agreement rollup: that is + computed in a single Python pass over the (sparse) verified-occurrence set rather + than a correlated subquery, because a containment whose RHS is an OuterRef can't use + the index. See TaxonViewSet._annotate_verification_counts. + + CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration is + non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if it lands + separately via the #1307 follow-up. + """ + + atomic = False + + dependencies = [ + ("main", "0086_sourceimage_recent_capture_index"), + ] + + operations = [ + migrations.RunSQL( + sql=( + "CREATE INDEX CONCURRENTLY IF NOT EXISTS main_taxon_parents_json_gin_idx " + "ON main_taxon USING gin (parents_json jsonb_path_ops);" + ), + reverse_sql="DROP INDEX CONCURRENTLY IF EXISTS main_taxon_parents_json_gin_idx;", + ), + ] diff --git a/ami/main/models.py b/ami/main/models.py index c1616754f..3d21c07cd 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3419,15 +3419,213 @@ def update_occurrence_determination( return needs_update +def _case_from_map(mapping: dict, default, output_field: models.Field) -> models.expressions.Combinable: + """Turn a precomputed ``{taxon_id: value}`` map into a constant-time ``CASE``. + + The result is constant per row, so it is DB-sortable, paginatable, and stripped from + the pagination ``COUNT`` — unlike a per-taxon correlated subquery, which is + re-evaluated for every row and (in ``COUNT``) for every taxon in the project. Only + sparse maps work here: one ``When`` per entry blows past sqlparse's 10000-token + limit at ~hundreds of taxa × multiple columns. + """ + if not mapping: + return models.Value(default, output_field=output_field) + return models.Case( + *( + models.When(id=taxon_id, then=models.Value(value, output_field=output_field)) + for taxon_id, value in mapping.items() + ), + default=models.Value(default, output_field=output_field), + output_field=output_field, + ) + + class TaxonQuerySet(BaseQuerySet): - def with_occurrence_counts(self, project: Project): - """ - Annotate each taxon with the count of its occurrences for a given project. - """ - qs = self - qs = qs.filter(occurrences__project=project) + def with_observation_counts_subqueries( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + ): + """Annotate ``occurrences_count`` / ``best_determination_score`` / ``last_detected`` + via three correlated ``Subquery`` annotations. + + Index-served by the composite ``(determination_id, project_id, event_id, + determination_score)`` index on Occurrence. Use this on non-collection paths. + When ``occurrence_filters`` joins detections (e.g. ``?collection=``) the + correlated form degrades to a per-row scan; use + :meth:`with_observation_counts_aggregated` instead. + """ + default_filters_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + base_filter = models.Q(occurrence_filters, determination_id=OuterRef("id")) & default_filters_q + + occurrences_count_subquery = models.Subquery( + Occurrence.objects.filter(base_filter) + .values("determination_id") + .annotate(count=models.Count("id")) + .values("count")[:1], + output_field=models.IntegerField(), + ) + best_score_subquery = models.Subquery( + Occurrence.objects.filter(base_filter) + .values("determination_id") + .annotate(max_score=models.Max("determination_score")) + .values("max_score")[:1], + output_field=models.FloatField(), + ) + last_detected_subquery = models.Subquery( + Occurrence.objects.filter(base_filter, detections__timestamp__isnull=False) + .values("determination_id") + .annotate(last_detected=models.Max("detections__timestamp")) + .values("last_detected")[:1], + output_field=models.DateTimeField(), + ) + return self.annotate( + occurrences_count=Coalesce(occurrences_count_subquery, 0), + best_determination_score=best_score_subquery, + last_detected=last_detected_subquery, + ) + + def with_observation_counts_aggregated( + self, + project: Project, + request: Request | None, + *, + relation_occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + ): + """Annotate ``occurrences_count`` / ``best_determination_score`` / ``last_detected`` + via conditional aggregation over the Taxon→occurrences reverse relation. + + Required when ``relation_occurrence_filters`` joins detections (``?collection=``), + where the correlated-subquery form degrades to per-row scans. One GROUP BY, + constant-size SQL. ``Count(distinct)`` dedupes the detections-join fan-out. + + The default *taxa* include/exclude filter is deliberately omitted from + ``count_filter``: it is redundant with row-level + :meth:`filter_by_project_default_taxa`, and including it adds a + ``parents_json`` containment join inside the aggregate that the planner cannot + reconcile with the detections join (measured: 0.3s → 182s on a ~1k-taxa project). + Score threshold is per-occurrence so it stays. + """ + count_filter = relation_occurrence_filters & build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="occurrences", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=False, + ) + return self.annotate( + occurrences_count=models.Count("occurrences", filter=count_filter, distinct=True), + best_determination_score=models.Max("occurrences__determination_score", filter=count_filter), + last_detected=models.Max("occurrences__detections__timestamp", filter=count_filter), + ) - return qs.annotate(occurrence_count=models.Count("occurrences", distinct=True)) + def observed_in_project_subqueries( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + ): + """Restrict the queryset to taxa observed in the filtered occurrence set, via a + materialised ``id__in``. Pair with :meth:`with_observation_counts_subqueries`. + + The materialised form runs the (potentially detections-joined) filter exactly + once and leaves the pagination ``COUNT`` / page as a plain indexed ``id IN + (...)``. Aggregation-path callers should use ``.filter(occurrences_count__gt=0)`` + (HAVING) instead. + """ + default_filters_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + observed_taxon_ids = list( + Occurrence.objects.filter(occurrence_filters) + .filter(default_filters_q) + .filter(determination_id__isnull=False) + .values_list("determination_id", flat=True) + .distinct() + ) + return self.filter(id__in=observed_taxon_ids) + + def with_verification_counts( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + verified: bool | None = None, + ): + """Annotate ``verified_count`` and optionally apply the + ``verified=true|false`` filter. + + Counts roll up descendant occurrences (verifying a species also counts toward + its genus / family rows). They concern only *verified* occurrences (those with a + non-withdrawn ``Identification``) — sparse relative to all occurrences — so the + hierarchical rollup is a single Python pass over that small subset applied as + constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per + taxon would not scale (GIN can't serve a containment with an ``OuterRef`` RHS). + + Model-agreement counts (whether the chosen identification matched the model's + top prediction) are tracked separately — see issue #1319. + """ + default_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + verified_occurrences = ( + Occurrence.objects.filter(occurrence_filters) + .filter(default_q) + .filter(Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False))) + ) + # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when + # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence + # yields a row per matching Detection, which would otherwise inflate counts. + value_fields = ["pk", "determination_id", "determination__parents_json"] + + verified_counts: dict[int, int] = {} + for row in verified_occurrences.values(*value_fields).distinct(): + determination_id = row["determination_id"] + taxon_ids: set[int] = set() + if determination_id is not None: + taxon_ids.add(determination_id) + for parent in row["determination__parents_json"] or []: + # parents_json round-trips through the pydantic schema field, so elements + # may be dicts or ``TaxonParent`` objects depending on the query path. + parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) + if parent_id is not None: + taxon_ids.add(int(parent_id)) + for taxon_id in taxon_ids: + verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1 + + qs = self.annotate(verified_count=_case_from_map(verified_counts, 0, models.IntegerField())) + + if verified is True: + qs = qs.filter(id__in=list(verified_counts.keys())) + elif verified is False: + qs = qs.exclude(id__in=list(verified_counts.keys())) + + return qs def filter_by_project_default_taxa(self, project: Project | None = None, request: Request | None = None): """ @@ -3817,6 +4015,10 @@ def best_determination_score(self) -> float | None: # This is handled by an annotation if we are filtering by project, deployment or event return None + def verified_count(self) -> int | None: + # Handled by an annotation when filtering by project (TaxonQuerySet.with_verification_counts) + return None + def occurrence_images(self, limit: int | None = 10) -> list[str]: # This is handled by an annotation if we are filtering by project, deployment or event return [] diff --git a/ami/main/tests.py b/ami/main/tests.py index 517c4c87b..3352bd2d0 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4761,3 +4761,126 @@ def test_registration_order_preserves_occurrence_retrieve(self): retrieve_response = self.client.get(f"/api/v2/occurrences/{occurrence.pk}/?project_id={self.project.pk}") self.assertEqual(stats_response.status_code, 200, "stats URL must resolve") self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work") + + +class TestTaxaVerification(APITestCase): + """Per-taxon verification + human/model agreement annotations and the verified filter (#1316).""" + + def setUp(self): + self.project, self.deployment = setup_test_project(reuse=False) + self.taxa_list = create_taxa(self.project) + self.order = Taxon.objects.get(name="Lepidoptera") + self.family = Taxon.objects.get(name="Nymphalidae") + self.genus = Taxon.objects.get(name="Vanessa") + self.cardui = Taxon.objects.get(name="Vanessa cardui") + self.atalanta = Taxon.objects.get(name="Vanessa atalanta") + self.itea = Taxon.objects.get(name="Vanessa itea") + + create_captures(deployment=self.deployment, num_nights=1, images_per_night=3) + # 3 occurrences ML-determined to cardui, 1 to itea (left unverified) + create_occurrences(deployment=self.deployment, num=3, taxon=self.cardui, determination_score=0.9) + create_occurrences(deployment=self.deployment, num=1, taxon=self.itea, determination_score=0.9) + + self.user = User.objects.create_user(email="verifier@insectai.org", is_staff=True, is_superuser=True) + self.client.force_authenticate(user=self.user) + + cardui_occ = list(Occurrence.objects.filter(project=self.project, determination=self.cardui).order_by("pk")) + self.assertEqual(len(cardui_occ), 3) + self.occ_pred, self.occ_exact, self.occ_disagree = cardui_occ + + # occ_pred: user agrees with the model prediction (cardui), agreed_with_prediction set + Identification.objects.create( + occurrence=self.occ_pred, + taxon=self.cardui, + user=self.user, + agreed_with_prediction=self.occ_pred.best_prediction, + ) + # occ_exact: same taxon as the model, but not via the "agree" workflow + Identification.objects.create(occurrence=self.occ_exact, taxon=self.cardui, user=self.user) + # occ_disagree: user overrides to a different taxon (atalanta) than the model (cardui) + Identification.objects.create(occurrence=self.occ_disagree, taxon=self.atalanta, user=self.user) + + self.itea_occ = Occurrence.objects.get(project=self.project, determination=self.itea) + self.list_url = f"/api/v2/taxa/?project_id={self.project.pk}&limit=1000" + + def _detail(self, taxon): + res = self.client.get(f"/api/v2/taxa/{taxon.pk}/?project_id={self.project.pk}") + self.assertEqual(res.status_code, status.HTTP_200_OK) + return res.json() + + def _list_by_name(self, url=None): + res = self.client.get(url or self.list_url) + self.assertEqual(res.status_code, status.HTTP_200_OK) + return {row["name"]: row for row in res.json()["results"]} + + # --- verified_count (hierarchical rollup) --- + + def test_verified_count_species(self): + self.assertEqual(self._detail(self.cardui)["verified_count"], 2) + self.assertEqual(self._detail(self.atalanta)["verified_count"], 1) + self.assertEqual(self._detail(self.itea)["verified_count"], 0) + + def test_verified_count_rolls_up_to_ancestors(self): + # Verifying species marks genus/family/order verified, occurrence-weighted by descendants. + for ancestor in (self.genus, self.family, self.order): + self.assertEqual(self._detail(ancestor)["verified_count"], 3, ancestor.name) + + # --- list field values --- + + def test_list_field_values(self): + rows = self._list_by_name() + self.assertEqual(rows["Vanessa cardui"]["occurrences_count"], 2) + self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) + self.assertEqual(rows["Vanessa atalanta"]["verified_count"], 1) + self.assertEqual(rows["Vanessa itea"]["verified_count"], 0) + + # --- verified=true|false filter --- + + def test_verified_filter_true_false_complement(self): + all_names = set(self._list_by_name().keys()) + verified = set(self._list_by_name(self.list_url + "&verified=true").keys()) + unverified = set(self._list_by_name(self.list_url + "&verified=false").keys()) + self.assertEqual(verified, {"Vanessa cardui", "Vanessa atalanta"}) + self.assertEqual(unverified, {"Vanessa itea"}) + # verified=false is the strict complement of verified=true on the filtered set. + self.assertEqual(verified | unverified, all_names) + self.assertEqual(verified & unverified, set()) + + def test_ordering_by_verified_count(self): + res = self.client.get(self.list_url + "&ordering=verified_count") + self.assertEqual(res.status_code, status.HTTP_200_OK) + counts = [row["verified_count"] for row in res.json()["results"]] + self.assertEqual(counts, sorted(counts)) + + # --- apply_defaults handling --- + + def test_verified_filter_respects_apply_defaults(self): + self.project.default_filters_exclude_taxa.add(self.atalanta) + + verified_default = set(self._list_by_name(self.list_url + "&verified=true").keys()) + self.assertEqual(verified_default, {"Vanessa cardui"}) + + verified_bypassed = set(self._list_by_name(self.list_url + "&verified=true&apply_defaults=false").keys()) + self.assertEqual(verified_bypassed, {"Vanessa cardui", "Vanessa atalanta"}) + + # --- collection filter must not inflate counts via the detections join --- + + def test_verified_count_not_inflated_by_collection_join(self): + # A second detection on a verified occurrence means the ?collection= INNER JOIN to + # detections yields two rows for that occurrence; the rollup must still count it once. + extra_detection = Detection.objects.create( + source_image=self.occ_exact.best_detection.source_image, + occurrence=self.occ_exact, + timestamp=self.occ_exact.best_detection.timestamp, + bbox=[0.5, 0.5, 0.6, 0.6], + path="detections/test_detection_dup.jpg", + ) + extra_detection.classifications.create(taxon=self.cardui, score=0.9, timestamp=datetime.datetime.now()) + self.assertEqual(self.occ_exact.detections.count(), 2) + + collection = SourceImageCollection.objects.create(project=self.project, name="verif-dedup") + collection.images.set(SourceImage.objects.filter(deployment=self.deployment)) + + rows = self._list_by_name(f"{self.list_url}&collection={collection.pk}") + # 2 verified cardui occurrences, not 3 — the duplicate detection must not double-count. + self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md new file mode 100644 index 000000000..a3055a785 --- /dev/null +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -0,0 +1,266 @@ +# Per-taxon rollup counts on the taxa endpoint — query patterns and pitfalls + +How to compute per-taxon counts on `GET /api/v2/taxa/` that roll up +descendant occurrences (a Family/Order row aggregating its species) without +timing out the list endpoint on large projects. Generalises to any "aggregate +over a taxon node and all its `parents_json` descendants" problem. + +Canonical reference for PR #1317 (verification status on taxa views, issue +#1316) and follow-up #1319 (model-agreement stats — deferred). + +Reference implementations in `ami/main/models.py`: + +- `TaxonQuerySet.with_observation_counts_subqueries` (default path, correlated `Subquery`) +- `TaxonQuerySet.with_observation_counts_aggregated` (collection path, conditional aggregation) +- `TaxonQuerySet.observed_in_project_subqueries` (membership filter for default path) +- `TaxonQuerySet.with_verification_counts` (sparse `CASE`-from-map rollup) + +Reference orchestrator in `ami/main/api/views.py`: + +- `TaxonViewSet.get_taxa_observed` (dispatches between the two count shapes) + +## TL;DR + +- **Do not** roll up with a per-taxon correlated `parents_json @> [{"id": OuterRef}]` + subquery. A containment whose RHS is an `OuterRef` can't use the GIN index, so it + degrades to a per-row seq-scan that runs once per page row **and** once per taxon in + the pagination `COUNT`. +- **Do** precompute when the driving data is *sparse*: one pass over the small set, + build `{taxon_id: count}` dicts (incrementing the determination taxon + every + ancestor in `parents_json`), then apply as constant-time `CASE` annotations. Resolve + any membership filter from the same set via `id__in`. +- For *dense* per-taxon aggregates (one value per observed taxon: occurrences_count, + best_determination_score, last_detected) the right shape depends on whether the + occurrence filter joins detections: correlated `Subquery` per row when it does not, + conditional aggregation over the reverse relation when it does. +- The GIN index (`main_taxon_parents_json_gin_idx`, migration `0087`) only helps the + **literal**-RHS containment filters (occurrence-list `taxon=`, + `build_occurrence_default_filters_q`), not correlated ones. + +## When to use which mechanism + +| Driving set | Shape | Method | Why | +|---|---|---|---| +| Sparse (e.g. verified subset, bounded by review effort) | Python pass + `CASE`-from-map | `with_verification_counts` | Constant-time per row, DB-sortable, stripped from pagination `COUNT`. Sparse maps stay under sqlparse's 10000-token limit. | +| Dense, no detections join (default / event / deployment / verified / ordering paths) | Three correlated `Subquery` annotations | `with_observation_counts_subqueries` + `observed_in_project_subqueries` | Index-served by the composite `(determination_id, project_id, event_id, determination_score)` index on Occurrence. Membership materialised as `id__in`. | +| Dense, detections join in play (`?collection=`) | Conditional aggregation over the Taxon→occurrences reverse relation | `with_observation_counts_aggregated` | One `GROUP BY`, constant-size SQL. The detections join turns each correlated `Subquery` into a per-row scan; the aggregate dedupes via `Count(distinct)` and replaces the membership query with a HAVING (`occurrences_count__gt=0`). | + +The dispatch is in `TaxonViewSet.get_taxa_observed` and keys on whether +`"collection" in request.query_params` — that is the only filter on this viewset +that currently induces a detections join. Add new join-inducing filters to the +same branch. + +The sparse verification rollup is the same on both paths — it queries Occurrence +directly with the same `occurrence_filters` Q, regardless of whether the +observation counts are subqueries or aggregates. + +## The anti-pattern (does not scale) + +```python +# Per-taxon correlated subquery — OuterRef RHS, can't use the GIN index. +descendant_match = JSONBContains( + F("determination__parents_json"), + jsonb_build_array(jsonb_build_object("id", OuterRef("id"))), +) +under_taxon = Occurrence.objects.filter(...).filter( + Q(determination_id=OuterRef("id")) | Q(_under_taxon=True) +) +qs.annotate(verified_count=Coalesce(Subquery(under_taxon.values("project_id") + .annotate(c=Count("id")).values("c")[:1]), 0)) +``` + +Measured on a ~1k-taxa / ~17k-occurrence project (statement_timeout = 30s): +`limit=25` page **9s** for one such annotation; with two always-on annotations the +default list and `verified=false` hit the 30s timeout; `ordering=verified_count` +timed out (the subquery is computed for *every* taxon before the LIMIT). + +Standalone, the same containment for **one** constant taxon id is ~70ms (it uses the +index). The cost is the correlation: per outer row the planner can't use the index for +a parameterised `@>` and re-evaluates the join. **Always benchmark the correlated form, +not the standalone query.** + +## The sparse pattern (precompute over the small set + CASE) + +Key observation: the verification counts only concern *verified* occurrences (those +with a non-withdrawn `Identification`), which are sparse — bounded by human review +effort, not total occurrences. + +```python +verified_occ = ( + Occurrence.objects.filter(occ_filters) + .filter(default_filters_q) + .filter(Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False))) +) + +# ``pk`` in value_fields + ``.distinct()`` dedupes occurrences when occ_filters +# joins to detections (see "Detection fan-out" below). +counts: dict[int, int] = {} +for row in verified_occ.values("pk", "determination_id", "determination__parents_json").distinct(): + taxon_ids = {row["determination_id"], *(_id(p) for p in row["determination__parents_json"] or [])} + for tid in taxon_ids: + counts[tid] = counts.get(tid, 0) + 1 + +case = Case(*(When(id=tid, then=Value(c)) for tid, c in counts.items()), + default=Value(0), output_field=IntegerField()) if counts else Value(0, ...) +qs = qs.annotate(verified_count=case) + +# Membership filter from the same precomputed set: +qs.filter(id__in=list(counts)) / qs.exclude(id__in=list(counts)) +``` + +Same project, after: every path (`default`, `verified=true/false`, +`ordering=verified_count`) ~1.1s cold. Cost is `O(verified occ × ancestor depth)`, +paid once per request. The `CASE` annotation is a constant per row → DB-sortable, +paginatable, and stripped from the pagination `COUNT`. + +### Why this keeps `COUNT` cheap + +Django's `QuerySet.count()` strips **select-only** annotations it doesn't need — so the +`CASE` annotations don't appear in the pagination `COUNT`. Two things defeat that strip +and were the original timeout: + +- A filter that *references* the expensive expression — e.g. `verified=false` as + `~Exists(correlated_subquery)` forces the subquery into the `COUNT` WHERE for every + taxon. Resolving the filter via `id__in=` avoids it. +- `.distinct()` combined with select annotations (wraps the whole annotated query in the + `COUNT` subquery). Watch for it on list viewsets. + +## The dense pattern (conditional aggregation over the reverse relation) + +For dense per-taxon aggregates — one value per observed taxon, e.g. `occurrences_count` +/ `best_determination_score` / `last_detected` for a project's ~hundreds-to-thousands +of taxa — the sparse `CASE`-from-map fails: one `When` branch per taxon blows past +`sqlparse`'s parser limit and the query fails with +`SQLParseError: Maximum number of tokens exceeded (10000)`. + +Use conditional aggregation over the Taxon→occurrences reverse relation: + +```python +count_filter = ( + self.get_occurrence_filters(project, accessor="occurrences") + & build_occurrence_default_filters_q( + project, request, occurrence_accessor="occurrences", + apply_default_score_filter=True, + apply_default_taxa_filter=False, # see "Two gotchas" below + ) +) +qs = qs.annotate( + occurrences_count=Count("occurrences", filter=count_filter, distinct=True), + best_determination_score=Max("occurrences__determination_score", filter=count_filter), + last_detected=Max("occurrences__detections__timestamp", filter=count_filter), +).filter(occurrences_count__gt=0) # HAVING — replaces the EXISTS membership query +``` + +One `GROUP BY`, constant-size SQL, scales in both taxa and occurrences. `Count(distinct)` +dedupes the detections fan-out under `?collection=`. The HAVING replaces a separate +membership query (the determination ids present in the filtered set are exactly the +observed taxa). + +This is also the right shape for `with_charts=...` style endpoints or any other +aggregation over a join — it's the general pattern, not collection-specific. + +### Two gotchas that turn fast conditional aggregation into a multi-minute scan + +1. **Do not include the default *taxa* include/exclude filter in `count_filter` when + the count groups by `determination = the taxon row`.** The filter is redundant + (`filter_by_project_default_taxa` already keeps/drops the row at the queryset + level), and including it adds a `parents_json` containment join inside the + aggregate that the planner cannot reconcile with the detections join from + `?collection=` — measured 0.3s → 182s on a ~1k-taxa project. Keep only the score + threshold (per-occurrence, not redundant). The verification rollup *base* (which + queries Occurrence directly and rolls up to ancestors) does still need the taxa + filter, and pays no cost because its driving set is sparse. + +2. **Audit `filter_backends` for redundant collection / event JOIN filters before + adding conditional-aggregate annotations.** A backend like + `queryset.filter(occurrences__detections__source_image__collections=)` was + harmless on top of correlated subqueries but, combined with the aggregate + `GROUP BY`, induces an INNER JOIN that multiplies taxon rows and breaks the planner. + Express the filter once — inside the aggregate (`accessor="occurrences"`) — and + remove the backend, since the HAVING already enforces membership. (`TaxonCollectionFilter` + was removed for this reason in PR #1317.) + +## Detection fan-out under `?collection=` + +When `occurrence_filters` joins to detections (because `?collection=` resolves to +`detections__source_image__collections=`), a single Occurrence yields one row per +matching Detection in `.values(...)`. Without dedup the rollup counts every detection +as if it were a separate verified occurrence. + +The fix in `with_verification_counts`: + +```python +value_fields = ["pk", "determination_id", "determination__parents_json"] +# ... +verified_occ.values(*value_fields).distinct() +``` + +`pk` is selected only so that `.distinct()` can dedupe by Occurrence. Regression test: +`test_verified_count_not_inflated_by_collection_join` in +`ami/main/tests.py::TestTaxaVerification`. + +The aggregate path uses `Count("occurrences", filter=count_filter, distinct=True)` +which serves the same dedup role for the dense per-taxon counts. + +## Gotchas + +- **`cachalot` caches query results**, including across repeated benchmark runs — a second + identical timing is a cache hit, not a real measurement. Flush between paths with + `docker compose exec -T redis redis-cli FLUSHALL`, or wrap timing in + `from cachalot.api import cachalot_disabled` → `with cachalot_disabled(): ...`. +- **`django-pydantic-field` `.values()` returns deserialised pydantic objects**, not + dicts. `parents_json` elements may be `TaxonParent` objects depending on the query + path, so read the id defensively: + + ```python + parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) + ``` + + This silently broke ancestor rollup in an earlier revision — direct/species counts + worked, ancestors were 0. +- **GIN `jsonb_path_ops` only serves `@>` with a constant RHS.** Literal + `parents_json__contains=[{"id": X}]` uses the index; an `OuterRef` RHS does not. +- **`Family/Order` rows can show `verified_count > occurrences_count`** because the + former rolls up descendants while the latter is direct-match. Document this in API + consumers if it might confuse readers. + +## Why model-agreement stats were split out (#1319) + +The verification surface (`verified_count`, `verified=true|false` filter) measures +human trust — "how much of this taxon is human-reviewed?" Model-agreement stats +(`agreed_with_prediction_count`, `agreed_exact_count`) measure ML evaluation — "how +often does the model match humans?" Different audiences (naturalists vs ML team) and +different consumers; bundling them into one PR without a FE consumer for the +agreement surface produced dead API columns, naming friction (`agreed_*` was ambiguous +between human and machine signals), and parser duplication (one in serializer, one in +view). + +PR #1317 ships the verification surface only. Follow-up issue #1319 carries the +agreement stats with these implementation guardrails preserved: + +- Reuse the sparse `CASE`-from-map pattern over the verified subset — both + `agreed_with_prediction_count` and `agreed_exact_count` are bounded by the verified + set, so they stay sparse. +- Extend `with_verification_counts` with an `include_agreement` flag and the + `_best_machine_taxon_id` (`Classification` subquery ordered by + `BEST_MACHINE_PREDICTION_ORDER`) + `_agreed_prediction_id` (`Identification` + subquery ordered by `BEST_IDENTIFICATION_ORDER`) annotations from the original + revision. +- Rename to `model_agreed_*` prefix to disambiguate from human verifications. +- Gate behind `?with_model_agreement=true` on the list endpoint to keep the default + cheap; detail view always includes. +- Pass the gate flag through serializer context (single parser), not re-parsed in the + serializer. +- Port the regression tests (rollup, chosen-identification-only, gated-on-list, + detection-dedup) with the new names. + +## Future direction — denormalised per-project observed taxa + +The precompute approach scales with verified-data volume. If per-project taxa +aggregates (observed counts, verified counts, agreement) become a recurring perf +problem across endpoints, consider a dedicated **`TaxonObserved`** model holding +denormalised data per `(project, taxon)` — distinct from the generic project-agnostic +`Taxon` profile — refreshed via the cached-count pattern +(`update_cached_counts(run_async=True)`) on `Identification` / `Occurrence` writes. +That moves the rollup off the read path entirely and makes counts directly sortable / +filterable as real columns. Open idea, not yet scoped. diff --git a/ui/src/data-services/models/species.ts b/ui/src/data-services/models/species.ts index e507eca4f..b4f766b66 100644 --- a/ui/src/data-services/models/species.ts +++ b/ui/src/data-services/models/species.ts @@ -80,6 +80,10 @@ export class Species extends Taxon { return this._species.occurrences_count ?? 0 } + get numVerified(): number { + return this._species.verified_count ?? 0 + } + get score(): number | undefined { const score = this._species.best_determination_score diff --git a/ui/src/pages/species-details/species-details.tsx b/ui/src/pages/species-details/species-details.tsx index 808f529a0..6d4adc5f1 100644 --- a/ui/src/pages/species-details/species-details.tsx +++ b/ui/src/pages/species-details/species-details.tsx @@ -158,6 +158,17 @@ export const SpeciesDetails = ({ })} /> + + + diff --git a/ui/src/pages/species/species-columns.tsx b/ui/src/pages/species/species-columns.tsx index 7f31d0b02..b7f7a33ec 100644 --- a/ui/src/pages/species/species-columns.tsx +++ b/ui/src/pages/species/species-columns.tsx @@ -95,6 +95,24 @@ export const columns: (project: { ), }, + { + id: 'verified', + sortField: 'verified_count', + name: 'Verified', + styles: { + textAlign: TextAlign.Right, + }, + renderCell: (item: Species) => ( + + + + ), + }, { id: 'best-determination-score', name: translate(STRING.FIELD_LABEL_BEST_SCORE), diff --git a/ui/src/pages/species/species.tsx b/ui/src/pages/species/species.tsx index 7bb8ce099..60b8e460f 100644 --- a/ui/src/pages/species/species.tsx +++ b/ui/src/pages/species/species.tsx @@ -41,6 +41,7 @@ export const Species = () => { rank: false, 'last-seen': true, occurrences: true, + verified: true, 'best-determination-score': true, 'created-at': false, 'updated-at': false, @@ -84,6 +85,7 @@ export const Species = () => { )} + {project?.featureFlags.tags ? ( <> diff --git a/ui/src/utils/getAppRoute.ts b/ui/src/utils/getAppRoute.ts index 4cb054028..ca4012f94 100644 --- a/ui/src/utils/getAppRoute.ts +++ b/ui/src/utils/getAppRoute.ts @@ -13,6 +13,7 @@ type FilterType = | 'taxa_list_id' | 'taxon' | 'timestamp' + | 'verified' export const getAppRoute = ({ to,