Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ea9f45d
feat(taxa): per-taxon verification + agreement counts and verified fi…
mihow May 22, 2026
16b1468
perf(taxa): compute verification rollup in one pass, not per-taxon su…
mihow May 22, 2026
5d929ce
docs(taxa): clarify GIN index purpose + add rollup query-performance …
mihow May 26, 2026
10c72cb
fix(taxa): dedupe occurrences in verification rollup under collection…
mihow May 26, 2026
7dcf325
Merge remote-tracking branch 'origin/main' into worktree-taxa-verific…
mihow May 26, 2026
30955e3
chore(migrations): renumber parents_json GIN index 0085 -> 0087 after…
mihow May 26, 2026
b92b2b0
fix(taxa): make collection-filtered taxa list COUNT scale
mihow May 26, 2026
29bec78
fix(taxa): materialize observed-taxon id set instead of IN-subquery
mihow May 26, 2026
838f9d7
refactor(taxa): centralize per-taxon counts into one filtered-occurre…
mihow May 26, 2026
7f571be
fix(taxa): use conditional aggregation for dense per-taxon counts
mihow May 26, 2026
f20a05d
fix(taxa): drop redundant taxa filter from occurrences_count aggregate
mihow May 26, 2026
cf86550
fix(taxa): remove redundant TaxonCollectionFilter backend
mihow May 26, 2026
4f45681
docs(taxa): document sparse vs dense — when CASE breaks, when to use …
mihow May 26, 2026
e014a73
docs(taxa): next-session handoff — hybrid direct-aggregates + move to…
mihow May 26, 2026
04a62c6
refactor(taxa): move count logic to TaxonQuerySet, hybrid subquery/ag…
mihow May 26, 2026
20683ee
refactor(taxa): drop model-agreement counts, keep verification only
mihow May 26, 2026
02f9dc3
docs(taxa): consolidate PR #1317 findings into single reference, drop…
mihow May 27, 2026
c4132e9
Merge branch 'main' into worktree-taxa-verification-counts [skip ci]
mihow May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ami/main/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"occurrences",
"tags",
"last_detected",
Expand Down Expand Up @@ -886,6 +887,7 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"events_count",
"occurrences",
"gbif_taxon_key",
Expand Down
177 changes: 79 additions & 98 deletions ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from django.core import exceptions
from django.db import models
from django.db.models import OuterRef, Prefetch, Q, Subquery
from django.db.models.functions import Coalesce
from django.db.models.query import QuerySet
from django.forms import BooleanField, CharField, IntegerField
from django.shortcuts import get_object_or_404
Expand Down Expand Up @@ -1026,7 +1025,9 @@ class CustomOccurrenceDeterminationFilter(CustomTaxonFilter):
def filter_queryset(self, request, queryset, view):
taxon = self.get_filter_taxon(request, query_params=self.query_params)
if taxon:
# Here the queryset is the Occurrence queryset
# Here the queryset is the Occurrence queryset.
# The literal parents_json containment (constant RHS) is what the GIN index from
# migration 0087 serves — this hierarchical taxon filter is the index's main consumer.
return queryset.filter(
models.Q(determination=taxon) | models.Q(determination__parents_json__contains=[{"id": taxon.pk}])
)
Expand Down Expand Up @@ -1201,22 +1202,6 @@ def filter_queryset(self, request, queryset, view):
return queryset


class TaxonCollectionFilter(filters.BaseFilterBackend):
"""
Filter taxa by the capture set their occurrences belong to.
"""

query_param = "collection"

def filter_queryset(self, request, queryset, view):
collection_id = IntegerField(required=False).clean(request.query_params.get(self.query_param))
if collection_id:
# Here the queryset is the Taxon queryset
return queryset.filter(occurrences__detections__source_image__collections=collection_id)
else:
return queryset


class OccurrenceViewSet(DefaultViewSet, ProjectMixin):
"""
API endpoint that allows occurrences to be viewed or edited.
Expand Down Expand Up @@ -1456,9 +1441,13 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin):

queryset = Taxon.objects.all().defer("notes")
serializer_class = TaxonSerializer
# ``?collection=`` is handled inside get_taxa_observed (via get_occurrence_filters
# + TaxonQuerySet.with_observation_counts_aggregated + HAVING). A dedicated
# filter_backends entry that re-applied the collection filter on the main queryset
# would add a redundant JOIN that the planner cannot reconcile with the
# conditional-aggregate GROUP BY, turning the page into a multi-minute scan.
filter_backends = DefaultViewSetMixin.filter_backends + [
CustomTaxonFilter,
TaxonCollectionFilter,
TaxonTaxaListFilter,
TaxonBestScoreFilter,
TaxonTagFilter,
Expand All @@ -1477,6 +1466,7 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin):
"created_at",
"updated_at",
"occurrences_count",
"verified_count",
Comment thread
mihow marked this conversation as resolved.
"last_detected",
"best_determination_score",
"name",
Expand Down Expand Up @@ -1533,12 +1523,17 @@ def get_serializer_class(self):
else:
return TaxonSerializer

def get_occurrence_filters(self, project: Project) -> models.Q:
def get_occurrence_filters(self, project: Project, accessor: str = "") -> models.Q:
"""
Filter taxa by when/where it has occurred.
Filter by when/where a taxon has occurred.

Supports querying by occurrence, project, deployment, or event.

``accessor`` is the relation path to the Occurrence model. Pass "" to filter the
Occurrence model directly, or "occurrences" to filter the Taxon model via its
reverse relation (for conditional aggregation in
:meth:`TaxonQuerySet.with_observation_counts_aggregated`).

@TODO Consider using a custom filter class for this (see get_filter_name)
@TODO Move this to a custom QuerySet manager on the Taxon model
"""
Expand All @@ -1550,12 +1545,12 @@ def get_occurrence_filters(self, project: Project) -> models.Q:
event_id = self.request.query_params.get("event") or self.request.query_params.get("occurrences__event")
collection_id = self.request.query_params.get("collection")

# filter_active = any([occurrence_id, project, deployment_id, event_id, collection_id])
prefix = f"{accessor}__" if accessor else ""

filters = models.Q(
project=project,
event__isnull=False,
)
def field(path: str) -> str:
return f"{prefix}{path}"

filters = models.Q(**{field("project"): project, field("event__isnull"): False})
try:
"""
Ensure that the related objects exist before filtering by them.
Expand All @@ -1564,16 +1559,16 @@ def get_occurrence_filters(self, project: Project) -> models.Q:
if occurrence_id:
Occurrence.objects.get(id=occurrence_id)
# This query does not need the same filtering as the others
filters &= models.Q(id=occurrence_id)
filters &= models.Q(**{field("id"): occurrence_id})
if deployment_id:
Deployment.objects.get(id=deployment_id)
filters &= models.Q(deployment=deployment_id)
filters &= models.Q(**{field("deployment"): deployment_id})
if event_id:
Event.objects.get(id=event_id)
filters &= models.Q(event=event_id)
filters &= models.Q(**{field("event"): event_id})
if collection_id:
SourceImageCollection.objects.get(id=collection_id)
filters &= models.Q(detections__source_image__collections=collection_id)
filters &= models.Q(**{field("detections__source_image__collections"): collection_id})
except exceptions.ObjectDoesNotExist as e:
# Raise a 404 if any of the related objects don't exist
raise NotFound(detail=str(e))
Expand Down Expand Up @@ -1630,81 +1625,67 @@ def get_taxa_observed(
apply_default_score_filter=True,
apply_default_taxa_filter=True,
) -> QuerySet:
"""
If a project is passed, only return taxa that have been observed.
Also add the number of occurrences and the last time it was detected.

Uses efficient subqueries with default filters applied directly via Q objects
to leverage composite indexes on (determination_id, project_id, event_id, determination_score).
This avoids the N+1 query problem by building a single Q filter that can be reused
across all subqueries.
"""
occurrence_filters = self.get_occurrence_filters(project)
"""Annotate per-(project, taxon) counts and optionally restrict to observed taxa.

Two SQL shapes for the direct aggregates (``occurrences_count`` /
``best_determination_score`` / ``last_detected``):

- **Default / event / deployment / verified paths** — correlated ``Subquery``
annotations, index-served by the composite
``(determination_id, project_id, event_id, determination_score)`` index on
Occurrence. Membership via materialised ``id__in``.
- **``?collection=<id>``** — conditional aggregation over the Taxon→occurrences
reverse relation. The detections join would turn each correlated subquery into
a per-row scan, so we switch to one GROUP BY. Membership via HAVING.

The sparse verification rollup (``verified_count`` / ``agreed_*``) is the same on
either path — a Python pass over the verified subset applied as ``CASE``
annotations, see :meth:`TaxonQuerySet.with_verification_counts`.
"""
request = self.request
use_aggregation = "collection" in request.query_params
direct_filters = self.get_occurrence_filters(project)

if use_aggregation:
relation_filters = self.get_occurrence_filters(project, accessor="occurrences")
qs = qs.with_observation_counts_aggregated(
project,
request,
relation_occurrence_filters=relation_filters,
apply_default_score_filter=apply_default_score_filter,
)
if not include_unobserved:
qs = qs.filter(occurrences_count__gt=0)
else:
qs = qs.with_observation_counts_subqueries(
project,
request,
occurrence_filters=direct_filters,
apply_default_score_filter=apply_default_score_filter,
apply_default_taxa_filter=apply_default_taxa_filter,
)
if not include_unobserved:
qs = qs.observed_in_project_subqueries(
project,
request,
occurrence_filters=direct_filters,
apply_default_score_filter=apply_default_score_filter,
apply_default_taxa_filter=apply_default_taxa_filter,
)

# Build a single Q filter for default filters (score threshold + taxa filters)
# This creates an efficient filter that works with composite indexes
# Respects apply_defaults flag: build_occurrence_default_filters_q checks it internally
from ami.main.models_future.filters import build_occurrence_default_filters_q
verified_param: bool | None = None
if self.action == "list" and "verified" in request.query_params:
verified_param = BooleanField(required=False).clean(request.query_params.get("verified"))

default_filters_q = build_occurrence_default_filters_q(
return qs.with_verification_counts(
project,
self.request,
occurrence_accessor="",
request,
occurrence_filters=direct_filters,
apply_default_score_filter=apply_default_score_filter,
apply_default_taxa_filter=apply_default_taxa_filter,
verified=verified_param,
)

# Combine base occurrence filters with default filters
base_filter = models.Q(
occurrence_filters,
determination_id=models.OuterRef("id"),
)

base_filter = base_filter & default_filters_q

# Count occurrences - uses composite index (determination_id, project_id, event_id, determination_score)
occurrences_count_subquery = models.Subquery(
Occurrence.objects.filter(base_filter)
.values("determination_id")
.annotate(count=models.Count("id"))
.values("count")[:1],
output_field=models.IntegerField(),
)

# Get best score - uses same composite index
best_score_subquery = models.Subquery(
Occurrence.objects.filter(base_filter)
.values("determination_id")
.annotate(max_score=models.Max("determination_score"))
.values("max_score")[:1],
output_field=models.FloatField(),
)

# Get last detected timestamp - requires join with detections
last_detected_subquery = models.Subquery(
Occurrence.objects.filter(
base_filter,
detections__timestamp__isnull=False,
)
.values("determination_id")
.annotate(last_detected=models.Max("detections__timestamp"))
.values("last_detected")[:1],
output_field=models.DateTimeField(),
)

# Apply annotations
qs = qs.annotate(
occurrences_count=Coalesce(occurrences_count_subquery, 0),
best_determination_score=best_score_subquery,
last_detected=last_detected_subquery,
)

if not include_unobserved:
# Efficient EXISTS check that uses the composite index
qs = qs.filter(models.Exists(Occurrence.objects.filter(base_filter)))

return qs

def attach_tags_by_project(self, qs: QuerySet, project: Project) -> QuerySet:
"""
Prefetch and override the `.tags` attribute on each Taxon
Expand Down
36 changes: 36 additions & 0 deletions ami/main/migrations/0087_taxon_parents_json_gin_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from django.db import migrations


class Migration(migrations.Migration):
    """
    Add a GIN index on ``Taxon.parents_json`` for hierarchical (descendant) taxon filters.

    The index serves queries that issue a literal
    ``parents_json @> [{"id": <id>}]`` containment, i.e. where the right-hand side
    is a constant:

    - the occurrence-list ``taxon=<id>`` filter (CustomOccurrenceDeterminationFilter)
    - the project default-taxa filter (build_occurrence_default_filters_q)

    It does NOT back the #1316 per-taxon verification rollup. That rollup is computed
    in a single Python pass over the (sparse) verified-occurrence set rather than as
    a correlated subquery, because a containment whose right-hand side is an OuterRef
    cannot use the index. See ``TaxonQuerySet.with_verification_counts``.

    ``IF NOT EXISTS`` keeps this migration safe to co-exist with the same index if it
    lands separately via the #1307 follow-up.
    """

    # CREATE INDEX CONCURRENTLY can't run inside a transaction, so the migration
    # must be non-atomic.
    atomic = False

    dependencies = [
        ("main", "0086_sourceimage_recent_capture_index"),
    ]

    operations = [
        migrations.RunSQL(
            # The adjacent literals concatenate to a single statement.
            sql=(
                "CREATE INDEX CONCURRENTLY IF NOT EXISTS "
                "main_taxon_parents_json_gin_idx "
                "ON main_taxon USING gin (parents_json jsonb_path_ops);"
            ),
            # Reversal drops the same index, tolerating its absence.
            reverse_sql="DROP INDEX CONCURRENTLY IF EXISTS main_taxon_parents_json_gin_idx;",
        ),
    ]
Loading