Skip to content

Commit 16b1468

Browse files
mihow and claude committed
perf(taxa): compute verification rollup in one pass, not per-taxon subquery
The per-taxon correlated parents_json subquery used for verified_count, agreed_with_prediction_count, and agreed_exact_count did not scale. On a large project (~1k taxa, ~17k occurrences) the taxa list timed out at the 30s statement limit, even with the column hidden and on the default sort: every page row (and the verified=false COUNT) ran a JSONB containment scan that the GIN index cannot serve when the right-hand side of @> is an OuterRef.

All three counts only concern verified occurrences (those with a non-withdrawn Identification), which are sparse. The hierarchical rollup is now computed in Python in a single pass over that small set and applied as constant-time CASE annotations; the verified filter is resolved from the same precomputed set via id__in. Page values, sorting, and the pagination COUNT are now constant-time.

This also fixes the ancestor rollup returning 0: parents_json round-trips through the pydantic schema field, so its elements may be TaxonParent objects rather than dicts.

Measured on the large project:
- default page: 30s timeout -> ~0.6s
- verified=false: 30s timeout -> ~0.2s
- ordering=verified_count: 30s timeout -> ~0.04s

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ea9f45d commit 16b1468

1 file changed

Lines changed: 75 additions & 87 deletions

File tree

ami/main/api/views.py

Lines changed: 75 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -1402,19 +1402,6 @@ def filter_queryset(self, request, queryset, view):
14021402
return queryset.distinct()
14031403

14041404

1405-
class JSONBContains(models.Func):
1406-
"""Postgres ``@>`` containment rendered as a boolean expression.
1407-
1408-
Needed for correlated subqueries where the right-hand side is built from an
1409-
``OuterRef`` — the literal ``parents_json__contains=[{"id": ...}]`` lookup
1410-
can't embed an ``OuterRef`` (it would try to JSON-serialize the expression).
1411-
"""
1412-
1413-
arg_joiner = " @> "
1414-
template = "(%(expressions)s)"
1415-
output_field = models.BooleanField()
1416-
1417-
14181405
class TaxonViewSet(DefaultViewSet, ProjectMixin):
14191406
"""
14201407
API endpoint that allows taxa to be viewed or edited.
@@ -1674,36 +1661,6 @@ def get_taxa_observed(
16741661

16751662
return qs
16761663

1677-
def _occurrences_under_taxon(self, occurrence_filters: models.Q, default_filters_q: models.Q) -> QuerySet:
1678-
"""
1679-
Correlated Occurrence queryset matching occurrences whose determination is the
1680-
outer Taxon (``OuterRef("id")``) or any of its descendants, project-scoped and
1681-
default-filtered.
1682-
1683-
Mirrors the hierarchical match used by the occurrence-list ``taxon=<id>`` filter
1684-
(``CustomOccurrenceDeterminationFilter``), but the descendant test is built with
1685-
an ``OuterRef`` right-hand side so a Family/Order row aggregates all its
1686-
descendant species' occurrences.
1687-
"""
1688-
descendant_match = JSONBContains(
1689-
models.F("determination__parents_json"),
1690-
models.Func(
1691-
models.Func(
1692-
models.Value("id"),
1693-
models.OuterRef("id"),
1694-
function="jsonb_build_object",
1695-
),
1696-
function="jsonb_build_array",
1697-
output_field=models.JSONField(),
1698-
),
1699-
)
1700-
return (
1701-
Occurrence.objects.filter(occurrence_filters)
1702-
.filter(default_filters_q)
1703-
.alias(_under_taxon=descendant_match)
1704-
.filter(models.Q(determination_id=models.OuterRef("id")) | models.Q(_under_taxon=True))
1705-
)
1706-
17071664
def _include_agreement(self) -> bool:
17081665
"""Whether the heavier ``agreed_exact_count`` annotation should be computed."""
17091666
if self.action == "retrieve":
@@ -1717,65 +1674,96 @@ def add_verification_data(
17171674
Annotate per-taxon verification and human/model agreement counts, and apply the
17181675
``verified=true|false`` filter on list responses.
17191676
1720-
All counts roll up descendant occurrences via ``_occurrences_under_taxon`` and
1721-
respect the project's default filters (same ``apply_defaults`` handling as
1722-
``occurrences_count``).
1723-
"""
1724-
under_taxon = self._occurrences_under_taxon(occurrence_filters, default_filters_q)
1677+
Counts roll up descendant occurrences (verifying a species also counts toward its
1678+
genus/family rows) and respect the project's default filters (same
1679+
``apply_defaults`` handling as ``occurrences_count``).
17251680
1726-
has_identification = models.Exists(
1727-
Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)
1728-
)
1729-
verified_occurrences = under_taxon.filter(has_identification)
1730-
1731-
def correlated_count(occurrence_qs: QuerySet) -> Coalesce:
1732-
# Group by project_id (constant within the subquery) to collapse the
1733-
# hierarchical match — determination_id varies across descendants so it
1734-
# can't be the grouping key.
1735-
return Coalesce(
1736-
models.Subquery(
1737-
occurrence_qs.values("project_id").annotate(c=models.Count("id")).values("c")[:1],
1738-
output_field=models.IntegerField(),
1739-
),
1740-
0,
1741-
)
1681+
All three counts only concern *verified* occurrences (those with a non-withdrawn
1682+
Identification), which are sparse relative to all occurrences. So the hierarchical
1683+
rollup is computed in a single pass over that small set in Python and applied as
1684+
constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per
1685+
taxon does not scale: on large projects it forces a per-row scan that the GIN
1686+
index can't serve (the containment RHS is an ``OuterRef``), timing out the list.
1687+
"""
1688+
include_agreement = self._include_agreement()
17421689

1743-
# The chosen (best, non-withdrawn) identification's agreed_with_prediction FK is set.
1690+
# The chosen (best, non-withdrawn) identification's agreed_with_prediction FK.
17441691
best_identification_agreed_prediction = models.Subquery(
17451692
Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)
17461693
.order_by(*BEST_IDENTIFICATION_ORDER)
17471694
.values("agreed_with_prediction_id")[:1]
17481695
)
1749-
agreed_with_prediction_occurrences = under_taxon.annotate(
1750-
_best_agreed_prediction=best_identification_agreed_prediction
1751-
).filter(_best_agreed_prediction__isnull=False)
1752-
1753-
qs = qs.annotate(
1754-
verified_count=correlated_count(verified_occurrences),
1755-
agreed_with_prediction_count=correlated_count(agreed_with_prediction_occurrences),
1696+
verified_occurrences = (
1697+
Occurrence.objects.filter(occurrence_filters)
1698+
.filter(default_filters_q)
1699+
.filter(models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)))
1700+
.annotate(_agreed_prediction_id=best_identification_agreed_prediction)
17561701
)
1757-
1758-
if self._include_agreement():
1759-
# Verified occurrence where the user determination equals the top machine
1760-
# prediction's taxon for the same occurrence.
1761-
best_machine_taxon = models.Subquery(
1762-
Classification.objects.filter(detection__occurrence=models.OuterRef("pk"))
1763-
.order_by(*BEST_MACHINE_PREDICTION_ORDER)
1764-
.values("taxon_id")[:1]
1702+
value_fields = ["determination_id", "determination__parents_json", "_agreed_prediction_id"]
1703+
if include_agreement:
1704+
# Top machine prediction's taxon for the same occurrence.
1705+
verified_occurrences = verified_occurrences.annotate(
1706+
_best_machine_taxon_id=models.Subquery(
1707+
Classification.objects.filter(detection__occurrence=models.OuterRef("pk"))
1708+
.order_by(*BEST_MACHINE_PREDICTION_ORDER)
1709+
.values("taxon_id")[:1]
1710+
)
17651711
)
1766-
agreed_exact_occurrences = verified_occurrences.annotate(_best_machine_taxon=best_machine_taxon).filter(
1767-
determination_id=models.F("_best_machine_taxon")
1712+
value_fields.append("_best_machine_taxon_id")
1713+
1714+
verified_counts: dict[int, int] = {}
1715+
agreed_with_prediction_counts: dict[int, int] = {}
1716+
agreed_exact_counts: dict[int, int] = {}
1717+
for row in verified_occurrences.values(*value_fields):
1718+
determination_id = row["determination_id"]
1719+
# The taxon itself plus every ancestor — i.e. every row this occurrence rolls up to.
1720+
taxon_ids: set[int] = set()
1721+
if determination_id is not None:
1722+
taxon_ids.add(determination_id)
1723+
for parent in row["determination__parents_json"] or []:
1724+
# parents_json round-trips through the pydantic schema field, so elements
1725+
# may be dicts or ``TaxonParent`` objects depending on the query path.
1726+
parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None)
1727+
if parent_id is not None:
1728+
taxon_ids.add(int(parent_id))
1729+
1730+
for taxon_id in taxon_ids:
1731+
verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1
1732+
if row["_agreed_prediction_id"] is not None:
1733+
for taxon_id in taxon_ids:
1734+
agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1
1735+
if (
1736+
include_agreement
1737+
and determination_id is not None
1738+
and determination_id == row["_best_machine_taxon_id"]
1739+
):
1740+
for taxon_id in taxon_ids:
1741+
agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1
1742+
1743+
def count_annotation(counts: dict[int, int]) -> models.expressions.Combinable:
1744+
if not counts:
1745+
return models.Value(0, output_field=models.IntegerField())
1746+
return models.Case(
1747+
*(models.When(id=taxon_id, then=models.Value(count)) for taxon_id, count in counts.items()),
1748+
default=models.Value(0),
1749+
output_field=models.IntegerField(),
17681750
)
1769-
qs = qs.annotate(agreed_exact_count=correlated_count(agreed_exact_occurrences))
17701751

1771-
# verified=true|false filter (list only); the complement uses the same set, so
1772-
# verified=false is the strict complement of verified=true on the filtered taxa.
1752+
qs = qs.annotate(
1753+
verified_count=count_annotation(verified_counts),
1754+
agreed_with_prediction_count=count_annotation(agreed_with_prediction_counts),
1755+
)
1756+
if include_agreement:
1757+
qs = qs.annotate(agreed_exact_count=count_annotation(agreed_exact_counts))
1758+
1759+
# verified=true|false filter (list only); verified=false is the strict complement.
17731760
if self.action == "list" and "verified" in self.request.query_params:
17741761
verified = BooleanField(required=False).clean(self.request.query_params.get("verified"))
1762+
verified_taxon_ids = list(verified_counts.keys())
17751763
if verified:
1776-
qs = qs.filter(models.Exists(verified_occurrences))
1764+
qs = qs.filter(id__in=verified_taxon_ids)
17771765
else:
1778-
qs = qs.filter(~models.Exists(verified_occurrences))
1766+
qs = qs.exclude(id__in=verified_taxon_ids)
17791767

17801768
return qs
17811769

0 commit comments

Comments
 (0)