Improve representation of generated clusters (#849)

mihow · annavik · web-flow · commit a4dd1730ef0a · 2025-05-16T10:38:36.000-07:00
* feat: don't show placeholder scores from human identifications

* feat: prioritize determinations from clustering, regardless of score

* feat: method to calculate score of cluster points

* feat: track more details about cluster members

* fix: score calculation and display of classifications for clusters

* feat: use common ancestor of predicted taxon for name & parent

* feat: use any occurrence's best detection as fallback cover image

* feat: update automatic naming

* feat: fix missing parents in taxa list view

* copy: cleanup auto generated taxa list name

* feat: remove species name from cluster name, expand notes.

* fix: calculation of determination score

---------

Co-authored-by: Anna Viklund <annamariaviklund@gmail.com>
diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py
@@ -32,6 +32,7 @@
     SourceImageUpload,
     TaxaList,
     Taxon,
+    get_media_url,
     validate_filename_timestamp,
 )
 
@@ -520,6 +521,7 @@ class TaxonListSerializer(DefaultSerializer):
     occurrences = serializers.SerializerMethodField()
     parents = TaxonParentSerializer(many=True, read_only=True, source="parents_json")
     parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent")
+    cover_image_url = serializers.SerializerMethodField()
     tags = serializers.SerializerMethodField()
 
     def get_tags(self, obj):
@@ -549,6 +551,8 @@ class Meta:
     def get_occurrences(self, obj):
         """
         Return URL to the occurrences endpoint filtered by this taxon.
+
+        Does not make a database query.
         """
 
         params = {}
@@ -561,6 +565,15 @@ def get_occurrences(self, obj):
             params=params,
         )
 
+    def get_cover_image_url(self, obj):
+        if obj.cover_image_url:
+            return obj.cover_image_url
+        elif hasattr(obj, "best_detection_image_path") and obj.best_detection_image_path:
+            # This attribute is added by a QuerySet annotation
+            return get_media_url(obj.best_detection_image_path)
+        else:
+            return None
+
 
 class TaxaListSerializer(serializers.ModelSerializer):
     taxa = serializers.SerializerMethodField()
@@ -745,6 +758,7 @@ class TaxonSerializer(DefaultSerializer):
     parent = TaxonNoParentNestedSerializer(read_only=True)
     parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent", write_only=True)
     parents = TaxonParentSerializer(many=True, read_only=True, source="parents_json")
+    cover_image_url = serializers.SerializerMethodField()
     tags = serializers.SerializerMethodField()
 
     def get_tags(self, obj):
@@ -774,6 +788,15 @@ class Meta:
             "unknown_species",
         ]
 
+    def get_cover_image_url(self, obj):
+        if obj.cover_image_url:
+            return obj.cover_image_url
+        elif hasattr(obj, "best_detection_image_path") and obj.best_detection_image_path:
+            # This attribute is added by a QuerySet annotation
+            return get_media_url(obj.best_detection_image_path)
+        else:
+            return None
+
 
 class CaptureOccurrenceSerializer(DefaultSerializer):
     determination = TaxonNoParentNestedSerializer(read_only=True)
diff --git a/ami/main/api/views.py b/ami/main/api/views.py
@@ -1371,8 +1371,19 @@ def get_queryset(self) -> QuerySet:
         qs = self.attach_tags_by_project(qs, project)
 
         if project:
-            # Allow showing detail views for unobserved taxa
-            include_unobserved = True
+            include_unobserved = True  # Show detail views for unobserved taxa instead of 404
+            # @TODO move to a QuerySet manager
+            qs = qs.annotate(
+                best_detection_image_path=models.Subquery(
+                    Occurrence.objects.filter(
+                        self.get_occurrence_filters(project),
+                        determination_id=models.OuterRef("id"),
+                    )
+                    .order_by("-determination_score")
+                    .values("best_detection__path")[:1],
+                    output_field=models.TextField(),
+                )
+            )
             if self.action == "list":
                 include_unobserved = self.request.query_params.get("include_unobserved", False)
             qs = self.get_taxa_observed(qs, project, include_unobserved=include_unobserved)
diff --git a/ami/main/management/commands/import_taxa.py b/ami/main/management/commands/import_taxa.py
@@ -359,7 +359,7 @@ def create_taxon(self, taxon_data: dict, root_taxon_parent: Taxon) -> tuple[set[
                         parent = None
                     if taxon.parent != parent:
                         if not created:
-                            logger.warn(f"Changing parent of {taxon} from {taxon.parent} to more specific {parent}")
+                            logger.warning(f"Changing parent of {taxon} from {taxon.parent} to more specific {parent}")
                         taxon.parent = parent
                         taxon.save(update_calculated_fields=False)
                         if not created:
diff --git a/ami/main/models.py b/ami/main/models.py
@@ -2430,31 +2430,22 @@ def get_best_detection(self) -> Detection | None:
 
     def get_best_predictions(self, filters: dict = {}) -> models.QuerySet[Classification]:
         """
-        Retrieve the classification with the max score for each algorithm
-        from any detection belonging to this occurrence.
+        Retrieve all classifications for this occurrence in chronological order.
 
         This data is for the list of predictions in the Identification tab of the Occurrence Detail view
         in the UI. See the OccurrenceSerializer for the serializer method.
 
         If this is need for a list view (multiple occurrenes) it should be overriden
         in the viewset to use the pre-fetched classifications instead of hitting the database
         for each occurrence (n+1 query problem).
+
+        In the past, this was a more complex query that returned a single result
+        for each algorithm, but now it returns all classifications for the occurrence
         """
-        # Get the highest scoring classification for each algorithm
-        # Use a subquery to find the max score for each algorithm
-        subquery = (
-            Classification.objects.filter(detection__occurrence=self, **filters)
-            .values("algorithm")
-            .annotate(max_score=models.Max("score"))
-        )
 
-        # Join the subquery results to get the classifications with those max scores
-        # This ensures we get one classification per algorithm (the one with highest score)
         classifications = Classification.objects.filter(
             detection__occurrence=self,
             **filters,
-            algorithm__in=models.Subquery(subquery.values("algorithm")),
-            score__in=models.Subquery(subquery.values("max_score")),
         ).order_by("-created_at")
 
         return classifications
@@ -2473,6 +2464,22 @@ def get_best_prediction(self, filters: dict = {}) -> Classification | None:
         # Get all classifications for this occurrence to choose from
         all_classifications = Classification.objects.filter(detection__occurrence=self, **filters)
 
+        # Prioritize derived classifications (e.g. clustering) regardless of score
+        derived_classification_task_types = (
+            "clustering",
+            "tracking",
+        )
+        derived_classification = (
+            all_classifications.filter(
+                algorithm__task_type__in=derived_classification_task_types,
+                terminal=True,
+            )
+            .order_by("-created_at")
+            .first()
+        )
+        if derived_classification:
+            return derived_classification
+
         # First try to get a terminal classification
         terminal_classification = all_classifications.filter(terminal=True).order_by("-score", "-created_at").first()
         if terminal_classification:
@@ -2481,6 +2488,9 @@ def get_best_prediction(self, filters: dict = {}) -> Classification | None:
         # If no terminal classification exists, fall back to non-terminal
         return all_classifications.filter(terminal=False).order_by("-score").first()
 
+    def get_best_ood_prediction(self) -> Classification | None:
+        return self.get_best_prediction(filters={"ood_score__isnull": False})
+
     def get_best_identification(self) -> Identification | None:
         """
         The most recent human identification is used as the best identification.
@@ -2489,17 +2499,17 @@ def get_best_identification(self) -> Identification | None:
         """
         return Identification.objects.filter(occurrence=self, withdrawn=False).order_by("-created_at").first()
 
-    def get_determination_score(self) -> float | None:
-        if not self.determination:
+    def get_determination_score(self, prediction: Classification | None = None) -> float | None:
+        """
+        Always return a score from an algorithm, even if a human has identified the occurrence.
+        """
+        best_prediction = prediction or self.get_best_prediction()
+        if not best_prediction:
             return None
-        elif self.best_identification:
-            return self.best_identification.score
-        elif self.best_prediction:
-            return self.best_prediction.score
         else:
-            return None
+            return best_prediction.score
 
-    def get_determination_ood_score(self) -> float | None:
+    def get_determination_ood_score(self, prediction: Classification | None = None) -> float | None:
         """
         Calculate the OOD score for the whole occurrence.
         Uses the average OOD score of all detections belonging to this occurrence.
@@ -2508,16 +2518,16 @@ def get_determination_ood_score(self) -> float | None:
         """
         # Get the best prediction that has an OOD score
         # this should be the last classification before the clustering algorithm
-        # @TODO copy the OOD score from the best classification to the clustering classification during clustering
-        best_prediction = self.get_best_prediction(filters={"ood_score__isnull": False})
+        best_prediction = prediction or self.get_best_ood_prediction()
         if not best_prediction:
             return None
-        mean_ood_score = Classification.objects.filter(
-            detection__occurrence=self,
-            ood_score__isnull=False,
-            algorithm=best_prediction.algorithm,
-        ).aggregate(models.Avg("ood_score"),)["ood_score__avg"]
-        return mean_ood_score
+        else:
+            mean_ood_score = Classification.objects.filter(
+                detection__occurrence=self,
+                ood_score__isnull=False,
+                algorithm=best_prediction.algorithm,
+            ).aggregate(models.Avg("ood_score"),)["ood_score__avg"]
+            return mean_ood_score
 
     def context_url(self):
         detection = self.best_detection
@@ -2540,16 +2550,6 @@ def save(self, update_determination=True, *args, **kwargs):
                 save=True,
             )
 
-        if self.determination and not self.determination_score:
-            # This may happen for legacy occurrences that were created
-            # before the determination_score field was added
-            # @TODO remove
-            self.determination_score = self.get_determination_score()
-            if not self.determination_score:
-                logger.warning(f"Could not determine score for {self}")
-            else:
-                self.save(update_determination=False)
-
     class Meta:
         ordering = ["-determination_score"]
 
@@ -2584,23 +2584,24 @@ def update_occurrence_determination(
 
     # Collect all necessary values first
     best_identification = occurrence.get_best_identification()
-    best_prediction = occurrence.get_best_prediction() if not best_identification else None
+    best_prediction = occurrence.get_best_prediction()
+    best_ood_prediction = occurrence.get_best_ood_prediction()
 
     # Best detection is used as the representative image for the occurrence in either case
     best_detection = occurrence.get_best_detection()
 
-    # Determine values for all attributes
+    # Update the determination (Taxon) first
     new_determination = None
-    new_determination_score = None
-    new_determination_ood_score = occurrence.get_determination_ood_score()
 
     # Identifications take precedence over machine predictions
     if best_identification:
         new_determination = best_identification.taxon
-        new_determination_score = best_identification.score
     elif best_prediction:
         new_determination = best_prediction.taxon
-        new_determination_score = best_prediction.score
+
+    # Update scores, which may or may not come from the same source as the determination
+    new_determination_score = occurrence.get_determination_score(prediction=best_prediction)
+    new_determination_ood_score = occurrence.get_determination_ood_score(prediction=best_ood_prediction)
 
     # Prepare fields that need to be updated (using a dictionary for bulk update)
     update_fields = {}
@@ -2862,6 +2863,7 @@ class Config:
         # so we can sort by rank. The DRF serializer will convert it to a string.
         # just for the API responses.
         use_enum_values = False
+        frozen = True  # Allow hashing for use in a set
 
 
 @final
@@ -3099,6 +3101,60 @@ def save(self, update_calculated_fields=True, *args, **kwargs):
             self.update_calculated_fields(save=True)
 
 
+def find_common_ancestor_taxon(
+    taxa: list["Taxon"],
+    ignore_missing_parents: bool = True,
+) -> typing.Optional["Taxon"]:
+    """
+    Find the common ancestor taxon for a list of taxa.
+    Args:
+        taxa (list[Taxon]): A list of Taxon objects.
+        ignore_missing_parents (bool): If True, ignore taxa without parents. Defaults to True.
+    Returns:
+        Taxon | None: The common ancestor taxon, or None if no common ancestor exists.
+    """
+    if not taxa:
+        return None
+
+    # Filter taxa based on whether they have parents
+    valid_taxa = taxa
+    if ignore_missing_parents:
+        valid_taxa = [t for t in taxa if t.parents_json]
+        rootless_count = len(taxa) - len(valid_taxa)
+        if rootless_count:
+            logger.warning(f"Ignoring {rootless_count} rootless taxa")
+
+    if not valid_taxa:
+        logger.error("No taxa with parents found")
+        return None
+
+    # Build ancestor sets for each taxon
+    ancestor_sets = []
+    for taxon in valid_taxa:
+        ancestors = set(taxon.parents_json)
+        # Include the taxon itself
+        ancestors.add(TaxonParent(id=taxon.pk, name=taxon.name, rank=TaxonRank(taxon.rank)))
+        ancestor_sets.append(ancestors)
+
+    # Find common ancestors
+    common_ancestors = set.intersection(*ancestor_sets)
+
+    if not common_ancestors:
+        logger.info("No common ancestor found")
+        return None
+
+    # Find the most specific common ancestor (highest rank index)
+    best_ancestor = max(common_ancestors, key=lambda a: list(TaxonRank).index(a.rank))
+
+    logger.info(f"Common ancestor: {best_ancestor.name} ({best_ancestor.rank})")
+
+    # Return the actual Taxon object
+    from .models import Taxon
+
+    result = Taxon.objects.get(id=best_ancestor.id)
+    return result
+
+
 @final
 class TaxaList(BaseModel):
     """A checklist of taxa"""
diff --git a/ami/main/tests/test_occurrence_determination.py b/ami/main/tests/test_occurrence_determination.py
@@ -96,7 +96,7 @@ def test_update_with_identification(self):
 
         # Check that the determination is set to the identification's taxon
         self.assertEqual(self.occurrence.determination, self.taxon2)
-        self.assertEqual(self.occurrence.determination_score, 1.0)  # Human identifications have score 1.0
+        self.assertEqual(self.occurrence.determination_score, None)  # Human identifications have no score
         self.assertEqual(self.occurrence.best_identification, identification)
 
     def test_identification_overrides_classification(self):
diff --git a/ami/ml/clustering_algorithms/agglomerative.py b/ami/ml/clustering_algorithms/agglomerative.py
@@ -4,6 +4,7 @@
 import numpy as np
 from scipy.spatial.distance import pdist, squareform
 from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import silhouette_samples
 
 from .base_clusterer import BaseClusterer
 from .preprocessing_features import dimension_reduction, standardize
@@ -69,7 +70,7 @@ def setup(self, data_dict):
                     data_dict["val"]["feat_list"], data_dict["val"]["label_list"]
                 )
 
-    def cluster(self, features):
+    def cluster(self, features) -> tuple[np.ndarray, np.ndarray]:
         logger.info(f"distance threshold: {self.distance_threshold}")
         logger.info("features shape: %s", features.shape)
         logger.info(f"self.n_components: {self.n_components}")
@@ -84,8 +85,21 @@ def cluster(self, features):
         linkage = self.config.get("algorithm_kwargs", {}).get("linkage", "ward")
         logger.info(f" features shape after PCA: {features.shape}")
 
-        clusters = AgglomerativeClustering(
+        cluster_ids = AgglomerativeClustering(
             n_clusters=None, distance_threshold=self.distance_threshold, linkage=linkage
         ).fit_predict(features)
 
-        return clusters
+        try:
+            silhouette_scores = silhouette_samples(features, cluster_ids)
+            silhouette_scores = np.asarray(silhouette_scores)
+            # Rescale silhouette scores from the range [-1, 1] to [0, 1]
+            silhouette_scores = (silhouette_scores + 1) / 2
+        except ValueError:
+            # If silhouette scores cannot be computed, return an array of zeros
+            logger.warning(
+                f"Returned {len(cluster_ids)} clusters for {len(features)} features. "
+                "Cannot compute silhouette scores so setting them to zero."
+            )
+            silhouette_scores = np.zeros(features.shape[0], dtype=np.float32)
+
+        return cluster_ids, silhouette_scores
diff --git a/ami/ml/clustering_algorithms/cluster_detections.py b/ami/ml/clustering_algorithms/cluster_detections.py
diff --git a/ami/ml/tests.py b/ami/ml/tests.py
diff --git a/ami/utils/schemas.py b/ami/utils/schemas.py