RolnickLab
diff --git a/‎ami/jobs/management/commands/test_regroup_job_e2e.py‎
Lines changed: 209 additions & 0 deletions b/‎ami/jobs/management/commands/test_regroup_job_e2e.py‎
Lines changed: 209 additions & 0 deletions
diff --git a/‎ami/jobs/models.py‎
Lines changed: 120 additions & 18 deletions b/‎ami/jobs/models.py‎
Lines changed: 120 additions & 18 deletions
@@ -0,0 +1,209 @@
+"""End-to-end test harness for ``RegroupEventsJob`` and the regroup stage of
+``DataStorageSyncJob``.
+
+Mirrors ``test_ml_job_e2e`` for the session-regrouping path (PR #1292). Used to
+validate:
+
+* Mode ``regroup``: ``RegroupEventsJob`` runs to SUCCESS against a real
+  deployment, stage params are populated, Event count delta is reasonable.
+* Mode ``sync``: ``DataStorageSyncJob`` exposes a two-stage progress
+  (sync_captures + regroup_sessions), both reach SUCCESS, regroup stage params
+  are populated.
+* Mode ``concurrent``: Two ``RegroupEventsJob`` enqueues for the same
+  deployment within the lock TTL — exactly one stage produces non-zero stats,
+  the other short-circuits with a lock warning, and Event count does not
+  diverge from the single-run baseline.
+"""
+
+import time
+
+from django.core.management.base import BaseCommand, CommandError
+
+from ami.jobs.models import DataStorageSyncJob, Job, JobState, RegroupEventsJob
+from ami.main.models import Deployment, Event
+
+
+class Command(BaseCommand):
+    help = (
+        "Run end-to-end tests for the regroup-events Job path.\n\n"
+        "Modes:\n"
+        "  regroup     — RegroupEventsJob on a deployment\n"
+        "  sync        — DataStorageSyncJob (covers sync→regroup chain)\n"
+        "  concurrent  — two RegroupEventsJobs back-to-back, asserts lock semantics\n"
+    )
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "mode",
+            choices=["regroup", "sync", "concurrent"],
+            help="Which scenario to exercise",
+        )
+        parser.add_argument("--deployment", type=int, required=True, help="Deployment ID")
+        parser.add_argument(
+            "--poll-interval", type=float, default=2.0, help="Seconds between Job state polls (default 2.0)"
+        )
+        parser.add_argument(
+            "--timeout", type=float, default=600.0, help="Max seconds to wait for each Job (default 600)"
+        )
+
+    def handle(self, *args, **options):
+        deployment = self._resolve_deployment(options["deployment"])
+        mode = options["mode"]
+
+        if mode == "regroup":
+            self._run_regroup(deployment, options)
+        elif mode == "sync":
+            self._run_sync(deployment, options)
+        elif mode == "concurrent":
+            self._run_concurrent(deployment, options)
+
+    def _resolve_deployment(self, deployment_id: int) -> Deployment:
+        try:
+            deployment = Deployment.objects.get(pk=deployment_id)
+        except Deployment.DoesNotExist:
+            raise CommandError(f"Deployment {deployment_id} not found")
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"✓ Deployment {deployment.pk} '{deployment.name}' "
+                f"(project={deployment.project_id}, captures={deployment.captures_count})"
+            )
+        )
+        gap = deployment.project.session_time_gap_seconds if deployment.project_id else None
+        self.stdout.write(f"  project session_time_gap_seconds = {gap!r}")
+        before = Event.objects.filter(deployment=deployment).count()
+        self.stdout.write(f"  Events before run: {before}")
+        return deployment
+
+    def _run_regroup(self, deployment: Deployment, options: dict) -> None:
+        before = Event.objects.filter(deployment=deployment).count()
+        job = self._make_regroup_job(deployment, suffix="e2e-regroup")
+        self.stdout.write(f"\n🚀 RegroupEventsJob {job.pk} enqueueing")
+        job.enqueue()
+        self._monitor(job, options)
+        after = Event.objects.filter(deployment=deployment).count()
+        self.stdout.write(f"\nEvents after: {after} (Δ {after - before:+d})")
+        self._assert_status(job, expected=JobState.SUCCESS)
+        self._dump_stage_params(job)
+
+    def _run_sync(self, deployment: Deployment, options: dict) -> None:
+        if not deployment.data_source_id:
+            raise CommandError(
+                f"Deployment {deployment.pk} has no data_source — DataStorageSyncJob would fail immediately."
+            )
+        before = Event.objects.filter(deployment=deployment).count()
+        job = Job.objects.create(
+            name=f"E2E sync→regroup chain (deployment {deployment.pk})",
+            project=deployment.project,
+            deployment=deployment,
+            job_type_key=DataStorageSyncJob.key,
+        )
+        self.stdout.write(f"\n🚀 DataStorageSyncJob {job.pk} enqueueing")
+        job.enqueue()
+        self._monitor(job, options)
+        after = Event.objects.filter(deployment=deployment).count()
+        self.stdout.write(f"\nEvents after: {after} (Δ {after - before:+d})")
+        self._assert_status(job, expected=JobState.SUCCESS)
+
+        stage_keys = [s.key for s in (job.progress.stages or [])]
+        if DataStorageSyncJob.regroup_stage_key not in stage_keys:
+            raise CommandError(
+                f"❌ DataStorageSyncJob exposed stages {stage_keys!r} — missing "
+                f"'{DataStorageSyncJob.regroup_stage_key}' regroup stage."
+            )
+        self.stdout.write(self.style.SUCCESS(f"✓ Sync Job exposed both stages: {stage_keys!r}"))
+        self._dump_stage_params(job)
+
+    def _run_concurrent(self, deployment: Deployment, options: dict) -> None:
+        before = Event.objects.filter(deployment=deployment).count()
+        job_a = self._make_regroup_job(deployment, suffix="e2e-concurrent-A")
+        job_b = self._make_regroup_job(deployment, suffix="e2e-concurrent-B")
+        self.stdout.write(f"\n🚀 Enqueueing two RegroupEventsJobs back-to-back: {job_a.pk}, {job_b.pk}")
+        job_a.enqueue()
+        # No sleep between — we want both Celery tasks to race for the lock.
+        job_b.enqueue()
+
+        self.stdout.write("\nMonitoring job A:")
+        self._monitor(job_a, options)
+        self.stdout.write("\nMonitoring job B:")
+        self._monitor(job_b, options)
+
+        after = Event.objects.filter(deployment=deployment).count()
+        self.stdout.write(f"\nEvents after both jobs: {after} (Δ {after - before:+d})")
+
+        for job in (job_a, job_b):
+            self._assert_status(job, expected=JobState.SUCCESS)
+
+        params_a = self._stage_param_dict(job_a, RegroupEventsJob.key)
+        params_b = self._stage_param_dict(job_b, RegroupEventsJob.key)
+        self.stdout.write(f"\nJob A stage params: {params_a}")
+        self.stdout.write(f"Job B stage params: {params_b}")
+
+        # Exactly one of A/B should have done real work (captures_grouped > 0);
+        # the other should have short-circuited and reported the initial zeroes.
+        worked_a = (params_a.get("captures_grouped") or 0) > 0
+        worked_b = (params_b.get("captures_grouped") or 0) > 0
+        if worked_a == worked_b:
+            self.stdout.write(
+                self.style.WARNING(
+                    f"⚠ Lock did not separate runs as expected — both jobs reported "
+                    f"captures_grouped={params_a.get('captures_grouped')}/"
+                    f"{params_b.get('captures_grouped')}. "
+                    f"This can happen if the worker ran them serially fast enough that the lock cleared between."
+                )
+            )
+        else:
+            winner = "A" if worked_a else "B"
+            loser = "B" if worked_a else "A"
+            self.stdout.write(
+                self.style.SUCCESS(f"✓ Lock semantics held: job {winner} did the work, job {loser} short-circuited.")
+            )
+
+    def _make_regroup_job(self, deployment: Deployment, suffix: str) -> Job:
+        return Job.objects.create(
+            name=f"E2E {suffix} (deployment {deployment.pk})",
+            project=deployment.project,
+            deployment=deployment,
+            job_type_key=RegroupEventsJob.key,
+        )
+
+    def _monitor(self, job: Job, options: dict) -> None:
+        start = time.time()
+        timeout = options["timeout"]
+        interval = options["poll_interval"]
+        last_status = None
+        while True:
+            job.refresh_from_db()
+            elapsed = time.time() - start
+            if job.status != last_status:
+                self.stdout.write(f"  [{elapsed:6.1f}s] Job {job.pk} status: {job.status}")
+                last_status = job.status
+            if job.status in JobState.final_states():
+                self.stdout.write(f"  [{elapsed:6.1f}s] Job {job.pk} reached final state {job.status}")
+                return
+            if elapsed > timeout:
+                raise CommandError(
+                    f"❌ Job {job.pk} did not reach a final state within {timeout}s (status={job.status})"
+                )
+            time.sleep(interval)
+
+    def _assert_status(self, job: Job, expected: str) -> None:
+        if job.status != expected:
+            raise CommandError(
+                f"❌ Job {job.pk} ended with status {job.status!r} (expected {expected!r}). "
+                f"Stages: {[(s.key, s.status, s.progress) for s in (job.progress.stages or [])]}"
+            )
+        self.stdout.write(self.style.SUCCESS(f"✓ Job {job.pk} ended {expected}"))
+
+    def _stage_param_dict(self, job: Job, stage_key: str) -> dict:
+        for stage in job.progress.stages or []:
+            if stage.key == stage_key:
+                return {param.key: param.value for param in (stage.params or [])}
+        return {}
+
+    def _dump_stage_params(self, job: Job) -> None:
+        for stage in job.progress.stages or []:
+            self.stdout.write(
+                f"\n  Stage '{stage.name}' ({stage.key}): status={stage.status} progress={stage.progress}"
+            )
+            for param in stage.params or []:
+                self.stdout.write(f"    {param.name} [{param.key}]: {param.value}")
@@ -672,9 +672,37 @@ def process_images(cls, job, images):
         job.save()
 
 
+# Human-readable param names for the regroup stage, surfaced both on the
+# standalone RegroupEventsJob and on DataStorageSyncJob's regroup stage. The
+# stable retrieval key is the slugify(name) form (e.g. "captures-grouped"),
+# produced by JobProgress.make_key — see _REGROUP_STAGE_PARAM_KEYS in
+# ami.main.models for the corresponding kwarg map.
+REGROUP_STAGE_PARAM_NAMES = (
+    "Captures grouped",
+    "Events created",
+    "Events touched",
+    "Empty events deleted",
+    "Duplicate timestamps",
+    "Ungrouped captures",
+    "Captures missing timestamp",
+)
+
+
 class DataStorageSyncJob(JobType):
+    """
+    Sync captures from the deployment's data source, then regroup them into
+    sessions as a separate tracked stage.
+
+    The regroup stage runs inside this job (not via ``Deployment.save()``
+    autoregroup) so its logs land on the same Job row and a regroup failure
+    flips the Job to FAILURE. Previously a sync would silently succeed even
+    if the post-sync regroup raised — see #1157.
+    """
+
     name = "Data storage sync"
     key = "data_storage_sync"
+    regroup_stage_key = "regroup_sessions"
+    regroup_stage_name = "Regroup sessions"
 
     @classmethod
     def run(cls, job: "Job"):
@@ -683,38 +711,63 @@ def run(cls, job: "Job"):
 
         This is meant to be called by an async task, not directly.
         """
+        from ami.main.models import group_images_into_events
 
-        job.progress.add_stage(cls.name)
+        job.progress.add_stage(cls.name, key=cls.key)
         job.progress.add_stage_param(cls.key, "Total files", 0)
         job.progress.add_stage_param(cls.key, "Failed", 0)
+
+        job.progress.add_stage(cls.regroup_stage_name, key=cls.regroup_stage_key)
+        for param_name in REGROUP_STAGE_PARAM_NAMES:
+            job.progress.add_stage_param(cls.regroup_stage_key, param_name, 0)
+
         job.update_status(JobState.STARTED)
         job.started_at = datetime.datetime.now()
         job.finished_at = None
         job.save()
 
         if not job.deployment:
             raise ValueError("No deployment provided for data storage sync job")
-        else:
-            job.logger.info(f"Syncing captures for deployment {job.deployment}")
-            job.progress.update_stage(
-                cls.key,
-                status=JobState.STARTED,
-                progress=0,
-                total_files=0,
-            )
-            job.save()
 
-            job.deployment.sync_captures(job=job)
+        job.logger.info(f"Syncing captures for deployment {job.deployment}")
+        job.progress.update_stage(
+            cls.key,
+            status=JobState.STARTED,
+            progress=0,
+            total_files=0,
+        )
+        job.save()
+
+        job.deployment.sync_captures(job=job, regroup_after=False)
 
-            job.logger.info(f"Finished syncing captures for deployment {job.deployment}")
-            job.progress.update_stage(
-                cls.key,
-                status=JobState.SUCCESS,
-                progress=1,
+        job.logger.info(f"Finished syncing captures for deployment {job.deployment}")
+        job.progress.update_stage(cls.key, status=JobState.SUCCESS, progress=1)
+        job.save()
+
+        job.logger.info(f"Regrouping captures into sessions for deployment {job.deployment}")
+        job.progress.update_stage(cls.regroup_stage_key, status=JobState.STARTED, progress=0)
+        job.save()
+
+        events = group_images_into_events(job.deployment, job=job, stage_key=cls.regroup_stage_key)
+        job.logger.info(f"Deployment {job.deployment} now has {len(events)} events after sync regroup.")
+
+        # The lock-miss branch in group_images_into_events returns []. If we
+        # just synced new captures, that means those captures are now sitting
+        # ungrouped because a concurrent regroup held the lock. They will be
+        # picked up by the next Deployment.save autoregroup (e.g. the next
+        # sync) — but flag it loudly on this Job so an admin watching this run
+        # sees what happened rather than a silently-empty regroup stage.
+        sync_total_files = job.progress.get_stage_param(cls.key, "total_files").value or 0
+        if not events and sync_total_files:
+            job.logger.warning(
+                f"Sync added {sync_total_files} files but the regroup stage was skipped because "
+                f"another regroup is in progress for deployment {job.deployment.pk}. The new captures "
+                f"will be grouped by the next sync or save. If this keeps happening, check the Jobs "
+                f"list for a stuck regroup_events task."
             )
-            job.update_status(JobState.SUCCESS)
-            job.save()
 
+        job.progress.update_stage(cls.regroup_stage_key, status=JobState.SUCCESS, progress=1)
+        job.update_status(JobState.SUCCESS)
         job.finished_at = datetime.datetime.now()
         job.save()
 
@@ -834,6 +887,54 @@ def run(cls, job: "Job"):
         job.save()
 
 
+class RegroupEventsJob(JobType):
+    """
+    Regroup a deployment's captures into Events using the project's
+    ``session_time_gap_seconds`` setting.
+
+    Single-stage job: ``group_images_into_events`` is one mostly-atomic SQL
+    pass with no per-image Python loop, so we cannot report incremental %
+    progress meaningfully. Stage transitions are CREATED → STARTED (0%) →
+    SUCCESS/FAILURE (100%). Summary stats (events created/touched/deleted,
+    duplicates, ungrouped captures) are written to the stage params by
+    ``group_images_into_events`` itself before it returns. Closes #1157, #1158.
+
+    Scope: grouping only. Propagating ``project_id`` to children lives on
+    ``Deployment.save()`` via ``update_children()`` — save the deployment
+    after moving it to push the new ``project_id`` down. Bare
+    ``ami.tasks.regroup_events`` has the same scope.
+    """
+
+    name = "Regroup sessions"
+    key = "regroup_events"
+
+    @classmethod
+    def run(cls, job: "Job"):
+        from ami.main.models import group_images_into_events
+
+        if not job.deployment:
+            raise ValueError("No deployment provided for regroup events job")
+
+        job.progress.add_stage(cls.name, key=cls.key)
+        for param_name in REGROUP_STAGE_PARAM_NAMES:
+            job.progress.add_stage_param(cls.key, param_name, 0)
+
+        job.update_status(JobState.STARTED)
+        job.started_at = datetime.datetime.now()
+        job.finished_at = None
+        job.progress.update_stage(cls.key, status=JobState.STARTED, progress=0)
+        job.save()
+
+        job.logger.info(f"Regrouping captures for deployment {job.deployment}")
+        events = group_images_into_events(job.deployment, job=job, stage_key=cls.key)
+        job.logger.info(f"Deployment {job.deployment} now has {len(events)} events after regrouping.")
+
+        job.progress.update_stage(cls.key, status=JobState.SUCCESS, progress=1)
+        job.update_status(JobState.SUCCESS, save=False)
+        job.finished_at = datetime.datetime.now()
+        job.save()
+
+
 class UnknownJobType(JobType):
     name = "Unknown"
     key = "unknown"
@@ -847,6 +948,7 @@ def run(cls, job: "Job"):
     MLJob,
     SourceImageCollectionPopulateJob,
     DataStorageSyncJob,
+    RegroupEventsJob,
     UnknownJobType,
     DataExportJob,
     PostProcessingJob,