RolnickLab · mihow · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/ami/main/api/views.py b/ami/main/api/views.py
@@ -37,7 +37,6 @@
 from ami.utils.storages import ConnectionTestResult
 
 from ..models import (
-    NULL_DETECTIONS_FILTER,
     Classification,
     Deployment,
     Detection,
@@ -581,7 +580,7 @@ def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet:
         if has_detections is not None:
             has_detections = BooleanField(required=False).clean(has_detections)
             queryset = queryset.annotate(
-                has_detections=models.Exists(Detection.objects.filter(source_image=models.OuterRef("pk"))),
+                has_detections=models.Exists(Detection.objects.valid().filter(source_image=models.OuterRef("pk"))),
             ).filter(has_detections=has_detections)
         return queryset
 
@@ -611,7 +610,7 @@ def prefetch_detections(self, queryset: QuerySet, project: Project | None = None
         score = get_default_classification_threshold(project, self.request)
 
         prefetch_queryset = (
-            Detection.objects.exclude(NULL_DETECTIONS_FILTER)
+            Detection.objects.valid()
             .annotate(
                 determination_score=models.Max("occurrence__detections__classifications__score"),
                 # Store whether this occurrence should be included based on default filters
@@ -910,7 +909,7 @@ class DetectionViewSet(DefaultViewSet, ProjectMixin):
     """
 
     require_project_for_list = True  # Unfiltered list scans are too expensive on this table
-    queryset = Detection.objects.exclude(NULL_DETECTIONS_FILTER).select_related("source_image", "detection_algorithm")
+    queryset = Detection.objects.valid().select_related("source_image", "detection_algorithm")
     serializer_class = DetectionSerializer
     filterset_fields = ["source_image", "detection_algorithm", "source_image__project"]
     ordering_fields = ["created_at", "updated_at", "detection_score", "timestamp"]

diff --git a/ami/main/management/commands/cleanup_null_only_occurrences.py b/ami/main/management/commands/cleanup_null_only_occurrences.py
@@ -0,0 +1,97 @@
+"""
+Delete phantom Occurrences and orphan null-marker Detections left by the Issue #1310
+field bug, on a per-project basis.
+
+The bug created two categories of rows that should never have been persisted:
+- Occurrence rows with no real detections (or with determination=NULL), surfaced as
+  ghost rows in the API.
+- Detection rows that mark a SourceImage as "processed" while no real detections
+  exist for it — these prevent filter_processed_images from re-yielding the image
+  on the next ML run.
+
+After cleanup, the source images become eligible for re-processing.
+
+Dry-run by default. Pass --commit to delete.
+"""
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db import transaction
+from django.db.models import Exists, OuterRef
+
+from ami.main.models import Detection, Occurrence, Project
+
+
+class Command(BaseCommand):
+    """Per-project cleanup of phantom Occurrences and orphan null-marker Detections."""
+
+    help = "Delete phantom Occurrences and orphan null-marker Detections (Issue #1310)."
+
+    def add_arguments(self, parser):
+        """Register ``--project`` (required project ID) and ``--commit`` (opt in to deleting)."""
+        parser.add_argument(
+            "--project",
+            type=int,
+            required=True,
+            help="Project ID to clean up.",
+        )
+        parser.add_argument(
+            "--commit",
+            action="store_true",
+            help="Actually delete. Defaults to dry-run.",
+        )
+
+    def handle(self, *args, **options):
+        """
+        Report the project's phantom rows and, with ``--commit``, delete them.
+
+        Raises:
+            CommandError: if no Project with the given ID exists.
+        """
+        project_id: int = options["project"]
+        commit: bool = options["commit"]
+
+        try:
+            project = Project.objects.get(pk=project_id)
+        except Project.DoesNotExist as err:
+            raise CommandError(f"Project {project_id} does not exist") from err
+
+        # Phantom = every occurrence in the project that .valid() rejects.
+        all_occs = Occurrence.objects.filter(project=project)
+        phantom_occs = all_occs.exclude(pk__in=all_occs.valid().values("pk"))
+
+        # A null marker is an orphan only when its source image has no valid detection at all.
+        has_valid_detection = Detection.objects.valid().filter(source_image_id=OuterRef("source_image_id"))
+        orphan_null_markers = (
+            Detection.objects.filter(source_image__project=project)
+            .null_markers()
+            .annotate(_has_valid=Exists(has_valid_detection))
+            .filter(_has_valid=False)
+        )
+
+        phantom_count = phantom_occs.count()
+        null_count = orphan_null_markers.count()
+
+        self.stdout.write(f"Project #{project.pk} ({project.name}):")
+        self.stdout.write(f"  Phantom occurrences (no valid detection or null determination): {phantom_count}")
+        self.stdout.write(f"  Orphan null-marker detections on images with no real detections: {null_count}")
+
+        if phantom_count == 0 and null_count == 0:
+            self.stdout.write(self.style.SUCCESS("Nothing to clean up."))
+            return
+
+        if not commit:
+            self.stdout.write(self.style.WARNING("Dry run — pass --commit to delete."))
+            return
+
+        with transaction.atomic():
+            _, nulls_by_model = orphan_null_markers.delete()
+            _, occs_by_model = phantom_occs.delete()
+
+        # Report what delete() actually removed rather than the pre-delete counts: rows can
+        # change between count() and delete(). Reading only the targeted model's entry from
+        # the per-model dict keeps cascade-deleted related rows out of the numbers.
+        nulls_deleted = nulls_by_model.get(Detection._meta.label, 0)
+        occs_deleted = occs_by_model.get(Occurrence._meta.label, 0)
+        self.stdout.write(
+            self.style.SUCCESS(f"Deleted {occs_deleted} phantom occurrences and {nulls_deleted} orphan null markers.")
+        )
diff --git a/ami/main/models.py b/ami/main/models.py
@@ -98,6 +98,16 @@ class TaxonRank(OrderedEnum):
 NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[])
 
 
+def null_detections_q(prefix: str = "") -> Q:
+    """
+    Return a Q expression matching null-marker Detection rows (bbox is NULL or []).
+    ``prefix`` is prepended verbatim to both lookups, so pass a relation path ending in
+    "__" (e.g. null_detections_q("images__detections__") for an aggregate filter on a
+    parent table). For Detection queries, prefer .null_markers() / .valid() instead.
+    """
+    return Q(**{f"{prefix}bbox__isnull": True}) | Q(**{f"{prefix}bbox": []})
+
+
 def get_media_url(path: str) -> str:
     """
     If path is a full URL, return it as-is.
@@ -814,7 +824,7 @@ def get_detections_count(self) -> int | None:
         was processed and no detections were found) to stay consistent with
         ``SourceImage.get_detections_count`` and ``Event.get_detections_count``.
         """
-        qs = Detection.objects.filter(source_image__deployment=self).exclude(NULL_DETECTIONS_FILTER)
+        qs = Detection.objects.filter(source_image__deployment=self).valid()
         filter_q = build_occurrence_default_filters_q(
             project=self.project,
             request=None,
@@ -1226,7 +1236,7 @@ def get_detections_count(self) -> int | None:
         Excludes null-bbox placeholder detections to stay consistent with
         ``SourceImage.get_detections_count`` and ``Deployment.get_detections_count``.
         """
-        qs = Detection.objects.filter(source_image__event=self).exclude(NULL_DETECTIONS_FILTER)
+        qs = Detection.objects.filter(source_image__event=self).valid()
         filter_q = build_occurrence_default_filters_q(
             project=self.project,
             request=None,
@@ -2034,7 +2044,7 @@ def get_detections_count(self) -> int:
         Excludes detections without bounding boxes — those are placeholder records
         indicating the image was successfully processed and no detections were found.
         """
-        qs = self.detections.exclude(NULL_DETECTIONS_FILTER)
+        qs = self.detections.all().valid()
         project = self.project
         if not project:
             return qs.distinct().count()
@@ -2240,7 +2250,7 @@ def update_detection_counts(
     if null_only:
         qs = qs.filter(detections_count__isnull=True)
 
-    detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
+    detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).valid()
     if project is not None:
         filter_q = build_occurrence_default_filters_q(
             project=project,
@@ -2718,7 +2728,23 @@ def save(self, *args, **kwargs):
 
 
 class DetectionQuerySet(BaseQuerySet):
-    def null_detections(self):
+    def valid(self):
+        """
+        Detections suitable for consumer queries — excludes null-marker sentinels.
+
+        Null markers are rows that record "an algorithm ran against this image and
+        found nothing." Consumers asking "give me detections" should always go
+        through .valid(). Future predicates to fold in here: soft-delete tombstones,
+        detections missing an algorithm reference, detections missing classifications.
+        """
+        return self.exclude(NULL_DETECTIONS_FILTER)
+
+    def null_markers(self):
+        """
+        Sentinel rows that record "this algorithm ran against this image and found
+        nothing." Only relevant for SourceImage-level "has this been processed?"
+        questions. Detection consumers should use .valid() instead.
+        """
         return self.filter(NULL_DETECTIONS_FILTER)
 
 
@@ -2796,6 +2822,26 @@ class Detection(BaseModel):
 
     objects = DetectionManager()
 
+    NULL_BBOX = None
+    """Canonical bbox value for null markers (rows that record 'an algorithm ran but
+    found nothing'). Use Detection.build_null_marker() to construct them. The legacy
+    bbox=[] form is still recognised by .null_markers() / .is_null_marker for
+    backwards compatibility with historical rows."""
+
+    @property
+    def is_null_marker(self) -> bool:
+        """True for sentinel rows representing 'no detections found by this algorithm.'"""
+        return self.bbox is None or self.bbox == []
+
+    @classmethod
+    def build_null_marker(cls, source_image, detection_algorithm) -> "Detection":
+        """Construct (without saving) a null-marker Detection for the given image+algorithm."""
+        return cls(
+            source_image=source_image,
+            bbox=cls.NULL_BBOX,
+            detection_algorithm=detection_algorithm,
+        )
+
     def get_bbox(self):
         if self.bbox:
             return BoundingBox(
@@ -2911,7 +2957,20 @@ def __str__(self) -> str:
 
 class OccurrenceQuerySet(BaseQuerySet):
     def valid(self):
-        return self.exclude(detections__isnull=True)
+        """
+        Occurrences fit to surface in API responses: at least one real detection AND
+        a determination set.
+
+        Excludes:
+          - Occurrences with no detections at all (orphans)
+          - Occurrences whose only detections are null-marker sentinels (Issue #1310:
+            field bug created phantom occurrences with no real bounding box backing
+            them)
+          - Occurrences with determination__isnull=True (no taxonomic identification,
+            same field bug shape)
+        """
+        has_valid_detection = models.Exists(Detection.objects.valid().filter(occurrence_id=models.OuterRef("pk")))
+        return self.filter(has_valid_detection).exclude(determination__isnull=True)
 
     def with_detections_count(self):
         return self.annotate(detections_count=models.Count("detections", distinct=True))
@@ -4105,7 +4164,7 @@ def with_source_images_with_detections_count(self):
         return self.annotate(
             source_images_with_detections_count=models.Count(
                 "images",
-                filter=(~models.Q(images__detections__bbox__isnull=True) & ~models.Q(images__detections__bbox=[])),
+                filter=~null_detections_q("images__detections__"),
                 distinct=True,
             )
         )
@@ -4497,10 +4556,11 @@ def sample_greatest_file_size_from_each_event(self, num_each: int = 1):
         return captures
 
     def sample_detections_only(self):
-        """Sample all source images with detections"""
+        """Sample all source images with at least one real (non-null-marker) detection."""
 
         qs = self.get_queryset()
-        return qs.filter(detections__isnull=False).distinct()
+        valid_detection_image_ids = Detection.objects.valid().values("source_image_id")
+        return qs.filter(pk__in=valid_detection_image_ids).distinct()
 
     def sample_full(
         self,