Skip to content
Open
7 changes: 3 additions & 4 deletions ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
from ami.utils.storages import ConnectionTestResult

from ..models import (
NULL_DETECTIONS_FILTER,
Classification,
Deployment,
Detection,
Expand Down Expand Up @@ -581,7 +580,7 @@ def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet:
if has_detections is not None:
has_detections = BooleanField(required=False).clean(has_detections)
queryset = queryset.annotate(
has_detections=models.Exists(Detection.objects.filter(source_image=models.OuterRef("pk"))),
has_detections=models.Exists(Detection.objects.valid().filter(source_image=models.OuterRef("pk"))),
).filter(has_detections=has_detections)
return queryset

Expand Down Expand Up @@ -611,7 +610,7 @@ def prefetch_detections(self, queryset: QuerySet, project: Project | None = None
score = get_default_classification_threshold(project, self.request)

prefetch_queryset = (
Detection.objects.exclude(NULL_DETECTIONS_FILTER)
Detection.objects.valid()
.annotate(
determination_score=models.Max("occurrence__detections__classifications__score"),
# Store whether this occurrence should be included based on default filters
Expand Down Expand Up @@ -910,7 +909,7 @@ class DetectionViewSet(DefaultViewSet, ProjectMixin):
"""

require_project_for_list = True # Unfiltered list scans are too expensive on this table
queryset = Detection.objects.exclude(NULL_DETECTIONS_FILTER).select_related("source_image", "detection_algorithm")
queryset = Detection.objects.valid().select_related("source_image", "detection_algorithm")
serializer_class = DetectionSerializer
filterset_fields = ["source_image", "detection_algorithm", "source_image__project"]
ordering_fields = ["created_at", "updated_at", "detection_score", "timestamp"]
Expand Down
86 changes: 86 additions & 0 deletions ami/main/management/commands/cleanup_null_only_occurrences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Delete phantom Occurrences and orphan null-marker Detections left by the Issue #1310
field bug, on a per-project basis.

The bug created two categories of rows that should never have been persisted:
- Occurrence rows with no real detections (or with determination=NULL), surfaced as
ghost rows in the API.
- Detection rows that mark a SourceImage as "processed" while no real detections
exist for it — these prevent filter_processed_images from re-yielding the image
on the next ML run.

After cleanup, the source images become eligible for re-processing.

Dry-run by default. Pass --commit to delete.
"""

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.db.models import Exists, OuterRef

from ami.main.models import Detection, Occurrence, Project


class Command(BaseCommand):
    """
    Remove the rows left behind by the Issue #1310 field bug for a single project.

    Two sets of rows are targeted:
    - "phantom" Occurrences: occurrences of the project that are not returned by
      the ``.valid()`` occurrence queryset.
    - "orphan" null-marker Detections: null-marker rows of the project whose
      source image has no valid detection.

    The command only reports counts unless ``--commit`` is passed.
    """

    help = "Delete phantom Occurrences and orphan null-marker Detections (Issue #1310)."

    def add_arguments(self, parser):
        """Register ``--project`` (required project ID) and ``--commit`` (opt-in deletion)."""
        parser.add_argument(
            "--project",
            type=int,
            required=True,
            help="Project ID to clean up.",
        )
        parser.add_argument(
            "--commit",
            action="store_true",
            help="Actually delete. Defaults to dry-run.",
        )

    def handle(self, *args, **options):
        """
        Count the targeted rows, print a summary and, with ``--commit``, delete them.

        Raises:
            CommandError: if no Project with the given ID exists.
        """
        project_id: int = options["project"]
        commit: bool = options["commit"]

        try:
            project = Project.objects.get(pk=project_id)
        except Project.DoesNotExist as err:
            raise CommandError(f"Project {project_id} does not exist") from err

        # Phantom occurrences: everything in the project that .valid() rejects.
        all_occs = Occurrence.objects.filter(project=project)
        valid_occs = all_occs.valid()
        phantom_occs = all_occs.exclude(pk__in=valid_occs.values("pk"))

        # Orphan null markers: null-marker rows whose source image has no valid detection.
        has_valid_detection = Detection.objects.valid().filter(source_image_id=OuterRef("source_image_id"))
        orphan_null_markers = (
            Detection.objects.filter(source_image__project=project)
            .null_markers()
            .annotate(_has_valid=Exists(has_valid_detection))
            .filter(_has_valid=False)
        )

        phantom_count = phantom_occs.count()
        null_count = orphan_null_markers.count()

        self.stdout.write(f"Project #{project.pk} ({project.name}):")
        self.stdout.write(f"  Phantom occurrences (no valid detection or null determination): {phantom_count}")
        self.stdout.write(f"  Orphan null-marker detections on images with no real detections: {null_count}")

        if phantom_count == 0 and null_count == 0:
            self.stdout.write(self.style.SUCCESS("Nothing to clean up."))
            return

        if not commit:
            self.stdout.write(self.style.WARNING("Dry run — pass --commit to delete."))
            return

        with transaction.atomic():
            _, null_deleted_by_model = orphan_null_markers.delete()
            _, occs_deleted_by_model = phantom_occs.delete()

        # QuerySet.delete() returns (total, {model_label: rows_deleted}). The total also
        # counts cascade-deleted related rows, which would inflate the numbers, so read
        # only the entry for the model each queryset targeted. Unlike the counts printed
        # above, these are the rows actually removed, even if the data changed between
        # counting and deleting.
        deleted_null_markers = null_deleted_by_model.get(Detection._meta.label, 0)
        deleted_occurrences = occs_deleted_by_model.get(Occurrence._meta.label, 0)
        self.stdout.write(
            self.style.SUCCESS(
                f"Deleted {deleted_occurrences} phantom occurrences and "
                f"{deleted_null_markers} orphan null markers."
            )
        )
78 changes: 69 additions & 9 deletions ami/main/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,16 @@ class TaxonRank(OrderedEnum):
# Q expression matching null-marker Detection rows: bbox is NULL or an empty list.
NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[])


def null_detections_q(prefix: str = "") -> Q:
    """
    Build a Q expression that matches null-marker Detection rows.

    ``prefix`` is prepended to the ``bbox`` lookups so the expression can be used
    from a related model, e.g. ``null_detections_q("images__detections__")`` for an
    aggregate filter on a parent table. With the default empty prefix the lookups
    apply to Detection directly; for plain Detection queries prefer
    ``Detection.objects.null_markers()`` / ``.valid()``.
    """
    bbox_is_null = Q(**{prefix + "bbox__isnull": True})
    bbox_is_empty = Q(**{prefix + "bbox": []})
    return bbox_is_null | bbox_is_empty


def get_media_url(path: str) -> str:
"""
If path is a full URL, return it as-is.
Expand Down Expand Up @@ -814,7 +824,7 @@ def get_detections_count(self) -> int | None:
was processed and no detections were found) to stay consistent with
``SourceImage.get_detections_count`` and ``Event.get_detections_count``.
"""
qs = Detection.objects.filter(source_image__deployment=self).exclude(NULL_DETECTIONS_FILTER)
qs = Detection.objects.filter(source_image__deployment=self).valid()
filter_q = build_occurrence_default_filters_q(
project=self.project,
request=None,
Expand Down Expand Up @@ -1226,7 +1236,7 @@ def get_detections_count(self) -> int | None:
Excludes null-bbox placeholder detections to stay consistent with
``SourceImage.get_detections_count`` and ``Deployment.get_detections_count``.
"""
qs = Detection.objects.filter(source_image__event=self).exclude(NULL_DETECTIONS_FILTER)
qs = Detection.objects.filter(source_image__event=self).valid()
filter_q = build_occurrence_default_filters_q(
project=self.project,
request=None,
Expand Down Expand Up @@ -2034,7 +2044,7 @@ def get_detections_count(self) -> int:
Excludes detections without bounding boxes — those are placeholder records
indicating the image was successfully processed and no detections were found.
"""
qs = self.detections.exclude(NULL_DETECTIONS_FILTER)
qs = self.detections.all().valid()
project = self.project
if not project:
return qs.distinct().count()
Expand Down Expand Up @@ -2240,7 +2250,7 @@ def update_detection_counts(
if null_only:
qs = qs.filter(detections_count__isnull=True)

detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER)
detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).valid()
if project is not None:
filter_q = build_occurrence_default_filters_q(
project=project,
Expand Down Expand Up @@ -2718,7 +2728,23 @@ def save(self, *args, **kwargs):


class DetectionQuerySet(BaseQuerySet):
    """QuerySet helpers that separate real detections from null-marker sentinel rows."""

    def valid(self):
        """
        Detections suitable for consumer queries — excludes null-marker sentinels.

        Null markers are rows that record "an algorithm ran against this image and
        found nothing." Consumers asking "give me detections" should always go
        through .valid(). Future predicates to fold in here: soft-delete tombstones,
        detections missing an algorithm reference, detections missing classifications.
        """
        return self.exclude(NULL_DETECTIONS_FILTER)

    def null_markers(self):
        """
        Sentinel rows that record "this algorithm ran against this image and found
        nothing." Only relevant for SourceImage-level "has this been processed?"
        questions. Detection consumers should use .valid() instead.
        """
        return self.filter(NULL_DETECTIONS_FILTER)

    def null_detections(self):
        """
        Backward-compatible alias for ``null_markers()``.

        Kept so existing callers of the earlier method name keep working; new code
        should call ``null_markers()``.
        """
        return self.null_markers()


Expand Down Expand Up @@ -2796,6 +2822,26 @@ class Detection(BaseModel):

objects = DetectionManager()

NULL_BBOX = None
"""Canonical bbox value for null markers (rows that record 'an algorithm ran but
found nothing'). Use Detection.build_null_marker() to construct them. The legacy
bbox=[] form is still recognised by .null_markers() / .is_null_marker for
backwards compatibility with historical rows."""

@property
def is_null_marker(self) -> bool:
    """Whether this row is a null marker: its ``bbox`` is ``None`` or an empty list."""
    box = self.bbox
    return box is None or box == []

@classmethod
def build_null_marker(cls, source_image, detection_algorithm) -> "Detection":
    """
    Return a new, unsaved null-marker instance for the given image and algorithm.

    The instance's ``bbox`` is set to ``cls.NULL_BBOX``; nothing is written to the
    database here.
    """
    marker_fields = {
        "source_image": source_image,
        "bbox": cls.NULL_BBOX,
        "detection_algorithm": detection_algorithm,
    }
    return cls(**marker_fields)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

def get_bbox(self):
if self.bbox:
return BoundingBox(
Expand Down Expand Up @@ -2911,7 +2957,20 @@ def __str__(self) -> str:

class OccurrenceQuerySet(BaseQuerySet):
def valid(self):
    """
    Occurrences fit to surface in API responses: at least one real detection AND
    a determination set.

    Excludes:
    - Occurrences with no detections at all (orphans)
    - Occurrences whose only detections are null-marker sentinels (Issue #1310:
      field bug created phantom occurrences with no real bounding box backing
      them)
    - Occurrences with determination__isnull=True (no taxonomic identification,
      same field bug shape)
    """
    # Correlated EXISTS over this occurrence's detections; only rows that pass
    # Detection's .valid() filter (i.e. not null markers) count.
    has_valid_detection = Exists(Detection.objects.valid().filter(occurrence_id=OuterRef("pk")))
    return self.filter(has_valid_detection).exclude(determination__isnull=True)

def with_detections_count(self):
    """Annotate each row with ``detections_count``, the distinct count of its related detections."""
    # NOTE(review): this counts every related detection row; it is not restricted to
    # rows passing Detection's .valid() filter — confirm that is intended.
    distinct_detections = models.Count("detections", distinct=True)
    return self.annotate(detections_count=distinct_detections)
Expand Down Expand Up @@ -4105,7 +4164,7 @@ def with_source_images_with_detections_count(self):
return self.annotate(
source_images_with_detections_count=models.Count(
"images",
filter=(~models.Q(images__detections__bbox__isnull=True) & ~models.Q(images__detections__bbox=[])),
filter=~null_detections_q("images__detections__"),
distinct=True,
Comment thread
coderabbitai[bot] marked this conversation as resolved.
)
)
Expand Down Expand Up @@ -4497,10 +4556,11 @@ def sample_greatest_file_size_from_each_event(self, num_each: int = 1):
return captures

def sample_detections_only(self):
    """Sample all source images with at least one real (non-null-marker) detection."""
    qs = self.get_queryset()
    # Select by image IDs that have a valid detection, so images whose only
    # detections are null markers are not sampled.
    valid_detection_image_ids = Detection.objects.valid().values("source_image_id")
    return qs.filter(pk__in=valid_detection_image_ids).distinct()

def sample_full(
self,
Expand Down
Loading
Loading