From 44dd942631957bbe6fce67c8aa2bf4010d77ab8f Mon Sep 17 00:00:00 2001
From: Michael Bunsen <notbot@gmail.com>
Date: Tue, 14 Apr 2026 10:35:52 -0700
Subject: [PATCH 1/2] fix(celery): bump worker concurrency default to 16

The default celery worker concurrency (os.cpu_count()) underutilises the
worker pool for process_nats_pipeline_result and create_detection_images,
which are DB/Redis-bound rather than CPU-bound. On a prefork pool sized
to CPU count, the pool is idle most of the time while the antenna queue
backlogs during high-throughput NATS async_api jobs.

Override via CELERY_WORKER_CONCURRENCY env var per deployment; 16 is the
new default.
---
 config/settings/base.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/config/settings/base.py b/config/settings/base.py
index c3a8750dc..d0b456708 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -393,6 +393,13 @@ def _celery_result_backend_url(redis_url):
 CELERY_WORKER_PREFETCH_MULTIPLIER = 1
 CELERY_WORKER_ENABLE_PREFETCH_COUNT_REDUCTION = True
 
+# Worker concurrency (prefork pool size)
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-concurrency
+# Default when unset is os.cpu_count(), which underutilises this workload because
+# process_nats_pipeline_result is DB/Redis-bound and spends most of its time on I/O.
+# Override via CELERY_WORKER_CONCURRENCY env var per deployment.
+CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=16)
+
 # Cancel & return to queue if connection is lost
 # https://docs.celeryq.dev/en/latest/userguide/configuration.html#worker-cancel-long-running-tasks-on-connection-loss
 CELERY_WORKER_CANCEL_LONG_RUNNING_TASKS_ON_CONNECTION_LOSS = True

From 07a1c2b258fefd41ebe4963423c30692a78725e3 Mon Sep 17 00:00:00 2001
From: Michael Bunsen <notbot@gmail.com>
Date: Tue, 14 Apr 2026 17:19:48 -0700
Subject: [PATCH 2/2] chore(celery): lower default concurrency to 8, document
 prod override

Drop the default from 16 to 8 so local dev, staging, and demo stacks don't
spawn 16 prefork workers by default. Production keeps 16 via the existing
CELERY_WORKER_CONCURRENCY env var (hotfix already in place).

Also add guidance + example values to .envs/.local/.django and
.envs/.production/.django-example so per-environment tuning is discoverable.
---
 .envs/.local/.django              |  5 +++++
 .envs/.production/.django-example |  7 +++++++
 config/settings/base.py           | 11 +++++++----
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/.envs/.local/.django b/.envs/.local/.django
index 8eb5610f7..36568b3e1 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -19,6 +19,11 @@ NATS_URL=nats://nats:4222
 CELERY_FLOWER_USER=QSocnxapfMvzLqJXSsXtnEZqRkBtsmKT
 CELERY_FLOWER_PASSWORD=BEQgmCtgyrFieKNoGTsux9YIye0I7P5Q7vEgfJD2C4jxmtHDetFaE2jhS7K7rxaf
 
+# Celery worker concurrency (prefork pool size).
+# Default in settings is 8. Lower to 2-4 on memory-constrained dev laptops —
+# each prefork worker is a separate Python process with its own DB connection.
+# CELERY_WORKER_CONCURRENCY=4
+
 # RabbitMQ
 CELERY_BROKER_URL=amqp://rabbituser:rabbitpass@rabbitmq:5672/
 CELERY_RESULT_BACKEND=rpc:// # Use RabbitMQ for results backend
diff --git a/.envs/.production/.django-example b/.envs/.production/.django-example
index 93737d527..c30d88dc0 100644
--- a/.envs/.production/.django-example
+++ b/.envs/.production/.django-example
@@ -20,6 +20,13 @@ REDIS_URL=redis://redis:6379/0
 # Celery
 # ------------------------------------------------------------------------------
 
+# Worker concurrency (prefork pool size). Settings default is 8.
+# Recommended for production: 16 on an 8-core host — process_nats_pipeline_result
+# and create_detection_images are DB/Redis-bound, so ~2x cpu_count roughly matches
+# observed drain vs ingress on the antenna queue without saturating pgbouncer.
+# Tune higher only after confirming postgres connection-pool headroom.
+CELERY_WORKER_CONCURRENCY=16
+
 # Flower
 CELERY_FLOWER_USER=
 CELERY_FLOWER_PASSWORD=
diff --git a/config/settings/base.py b/config/settings/base.py
index d0b456708..5877d3406 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -395,10 +395,13 @@ def _celery_result_backend_url(redis_url):
 
 # Worker concurrency (prefork pool size)
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-concurrency
-# Default when unset is os.cpu_count(), which underutilises this workload because
-# process_nats_pipeline_result is DB/Redis-bound and spends most of its time on I/O.
-# Override via CELERY_WORKER_CONCURRENCY env var per deployment.
-CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=16)
+# Celery's own default when unset is os.cpu_count(), which on the production
+# 8-core host produced an 8-process pool that could not keep up with the antenna
+# queue's DB/Redis-bound tasks (process_nats_pipeline_result, create_detection_images).
+# 8 is a conservative default that keeps local/staging/demo memory footprints
+# reasonable (each prefork worker is a separate Python process with imports +
+# DB connection). Production should override to 16 (see .envs/.production/.django-example).
+CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=8)
 
 # Cancel & return to queue if connection is lost
 # https://docs.celeryq.dev/en/latest/userguide/configuration.html#worker-cancel-long-running-tasks-on-connection-loss