From 44dd942631957bbe6fce67c8aa2bf4010d77ab8f Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 14 Apr 2026 10:35:52 -0700 Subject: [PATCH 1/2] fix(celery): bump worker concurrency default to 16 The default celery worker concurrency (os.cpu_count()) underutilises the worker pool for process_nats_pipeline_result and create_detection_images, which are DB/Redis-bound rather than CPU-bound. On a prefork pool sized to CPU count, the pool is idle most of the time while the antenna queue backlogs during high-throughput NATS async_api jobs. Override via CELERY_WORKER_CONCURRENCY env var per deployment; 16 is the new default. --- config/settings/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config/settings/base.py b/config/settings/base.py index c3a8750dc..d0b456708 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -393,6 +393,13 @@ def _celery_result_backend_url(redis_url): CELERY_WORKER_PREFETCH_MULTIPLIER = 1 CELERY_WORKER_ENABLE_PREFETCH_COUNT_REDUCTION = True +# Worker concurrency (prefork pool size) +# https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-concurrency +# Default when unset is os.cpu_count(), which underutilises this workload because +# process_nats_pipeline_result is DB/Redis-bound and spends most of its time on I/O. +# Override via CELERY_WORKER_CONCURRENCY env var per deployment. +CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=16) + # Cancel & return to queue if connection is lost # https://docs.celeryq.dev/en/latest/userguide/configuration.html#worker-cancel-long-running-tasks-on-connection-loss CELERY_WORKER_CANCEL_LONG_RUNNING_TASKS_ON_CONNECTION_LOSS = True From 07a1c2b258fefd41ebe4963423c30692a78725e3 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 14 Apr 2026 17:19:48 -0700 Subject: [PATCH 2/2] chore(celery): lower default concurrency to 8, document prod override Drop the default from 16 to 8 so local dev, staging, and demo stacks don't spawn 16 prefork workers by default. Production keeps 16 via the existing CELERY_WORKER_CONCURRENCY env var (hotfix already in place). Also add guidance + example values to .envs/.local/.django and .envs/.production/.django-example so per-environment tuning is discoverable. --- .envs/.local/.django | 5 +++++ .envs/.production/.django-example | 7 +++++++ config/settings/base.py | 11 +++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index 8eb5610f7..36568b3e1 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -19,6 +19,11 @@ NATS_URL=nats://nats:4222 CELERY_FLOWER_USER=QSocnxapfMvzLqJXSsXtnEZqRkBtsmKT CELERY_FLOWER_PASSWORD=BEQgmCtgyrFieKNoGTsux9YIye0I7P5Q7vEgfJD2C4jxmtHDetFaE2jhS7K7rxaf +# Celery worker concurrency (prefork pool size). +# Default in settings is 8. Lower to 2-4 on memory-constrained dev laptops — +# each prefork worker is a separate Python process with its own DB connection. +# CELERY_WORKER_CONCURRENCY=4 + # RabbitMQ CELERY_BROKER_URL=amqp://rabbituser:rabbitpass@rabbitmq:5672/ CELERY_RESULT_BACKEND=rpc:// # Use RabbitMQ for results backend diff --git a/.envs/.production/.django-example b/.envs/.production/.django-example index 93737d527..c30d88dc0 100644 --- a/.envs/.production/.django-example +++ b/.envs/.production/.django-example @@ -20,6 +20,13 @@ REDIS_URL=redis://redis:6379/0 # Celery # ------------------------------------------------------------------------------ +# Worker concurrency (prefork pool size). Settings default is 8. +# Recommended for production: 16 on an 8-core host — process_nats_pipeline_result +# and create_detection_images are DB/Redis-bound, so ~2x cpu_count roughly matches +# observed drain vs ingress on the antenna queue without saturating pgbouncer. +# Tune higher only after confirming postgres connection-pool headroom. +CELERY_WORKER_CONCURRENCY=16 + # Flower CELERY_FLOWER_USER= CELERY_FLOWER_PASSWORD= diff --git a/config/settings/base.py b/config/settings/base.py index d0b456708..5877d3406 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -395,10 +395,13 @@ def _celery_result_backend_url(redis_url): # Worker concurrency (prefork pool size) # https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-concurrency -# Default when unset is os.cpu_count(), which underutilises this workload because -# process_nats_pipeline_result is DB/Redis-bound and spends most of its time on I/O. -# Override via CELERY_WORKER_CONCURRENCY env var per deployment. -CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=16) +# Celery's own default when unset is os.cpu_count(), which on the production +# 8-core host produced an 8-process pool that could not keep up with the antenna +# queue's DB/Redis-bound tasks (process_nats_pipeline_result, create_detection_images). +# 8 is a conservative default that keeps local/staging/demo memory footprints +# reasonable (each prefork worker is a separate Python process with imports + +# DB connection). Production should override to 16 (see .envs/.production/.django-example). +CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=8) # Cancel & return to queue if connection is lost # https://docs.celeryq.dev/en/latest/userguide/configuration.html#worker-cancel-long-running-tasks-on-connection-loss