-
Notifications
You must be signed in to change notification settings - Fork 459
health checks implementation
Health checks are critical for dynamic route announcement. ExaBGP announces or withdraws routes based on service availability, enabling automatic failover and high availability.
ExaBGP includes a built-in healthcheck module (since version 4.x):
python3 -m exabgp healthcheck

neighbor 192.0.2.1 {
router-id 10.0.0.1;
local-address 192.0.2.2;
local-as 65000;
peer-as 65000;
api {
processes [service-nginx];
}
}
process service-nginx {
run python3 -m exabgp healthcheck \
-s \
--name nginx \
--cmd "curl --fail --verbose --max-time 2 http://localhost" \
--start-ip 0;
encoder text;
}

--name <service> # Service name
--cmd <command> # Health check command
--start-ip <number> # Starting IP offset
-s, --syslog # Log to syslog
--ip <ip> # Service IP address
--interval <seconds> # Check interval (default: 5)
--timeout <seconds> # Check timeout
--rise <count> # Consecutive successes before UP
--fall <count> # Consecutive failures before DOWN

#!/usr/bin/env python3
import sys
import socket
from time import sleep
def check_tcp(address, port, timeout=2):
    """Return True if a TCP connection to (address, port) can be opened.

    Args:
        address: Host name or IP address to connect to.
        port: TCP port number.
        timeout: Seconds to wait for each connection attempt.

    Returns:
        True when the connection succeeds, False on any socket-level
        failure (refused, unreachable, timed out, name resolution error).
    """
    try:
        # create_connection resolves the name and tries every returned
        # address (IPv4 and IPv6); a bare socket.socket() is IPv4-only.
        # The context manager closes the socket on every path.
        with socket.create_connection((address, port), timeout=timeout):
            return True
    except OSError:
        # socket.timeout, socket.error and socket.gaierror are all OSError.
        return False
# Configuration
SERVICE_IP = '203.0.113.10'
CHECK_HOST = 'localhost'
CHECK_PORT = 80
CHECK_INTERVAL = 5

# The route is deliberately NOT announced before the first check: an
# unconditional announce at startup would attract traffic to a service
# that has not been verified yet. The first loop iteration runs the check
# immediately and announces only if it passes.
while True:
    if check_tcp(CHECK_HOST, CHECK_PORT):
        sys.stdout.write(f'announce route {SERVICE_IP}/32 next-hop self\n')
    else:
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
    # Flush so the command leaves this process's buffer right away.
    sys.stdout.flush()
    sleep(CHECK_INTERVAL)
#!/usr/bin/env python3
import sys
import urllib.request
import urllib.error
from time import sleep
def check_http(url, timeout=2):
    """Return True if a GET of *url* answers with HTTP status 200.

    Args:
        url: Full URL of the endpoint to probe.
        timeout: Seconds to wait on blocking network operations.

    Returns:
        True for a 200 response; False for any other status, a connection
        failure, a timeout, or a malformed HTTP response.
    """
    # Local import: only the exception class is needed, and the file's
    # import block is left untouched.
    import http.client

    try:
        # The context manager closes the response (and its socket) even
        # when the status comparison is the last thing done with it.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.getcode() == 200
    except (OSError, http.client.HTTPException):
        # URLError/HTTPError are OSError subclasses. Catching OSError also
        # covers timeouts raised while reading the response, which are not
        # wrapped in URLError and would otherwise terminate the script.
        return False
# Configuration
SERVICE_IP = '203.0.113.10'
HEALTH_URL = 'http://localhost:80/health'
CHECK_INTERVAL = 5

while True:
    # Exactly one command is emitted per cycle: announce while the health
    # endpoint answers, withdraw otherwise.
    if check_http(HEALTH_URL):
        command = f'announce route {SERVICE_IP}/32 next-hop self\n'
    else:
        command = f'withdraw route {SERVICE_IP}/32\n'
    sys.stdout.write(command)
    sys.stdout.flush()
    sleep(CHECK_INTERVAL)
#!/usr/bin/env python3
import socket
import subprocess
import sys
from time import sleep
from typing import Callable, NamedTuple
class ServiceCheck(NamedTuple):
    """One monitored service and the route that depends on its health."""

    # Label written to stderr when the check fails.
    name: str
    # Address announced or withdrawn as a /32 route.
    ip: str
    # Zero-argument probe; a truthy result means healthy. Annotated with
    # typing.Callable because the builtin `callable` is a function, not a
    # type.
    check_func: Callable[[], bool]
    # Value sent as `med` with the announcement.
    metric: int = 100
def check_http_service():
    """Probe http://localhost with curl and report whether it succeeded.

    Returns:
        True when curl exits with status 0; False when it exits non-zero,
        cannot be started, or exceeds the 3-second subprocess timeout.
    """
    try:
        result = subprocess.run(
            ['curl', '-sf', '--max-time', '2', 'http://localhost'],
            capture_output=True,
            timeout=3,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: curl missing or not executable.
        # SubprocessError: includes TimeoutExpired.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return False
    return result.returncode == 0
def check_database():
    """Report whether a PostgreSQL connection can be opened and closed.

    Returns:
        True when the connection succeeds; False when psycopg2 is not
        installed or the connection attempt fails.
    """
    try:
        import psycopg2
    except ImportError:
        # Without the driver the check cannot pass; report unhealthy
        # instead of crashing the health-check process.
        return False

    try:
        conn = psycopg2.connect("dbname=test user=postgres", connect_timeout=2)
        conn.close()
    except psycopg2.Error:
        # Narrow handler: a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit.
        return False
    return True
def check_redis():
    """Report whether a TCP connection to localhost:6379 can be opened.

    Returns:
        True when the connection succeeds within 2 seconds, else False.
    """
    try:
        # The context manager closes the socket on success; on failure no
        # socket is left open (the original leaked it when connect raised).
        with socket.create_connection(('localhost', 6379), timeout=2):
            return True
    except OSError:
        # Covers refused, unreachable, timeout and resolution errors
        # without swallowing KeyboardInterrupt/SystemExit.
        return False
# Define services to monitor
services = [
    ServiceCheck('web', '203.0.113.10', check_http_service, 100),
    ServiceCheck('db', '203.0.113.11', check_database, 100),
    ServiceCheck('cache', '203.0.113.12', check_redis, 100),
]

while True:
    for service in services:
        if service.check_func():
            sys.stdout.write(
                f'announce route {service.ip}/32 next-hop self med {service.metric}\n'
            )
            continue
        # Unhealthy: withdraw the route and record which service failed.
        sys.stdout.write(f'withdraw route {service.ip}/32\n')
        sys.stderr.write(f'Service {service.name} failed health check\n')
    sys.stdout.flush()
    sys.stderr.flush()
    sleep(10)
#!/usr/bin/env python3
import sys
from time import sleep
from collections import defaultdict
class HealthChecker:
    """Health-state tracker with rise/fall dampening.

    A service starts DOWN. It becomes UP after `rise` consecutive passing
    checks and DOWN again after `fall` consecutive failing checks. A result
    that agrees with the current state resets the transition counter.
    """

    def __init__(self, rise=3, fall=3):
        """Create a checker.

        Args:
            rise: Consecutive passes required to mark a service UP.
            fall: Consecutive failures required to mark a service DOWN.
        """
        self.rise = rise
        self.fall = fall
        self.state = {}  # service_id -> True (UP) / False (DOWN)
        # service_id -> consecutive results that disagree with the state
        self.counts = defaultdict(int)

    def check(self, service_id, check_func):
        """Run one health check and apply dampening.

        Args:
            service_id: Key identifying the service in the state tables.
            check_func: Zero-argument callable; truthy result means pass.

        Returns:
            (should_announce, current_state). Both elements carry the
            dampened state after this check, so they are always equal.
        """
        passed = bool(check_func())
        is_up = self.state.get(service_id, False)

        if passed == is_up:
            # Result agrees with the current state: any partial progress
            # towards a transition is discarded.
            self.counts[service_id] = 0
            return (is_up, is_up)

        # Result disagrees with the current state: count towards a flip.
        self.counts[service_id] += 1
        threshold = self.rise if passed else self.fall
        if self.counts[service_id] >= threshold:
            self.state[service_id] = passed
            self.counts[service_id] = 0
            return (passed, passed)

        # Not enough consecutive results yet; keep reporting the old state.
        return (is_up, is_up)
# Usage
checker = HealthChecker(rise=3, fall=2)
SERVICE_IP = '203.0.113.10'

while True:
    # `state` mirrors `should_announce`; only the latter drives the output.
    should_announce, state = checker.check('web', check_http_service)
    command = (
        f'announce route {SERVICE_IP}/32 next-hop self\n'
        if should_announce
        else f'withdraw route {SERVICE_IP}/32\n'
    )
    sys.stdout.write(command)
    sys.stdout.flush()
    sleep(5)
#!/usr/bin/env python3
import sys
import psutil
from time import sleep
def get_system_load():
    """Sample current CPU and memory utilisation via psutil.

    Returns:
        A (cpu, memory) tuple taken from psutil.cpu_percent (sampled with
        interval=1) and psutil.virtual_memory().percent.
    """
    cpu_pct = psutil.cpu_percent(interval=1)
    mem_pct = psutil.virtual_memory().percent
    return cpu_pct, mem_pct
def calculate_metric(cpu_percent, memory_percent):
    """Calculate a BGP MED from system load.

    The metric is a base of 100 plus one point per CPU percent and half a
    point per memory percent, each truncated to an integer.

    Args:
        cpu_percent: CPU utilisation, 0-100.
        memory_percent: Memory utilisation, 0-100.

    Returns:
        Integer metric; 100 for an idle system, 250 at full utilisation.
    """
    # Multiply before dividing. (x / 100) * 100 loses precision for values
    # such as 29.0 (-> 28.999999999999996), which truncated to 28.
    cpu_penalty = int(cpu_percent * 100 / 100)
    mem_penalty = int(memory_percent * 50 / 100)
    return 100 + cpu_penalty + mem_penalty
SERVICE_IP = '203.0.113.10'

while True:
    cpu, memory = get_system_load()
    if not (cpu < 90 and memory < 90):
        # System overloaded, withdraw
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
    else:
        # Healthy enough to serve: advertise with a load-derived MED.
        metric = calculate_metric(cpu, memory)
        sys.stdout.write(
            f'announce route {SERVICE_IP}/32 next-hop self med {metric}\n'
        )
    sys.stdout.flush()
    sleep(10)


def check_web_app():
    """Run every web-application dependency check.

    Returns:
        (all_passed, results), where results maps each check name to the
        value its probe returned. Every probe runs even after one fails.
    """
    results = {}
    results['http'] = check_http('http://localhost:80')
    results['health_endpoint'] = check_http('http://localhost:80/health')
    results['database'] = check_database_connection()
    results['cache'] = check_redis_connection()
    results['disk_space'] = check_disk_space()
    # All checks must pass
    return all(results.values()), results
def check_disk_space(threshold_percent=90):
    """Return True while usage of the root filesystem is below the threshold.

    Args:
        threshold_percent: Usage percentage at or above which the check fails.
    """
    usage = psutil.disk_usage('/')
    return usage.percent < threshold_percent


def check_database_replication():
    """Check PostgreSQL replication lag.

    Returns:
        (healthy, lag_seconds). `healthy` is True when the lag is below
        5 seconds. `lag_seconds` is a float, or None when it could not be
        determined (driver missing, connection or query failure, or the
        server reported no replay timestamp).
    """
    try:
        import psycopg2
    except ImportError:
        return False, None

    try:
        # connect_timeout keeps an unreachable server from stalling the
        # health-check loop indefinitely.
        conn = psycopg2.connect(
            "dbname=postgres user=replicator", connect_timeout=2
        )
    except psycopg2.Error:
        return False, None

    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))"
        )
        row = cur.fetchone()
    except psycopg2.Error:
        return False, None
    finally:
        # Close on every path; the original leaked the connection when
        # the query raised.
        conn.close()

    lag_seconds = row[0] if row else None
    if lag_seconds is None:
        # NULL replay timestamp: no lag can be computed. Previously this
        # case was only handled by a TypeError hitting a bare `except:`.
        return False, None

    # NOTE(review): the driver may hand back a Decimal depending on the
    # server's EXTRACT return type; normalise to float for callers.
    lag_seconds = float(lag_seconds)
    # Acceptable lag: < 5 seconds
    return lag_seconds < 5.0, lag_seconds
# Usage with metric adjustment
SERVICE_IP = '203.0.113.11'

while True:
    healthy, lag = check_database_replication()
    if not healthy:
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
    else:
        # Metric based on lag (lower lag = better metric)
        metric = int(100 + (lag * 10))
        sys.stdout.write(
            f'announce route {SERVICE_IP}/32 next-hop self med {metric}\n'
        )
    sys.stdout.flush()
    # The loop's closing sleep() call is on the next line of the file.
    sleep(5)

ExaCheck is a dedicated BGP health checker that works with ExaBGP.
Installation:
pip3 install exacheck

Configuration: /etc/exacheck/exacheck.conf
services:
web:
ip: 203.0.113.10
checks:
- type: http
url: http://localhost
interval: 5
timeout: 2
rise: 3
fall: 2
metric: 100
dns:
ip: 203.0.113.11
checks:
- type: dns
query: example.com
server: 127.0.0.1
interval: 5
metric: 100

Run ExaCheck:
exacheck --config /etc/exacheck/exacheck.conf

sysadminblog/exabgp-healthcheck (Perl-based)
Features:
- Multiple health check types
- Configuration file support
- Metric adjustment
- Maintenance mode via file flag
Configuration: healthcheck.conf
{
'service1' => {
'ip' => '203.0.113.10',
'check' => '/usr/local/bin/check_service.sh',
'interval' => 5,
'timeout' => 2,
'metric' => 100,
'disable' => '/var/run/service1.maintenance',
},
}

#!/usr/bin/env python3
import sys
import os
from time import sleep
MAINTENANCE_FILE = '/var/run/exabgp-maintenance'
SERVICE_IP = '203.0.113.10'

while True:
    in_maintenance = os.path.exists(MAINTENANCE_FILE)
    if in_maintenance:
        # Maintenance mode: withdraw routes without probing the service.
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
        sys.stderr.write('Maintenance mode active\n')
    elif check_service():
        # Normal operation, service healthy.
        sys.stdout.write(f'announce route {SERVICE_IP}/32 next-hop self\n')
    else:
        # Normal operation, service unhealthy.
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
    sys.stdout.flush()
    sys.stderr.flush()
    # The loop's closing sleep() call is on the next line of the file.
    sleep(5)

Enter maintenance:
touch /var/run/exabgp-maintenance

Exit maintenance:
rm /var/run/exabgp-maintenance

#!/usr/bin/env python3
import sys
import signal
from time import sleep
maintenance_mode = False
def toggle_maintenance(signum, frame):
    """Signal handler: flip the module-level maintenance flag and log it.

    Args:
        signum: Signal number passed by the signal machinery (unused).
        frame: Interrupted stack frame (unused).
    """
    global maintenance_mode
    maintenance_mode = not maintenance_mode
    if maintenance_mode:
        msg = "ENABLED"
    else:
        msg = "DISABLED"
    sys.stderr.write(f'Maintenance mode {msg}\n')
    sys.stderr.flush()
# Register signal handler
signal.signal(signal.SIGUSR1, toggle_maintenance)
SERVICE_IP = '203.0.113.10'

while True:
    # Short-circuit keeps the original behaviour: the service is not
    # probed at all while maintenance mode is on.
    if maintenance_mode or not check_service():
        sys.stdout.write(f'withdraw route {SERVICE_IP}/32\n')
    else:
        sys.stdout.write(f'announce route {SERVICE_IP}/32 next-hop self\n')
    sys.stdout.flush()
    # The loop's closing sleep() call is on the next line of the file.
    sleep(5)

Toggle maintenance:
kill -USR1 <pid>

import sys
import json
from datetime import datetime
def log(level, message, **kwargs):
    """Write one structured (JSON) log line to stderr.

    Args:
        level: Severity label stored under the 'level' key.
        message: Human-readable text stored under the 'message' key.
        **kwargs: Extra fields merged into the entry. A field named
            'timestamp' or 'level' overrides the built-in value.
    """
    # Local import: only this function needs timezone, and the file's
    # import block is left untouched.
    from datetime import timezone

    log_entry = {
        # Timezone-aware UTC timestamp (ends in +00:00). datetime.utcnow()
        # is deprecated and returns a naive value with no offset.
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'level': level,
        'message': message,
        **kwargs
    }
    # default=str is deliberate: a field that is not JSON-serialisable
    # (e.g. an exception object) is stringified rather than raising
    # TypeError inside the logger.
    sys.stderr.write(json.dumps(log_entry, default=str) + '\n')
    sys.stderr.flush()
# Usage
log('INFO', 'Health check passed', service='web', ip='203.0.113.10')
log('ERROR', 'Health check failed', service='web', ip='203.0.113.10', error='timeout')

from prometheus_client import Counter, Gauge, Histogram, start_http_server
# Prometheus metrics
health_checks_total = Counter(
    'health_checks_total',
    'Total health checks',
    ['service', 'result'],
)
service_up = Gauge('service_up', 'Service health status', ['service'])
check_duration = Histogram(
    'health_check_duration_seconds',
    'Health check duration',
    ['service'],
)

# Start metrics server
start_http_server(9100)

# In health check loop
with check_duration.labels(service='web').time():
    # Only the check itself is timed.
    healthy = check_service()
health_checks_total.labels(
    service='web',
    result='success' if healthy else 'failure',
).inc()
service_up.labels(service='web').set(1 if healthy else 0)

- Retry Logic: Implement rise/fall counters to prevent flapping
- Timeout Management: Set reasonable timeouts (2-5 seconds)
- Check Interval: Balance between responsiveness and overhead (5-10 seconds typical)
- Comprehensive Checks: Check all critical dependencies
- Maintenance Mode: Support graceful service withdrawal
- Logging: Log all state changes for troubleshooting
- Monitoring: Export metrics for observability
- Error Handling: Catch and handle all exceptions
- Graceful Degradation: Use metrics for partial failures
- Testing: Test health check logic independently
- Keep checks lightweight (< 2 seconds)
- Avoid blocking operations
- Use connection pooling for database checks
- Cache check results when appropriate
- Run checks in parallel for multiple services
- Monitor health check script resource usage
Routes Not Withdrawing:
- Check health check script is running
- Verify check logic returns correct boolean
- Ensure stdout flush is called
- Check ExaBGP process logs
Flapping Routes:
- Add retry logic (rise/fall counters)
- Increase check interval
- Investigate root cause of intermittent failures
- Add hysteresis to metrics
False Negatives:
- Increase timeout values
- Add retry before declaring failure
- Check network connectivity from host
- Verify service listening on correct interface
# Test health check script directly
python3 /usr/local/bin/healthcheck.py
# Monitor ExaBGP output
tail -f /var/log/exabgp/exabgp.log
# Check process is running
ps aux | grep healthcheck
# Test TCP connectivity
nc -zv localhost 80
# Test HTTP endpoint
curl -v --max-time 2 http://localhost/health

- ExaBGP built-in healthcheck: python3 -m exabgp healthcheck --help
- ExaCheck: https://exacheck.net/
- sysadminblog healthcheck: https://github.com/sysadminblog/exabgp-healthcheck
- Vincent Bernat's HA guide: https://vincent.bernat.ch/en/blog/2013-exabgp-highavailability
Home
Getting Started
API
Use Cases
Address Families
Configuration
Operations
Reference
- Architecture
- BGP State Machine
- Communities (RFC)
- Extended Communities
- BGP Ecosystem
- Capabilities (AFI/SAFI)
- RFC Support
Migration
Community
External
- GitHub Repo →
- Slack →
- Issues →
👻 Ghost written by Claude (Anthropic AI)