#!/usr/bin/env python3
"""
Watchdog for the main Pi that pings the Pi Zero and power-cycles its relay
after repeated failures, with cooldown and daily limits.

Cron setup (every 10 minutes at :05):
    1) Edit crontab: crontab -e
    2) Add the line:
       5,15,25,35,45,55 * * * * /usr/bin/python3 /home/pi/pyro-engine/watchdog/main_pi/watchdog.py >> /home/pi/watchdog_main.log 2>&1

Adjust the paths to match where this repo lives on the Pi.
"""

import datetime as dt
import logging
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path

import RPi.GPIO as GPIO

# ================= CONFIG =================

RELAY_PIZERO = 16            # BCM pin driving the Pi Zero power relay
PIZERO_IP = "192.168.1.98"

PING_COUNT = 2               # packets per ping check
TIMEOUT = 2                  # seconds, per-packet ping timeout (-W)

MAX_FAILS = 3                # consecutive failed checks before a power cycle
POWER_OFF_TIME = 15          # seconds power is held off during a cycle

COOLDOWN_SECONDS = 30 * 60   # minimum gap between two power cycles
MAX_REBOOTS_PER_DAY = 3      # hard cap on power cycles per calendar day

# State lives in /tmp so counters reset naturally on a Pi reboot.
STATE_DIR = Path("/tmp")
LOG_FILE = Path("/home/pi/watchdog_main.log")

FAIL_PIZERO_FILE = STATE_DIR / "fail_pizero"
LAST_REBOOT_FILE = STATE_DIR / "last_reboot_pizero"
DAILY_REBOOT_FILE = STATE_DIR / "daily_reboots_pizero"

# ================ LOGGING =================

logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# ================ GPIO ====================

GPIO.setmode(GPIO.BCM)
GPIO.setwarnings(False)

# HIGH keeps power on (the cycle below drives LOW then back HIGH and logs
# "power restored" on HIGH) — confirm against the actual relay wiring.
GPIO.setup(RELAY_PIZERO, GPIO.OUT, initial=GPIO.HIGH)

# ================ IO HELPERS ==============

def read_int(path: Path, default: int = 0) -> int:
    """Read an int from *path*; return *default* on any read/parse error."""
    try:
        return int(path.read_text().strip())
    except Exception:
        return default


def write_int(path: Path, value: int) -> None:
    """Write *value* to *path* as decimal text (overwrites)."""
    path.write_text(str(value))


def read_text(path: Path, default: str = "") -> str:
    """Read stripped text from *path*; return *default* on any error."""
    try:
        return path.read_text().strip()
    except Exception:
        return default


def write_text(path: Path, value: str) -> None:
    """Write *value* to *path* (overwrites)."""
    path.write_text(value)

# ================ CHECKS ==================

def ping_host(ip: str) -> bool:
    """Return True if *ip* answers PING_COUNT pings within TIMEOUT seconds each.

    A non-zero ping exit status (host down / packet loss) yields False.
    A missing ``ping`` binary would raise — that is a setup error, not a
    host failure, so it is deliberately not swallowed here.
    """
    try:
        subprocess.check_output(
            ["ping", "-c", str(PING_COUNT), "-W", str(TIMEOUT), ip],
            stderr=subprocess.DEVNULL,
        )
        return True
    except subprocess.CalledProcessError:
        return False

# ================ FAIL COUNTERS ===========

def update_fail_counter(ok: bool, fail_file: Path, label: str) -> int:
    """Update the persistent consecutive-failure counter for one check.

    On success the counter is reset to 0; on failure it is incremented.
    Returns the new counter value.
    """
    if ok:
        logging.info("%s check OK", label)
        write_int(fail_file, 0)
        return 0

    fails = read_int(fail_file, 0) + 1
    logging.warning("%s check FAILED (%s/%s)", label, fails, MAX_FAILS)
    write_int(fail_file, fails)
    return fails

# ================ REBOOT GUARD ============

@dataclass(frozen=True)
class RebootGuard:
    """Rate-limits power cycles: a cooldown between cycles plus a daily cap.

    Daily state is stored as ``"<ISO date> <count>"`` in a text file.
    """

    cooldown_seconds: int
    max_reboots_per_day: int

    def _read_daily(self, daily_file: Path) -> tuple[str, int]:
        """Return (day, count) for today; (today, 0) if the file is stale,
        missing, or malformed."""
        today = dt.date.today().isoformat()
        raw = read_text(daily_file, "")

        if not raw:
            return today, 0

        parts = raw.split()
        if len(parts) != 2:
            return today, 0

        day, count_s = parts
        try:
            count = int(count_s)
        except Exception:
            count = 0

        # A record from a previous day no longer counts against today.
        if day != today:
            return today, 0

        return day, count

    def can_reboot(self, now_ts: int, last_reboot_file: Path, daily_file: Path, label: str) -> bool:
        """Return True if a power cycle is currently allowed for *label*."""
        last_ts = read_int(last_reboot_file, 0)
        if last_ts and (now_ts - last_ts) < self.cooldown_seconds:
            remaining = self.cooldown_seconds - (now_ts - last_ts)
            logging.warning("%s: reboot blocked by cooldown, remaining_seconds=%s", label, remaining)
            return False

        day, count = self._read_daily(daily_file)
        if count >= self.max_reboots_per_day:
            logging.warning(
                "%s: reboot blocked by daily limit, count=%s, limit=%s",
                label,
                count,
                self.max_reboots_per_day,
            )
            # Rewrite the record so the file's mtime reflects the last check.
            write_text(daily_file, f"{day} {count}")
            return False

        return True

    def record_reboot(self, now_ts: int, last_reboot_file: Path, daily_file: Path) -> None:
        """Persist the reboot timestamp and bump today's reboot count."""
        write_int(last_reboot_file, now_ts)

        today = dt.date.today().isoformat()
        day, count = self._read_daily(daily_file)
        if day != today:
            day, count = today, 0

        count += 1
        write_text(daily_file, f"{day} {count}")


def power_cycle(relay_gpio: int, label: str, last_file: Path, daily_file: Path, guard: RebootGuard) -> None:
    """Cut power on *relay_gpio* for POWER_OFF_TIME seconds, if *guard* allows.

    Records the reboot in the guard's state files on completion.
    """
    now_ts = int(time.time())

    if not guard.can_reboot(now_ts, last_file, daily_file, label):
        return

    logging.warning("%s: power cycle triggered", label)
    GPIO.output(relay_gpio, GPIO.LOW)
    try:
        time.sleep(POWER_OFF_TIME)
    finally:
        # FIX: always restore power, even if the sleep is interrupted by a
        # signal/exception — otherwise the relay stays open and the Pi Zero
        # is left permanently powered off.
        GPIO.output(relay_gpio, GPIO.HIGH)
    logging.info("%s: power restored", label)

    guard.record_reboot(now_ts, last_file, daily_file)

# ================= MAIN ===================

def main() -> None:
    """Run one watchdog pass: ping the Pi Zero, power cycle on repeated failure."""
    guard = RebootGuard(
        cooldown_seconds=COOLDOWN_SECONDS,
        max_reboots_per_day=MAX_REBOOTS_PER_DAY,
    )

    fails = update_fail_counter(ping_host(PIZERO_IP), FAIL_PIZERO_FILE, "Pi Zero")
    if fails >= MAX_FAILS:
        power_cycle(RELAY_PIZERO, "Pi Zero", LAST_REBOOT_FILE, DAILY_REBOOT_FILE, guard)
        # NOTE(review): the counter is reset even when the guard blocked the
        # cycle, so the next attempt needs MAX_FAILS fresh failures — confirm
        # this back-off is intended.
        write_int(FAIL_PIZERO_FILE, 0)


if __name__ == "__main__":
    try:
        main()
    finally:
        GPIO.cleanup()
#!/usr/bin/env python3
"""
Watchdog script for Pyro Engine hardware.

It checks the main Pi health endpoint and pings camera IPs, tracking failures
and power-cycling relays after repeated failures with cooldown/daily limits.

Cron setup (every 10 minutes):
    1) Edit crontab: crontab -e
    2) Add the line:
       */10 * * * * /usr/bin/python3 /home/pi/pyro-engine/watchdog/pi_zero/watchdog.py >> /home/pi/watchdog.log 2>&1

Adjust the path to match where this repo lives on the Pi.
"""

import datetime as dt
import json
import logging
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from urllib.request import Request, urlopen

import RPi.GPIO as GPIO

# ================= CONFIG =================

RELAY_MAIN = 16              # BCM pin driving the main Pi power relay
RELAY_CAMS = 26              # BCM pin driving the cameras/router 12V relay

MAIN_PI_IP = "192.168.1.99"
MAIN_HEALTH_URL = f"http://{MAIN_PI_IP}:8081/health"

CAM_IPS = ["192.168.1.11", "192.168.1.12"]
INTERNET_IP = "1.1.1.1"

PING_COUNT = 2               # packets per ping check
TIMEOUT = 2                  # seconds, used for ping and HTTP timeout

MAX_FAILS = 3                # consecutive failed checks before a power cycle
POWER_OFF_TIME = 15          # seconds power is held off during a cycle

COOLDOWN_SECONDS = 30 * 60   # minimum gap between two power cycles
MAX_REBOOTS_PER_DAY = 3      # hard cap on power cycles per calendar day

# State lives in /tmp so counters reset naturally on a Pi reboot.
STATE_DIR = Path("/tmp")
LOG_FILE = Path("/home/pi/watchdog.log")

FAIL_MAIN_FILE = STATE_DIR / "fail_main"
FAIL_INTERNET_FILE = STATE_DIR / "fail_internet"
FAIL_CAM_FILES = {ip: STATE_DIR / f"fail_cam_{ip.split('.')[-1]}" for ip in CAM_IPS}

MAIN_LAST_REBOOT_FILE = STATE_DIR / "last_reboot_main"
CAMS_LAST_REBOOT_FILE = STATE_DIR / "last_reboot_cams"
MAIN_DAILY_FILE = STATE_DIR / "daily_reboots_main"
CAMS_DAILY_FILE = STATE_DIR / "daily_reboots_cams"

# ================ LOGGING =================

logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# ================ GPIO ====================

GPIO.setmode(GPIO.BCM)
GPIO.setwarnings(False)

# HIGH keeps power on (the cycle below drives LOW then back HIGH and logs
# "power restored" on HIGH) — confirm against the actual relay wiring.
GPIO.setup(RELAY_MAIN, GPIO.OUT, initial=GPIO.HIGH)
GPIO.setup(RELAY_CAMS, GPIO.OUT, initial=GPIO.HIGH)

# ================ IO HELPERS ==============

def read_int(path: Path, default: int = 0) -> int:
    """Read an int from *path*; return *default* on any read/parse error."""
    try:
        return int(path.read_text().strip())
    except Exception:
        return default


def write_int(path: Path, value: int) -> None:
    """Write *value* to *path* as decimal text (overwrites)."""
    path.write_text(str(value))


def read_text(path: Path, default: str = "") -> str:
    """Read stripped text from *path*; return *default* on any error."""
    try:
        return path.read_text().strip()
    except Exception:
        return default


def write_text(path: Path, value: str) -> None:
    """Write *value* to *path* (overwrites)."""
    path.write_text(value)

# ================ CHECKS ==================

def ping_host(ip: str, count: int = PING_COUNT, timeout: int = TIMEOUT) -> bool:
    """Return True if *ip* answers *count* pings within *timeout* seconds each.

    A non-zero ping exit status (host down / packet loss) yields False.
    A missing ``ping`` binary would raise — that is a setup error, not a
    host failure, so it is deliberately not swallowed here.
    """
    try:
        subprocess.check_output(
            ["ping", "-c", str(count), "-W", str(timeout), ip],
            stderr=subprocess.DEVNULL,
        )
        return True
    except subprocess.CalledProcessError:
        return False


def http_health_ok(url: str) -> bool:
    """Return True iff *url* answers HTTP 200 with JSON ``{"status": "ok"}``.

    Any network error, timeout, non-200 status, or parse failure counts as
    unhealthy (False).
    """
    req = Request(url, method="GET", headers={"accept": "application/json"})
    try:
        with urlopen(req, timeout=TIMEOUT) as resp:
            if resp.status != 200:
                return False
            body = resp.read().decode("utf-8", errors="replace")
            data = json.loads(body)
            return data.get("status") == "ok"
    except Exception:
        return False


# ================ FAIL COUNTERS ===========

def update_fail_counter(ok: bool, fail_file: Path, label: str, log_result: bool = True) -> int:
    """Update the persistent consecutive-failure counter for one check.

    On success the counter is reset to 0; on failure it is incremented.
    *log_result* lets callers suppress per-check log lines (the camera loop
    emits one summary line instead). Returns the new counter value.
    """
    if ok:
        if log_result:
            logging.info("%s check OK", label)
        write_int(fail_file, 0)
        return 0

    fails = read_int(fail_file, 0) + 1
    if log_result:
        logging.warning("%s check FAILED (%s/%s)", label, fails, MAX_FAILS)
    write_int(fail_file, fails)
    return fails

# ================ REBOOT GUARD ============

@dataclass(frozen=True)
class RebootGuard:
    """Rate-limits power cycles: a cooldown between cycles plus a daily cap.

    Daily state is stored as ``"<ISO date> <count>"`` in a text file.
    """

    cooldown_seconds: int
    max_reboots_per_day: int

    def _read_daily(self, daily_file: Path) -> tuple[str, int]:
        """Return (day, count) for today; (today, 0) if the file is stale,
        missing, or malformed."""
        today = dt.date.today().isoformat()
        raw = read_text(daily_file, "")

        if not raw:
            return today, 0

        parts = raw.split()
        if len(parts) != 2:
            return today, 0

        day, count_s = parts
        try:
            count = int(count_s)
        except Exception:
            count = 0

        # A record from a previous day no longer counts against today.
        if day != today:
            return today, 0

        return day, count

    def can_reboot(self, now_ts: int, last_reboot_file: Path, daily_file: Path, label: str) -> bool:
        """Return True if a power cycle is currently allowed for *label*."""
        last_ts = read_int(last_reboot_file, 0)
        if last_ts and (now_ts - last_ts) < self.cooldown_seconds:
            remaining = self.cooldown_seconds - (now_ts - last_ts)
            logging.warning("%s: reboot blocked by cooldown, remaining_seconds=%s", label, remaining)
            return False

        day, count = self._read_daily(daily_file)
        if count >= self.max_reboots_per_day:
            logging.warning(
                "%s: reboot blocked by daily limit, count=%s, limit=%s",
                label,
                count,
                self.max_reboots_per_day,
            )
            # Rewrite the record so the file's mtime reflects the last check.
            write_text(daily_file, f"{day} {count}")
            return False

        return True

    def record_reboot(self, now_ts: int, last_reboot_file: Path, daily_file: Path) -> None:
        """Persist the reboot timestamp and bump today's reboot count."""
        write_int(last_reboot_file, now_ts)

        today = dt.date.today().isoformat()
        day, count = self._read_daily(daily_file)
        if day != today:
            day, count = today, 0

        count += 1
        write_text(daily_file, f"{day} {count}")


def power_cycle(relay_gpio: int, label: str, last_file: Path, daily_file: Path, guard: RebootGuard) -> None:
    """Cut power on *relay_gpio* for POWER_OFF_TIME seconds, if *guard* allows.

    Records the reboot in the guard's state files on completion.
    """
    now_ts = int(time.time())

    if not guard.can_reboot(now_ts, last_file, daily_file, label):
        return

    logging.warning("%s: power cycle triggered", label)
    GPIO.output(relay_gpio, GPIO.LOW)
    try:
        time.sleep(POWER_OFF_TIME)
    finally:
        # FIX: always restore power, even if the sleep is interrupted by a
        # signal/exception — otherwise the relay stays open and the target
        # is left permanently powered off.
        GPIO.output(relay_gpio, GPIO.HIGH)
    logging.info("%s: power restored", label)

    guard.record_reboot(now_ts, last_file, daily_file)

# ================= MAIN ===================

def main() -> None:
    """Run one watchdog pass: internet ping, main Pi health check, camera pings.

    Repeated internet or camera failures power-cycle the 12V relay (cameras
    and router); repeated main Pi health failures power-cycle the main Pi relay.
    """
    guard = RebootGuard(
        cooldown_seconds=COOLDOWN_SECONDS,
        max_reboots_per_day=MAX_REBOOTS_PER_DAY,
    )

    reboot_12v = False

    internet_ok = ping_host(INTERNET_IP, count=1, timeout=TIMEOUT)
    internet_fails = update_fail_counter(internet_ok, FAIL_INTERNET_FILE, f"Internet {INTERNET_IP}")
    if internet_fails >= MAX_FAILS:
        reboot_12v = True

    if internet_ok:
        main_ok = http_health_ok(MAIN_HEALTH_URL)
        main_fails = update_fail_counter(main_ok, FAIL_MAIN_FILE, "Main Pi health")
    else:
        # With the uplink down, a failed health check would be ambiguous —
        # skip it rather than count a failure against the main Pi.
        logging.warning("Skipping Main Pi health check (internet_ok=%s)", internet_ok)
        main_fails = 0

    if main_fails >= MAX_FAILS:
        power_cycle(RELAY_MAIN, "Main Pi", MAIN_LAST_REBOOT_FILE, MAIN_DAILY_FILE, guard)
        # NOTE(review): the counter is reset even when the guard blocked the
        # cycle, so the next attempt needs MAX_FAILS fresh failures — confirm
        # this back-off is intended.
        write_int(FAIL_MAIN_FILE, 0)

    cam_results: list[tuple[str, bool, int]] = []

    for ip in CAM_IPS:
        ok = ping_host(ip)
        fails = update_fail_counter(ok, FAIL_CAM_FILES[ip], f"Camera {ip}", log_result=False)
        cam_results.append((ip, ok, fails))
        if fails >= MAX_FAILS:
            reboot_12v = True

    if cam_results:
        parts = []
        any_failed = False
        for ip, ok, fails in cam_results:
            if ok:
                parts.append(f"{ip} OK")
            else:
                parts.append(f"{ip} FAILED ({fails}/{MAX_FAILS})")
                any_failed = True

        summary = "Cameras check: " + ", ".join(parts)
        if any_failed:
            logging.warning(summary)
        else:
            logging.info(summary)

    if reboot_12v:
        power_cycle(RELAY_CAMS, "Cameras / Router 12V", CAMS_LAST_REBOOT_FILE, CAMS_DAILY_FILE, guard)
        for ip in CAM_IPS:
            write_int(FAIL_CAM_FILES[ip], 0)
        write_int(FAIL_INTERNET_FILE, 0)


if __name__ == "__main__":
    try:
        main()
    finally:
        GPIO.cleanup()