Development
This commit is contained in:
parent
a4652866c3
commit
27eaea3d73
19 changed files with 602 additions and 427 deletions
802
routlin/health.py
Normal file
802
routlin/health.py
Normal file
|
|
@ -0,0 +1,802 @@
|
|||
"""
|
||||
health.py -- System health checks for Routlin.
|
||||
|
||||
Reads core.json, checks services, configuration files, and logs, then writes
|
||||
.health JSON. Imported by core.py; also runnable standalone.
|
||||
|
||||
Public API:
|
||||
run_and_write(data) -> dict run all checks, write .health, return dict
|
||||
print_table(status: dict) render the CLI service table from status dict
|
||||
"""
|
||||
import hashlib
|
||||
import ipaddress
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from validation import derive_interface, derive_vlan_id, is_wg
|
||||
|
||||
# ===================================================================
|
||||
# Constants (mirror core.py - no import to avoid circular dependency)
|
||||
# ===================================================================
|
||||
|
||||
# Product prefix embedded in systemd unit names and generated file names.
PRODUCT_NAME = "routlin"
# Directory that contains this module; state files below live next to it.
SCRIPT_DIR = Path(__file__).parent
# JSON status file written by run_and_write().
HEALTH_FILE = SCRIPT_DIR / ".health"
# Configuration file loaded by the standalone entry point.
CONFIG_FILE = SCRIPT_DIR / "core.json"
# Directory holding the merged-<hash>.conf blocklist files.
BLOCKLIST_DIR = SCRIPT_DIR / "blocklists"
# Directory holding one <vlan name>.conf dnsmasq file per VLAN.
DNSMASQ_CONF_DIR = Path(f"/etc/dnsmasq-{PRODUCT_NAME}")
# Directory holding the dnsmasq-<product>-<vlan>.leases files.
LEASES_DIR = Path("/var/lib/misc")
# systemd-networkd .network / .netdev files.
NETWORKD_DIR = Path("/etc/systemd/network")
# systemd .service / .timer unit files.
SYSTEMD_DIR = Path("/etc/systemd/system")
# WireGuard <iface>.conf and <iface>.key files.
WG_DIR = Path("/etc/wireguard")
RESOLV_CONF = Path("/etc/resolv.conf")
AVAHI_CONF_FILE = Path("/etc/avahi/avahi-daemon.conf")
CHRONY_CONF_FILE = Path("/etc/chrony/chrony.conf")
# Shared secret that must also appear in FreeRADIUS clients.conf.
RADIUS_SECRET_FILE = SCRIPT_DIR / ".radius-secret"
RADIUS_CLIENTS_CONF = Path("/etc/freeradius/3.0/clients.conf")
RADIUS_USERS_FILE = Path("/etc/freeradius/3.0/users")
# Base names of the systemd timers/services checked by check_services().
BLIST_TIMER_NAME = f"{PRODUCT_NAME}-dns-blocklist-update"
DASHB_TIMER_NAME = f"{PRODUCT_NAME}-dashboard-queue"
HEALTH_TIMER_NAME = f"{PRODUCT_NAME}-health-check"
DDNS_TIMER_NAME = f"{PRODUCT_NAME}-ddns-update"
# The dashboard timer is only expected when this file exists.
DASHB_QUEUE_FILE = SCRIPT_DIR / ".dashboard-queue"
NAT_SERVICE_NAME = f"{PRODUCT_NAME}-nat"
# A merged blocklist older than this (36 hours) is reported as stale.
BLOCKLIST_STALE_SECS = 36 * 3600
# Root filesystem usage (percent) at which a warning is raised.
DISK_WARN_PCT = 90
# DHCP pool usage (percent) at which a warning is raised.
DHCP_WARN_PCT = 90
# Timeout in seconds for the TCP probe to each upstream DNS server.
DNS_TIMEOUT_SECS = 2
|
||||
|
||||
# ===================================================================
|
||||
# Small helpers replicated from core.py (no import)
|
||||
# ===================================================================
|
||||
|
||||
def _vlan_service_name(vlan, iface):
    """Return the dnsmasq systemd unit name for *vlan*.

    WireGuard VLANs (as decided by ``is_wg``) get the interface name appended
    as an extra suffix; all other VLANs use only the VLAN name.
    """
    wireguard = is_wg(vlan)
    unit = f"dnsmasq-{PRODUCT_NAME}-{vlan['name']}"
    return f"{unit}-{iface}" if wireguard else unit
|
||||
|
||||
def _radius_enabled(data):
|
||||
return any(
|
||||
r.get("radius_client") is True
|
||||
for v in data.get("vlans", [])
|
||||
for r in v.get("reservations", [])
|
||||
)
|
||||
|
||||
def _avahi_enabled(data):
    """Return True when a non-WireGuard VLAN has ``mdns_reflection`` set to True.

    WireGuard VLANs are skipped before their flag is looked at, and only the
    literal boolean ``True`` enables reflection.
    """
    for vlan in data.get("vlans", []):
        if is_wg(vlan):
            continue
        if vlan.get("mdns_reflection") is True:
            return True
    return False
|
||||
|
||||
def _avahi_interfaces(data):
    """Return the interface of every non-WireGuard VLAN with mDNS reflection on.

    The list follows the order of ``data["vlans"]``; interface names come
    from ``derive_interface``.
    """
    interfaces = []
    for vlan in data.get("vlans", []):
        if vlan.get("mdns_reflection") is not True:
            continue
        if is_wg(vlan):
            continue
        interfaces.append(derive_interface(vlan, data))
    return interfaces
|
||||
|
||||
def _combo_hash(names):
|
||||
key = ",".join(sorted(names))
|
||||
return hashlib.sha256(key.encode()).hexdigest()[:8]
|
||||
|
||||
def _merged_path(h):
    """Return the path of the merged blocklist file for combination hash *h*."""
    filename = f"merged-{h}.conf"
    return BLOCKLIST_DIR / filename
|
||||
|
||||
def _lowest_quartet_ip(vlan):
|
||||
ips = []
|
||||
for s in vlan.get("server_identities", []):
|
||||
try:
|
||||
ips.append(ipaddress.IPv4Address(s["ip"]))
|
||||
except (KeyError, ValueError):
|
||||
pass
|
||||
return str(min(ips, key=lambda ip: ip.packed[-1])) if ips else None
|
||||
|
||||
def _gateway_ips(data):
    """Return the set of gateway IPs, one per VLAN that has a usable identity.

    A VLAN's gateway is whatever ``_lowest_quartet_ip`` returns for it; VLANs
    for which that is None contribute nothing.
    """
    candidates = (_lowest_quartet_ip(vlan) for vlan in data.get("vlans", []))
    return {ip for ip in candidates if ip}
|
||||
|
||||
def _iface_operstate(iface):
|
||||
"""Read operstate from sysfs. Returns 'up', 'down', 'unknown', or None."""
|
||||
try:
|
||||
return Path(f"/sys/class/net/{iface}/operstate").read_text().strip()
|
||||
except OSError:
|
||||
return None
|
||||
|
||||
def _sysctl_query(unit):
|
||||
"""Return (active, enabled) strings from systemctl."""
|
||||
r_a = subprocess.run(["systemctl", "is-active", unit], capture_output=True, text=True)
|
||||
r_e = subprocess.run(["systemctl", "is-enabled", unit], capture_output=True, text=True)
|
||||
enabled = r_e.stdout.strip() or "not-found"
|
||||
return r_a.stdout.strip(), enabled
|
||||
|
||||
# ===================================================================
|
||||
# Result builders
|
||||
# ===================================================================
|
||||
|
||||
def _ok(id_, name, detail=""):
|
||||
r = {"id": id_, "name": name, "status": "ok"}
|
||||
if detail:
|
||||
r["detail"] = detail
|
||||
return r
|
||||
|
||||
def _problem(id_, name, severity, detail, suggestion=""):
|
||||
r = {"id": id_, "name": name, "status": "problem",
|
||||
"severity": severity, "detail": detail}
|
||||
if suggestion:
|
||||
r["suggestion"] = suggestion
|
||||
return r
|
||||
|
||||
# ===================================================================
|
||||
# Services checks
|
||||
# ===================================================================
|
||||
|
||||
def check_services(data):
    """Compare every systemd unit the configuration implies with its live state.

    One expectation is built per unit, in this order: a dnsmasq service per
    VLAN, the blocklist timer, the NAT service, the health timer, the
    dashboard timer (only when the queue file exists), the DDNS timer,
    freeradius, avahi-daemon, chrony and systemd-networkd.

    Returns a list with one dict per unit holding the observed and expected
    ``active`` / ``enabled`` strings, the two comparison flags, the severity
    and an overall ``status`` of "ok" or "problem".
    """
    def expect(unit, active, enabled, severity):
        # id and name are always the same string for units.
        return {"id": unit, "name": unit,
                "expected_active": active, "expected_enabled": enabled,
                "severity": severity}

    expected = [
        expect(_vlan_service_name(vlan, derive_interface(vlan, data)),
               "active", "enabled", "error")
        for vlan in data.get("vlans", [])
    ]

    expected.append(expect(f"{BLIST_TIMER_NAME}.timer", "active", "enabled", "warning"))
    # The NAT unit is expected to be enabled yet report "inactive".
    expected.append(expect(NAT_SERVICE_NAME, "inactive", "enabled", "error"))
    expected.append(expect(f"{HEALTH_TIMER_NAME}.timer", "active", "enabled", "warning"))

    if DASHB_QUEUE_FILE.exists():
        expected.append(expect(f"{DASHB_TIMER_NAME}.timer", "active", "enabled", "error"))

    ddns_on = any(p.get("enabled") for p in data.get("ddns", {}).get("providers", []))
    expected.append(expect(f"{DDNS_TIMER_NAME}.timer",
                           "active" if ddns_on else "inactive",
                           "enabled" if ddns_on else "not-found",
                           "warning"))

    radius_on = _radius_enabled(data)
    expected.append(expect("freeradius",
                           "active" if radius_on else "inactive",
                           "enabled" if radius_on else "disabled",
                           "error"))

    avahi_on = _avahi_enabled(data)
    expected.append(expect("avahi-daemon",
                           "active" if avahi_on else "inactive",
                           "enabled" if avahi_on else "disabled",
                           "warning"))

    expected.append(expect("chrony", "active", "enabled", "warning"))
    expected.append(expect("systemd-networkd", "active", "enabled", "error"))

    report = []
    for item in expected:
        active, enabled = _sysctl_query(item["id"])
        want_active = item["expected_active"]
        want_enabled = item["expected_enabled"]
        active_ok = active == want_active
        enabled_ok = enabled == want_enabled
        report.append({
            "id": item["id"],
            "name": item["name"],
            "active": active,
            "enabled": enabled,
            "expected_active": want_active,
            "expected_enabled": want_enabled,
            "active_ok": active_ok,
            "enabled_ok": enabled_ok,
            "severity": item.get("severity", "error"),
            "status": "ok" if (active_ok and enabled_ok) else "problem",
        })
    return report
|
||||
|
||||
# ===================================================================
|
||||
# Configuration checks
|
||||
# ===================================================================
|
||||
|
||||
def _file_check(id_, name, path, severity="error", suggestion=""):
    """Return an ok result when *path* exists, otherwise a problem result.

    A PermissionError while probing is reported as a warning, because the
    real state cannot be determined without elevated privileges.
    """
    try:
        exists = path.exists()
    except PermissionError:
        return _problem(id_, name, "warning",
                        f"{path}: permission denied — run with sudo for accurate status.")
    if not exists:
        return _problem(id_, name, severity,
                        f"{path} does not exist.",
                        suggestion or "Run `sudo python3 core.py --apply` to create it.")
    return _ok(id_, name)


def _check_nft_tables():
    """Check that the routlin nat and filter nftables tables are loaded."""
    try:
        listing = subprocess.run(
            ["nft", "list", "tables"], capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError):
        listing = None
    # A non-zero exit (e.g. not running as root) yields empty output; without
    # this guard both tables would be reported as missing.
    if listing is None or listing.returncode != 0:
        return [_problem("nft_tables", "nftables tables", "error",
                         "Could not query nftables (nft not available or failed).")]

    results = []
    for tbl in ("ip routlin-nat", "ip routlin-filter"):
        id_ = f"nft_{tbl.replace(' ', '_')}"
        name = f"nftables table {tbl}"
        if tbl in listing.stdout:
            results.append(_ok(id_, name))
        else:
            results.append(_problem(
                id_, name, "error",
                f"nftables table '{tbl}' is missing.",
                "Run `sudo python3 core.py --apply` to rebuild firewall rules."))
    return results


def _check_docker_bridges():
    """Check that every bridge interface that is up appears in the forward chain."""
    try:
        bridges = [
            p.parent.name
            for p in Path("/sys/class/net").glob("*/bridge")
            if _iface_operstate(p.parent.name) == "up"
        ]
        if not bridges:
            return []
        chain = subprocess.run(
            ["nft", "list", "chain", "ip", "routlin-filter", "forward"],
            capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError):
        # Best effort: without nft or sysfs there is nothing to compare.
        return []

    if chain.returncode != 0:
        # Empty output from a failed query must not be read as "no rules".
        return [_problem(
            "nft_docker_bridges", "nftables Docker bridge rules", "warning",
            "Could not list the forward chain of table 'ip routlin-filter'; "
            "container bridge rules were not verified.",
            "Run `sudo python3 core.py --apply` to rebuild firewall rules.")]

    missing = [b for b in bridges if b not in chain.stdout]
    if missing:
        return [_problem(
            "nft_docker_bridges", "nftables Docker bridge rules", "warning",
            f"Container bridge(s) {', '.join(missing)} have no nftables forward rules.",
            "Run `sudo python3 core.py --apply` to add the missing rules.")]
    return [_ok("nft_docker_bridges", "nftables Docker bridge rules")]


def _check_interfaces(data, non_wg, wg_vlans):
    """Check the operstate of VLAN and WireGuard interfaces; flag stale wgN ones."""
    results = []

    # --- VLAN sub-interfaces ---
    for vlan in non_wg:
        iface = derive_interface(vlan, data)
        state = _iface_operstate(iface)
        id_ = f"iface_{vlan['name']}"
        name = f"interface {iface}"
        if state is None:
            results.append(_problem(
                id_, name, "error",
                f"Interface {iface} does not exist in /sys/class/net/.",
                "Run `sudo python3 core.py --apply` to configure network interfaces."))
        elif state != "up":
            results.append(_problem(
                id_, name, "error",
                f"Interface {iface} operstate is '{state}' (expected 'up').",
                "Check systemd-networkd: `sudo systemctl status systemd-networkd`"))
        else:
            results.append(_ok(id_, name))

    # --- WireGuard interfaces ---
    for vlan in wg_vlans:
        iface = derive_interface(vlan, data)
        state = _iface_operstate(iface)
        id_ = f"iface_wg_{vlan['name']}"
        name = f"WireGuard interface {iface}"
        if state is None:
            results.append(_problem(
                id_, name, "error",
                f"WireGuard interface {iface} does not exist.",
                "Run `sudo python3 core.py --apply` to bring up WireGuard."))
        elif state in ("up", "unknown"):  # WireGuard interfaces normally report 'unknown'
            results.append(_ok(id_, name))
        else:
            results.append(_problem(
                id_, name, "error",
                f"WireGuard interface {iface} operstate is '{state}'.",
                f"Try: sudo wg-quick up {iface}"))

    # --- Stale WG interfaces when no WG VLANs configured ---
    if not wg_vlans:
        try:
            stale_wg = sorted(
                p.name for p in Path("/sys/class/net").iterdir()
                if re.match(r"^wg\d+$", p.name)
            )
        except OSError:
            # sysfs not listable: nothing can be said about stale interfaces.
            stale_wg = []
        if stale_wg:
            results.append(_problem(
                "stale_wg_ifaces", "Stale WireGuard interfaces", "warning",
                f"WireGuard interface(s) {', '.join(stale_wg)} exist but no VPN VLANs are configured.",
                f"Bring them down manually: sudo wg-quick down {stale_wg[0]}"))
    return results


def _check_generated_files(data, vlans, non_wg, wg_vlans):
    """Check that the dnsmasq, networkd, systemd and WireGuard files exist."""
    results = []

    # --- dnsmasq config files ---
    for vlan in vlans:
        path = DNSMASQ_CONF_DIR / f"{vlan['name']}.conf"
        results.append(_file_check(f"dnsmasq_conf_{vlan['name']}",
                                   f"dnsmasq config {path.name}", path))

    # --- systemd-networkd files ---
    for vlan in non_wg:
        vid = derive_vlan_id(vlan.get("subnet", ""), vlan.get("subnet_mask", 24))
        net = NETWORKD_DIR / f"10-{PRODUCT_NAME}-{vlan['name']}.network"
        results.append(_file_check(f"networkd_net_{vlan['name']}",
                                   f"networkd {net.name}", net))
        if vid != 1:  # non-physical VLANs have a .netdev too
            netdev = NETWORKD_DIR / f"10-{PRODUCT_NAME}-{vlan['name']}.netdev"
            results.append(_file_check(f"networkd_netdev_{vlan['name']}",
                                       f"networkd {netdev.name}", netdev))

    # --- systemd unit files ---
    # NOTE(review): path.stem drops the suffix, so the blocklist .timer and
    # .service results share one id; left unchanged in case consumers of
    # .health key on it -- confirm before making the ids unique.
    for path in (SYSTEMD_DIR / f"{NAT_SERVICE_NAME}.service",
                 SYSTEMD_DIR / f"{BLIST_TIMER_NAME}.timer",
                 SYSTEMD_DIR / f"{BLIST_TIMER_NAME}.service"):
        results.append(_file_check(f"unit_{path.stem}", f"systemd unit {path.name}", path))

    # --- WireGuard config and key files ---
    for vlan in wg_vlans:
        iface = derive_interface(vlan, data)
        conf = WG_DIR / f"{iface}.conf"
        key = WG_DIR / f"{iface}.key"
        pub = SCRIPT_DIR / f".{iface}.pub"
        results.append(_file_check(f"wg_conf_{iface}", f"WireGuard {conf.name}", conf))
        results.append(_file_check(f"wg_key_{iface}", f"WireGuard {key.name}", key))
        results.append(_file_check(f"wg_pubkey_{iface}", f"WireGuard {pub.name}", pub))

    # --- Stale WG conf files when no WG VLANs ---
    if not wg_vlans:
        try:
            candidates = sorted(WG_DIR.glob("wg*.conf")) if WG_DIR.exists() else []
        except OSError:
            candidates = []
        stale = []
        for conf in candidates:
            try:
                generated = conf.read_text(errors="replace").startswith("# Generated by")
            except OSError:
                # Unreadable file: cannot tell whether it was generated; skip it
                # instead of aborting the whole health run.
                continue
            if generated:
                stale.append(conf)
        if stale:
            results.append(_problem(
                "stale_wg_conf", "Stale WireGuard config files", "warning",
                f"{', '.join(p.name for p in stale)} exist but no VPN VLANs are configured.",
                "Remove with: sudo rm " + " ".join(str(p) for p in stale)))
    return results


def _check_radius_files(data):
    """Check the FreeRADIUS files and that clients.conf carries the shared secret."""
    results = []
    if _radius_enabled(data):
        results.append(_file_check("radius_secret_file", ".radius-secret file",
                                   RADIUS_SECRET_FILE, "error"))
        results.append(_file_check("radius_clients_conf", "FreeRADIUS clients.conf",
                                   RADIUS_CLIENTS_CONF, "error"))
        results.append(_file_check("radius_users_file", "FreeRADIUS users",
                                   RADIUS_USERS_FILE, "error"))

        # Secret content match
        try:
            secret = RADIUS_SECRET_FILE.read_text().strip()
            conf_text = RADIUS_CLIENTS_CONF.read_text()
        except OSError:
            return results  # missing/unreadable files were reported just above

        secret_ok = any(
            line.strip().split("=", 1)[-1].strip() == secret
            for line in conf_text.splitlines()
            if "secret" in line and not line.strip().startswith("#")
        )
        if secret_ok:
            results.append(_ok("radius_secret_match", "FreeRADIUS shared secret"))
        else:
            results.append(_problem(
                "radius_secret_match", "FreeRADIUS shared secret", "error",
                "clients.conf secret does not match .radius-secret. "
                "Access points will reject all authentication requests.",
                "Restore .radius-secret from backup, or run `sudo python3 core.py --apply` "
                "then update the shared secret in your AP controller."))
        return results

    # RADIUS not enabled - warn if generated config files still exist
    try:
        orphaned = "# Generated by" in RADIUS_CLIENTS_CONF.read_text()
    except OSError:
        orphaned = False  # absent or unreadable: nothing to report
    if orphaned:
        results.append(_problem(
            "radius_conf_orphan", "FreeRADIUS config", "warning",
            "FreeRADIUS clients.conf contains routlin-generated content "
            "but RADIUS is not enabled.",
            "This is harmless if freeradius is stopped. "
            "Remove with: sudo rm " + str(RADIUS_CLIENTS_CONF)))
    return results


def _check_avahi_conf(data):
    """Check avahi-daemon.conf exists and lists exactly the expected interfaces."""
    if not _avahi_enabled(data):
        return []
    results = [_file_check("avahi_conf", "avahi-daemon.conf",
                           AVAHI_CONF_FILE, "warning")]
    try:
        text = AVAHI_CONF_FILE.read_text()
    except OSError:
        return results  # missing file already reported by the existence check

    # Anchor to the start of a line so a commented-out "#allow-interfaces=..."
    # line is not mistaken for the active directive.
    m = re.search(r"^[ \t]*allow-interfaces[ \t]*=[ \t]*(.+)$", text, re.MULTILINE)
    if not m:
        return results
    expected_ifaces = set(_avahi_interfaces(data))
    actual_ifaces = {i.strip() for i in m.group(1).split(",") if i.strip()}
    missing = expected_ifaces - actual_ifaces
    extra = actual_ifaces - expected_ifaces
    if missing or extra:
        results.append(_problem(
            "avahi_ifaces", "avahi-daemon interface list", "warning",
            f"avahi-daemon.conf interface list does not match config "
            f"(missing: {missing or 'none'}, extra: {extra or 'none'}).",
            "Run `sudo python3 core.py --apply` to update."))
    else:
        results.append(_ok("avahi_ifaces", "avahi-daemon interface list"))
    return results


def _check_resolv_conf(data):
    """Check that /etc/resolv.conf names at least one VLAN gateway as nameserver."""
    gateway_ips = _gateway_ips(data)
    try:
        resolv = RESOLV_CONF.read_text()
    except OSError:
        return [_problem("resolv_conf", "/etc/resolv.conf", "warning",
                         "/etc/resolv.conf is not readable.",
                         "Run `sudo python3 core.py --apply`.")]
    ns_ips = {
        line.split()[1]
        for line in resolv.splitlines()
        if line.startswith("nameserver") and len(line.split()) >= 2
    }
    if ns_ips & gateway_ips:
        return [_ok("resolv_conf", "/etc/resolv.conf")]
    return [_problem(
        "resolv_conf", "/etc/resolv.conf", "warning",
        f"/etc/resolv.conf nameserver(s) {ns_ips} do not include any VLAN gateway. "
        f"Expected one of: {gateway_ips}.",
        "Run `sudo python3 core.py --apply` to update /etc/resolv.conf.")]


def _check_chrony_conf(non_wg):
    """Check that chrony.conf has an allow directive for every non-WireGuard subnet."""
    try:
        content = CHRONY_CONF_FILE.read_text()
    except FileNotFoundError:
        return [_problem("chrony_conf", "/etc/chrony/chrony.conf", "warning",
                         "/etc/chrony/chrony.conf does not exist.",
                         "Install chrony: sudo apt-get install chrony")]
    except OSError:
        return [_problem("chrony_conf", "/etc/chrony/chrony.conf", "warning",
                         "/etc/chrony/chrony.conf is not readable.")]

    missing_subnets = []
    for vlan in non_wg:
        try:
            cidr = str(ipaddress.IPv4Network(
                f"{vlan['subnet']}/{vlan['subnet_mask']}", strict=False))
        except (KeyError, ValueError):
            continue  # VLAN without a usable subnet definition is skipped
        if f"allow {cidr}" not in content and f"allow {vlan['subnet']}" not in content:
            missing_subnets.append(cidr)

    if missing_subnets:
        return [_problem(
            "chrony_conf", "/etc/chrony/chrony.conf", "warning",
            f"chrony.conf is missing allow directives for: {', '.join(missing_subnets)}.",
            "Run `sudo python3 core.py --apply` to update chrony.conf.")]
    return [_ok("chrony_conf", "/etc/chrony/chrony.conf")]


def _check_dhcp_pools(non_wg):
    """Report DHCP pool utilisation per VLAN from the dnsmasq lease files."""
    results = []
    for vlan in non_wg:
        try:
            dhcp = vlan.get("dhcp_information", {})
            start = dhcp.get("pool_start", "")
            end = dhcp.get("pool_end", "")
            if not start or not end:
                continue
            pool_size = (int(ipaddress.IPv4Address(end))
                         - int(ipaddress.IPv4Address(start)) + 1)
            if pool_size <= 0:
                continue
            lease_file = LEASES_DIR / f"dnsmasq-{PRODUCT_NAME}-{vlan['name']}.leases"
            if not lease_file.exists():
                continue
            leases = [
                entry for entry in lease_file.read_text().splitlines()
                if entry.strip() and not entry.startswith("#")
            ]
            pct = len(leases) * 100 // pool_size
            if pct >= DHCP_WARN_PCT:
                results.append(_problem(
                    f"dhcp_pool_{vlan['name']}",
                    f"DHCP pool ({vlan['name']})", "warning",
                    f"DHCP pool for VLAN '{vlan['name']}' is {pct}% full "
                    f"({len(leases)}/{pool_size} leases).",
                    "Expand the pool range in core.json or clean up stale leases "
                    f"with: `sudo python3 core.py --reset-leases {vlan['name']}`"))
            else:
                results.append(_ok(f"dhcp_pool_{vlan['name']}",
                                   f"DHCP pool ({vlan['name']})",
                                   f"{pct}% used ({len(leases)}/{pool_size})"))
        except Exception:
            # Deliberately best effort: a malformed pool entry or unreadable
            # lease file for one VLAN must not stop the remaining checks.
            continue
    return results


def _check_blocklists(data, vlans):
    """Check that each merged blocklist file in use exists and is fresh."""
    if not data.get("dns_blocking", {}).get("blocklists", []):
        return []
    combos = {}
    for vlan in vlans:
        names = vlan.get("use_blocklists", [])
        if names:
            combos[_combo_hash(names)] = names

    results = []
    now = datetime.now(timezone.utc).timestamp()
    for h, names in combos.items():
        label = ", ".join(names)
        try:
            age = now - _merged_path(h).stat().st_mtime  # single stat call
        except OSError:
            age = None
        if age is None:
            results.append(_problem(
                f"blocklist_{h}", f"blocklist ({label})", "warning",
                f"Merged blocklist file for '{label}' does not exist.",
                "Run `sudo python3 dns-blocklists.py` to download blocklists."))
        elif age > BLOCKLIST_STALE_SECS:
            results.append(_problem(
                f"blocklist_{h}", f"blocklist ({label})", "warning",
                f"Merged blocklist for '{label}' is {int(age / 3600)}h old "
                f"(threshold {BLOCKLIST_STALE_SECS // 3600}h).",
                "Run `sudo python3 dns-blocklists.py` to refresh."))
        else:
            results.append(_ok(f"blocklist_{h}", f"blocklist ({label})"))
    return results


def _check_disk_space():
    """Report how full the root filesystem is."""
    try:
        usage = shutil.disk_usage("/")
        pct = usage.used * 100 // usage.total
    except (OSError, ZeroDivisionError):
        return []  # best effort: no result when usage cannot be determined
    if pct >= DISK_WARN_PCT:
        return [_problem(
            "disk_space", "Disk space", "warning",
            f"Root filesystem is {pct}% full "
            f"({usage.used // 1_073_741_824}G of {usage.total // 1_073_741_824}G used).",
            "Free up disk space to avoid service disruption.")]
    return [_ok("disk_space", "Disk space", f"{pct}% used")]


def _check_upstream_dns(data):
    """Probe every configured upstream DNS server with a TCP connect to port 53."""
    servers = data.get("upstream_dns", {}).get("upstream_servers", [])
    unreachable = []
    for srv in servers:
        try:
            with socket.create_connection((srv, 53), timeout=DNS_TIMEOUT_SECS):
                pass
        except OSError:
            unreachable.append(srv)
    if unreachable:
        return [_problem(
            "upstream_dns", "Upstream DNS reachability", "warning",
            f"Upstream DNS server(s) unreachable on port 53: {', '.join(unreachable)}.",
            "Check WAN connectivity and upstream DNS server addresses in core.json.")]
    if servers:
        return [_ok("upstream_dns", "Upstream DNS reachability")]
    return []


def check_configurations(data):
    """Run all configuration checks and return their results as a list of dicts.

    Checks run in this order: nftables tables, container bridge rules,
    VLAN / WireGuard interfaces, generated config files, RADIUS files, avahi
    config, resolv.conf, chrony.conf, DHCP pool utilisation, blocklist
    freshness, disk space and upstream DNS reachability. Each result is built
    with ``_ok`` or ``_problem``.

    Commands that fail and files that cannot be read are reported, or
    skipped where nothing can be concluded, rather than raising, so an
    unprivileged run still produces a report.
    """
    vlans = data.get("vlans", [])
    non_wg = [v for v in vlans if not is_wg(v)]
    wg_vlans = [v for v in vlans if is_wg(v)]

    results = []
    results += _check_nft_tables()
    results += _check_docker_bridges()
    results += _check_interfaces(data, non_wg, wg_vlans)
    results += _check_generated_files(data, vlans, non_wg, wg_vlans)
    results += _check_radius_files(data)
    results += _check_avahi_conf(data)
    results += _check_resolv_conf(data)
    results += _check_chrony_conf(non_wg)
    results += _check_dhcp_pools(non_wg)
    results += _check_blocklists(data, vlans)
    results += _check_disk_space()
    results += _check_upstream_dns(data)
    return results
|
||||
|
||||
# ===================================================================
|
||||
# Log checks
|
||||
# ===================================================================
|
||||
|
||||
def _check_radius_log():
    """Scan the tail of the FreeRADIUS log for recent shared-secret failures."""
    # Local import keeps this block self-contained; deque(maxlen=...) keeps
    # only the last lines instead of holding the whole log in memory.
    from collections import deque

    radius_log = Path("/var/log/freeradius/radius.log")
    try:
        with radius_log.open(errors="replace") as fh:
            tail = deque(fh, maxlen=2000)  # scan last 2000 lines
    except OSError:
        # Missing or unreadable log (e.g. unprivileged run): nothing to report.
        return []

    cutoff = datetime.now(timezone.utc).timestamp() - 3600
    # Lines start with timestamps like "Thu May 21 11:53:47 2026 : Info: ..."
    ts_re = re.compile(r"(\w+ \w+ +\d+ \d+:\d+:\d+ \d+) : ")
    recent = []
    for raw in tail:
        line = raw.rstrip("\n")
        m = ts_re.match(line)
        if not m:
            continue
        try:
            stamp = datetime.strptime(m.group(1), "%a %b %d %H:%M:%S %Y")
            # The stamp carries no zone. FreeRADIUS formats it with ctime(),
            # i.e. local time, so it is left naive: timestamp() then treats
            # it as local time rather than as UTC.
            is_recent = stamp.timestamp() >= cutoff
        except (ValueError, OverflowError, OSError):
            continue
        if is_recent:
            recent.append(line)

    results = []
    failures = [l for l in recent if "Shared secret is incorrect" in l]
    if failures:
        # Extract distinct AP names from "(from client ...)" pattern
        ap_re = re.compile(r"\(from client ([^)]+)\)")
        aps = sorted({m.group(1) for l in failures for m in ap_re.finditer(l)})
        ap_str = ", ".join(aps) if aps else f"{len(failures)} request(s)"
        results.append(_problem(
            "freeradius_auth_failures",
            "FreeRADIUS auth failures", "error",
            f"FreeRADIUS is rejecting requests from {ap_str} with "
            f"'Shared secret is incorrect' ({len(failures)} failures in the last hour).",
            "Restore .radius-secret from backup and run `sudo python3 core.py --apply`, "
            "or update the shared secret in your AP controller to match .radius-secret."))
    else:
        results.append(_ok("freeradius_auth_failures",
                           "FreeRADIUS auth failures"))

    # High rejection rate (>50% of recent activity is failures). The previous
    # condition also required an empty failure list, which made it unreachable.
    if recent:
        if len(failures) > len(recent) * 0.5:
            results.append(_problem(
                "freeradius_high_reject_rate",
                "FreeRADIUS rejection rate", "warning",
                f"Over half of recent FreeRADIUS activity ({len(failures)}/{len(recent)}) "
                f"are auth failures.",
                "Investigate FreeRADIUS config and shared secrets."))
        else:
            results.append(_ok("freeradius_high_reject_rate",
                               "FreeRADIUS rejection rate"))
    return results


def _check_dnsmasq_journal():
    """Report error-priority journal entries of the dnsmasq units in the last hour."""
    try:
        r = subprocess.run(
            ["journalctl", "-u", f"dnsmasq-{PRODUCT_NAME}-*",
             "--since", "-1h", "--priority=err", "--no-pager", "-q"],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # journalctl unavailable or too slow: best effort, no result.
        return []
    err_lines = [l for l in r.stdout.splitlines() if l.strip()]
    if err_lines:
        return [_problem(
            "dnsmasq_errors", "dnsmasq errors", "error",
            f"{len(err_lines)} dnsmasq error(s) in the last hour: "
            f"{err_lines[0][:120]}{'...' if len(err_lines) > 1 else ''}",
            "Check dnsmasq logs: `sudo journalctl -u 'dnsmasq-routlin-*' --since -1h`")]
    return [_ok("dnsmasq_errors", "dnsmasq errors")]


def check_logs(data):
    """Run the log-based checks and return their results as a list of dicts.

    Covers FreeRADIUS shared-secret failures and the share of such failures
    among the last hour's log lines, followed by dnsmasq errors from the
    journal. *data* is accepted for symmetry with the other check functions
    and is not used.
    """
    results = []
    results += _check_radius_log()
    results += _check_dnsmasq_journal()
    return results
|
||||
|
||||
# ===================================================================
|
||||
# Next blocklist update
|
||||
# ===================================================================
|
||||
|
||||
def _next_blocklist_update():
    """Return the next trigger time of the blocklist timer, or None.

    The value is the text after ``Trigger:`` in ``systemctl status`` output.
    Empty values and ``n/a`` are skipped, and any failure to run or parse
    systemctl yields None.
    """
    try:
        proc = subprocess.run(
            ["systemctl", "status", f"{BLIST_TIMER_NAME}.timer", "--no-pager"],
            capture_output=True, text=True, timeout=5
        )
        for raw in proc.stdout.splitlines():
            prefix, marker, value = raw.strip().partition("Trigger:")
            # Only lines that start with the marker qualify.
            if not marker or prefix:
                continue
            value = value.strip()
            if value not in ("", "n/a"):
                return value
    except Exception:
        pass
    return None
|
||||
|
||||
# ===================================================================
|
||||
# Public API
|
||||
# ===================================================================
|
||||
|
||||
def run_and_write(data):
    """Run every check, persist the result to ``.health`` and return it.

    The JSON is written to a temporary sibling file first and then renamed
    over ``HEALTH_FILE``, so readers never see a partially written file.
    ``checked_at`` is the local wall-clock time without a zone suffix.
    """
    status = {"checked_at": datetime.now().strftime("%Y-%m-%dT%H:%M:%S")}
    status["services"] = check_services(data)
    status["configurations"] = check_configurations(data)
    status["logs"] = check_logs(data)
    status["next_blocklist_update"] = _next_blocklist_update()

    staging = HEALTH_FILE.with_suffix(".tmp")
    staging.write_text(json.dumps(status, indent=2))
    staging.replace(HEALTH_FILE)
    return status
|
||||
|
||||
|
||||
def print_table(status):
    """Print the service status table and any problems to stdout.

    *status* is the dict returned by ``run_and_write``. Every entry of
    ``status["services"]`` becomes one table row. Below the table, one line
    is printed per problem found in the services, configurations and logs
    sections, each followed by its suggestion when there is one.

    The suggested fix for a service is derived from its *expected* state, so
    a unit that should be inactive or disabled is not described as needing
    to be activated or enabled.
    """
    col = shutil.get_terminal_size((80, 24)).columns

    services = status.get("services", [])
    print(f"\n {'UNIT':<45} {'ACTIVE':<18} {'ENABLED'}")
    print(f" {'-'*45} {'-'*18} {'-'*15}")
    for svc in services:
        active = svc.get("active", "unknown")
        enabled = svc.get("enabled", "unknown")
        # The symbol shows the raw state; (OK)/(BAD) shows whether that state
        # matches what the configuration expects.
        a_sym = "+" if active == "active" else "x"
        e_sym = "+" if enabled == "enabled" else "x"
        a_status = "(OK) " if svc.get("active_ok", True) else "(BAD)"
        e_status = "(OK) " if svc.get("enabled_ok", True) else "(BAD)"
        print(f" {svc['name']:<45} "
              f"{a_sym} {active:<10} {a_status} "
              f"{e_sym} {enabled:<10} {e_status}")

    trigger = status.get("next_blocklist_update")
    if trigger:
        print(f"\n Next blocklist update: {trigger}")

    svc_problems = []
    for svc in services:
        if svc.get("status") != "problem":
            continue
        name = svc["name"]
        utype = "timer" if name.endswith(".timer") else "service" if name.endswith(".service") else "unit"
        exp_parts, act_parts, fix_parts = [], [], []
        if not svc.get("active_ok"):
            want = svc.get("expected_active", "active")
            exp_parts.append(want)
            act_parts.append(svc.get("active", "unknown"))
            fix_parts.append("activate" if want == "active" else "deactivate")
        if not svc.get("enabled_ok"):
            want = svc.get("expected_enabled", "enabled")
            exp_parts.append(want)
            act_parts.append(svc.get("enabled", "unknown"))
            fix_parts.append("enable" if want == "enabled" else "disable")
        detail = (f"The {utype} `{name}` is expected to be "
                  f"{' and '.join(exp_parts)} but is {' and '.join(act_parts)}.")
        # reversed(): the enable/disable step is named before activation.
        suggestion = f"Run `sudo python3 core.py --apply` to {' and '.join(reversed(fix_parts))} it."
        svc_problems.append({"severity": svc.get("severity", "error"),
                             "detail": detail, "suggestion": suggestion})

    problems = svc_problems + [
        item
        for section in ("configurations", "logs")
        for item in status.get(section, [])
        if item.get("status") == "problem"
    ]
    if problems:
        print(f"\n Problems {'=' * (col - 12)}")
        for p in problems:
            sev = p.get("severity", "error")
            tag = f"[{sev}]"
            detail = p.get("detail", p.get("name", ""))
            print(f" {tag:<10} {detail}")
            tip = p.get("suggestion", "")
            if tip:
                print(f" {'':10} -> {tip}")
    print()
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Standalone entry point
|
||||
# ===================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Standalone run: load core.json, run every check, write .health and
    # print the table. Any failure to load the configuration ends the
    # process with exit status 1.
    try:
        data = json.loads(CONFIG_FILE.read_text())
    except Exception as ex:
        print(f"Error loading {CONFIG_FILE}: {ex}", file=sys.stderr)
        sys.exit(1)
    status = run_and_write(data)
    print_table(status)
|
||||
Loading…
Add table
Add a link
Reference in a new issue