""" mod_metrics.py -- DNS metrics collection and display. Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl, and accumulates lifetime totals in a JSON file. """ import json import re import subprocess import time from datetime import datetime import mod_shared as shared import mod_validation as validation METRICS_FILE = shared.SCRIPT_DIR / ".dns-metrics" # =================================================================== # Collect and store # =================================================================== def collect_metrics(data): """ Send SIGUSR1 to each running dnsmasq instance and parse stats from journalctl. Returns a combined metrics dict, or None if unavailable. """ metrics = { "queries_forwarded": 0, "queries_answered_locally": 0, "queries_authoritative": 0, "cache_reused": 0, "tcp_hwm": 0, "tcp_max_allowed": 0, "pool_memory_max": 0, "dnssec_subqueries_hwm": 0, "dnssec_crypto_hwm": 0, "dnssec_sig_fails_hwm": 0, "servers": [] } any_running = False for vlan in data["vlans"]: svc = shared.vlan_service_name(vlan, validation.derive_interface(vlan, data)) result = subprocess.run( ["systemctl", "kill", "--signal=SIGUSR1", svc], capture_output=True, text=True ) if result.returncode != 0: continue any_running = True if not any_running: print("No dnsmasq instances are running.") return None time.sleep(1) server_map = {} for vlan in data["vlans"]: svc = shared.vlan_service_name(vlan, validation.derive_interface(vlan, data)) result = subprocess.run( ["journalctl", "-u", svc, "--since", "5 seconds ago", "--no-pager", "-o", "cat"], capture_output=True, text=True ) for line in result.stdout.splitlines(): m = re.search(r"cache size \d+, (\d+)/\d+ cache insertions re-used", line) if m: metrics["cache_reused"] += int(m.group(1)) m = re.search(r"queries forwarded (\d+), queries answered locally (\d+)", line) if m: metrics["queries_forwarded"] += int(m.group(1)) metrics["queries_answered_locally"] += int(m.group(2)) m = re.search(r"queries for authoritative zones (\d+)", line) if m: metrics["queries_authoritative"] += int(m.group(1)) m = re.search(r"highest since last SIGUSR1 (\d+), max allowed (\d+)", line) if m: metrics["tcp_hwm"] = max(metrics["tcp_hwm"], int(m.group(1))) metrics["tcp_max_allowed"] = max(metrics["tcp_max_allowed"], int(m.group(2))) m = re.search(r"pool memory in use \d+, max (\d+)", line) if m: metrics["pool_memory_max"] = max(metrics["pool_memory_max"], int(m.group(1))) m = re.search( r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), " r"nxdomain replies (\d+), avg\. latency (\d+)ms", line ) if m: addr = m.group(1) if addr not in server_map: server_map[addr] = { "address": addr, "queries_sent": 0, "retried": 0, "failed": 0, "nxdomain": 0, "avg_latency_ms": 0 } server_map[addr]["queries_sent"] += int(m.group(2)) server_map[addr]["retried"] += int(m.group(3)) server_map[addr]["failed"] += int(m.group(4)) server_map[addr]["nxdomain"] += int(m.group(5)) server_map[addr]["avg_latency_ms"] = int(m.group(6)) metrics["servers"] = list(server_map.values()) return metrics def update_metrics_file(new_metrics): now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") if METRICS_FILE.exists(): with open(METRICS_FILE) as f: stored = json.load(f) else: stored = { "metadata": {"first_recorded": now_str, "last_recorded": now_str, "total_updates": 0}, "totals": { "queries_forwarded": 0, "queries_answered_locally": 0, "queries_authoritative": 0, "cache_reused": 0, "tcp_hwm": 0, "tcp_max_allowed": 0, "pool_memory_max": 0, "dnssec_subqueries_hwm": 0, "dnssec_crypto_hwm": 0, "dnssec_sig_fails_hwm": 0, "servers": [] } } t = stored["totals"] t["queries_forwarded"] += new_metrics["queries_forwarded"] t["queries_answered_locally"] += new_metrics["queries_answered_locally"] t["queries_authoritative"] += new_metrics["queries_authoritative"] t["cache_reused"] += new_metrics["cache_reused"] t["tcp_hwm"] = max(t["tcp_hwm"], new_metrics["tcp_hwm"]) t["pool_memory_max"] = max(t["pool_memory_max"], new_metrics["pool_memory_max"]) t["dnssec_subqueries_hwm"] = max(t["dnssec_subqueries_hwm"], new_metrics["dnssec_subqueries_hwm"]) t["dnssec_crypto_hwm"] = max(t["dnssec_crypto_hwm"], new_metrics["dnssec_crypto_hwm"]) t["dnssec_sig_fails_hwm"] = max(t["dnssec_sig_fails_hwm"], new_metrics["dnssec_sig_fails_hwm"]) if new_metrics["tcp_max_allowed"]: t["tcp_max_allowed"] = new_metrics["tcp_max_allowed"] existing = {s["address"]: s for s in t["servers"]} for srv in new_metrics["servers"]: addr = srv["address"] if addr in existing: existing[addr]["queries_sent"] += srv["queries_sent"] existing[addr]["retried"] += srv["retried"] existing[addr]["failed"] += srv["failed"] existing[addr]["nxdomain"] += srv["nxdomain"] existing[addr]["avg_latency_ms"] = srv["avg_latency_ms"] else: existing[addr] = srv.copy() t["servers"] = list(existing.values()) stored["metadata"]["last_recorded"] = now_str stored["metadata"]["total_updates"] += 1 with open(METRICS_FILE, "w") as f: json.dump(stored, f, indent=2) shared.chown_to_script_dir_owner(METRICS_FILE) # =================================================================== # Display # =================================================================== def show_metrics(data): new = collect_metrics(data) if new is None: return update_metrics_file(new) with open(METRICS_FILE) as f: data_m = json.load(f) m = data_m["metadata"] t = data_m["totals"] print("DNS Metrics (lifetime totals across all VLAN instances)") print(f" First recorded : {m['first_recorded']}") print(f" Last recorded : {m['last_recorded']}") print(f" Total updates : {m['total_updates']}") print() print("Queries") print(f" Forwarded to upstream : {t['queries_forwarded']:,}") print(f" Answered from cache : {t['queries_answered_locally']:,}") print(f" Authoritative : {t['queries_authoritative']:,}") print(f" Cache reused : {t['cache_reused']:,}") print() print("TCP") print(f" Peak concurrent (HWM) : {t['tcp_hwm']}") print(f" Max allowed : {t['tcp_max_allowed']}") print() print(f"Pool memory peak : {t['pool_memory_max']} bytes") if t["servers"]: print() print("Upstream servers") for s in t["servers"]: print(f" {s['address']}") print(f" Sent : {s['queries_sent']:,}") print(f" Retried : {s['retried']:,}") print(f" Failed : {s['failed']:,}") print(f" NXDOMAIN : {s['nxdomain']:,}") print(f" Latency : {s['avg_latency_ms']}ms (last recorded)")