"""
mod_metrics.py -- DNS metrics collection and display.

Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl,
and stores daily-aggregated totals in a SQLite database (.dns-metrics).
Each maintenance tick upserts into today's row, accumulating additive
counters and taking MAX for high-water marks. All-time totals are derived
with SUM/MAX across rows at read time.
"""

import os
import re
import signal
import sqlite3
import subprocess
import time
from contextlib import closing
from datetime import date

import mod_shared as shared
import mod_validation as validation

# NOTE(review): an earlier revision of the module docstring called this file
# ".dns-metrics2"; the path below is what the code actually uses -- confirm
# which name is intended before renaming either.
DB_FILE = shared.SCRIPT_DIR / ".dns-metrics"

# Patterns for the statistics lines read back from journalctl.  Compiled once
# at import time so the per-line parsing loop does not repeat the lookup.
_RE_CACHE = re.compile(r"cache size \d+, (\d+)/\d+ cache insertions re-used")
_RE_QUERIES = re.compile(
    r"queries forwarded (\d+), queries answered locally (\d+)"
)
_RE_AUTH = re.compile(r"queries for authoritative zones (\d+)")
_RE_TCP = re.compile(r"highest since last SIGUSR1 (\d+), max allowed (\d+)")
_RE_POOL = re.compile(r"pool memory in use \d+, max (\d+)")
_RE_SERVER = re.compile(
    r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), "
    r"nxdomain replies (\d+), avg\. latency (\d+)ms"
)


# ===================================================================
# Database
# ===================================================================

def open_db():
    """
    Open the metrics database, creating the schema on first use.

    Returns an open sqlite3.Connection in WAL journal mode.  The caller owns
    the connection and must close it.  If schema setup fails the connection
    is closed before the exception propagates.
    """
    con = sqlite3.connect(DB_FILE, timeout=10)
    try:
        con.execute('PRAGMA journal_mode=WAL')
        con.executescript('''
            CREATE TABLE IF NOT EXISTS daily_totals (
                date TEXT PRIMARY KEY,
                last_updated INTEGER,
                queries_forwarded INTEGER NOT NULL DEFAULT 0,
                queries_answered_locally INTEGER NOT NULL DEFAULT 0,
                queries_authoritative INTEGER NOT NULL DEFAULT 0,
                cache_reused INTEGER NOT NULL DEFAULT 0,
                tcp_hwm INTEGER NOT NULL DEFAULT 0,
                tcp_max_allowed INTEGER NOT NULL DEFAULT 0,
                pool_memory_max INTEGER NOT NULL DEFAULT 0,
                dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0
            );
            CREATE TABLE IF NOT EXISTS daily_servers (
                date TEXT NOT NULL,
                address TEXT NOT NULL,
                queries_sent INTEGER NOT NULL DEFAULT 0,
                retried INTEGER NOT NULL DEFAULT 0,
                failed INTEGER NOT NULL DEFAULT 0,
                nxdomain INTEGER NOT NULL DEFAULT 0,
                avg_latency_ms INTEGER NOT NULL DEFAULT 0,
                PRIMARY KEY (date, address)
            );
        ''')
        con.commit()
    except Exception:
        # Do not leak the handle when the pragma or schema creation fails.
        con.close()
        raise
    return con


# ===================================================================
# Collect
# ===================================================================

def _signal_instances(vlans):
    """Send SIGUSR1 to every VLAN's dnsmasq; return True if any was signalled."""
    any_running = False
    for vlan in vlans:
        pid_file = shared.vlan_pid_file(vlan)
        try:
            pid = int(pid_file.read_text().strip())
            if pid <= 0:
                # kill() treats 0 and negative pids as "process group" or
                # "every process"; a corrupt pid file must never reach that.
                continue
            os.kill(pid, signal.SIGUSR1)
        except (OSError, ValueError, OverflowError):
            # Missing/unreadable pid file, non-numeric content, or a process
            # that is gone or not ours: treat the instance as not running.
            continue
        any_running = True
    return any_running


def _parse_stats_line(line, metrics, server_map):
    """Fold one journal line into *metrics* and *server_map* (both mutated)."""
    m = _RE_CACHE.search(line)
    if m:
        metrics["cache_reused"] += int(m.group(1))

    m = _RE_QUERIES.search(line)
    if m:
        metrics["queries_forwarded"] += int(m.group(1))
        metrics["queries_answered_locally"] += int(m.group(2))

    m = _RE_AUTH.search(line)
    if m:
        metrics["queries_authoritative"] += int(m.group(1))

    m = _RE_TCP.search(line)
    if m:
        metrics["tcp_hwm"] = max(metrics["tcp_hwm"], int(m.group(1)))
        metrics["tcp_max_allowed"] = max(
            metrics["tcp_max_allowed"], int(m.group(2))
        )

    m = _RE_POOL.search(line)
    if m:
        metrics["pool_memory_max"] = max(
            metrics["pool_memory_max"], int(m.group(1))
        )

    m = _RE_SERVER.search(line)
    if m:
        addr = m.group(1)
        entry = server_map.setdefault(addr, {
            "address": addr,
            "queries_sent": 0,
            "retried": 0,
            "failed": 0,
            "nxdomain": 0,
            "avg_latency_ms": 0,
        })
        entry["queries_sent"] += int(m.group(2))
        entry["retried"] += int(m.group(3))
        entry["failed"] += int(m.group(4))
        entry["nxdomain"] += int(m.group(5))
        latency = int(m.group(6))
        if latency > 0:
            # Latency is not additive: keep the most recent non-zero reading.
            entry["avg_latency_ms"] = latency


def collect_metrics(data):
    """
    Send SIGUSR1 to each running dnsmasq instance and parse stats from
    journalctl.

    data must provide data["vlans"], an iterable of VLAN entries accepted by
    the mod_shared / mod_validation helpers.

    Returns a combined metrics dict (counters summed and high-water marks
    maxed across all VLAN instances, plus a "servers" list of per-upstream
    dicts), or None if no instance could be signalled.

    NOTE: the dnssec_* fields are part of the returned dict but no pattern
    in this module fills them, so they are always 0.
    """
    metrics = {
        "queries_forwarded": 0,
        "queries_answered_locally": 0,
        "queries_authoritative": 0,
        "cache_reused": 0,
        "tcp_hwm": 0,
        "tcp_max_allowed": 0,
        "pool_memory_max": 0,
        "dnssec_subqueries_hwm": 0,
        "dnssec_crypto_hwm": 0,
        "dnssec_sig_fails_hwm": 0,
        "servers": [],
    }

    # Taken before signalling so the journal query below only sees the
    # output produced in response to this SIGUSR1.
    t_signal = int(time.time())

    if not _signal_instances(data["vlans"]):
        print("No dnsmasq instances are running.")
        return None

    # Give the instances time to write their statistics to the journal.
    time.sleep(2)

    server_map = {}
    for vlan in data["vlans"]:
        svc = shared.vlan_service_name(
            vlan, validation.derive_interface(vlan, data)
        )
        result = subprocess.run(
            ["journalctl", "-u", svc, f"--since=@{t_signal}",
             "--no-pager", "-o", "cat"],
            capture_output=True,
            text=True,
            # Journal text may contain bytes that are not valid in the
            # locale encoding; never let that abort collection.
            errors="replace",
        )
        for line in result.stdout.splitlines():
            _parse_stats_line(line, metrics, server_map)

    metrics["servers"] = list(server_map.values())
    return metrics


# ===================================================================
# Store
# ===================================================================

def update_metrics_db(new_metrics):
    """
    Merge one collection result into today's rows.

    new_metrics is a dict shaped like the return value of collect_metrics().
    Additive counters are summed into the existing row, high-water marks use
    MAX, and tcp_max_allowed / avg_latency_ms take the newest non-zero value.
    All upserts run in one transaction; the connection is closed even when a
    statement fails.
    """
    today = date.today().isoformat()
    with closing(open_db()) as con:
        with con:  # commit on success, roll back on error
            con.execute('''
                INSERT INTO daily_totals(
                    date, last_updated,
                    queries_forwarded, queries_answered_locally,
                    queries_authoritative, cache_reused,
                    tcp_hwm, tcp_max_allowed, pool_memory_max,
                    dnssec_subqueries_hwm, dnssec_crypto_hwm,
                    dnssec_sig_fails_hwm
                ) VALUES (?,strftime('%s','now'),?,?,?,?,?,?,?,?,?,?)
                ON CONFLICT(date) DO UPDATE SET
                    last_updated = strftime('%s','now'),
                    queries_forwarded = queries_forwarded + excluded.queries_forwarded,
                    queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally,
                    queries_authoritative = queries_authoritative + excluded.queries_authoritative,
                    cache_reused = cache_reused + excluded.cache_reused,
                    tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm),
                    tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0
                        THEN excluded.tcp_max_allowed
                        ELSE tcp_max_allowed END,
                    pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max),
                    dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm),
                    dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm),
                    dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm)
            ''', (
                today,
                new_metrics["queries_forwarded"],
                new_metrics["queries_answered_locally"],
                new_metrics["queries_authoritative"],
                new_metrics["cache_reused"],
                new_metrics["tcp_hwm"],
                new_metrics["tcp_max_allowed"],
                new_metrics["pool_memory_max"],
                new_metrics["dnssec_subqueries_hwm"],
                new_metrics["dnssec_crypto_hwm"],
                new_metrics["dnssec_sig_fails_hwm"],
            ))

            for srv in new_metrics["servers"]:
                con.execute('''
                    INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms)
                    VALUES (?,?,?,?,?,?,?)
                    ON CONFLICT(date, address) DO UPDATE SET
                        queries_sent = queries_sent + excluded.queries_sent,
                        retried = retried + excluded.retried,
                        failed = failed + excluded.failed,
                        nxdomain = nxdomain + excluded.nxdomain,
                        avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0
                            THEN excluded.avg_latency_ms
                            ELSE avg_latency_ms END
                ''', (
                    today,
                    srv["address"],
                    srv["queries_sent"],
                    srv["retried"],
                    srv["failed"],
                    srv["nxdomain"],
                    srv["avg_latency_ms"],
                ))

        # Same ordering as before: fix ownership after the commit, while the
        # connection is still open.
        shared.chown_to_script_dir_owner(DB_FILE)


# ===================================================================
# Display
# ===================================================================

def show_metrics(data):
    """
    Collect fresh metrics, store them, and print all-time totals.

    Prints nothing beyond collect_metrics()'s own message when no dnsmasq
    instance is running.  Totals are SUM/MAX over every stored day; each
    server's latency is its most recent non-zero daily value.
    """
    new = collect_metrics(data)
    if new is None:
        return
    update_metrics_db(new)

    with closing(open_db()) as con:
        row = con.execute('''
            SELECT MIN(date), MAX(date), COUNT(*),
                   SUM(queries_forwarded), SUM(queries_answered_locally),
                   SUM(queries_authoritative), SUM(cache_reused),
                   MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max)
            FROM daily_totals
        ''').fetchone()
        servers = con.execute('''
            SELECT ds.address,
                   SUM(ds.queries_sent), SUM(ds.retried),
                   SUM(ds.failed), SUM(ds.nxdomain),
                   (SELECT avg_latency_ms FROM daily_servers d2
                    WHERE d2.address = ds.address AND d2.avg_latency_ms > 0
                    ORDER BY d2.date DESC LIMIT 1)
            FROM daily_servers ds
            GROUP BY ds.address
            ORDER BY SUM(ds.queries_sent) DESC
        ''').fetchall()

    first, last, days, fwd, local, auth, reused, tcp_hwm, tcp_max, pool = row

    # Aggregates over an empty table come back as NULL/None, hence the
    # "or 0" / "or '-'" fallbacks below.
    print("DNS Metrics (all-time totals across all VLAN instances)")
    print(f" First recorded : {first or '-'}")
    print(f" Last recorded : {last or '-'}")
    print(f" Days tracked : {days or 0}")
    print()
    print("Queries")
    print(f" Forwarded to upstream : {(fwd or 0):,}")
    print(f" Answered from cache : {(local or 0):,}")
    print(f" Authoritative : {(auth or 0):,}")
    print(f" Cache reused : {(reused or 0):,}")
    print()
    print("TCP")
    print(f" Peak concurrent (HWM) : {tcp_hwm or 0}")
    print(f" Max allowed : {tcp_max or 0}")
    print()
    print(f"Pool memory peak : {pool or 0} bytes")

    if servers:
        print()
        print("Upstream servers (all-time)")
        for addr, sent, retried, failed, nxdomain, latency in servers:
            print(f" {addr}")
            print(f" Sent : {(sent or 0):,}")
            print(f" Retried : {(retried or 0):,}")
            print(f" Failed : {(failed or 0):,}")
            print(f" NXDOMAIN : {(nxdomain or 0):,}")
            print(f" Latency : {latency}ms (last recorded)" if latency else " Latency : -")