linuxrouter/routlin/mod_metrics.py
2026-06-09 22:42:37 -04:00

284 lines
11 KiB
Python

"""
mod_metrics.py -- DNS metrics collection and display.
Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl,
and stores daily-aggregated totals in a SQLite database (.dns-metrics).
Each maintenance tick upserts into today's row, accumulating additive
counters and taking MAX for high-water marks. All-time totals are
derived with SUM/MAX across rows at read time.
"""
import os
import re
import signal
import sqlite3
import subprocess
import time
from datetime import date
import mod_shared as shared
import mod_validation as validation
DB_FILE = shared.SCRIPT_DIR / ".dns-metrics"
# ===================================================================
# Database
# ===================================================================
def open_db():
    """
    Open the metrics database and make sure its schema exists.

    Connects to DB_FILE (10 second busy timeout), switches the journal to
    WAL mode and creates the two tables if they are missing:

      daily_totals  -- one row per date with the aggregate counters and
                       high-water marks.
      daily_servers -- one row per (date, upstream server address).

    Returns the open sqlite3 connection; the caller is responsible for
    closing it.
    """
    schema = '''
        CREATE TABLE IF NOT EXISTS daily_totals (
            date TEXT PRIMARY KEY,
            last_updated INTEGER,
            queries_forwarded INTEGER NOT NULL DEFAULT 0,
            queries_answered_locally INTEGER NOT NULL DEFAULT 0,
            queries_authoritative INTEGER NOT NULL DEFAULT 0,
            cache_reused INTEGER NOT NULL DEFAULT 0,
            tcp_hwm INTEGER NOT NULL DEFAULT 0,
            tcp_max_allowed INTEGER NOT NULL DEFAULT 0,
            pool_memory_max INTEGER NOT NULL DEFAULT 0,
            dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0,
            dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0,
            dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0
        );
        CREATE TABLE IF NOT EXISTS daily_servers (
            date TEXT NOT NULL,
            address TEXT NOT NULL,
            queries_sent INTEGER NOT NULL DEFAULT 0,
            retried INTEGER NOT NULL DEFAULT 0,
            failed INTEGER NOT NULL DEFAULT 0,
            nxdomain INTEGER NOT NULL DEFAULT 0,
            avg_latency_ms INTEGER NOT NULL DEFAULT 0,
            PRIMARY KEY (date, address)
        );
    '''
    conn = sqlite3.connect(DB_FILE, timeout=10)
    conn.execute('PRAGMA journal_mode=WAL')
    conn.executescript(schema)
    conn.commit()
    return conn
# ===================================================================
# Collect
# ===================================================================
# Patterns for the statistics lines dnsmasq writes to the journal on SIGUSR1.
# Compiled once at import time instead of being looked up for every log line.
_RE_CACHE_REUSED = re.compile(
    r"cache size \d+, (\d+)/\d+ cache insertions re-used"
)
_RE_QUERIES = re.compile(
    r"queries forwarded (\d+), queries answered locally (\d+)"
)
_RE_AUTHORITATIVE = re.compile(r"queries for authoritative zones (\d+)")
_RE_TCP = re.compile(
    r"highest since last SIGUSR1 (\d+), max allowed (\d+)"
)
_RE_POOL_MEMORY = re.compile(r"pool memory in use \d+, max (\d+)")
_RE_SERVER = re.compile(
    r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), "
    r"nxdomain replies (\d+), avg\. latency (\d+)ms"
)


def _empty_metrics():
    """Return a metrics dict with every counter zeroed and no servers."""
    return {
        "queries_forwarded": 0,
        "queries_answered_locally": 0,
        "queries_authoritative": 0,
        "cache_reused": 0,
        "tcp_hwm": 0,
        "tcp_max_allowed": 0,
        "pool_memory_max": 0,
        "dnssec_subqueries_hwm": 0,
        "dnssec_crypto_hwm": 0,
        "dnssec_sig_fails_hwm": 0,
        "servers": [],
    }


def _signal_instances(vlans):
    """Send SIGUSR1 to each VLAN's dnsmasq; return True if any signal was sent."""
    signalled = False
    for vlan in vlans:
        pid_file = shared.vlan_pid_file(vlan)
        try:
            pid = int(pid_file.read_text().strip())
            os.kill(pid, signal.SIGUSR1)
        except (OSError, ValueError, OverflowError):
            # Missing/unreadable PID file, garbage contents, or a process that
            # is gone or not signalable: treat this instance as not running.
            # Anything else is a programming error and should surface.
            continue
        signalled = True
    return signalled


def _parse_stats_line(line, metrics, server_map):
    """
    Fold one journal line into the running totals.

    `metrics` and `server_map` are updated in place. Additive counters are
    summed, high-water marks keep the maximum seen so far, and per-server
    rows are keyed by the address text captured from the log line.
    """
    m = _RE_CACHE_REUSED.search(line)
    if m:
        metrics["cache_reused"] += int(m.group(1))

    m = _RE_QUERIES.search(line)
    if m:
        metrics["queries_forwarded"] += int(m.group(1))
        metrics["queries_answered_locally"] += int(m.group(2))

    m = _RE_AUTHORITATIVE.search(line)
    if m:
        metrics["queries_authoritative"] += int(m.group(1))

    m = _RE_TCP.search(line)
    if m:
        metrics["tcp_hwm"] = max(metrics["tcp_hwm"], int(m.group(1)))
        metrics["tcp_max_allowed"] = max(
            metrics["tcp_max_allowed"], int(m.group(2))
        )

    m = _RE_POOL_MEMORY.search(line)
    if m:
        metrics["pool_memory_max"] = max(
            metrics["pool_memory_max"], int(m.group(1))
        )

    m = _RE_SERVER.search(line)
    if m:
        addr = m.group(1)
        entry = server_map.setdefault(addr, {
            "address": addr, "queries_sent": 0, "retried": 0,
            "failed": 0, "nxdomain": 0, "avg_latency_ms": 0,
        })
        entry["queries_sent"] += int(m.group(2))
        entry["retried"] += int(m.group(3))
        entry["failed"] += int(m.group(4))
        entry["nxdomain"] += int(m.group(5))
        latency = int(m.group(6))
        # A zero latency never overwrites a previously seen non-zero value;
        # otherwise the most recently parsed non-zero reading wins.
        if latency > 0:
            entry["avg_latency_ms"] = latency


def collect_metrics(data):
    """
    Send SIGUSR1 to each running dnsmasq instance and parse stats from
    journalctl. Returns a combined metrics dict, or None if unavailable.

    `data["vlans"]` is iterated twice: once to signal the instance named by
    each VLAN's PID file, and once (after a 2 second pause for the log lines
    to land) to read each VLAN service's journal entries since the signal
    time. If no instance could be signalled, a message is printed and None
    is returned.

    The returned dict holds the summed query/cache counters, the maximum
    TCP and pool-memory figures across instances, and a "servers" list with
    one dict per upstream server address. The three dnssec_*_hwm fields are
    not parsed from the log here and are always 0.

    NOTE(review): counters are summed exactly as logged; whether dnsmasq
    resets them after each SIGUSR1 is not visible in this file -- confirm,
    because the values are later added to the stored daily totals.
    """
    metrics = _empty_metrics()

    # Taken before signalling so the journal query covers the stats dump.
    t_signal = int(time.time())

    if not _signal_instances(data["vlans"]):
        print("No dnsmasq instances are running.")
        return None

    time.sleep(2)

    server_map = {}
    for vlan in data["vlans"]:
        svc = shared.vlan_service_name(
            vlan, validation.derive_interface(vlan, data)
        )
        result = subprocess.run(
            ["journalctl", "-u", svc, f"--since=@{t_signal}",
             "--no-pager", "-o", "cat"],
            capture_output=True, text=True
        )
        for line in result.stdout.splitlines():
            _parse_stats_line(line, metrics, server_map)

    metrics["servers"] = list(server_map.values())
    return metrics
# ===================================================================
# Store
# ===================================================================
def update_metrics_db(new_metrics):
    """
    Merge one collection result into today's rows of the metrics database.

    `new_metrics` is the dict produced by collect_metrics(). For today's
    daily_totals row (created if missing):
      * query and cache counters are added to the stored values,
      * tcp_hwm, pool_memory_max and the dnssec_*_hwm fields keep the
        larger of the stored and incoming values,
      * tcp_max_allowed is replaced only when the incoming value is > 0,
      * last_updated is set to the current Unix time.

    For each entry in new_metrics["servers"], today's daily_servers row for
    that address has its counters added to, and avg_latency_ms replaced only
    when the incoming value is > 0.

    After committing, ownership of the database file is adjusted via
    shared.chown_to_script_dir_owner().
    """
    today = date.today().isoformat()
    con = open_db()
    try:
        con.execute('''
            INSERT INTO daily_totals(
                date, last_updated,
                queries_forwarded, queries_answered_locally, queries_authoritative,
                cache_reused, tcp_hwm, tcp_max_allowed, pool_memory_max,
                dnssec_subqueries_hwm, dnssec_crypto_hwm, dnssec_sig_fails_hwm
            ) VALUES (?,strftime('%s','now'),?,?,?,?,?,?,?,?,?,?)
            ON CONFLICT(date) DO UPDATE SET
                last_updated = strftime('%s','now'),
                queries_forwarded = queries_forwarded + excluded.queries_forwarded,
                queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally,
                queries_authoritative = queries_authoritative + excluded.queries_authoritative,
                cache_reused = cache_reused + excluded.cache_reused,
                tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm),
                tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0
                    THEN excluded.tcp_max_allowed ELSE tcp_max_allowed END,
                pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max),
                dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm),
                dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm),
                dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm)
        ''', (
            today,
            new_metrics["queries_forwarded"],
            new_metrics["queries_answered_locally"],
            new_metrics["queries_authoritative"],
            new_metrics["cache_reused"],
            new_metrics["tcp_hwm"],
            new_metrics["tcp_max_allowed"],
            new_metrics["pool_memory_max"],
            new_metrics["dnssec_subqueries_hwm"],
            new_metrics["dnssec_crypto_hwm"],
            new_metrics["dnssec_sig_fails_hwm"],
        ))

        # One upsert per upstream server, sent as a single batch.
        con.executemany('''
            INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms)
            VALUES (?,?,?,?,?,?,?)
            ON CONFLICT(date, address) DO UPDATE SET
                queries_sent = queries_sent + excluded.queries_sent,
                retried = retried + excluded.retried,
                failed = failed + excluded.failed,
                nxdomain = nxdomain + excluded.nxdomain,
                avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0
                    THEN excluded.avg_latency_ms
                    ELSE avg_latency_ms END
        ''', [
            (
                today, srv["address"],
                srv["queries_sent"], srv["retried"], srv["failed"],
                srv["nxdomain"], srv["avg_latency_ms"],
            )
            for srv in new_metrics["servers"]
        ])

        con.commit()
        shared.chown_to_script_dir_owner(DB_FILE)
    finally:
        # Always release the connection, even when a statement or the chown
        # fails; anything not yet committed is discarded on close.
        con.close()
# ===================================================================
# Display
# ===================================================================
def show_metrics(data):
    """
    Collect fresh stats, store them, then print the all-time report.

    Calls collect_metrics(data); if that returns None nothing is stored or
    printed by this function. Otherwise the result is merged into the
    database with update_metrics_db() and the report is built from
    aggregates over every stored day: SUM for additive counters, MAX for
    peaks, and for each upstream server the most recent non-zero average
    latency. Servers are listed in descending order of total queries sent.
    """
    fresh = collect_metrics(data)
    if fresh is None:
        return
    update_metrics_db(fresh)

    totals_query = '''
        SELECT
            MIN(date), MAX(date), COUNT(*),
            SUM(queries_forwarded), SUM(queries_answered_locally),
            SUM(queries_authoritative), SUM(cache_reused),
            MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max)
        FROM daily_totals
    '''
    servers_query = '''
        SELECT
            ds.address,
            SUM(ds.queries_sent),
            SUM(ds.retried),
            SUM(ds.failed),
            SUM(ds.nxdomain),
            (SELECT avg_latency_ms FROM daily_servers d2
             WHERE d2.address = ds.address AND d2.avg_latency_ms > 0
             ORDER BY d2.date DESC LIMIT 1)
        FROM daily_servers ds
        GROUP BY ds.address
        ORDER BY SUM(ds.queries_sent) DESC
    '''

    db = open_db()
    totals = db.execute(totals_query).fetchone()
    per_server = db.execute(servers_query).fetchall()
    db.close()

    (first_day, last_day, day_count,
     forwarded, answered_locally, authoritative, cache_reused,
     peak_tcp, max_tcp, pool_peak) = totals

    # Aggregates over an empty table come back as NULL/None, hence the
    # "or 0" / "or '-'" fallbacks. An empty entry prints a blank line.
    report = [
        "DNS Metrics (all-time totals across all VLAN instances)",
        f" First recorded : {first_day or '-'}",
        f" Last recorded : {last_day or '-'}",
        f" Days tracked : {day_count or 0}",
        "",
        "Queries",
        f" Forwarded to upstream : {(forwarded or 0):,}",
        f" Answered from cache : {(answered_locally or 0):,}",
        f" Authoritative : {(authoritative or 0):,}",
        f" Cache reused : {(cache_reused or 0):,}",
        "",
        "TCP",
        f" Peak concurrent (HWM) : {peak_tcp or 0}",
        f" Max allowed : {max_tcp or 0}",
        "",
        f"Pool memory peak : {pool_peak or 0} bytes",
    ]
    for text in report:
        print(text)

    if not per_server:
        return

    print()
    print("Upstream servers (all-time)")
    for address, sent, retried, failed, nxdomain, latency in per_server:
        if latency:
            latency_text = f" Latency : {latency}ms (last recorded)"
        else:
            latency_text = " Latency : -"
        print(f" {address}")
        print(f" Sent : {(sent or 0):,}")
        print(f" Retried : {(retried or 0):,}")
        print(f" Failed : {(failed or 0):,}")
        print(f" NXDOMAIN : {(nxdomain or 0):,}")
        print(latency_text)