Development

This commit is contained in:
Matthew Grotke 2026-06-09 21:28:38 -04:00
parent e9166d8a6a
commit 0983e14de4
7 changed files with 494 additions and 160 deletions

View file

@ -2,31 +2,71 @@
mod_metrics.py -- DNS metrics collection and display.
Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl,
and accumulates lifetime totals in a JSON file.
and stores daily-aggregated totals in a SQLite database (.dns-metrics2).
Each maintenance tick upserts into today's row, accumulating additive
counters and taking MAX for high-water marks. All-time totals are
derived with SUM/MAX across rows at read time.
"""
import json
import os
import re
import signal
import sqlite3
import subprocess
import time
from datetime import datetime
from datetime import date
import mod_shared as shared
import mod_validation as validation
METRICS_FILE = shared.SCRIPT_DIR / ".dns-metrics"
DB_FILE = shared.SCRIPT_DIR / ".dns-metrics2"
# ===================================================================
# Collect and store
# Database
# ===================================================================
def open_db():
    """
    Open the metrics SQLite database, creating the schema if needed.

    Connects to DB_FILE (timeout=10), switches the journal to WAL mode and
    makes sure both tables exist:

      daily_totals  -- one row per date (primary key: date)
      daily_servers -- one row per (date, address) pair

    Returns an open sqlite3.Connection. The caller is responsible for
    closing it.

    If the PRAGMA or the schema setup raises, the connection is closed
    before the exception propagates so the handle is not leaked.
    """
    con = sqlite3.connect(DB_FILE, timeout=10)
    try:
        con.execute('PRAGMA journal_mode=WAL')
        con.executescript('''
            CREATE TABLE IF NOT EXISTS daily_totals (
                date TEXT PRIMARY KEY,
                queries_forwarded INTEGER NOT NULL DEFAULT 0,
                queries_answered_locally INTEGER NOT NULL DEFAULT 0,
                queries_authoritative INTEGER NOT NULL DEFAULT 0,
                cache_reused INTEGER NOT NULL DEFAULT 0,
                tcp_hwm INTEGER NOT NULL DEFAULT 0,
                tcp_max_allowed INTEGER NOT NULL DEFAULT 0,
                pool_memory_max INTEGER NOT NULL DEFAULT 0,
                dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0
            );
            CREATE TABLE IF NOT EXISTS daily_servers (
                date TEXT NOT NULL,
                address TEXT NOT NULL,
                queries_sent INTEGER NOT NULL DEFAULT 0,
                retried INTEGER NOT NULL DEFAULT 0,
                failed INTEGER NOT NULL DEFAULT 0,
                nxdomain INTEGER NOT NULL DEFAULT 0,
                avg_latency_ms INTEGER NOT NULL DEFAULT 0,
                PRIMARY KEY (date, address)
            );
        ''')
        con.commit()
    except Exception:
        # Do not hand back (or leak) a half-initialised connection.
        con.close()
        raise
    return con
# ===================================================================
# Collect
# ===================================================================
def collect_metrics(data):
"""
Send SIGUSR1 to each running dnsmasq instance and parse stats from
journalctl. Returns a combined metrics dict, or None if unavailable.
journalctl. Returns a combined metrics dict, or None if unavailable.
"""
metrics = {
"queries_forwarded": 0,
@ -101,66 +141,79 @@ def collect_metrics(data):
"address": addr, "queries_sent": 0, "retried": 0,
"failed": 0, "nxdomain": 0, "avg_latency_ms": 0
}
server_map[addr]["queries_sent"] += int(m.group(2))
server_map[addr]["retried"] += int(m.group(3))
server_map[addr]["failed"] += int(m.group(4))
server_map[addr]["nxdomain"] += int(m.group(5))
server_map[addr]["avg_latency_ms"] = int(m.group(6))
server_map[addr]["queries_sent"] += int(m.group(2))
server_map[addr]["retried"] += int(m.group(3))
server_map[addr]["failed"] += int(m.group(4))
server_map[addr]["nxdomain"] += int(m.group(5))
if int(m.group(6)) > 0:
server_map[addr]["avg_latency_ms"] = int(m.group(6))
metrics["servers"] = list(server_map.values())
return metrics
def update_metrics_file(new_metrics):
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ===================================================================
# Store
# ===================================================================
if METRICS_FILE.exists():
with open(METRICS_FILE) as f:
stored = json.load(f)
else:
stored = {
"metadata": {"first_recorded": now_str, "last_recorded": now_str, "total_updates": 0},
"totals": {
"queries_forwarded": 0, "queries_answered_locally": 0,
"queries_authoritative": 0, "cache_reused": 0,
"tcp_hwm": 0, "tcp_max_allowed": 0, "pool_memory_max": 0,
"dnssec_subqueries_hwm": 0, "dnssec_crypto_hwm": 0,
"dnssec_sig_fails_hwm": 0, "servers": []
}
}
def update_metrics_db(new_metrics):
today = date.today().isoformat()
con = open_db()
t = stored["totals"]
t["queries_forwarded"] += new_metrics["queries_forwarded"]
t["queries_answered_locally"] += new_metrics["queries_answered_locally"]
t["queries_authoritative"] += new_metrics["queries_authoritative"]
t["cache_reused"] += new_metrics["cache_reused"]
t["tcp_hwm"] = max(t["tcp_hwm"], new_metrics["tcp_hwm"])
t["pool_memory_max"] = max(t["pool_memory_max"], new_metrics["pool_memory_max"])
t["dnssec_subqueries_hwm"] = max(t["dnssec_subqueries_hwm"], new_metrics["dnssec_subqueries_hwm"])
t["dnssec_crypto_hwm"] = max(t["dnssec_crypto_hwm"], new_metrics["dnssec_crypto_hwm"])
t["dnssec_sig_fails_hwm"] = max(t["dnssec_sig_fails_hwm"], new_metrics["dnssec_sig_fails_hwm"])
if new_metrics["tcp_max_allowed"]:
t["tcp_max_allowed"] = new_metrics["tcp_max_allowed"]
con.execute('''
INSERT INTO daily_totals(
date,
queries_forwarded, queries_answered_locally, queries_authoritative,
cache_reused, tcp_hwm, tcp_max_allowed, pool_memory_max,
dnssec_subqueries_hwm, dnssec_crypto_hwm, dnssec_sig_fails_hwm
) VALUES (?,?,?,?,?,?,?,?,?,?,?)
ON CONFLICT(date) DO UPDATE SET
queries_forwarded = queries_forwarded + excluded.queries_forwarded,
queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally,
queries_authoritative = queries_authoritative + excluded.queries_authoritative,
cache_reused = cache_reused + excluded.cache_reused,
tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm),
tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0
THEN excluded.tcp_max_allowed ELSE tcp_max_allowed END,
pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max),
dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm),
dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm),
dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm)
''', (
today,
new_metrics["queries_forwarded"],
new_metrics["queries_answered_locally"],
new_metrics["queries_authoritative"],
new_metrics["cache_reused"],
new_metrics["tcp_hwm"],
new_metrics["tcp_max_allowed"],
new_metrics["pool_memory_max"],
new_metrics["dnssec_subqueries_hwm"],
new_metrics["dnssec_crypto_hwm"],
new_metrics["dnssec_sig_fails_hwm"],
))
existing = {s["address"]: s for s in t["servers"]}
for srv in new_metrics["servers"]:
addr = srv["address"]
if addr in existing:
existing[addr]["queries_sent"] += srv["queries_sent"]
existing[addr]["retried"] += srv["retried"]
existing[addr]["failed"] += srv["failed"]
existing[addr]["nxdomain"] += srv["nxdomain"]
existing[addr]["avg_latency_ms"] = srv["avg_latency_ms"]
else:
existing[addr] = srv.copy()
t["servers"] = list(existing.values())
con.execute('''
INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms)
VALUES (?,?,?,?,?,?,?)
ON CONFLICT(date, address) DO UPDATE SET
queries_sent = queries_sent + excluded.queries_sent,
retried = retried + excluded.retried,
failed = failed + excluded.failed,
nxdomain = nxdomain + excluded.nxdomain,
avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0
THEN excluded.avg_latency_ms
ELSE avg_latency_ms END
''', (
today, srv["address"],
srv["queries_sent"], srv["retried"], srv["failed"],
srv["nxdomain"], srv["avg_latency_ms"],
))
stored["metadata"]["last_recorded"] = now_str
stored["metadata"]["total_updates"] += 1
with open(METRICS_FILE, "w") as f:
json.dump(stored, f, indent=2)
shared.chown_to_script_dir_owner(METRICS_FILE)
con.commit()
shared.chown_to_script_dir_owner(DB_FILE)
con.close()
# ===================================================================
@ -171,37 +224,58 @@ def show_metrics(data):
new = collect_metrics(data)
if new is None:
return
update_metrics_file(new)
update_metrics_db(new)
with open(METRICS_FILE) as f:
data_m = json.load(f)
con = open_db()
row = con.execute('''
SELECT
MIN(date), MAX(date), COUNT(*),
SUM(queries_forwarded), SUM(queries_answered_locally),
SUM(queries_authoritative), SUM(cache_reused),
MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max)
FROM daily_totals
''').fetchone()
servers = con.execute('''
SELECT
ds.address,
SUM(ds.queries_sent),
SUM(ds.retried),
SUM(ds.failed),
SUM(ds.nxdomain),
(SELECT avg_latency_ms FROM daily_servers d2
WHERE d2.address = ds.address AND d2.avg_latency_ms > 0
ORDER BY d2.date DESC LIMIT 1)
FROM daily_servers ds
GROUP BY ds.address
ORDER BY SUM(ds.queries_sent) DESC
''').fetchall()
con.close()
m = data_m["metadata"]
t = data_m["totals"]
first, last, days, fwd, local, auth, reused, tcp_hwm, tcp_max, pool = row
print("DNS Metrics (lifetime totals across all VLAN instances)")
print(f" First recorded : {m['first_recorded']}")
print(f" Last recorded : {m['last_recorded']}")
print(f" Total updates : {m['total_updates']}")
print("DNS Metrics (all-time totals across all VLAN instances)")
print(f" First recorded : {first or '-'}")
print(f" Last recorded : {last or '-'}")
print(f" Days tracked : {days or 0}")
print()
print("Queries")
print(f" Forwarded to upstream : {t['queries_forwarded']:,}")
print(f" Answered from cache : {t['queries_answered_locally']:,}")
print(f" Authoritative : {t['queries_authoritative']:,}")
print(f" Cache reused : {t['cache_reused']:,}")
print(f" Forwarded to upstream : {(fwd or 0):,}")
print(f" Answered from cache : {(local or 0):,}")
print(f" Authoritative : {(auth or 0):,}")
print(f" Cache reused : {(reused or 0):,}")
print()
print("TCP")
print(f" Peak concurrent (HWM) : {t['tcp_hwm']}")
print(f" Max allowed : {t['tcp_max_allowed']}")
print(f" Peak concurrent (HWM) : {tcp_hwm or 0}")
print(f" Max allowed : {tcp_max or 0}")
print()
print(f"Pool memory peak : {t['pool_memory_max']} bytes")
if t["servers"]:
print(f"Pool memory peak : {pool or 0} bytes")
if servers:
print()
print("Upstream servers")
for s in t["servers"]:
print(f" {s['address']}")
print(f" Sent : {s['queries_sent']:,}")
print(f" Retried : {s['retried']:,}")
print(f" Failed : {s['failed']:,}")
print(f" NXDOMAIN : {s['nxdomain']:,}")
print(f" Latency : {s['avg_latency_ms']}ms (last recorded)")
print("Upstream servers (all-time)")
for addr, sent, retried, failed, nxdomain, latency in servers:
print(f" {addr}")
print(f" Sent : {(sent or 0):,}")
print(f" Retried : {(retried or 0):,}")
print(f" Failed : {(failed or 0):,}")
print(f" NXDOMAIN : {(nxdomain or 0):,}")
print(f" Latency : {latency}ms (last recorded)" if latency else " Latency : -")