Development
This commit is contained in:
parent
e9166d8a6a
commit
0983e14de4
7 changed files with 494 additions and 160 deletions
|
|
@ -2,31 +2,71 @@
|
|||
mod_metrics.py -- DNS metrics collection and display.
|
||||
|
||||
Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl,
|
||||
and accumulates lifetime totals in a JSON file.
|
||||
and stores daily-aggregated totals in a SQLite database (.dns-metrics2).
|
||||
|
||||
Each maintenance tick upserts into today's row, accumulating additive
|
||||
counters and taking MAX for high-water marks. All-time totals are
|
||||
derived with SUM/MAX across rows at read time.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from datetime import date
|
||||
|
||||
import mod_shared as shared
|
||||
import mod_validation as validation
|
||||
|
||||
METRICS_FILE = shared.SCRIPT_DIR / ".dns-metrics"
|
||||
DB_FILE = shared.SCRIPT_DIR / ".dns-metrics2"
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Collect and store
|
||||
# Database
|
||||
# ===================================================================
|
||||
|
||||
def open_db():
    """
    Open the metrics database, creating the schema on first use.

    Connects to DB_FILE with a 10 second busy timeout, switches the
    journal to WAL mode and makes sure the daily_totals and daily_servers
    tables exist.

    Returns:
        An open sqlite3.Connection. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: if the pragma or the schema script fails. The
        connection is closed before the error propagates so the handle
        is not leaked.
    """
    con = sqlite3.connect(DB_FILE, timeout=10)
    try:
        con.execute('PRAGMA journal_mode=WAL')
        # IF NOT EXISTS keeps this safe to run on every open.
        con.executescript('''
            CREATE TABLE IF NOT EXISTS daily_totals (
                date TEXT PRIMARY KEY,
                queries_forwarded INTEGER NOT NULL DEFAULT 0,
                queries_answered_locally INTEGER NOT NULL DEFAULT 0,
                queries_authoritative INTEGER NOT NULL DEFAULT 0,
                cache_reused INTEGER NOT NULL DEFAULT 0,
                tcp_hwm INTEGER NOT NULL DEFAULT 0,
                tcp_max_allowed INTEGER NOT NULL DEFAULT 0,
                pool_memory_max INTEGER NOT NULL DEFAULT 0,
                dnssec_subqueries_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_crypto_hwm INTEGER NOT NULL DEFAULT 0,
                dnssec_sig_fails_hwm INTEGER NOT NULL DEFAULT 0
            );
            CREATE TABLE IF NOT EXISTS daily_servers (
                date TEXT NOT NULL,
                address TEXT NOT NULL,
                queries_sent INTEGER NOT NULL DEFAULT 0,
                retried INTEGER NOT NULL DEFAULT 0,
                failed INTEGER NOT NULL DEFAULT 0,
                nxdomain INTEGER NOT NULL DEFAULT 0,
                avg_latency_ms INTEGER NOT NULL DEFAULT 0,
                PRIMARY KEY (date, address)
            );
        ''')
        con.commit()
    except Exception:
        # Do not leak the handle when setup fails (e.g. corrupt file).
        con.close()
        raise
    return con
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Collect
|
||||
# ===================================================================
|
||||
|
||||
def collect_metrics(data):
|
||||
"""
|
||||
Send SIGUSR1 to each running dnsmasq instance and parse stats from
|
||||
journalctl. Returns a combined metrics dict, or None if unavailable.
|
||||
journalctl. Returns a combined metrics dict, or None if unavailable.
|
||||
"""
|
||||
metrics = {
|
||||
"queries_forwarded": 0,
|
||||
|
|
@ -101,66 +141,79 @@ def collect_metrics(data):
|
|||
"address": addr, "queries_sent": 0, "retried": 0,
|
||||
"failed": 0, "nxdomain": 0, "avg_latency_ms": 0
|
||||
}
|
||||
server_map[addr]["queries_sent"] += int(m.group(2))
|
||||
server_map[addr]["retried"] += int(m.group(3))
|
||||
server_map[addr]["failed"] += int(m.group(4))
|
||||
server_map[addr]["nxdomain"] += int(m.group(5))
|
||||
server_map[addr]["avg_latency_ms"] = int(m.group(6))
|
||||
server_map[addr]["queries_sent"] += int(m.group(2))
|
||||
server_map[addr]["retried"] += int(m.group(3))
|
||||
server_map[addr]["failed"] += int(m.group(4))
|
||||
server_map[addr]["nxdomain"] += int(m.group(5))
|
||||
if int(m.group(6)) > 0:
|
||||
server_map[addr]["avg_latency_ms"] = int(m.group(6))
|
||||
|
||||
metrics["servers"] = list(server_map.values())
|
||||
return metrics
|
||||
|
||||
|
||||
def update_metrics_file(new_metrics):
|
||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
# ===================================================================
|
||||
# Store
|
||||
# ===================================================================
|
||||
|
||||
if METRICS_FILE.exists():
|
||||
with open(METRICS_FILE) as f:
|
||||
stored = json.load(f)
|
||||
else:
|
||||
stored = {
|
||||
"metadata": {"first_recorded": now_str, "last_recorded": now_str, "total_updates": 0},
|
||||
"totals": {
|
||||
"queries_forwarded": 0, "queries_answered_locally": 0,
|
||||
"queries_authoritative": 0, "cache_reused": 0,
|
||||
"tcp_hwm": 0, "tcp_max_allowed": 0, "pool_memory_max": 0,
|
||||
"dnssec_subqueries_hwm": 0, "dnssec_crypto_hwm": 0,
|
||||
"dnssec_sig_fails_hwm": 0, "servers": []
|
||||
}
|
||||
}
|
||||
def update_metrics_db(new_metrics):
    """
    Fold one collection pass into today's rows of the metrics database.

    Upserts a row keyed by today's ISO date into daily_totals and one row
    per (date, address) into daily_servers. Additive counters are summed
    with the stored value, high-water marks keep the MAX of stored and
    new, and tcp_max_allowed / avg_latency_ms are overwritten only when
    the new value is greater than zero.

    Args:
        new_metrics: dict with the scalar counter keys used below plus a
            "servers" list of per-upstream dicts (address, queries_sent,
            retried, failed, nxdomain, avg_latency_ms).

    Raises:
        sqlite3.Error: if any statement or the commit fails. Nothing is
        committed in that case and the connection is still closed.
    """
    today = date.today().isoformat()
    con = open_db()
    try:
        con.execute('''
            INSERT INTO daily_totals(
                date,
                queries_forwarded, queries_answered_locally, queries_authoritative,
                cache_reused, tcp_hwm, tcp_max_allowed, pool_memory_max,
                dnssec_subqueries_hwm, dnssec_crypto_hwm, dnssec_sig_fails_hwm
            ) VALUES (?,?,?,?,?,?,?,?,?,?,?)
            ON CONFLICT(date) DO UPDATE SET
                queries_forwarded = queries_forwarded + excluded.queries_forwarded,
                queries_answered_locally = queries_answered_locally + excluded.queries_answered_locally,
                queries_authoritative = queries_authoritative + excluded.queries_authoritative,
                cache_reused = cache_reused + excluded.cache_reused,
                tcp_hwm = MAX(tcp_hwm, excluded.tcp_hwm),
                tcp_max_allowed = CASE WHEN excluded.tcp_max_allowed > 0
                    THEN excluded.tcp_max_allowed ELSE tcp_max_allowed END,
                pool_memory_max = MAX(pool_memory_max, excluded.pool_memory_max),
                dnssec_subqueries_hwm = MAX(dnssec_subqueries_hwm, excluded.dnssec_subqueries_hwm),
                dnssec_crypto_hwm = MAX(dnssec_crypto_hwm, excluded.dnssec_crypto_hwm),
                dnssec_sig_fails_hwm = MAX(dnssec_sig_fails_hwm, excluded.dnssec_sig_fails_hwm)
        ''', (
            today,
            new_metrics["queries_forwarded"],
            new_metrics["queries_answered_locally"],
            new_metrics["queries_authoritative"],
            new_metrics["cache_reused"],
            new_metrics["tcp_hwm"],
            new_metrics["tcp_max_allowed"],
            new_metrics["pool_memory_max"],
            new_metrics["dnssec_subqueries_hwm"],
            new_metrics["dnssec_crypto_hwm"],
            new_metrics["dnssec_sig_fails_hwm"],
        ))

        for srv in new_metrics["servers"]:
            con.execute('''
                INSERT INTO daily_servers(date, address, queries_sent, retried, failed, nxdomain, avg_latency_ms)
                VALUES (?,?,?,?,?,?,?)
                ON CONFLICT(date, address) DO UPDATE SET
                    queries_sent = queries_sent + excluded.queries_sent,
                    retried = retried + excluded.retried,
                    failed = failed + excluded.failed,
                    nxdomain = nxdomain + excluded.nxdomain,
                    avg_latency_ms = CASE WHEN excluded.avg_latency_ms > 0
                        THEN excluded.avg_latency_ms
                        ELSE avg_latency_ms END
            ''', (
                today, srv["address"],
                srv["queries_sent"], srv["retried"], srv["failed"],
                srv["nxdomain"], srv["avg_latency_ms"],
            ))

        con.commit()
        # NOTE(review): presumably fixes file ownership after a privileged
        # run; the helper lives in mod_shared -- confirm.
        shared.chown_to_script_dir_owner(DB_FILE)
    finally:
        # Always release the handle, including when a statement fails.
        con.close()
|
||||
|
||||
|
||||
# ===================================================================
|
||||
|
|
@ -171,37 +224,58 @@ def show_metrics(data):
|
|||
new = collect_metrics(data)
|
||||
if new is None:
|
||||
return
|
||||
update_metrics_file(new)
|
||||
update_metrics_db(new)
|
||||
|
||||
with open(METRICS_FILE) as f:
|
||||
data_m = json.load(f)
|
||||
con = open_db()
|
||||
row = con.execute('''
|
||||
SELECT
|
||||
MIN(date), MAX(date), COUNT(*),
|
||||
SUM(queries_forwarded), SUM(queries_answered_locally),
|
||||
SUM(queries_authoritative), SUM(cache_reused),
|
||||
MAX(tcp_hwm), MAX(tcp_max_allowed), MAX(pool_memory_max)
|
||||
FROM daily_totals
|
||||
''').fetchone()
|
||||
servers = con.execute('''
|
||||
SELECT
|
||||
ds.address,
|
||||
SUM(ds.queries_sent),
|
||||
SUM(ds.retried),
|
||||
SUM(ds.failed),
|
||||
SUM(ds.nxdomain),
|
||||
(SELECT avg_latency_ms FROM daily_servers d2
|
||||
WHERE d2.address = ds.address AND d2.avg_latency_ms > 0
|
||||
ORDER BY d2.date DESC LIMIT 1)
|
||||
FROM daily_servers ds
|
||||
GROUP BY ds.address
|
||||
ORDER BY SUM(ds.queries_sent) DESC
|
||||
''').fetchall()
|
||||
con.close()
|
||||
|
||||
m = data_m["metadata"]
|
||||
t = data_m["totals"]
|
||||
first, last, days, fwd, local, auth, reused, tcp_hwm, tcp_max, pool = row
|
||||
|
||||
print("DNS Metrics (lifetime totals across all VLAN instances)")
|
||||
print(f" First recorded : {m['first_recorded']}")
|
||||
print(f" Last recorded : {m['last_recorded']}")
|
||||
print(f" Total updates : {m['total_updates']}")
|
||||
print("DNS Metrics (all-time totals across all VLAN instances)")
|
||||
print(f" First recorded : {first or '-'}")
|
||||
print(f" Last recorded : {last or '-'}")
|
||||
print(f" Days tracked : {days or 0}")
|
||||
print()
|
||||
print("Queries")
|
||||
print(f" Forwarded to upstream : {t['queries_forwarded']:,}")
|
||||
print(f" Answered from cache : {t['queries_answered_locally']:,}")
|
||||
print(f" Authoritative : {t['queries_authoritative']:,}")
|
||||
print(f" Cache reused : {t['cache_reused']:,}")
|
||||
print(f" Forwarded to upstream : {(fwd or 0):,}")
|
||||
print(f" Answered from cache : {(local or 0):,}")
|
||||
print(f" Authoritative : {(auth or 0):,}")
|
||||
print(f" Cache reused : {(reused or 0):,}")
|
||||
print()
|
||||
print("TCP")
|
||||
print(f" Peak concurrent (HWM) : {t['tcp_hwm']}")
|
||||
print(f" Max allowed : {t['tcp_max_allowed']}")
|
||||
print(f" Peak concurrent (HWM) : {tcp_hwm or 0}")
|
||||
print(f" Max allowed : {tcp_max or 0}")
|
||||
print()
|
||||
print(f"Pool memory peak : {t['pool_memory_max']} bytes")
|
||||
if t["servers"]:
|
||||
print(f"Pool memory peak : {pool or 0} bytes")
|
||||
if servers:
|
||||
print()
|
||||
print("Upstream servers")
|
||||
for s in t["servers"]:
|
||||
print(f" {s['address']}")
|
||||
print(f" Sent : {s['queries_sent']:,}")
|
||||
print(f" Retried : {s['retried']:,}")
|
||||
print(f" Failed : {s['failed']:,}")
|
||||
print(f" NXDOMAIN : {s['nxdomain']:,}")
|
||||
print(f" Latency : {s['avg_latency_ms']}ms (last recorded)")
|
||||
print("Upstream servers (all-time)")
|
||||
for addr, sent, retried, failed, nxdomain, latency in servers:
|
||||
print(f" {addr}")
|
||||
print(f" Sent : {(sent or 0):,}")
|
||||
print(f" Retried : {(retried or 0):,}")
|
||||
print(f" Failed : {(failed or 0):,}")
|
||||
print(f" NXDOMAIN : {(nxdomain or 0):,}")
|
||||
print(f" Latency : {latency}ms (last recorded)" if latency else " Latency : -")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue