207 lines
7.8 KiB
Python
207 lines
7.8 KiB
Python
"""
|
|
mod_metrics.py -- DNS metrics collection and display.
|
|
|
|
Sends SIGUSR1 to running dnsmasq instances, parses stats from journalctl,
|
|
and accumulates lifetime totals in a JSON file.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import signal
|
|
import subprocess
|
|
import time
|
|
from datetime import datetime
|
|
|
|
import mod_shared as shared
|
|
import mod_validation as validation
|
|
|
|
# Location of the JSON file in which update_metrics_file() accumulates the
# lifetime totals; it sits directly inside shared.SCRIPT_DIR.
METRICS_FILE = shared.SCRIPT_DIR / ".dns-metrics"
|
|
|
|
|
|
# ===================================================================
|
|
# Collect and store
|
|
# ===================================================================
|
|
|
|
# Patterns for the statistics lines searched for in the journal output.
# Compiled once at import time rather than once per log line.
_CACHE_RE = re.compile(r"cache size \d+, (\d+)/\d+ cache insertions re-used")
_QUERIES_RE = re.compile(r"queries forwarded (\d+), queries answered locally (\d+)")
_AUTH_RE = re.compile(r"queries for authoritative zones (\d+)")
_TCP_RE = re.compile(r"highest since last SIGUSR1 (\d+), max allowed (\d+)")
_POOL_RE = re.compile(r"pool memory in use \d+, max (\d+)")
_SERVER_RE = re.compile(
    r"server (\S+): queries sent (\d+), retried (\d+), failed (\d+), "
    r"nxdomain replies (\d+), avg\. latency (\d+)ms"
)


def _signal_instance(pid_file):
    """Send SIGUSR1 to the PID stored in *pid_file*; return True on success.

    Returns False when the pid file cannot be read, does not contain a
    positive integer, or the signal cannot be delivered.
    """
    try:
        pid = int(pid_file.read_text().strip())
        if pid <= 0:
            # kill() treats pid 0 and negative pids as "a process group" or
            # "every process", which would hit this script itself.
            return False
        os.kill(pid, signal.SIGUSR1)
    except (OSError, ValueError, OverflowError):
        # OSError: missing/unreadable pid file, stale PID, no permission.
        # ValueError: pid file content is not an integer.
        # OverflowError: integer too large to be a PID.
        return False
    return True


def _parse_stats_line(line, metrics, server_map):
    """Fold one journal line into *metrics* and *server_map* (both mutated).

    Every pattern is tried against every line; a line that matches none of
    them is ignored.
    """
    m = _CACHE_RE.search(line)
    if m:
        metrics["cache_reused"] += int(m.group(1))

    m = _QUERIES_RE.search(line)
    if m:
        metrics["queries_forwarded"] += int(m.group(1))
        metrics["queries_answered_locally"] += int(m.group(2))

    m = _AUTH_RE.search(line)
    if m:
        metrics["queries_authoritative"] += int(m.group(1))

    m = _TCP_RE.search(line)
    if m:
        # High-water marks are combined with max(), not summed.
        metrics["tcp_hwm"] = max(metrics["tcp_hwm"], int(m.group(1)))
        metrics["tcp_max_allowed"] = max(metrics["tcp_max_allowed"], int(m.group(2)))

    m = _POOL_RE.search(line)
    if m:
        metrics["pool_memory_max"] = max(metrics["pool_memory_max"], int(m.group(1)))

    m = _SERVER_RE.search(line)
    if m:
        addr = m.group(1)
        entry = server_map.setdefault(addr, {
            "address": addr, "queries_sent": 0, "retried": 0,
            "failed": 0, "nxdomain": 0, "avg_latency_ms": 0
        })
        entry["queries_sent"] += int(m.group(2))
        entry["retried"] += int(m.group(3))
        entry["failed"] += int(m.group(4))
        entry["nxdomain"] += int(m.group(5))
        # Latency is not summed: the most recently parsed value wins.
        entry["avg_latency_ms"] = int(m.group(6))


def collect_metrics(data):
    """
    Send SIGUSR1 to each running dnsmasq instance and parse stats from
    journalctl.

    Args:
        data: configuration mapping; data["vlans"] is iterated once to signal
            the instances and once more to read each instance's journal.

    Returns:
        A combined metrics dict (counters summed and high-water marks maxed
        across all VLANs, plus a "servers" list with one entry per upstream
        address), or None when no instance could be signalled.

    Note:
        The dnssec_* fields are part of the returned dict but nothing in this
        function parses them, so they are always 0 here.
    """
    metrics = {
        "queries_forwarded": 0,
        "queries_answered_locally": 0,
        "queries_authoritative": 0,
        "cache_reused": 0,
        "tcp_hwm": 0,
        "tcp_max_allowed": 0,
        "pool_memory_max": 0,
        "dnssec_subqueries_hwm": 0,
        "dnssec_crypto_hwm": 0,
        "dnssec_sig_fails_hwm": 0,
        "servers": []
    }

    # Signal every instance; do not stop at the first success.
    any_running = False
    for vlan in data["vlans"]:
        if _signal_instance(shared.vlan_pid_file(vlan)):
            any_running = True

    if not any_running:
        print("No dnsmasq instances are running.")
        return None

    # Presumably gives the instances time to write their statistics to the
    # journal before it is queried below.
    time.sleep(1)

    server_map = {}
    for vlan in data["vlans"]:
        svc = shared.vlan_service_name(vlan, validation.derive_interface(vlan, data))
        result = subprocess.run(
            ["journalctl", "-u", svc, "--since", "5 seconds ago",
             "--no-pager", "-o", "cat"],
            capture_output=True, text=True, check=False
        )
        for line in result.stdout.splitlines():
            _parse_stats_line(line, metrics, server_map)

    metrics["servers"] = list(server_map.values())
    return metrics
|
|
|
|
|
|
def _fresh_store(now_str):
    """Return the initial contents of the metrics file, stamped with *now_str*."""
    return {
        "metadata": {"first_recorded": now_str, "last_recorded": now_str, "total_updates": 0},
        "totals": {
            "queries_forwarded": 0, "queries_answered_locally": 0,
            "queries_authoritative": 0, "cache_reused": 0,
            "tcp_hwm": 0, "tcp_max_allowed": 0, "pool_memory_max": 0,
            "dnssec_subqueries_hwm": 0, "dnssec_crypto_hwm": 0,
            "dnssec_sig_fails_hwm": 0, "servers": []
        }
    }


def update_metrics_file(new_metrics):
    """Merge one collected sample into the totals stored in METRICS_FILE.

    Counters are added to the stored values, high-water marks keep the larger
    of stored and new, "tcp_max_allowed" is overwritten when the sample
    reports a non-zero value, and per-server counters are merged by address
    (latency is replaced, not accumulated). The file is created on first use
    and its ownership is adjusted via shared.chown_to_script_dir_owner().

    Args:
        new_metrics: a metrics dict with the keys produced by
            collect_metrics().

    Raises:
        json.JSONDecodeError: if an existing METRICS_FILE is not valid JSON.
        OSError: if the file cannot be read or written.
    """
    # NOTE(review): every sample is *added* to the stored totals. If dnsmasq
    # reports counters that are cumulative since start rather than reset on
    # each SIGUSR1, the totals will over-count -- confirm against dnsmasq.
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    stored = _fresh_store(now_str)
    if METRICS_FILE.exists():
        with open(METRICS_FILE) as f:
            loaded = json.load(f)
        # Fill in anything an existing file lacks so a file that does not
        # contain every current field does not fail with KeyError below.
        for section, defaults in stored.items():
            target = loaded.setdefault(section, {})
            for key, value in defaults.items():
                target.setdefault(key, value)
        stored = loaded

    t = stored["totals"]

    for key in ("queries_forwarded", "queries_answered_locally",
                "queries_authoritative", "cache_reused"):
        t[key] += new_metrics[key]

    for key in ("tcp_hwm", "pool_memory_max", "dnssec_subqueries_hwm",
                "dnssec_crypto_hwm", "dnssec_sig_fails_hwm"):
        t[key] = max(t[key], new_metrics[key])

    # A zero means the sample carried no value; keep the stored one then.
    if new_metrics["tcp_max_allowed"]:
        t["tcp_max_allowed"] = new_metrics["tcp_max_allowed"]

    existing = {s["address"]: s for s in t["servers"]}
    for srv in new_metrics["servers"]:
        addr = srv["address"]
        if addr in existing:
            entry = existing[addr]
            for field in ("queries_sent", "retried", "failed", "nxdomain"):
                entry[field] = entry.get(field, 0) + srv[field]
            entry["avg_latency_ms"] = srv["avg_latency_ms"]
        else:
            existing[addr] = srv.copy()
    t["servers"] = list(existing.values())

    stored["metadata"]["last_recorded"] = now_str
    stored["metadata"]["total_updates"] += 1

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a truncated METRICS_FILE behind.
    tmp_path = os.fspath(METRICS_FILE) + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(stored, f, indent=2)
    os.replace(tmp_path, METRICS_FILE)
    shared.chown_to_script_dir_owner(METRICS_FILE)
|
|
|
|
|
|
# ===================================================================
|
|
# Display
|
|
# ===================================================================
|
|
|
|
def show_metrics(data):
    """Collect a new sample, persist it, and print the stored lifetime totals.

    Returns without printing a report when collect_metrics() yields None.
    Otherwise the sample is merged through update_metrics_file() and the
    report is printed from what is then read back out of METRICS_FILE.
    """
    sample = collect_metrics(data)
    if sample is None:
        return
    update_metrics_file(sample)

    # Report from the file itself, not from the in-memory sample.
    with open(METRICS_FILE) as fh:
        report = json.load(fh)
    meta, totals = report["metadata"], report["totals"]

    print("DNS Metrics (lifetime totals across all VLAN instances)")
    print(f" First recorded : {meta['first_recorded']}")
    print(f" Last recorded : {meta['last_recorded']}")
    print(f" Total updates : {meta['total_updates']}")
    print()

    print("Queries")
    print(f" Forwarded to upstream : {totals['queries_forwarded']:,}")
    print(f" Answered from cache : {totals['queries_answered_locally']:,}")
    print(f" Authoritative : {totals['queries_authoritative']:,}")
    print(f" Cache reused : {totals['cache_reused']:,}")
    print()

    print("TCP")
    print(f" Peak concurrent (HWM) : {totals['tcp_hwm']}")
    print(f" Max allowed : {totals['tcp_max_allowed']}")
    print()

    print(f"Pool memory peak : {totals['pool_memory_max']} bytes")

    # The upstream section is omitted entirely when no server was recorded.
    servers = totals["servers"]
    if not servers:
        return

    print()
    print("Upstream servers")
    for srv in servers:
        print(f" {srv['address']}")
        print(f" Sent : {srv['queries_sent']:,}")
        print(f" Retried : {srv['retried']:,}")
        print(f" Failed : {srv['failed']:,}")
        print(f" NXDOMAIN : {srv['nxdomain']:,}")
        print(f" Latency : {srv['avg_latency_ms']}ms (last recorded)")
|