Build Your Own
← Back to Overview
Monitoring & Alerting — see everything, miss nothing.
If you can't see it, you can't fix it. Prometheus scrapes metrics from every node. Grafana visualizes them. Alertmanager tells you when something's wrong. All on ZFS — so your monitoring data is checksummed, compressed, and snapshottable.
The recipe
Step 1: Node exporter on every machine
# Start the node exporter now and enable it at boot.
# NOTE(review): "kldload systems" reads like a distro-specific name — confirm;
# the package itself may already be installed, so enabling is all that's needed:
systemctl enable --now prometheus-node-exporter
# Verify — node_exporter serves Prometheus-format metrics on port 9100
curl -s http://localhost:9100/metrics | head -5
Step 2: ZFS-specific metrics
# Create a script that exports ZFS metrics for Prometheus
cat > /usr/local/bin/zfs-metrics.sh <<'METRICS'
#!/bin/bash
# ZFS metrics for the node_exporter textfile collector.
#
# Writes to a temp file and renames it into place: rename is atomic on the
# same filesystem, so node_exporter never scrapes a half-written file.
set -u

OUT="/var/lib/prometheus/node-exporter/zfs.prom"
mkdir -p "$(dirname "$OUT")"

# Temp file lives next to $OUT so mv stays a same-filesystem rename.
TMP="$(mktemp "${OUT}.XXXXXX")" || exit 1
trap 'rm -f -- "$TMP"' EXIT

{
  # Pool health (1=ONLINE, 0=DEGRADED/FAULTED/etc.)
  echo "# HELP zfs_pool_healthy Pool health status"
  echo "# TYPE zfs_pool_healthy gauge"
  zpool list -H -o name,health | while read -r pool health; do
    if [[ "$health" == "ONLINE" ]]; then
      echo "zfs_pool_healthy{pool=\"$pool\"} 1"
    else
      echo "zfs_pool_healthy{pool=\"$pool\"} 0"
    fi
  done

  # ARC cache hit ratio — skip cleanly if the kstat file is absent
  # (e.g. ZFS module not loaded) or the counters are not yet numeric.
  echo "# HELP zfs_arc_hit_ratio ARC cache hit ratio"
  echo "# TYPE zfs_arc_hit_ratio gauge"
  if [[ -r /proc/spl/kstat/zfs/arcstats ]]; then
    HITS=$(awk '/^hits/ {print $3}' /proc/spl/kstat/zfs/arcstats)
    MISSES=$(awk '/^misses/ {print $3}' /proc/spl/kstat/zfs/arcstats)
    if [[ "$HITS" =~ ^[0-9]+$ && "$MISSES" =~ ^[0-9]+$ ]] && (( HITS + MISSES > 0 )); then
      echo "zfs_arc_hit_ratio $(echo "scale=4; $HITS / ($HITS + $MISSES)" | bc)"
    fi
  fi

  # Snapshot count per dataset (dataset name is everything before the '@')
  echo "# HELP zfs_snapshot_count Number of snapshots"
  echo "# TYPE zfs_snapshot_count gauge"
  zfs list -H -t snapshot -o name 2>/dev/null | awk -F@ '{print $1}' | sort | uniq -c |
    while read -r count ds; do
      echo "zfs_snapshot_count{dataset=\"$ds\"} $count"
    done

  # Compression ratio — zfs reports "1.45x"; strip the trailing 'x'
  echo "# HELP zfs_compressratio Dataset compression ratio"
  echo "# TYPE zfs_compressratio gauge"
  zfs get -H -o name,value compressratio -r rpool 2>/dev/null | while read -r ds ratio; do
    echo "zfs_compressratio{dataset=\"$ds\"} ${ratio%x}"
  done
} > "$TMP"

# Atomic publish; the EXIT trap is then a harmless no-op.
mv -f -- "$TMP" "$OUT"
METRICS
chmod +x /usr/local/bin/zfs-metrics.sh
# Run every minute via cron
echo "* * * * * root /usr/local/bin/zfs-metrics.sh" > /etc/cron.d/zfs-metrics
# Configure node_exporter to read textfile metrics
# Add --collector.textfile.directory=/var/lib/prometheus/node-exporter
Step 3: Prometheus server
# Create ZFS dataset for metrics (compressed, retained)
# NOTE(review): 'kdir' appears to be a distro-specific wrapper that creates a
# ZFS dataset mounted at the given path — confirm; on stock OpenZFS the
# equivalent would be:
#   zfs create -o compression=zstd -o recordsize=128K rpool/srv/prometheus
kdir -o compression=zstd -o recordsize=128K /srv/prometheus
# Install Prometheus
kpkg install prometheus
# Configure scrape targets — and load the alert rules file from Step 5,
# without which Prometheus never evaluates the alerts we define there.
cat > /etc/prometheus/prometheus.yml <<'PROM'
global:
  scrape_interval: 15s

rule_files:
  - /etc/prometheus/alerts.yml

scrape_configs:
  - job_name: 'nodes'
    static_configs:
      - targets:
          - 'localhost:9100'
          - '10.78.0.2:9100'   # node-2 via WireGuard
          - '10.78.0.3:9100'   # node-3 via WireGuard
PROM
systemctl enable --now prometheus
Step 4: Grafana dashboards
# Install Grafana and start it now + at boot
kpkg install grafana
systemctl enable --now grafana-server
# Browse to http://your-ip:3000
# Default login: admin / admin (Grafana prompts for a new password on first login)
# Add Prometheus as data source: http://localhost:9090
# Import a ZFS dashboard — or build your own with these queries:
#   Pool health:    zfs_pool_healthy
#   ARC hit rate:   zfs_arc_hit_ratio
#   Snapshot count: zfs_snapshot_count
#   Compression:    zfs_compressratio
#   Disk I/O:       node_disk_io_time_seconds_total
#   Memory:         node_memory_MemAvailable_bytes
Step 5: Alerts that matter
# /etc/prometheus/alerts.yml — referenced from prometheus.yml via rule_files
groups:
  - name: zfs
    rules:
      - alert: ZFSPoolDegraded
        expr: zfs_pool_healthy == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.pool }} is DEGRADED"

      - alert: ZFSArcHitRateLow
        expr: zfs_arc_hit_ratio < 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "ARC hit rate below 80% — consider increasing zfs_arc_max"

      - alert: DiskSpaceLow
        # NOTE(review): consider adding {fstype!~"tmpfs|overlay"} to both
        # metrics to avoid false positives from virtual filesystems.
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Less than 10% disk space remaining"
Pool goes DEGRADED at 3 AM? You get a text. ARC hit rate drops? You get a warning before performance tanks. No more discovering problems when users complain.