| your Linux construction kit
Source
← Back to Overview

Monitoring & Alerting — see everything, miss nothing.

If you can't see it, you can't fix it. Prometheus scrapes metrics from every node. Grafana visualizes them. Alertmanager tells you when something's wrong. All on ZFS — so your monitoring data is checksummed, compressed, and snapshotable.

The recipe

Step 1: Node exporter on every machine

# Already installed on kldload systems, just enable:
systemctl enable --now prometheus-node-exporter

# Verify — metrics available on port 9100
curl -s http://localhost:9100/metrics | head -5

Step 2: ZFS-specific metrics

# Create a script that exports ZFS metrics for Prometheus
cat > /usr/local/bin/zfs-metrics.sh <<'METRICS'
#!/bin/bash
# ZFS metrics for the node_exporter textfile collector.
# Writes to a temp file and renames it into place atomically, so
# node_exporter never scrapes a half-written metrics file.
set -euo pipefail

OUT="/var/lib/prometheus/node-exporter/zfs.prom"
mkdir -p "$(dirname "$OUT")"

# mktemp in the same directory so the final mv is an atomic rename
TMP="$(mktemp "${OUT}.XXXXXX")"
trap 'rm -f "$TMP"' EXIT

{
  # Pool health (1=ONLINE, 0=anything else: DEGRADED/FAULTED/UNAVAIL)
  echo "# HELP zfs_pool_healthy Pool health status"
  echo "# TYPE zfs_pool_healthy gauge"
  zpool list -H -o name,health | while read -r pool health; do
    if [[ "$health" == "ONLINE" ]]; then
      echo "zfs_pool_healthy{pool=\"$pool\"} 1"
    else
      echo "zfs_pool_healthy{pool=\"$pool\"} 0"
    fi
  done

  # ARC hit rate — awk does the division, so no bc dependency; the
  # guard avoids an error if the arcstats file isn't readable
  echo "# HELP zfs_arc_hit_ratio ARC cache hit ratio"
  echo "# TYPE zfs_arc_hit_ratio gauge"
  if [[ -r /proc/spl/kstat/zfs/arcstats ]]; then
    awk '$1 == "hits"   { hits = $3 }
         $1 == "misses" { misses = $3 }
         END { if (hits + misses > 0)
                 printf "zfs_arc_hit_ratio %.4f\n", hits / (hits + misses) }' \
      /proc/spl/kstat/zfs/arcstats
  fi

  # Snapshot count per dataset (snapshot names are dataset@snap)
  echo "# HELP zfs_snapshot_count Number of snapshots"
  echo "# TYPE zfs_snapshot_count gauge"
  zfs list -H -t snapshot -o name 2>/dev/null | awk -F@ '{print $1}' | sort | uniq -c | while read -r count ds; do
    echo "zfs_snapshot_count{dataset=\"$ds\"} $count"
  done

  # Compression ratio — strip the trailing 'x' ZFS prints ("1.58x" -> 1.58)
  echo "# HELP zfs_compressratio Dataset compression ratio"
  echo "# TYPE zfs_compressratio gauge"
  zfs get -H -o name,value compressratio -r rpool 2>/dev/null | while read -r ds ratio; do
    echo "zfs_compressratio{dataset=\"$ds\"} ${ratio%x}"
  done
} > "$TMP"

# mktemp creates mode 0600; node_exporter runs unprivileged and must read it
chmod 644 "$TMP"
mv "$TMP" "$OUT"
METRICS
chmod +x /usr/local/bin/zfs-metrics.sh

# Run every minute via cron
echo "* * * * * root /usr/local/bin/zfs-metrics.sh" > /etc/cron.d/zfs-metrics

# Configure node_exporter to read textfile metrics
# Add --collector.textfile.directory=/var/lib/prometheus/node-exporter

Step 3: Prometheus server

# Create ZFS dataset for metrics (compressed, retained)
kdir -o compression=zstd -o recordsize=128K /srv/prometheus

# Install Prometheus
kpkg install prometheus

# Configure scrape targets
# Configure scrape targets, load the alert rules from Step 5, and
# point Prometheus at Alertmanager — without rule_files/alerting the
# alerts defined later would never be evaluated or routed anywhere.
cat > /etc/prometheus/prometheus.yml <<'PROM'
global:
  scrape_interval: 15s

# Alert rules (created in Step 5)
rule_files:
  - /etc/prometheus/alerts.yml

# Where to send firing alerts (Alertmanager's default port)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

scrape_configs:
  - job_name: 'nodes'
    static_configs:
      - targets:
        - 'localhost:9100'
        - '10.78.0.2:9100'    # node-2 via WireGuard
        - '10.78.0.3:9100'    # node-3 via WireGuard
PROM

systemctl enable --now prometheus

Step 4: Grafana dashboards

# Install Grafana
kpkg install grafana

systemctl enable --now grafana-server

# Browse to http://your-ip:3000
# Default login: admin / admin — Grafana prompts you to change it on first login
# Add Prometheus as data source: http://localhost:9090

# Import ZFS dashboard — or build your own with these queries:
# Pool health:    zfs_pool_healthy
# ARC hit rate:   zfs_arc_hit_ratio
# Snapshot count: zfs_snapshot_count
# Compression:    zfs_compressratio
# Disk I/O:       node_disk_io_time_seconds_total
# Memory:         node_memory_MemAvailable_bytes

Step 5: Alerts that matter

# /etc/prometheus/alerts.yml
groups:
  - name: zfs
    rules:
      # Fires for any non-ONLINE state (DEGRADED, FAULTED, UNAVAIL, ...)
      - alert: ZFSPoolDegraded
        expr: zfs_pool_healthy == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.pool }} is not ONLINE — check zpool status"

      - alert: ZFSArcHitRateLow
        expr: zfs_arc_hit_ratio < 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "ARC hit rate below 80% — consider increasing zfs_arc_max"

      # Filter out pseudo-filesystems (tmpfs/overlay/squashfs) which
      # otherwise produce constant false positives
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Less than 10% disk space on {{ $labels.mountpoint }} ({{ $labels.instance }})"
Pool goes DEGRADED at 3 AM? You get a text. ARC hit rate drops? You get a warning before performance tanks. No more discovering problems when users complain.