Prometheus监控主机,Grafana成图
全部使用官方 GitHub 源的部署方案,下载链接来自官方,无需镜像。
官方下载地址汇总
| 组件 | 官方下载地址 |
| --- | --- |
| Node Exporter | https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz |
| Prometheus | https://github.com/prometheus/prometheus/releases/download/v3.5.0/prometheus-3.5.0.linux-amd64.tar.gz |
| AlertManager | https://github.com/prometheus/alertmanager/releases/download/v0.28.0/alertmanager-0.28.0.linux-amd64.tar.gz |
| Grafana | https://dl.grafana.com/oss/release/grafana-11.6.0.linux-amd64.tar.gz |
一、部署 Node Exporter(官方源)
bash
# Install Node Exporter 1.8.2 from the official GitHub release into
# /data/node_exporter.

# Create the working directory together with its logs/ subdirectory.
sudo mkdir -p /data/node_exporter/logs

# Download and unpack the official release archive.
cd /tmp
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
tar xvf node_exporter-1.8.2.linux-amd64.tar.gz
sudo cp node_exporter-1.8.2.linux-amd64/node_exporter /data/node_exporter/

# Verify that the installed binary runs.
/data/node_exporter/node_exporter --version
启动脚本
bash
# Write /data/node_exporter/start.sh. The heredoc delimiter is quoted ('EOF')
# so that $!, $(...) and similar are stored literally in the script instead of
# being expanded while the file is being written.
sudo tee /data/node_exporter/start.sh <<'EOF'
#!/bin/bash
# Launch Node Exporter in the background on port 9100 with the default
# collector set disabled and only the collectors listed below enabled.
# Output is appended to logs/node_exporter.log; the PID goes to node_exporter.pid.
cd /data/node_exporter || exit 1
nohup ./node_exporter \
  --web.listen-address="0.0.0.0:9100" \
  --collector.disable-defaults \
  --collector.cpu \
  --collector.meminfo \
  --collector.filesystem \
  --collector.diskstats \
  --collector.netdev \
  --collector.loadavg \
  --collector.time \
  --collector.uname \
  > logs/node_exporter.log 2>&1 &
echo "$!" > node_exporter.pid
echo "Node Exporter started with PID: $(cat node_exporter.pid)"
EOF
# Write /data/node_exporter/stop.sh (quoted heredoc delimiter keeps $(...)
# literal in the generated script).
sudo tee /data/node_exporter/stop.sh <<'EOF'
#!/bin/bash
# Stop Node Exporter: use the recorded PID when the pid file exists,
# otherwise fall back to matching the process by its command line.
if [ -f /data/node_exporter/node_exporter.pid ]; then
  kill "$(cat /data/node_exporter/node_exporter.pid)"
  rm -f /data/node_exporter/node_exporter.pid
  echo "Node Exporter stopped"
else
  pkill -f "node_exporter.*9100"
fi
EOF

# Make both helper scripts executable and start the service.
sudo chmod +x /data/node_exporter/*.sh
sudo /data/node_exporter/start.sh

# Verify: give the process a moment to bind, then fetch the first metrics lines.
sleep 2
curl http://localhost:9100/metrics | head -5
二、部署 Prometheus(官方源)
bash
# Install Prometheus 3.5.0 from the official GitHub release into /data/prometheus.

# Create the working directory with data/, rules/ and logs/ subdirectories.
sudo mkdir -p /data/prometheus/{data,rules,logs}

# Download and unpack the official release archive.
cd /tmp
wget https://github.com/prometheus/prometheus/releases/download/v3.5.0/prometheus-3.5.0.linux-amd64.tar.gz
tar xvf prometheus-3.5.0.linux-amd64.tar.gz
sudo cp prometheus-3.5.0.linux-amd64/prometheus /data/prometheus/
sudo cp prometheus-3.5.0.linux-amd64/promtool /data/prometheus/

# NOTE(review): Prometheus 3.x release archives appear to no longer bundle the
# example console files, so copy these directories only when they exist
# instead of failing with "No such file or directory".
for dir in consoles console_libraries; do
  if [ -d "prometheus-3.5.0.linux-amd64/${dir}" ]; then
    sudo cp -r "prometheus-3.5.0.linux-amd64/${dir}" /data/prometheus/
  fi
done

# Make sure the server binary is executable.
sudo chmod +x /data/prometheus/prometheus
配置文件
bash
# Write the Prometheus configuration: 15s global scrape/evaluation interval,
# no Alertmanager targets or rule files yet, and two scrape jobs
# (Prometheus itself on :9090 and Node Exporter on :9100).
sudo tee /data/prometheus/prometheus.yml <<'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'vm-monitor'

alerting:
  alertmanagers:
    - static_configs:
        - targets: []

rule_files: []

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          service: 'prometheus'

  - job_name: 'node_exporter'
    scrape_interval: 30s
    static_configs:
      - targets:
          - 'localhost:9100'
        labels:
          environment: 'production'
EOF
启动脚本
bash
# Write /data/prometheus/start.sh. The heredoc delimiter is quoted ('EOF') so
# that $!, $(...) and awk's $1 are stored literally in the generated script.
sudo tee /data/prometheus/start.sh <<'EOF'
#!/bin/bash
# Launch Prometheus in the background with 30 days of TSDB retention and the
# lifecycle HTTP endpoints enabled (needed for POST /-/reload).
# Output goes to logs/prometheus.log; the PID goes to prometheus.pid.
cd /data/prometheus || exit 1
nohup ./prometheus \
  --config.file=/data/prometheus/prometheus.yml \
  --storage.tsdb.path=/data/prometheus/data \
  --storage.tsdb.retention.time=30d \
  --web.enable-lifecycle \
  > logs/prometheus.log 2>&1 &
echo "$!" > prometheus.pid
echo "Prometheus started with PID: $(cat prometheus.pid)"
echo "Web UI: http://$(hostname -I | awk '{print $1}'):9090"
EOF
# Write /data/prometheus/stop.sh (quoted heredoc delimiter keeps $(...)
# literal in the generated script).
sudo tee /data/prometheus/stop.sh <<'EOF'
#!/bin/bash
# Stop Prometheus: use the recorded PID when the pid file exists,
# otherwise fall back to matching the process by its command line.
if [ -f /data/prometheus/prometheus.pid ]; then
  kill "$(cat /data/prometheus/prometheus.pid)"
  rm -f /data/prometheus/prometheus.pid
  echo "Prometheus stopped"
else
  pkill -f "prometheus.*config.file"
fi
EOF

# Make both helper scripts executable and start the service.
sudo chmod +x /data/prometheus/*.sh
sudo /data/prometheus/start.sh

# Verify: wait briefly for startup, then query the health endpoint.
sleep 3
curl http://localhost:9090/-/healthy
三、部署 Grafana(官方源)
bash
# Install Grafana OSS 11.6.0 from the official download site into /data/grafana.

# Create the working directory with data/, logs/ and plugins/ subdirectories.
sudo mkdir -p /data/grafana/{data,logs,plugins}

# Download the official release archive.
cd /tmp
wget https://dl.grafana.com/oss/release/grafana-11.6.0.linux-amd64.tar.gz

# Unpack straight into /data/grafana, dropping the archive's top-level
# directory. NOTE(review): that directory's name is believed to differ between
# releases (grafana-<version> vs grafana-v<version>); stripping it avoids
# depending on either spelling.
sudo tar xzf grafana-11.6.0.linux-amd64.tar.gz -C /data/grafana --strip-components=1
配置文件
# Write Grafana's custom configuration: data/log/plugin paths under
# /data/grafana, HTTP port 3000, login form enabled, self sign-up disabled.
sudo tee /data/grafana/conf/custom.ini <<'EOF'
[paths]
data = /data/grafana/data
logs = /data/grafana/logs
plugins = /data/grafana/plugins

[server]
http_port = 3000

[auth]
disable_login_form = false

[users]
allow_sign_up = false
EOF
启动脚本
bash
# Write /data/grafana/start.sh. The heredoc delimiter is quoted ('EOF') so
# that $!, $(...) and awk's $1 are stored literally in the generated script.
sudo tee /data/grafana/start.sh <<'EOF'
#!/bin/bash
# Launch Grafana in the background using conf/custom.ini.
# Output goes to logs/grafana.log; the PID goes to grafana.pid.
cd /data/grafana || exit 1
nohup ./bin/grafana-server \
  --config=/data/grafana/conf/custom.ini \
  --homepath=/data/grafana \
  > logs/grafana.log 2>&1 &
echo "$!" > grafana.pid
echo "Grafana started with PID: $(cat grafana.pid)"
echo "Grafana UI: http://$(hostname -I | awk '{print $1}'):3000 (admin/admin)"
EOF
# Write /data/grafana/stop.sh (quoted heredoc delimiter keeps $(...)
# literal in the generated script).
sudo tee /data/grafana/stop.sh <<'EOF'
#!/bin/bash
# Stop Grafana: use the recorded PID when the pid file exists,
# otherwise fall back to matching the process by its command line.
if [ -f /data/grafana/grafana.pid ]; then
  kill "$(cat /data/grafana/grafana.pid)"
  rm -f /data/grafana/grafana.pid
  echo "Grafana stopped"
else
  pkill -f "grafana-server"
fi
EOF

# Make both helper scripts executable and start the service.
sudo chmod +x /data/grafana/*.sh
sudo /data/grafana/start.sh
四、统一管理脚本
bash
# Install /usr/local/bin/monitor-control, a wrapper that starts, stops,
# restarts or reports the status of Node Exporter, Prometheus and Grafana by
# calling each service's start.sh / stop.sh. The heredoc delimiter is quoted
# ('EOF') so that $0 and $1 are stored literally in the generated script.
sudo tee /usr/local/bin/monitor-control <<'EOF'
#!/bin/bash
# Usage: monitor-control {start|stop|restart|status}

# Print the matching process lines (if any) followed by "Running" or "Stopped".
# $1 - extended-regex-free grep pattern matched against `ps aux` output.
report_status() {
  if ps aux | grep -v grep | grep "$1"; then
    echo "Running"
  else
    echo "Stopped"
  fi
}

case "$1" in
  start)
    echo "Starting all monitoring services…"
    # stderr of the start scripts is discarded; a non-zero exit is reported
    # with the fallback message.
    /data/node_exporter/start.sh 2>/dev/null || echo "Node Exporter already running"
    sleep 2
    /data/prometheus/start.sh 2>/dev/null || echo "Prometheus already running"
    sleep 2
    /data/grafana/start.sh 2>/dev/null || echo "Grafana already running"
    echo "All services started"
    ;;
  stop)
    echo "Stopping all monitoring services…"
    /data/node_exporter/stop.sh 2>/dev/null
    /data/prometheus/stop.sh 2>/dev/null
    /data/grafana/stop.sh 2>/dev/null
    echo "All services stopped"
    ;;
  status)
    echo "=== Node Exporter ==="
    report_status "node_exporter.*9100"
    echo "=== Prometheus ==="
    report_status "prometheus.*config"
    echo "=== Grafana ==="
    report_status "grafana-server"
    ;;
  restart)
    "$0" stop
    sleep 3
    "$0" start
    ;;
  *)
    echo "Usage: $0 {start|stop|restart|status}"
    exit 1
    ;;
esac
EOF
sudo chmod +x /usr/local/bin/monitor-control
五、测试验证
bash
# Start all services through the wrapper.
sudo monitor-control start

# Show the status of each service.
sudo monitor-control status

# Probe each service's HTTP endpoint.
curl http://localhost:9090/-/healthy # Prometheus
curl http://localhost:9100/metrics | head # Node Exporter
curl http://localhost:3000/api/health # Grafana
六、添加其他虚拟机
在其他虚拟机上部署 Node Exporter:
bash
# Run on every additional VM that should be monitored.
# These commands write under /data without sudo, so they presumably expect a
# root shell — confirm for your environment.
mkdir -p /data/node_exporter/logs
cd /tmp
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
tar xvf node_exporter-1.8.2.linux-amd64.tar.gz
cp node_exporter-1.8.2.linux-amd64/node_exporter /data/node_exporter/
cd /data/node_exporter
# Start in the background on port 9100, logging to logs/node_exporter.log.
nohup ./node_exporter --web.listen-address="0.0.0.0:9100" > logs/node_exporter.log 2>&1 &
然后在 Prometheus 配置中添加新目标并热加载:
bash
# First edit /data/prometheus/prometheus.yml and add the new VM's IP to the
# relevant targets list, then ask Prometheus to reload its configuration.
curl -X POST http://localhost:9090/-/reload
七、Grafana 添加 Prometheus 数据源
数据源 URL 建议填写 http://127.0.0.1:9090,而不是 http://localhost:9090,这样可以避免 localhost 被解析为 IPv6 地址(::1)而导致连接失败。
八、给数据源添加面板
九、配置告警(AlertManager)
3.1 下载并安装 AlertManager
bash
# Install AlertManager 0.28.0 from the official GitHub release into
# /data/alertmanager. sudo is used for every write under /data, matching the
# other installation steps on this host.

# Create the working directory with data/ and logs/ subdirectories.
sudo mkdir -p /data/alertmanager/{data,logs}

# Download and unpack the official release archive.
cd /tmp
wget https://github.com/prometheus/alertmanager/releases/download/v0.28.0/alertmanager-0.28.0.linux-amd64.tar.gz
tar xvf alertmanager-0.28.0.linux-amd64.tar.gz
sudo cp alertmanager-0.28.0.linux-amd64/alertmanager /data/alertmanager/
sudo cp alertmanager-0.28.0.linux-amd64/amtool /data/alertmanager/
sudo chmod +x /data/alertmanager/alertmanager
3.2 配置 AlertManager
bash
# Write the AlertManager configuration: SMTP settings, a default route grouped
# by alertname/severity with a sub-route for critical alerts, two receivers,
# and an inhibit rule that mutes warnings while a matching critical alert fires.
# Replace the example.com addresses, credentials and webhook URL before use.
sudo tee /data/alertmanager/alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.example.com:587'  # 配置邮件服务器
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'your-username'
  smtp_auth_password: 'your-password'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: critical-receiver

receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'ops-team@example.com'
  - name: 'critical-receiver'
    email_configs:
      - to: 'oncall@example.com'
    webhook_configs:
      - url: 'https://your-webhook-url'  # 可对接钉钉/企微/飞书

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['instance', 'alertname']
EOF
3.3 启动 AlertManager
bash
# Write /data/alertmanager/start.sh. The heredoc delimiter is quoted ('EOF')
# so that $!, $(...) and awk's $1 are stored literally in the generated script.
sudo tee /data/alertmanager/start.sh <<'EOF'
#!/bin/bash
# Launch AlertManager in the background on port 9093.
# Output goes to logs/alertmanager.log; the PID goes to alertmanager.pid.
cd /data/alertmanager || exit 1
nohup ./alertmanager \
  --config.file=/data/alertmanager/alertmanager.yml \
  --web.listen-address="0.0.0.0:9093" \
  --storage.path=/data/alertmanager/data \
  > logs/alertmanager.log 2>&1 &
echo "$!" > alertmanager.pid
echo "AlertManager started with PID: $(cat alertmanager.pid)"
echo "AlertManager UI: http://$(hostname -I | awk '{print $1}'):9093"
EOF
# Write /data/alertmanager/stop.sh (quoted heredoc delimiter keeps $(...)
# literal in the generated script).
sudo tee /data/alertmanager/stop.sh <<'EOF'
#!/bin/bash
# Stop AlertManager: use the recorded PID when the pid file exists,
# otherwise fall back to matching the process by its command line.
if [ -f /data/alertmanager/alertmanager.pid ]; then
  kill "$(cat /data/alertmanager/alertmanager.pid)"
  rm -f /data/alertmanager/alertmanager.pid
else
  pkill -f "alertmanager.*config.file"
fi
EOF

# Make both helper scripts executable and start the service.
sudo chmod +x /data/alertmanager/*.sh
sudo /data/alertmanager/start.sh
3.4 配置 Prometheus 对接 AlertManager
编辑 /data/prometheus/prometheus.yml,添加以下配置:
yaml
# Alerting: send alerts to the local AlertManager instance.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']  # AlertManager address

# Alert rule files to load (glob pattern).
rule_files:
  - "rules/*.yml"
3.5 创建告警规则
bash
# Create the rules directory and write the node alerting rules.
# The heredoc delimiter is quoted ('EOF') so the {{ $labels.instance }} and
# {{ $value }} template variables reach the file unexpanded by the shell.
sudo mkdir -p /data/prometheus/rules
sudo tee /data/prometheus/rules/node_alerts.yml <<'EOF'
groups:
  - name: node_alerts
    interval: 30s
    rules:
      # 1. 节点宕机告警
      - alert: InstanceDown
        expr: up{job="node_exporter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 已宕机"
          description: "{{ $labels.instance }} 已宕机超过 1 分钟"

      # 2. CPU 使用率过高告警
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} CPU 使用率过高"
          description: "CPU 使用率已超过 80%,当前值: {{ $value }}%"

      # 3. 内存使用率过高告警
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} 内存使用率过高"
          description: "内存使用率已超过 90%,当前值: {{ $value }}%"

      # 4. 磁盘空间不足告警
      - alert: DiskSpaceLow
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} 磁盘空间不足"
          description: "磁盘使用率已超过 85%,当前值: {{ $value }}%"

      # 5. 系统负载过高告警
      # on(instance) is required: node_load5 carries more labels than the
      # per-instance CPU count, so without it the two sides never match.
      - alert: HighSystemLoad
        expr: node_load5 / on(instance) count by (instance) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} 系统负载过高"
          description: "5分钟平均负载超过 CPU 核心数的 2 倍"
EOF
3.6 加载配置并验证
bash
# Ask Prometheus to reload its configuration and rule files.
curl -X POST http://localhost:9090/-/reload

# Verify that the alert rule groups were loaded (prints each group name).
curl http://localhost:9090/api/v1/rules | jq '.data.groups[].name'
