AI查看文档001
#!/usr/bin/env bash
set -euo pipefail

# =============================================================================
# Ceph CRUSH Root Usage Monitor
#
# Purpose:
#   Discover every distinct CRUSH "take" root referenced by the cluster's
#   CRUSH rules and process each root exactly once (de-duplicated).
#     - Writes one gauge per root (the highest per-OSD usage percentage) to a
#       node_exporter textfile-collector file.
#     - Appends a human-readable per-root report to a daily log under the
#       reports directory.
#
# Requirements: bash, ceph CLI, python3 on PATH.
#
# Exit codes:
#   0  success
#   1  at least one take_root failed to process (metrics of the others are
#      still published)
#   2  ceph not found            3  python3 not found
#   4  crush rule dump failed    5  crush tree failed
#   6  osd df failed             7  no take_root discovered
#
# Version: 5.0 (de-duplicates by take_root, so several rules sharing one root
#               no longer produce redundant output)
# =============================================================================

# --------------------- Configuration ---------------------
TEXTFILE_DIR="/home/monitor/node_exporter_9101/textfile_collector"
REPORT_DIR="/home/monitor/node_exporter_9101/reports"
REPORT_FILE="${REPORT_DIR}/crush_usage_$(date '+%Y%m%d').log"
TEXTFILE_OUT="${TEXTFILE_DIR}/ceph_crush_root_usage.prom"
# Metrics are staged in this temp file and renamed over TEXTFILE_OUT at the end.
TEXTFILE_TMP="${TEXTFILE_DIR}/.ceph_crush_root_usage.prom.tmp"

# --------------------- Directory setup ---------------------
mkdir -p "$TEXTFILE_DIR" "$REPORT_DIR"

# --------------------- Logging helpers ---------------------

# Print a timestamped message to stdout and append it to the daily report.
log() {
  local msg
  msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  echo "$msg"
  echo "$msg" >> "$REPORT_FILE"
}

# Print a timestamped ERROR message to stderr and append it to the daily report.
log_error() {
  local msg
  msg="[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*"
  echo "$msg" >&2
  echo "$msg" >> "$REPORT_FILE"
}

# Copy stdin to stdout while appending it to the daily report.
tee_report() {
  tee -a "$REPORT_FILE"
}

# --------------------- Environment checks ---------------------
CEPH_BIN="$(command -v ceph || true)"
PYTHON_BIN="$(command -v python3 || true)"

if [[ -z "${CEPH_BIN:-}" ]]; then
  log_error "未找到 ceph 命令,请确保 Ceph 已正确安装"
  exit 2
fi
if [[ -z "${PYTHON_BIN:-}" ]]; then
  log_error "未找到 python3,请安装 Python 3"
  exit 3
fi

# --------------------- Temporary files ---------------------
RULE_JSON_FILE="$(mktemp /tmp/ceph_rule_json.XXXXXX)"
TREE_JSON_FILE="$(mktemp /tmp/ceph_tree_json.XXXXXX)"
DF_JSON_FILE="$(mktemp /tmp/ceph_df_json.XXXXXX)"
CEPH_ERR_FILE="$(mktemp /tmp/ceph_stderr.XXXXXX)"

# Remove every temp file on any exit path. TEXTFILE_TMP is included so an
# aborted run does not leave a stale staging file in the collector directory;
# after a successful rename it no longer exists and `rm -f` is a no-op.
cleanup() {
  rm -f -- "$RULE_JSON_FILE" "$TREE_JSON_FILE" "$DF_JSON_FILE" \
    "$CEPH_ERR_FILE" "$TEXTFILE_TMP" 2>/dev/null || true
}
trap cleanup EXIT

# --------------------- Helpers ---------------------

#######################################
# Run a ceph subcommand with `--format json` and store its stdout in a file.
# On failure, logs the given message plus whatever ceph wrote to stderr, then
# exits the script with the given code.
# Globals:   CEPH_BIN, CEPH_ERR_FILE, REPORT_FILE (read)
# Arguments: $1 - output file
#            $2 - error message to log on failure
#            $3 - exit code to use on failure
#            $4.. - ceph subcommand words (without --format)
#######################################
fetch_ceph_json() {
  local out_file=$1 fail_msg=$2 fail_code=$3
  shift 3
  if ! "$CEPH_BIN" "$@" --format json 2> "$CEPH_ERR_FILE" > "$out_file"; then
    log_error "$fail_msg"
    # Surface the real reason (auth, timeout, ...) instead of discarding it.
    if [[ -s "$CEPH_ERR_FILE" ]]; then
      tee -a "$REPORT_FILE" < "$CEPH_ERR_FILE" >&2
    fi
    exit "$fail_code"
  fi
}

#######################################
# Print the distinct take-root names found in a `ceph osd crush rule dump`
# JSON file, one per line, in first-seen order. Only the first "take" step of
# each rule is considered; empty names are skipped.
# Globals:   PYTHON_BIN (read)
# Arguments: $1 - path to the rule dump JSON
# Outputs:   root names on stdout
# Returns:   python's exit status
#######################################
discover_roots() {
  "$PYTHON_BIN" - "$1" <<'EOF'
import json, sys

with open(sys.argv[1], 'rb') as f:
    content = f.read()
# Tolerate a UTF-8 byte-order mark at the start of the file.
if content.startswith(b'\xef\xbb\xbf'):
    content = content[3:]
rules = json.loads(content.decode('utf-8', errors='replace'))

seen = set()
for r in rules:
    for step in r.get("steps", []):
        if step.get("op") == "take":
            root = step.get("item_name", "").strip()
            if root and root not in seen:
                seen.add(root)
                print(root)
            # Only the first take step of a rule is inspected.
            break
EOF
}

#######################################
# Report per-OSD usage for one take root and append its max-usage gauge to the
# staging textfile.
# Globals:   PYTHON_BIN, TREE_JSON_FILE, DF_JSON_FILE, TEXTFILE_TMP (read)
# Arguments: $1 - take root name
# Outputs:   human-readable report on stdout; warnings on stderr
# Returns:   0 on success or when the root is skipped; non-zero on error
#######################################
process_root() {
  local root_name=$1
  TEXTFILE_TMP="$TEXTFILE_TMP" \
    "$PYTHON_BIN" - "$root_name" "$TREE_JSON_FILE" "$DF_JSON_FILE" <<'PYTHON_SCRIPT'
import json
import sys
import os
from typing import Dict, List


def die(msg: str, code: int = 1) -> None:
    """Print msg to stderr and exit with the given status code."""
    print(msg, file=sys.stderr)
    sys.exit(code)


def load_json(path: str):
    """Read a JSON file (tolerating a UTF-8 BOM); exit with status 4 on failure."""
    try:
        with open(path, 'rb') as f:
            content = f.read()
        if content.startswith(b'\xef\xbb\xbf'):
            content = content[3:]
        return json.loads(content.decode('utf-8', errors='replace'))
    except Exception as e:
        die(f"无法解析 JSON 文件 ({path}): {e}", 4)


def escape_label(value) -> str:
    """Escape a value for use inside a double-quoted Prometheus label."""
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )


if len(sys.argv) != 4:
    die(f"参数错误: 期望 3 个参数,实际收到 {len(sys.argv)-1}", 1)

root_name, tree_path, df_path = sys.argv[1], sys.argv[2], sys.argv[3]
tree = load_json(tree_path)
df = load_json(df_path)

# --------------------- Locate the root in the CRUSH tree ---------------------
nodes = tree.get("nodes", [])
name_to_id = {n.get("name"): n.get("id") for n in nodes if "name" in n and "id" in n}
root_id = name_to_id.get(root_name)
if root_id is None:
    # Defensive: a root named by a rule is expected to be present in the tree.
    # NOTE(review): a rule that takes a device-class shadow root (item_name
    # such as "default~ssd") may be absent from this tree dump and end up
    # here - confirm against a cluster that uses device classes.
    print(f"[跳过] root '{root_name}' 在 CRUSH tree 中不存在,已跳过。")
    sys.exit(0)

id_to_node = {n.get("id"): n for n in nodes if "id" in n}


def get_osd_descendants(node_id: int) -> List[int]:
    """Return ids of all nodes of type "osd" at or below node_id.

    Iterative pre-order walk (children visited in listed order), so deep
    hierarchies need no recursion-limit change; each node is visited once.
    """
    found: List[int] = []
    visited = set()
    stack = [node_id]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        node = id_to_node.get(current)
        if not node:
            continue
        if node.get("type") == "osd":
            found.append(current)
            continue
        # Reversed so that pop() yields children in their original order.
        stack.extend(reversed(node.get("children", []) or []))
    return found


osd_ids = get_osd_descendants(root_id)
if not osd_ids:
    print(f"[跳过] root '{root_name}' 下没有 OSD(空桶),已跳过。")
    sys.exit(0)

# --------------------- Collect per-OSD usage ---------------------
df_nodes = df.get("nodes", [])
df_by_id = {n.get("id"): n for n in df_nodes if isinstance(n, dict) and "id" in n}

osd_info: List[Dict] = []
missing_osds = []
for osd_id in osd_ids:
    osd_node = df_by_id.get(osd_id)
    if not osd_node:
        missing_osds.append(osd_id)
        continue
    kb = osd_node.get("kb", 0) or 0
    kb_used = osd_node.get("kb_used", 0) or 0
    name = osd_node.get("name", f"osd.{osd_id}")
    # An OSD reporting zero capacity counts as 0% rather than dividing by zero.
    usage_pct = (float(kb_used) / float(kb) * 100.0) if kb > 0 else 0.0
    osd_info.append({
        "id": osd_id,
        "name": name,
        "kb": int(kb),
        "kb_used": int(kb_used),
        "usage_pct": usage_pct
    })

if missing_osds:
    print(f"警告: 以下 OSD 在 'ceph osd df' 中未找到: {missing_osds}", file=sys.stderr)

if not osd_info:
    print(f"[跳过] root '{root_name}' 下无可用 OSD 数据,已跳过。")
    sys.exit(0)

# --------------------- Statistics ---------------------
max_osd = max(osd_info, key=lambda x: x["usage_pct"])
min_osd = min(osd_info, key=lambda x: x["usage_pct"])
avg_usage = sum(x["usage_pct"] for x in osd_info) / len(osd_info)
total_kb = sum(x["kb"] for x in osd_info)
total_kb_used = sum(x["kb_used"] for x in osd_info)
total_usage_pct = (float(total_kb_used) / float(total_kb) * 100.0) if total_kb > 0 else 0.0

# --------------------- Terminal / report output ---------------------
print(f"\n{'='*70}")
print(f"Take Root : {root_name}")
print(f"OSD Count : {len(osd_info)}")
print(f"{'='*70}\n")

print(f"{'OSD_ID':<8} {'OSD_NAME':<15} {'CAPACITY_KB':<15} {'USED_KB':<20} {'%USED':>10}")
print(f"{'-'*70}")
for osd in sorted(osd_info, key=lambda x: x["id"]):
    print(f"{osd['id']:<8} {osd['name']:<15} {osd['kb']:<15,} {osd['kb_used']:<20,} {osd['usage_pct']:>9.2f}%")

print(f"\n{'='*70}")
print(f"统计摘要:")
print(f" 总容量 : {total_kb:,} KB ({total_kb/1024/1024:.2f} GB)")
print(f" 已使用 : {total_kb_used:,} KB ({total_kb_used/1024/1024:.2f} GB)")
print(f" 总使用率 : {total_usage_pct:.2f}%")
print(f" 平均使用率 : {avg_usage:.2f}%")
print(f" 最大使用率 : {max_osd['usage_pct']:.2f}% (OSD {max_osd['id']} / {max_osd['name']})")
print(f" 最小使用率 : {min_osd['usage_pct']:.2f}% (OSD {min_osd['id']} / {min_osd['name']})")
print(f"{'='*70}\n")

# --------------------- Textfile collector (max usage only) ---------------------
textfile_tmp = os.environ.get("TEXTFILE_TMP")
if textfile_tmp:
    root_label = escape_label(root_name)
    osd_label = escape_label(max_osd["name"])
    line = (
        f'ceph_crush_root_osd_max_usage_percent{{'
        f'take_root="{root_label}",'
        f'osd_name="{osd_label}"'
        f'}} {max_osd["usage_pct"]:.2f}\n'
    )
    with open(textfile_tmp, 'a') as f:
        f.write(line)
    print(f"✓ 指标已写入 textfile collector: {textfile_tmp}")
else:
    print("✗ 环境变量 TEXTFILE_TMP 未设置,跳过 textfile 写入", file=sys.stderr)
PYTHON_SCRIPT
}

# --------------------- Report header ---------------------
{
  echo ""
  echo "======================================================================"
  echo " Ceph CRUSH Root OSD 使用率报告"
  echo " 生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "======================================================================"
  echo ""
} | tee_report

# --------------------- Fetch Ceph data ---------------------
log "开始获取 Ceph 集群数据..."

fetch_ceph_json "$RULE_JSON_FILE" "获取 CRUSH rule 配置失败" 4 osd crush rule dump
fetch_ceph_json "$TREE_JSON_FILE" "获取 CRUSH tree 失败" 5 osd crush tree
fetch_ceph_json "$DF_JSON_FILE" "获取 OSD df 信息失败" 6 osd df

log "Ceph 数据获取完成"

# --------------------- Discover unique take roots (ordered, de-duplicated) ---------------------
# Command substitution (rather than process substitution) is used so that a
# failure of the discovery step is observable; either a failure or an empty
# result leaves UNIQUE_ROOTS empty and is reported below.
UNIQUE_ROOTS=()
if roots_output="$(discover_roots "$RULE_JSON_FILE")" && [[ -n "$roots_output" ]]; then
  mapfile -t UNIQUE_ROOTS <<< "$roots_output"
fi

if [[ ${#UNIQUE_ROOTS[@]} -eq 0 ]]; then
  log_error "未能从集群中发现任何 take_root,请检查 ceph 权限或集群状态"
  exit 7
fi

log "自动发现 ${#UNIQUE_ROOTS[@]} 个唯一 take_root: ${UNIQUE_ROOTS[*]}"

# --------------------- Initialise the staging textfile ---------------------
cat > "$TEXTFILE_TMP" <<'PROMEOF'
# HELP ceph_crush_root_osd_max_usage_percent Maximum OSD usage percentage under CRUSH take_root
# TYPE ceph_crush_root_osd_max_usage_percent gauge
PROMEOF

# --------------------- Process each unique take root ---------------------
failed_roots=0
for ROOT_NAME in "${UNIQUE_ROOTS[@]}"; do
  log "处理 take_root: $ROOT_NAME"

  # The pipeline is the `if` condition on purpose: with `set -e` and
  # `pipefail`, a failing pipeline run as a plain statement would abort the
  # script before any later `$?` check, so the error branch would never run
  # and the remaining roots would be skipped.
  if process_root "$ROOT_NAME" | tee_report; then
    log "take_root '$ROOT_NAME' 处理完成"
  else
    log_error "take_root '$ROOT_NAME' 处理失败"
    failed_roots=$((failed_roots + 1))
  fi

  echo "" | tee_report
done

# --------------------- Atomically publish the textfile ---------------------
# Both paths are in TEXTFILE_DIR, so the rename replaces the old file in one step.
mv -f "$TEXTFILE_TMP" "$TEXTFILE_OUT"
log "textfile collector 已更新: $TEXTFILE_OUT"

# --------------------- Prune old reports (keep the last 30 days) ---------------------
# Best-effort housekeeping: a failure here must not fail the whole run.
find "$REPORT_DIR" -name "crush_usage_*.log" -mtime +30 -delete 2>/dev/null || true

log "所有 take_root 处理完成"
log "报告已保存至: $REPORT_FILE"

# Signal partial failure to the caller (cron, systemd, ...) after publishing
# whatever succeeded.
if (( failed_roots > 0 )); then
  log_error "${failed_roots} 个 take_root 处理失败"
  exit 1
fi

ceph环境脚本运行结果:
root@gm1-pub-ceph-172-16-3-107:/tmp# bash /home/monitor/node_exporter_9101/collectors/crushrule_usage_report_nojq_v5_metrics.sh
======================================================================
Ceph CRUSH Root OSD 使用率报告
生成时间: 2026-04-24 15:15:09
======================================================================
[2026-04-24 15:15:09] 开始获取 Ceph 集群数据...
[2026-04-24 15:15:10] Ceph 数据获取完成
[2026-04-24 15:15:10] 自动发现 2 个唯一 take_root: default ssd
[2026-04-24 15:15:10] 处理 take_root: default
======================================================================
Take Root : default
OSD Count : 12
======================================================================
OSD_ID OSD_NAME CAPACITY_KB USED_KB %USED
----------------------------------------------------------------------
0 osd.0 4,140,752,888 1,702,653,156 41.12%
1 osd.1 4,140,752,888 1,614,773,804 39.00%
2 osd.2 4,140,752,888 1,654,648,876 39.96%
3 osd.3 4,140,752,888 1,837,897,260 44.39%
4 osd.4 3,907,014,656 1,340,719,904 34.32%
5 osd.5 4,141,436,920 1,749,448,584 42.24%
6 osd.6 4,141,436,920 1,657,205,256 40.02%
7 osd.7 4,141,436,920 1,836,856,668 44.35%
8 osd.8 4,141,436,920 1,792,965,288 43.29%
9 osd.9 4,141,436,920 1,522,847,576 36.77%
10 osd.10 4,141,436,920 1,793,701,216 43.31%
11 osd.11 4,141,436,920 1,701,000,068 41.07%
======================================================================
统计摘要:
总容量 : 49,460,084,648 KB (47168.81 GB)
已使用 : 20,204,717,656 KB (19268.72 GB)
总使用率 : 40.85%
平均使用率 : 40.82%
最大使用率 : 44.39% (OSD 3 / osd.3)
最小使用率 : 34.32% (OSD 4 / osd.4)
======================================================================
✓ 指标已写入 textfile collector: /home/monitor/node_exporter_9101/textfile_collector/.ceph_crush_root_usage.prom.tmp
[2026-04-24 15:15:10] take_root 'default' 处理完成
[2026-04-24 15:15:10] 处理 take_root: ssd
======================================================================
Take Root : ssd
OSD Count : 3
======================================================================
OSD_ID OSD_NAME CAPACITY_KB USED_KB %USED
----------------------------------------------------------------------
12 osd.12 937,160,704 3,288,832 0.35%
13 osd.13 937,689,088 3,874,516 0.41%
14 osd.14 937,689,088 4,192,704 0.45%
======================================================================
统计摘要:
总容量 : 2,812,538,880 KB (2682.25 GB)
已使用 : 11,356,052 KB (10.83 GB)
总使用率 : 0.40%
平均使用率 : 0.40%
最大使用率 : 0.45% (OSD 14 / osd.14)
最小使用率 : 0.35% (OSD 12 / osd.12)
======================================================================
✓ 指标已写入 textfile collector: /home/monitor/node_exporter_9101/textfile_collector/.ceph_crush_root_usage.prom.tmp
[2026-04-24 15:15:10] take_root 'ssd' 处理完成
[2026-04-24 15:15:10] textfile collector 已更新: /home/monitor/node_exporter_9101/textfile_collector/ceph_crush_root_usage.prom
[2026-04-24 15:15:10] 所有 take_root 处理完成
[2026-04-24 15:15:10] 报告已保存至: /home/monitor/node_exporter_9101/reports/crush_usage_20260424.log
root@gm1-pub-ceph-172-16-3-107:/tmp#
