#!/usr/bin/env bash
#
# rocm_rccl_doctor.sh
# Checks an AMD ROCm / RCCL installation and its configuration, and can
# optionally repair the problems it finds.
#
# @Time : ${DATE} ${TIME}
# @Author : Hank Fu (hankf@amd.com)
# @File : ${NAME}.sh
#
# Usage:
#   bash rocm_rccl_doctor.sh                 # check only, change nothing
#   bash rocm_rccl_doctor.sh --fix           # check and repair
#   bash rocm_rccl_doctor.sh --fix --yes     # auto-confirm every repair action
#   bash rocm_rccl_doctor.sh --fix --rocm-version 6.2
#
# Environment:
#   ROCM_VERSION=6.2 bash rocm_rccl_doctor.sh --fix
#
# NOTE(review): the original header carried a commented-out
# `set -Eeuo pipefail`. Strict mode is left disabled here; confirm before
# enabling it, since print_summary deliberately returns non-zero.

# ---------------------------------------------------------------------------
# Global state
# ---------------------------------------------------------------------------
MODE_FIX=0          # 1 when --fix was given
ASSUME_YES=0        # 1 when --yes was given
RUN_SMOKE_TESTS=1   # 0 when --no-smoke-test was given
ROCM_VERSION="${ROCM_VERSION:-}"
ROCM_VERSION_LOCKED=0   # 1 when the version came from the env or --rocm-version
if [[ -n "${ROCM_VERSION}" ]]; then
  ROCM_VERSION_LOCKED=1
fi

ISSUES=()     # blocking problems, printed by print_summary
WARNINGS=()   # non-blocking notes, printed by print_summary
ACTIONS=()    # repair actions that were attempted

readonly COLOR_RED="\033[31m"
readonly COLOR_GREEN="\033[32m"
readonly COLOR_YELLOW="\033[33m"
readonly COLOR_BLUE="\033[34m"
readonly COLOR_RESET="\033[0m"

# ---------------------------------------------------------------------------
# Logging and bookkeeping helpers
# ---------------------------------------------------------------------------
# Only the color codes go through %b; the message is printed with %s so that
# backslashes in logged data (paths, tool output) are not interpreted.
log_info() { printf '%b[INFO]%b %s\n' "${COLOR_BLUE}" "${COLOR_RESET}" "$*"; }
log_ok() { printf '%b[ OK ]%b %s\n' "${COLOR_GREEN}" "${COLOR_RESET}" "$*"; }
log_warn() { printf '%b[WARN]%b %s\n' "${COLOR_YELLOW}" "${COLOR_RESET}" "$*"; }
log_error() { printf '%b[ERR ]%b %s\n' "${COLOR_RED}" "${COLOR_RESET}" "$*" >&2; }

add_issue() { ISSUES+=("$*"); }
add_warning() { WARNINGS+=("$*"); }
add_action() { ACTIONS+=("$*"); }

#######################################
# Print the command-line help text to stdout.
#######################################
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --fix                  自动尝试修复发现的问题
  --yes                  自动确认所有修复动作(与 --fix 一起使用)
  --rocm-version <ver>   指定 ROCm 仓库版本(默认: 自动检测系统版本)
  --no-smoke-test        跳过 HIP/RCCL 编译运行烟雾测试
  -h, --help             显示帮助

Examples:
  $0
  $0 --fix
  $0 --fix --yes --rocm-version 6.2
EOF
}

#######################################
# Ask the user a yes/no question.
# Globals:   ASSUME_YES (read)
# Arguments: $1 - prompt text
# Returns:   0 if ASSUME_YES is 1 or the answer is exactly y/Y, else non-zero
#######################################
confirm() {
  local prompt="$1"
  local ans=""
  if [[ "${ASSUME_YES}" -eq 1 ]]; then
    return 0
  fi
  read -r -p "${prompt} [y/N]: " ans
  [[ "${ans}" =~ ^[Yy]$ ]]
}

#######################################
# Record an issue if a command is not available.
# Arguments: $1 - command name
# Returns:   0 if the command exists, 1 otherwise
#######################################
need_cmd() {
  local cmd="$1"
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    add_issue "缺少命令: ${cmd}"
    return 1
  fi
  return 0
}

#######################################
# Run a shell command line with root privileges (directly when already root,
# through sudo otherwise).
# Arguments: all arguments are joined into one string and handed to `bash -c`,
#            so callers must only interpolate validated or %q-quoted values.
# Returns:   the exit status of the command
#######################################
run_root() {
  local cmd="$*"
  if [[ "${EUID}" -eq 0 ]]; then
    bash -c "${cmd}"
  else
    sudo bash -c "${cmd}"
  fi
}

#######################################
# Check that a ROCm version string is safe to embed in a repository URL and
# in a root shell command (letters, digits, '.', '_' and '-' only).
# Arguments: $1 - version string
# Returns:   0 if acceptable, 1 otherwise
#######################################
is_valid_rocm_version() {
  [[ "$1" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]
}

#######################################
# Parse command-line options into the global flags.
# Globals:   MODE_FIX, ASSUME_YES, RUN_SMOKE_TESTS, ROCM_VERSION,
#            ROCM_VERSION_LOCKED (written)
# Arguments: the script's arguments
# Exits:     0 after --help, 2 on an unknown option or a bad option value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --fix)
        MODE_FIX=1
        shift
        ;;
      --yes)
        ASSUME_YES=1
        shift
        ;;
      --rocm-version)
        if [[ $# -lt 2 ]]; then
          log_error "--rocm-version 需要一个值"
          exit 2
        fi
        if ! is_valid_rocm_version "$2"; then
          log_error "无效的 ROCm 版本: $2"
          exit 2
        fi
        ROCM_VERSION="$2"
        ROCM_VERSION_LOCKED=1
        shift 2
        ;;
      --no-smoke-test)
        RUN_SMOKE_TESTS=0
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        log_error "未知参数: $1"
        usage
        exit 2
        ;;
    esac
  done
}

#######################################
# Decide which ROCm repository version to use.
# A version given by the user (env or flag) wins. Otherwise the version is
# read from /opt/rocm/.info/*, then from `hipcc --version`, and finally
# falls back to 6.2 with a warning. Only major.minor is kept.
# Globals:   ROCM_VERSION (written), ROCM_VERSION_LOCKED (read)
# Exits:     2 if a user-supplied version contains unsafe characters
#######################################
detect_rocm_version() {
  if [[ "${ROCM_VERSION_LOCKED}" -eq 1 ]]; then
    # Covers the ROCM_VERSION environment variable, which parse_args never sees.
    if ! is_valid_rocm_version "${ROCM_VERSION}"; then
      log_error "无效的 ROCm 版本: ${ROCM_VERSION}"
      exit 2
    fi
    log_info "使用指定 ROCm 版本: ${ROCM_VERSION}"
    return
  fi

  local raw_version=""
  local repo_version=""
  local f

  for f in /opt/rocm/.info/version /opt/rocm/.info/version-dev /opt/rocm/.info/version-utils; do
    if [[ -f "${f}" ]]; then
      raw_version="$(grep -Eo '[0-9]+(\.[0-9]+)+' "${f}" | head -n1 || true)"
      if [[ -n "${raw_version}" ]]; then
        break
      fi
    fi
  done

  if [[ -z "${raw_version}" ]] && command -v hipcc >/dev/null 2>&1; then
    raw_version="$(hipcc --version 2>/dev/null \
      | grep -Eo 'HIP version: *[0-9]+(\.[0-9]+)+' \
      | grep -Eo '[0-9]+(\.[0-9]+)+' \
      | head -n1 || true)"
  fi

  if [[ -z "${raw_version}" ]]; then
    raw_version="6.2"
    add_warning "无法自动检测 ROCm 版本,回退使用默认仓库版本 ${raw_version}"
  fi

  repo_version="$(echo "${raw_version}" | grep -Eo '^[0-9]+\.[0-9]+' | head -n1 || true)"
  if [[ -z "${repo_version}" ]]; then
    repo_version="${raw_version}"
  fi

  ROCM_VERSION="${repo_version}"
  log_info "自动检测 ROCm 版本: ${ROCM_VERSION} (raw=${raw_version})"
}

#######################################
# Detect the distribution and the available package manager.
# Globals:   OS_ID, OS_VERSION_ID, OS_CODENAME, PKG_MGR (written)
#######################################
detect_platform() {
  if [[ -f /etc/os-release ]]; then
    # shellcheck disable=SC1091
    source /etc/os-release
    OS_ID="${ID:-unknown}"
    OS_VERSION_ID="${VERSION_ID:-unknown}"
    OS_CODENAME="${VERSION_CODENAME:-${UBUNTU_CODENAME:-}}"
  else
    OS_ID="unknown"
    OS_VERSION_ID="unknown"
    OS_CODENAME=""
  fi

  if command -v apt-get >/dev/null 2>&1; then
    PKG_MGR="apt"
  elif command -v dnf >/dev/null 2>&1; then
    PKG_MGR="dnf"
  elif command -v yum >/dev/null 2>&1; then
    PKG_MGR="yum"
  elif command -v zypper >/dev/null 2>&1; then
    PKG_MGR="zypper"
  else
    PKG_MGR="unknown"
  fi

  log_info "系统: ${OS_ID} ${OS_VERSION_ID} (codename=${OS_CODENAME:-N/A}), 包管理器: ${PKG_MGR}"
}

#######################################
# In fix mode as a non-root user, make sure sudo is usable.
# Records an issue when sudo is missing and a warning when it will prompt.
#######################################
check_sudo_if_needed() {
  if [[ "${MODE_FIX}" -eq 1 && "${EUID}" -ne 0 ]]; then
    if ! command -v sudo >/dev/null 2>&1; then
      add_issue "需要 sudo 执行修复,但系统不存在 sudo"
      return
    fi
    if ! sudo -n true 2>/dev/null; then
      add_warning "后续修复可能会提示输入 sudo 密码"
    fi
  fi
}

#######################################
# Look for an AMD display controller in the lspci output.
#######################################
check_amd_gpu() {
  if ! command -v lspci >/dev/null 2>&1; then
    add_warning "未找到 lspci,无法检查 AMD GPU 设备(可安装 pciutils)"
    return
  fi

  local gpu_lines
  gpu_lines="$(lspci | grep -Ei 'VGA|3D|Display' || true)"
  if [[ -z "${gpu_lines}" ]]; then
    add_issue "未检测到显示控制器设备(VGA/3D/Display)"
    return
  fi

  if grep -Eiq 'AMD|Advanced Micro Devices|Radeon' <<<"${gpu_lines}"; then
    log_ok "检测到 AMD 显卡设备"
  else
    add_warning "检测到显示设备,但似乎不是 AMD GPU: ${gpu_lines//$'\n'/; }"
  fi
}

#######################################
# Check that the amdgpu kernel module is loaded and that the /dev/kfd and
# /dev/dri/renderD* device nodes exist.
#######################################
check_kernel_and_devices() {
  if command -v lsmod >/dev/null 2>&1 && lsmod | grep -qE '^amdgpu[[:space:]]'; then
    log_ok "内核模块 amdgpu 已加载"
  elif [[ -d /sys/module/amdgpu ]]; then
    # Fallback for a missing lsmod or lsmod output that lacks the module.
    log_ok "内核模块 amdgpu 已加载(通过 /sys/module/amdgpu 检测)"
  else
    if ! command -v lsmod >/dev/null 2>&1; then
      add_warning "未找到 lsmod,无法通过 lsmod 检查 amdgpu 模块"
    fi
    add_issue "内核模块 amdgpu 未加载"
  fi

  if [[ -e /dev/kfd ]]; then
    log_ok "/dev/kfd 存在"
  else
    add_issue "/dev/kfd 不存在(ROCm 驱动接口不可用)"
  fi

  if [[ -d /dev/dri ]]; then
    # compgen -G tests whether the glob matches anything, without parsing ls.
    if compgen -G '/dev/dri/renderD*' >/dev/null 2>&1; then
      log_ok "/dev/dri/renderD* 存在"
    else
      add_issue "/dev/dri 下无 render 设备节点"
    fi
  else
    add_issue "/dev/dri 不存在"
  fi
}

#######################################
# Print the user whose group membership matters: the invoking user under
# sudo, otherwise $USER, otherwise `id -un` ($USER can be unset in
# containers and cron jobs).
#######################################
target_user() {
  local name="${SUDO_USER:-${USER:-}}"
  if [[ -z "${name}" ]]; then
    name="$(id -un 2>/dev/null || true)"
  fi
  printf '%s\n' "${name}"
}

#######################################
# Test group membership.
# Arguments: $1 - user name, $2 - group name
# Returns:   0 if the user belongs to the group
#######################################
user_in_group() {
  id -nG "$1" | tr ' ' '\n' | grep -qx -- "$2"
}

#######################################
# Check that the target user belongs to the video and render groups.
#######################################
check_user_groups() {
  local user_name grp
  user_name="$(target_user)"

  for grp in video render; do
    if user_in_group "${user_name}" "${grp}"; then
      log_ok "用户 ${user_name} 属于 ${grp} 组"
    else
      add_issue "用户 ${user_name} 不在 ${grp} 组"
    fi
  done
}

#######################################
# Locate hipcc on PATH or in the usual install locations.
# Outputs:   the path of hipcc on stdout
# Returns:   0 if found, 1 otherwise
#######################################
find_hipcc() {
  local p
  if command -v hipcc >/dev/null 2>&1; then
    command -v hipcc
    return 0
  fi

  for p in /usr/bin/hipcc /opt/rocm/bin/hipcc; do
    if [[ -x "${p}" ]]; then
      echo "${p}"
      return 0
    fi
  done
  return 1
}

#######################################
# Check for /opt/rocm, rocminfo and hipcc.
#######################################
check_rocm_install() {
  local hipcc_path=""

  if [[ -d /opt/rocm ]]; then
    log_ok "/opt/rocm 存在"
  else
    add_issue "/opt/rocm 不存在(ROCm 可能未安装)"
  fi

  if command -v rocminfo >/dev/null 2>&1; then
    log_ok "rocminfo 已安装"
  else
    add_issue "rocminfo 不存在"
  fi

  if hipcc_path="$(find_hipcc)"; then
    log_ok "hipcc 已安装 (${hipcc_path})"
  else
    add_issue "hipcc 不存在或不可执行(已检查 PATH, /usr/bin/hipcc, /opt/rocm/bin/hipcc)"
  fi
}

#######################################
# Locate an RCCL header.
# rccl.h locations are tried first; nccl.h locations follow because RCCL
# is usually NCCL-API compatible and some packages ship only nccl.h.
# Outputs:   the path of the first header found on stdout
# Returns:   0 if found, 1 otherwise
#######################################
find_rccl_header() {
  local candidates=(
    "/opt/rocm/include/rccl/rccl.h"
    "/opt/rocm/include/rccl.h"
    "/usr/include/rccl/rccl.h"
    "/usr/include/rccl.h"
    "/usr/local/include/rccl/rccl.h"
    "/usr/local/include/rccl.h"
    "/opt/rocm/include/nccl.h"
    "/opt/rocm/include/rccl/nccl.h"
    "/usr/include/nccl.h"
    "/usr/local/include/nccl.h"
  )
  local p
  for p in "${candidates[@]}"; do
    if [[ -f "${p}" ]]; then
      echo "${p}"
      return 0
    fi
  done
  return 1
}

#######################################
# Check that the RCCL shared library and a header are present.
#######################################
check_rccl_install() {
  local has_lib=0
  local has_header=0
  local rccl_header_path=""

  if ldconfig -p 2>/dev/null | grep -q 'librccl\.so'; then
    has_lib=1
  elif [[ -e /opt/rocm/lib/librccl.so || -e /opt/rocm/lib64/librccl.so ]]; then
    has_lib=1
  fi

  if rccl_header_path="$(find_rccl_header)"; then
    has_header=1
  fi

  if [[ "${has_lib}" -eq 1 ]]; then
    log_ok "RCCL 动态库已找到 (librccl.so)"
  else
    add_issue "未找到 RCCL 动态库 librccl.so"
  fi

  if [[ "${has_header}" -eq 1 ]]; then
    log_ok "RCCL 头文件已找到 (${rccl_header_path})"
  else
    add_issue "未找到 RCCL 头文件(rccl.h/nccl.h),已检查 /opt/rocm/include, /usr/include, /usr/local/include"
  fi
}

#######################################
# Check that the ldconfig drop-in lists both ROCm library directories.
# Whole lines are matched: a bare '^/opt/rocm/lib' prefix test would also be
# satisfied by the lib64 entry alone.
# Arguments: $1 - config file to inspect (default /etc/ld.so.conf.d/rocm.conf)
#######################################
check_ldconfig_rocm() {
  local conf="${1:-/etc/ld.so.conf.d/rocm.conf}"
  if [[ -f "${conf}" ]] \
    && grep -Eq '^/opt/rocm/lib/?[[:space:]]*$' "${conf}" \
    && grep -Eq '^/opt/rocm/lib64/?[[:space:]]*$' "${conf}"; then
    log_ok "ldconfig 已配置 rocm.conf"
  else
    add_issue "ldconfig 未完整配置 /opt/rocm/lib 与 /opt/rocm/lib64(${conf})"
  fi
}

#######################################
# Test whether an APT source for repo.radeon.com/rocm/apt is configured,
# in either one-line (*.list) or deb822 (*.sources) files.
# Returns:   0 if a matching source entry exists
#######################################
apt_has_rocm_repo() {
  # -s silences unmatched globs and missing files; -q stops at the first hit.
  grep -Rqs "repo\.radeon\.com/rocm/apt" \
    /etc/apt/sources.list \
    /etc/apt/sources.list.d/*.list \
    /etc/apt/sources.list.d/*.sources
}

#######################################
# Install the ROCm signing key and APT source for ROCM_VERSION.
# Globals:   OS_CODENAME, ROCM_VERSION (read)
# Returns:   0 on success, 1 as soon as any step fails
#######################################
configure_rocm_repo_apt() {
  local codename="${OS_CODENAME}"
  if [[ -z "${codename}" ]]; then
    if command -v lsb_release >/dev/null 2>&1; then
      codename="$(lsb_release -cs || true)"
    fi
  fi

  if [[ -z "${codename}" ]]; then
    log_error "无法获取 Ubuntu codename,不能自动配置 ROCm APT 仓库"
    return 1
  fi

  add_action "配置 ROCm APT 仓库 (version=${ROCM_VERSION}, codename=${codename})"

  local repo_line
  repo_line="deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} ${codename} main"

  run_root "mkdir -p /etc/apt/keyrings" || return 1
  run_root "apt-get update" || return 1
  run_root "apt-get install -y wget gpg" || return 1
  # pipefail: a failed download must not be masked by gpg's exit status.
  # --batch --yes: overwrite an existing keyring without an interactive prompt.
  run_root "set -o pipefail; wget -qO- https://repo.radeon.com/rocm/rocm.gpg.key | gpg --batch --yes --dearmor -o /etc/apt/keyrings/rocm.gpg" || return 1
  # %q keeps the line intact when it is re-parsed by the root shell.
  run_root "printf '%s\n' $(printf '%q' "${repo_line}") > /etc/apt/sources.list.d/rocm.list" || return 1
  run_root "apt-get update" || return 1
}

# Package installers, one per supported package manager. Each records the
# action and returns the package manager's exit status.
install_rocm_rccl_apt() {
  add_action "安装 ROCm/RCCL 相关软件包 (APT)"
  run_root "apt-get install -y rocm-hip-runtime rocm-dev rocminfo rccl rccl-dev"
}

install_rocm_rccl_dnf() {
  add_action "安装 ROCm/RCCL 相关软件包 (DNF)"
  run_root "dnf install -y rocm-hip-runtime rocm-devel rocminfo rccl rccl-devel"
}

install_rocm_rccl_yum() {
  add_action "安装 ROCm/RCCL 相关软件包 (YUM)"
  run_root "yum install -y rocm-hip-runtime rocm-devel rocminfo rccl rccl-devel"
}

install_rocm_rccl_zypper() {
  add_action "安装 ROCm/RCCL 相关软件包 (ZYPPER)"
  run_root "zypper --non-interactive install rocm-hip-runtime rocm-devel rocminfo rccl rccl-devel"
}

#######################################
# Rewrite the ldconfig drop-in with both ROCm library paths and refresh the
# linker cache.
# Returns:   0 on success, 1 if writing the file or running ldconfig fails
#######################################
fix_ldconfig_rocm() {
  add_action "修复 ldconfig 的 ROCm 库路径配置"
  run_root "printf '%s\n' /opt/rocm/lib /opt/rocm/lib64 > /etc/ld.so.conf.d/rocm.conf" || return 1
  run_root "ldconfig" || return 1
}

#######################################
# Add the target user to the video and render groups when missing.
# Returns:   0 on success, 1 if the user is unknown or usermod fails
#######################################
fix_user_groups() {
  local user_name grp quoted_user
  local rc=0
  user_name="$(target_user)"
  if [[ -z "${user_name}" ]]; then
    return 1
  fi
  # The name is re-parsed by `bash -c` inside run_root, so quote it.
  quoted_user="$(printf '%q' "${user_name}")"

  for grp in video render; do
    if ! user_in_group "${user_name}" "${grp}"; then
      add_action "将用户 ${user_name} 加入 ${grp} 组"
      run_root "usermod -aG ${grp} ${quoted_user}" || rc=1
    fi
  done
  return "${rc}"
}

#######################################
# Ask for confirmation, then run one package installer.
# Arguments: $1 - label used in messages (APT/DNF/YUM/ZYPPER)
#            $2 - installer function name
#######################################
confirm_and_install() {
  local label="$1"
  local installer="$2"
  if confirm "是否安装/修复 ROCm + RCCL 软件包 (${label})?"; then
    "${installer}" || add_issue "${label} 安装 ROCm/RCCL 失败"
  else
    add_warning "用户跳过了 ${label} 安装步骤"
  fi
}

#######################################
# Install or repair the ROCm/RCCL packages with the detected package manager.
# On APT systems the ROCm repository is configured first when it is missing.
# Globals:   PKG_MGR (read)
#######################################
install_or_fix_by_pkg_manager() {
  case "${PKG_MGR}" in
    apt)
      if ! apt_has_rocm_repo; then
        log_warn "未检测到 ROCm APT 仓库"
        if confirm "是否自动配置 ROCm APT 仓库并继续安装?"; then
          configure_rocm_repo_apt || add_issue "配置 ROCm APT 仓库失败"
        else
          add_warning "用户跳过了 ROCm 仓库配置"
        fi
      fi
      confirm_and_install "APT" install_rocm_rccl_apt
      ;;
    dnf)
      confirm_and_install "DNF" install_rocm_rccl_dnf
      ;;
    yum)
      confirm_and_install "YUM" install_rocm_rccl_yum
      ;;
    zypper)
      confirm_and_install "ZYPPER" install_rocm_rccl_zypper
      ;;
    *)
      add_issue "不支持自动安装:未知包管理器 ${PKG_MGR}"
      ;;
  esac
}

#######################################
# Smoke test: run rocminfo and capture its output.
# Arguments: $1 - directory for the log files
#######################################
smoke_test_rocminfo() {
  local dir="$1"
  if ! command -v rocminfo >/dev/null 2>&1; then
    add_issue "烟雾测试失败:rocminfo 不存在"
    return
  fi
  if rocminfo >"${dir}/rocminfo.out" 2>"${dir}/rocminfo.err"; then
    log_ok "rocminfo 运行成功"
  else
    add_issue "rocminfo 运行失败,详见 ${dir}/rocminfo.err"
  fi
}

#######################################
# Smoke test: compile and run a host-only program with hipcc.
# Arguments: $1 - directory for sources, binaries and logs
#            $2 - path of hipcc
#######################################
smoke_test_hip() {
  local dir="$1"
  local hipcc_path="$2"
  local src="${dir}/hip_smoke.cpp"
  local bin="${dir}/hip_smoke"
  local build_err="${dir}/hip_smoke_build.err"
  local run_err="${dir}/hip_smoke_run.err"

  cat >"${src}" <<'EOF'
#include <iostream>

// 仅验证 hipcc 工具链可正常编译与运行,避免驱动/运行时 ABI 细节导致误报
int main() {
  std::cout << "hipcc toolchain smoke test passed" << std::endl;
  return 0;
}
EOF

  if "${hipcc_path}" "${src}" -o "${bin}" >"${dir}/hip_smoke_build.out" 2>"${build_err}" \
    && "${bin}" >"${dir}/hip_smoke_run.out" 2>"${run_err}"; then
    log_ok "HIP 编译与运行测试成功(toolchain)"
    return
  fi

  if grep -q "__AMDGCN_WAVEFRONT_SIZE" "${build_err}" 2>/dev/null; then
    add_issue "HIP 烟雾测试失败:检测到 __AMDGCN_WAVEFRONT_SIZE 宏兼容问题(通常是头文件/编译器版本不匹配),见 ${build_err}"
  elif grep -q "free(): invalid pointer" "${run_err}" 2>/dev/null; then
    add_issue "HIP 烟雾测试运行异常:free(): invalid pointer。已将测试降级为纯 toolchain 检查;请检查系统 ROCm 运行时库版本一致性(见 ${run_err})"
  else
    add_issue "HIP 烟雾测试失败(编译或运行),见 ${build_err} ${run_err}"
  fi
}

#######################################
# Smoke test: link against librccl and call ncclGetVersion.
# Arguments: $1 - directory for sources, binaries and logs
#            $2 - path of hipcc
#            $3 - path of the RCCL header (its directory is passed as -I)
#######################################
smoke_test_rccl() {
  local dir="$1"
  local hipcc_path="$2"
  local rccl_header_path="$3"
  local rccl_include_dir
  local src="${dir}/rccl_smoke.cpp"
  local bin="${dir}/rccl_smoke"
  local build_err="${dir}/rccl_smoke_build.err"
  local run_err="${dir}/rccl_smoke_run.err"

  rccl_include_dir="$(dirname "${rccl_header_path}")"

  cat >"${src}" <<'EOF'
#include <iostream>

// 仅做 API 连通性验证,避免不同版本头文件中的设备端宏兼容问题
extern "C" int ncclGetVersion(int* version);

int main() {
  int version = 0;
  int r = ncclGetVersion(&version);
  if (r != 0) {
    std::cerr << "rccl ncclGetVersion failed, rc=" << r << std::endl;
    return 2;
  }
  std::cout << "RCCL version(API): " << version << std::endl;
  return 0;
}
EOF

  if "${hipcc_path}" "${src}" -I"${rccl_include_dir}" \
    -L/opt/rocm/lib -L/opt/rocm/lib64 -lrccl -o "${bin}" \
    >"${dir}/rccl_smoke_build.out" 2>"${build_err}" \
    && "${bin}" >"${dir}/rccl_smoke_run.out" 2>"${run_err}"; then
    log_ok "RCCL 编译与运行测试成功"
    return
  fi

  if grep -q "__AMDGCN_WAVEFRONT_SIZE" "${build_err}" 2>/dev/null; then
    add_issue "RCCL 烟雾测试失败:检测到 __AMDGCN_WAVEFRONT_SIZE 宏兼容问题(通常是头文件/编译器版本不匹配)。已改为最小 API 测试,请检查 ${build_err}"
  else
    add_issue "RCCL 烟雾测试失败(编译或运行),见 ${build_err} ${run_err}"
  fi
}

#######################################
# Run the rocminfo, HIP and RCCL smoke tests unless disabled.
# All artifacts go into a private mktemp directory instead of fixed /tmp
# names. The directory is removed when every test passes and kept (its path
# is in the issue messages) when something fails.
# Globals:   RUN_SMOKE_TESTS (read), ISSUES (appended)
#######################################
run_smoke_tests() {
  if [[ "${RUN_SMOKE_TESTS}" -ne 1 ]]; then
    log_warn "已跳过烟雾测试 (--no-smoke-test)"
    return
  fi

  log_info "开始执行烟雾测试"

  local smoke_dir=""
  if ! smoke_dir="$(mktemp -d "${TMPDIR:-/tmp}/rocm_rccl_doctor.XXXXXX")"; then
    add_issue "烟雾测试失败:无法创建临时目录"
    return
  fi
  local issues_before="${#ISSUES[@]}"

  smoke_test_rocminfo "${smoke_dir}"

  local hipcc_path=""
  local rccl_header_path=""

  if hipcc_path="$(find_hipcc)"; then
    log_info "使用 hipcc: ${hipcc_path}"
    smoke_test_hip "${smoke_dir}" "${hipcc_path}"
  else
    add_issue "烟雾测试失败:hipcc 不存在或不可执行(已检查 PATH, /usr/bin/hipcc, /opt/rocm/bin/hipcc)"
  fi

  if [[ -n "${hipcc_path}" ]] && rccl_header_path="$(find_rccl_header)"; then
    smoke_test_rccl "${smoke_dir}" "${hipcc_path}" "${rccl_header_path}"
  else
    add_issue "烟雾测试失败:缺少可用 hipcc 或未找到 RCCL 头文件(rccl.h/nccl.h)"
  fi

  if [[ "${#ISSUES[@]}" -eq "${issues_before}" ]]; then
    rm -rf -- "${smoke_dir:?}"
  elif ! rmdir -- "${smoke_dir}" 2>/dev/null; then
    # rmdir only succeeds on an empty directory; otherwise logs were written.
    log_info "烟雾测试日志保留在: ${smoke_dir}"
  fi
}

#######################################
# Print the actions, warnings and issues that were collected.
# Returns:   0 when no issue is recorded, 1 otherwise
#######################################
print_summary() {
  local entry

  echo
  log_info "============== 检查/修复结果汇总 =============="
  if [[ "${MODE_FIX}" -eq 1 ]]; then
    echo "执行模式: 检查 + 修复"
  else
    echo "执行模式: 仅检查"
  fi
  echo "ROCm 目标仓库版本: ${ROCM_VERSION}"
  echo

  if [[ ${#ACTIONS[@]} -gt 0 ]]; then
    echo "已执行动作:"
    for entry in "${ACTIONS[@]}"; do
      echo " - ${entry}"
    done
    echo
  fi

  if [[ ${#WARNINGS[@]} -gt 0 ]]; then
    echo "警告:"
    for entry in "${WARNINGS[@]}"; do
      echo " - ${entry}"
    done
    echo
  fi

  if [[ ${#ISSUES[@]} -gt 0 ]]; then
    echo "问题:"
    for entry in "${ISSUES[@]}"; do
      echo " - ${entry}"
    done
    echo
    log_error "仍存在 ${#ISSUES[@]} 个问题,请根据上方提示处理。"
    if [[ "${MODE_FIX}" -eq 1 ]]; then
      echo "提示: 部分问题可能需要重启、重新登录(组权限生效)或安装内核/驱动后再重试。"
    fi
    return 1
  fi

  log_ok "未发现阻塞问题,ROCm / RCCL 处于可用状态。"
  if [[ "${MODE_FIX}" -eq 1 ]]; then
    echo "提示: 如果本次修改了用户组,请重新登录后再使用 ROCm。"
  fi
  return 0
}

#######################################
# Run every check from a clean slate.
# Globals:   ISSUES, WARNINGS (reset, then appended by the checks)
#######################################
do_checks() {
  ISSUES=()
  WARNINGS=()

  detect_platform
  check_sudo_if_needed
  check_amd_gpu
  check_kernel_and_devices
  check_user_groups
  check_rocm_install
  check_rccl_install
  check_ldconfig_rocm
}

#######################################
# Run the repair steps (only in fix mode), asking before each one.
#######################################
do_fix() {
  if [[ "${MODE_FIX}" -ne 1 ]]; then
    return
  fi

  log_info "进入修复流程"
  install_or_fix_by_pkg_manager

  if confirm "是否修复 ldconfig 的 ROCm 库路径配置?"; then
    fix_ldconfig_rocm || add_issue "修复 ldconfig 配置失败"
  else
    add_warning "用户跳过 ldconfig 修复"
  fi

  if confirm "是否将当前用户加入 video/render 组?"; then
    fix_user_groups || add_issue "修复用户组失败"
  else
    add_warning "用户跳过用户组修复"
  fi
}

#######################################
# Entry point: parse options, check, optionally fix and re-check, run the
# smoke tests and print the summary.
# Arguments: the script's arguments
# Returns:   the status of print_summary (1 when issues remain)
#######################################
main() {
  parse_args "$@"
  detect_rocm_version

  # do_checks starts by clearing ISSUES/WARNINGS. Anything recorded outside
  # of it (version fallback warning, fix failures, skipped steps) is carried
  # in these arrays and merged back before the summary.
  local -a kept_issues=("${ISSUES[@]}")
  local -a kept_warnings=("${WARNINGS[@]}")

  log_info "第一轮检查开始"
  do_checks

  if [[ "${MODE_FIX}" -eq 1 ]]; then
    # First-round findings are superseded by the re-check below.
    ISSUES=()
    WARNINGS=()
    do_fix
    kept_issues+=("${ISSUES[@]}")
    kept_warnings+=("${WARNINGS[@]}")

    log_info "第二轮检查开始(修复后复检)"
    do_checks
  fi

  ISSUES=("${kept_issues[@]}" "${ISSUES[@]}")
  WARNINGS=("${kept_warnings[@]}" "${WARNINGS[@]}")

  run_smoke_tests
  print_summary
}

main "$@"