# -*- coding: utf-8 -*-
import paramiko
import socket
import time
import subprocess
import os
from config import Config
# utils/ssh_utils.py
import paramiko
import time
# utils/ssh_utils.py - 恢复原来能工作的版本
def ssh_exec(ip, port, user, password, command, timeout=30):
    """Run one command on a remote host over SSH and return its output.

    A fresh password-authenticated connection is opened for every call and
    is always closed before the function returns or raises.

    Args:
        ip: Host name or address handed to ``SSHClient.connect``.
        port: SSH port of the remote host.
        user: Login user name.
        password: Login password.
        command: Command line executed on the remote side.
        timeout: Seconds, used both as the connect timeout and as the
            ``exec_command`` timeout.

    Returns:
        Tuple ``(stdout_str, stderr_str, exit_code)``. Both streams are
        decoded as UTF-8; undecodable bytes are dropped.

    Raises:
        Exception: Any error raised while connecting or executing is
            propagated unchanged to the caller.
    """
    import paramiko

    ssh = paramiko.SSHClient()
    # Unknown host keys are accepted automatically (no known_hosts check).
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(ip, port=port, username=user, password=password, timeout=timeout)
        _stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout)
        stdout_str = stdout.read().decode('utf-8', errors='ignore')
        stderr_str = stderr.read().decode('utf-8', errors='ignore')
        # Read after the streams are drained; blocks until the command exits.
        exit_code = stdout.channel.recv_exit_status()
        return stdout_str, stderr_str, exit_code
    finally:
        # Runs on success, on any error and on KeyboardInterrupt alike, so
        # the connection is never left open.
        ssh.close()
def wait_for_ssh(ip, port, timeout=1200, first_wait=180, log_callback=None):
    """Wait until the SSH port of a host accepts TCP connections.

    The function first sleeps ``first_wait`` seconds, then probes
    ``(ip, port)`` with a plain TCP connect roughly every 30 seconds until
    the port answers or the overall timeout (which includes the initial
    wait) has elapsed. Once the port is open it sleeps another
    ``Config.WAIT_EXTRA_SECONDS`` seconds before reporting success.

    Args:
        ip: Host name or address to probe.
        port: TCP port to probe.
        timeout: Overall budget in seconds; a falsy value (``0``/``None``)
            falls back to 1200.
        first_wait: Seconds to sleep before the first probe.
        log_callback: Optional callable receiving progress messages.

    Returns:
        ``True`` when the port became reachable, ``False`` on timeout.
    """
    RETRY_INTERVAL = 30
    total_timeout = timeout if timeout else 1200
    start = time.time()
    attempt = 0

    def output(msg):
        if log_callback:
            log_callback(msg)

    output(f" ⏳ 等待 {first_wait} 秒让服务器启动...")
    time.sleep(first_wait)
    elapsed = int(time.time() - start)
    output(f" 🔄 开始检测SSH服务(已等待{elapsed}秒)")

    while time.time() - start < total_timeout:
        attempt += 1
        elapsed = int(time.time() - start)
        remaining = total_timeout - elapsed

        try:
            # The context manager closes the socket even if connect_ex raises
            # (e.g. name resolution failure).
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.settimeout(5)
                result = sock.connect_ex((ip, port))
        except OSError:
            # Network-level failure: treat as "not reachable yet". Anything
            # else (KeyboardInterrupt, programming errors) is not swallowed.
            result = None

        if result == 0:
            output(f" ✅ SSH端口已通,等待 {Config.WAIT_EXTRA_SECONDS} 秒系统稳定...")
            time.sleep(Config.WAIT_EXTRA_SECONDS)
            return True

        if attempt % 2 == 0:
            output(f" ⏳ 检测SSH中... (第{attempt}次, 已等待{elapsed}秒, 剩余{remaining}秒)")

        # Never sleep past the deadline: the number of probes is unchanged,
        # only the useless wait after the final probe is cut short.
        time_left = total_timeout - (time.time() - start)
        time.sleep(max(0, min(RETRY_INTERVAL, time_left)))

    output(f" ❌ SSH恢复超时 (已等待{total_timeout}秒,共检测{attempt}次)")
    return False
def wait_for_power_off(ip, log_callback=None, timeout=30):
    """Poll a host with ping until it stops answering (Python 3.6 compatible).

    One echo request is sent per round, with a 2 second pause between
    rounds. A non-zero ping exit status or a ping that runs longer than
    5 seconds counts as "powered off"; any other failure to run ping counts
    as "still alive".

    Args:
        ip: Address of the server to watch.
        log_callback: Optional callable receiving progress messages.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``True`` once the host no longer answers, ``False`` if it was still
        answering when ``timeout`` expired.
    """
    if log_callback:
        log_callback(f"等待服务器 {ip} 断电...")

    # Windows and POSIX ping use different flags for count and wait time.
    if os.name == 'nt':
        ping_cmd = ['ping', '-n', '1', '-w', '1000', ip]
    else:
        ping_cmd = ['ping', '-c', '1', '-W', '1', ip]

    began = time.time()
    while time.time() - began < timeout:
        try:
            completed = subprocess.run(
                ping_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=5,
            )
        except subprocess.TimeoutExpired:
            responding = False
        except Exception:
            # Ping could not be run at all: assume the host is still up.
            responding = True
        else:
            responding = completed.returncode == 0

        if responding:
            time.sleep(2)
            continue

        waited = int(time.time() - began)
        if log_callback:
            log_callback(f"✅ 服务器 {ip} 已断电 (等待 {waited} 秒)")
        return True

    if log_callback:
        log_callback(f"❌ 服务器 {ip} 断电超时 ({timeout} 秒)")
    return False
def check_nvme_health(ip, port, user, password, nvme_disk):
    """Check over SSH whether a single NVMe disk is visible on a host.

    The disk counts as healthy when ``ls <nvme_disk>`` succeeds on the
    remote host or, failing that, when ``nvme list`` output mentions it.

    Args:
        ip: Host name or address of the remote machine.
        port: SSH port.
        user: SSH user name.
        password: SSH password.
        nvme_disk: Device path of the disk to look for. Treated as a literal
            path: it is shell-quoted and matched as a fixed string.

    Returns:
        Tuple ``(healthy, message)``; ``message`` is empty when healthy and
        a description of the problem otherwise.

    Raises:
        Exception: Errors raised by ``ssh_exec`` are propagated.
    """
    import shlex

    not_visible = f"NVMe盘 {nvme_disk} 不可见"

    # An empty path would turn the probe into a bare `ls`, which succeeds
    # on any host and would report a non-existent disk as healthy.
    if not str(nvme_disk or "").strip():
        return False, not_visible

    # Quote the path so spaces or shell metacharacters cannot alter the
    # remote command; `--` stops it from being parsed as an option.
    disk_arg = shlex.quote(str(nvme_disk))

    cmd = f"ls -- {disk_arg} 2>/dev/null && echo 'OK' || echo 'FAIL'"
    stdout, _stderr, _code = ssh_exec(ip, port, user, password, cmd)
    if 'OK' in stdout:
        return True, ""

    # Fallback: the device node may be missing while the controller is
    # still listed by nvme-cli. -F matches the path literally, not as regex.
    cmd2 = f"nvme list 2>/dev/null | grep -F -- {disk_arg}"
    stdout2, _, _ = ssh_exec(ip, port, user, password, cmd2)
    if stdout2.strip():
        return True, ""

    return False, not_visible
