当前位置: 首页 > news >正文

2025.11.8上机实验二:逻辑回归算法实现与测试

logistic_regression_iris.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Configure Matplotlib so Chinese (CJK) labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font for Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # render '-' correctly with a CJK font

print("步骤1:加载iris数据集并进行基本数据分析")
print("="*50)

加载iris数据集

iris = load_iris()
X = iris.data
y = iris.target

将数据集转换为DataFrame便于分析

df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y
df['target_names'] = [iris.target_names[i] for i in y]

print("数据集基本信息:")
print(f"数据集形状:{X.shape}")
print(f"特征数量:{X.shape[1]}")
print(f"样本数量:{X.shape[0]}")
print(f"类别:{list(iris.target_names)}")
print("\n特征名称:")
for i, name in enumerate(iris.feature_names):
print(f"{i+1}. {name}")

print("\n数据统计描述:")
print(df.describe())

print("\n各类别样本数量:")
print(df['target_names'].value_counts())

print("\n数据前5行:")
print(df.head())

# Persist the basic dataset analysis to a text file for the lab report.
with open('data_analysis_results.txt', 'w', encoding='utf-8') as f:
    f.write("IRIS数据集分析结果\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"数据集形状:{X.shape}\n")
    f.write(f"特征数量:{X.shape[1]}\n")
    f.write(f"样本数量:{X.shape[0]}\n")
    f.write(f"类别:{list(iris.target_names)}\n\n")
    f.write("特征名称:\n")
    for i, name in enumerate(iris.feature_names):
        f.write(f"{i+1}. {name}\n")
    f.write("\n数据统计描述:\n")
    f.write(str(df.describe()) + "\n\n")
    f.write("各类别样本数量:\n")
    f.write(str(df['target_names'].value_counts()) + "\n")

print("\n数据分析结果已保存到 data_analysis_results.txt")
print("\n数据集加载和基本分析完成!")

# ======================================

print("\n步骤2:实现五折交叉验证的逻辑回归模型训练")
print("=" * 50)

# Feature scaler; re-fit on each training fold and applied to its test fold.
scaler = StandardScaler()

# Multinomial (softmax) logistic regression classifier.
logistic_model = LogisticRegression(
    C=1.0,                      # inverse regularization strength; smaller = stronger
    penalty='l2',               # L2 regularization (the default)
    solver='lbfgs',             # optimizer
    multi_class='multinomial',  # softmax multi-class strategy
    max_iter=1000,              # maximum optimizer iterations
    random_state=42,            # fixed seed for reproducibility
)

# Stratified 5-fold splitter preserves class proportions in every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics, plus pooled labels/predictions for the overall confusion matrix.
accuracies = []
precisions = []
recalls = []
f1_scores = []
all_y_test = []
all_y_pred = []

print("\n开始五折交叉验证...")

# Run the stratified 5-fold cross-validation (fold numbers are 1-based).
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n===== 折 {fold} =====")

    # Split train / test for this fold.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    print(f"训练集大小: {X_train.shape[0]}, 测试集大小: {X_test.shape[0]}")

    # Standardize features; fit only on the training fold to avoid leakage.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model and predict on the held-out fold.
    logistic_model.fit(X_train_scaled, y_train)
    y_pred = logistic_model.predict(X_test_scaled)

    # Macro-averaged metrics weight the three classes equally.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Record this fold's metrics.
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    # Pool predictions for the overall confusion matrix.
    all_y_test.extend(y_test)
    all_y_pred.extend(y_pred)

    print(f"折 {fold} 评估结果:")
    print(f"  准确率: {accuracy:.4f}")
    print(f"  精确率: {precision:.4f}")
    print(f"  召回率: {recall:.4f}")
    print(f"  F1值: {f1:.4f}")

    # Detailed per-class report for this fold.
    report = classification_report(y_test, y_pred, target_names=iris.target_names)
    print(f"\n分类报告:\n{report}")

print("\n五折交叉验证训练完成!")

# Save the per-fold cross-validation results to a text file.
with open('cross_validation_results.txt', 'w', encoding='utf-8') as f:
    f.write("五折交叉验证结果\n")
    f.write("=" * 50 + "\n\n")

    for i in range(5):
        f.write(f"折 {i+1} 结果:\n")
        f.write(f"  准确率: {accuracies[i]:.4f}\n")
        f.write(f"  精确率: {precisions[i]:.4f}\n")
        f.write(f"  召回率: {recalls[i]:.4f}\n")
        f.write(f"  F1值: {f1_scores[i]:.4f}\n\n")

print("\n交叉验证结果已保存到 cross_validation_results.txt")

# ======================================

print("\n步骤3:使用五折交叉验证评估模型性能指标")
print("=" * 50)

# Mean of each metric over the five folds.
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)

# Standard deviation of each metric over the five folds.
dev_accuracy = np.std(accuracies)
dev_precision = np.std(precisions)
dev_recall = np.std(recalls)
dev_f1 = np.std(f1_scores)

print("\n五折交叉验证平均性能指标:")
print(f"平均准确率: {avg_accuracy:.4f} ± {dev_accuracy:.4f}")
print(f"平均精确率: {avg_precision:.4f} ± {dev_precision:.4f}")
print(f"平均召回率: {avg_recall:.4f} ± {dev_recall:.4f}")
print(f"平均F1值: {avg_f1:.4f} ± {dev_f1:.4f}")

# Identify the best and worst folds by accuracy (1-based fold numbers).
best_fold = np.argmax(accuracies) + 1
worst_fold = np.argmin(accuracies) + 1

print(f"\n最佳折数: 折 {best_fold} (准确率: {accuracies[best_fold-1]:.4f})")
print(f"最差折数: 折 {worst_fold} (准确率: {accuracies[worst_fold-1]:.4f})")

# Persist the performance summary and model hyper-parameters for the report.
with open('performance_evaluation_summary.txt', 'w', encoding='utf-8') as f:
    f.write("模型性能评估汇总\n")
    f.write("=" * 50 + "\n\n")

    f.write("五折交叉验证平均性能指标:\n")
    f.write(f"平均准确率: {avg_accuracy:.4f} ± {dev_accuracy:.4f}\n")
    f.write(f"平均精确率: {avg_precision:.4f} ± {dev_precision:.4f}\n")
    f.write(f"平均召回率: {avg_recall:.4f} ± {dev_recall:.4f}\n")
    f.write(f"平均F1值: {avg_f1:.4f} ± {dev_f1:.4f}\n\n")

    f.write(f"最佳折数: 折 {best_fold} (准确率: {accuracies[best_fold-1]:.4f})\n")
    f.write(f"最差折数: 折 {worst_fold} (准确率: {accuracies[worst_fold-1]:.4f})\n\n")

    # Hyper-parameters of the logistic regression model.
    f.write("逻辑回归模型参数:\n")
    f.write(f"  正则化强度(C): {logistic_model.C}\n")
    f.write(f"  正则化类型(penalty): {logistic_model.penalty}\n")
    f.write(f"  优化算法(solver): {logistic_model.solver}\n")
    f.write(f"  多分类策略(multi_class): {'multinomial'}\n")
    f.write(f"  最大迭代次数(max_iter): {logistic_model.max_iter}\n")

print("\n模型性能评估完成!")
print("性能评估汇总已保存到 performance_evaluation_summary.txt")

# Inspect the coefficients and intercepts of the fitted model
# (NOTE(review): this is the model from the LAST CV fold, not a refit on all data).
print("\n模型参数分析:")
print("特征权重(系数):")
# coef_ has shape (n_classes, n_features); transpose to iterate per feature.
for feature, coefs in zip(iris.feature_names, logistic_model.coef_.T):
    print(f" {feature}:")
    for j, coef in enumerate(coefs):
        print(f" - 对 {iris.target_names[j]} 的权重: {coef:.4f}")

print("\n模型截距:")
for i, intercept in enumerate(logistic_model.intercept_):
    print(f" {iris.target_names[i]}: {intercept:.4f}")

# ======================================

print("\n步骤4:分析实验结果并生成可视化")
print("=" * 50)

# One large figure holding four panels (2x2 grid).
fig = plt.figure(figsize=(20, 15))

# --- Panel 1: per-fold metric comparison as grouped bars ---
ax1 = plt.subplot(2, 2, 1)
folds = list(range(1, 6))
x = np.arange(len(folds))
width = 0.2  # bar width; four bars per fold centered on each tick

ax1.bar(x - width * 1.5, accuracies, width, label='准确率')
ax1.bar(x - width / 2, precisions, width, label='精确率')
ax1.bar(x + width / 2, recalls, width, label='召回率')
ax1.bar(x + width * 1.5, f1_scores, width, label='F1值')

ax1.set_xlabel('折数')
ax1.set_ylabel('指标值')
ax1.set_title('五折交叉验证各性能指标比较')
ax1.set_xticks(x)
ax1.set_xticklabels(folds)
ax1.legend()
ax1.set_ylim(0.8, 1.05)

# Numeric value labels above each bar.
for i, v in enumerate(accuracies):
    ax1.text(i - width * 1.5, v + 0.01, f'{v:.3f}', ha='center')
for i, v in enumerate(precisions):
    ax1.text(i - width / 2, v + 0.01, f'{v:.3f}', ha='center')
for i, v in enumerate(recalls):
    ax1.text(i + width / 2, v + 0.01, f'{v:.3f}', ha='center')
for i, v in enumerate(f1_scores):
    ax1.text(i + width * 1.5, v + 0.01, f'{v:.3f}', ha='center')

# --- Panel 2: feature importance (mean |coefficient| across classes) ---
ax2 = plt.subplot(2, 2, 2)

# Average absolute coefficient per feature, sorted ascending for a horizontal bar chart.
feature_importance = np.mean(np.abs(logistic_model.coef_), axis=0)
sorted_idx = np.argsort(feature_importance)

ax2.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax2.set_yticks(range(len(sorted_idx)))
ax2.set_yticklabels([iris.feature_names[i] for i in sorted_idx])
ax2.set_xlabel('平均特征重要性(系数绝对值)')
ax2.set_title('逻辑回归模型特征重要性分析')

# --- Panel 3: confusion matrix pooled over all five test folds ---
from sklearn.metrics import confusion_matrix

ax3 = plt.subplot(2, 2, 3)

cm = confusion_matrix(all_y_test, all_y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names,
            ax=ax3)
ax3.set_xlabel('预测类别')
ax3.set_ylabel('真实类别')
ax3.set_title('总混淆矩阵')

# --- Panel 4: radar chart of the mean metrics ---
ax4 = plt.subplot(2, 2, 4, polar=True)

metrics = ['准确率', '精确率', '召回率', 'F1值']
values = [avg_accuracy, avg_precision, avg_recall, avg_f1]

angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
values = values + values[:1]  # repeat the first point to close the polygon
angles = angles + angles[:1]

ax4.plot(angles, values, 'o-', linewidth=2)
ax4.fill(angles, values, alpha=0.25)
ax4.set_thetagrids(np.degrees(angles[:-1]), metrics)
ax4.set_ylim(0.8, 1.0)
ax4.set_title('模型平均性能雷达图')
ax4.grid(True)

plt.tight_layout()
plt.savefig('experiment_visualizations.png', dpi=300, bbox_inches='tight')
print("\n可视化图表已保存到 experiment_visualizations.png")

# --- Separate figure: per-feature histograms, one subplot per feature,
#     overlaid per class to show separability ---
plt.figure(figsize=(15, 10))
for i, feature in enumerate(iris.feature_names):
    plt.subplot(2, 2, i + 1)
    for target in range(3):
        plt.hist(df[df['target'] == target][feature], alpha=0.5,
                 label=iris.target_names[target])
    plt.xlabel(feature)
    plt.ylabel('频次')
    plt.title(f'{feature} 分布图')
    plt.legend()

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
print("特征分布图已保存到 feature_distributions.png")

print("\n实验结果可视化完成!")
print("生成的可视化图表:")
print("1. experiment_visualizations.png - 包含性能指标比较、特征重要性、混淆矩阵和雷达图")
print("2. feature_distributions.png - 各特征在不同类别中的分布情况")

http://www.jsqmd.com/news/156256/

相关文章:

  • YOLOv11训练日志解读:loss下降趋势正常吗?
  • 编译原理中**语法制导翻译**(Syntax-Directed Translation, SDT)在中间代码生成阶段的核心机制
  • PyTorch模型量化压缩指南,降低推理所需Token数
  • openEuler集群 Chrony 时间同步实战:从零构建高精度分布式时钟体系
  • Jupyter Notebook单元格执行顺序陷阱及避免方法
  • 基于Python的新能源汽车美容洗车预约系统vue
  • 027.归并排序
  • 2025.11.10上机实验三:C4.5(带有预剪枝和后剪枝)算法实现与测试
  • 中信银行信用卡中心Android高级研发工程师岗位深度解析与技术面试指南
  • 上位机是什么意思:工业4.0中OPC UA协议的应用
  • 2025.10.30非遗声景漫游馆(项目架构文档)
  • 清华大学开源镜像站配置PyTorch源的方法详解
  • 2025最新!8款AI论文软件测评:本科生毕业论文写作全攻略
  • SHEIN高级/资深iOS研发工程师:技术深度解析与面试指南
  • SSH免密登录PyTorch服务器,提高开发效率
  • AI原生应用领域下的AI工作流最佳实践
  • 2025.11.3社区智慧共享资源管理系统(项目概述文档)
  • 2025.10.31非遗声景漫游馆(技术实现文档)
  • 文法定义了一个典型的表达式文法,支持加法和乘法,具有左递归以实现左结合
  • 从文法的开始符号出发,尝试通过一系列最左推导,构造出与输入串完全匹配的语法树
  • 2025.11.5社区智慧共享资源管理系统(部署和运行文档)
  • 2025.10.28校园绿色能源监测与管理MIS系统(功能模块)
  • PyTorch-CUDA-v2.6镜像更新日志:新增支持哪些功能?
  • Springmvc的底层原理流程描述
  • (旧文)聊聊在Android跑RPG Maker游戏那点事
  • 布尔表达式的文法与代码结构在编译原理中属于**中间代码生成**阶段的重要内容
  • 2025.11.1非遗声景漫游馆(用户使用文档)
  • 2025.10.29校园绿色能源监测与管理MIS系统(部署和运行指南)
  • 2025.11.2非遗声景漫游馆(项目完成报告)
  • 2025.10.25故事生成系统介绍