当前位置：首页 > news >正文

2025.11.18上机实验七：K 均值聚类算法实现与测试

news 2026/7/3 0:36:20

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. 加载数据集并进行数据分析

# Load the iris dataset bundled with scikit-learn and print a short summary.
print("Step 1: Loading and analyzing the dataset")
iris = load_iris()
X = iris.data  # feature data
y_true = iris.target  # ground-truth labels

# Data analysis: report the shape, the feature/target names and the first rows.
print(f"Dataset shape: {X.shape}")
print(f"Feature names: {iris.feature_names}")
print(f"Target names: {iris.target_names}")
print("First 5 samples:")
print(pd.DataFrame(X, columns=iris.feature_names).head())

# 2. 使用五折交叉验证进行模型训练和评估

# Fit K-means on each training fold, predict clusters for the held-out fold,
# map cluster ids to class labels by majority vote, and score the result.
print("\nStep 2: K-means clustering with 5-fold cross-validation")
k = 3  # number of clusters, chosen to match the number of iris classes
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation

# Per-fold performance metrics.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}:")

    # Split the features into training and test partitions; only the test
    # labels are needed because K-means is fitted without labels.
    X_train, X_test = X[train_index], X[test_index]
    y_test_true = y_true[test_index]

    # Train the K-means model.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)

    # Predict cluster ids for the test partition.
    y_test_pred = kmeans.predict(X_test)

    # K-means is unsupervised, so its cluster ids need not agree with the
    # true labels. Relabel every cluster with the most frequent true label
    # among the test samples assigned to it (majority vote).
    y_pred_voted = np.zeros_like(y_test_pred)
    for cluster_id in range(k):
        mask = y_test_pred == cluster_id
        if not mask.any():
            # No test sample landed in this cluster: nothing to relabel.
            continue
        # bincount/argmax yields the smallest most-frequent label as a plain
        # scalar, independent of the installed SciPy version.
        y_pred_voted[mask] = np.bincount(y_test_true[mask]).argmax()

    # Compute the performance metrics.
    accuracy = accuracy_score(y_test_true, y_pred_voted)
    precision = precision_score(y_test_true, y_pred_voted, average='weighted')
    recall = recall_score(y_test_true, y_pred_voted, average='weighted')
    f1 = f1_score(y_test_true, y_pred_voted, average='weighted')

    # Store the performance metrics.
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# 3. 计算平均性能指标

# Report the mean and standard deviation of every metric across the folds.
print("\nStep 3: Performance evaluation")
metric_summaries = (
    ("Accuracy", accuracy_scores),
    ("Precision", precision_scores),
    ("Recall", recall_scores),
    ("F1-score", f1_scores),
)
for position, (metric_name, fold_values) in enumerate(metric_summaries):
    # Only the first summary line is preceded by a blank line.
    leading = "\n" if position == 0 else ""
    average = np.mean(fold_values)
    spread = np.std(fold_values)
    print(f"{leading}Average {metric_name}: {average:.4f} ± {spread:.4f}")

# 4. 可视化聚类结果（使用前两个特征）

# Cluster the full dataset and plot true labels next to the relabelled
# clusters, using the first two feature columns as plot axes.
print("\nStep 4: Visualization")
kmeans_full = KMeans(n_clusters=k, random_state=42)
kmeans_full.fit(X)
y_pred_full = kmeans_full.predict(X)

# Match cluster ids to true labels: each cluster takes the most frequent
# true label among its members (majority vote).
y_pred_voted_full = np.zeros_like(y_pred_full)
for cluster_id in range(k):
    mask = y_pred_full == cluster_id
    if not mask.any():
        # Empty cluster: nothing to relabel.
        continue
    # bincount/argmax yields the smallest most-frequent label as a plain
    # scalar, independent of the installed SciPy version.
    y_pred_voted_full[mask] = np.bincount(y_true[mask]).argmax()

plt.figure(figsize=(12, 5))

# Left panel: true labels.
plt.subplot(1, 2, 1)
true_points = plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('True Labels')
plt.colorbar(true_points, ticks=[0, 1, 2], label='Species')

# Right panel: K-means clustering result.
plt.subplot(1, 2, 2)
cluster_points = plt.scatter(X[:, 0], X[:, 1], c=y_pred_voted_full, cmap='viridis')
plt.scatter(kmeans_full.cluster_centers_[:, 0], kmeans_full.cluster_centers_[:, 1],
            s=200, c='red', marker='X', label='Centroids')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('K-means Clustering Results')
# Pass the label scatter explicitly: without a mappable, colorbar() binds to
# the most recent scatter, which is the red centroid overlay with no label data.
plt.colorbar(cluster_points, ticks=[0, 1, 2], label='Cluster')
plt.legend()

plt.tight_layout()
plt.savefig('kmeans_iris_visualization.png')
plt.show()

# 5. K-means算法参数说明

# Print a short explanation of the KMeans parameters used in this script.
# The n_init and algorithm notes are version-qualified because their defaults
# changed in scikit-learn 1.4 and 1.1 respectively.
print("\nStep 5: K-means parameters explanation")
print("KMeans(n_clusters=3, random_state=42)")
print("- n_clusters: 聚类的数量，这里设置为3，与iris数据集的类别数一致")
print("- random_state: 随机种子，确保结果可复现")
print("其他重要参数:")
print("- init: 初始化方法，默认'k-means++'，选择距离较远的初始聚类中心")
print("- n_init: 运行算法的次数，scikit-learn 1.4 起默认'auto'（init='k-means++'时运行1次），此前默认10，选择最好的结果")
print("- max_iter: 单次运行的最大迭代次数，默认300")
print("- tol: 收敛阈值，默认1e-4")
print("- algorithm: 算法选择，scikit-learn 1.1 起默认'lloyd'；更早版本默认'auto'，对密集数据使用'elkan'")

查看全文

http://www.jsqmd.com/news/156268/