当前位置：首页 > news >正文

用python实现简单的机器学习

news 2026/7/7 10:09:32

接下来我用Python训练一个能识别不同种类花的AI模型作为示例。整个过程就像教小朋友认水果一样直观，读完后你就能理解什么是机器学习。

准备工作：安装必要的工具

安装以下Python库：

pip install numpy pandas matplotlib scikit-learn

完整代码示例

# 导入需要的工具包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report# 设置中文字体和图表样式
# Configure fonts so the Chinese labels used in the charts can be rendered.
# SimHei stays first (original behaviour); the remaining entries are
# fallbacks for systems where SimHei is not installed.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'Arial Unicode MS',
]
# Use an ASCII hyphen for negative tick labels; CJK fonts often lack the
# Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

print("🚀 开始我们的AI机器学习之旅（bbdaxia.com）！")

# Step 1: prepare the "study material" - load the data.
# The iris dataset is obtained through scikit-learn's own loader.
iris = datasets.load_iris()

# Look at what the dataset contains.
print("📊 数据集包含的信息：")
print(f"- 特征名: {iris.feature_names}")  # the flower measurements
print(f"- 目标名: {iris.target_names}")  # the flower species
print(f"- 数据形状: {iris.data.shape}")  # 150 samples, 4 features each

# Convert the data into a more readable DataFrame.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target  # add the species column

print("\n🔍 前5行数据预览：")
print(df.head())

print("\n📈 数据统计信息：")
print(df.describe())

# Step 2: data exploration - look for patterns by eye first
def explore_data():
    """Show a 2x2 grid of scatter plots of iris feature pairs, by species.

    Reads the module-level ``iris`` dataset. Each panel plots one pair of
    feature columns with one colour per species, then the figure is
    displayed with ``plt.show()``. Returns nothing.
    """
    # One large figure holding four panels.
    _, axes = plt.subplots(2, 2, figsize=(12, 10))

    # One colour per species.
    colors = ['red', 'blue', 'green']
    species_names = iris.target_names

    # Feature column pairs to plot.
    feature_combinations = [
        (0, 1),  # sepal length vs sepal width
        (0, 2),  # sepal length vs petal length
        (0, 3),  # sepal length vs petal width
        (2, 3),  # petal length vs petal width
    ]

    # Draw one scatter plot per feature pair.
    for idx, (i, j) in enumerate(feature_combinations):
        ax = axes[idx // 2, idx % 2]
        # Pair names with colours instead of hard-coding the species count.
        for species, (name, color) in enumerate(zip(species_names, colors)):
            mask = iris.target == species
            ax.scatter(
                iris.data[mask, i],
                iris.data[mask, j],
                c=color,
                label=name,
                alpha=0.7,
            )
        ax.set_xlabel(iris.feature_names[i])
        ax.set_ylabel(iris.feature_names[j])
        ax.legend()
        ax.set_title(f'{iris.feature_names[i]} vs {iris.feature_names[j]}')

    plt.tight_layout()
    plt.show()


# Run the data exploration.
print("👀 正在绘制数据分布图...")
explore_data()
print("从图表中我们可以看到，不同种类的花在测量数据上确实有区别！")

# Step 3: prepare training and testing - split the data into piles
# Separate the features (X) from the targets (y).
X = iris.data  # all measurements
y = iris.target  # the matching species labels

# Split the data into a training set and a test set (80% / 20%).
# stratify=y asks for the class proportions to be kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("📚 数据分割完成：")
print(f"- 训练集样本数: {X_train.shape[0]}")
print(f"- 测试集样本数: {X_test.shape[0]}")
print(f"- 特征数量: {X_train.shape[1]}")

print("\n🎯 训练集中各类别分布：")
unique, counts = np.unique(y_train, return_counts=True)
for species, count in zip(unique, counts):
    print(f"  {iris.target_names[species]}: {count}个样本")

# Step 4: create and train the AI model
# Create a logistic-regression classifier (a very common classification
# algorithm).
model = LogisticRegression(random_state=42, max_iter=200)

print("🏋️ 开始训练模型...")
# Train the model!
model.fit(X_train, y_train)

print("✅ 模型训练完成！")

# Step 5: test the model's performance - see how well it learned
# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Compute the accuracy.
accuracy = accuracy_score(y_test, y_pred)

print("\n🎯 模型测试结果：")
print(f"- 准确率: {accuracy:.2f} ({accuracy*100:.1f}%)")
print(f"- 测试样本数: {len(y_test)}")
print(f"- 预测正确的数量: {np.sum(y_pred == y_test)}")

print("\n📊 详细分类报告：")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Build a comparison table of actual vs predicted species.
results_df = pd.DataFrame({
    '实际种类': [iris.target_names[i] for i in y_test],
    '预测种类': [iris.target_names[i] for i in y_pred],
    '是否正确': ['✅' if pred == true else '❌' for pred, true in zip(y_pred, y_test)],
})

print("\n🔍 预测结果详情：")
print(results_df)

# Step 6: visualise the predictions
def visualize_predictions():
    """Show actual vs predicted species of the test set side by side.

    Reads the module-level ``X_test``, ``y_test``, ``y_pred`` and ``iris``.
    Both panels plot feature columns 2 and 3 (petal length / petal width);
    wrong predictions are circled in red on the right-hand panel. The
    figure is displayed with ``plt.show()``. Returns nothing.
    """
    plt.figure(figsize=(12, 5))

    # Pin the colour scale to the full label range so both panels use the
    # same colours and the tick labels always match the colorbar, even if
    # some class never appears among the predictions.
    last_class = len(iris.target_names) - 1

    panels = [('实际种类', y_test), ('预测种类', y_pred)]
    for position, (title, labels) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        scatter = plt.scatter(
            X_test[:, 2],
            X_test[:, 3],
            c=labels,
            cmap='viridis',
            vmin=0,
            vmax=last_class,
        )
        plt.xlabel('花瓣长度 (cm)')
        plt.ylabel('花瓣宽度 (cm)')
        plt.title(title)
        plt.colorbar(scatter, ticks=[0, 1, 2]).set_ticklabels(iris.target_names)

    # Mark wrong predictions. The second (predicted) panel is still the
    # current axes here, so the markers are drawn on it.
    errors = y_pred != y_test
    if np.any(errors):
        plt.scatter(
            X_test[errors, 2],
            X_test[errors, 3],
            facecolors='none',
            edgecolors='red',
            s=200,
            linewidth=2,
            label='预测错误',
        )
        plt.legend()

    plt.tight_layout()
    plt.show()


# Show the prediction visualisation.
print("\n📈 正在绘制预测结果对比图...")
visualize_predictions()

# Step 7: make the model actually useful
# A function that predicts the species of a new sample.
def predict_new_flower(sepal_length, sepal_width, petal_length, petal_width):
    """Predict the species of a flower from its four measurements.

    Uses the module-level ``model`` and ``iris`` objects and prints the
    predicted species, its confidence and the probability of every class.

    Args:
        sepal_length: Sepal length of the new flower.
        sepal_width: Sepal width of the new flower.
        petal_length: Petal length of the new flower.
        petal_width: Petal width of the new flower.

    Returns:
        A ``(species, confidence)`` tuple: the predicted species name and
        the predicted class probability expressed as a percentage.
    """
    # Build a single-row input array.
    new_sample = np.array([[sepal_length, sepal_width, petal_length, petal_width]])

    # Run the prediction.
    prediction = model.predict(new_sample)
    probability = model.predict_proba(new_sample)

    predicted_class = prediction[0]
    species = iris.target_names[predicted_class]
    # NOTE: the class label is used as the probability column index, which
    # assumes the model's classes are 0..n-1 in order.
    confidence = probability[0][predicted_class] * 100

    print("\n🌺 预测结果：")
    print(f"- 种类: {species}")
    print(f"- 置信度: {confidence:.1f}%")

    print("\n所有可能性的概率：")
    for i, prob in enumerate(probability[0]):
        print(f"  {iris.target_names[i]}: {prob*100:.1f}%")

    return species, confidence


print("🔮 现在让我们试试预测新数据！")

# Try a few examples.
print("\n" + "="*50)
print("示例1：")
predict_new_flower(5.1, 3.5, 1.4, 0.2)

print("\n" + "="*50)
print("示例2：")
predict_new_flower(6.7, 3.0, 5.2, 2.3)

print("\n" + "="*50)
print("示例3：")
predict_new_flower(5.9, 2.8, 4.3, 1.3)

# Project summary
def project_summary():
    """Print a recap of the tutorial, the final accuracy and next steps.

    Reads the module-level ``y_test`` and ``y_pred`` to recompute the
    accuracy with ``accuracy_score``. Returns nothing.
    """
    print("\n" + "🌟" * 50)
    print("                   项目总结")
    print("🌟" * 50)

    print("\n📋 我们完成了什么：")
    achievements = [
        "✅ 加载和探索了花数据集",
        "✅ 可视化数据并发现了模式",
        "✅ 将数据分为训练集和测试集",
        "✅ 训练了一个逻辑回归分类器",
        "✅ 评估了模型性能（准确率）",
        "✅ 创建了预测新样本的函数",
    ]
    for achievement in achievements:
        print(achievement)

    print(f"\n🎯 最终模型准确率: {accuracy_score(y_test, y_pred)*100:.1f}%")

    print("\n🚀 下一步可以尝试：")
    next_steps = [
        "• 尝试其他分类算法（如决策树、随机森林）",
        "• 调整模型参数来提升性能",
        "• 处理更复杂的数据集",
        "• 尝试解决回归问题（预测数值）",
    ]
    for step in next_steps:
        print(step)


# Show the project summary.
project_summary()

第一步：准备"学习资料" - 加载数据

# 导入需要的工具包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report# 设置中文字体和图表样式
# Configure fonts so the Chinese labels used in the charts can be rendered.
# SimHei stays first (original behaviour); the remaining entries are
# fallbacks for systems where SimHei is not installed.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'Arial Unicode MS',
]
# Use an ASCII hyphen for negative tick labels; CJK fonts often lack the
# Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

print("🚀 开始我们的AI机器学习之旅！")

# Step 1: prepare the "study material" - load the data.
# Load the iris dataset (the "Hello World" of machine learning).
iris = datasets.load_iris()

# Look at what the dataset contains.
print("📊 数据集包含的信息：")
print(f"- 特征名: {iris.feature_names}")  # the flower measurements
print(f"- 目标名: {iris.target_names}")  # the flower species
print(f"- 数据形状: {iris.data.shape}")  # 150 samples, 4 features each

# Convert the data into a more readable DataFrame.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target  # add the species column

print("\n🔍 前5行数据预览：")
print(df.head())

print("\n📈 数据统计信息：")
print(df.describe())

可以把数据集想象成150张花的“照片”：每个样本都记录了花的4个测量数据（花萼长度、花萼宽度、花瓣长度、花瓣宽度），并且标注了花的种类。

第二步：数据探索 - 用眼睛先看看规律

# Step 2: data exploration - look for patterns by eye first
def explore_data():
    """Show a 2x2 grid of scatter plots of iris feature pairs, by species.

    Reads the module-level ``iris`` dataset. Each panel plots one pair of
    feature columns with one colour per species, then the figure is
    displayed with ``plt.show()``. Returns nothing.
    """
    # One large figure holding four panels.
    _, axes = plt.subplots(2, 2, figsize=(12, 10))

    # One colour per species.
    colors = ['red', 'blue', 'green']
    species_names = iris.target_names

    # Feature column pairs to plot.
    feature_combinations = [
        (0, 1),  # sepal length vs sepal width
        (0, 2),  # sepal length vs petal length
        (0, 3),  # sepal length vs petal width
        (2, 3),  # petal length vs petal width
    ]

    # Draw one scatter plot per feature pair.
    for idx, (i, j) in enumerate(feature_combinations):
        ax = axes[idx // 2, idx % 2]
        # Pair names with colours instead of hard-coding the species count.
        for species, (name, color) in enumerate(zip(species_names, colors)):
            mask = iris.target == species
            ax.scatter(
                iris.data[mask, i],
                iris.data[mask, j],
                c=color,
                label=name,
                alpha=0.7,
            )
        ax.set_xlabel(iris.feature_names[i])
        ax.set_ylabel(iris.feature_names[j])
        ax.legend()
        ax.set_title(f'{iris.feature_names[i]} vs {iris.feature_names[j]}')

    plt.tight_layout()
    plt.show()


# Run the data exploration.
print("👀 正在绘制数据分布图...")
explore_data()
print("从图表中我们可以看到，不同种类的花在测量数据上确实有区别！")

就像看照片时会发现"苹果大多是红色的，橙子是橙色的"一样，从图表中我们能看出不同种类的花在尺寸上确实有规律可循。

第三步：准备训练和测试 - 分堆学习

# Step 3: prepare training and testing - split the data into piles
# Separate the features (X) from the targets (y).
X = iris.data  # all measurements
y = iris.target  # the matching species labels

# Split the data into a training set and a test set (80% / 20%).
# stratify=y asks for the class proportions to be kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("📚 数据分割完成：")
print(f"- 训练集样本数: {X_train.shape[0]}")
print(f"- 测试集样本数: {X_test.shape[0]}")
print(f"- 特征数量: {X_train.shape[1]}")

print("\n🎯 训练集中各类别分布：")
unique, counts = np.unique(y_train, return_counts=True)
for species, count in zip(unique, counts):
    print(f"  {iris.target_names[species]}: {count}个样本")

我们把150张花的照片分成两堆：120张用来"教"AI（训练集），30张留着考考它学得怎么样（测试集）。这样就能检验AI是不是真的学会了，而不是死记硬背。

第四步：创建和训练AI模型

# Step 4: create and train the AI model
# Create a logistic-regression classifier (a very common classification
# algorithm).
model = LogisticRegression(random_state=42, max_iter=200)

print("🏋️ 开始训练模型...")
# Train the model!
model.fit(X_train, y_train)

print("✅ 模型训练完成！")

现在AI"学生"开始学习了！它通过分析120张训练照片，找出花的大小和种类之间的关系，建立自己的"判断规则"。

第五步：测试模型性能 - 看看学得怎么样

# Step 5: test the model's performance - see how well it learned
# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Compute the accuracy.
accuracy = accuracy_score(y_test, y_pred)

print("\n🎯 模型测试结果：")
print(f"- 准确率: {accuracy:.2f} ({accuracy*100:.1f}%)")
print(f"- 测试样本数: {len(y_test)}")
print(f"- 预测正确的数量: {np.sum(y_pred == y_test)}")

print("\n📊 详细分类报告：")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Build a comparison table of actual vs predicted species.
results_df = pd.DataFrame({
    '实际种类': [iris.target_names[i] for i in y_test],
    '预测种类': [iris.target_names[i] for i in y_pred],
    '是否正确': ['✅' if pred == true else '❌' for pred, true in zip(y_pred, y_test)],
})

print("\n🔍 预测结果详情：")
print(results_df)

现在用那30张没学过的照片考考AI，看看它能认对多少。如果准确率在95%左右，就意味着30张照片中它认对了28～29张（28/30≈93.3%，29/30≈96.7%）！

第六步：可视化预测结果

# Step 6: visualise the predictions
def visualize_predictions():
    """Show actual vs predicted species of the test set side by side.

    Reads the module-level ``X_test``, ``y_test``, ``y_pred`` and ``iris``.
    Both panels plot feature columns 2 and 3 (petal length / petal width);
    wrong predictions are circled in red on the right-hand panel. The
    figure is displayed with ``plt.show()``. Returns nothing.
    """
    plt.figure(figsize=(12, 5))

    # Pin the colour scale to the full label range so both panels use the
    # same colours and the tick labels always match the colorbar, even if
    # some class never appears among the predictions.
    last_class = len(iris.target_names) - 1

    panels = [('实际种类', y_test), ('预测种类', y_pred)]
    for position, (title, labels) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        scatter = plt.scatter(
            X_test[:, 2],
            X_test[:, 3],
            c=labels,
            cmap='viridis',
            vmin=0,
            vmax=last_class,
        )
        plt.xlabel('花瓣长度 (cm)')
        plt.ylabel('花瓣宽度 (cm)')
        plt.title(title)
        plt.colorbar(scatter, ticks=[0, 1, 2]).set_ticklabels(iris.target_names)

    # Mark wrong predictions. The second (predicted) panel is still the
    # current axes here, so the markers are drawn on it.
    errors = y_pred != y_test
    if np.any(errors):
        plt.scatter(
            X_test[errors, 2],
            X_test[errors, 3],
            facecolors='none',
            edgecolors='red',
            s=200,
            linewidth=2,
            label='预测错误',
        )
        plt.legend()

    plt.tight_layout()
    plt.show()


# Show the prediction visualisation.
print("\n📈 正在绘制预测结果对比图...")
visualize_predictions()

第七步：让模型真正实用起来

# Step 7: make the model actually useful
# A function that predicts the species of a new sample.
def predict_new_flower(sepal_length, sepal_width, petal_length, petal_width):
    """Predict the species of a flower from its four measurements.

    Uses the module-level ``model`` and ``iris`` objects and prints the
    predicted species, its confidence and the probability of every class.

    Args:
        sepal_length: Sepal length of the new flower.
        sepal_width: Sepal width of the new flower.
        petal_length: Petal length of the new flower.
        petal_width: Petal width of the new flower.

    Returns:
        A ``(species, confidence)`` tuple: the predicted species name and
        the predicted class probability expressed as a percentage.
    """
    # Build a single-row input array.
    new_sample = np.array([[sepal_length, sepal_width, petal_length, petal_width]])

    # Run the prediction.
    prediction = model.predict(new_sample)
    probability = model.predict_proba(new_sample)

    predicted_class = prediction[0]
    species = iris.target_names[predicted_class]
    # NOTE: the class label is used as the probability column index, which
    # assumes the model's classes are 0..n-1 in order.
    confidence = probability[0][predicted_class] * 100

    print("\n🌺 预测结果：")
    print(f"- 种类: {species}")
    print(f"- 置信度: {confidence:.1f}%")

    print("\n所有可能性的概率：")
    for i, prob in enumerate(probability[0]):
        print(f"  {iris.target_names[i]}: {prob*100:.1f}%")

    return species, confidence


print("🔮 现在让我们试试预测新数据！")

# Try a few examples.
print("\n" + "="*50)
print("示例1：")
predict_new_flower(5.1, 3.5, 1.4, 0.2)

print("\n" + "="*50)
print("示例2：")
predict_new_flower(6.7, 3.0, 5.2, 2.3)

print("\n" + "="*50)
print("示例3：")
predict_new_flower(5.9, 2.8, 4.3, 1.3)

完整项目总结

# Project summary
def project_summary():
    """Print a recap of the tutorial, the final accuracy and next steps.

    Reads the module-level ``y_test`` and ``y_pred`` to recompute the
    accuracy with ``accuracy_score``. Returns nothing.
    """
    print("\n" + "🌟" * 50)
    print("                   项目总结")
    print("🌟" * 50)

    print("\n📋 我们完成了什么：")
    achievements = [
        "✅ 加载和探索了鸢尾花数据集",
        "✅ 可视化数据并发现了模式",
        "✅ 将数据分为训练集和测试集",
        "✅ 训练了一个逻辑回归分类器",
        "✅ 评估了模型性能（准确率）",
        "✅ 创建了预测新样本的函数",
    ]
    for achievement in achievements:
        print(achievement)

    print(f"\n🎯 最终模型准确率: {accuracy_score(y_test, y_pred)*100:.1f}%")

    print("\n🚀 下一步可以尝试：")
    next_steps = [
        "• 尝试其他分类算法（如决策树、随机森林）",
        "• 调整模型参数来提升性能",
        "• 处理更复杂的数据集",
        "• 尝试解决回归问题（预测数值）",
    ]
    for step in next_steps:
        print(step)


# Show the project summary.
project_summary()