当前位置: 首页 > news >正文

Go语言机器学习实战:聚类算法与无监督学习

Go语言机器学习实战:聚类算法与无监督学习

无监督学习是机器学习的重要分支,它从无标签数据中发现模式和结构。聚类算法是无监督学习的核心,本文将深入探讨如何使用Go语言实现常见的聚类算法。

一、聚类算法概述

聚类是将数据点分组的过程,使得同一组内的数据点相似度较高,而不同组之间的数据点相似度较低。常见的聚类算法包括:

  • K-Means:基于距离的划分方法,简单高效
  • 层次聚类:构建层次化的聚类树
  • DBSCAN:基于密度的聚类,能发现任意形状的簇
  • 高斯混合模型:概率模型,考虑数据的分布

二、K-Means聚类实现

2.1 算法原理

K-Means算法的核心思想是:

  1. 随机选择K个初始质心
  2. 将每个数据点分配到最近的质心
  3. 重新计算每个簇的质心
  4. 重复步骤2-3,直到收敛(质心不再变化)或达到最大迭代次数

2.2 Go语言实现

package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// Point is a single sample described by a vector of numeric features.
type Point struct {
	Features []float64
}

// KMeans holds the configuration and the fitted centroids of a K-Means model.
type KMeans struct {
	K         int     // number of clusters
	Centroids []Point // cluster centers; populated by fit
	MaxIter   int     // upper bound on assignment/update rounds
}

// NewKMeans returns an unfitted model with k clusters that runs at most
// maxIter iterations.
func NewKMeans(k, maxIter int) *KMeans {
	return &KMeans{K: k, MaxIter: maxIter}
}

// distance returns the Euclidean distance between p1 and p2, taken over the
// features of p1. p2 must have at least as many features as p1.
func (km *KMeans) distance(p1, p2 Point) float64 {
	var sum float64
	for i, v := range p1.Features {
		d := v - p2.Features[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// fit runs K-Means on points and stores the resulting centroids in
// km.Centroids.
//
// Initial centroids are copies of distinct input points (as long as
// K <= len(points)), so two centroids never start on the same sample. When
// K exceeds the number of points, the remaining centroids reuse points in the
// same shuffled order. If points is empty or K <= 0, fit returns without
// touching km.Centroids.
//
// Iteration stops when every centroid moved by at most 1e-6 or after MaxIter
// rounds, whichever comes first.
func (km *KMeans) fit(points []Point) {
	n := len(points)
	if n == 0 || km.K <= 0 {
		return
	}

	// A private generator avoids the deprecated global rand.Seed and keeps
	// this call from reseeding other users of math/rand.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	order := rng.Perm(n)

	km.Centroids = make([]Point, km.K)
	for i := range km.Centroids {
		src := points[order[i%n]]
		// Copy the features so centroids never alias caller-owned slices.
		km.Centroids[i] = Point{Features: append([]float64(nil), src.Features...)}
	}

	for iter := 0; iter < km.MaxIter; iter++ {
		// Assignment step: each point goes to its nearest centroid.
		clusters := make([][]Point, km.K)
		for _, p := range points {
			c := km.predict(p)
			clusters[c] = append(clusters[c], p)
		}

		// Update step: move each centroid to the mean of its members.
		prev := make([]Point, km.K)
		copy(prev, km.Centroids)
		for i, cluster := range clusters {
			if len(cluster) == 0 {
				// An empty cluster keeps its previous centroid.
				continue
			}
			mean := make([]float64, len(cluster[0].Features))
			for _, p := range cluster {
				for j, v := range p.Features {
					mean[j] += v
				}
			}
			for j := range mean {
				mean[j] /= float64(len(cluster))
			}
			km.Centroids[i] = Point{Features: mean}
		}

		// Convergence check: stop once no centroid moved noticeably.
		converged := true
		for i := range km.Centroids {
			if km.distance(km.Centroids[i], prev[i]) > 1e-6 {
				converged = false
				break
			}
		}
		if converged {
			fmt.Printf("算法在第%d次迭代收敛\n", iter+1)
			break
		}
	}
}

// predict returns the index of the centroid closest to point. Ties go to the
// lowest index. It returns 0 when the model has no centroids.
func (km *KMeans) predict(point Point) int {
	best := 0
	bestDist := math.MaxFloat64
	for i, c := range km.Centroids {
		if d := km.distance(point, c); d < bestDist {
			bestDist = d
			best = i
		}
	}
	return best
}

2.3 使用示例

func main() { // 生成模拟数据 points := []Point{ {Features: []float64{1, 2}}, {Features: []float64{2, 1}}, {Features: []float64{2, 3}}, {Features: []float64{8, 7}}, {Features: []float64{9, 8}}, {Features: []float64{7, 9}}, {Features: []float64{15, 16}}, {Features: []float64{16, 15}}, {Features: []float64{17, 17}}, } kmeans := NewKMeans(3, 100) kmeans.fit(points) fmt.Println("质心坐标:") for i, centroid := range kmeans.Centroids { fmt.Printf("簇%d: %v\n", i, centroid.Features) } // 预测新数据点 testPoint := Point{Features: []float64{10, 10}} cluster := kmeans.predict(testPoint) fmt.Printf("数据点(10, 10)属于簇%d\n", cluster) }

三、DBSCAN聚类实现

3.1 算法原理

DBSCAN(Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法:

  • 核心点:在指定半径ε内有至少MinPts个邻居
  • 边界点:在ε内邻居数少于MinPts,但属于某个核心点的邻域
  • 噪声点:既不是核心点也不是边界点

3.2 Go语言实现

type DBSCAN struct { Epsilon float64 MinPts int labels []int } func NewDBSCAN(epsilon float64, minPts int) *DBSCAN { return &DBSCAN{ Epsilon: epsilon, MinPts: minPts, } } func (d *DBSCAN) fit(points []Point) { n := len(points) d.labels = make([]int, n) for i := range d.labels { d.labels[i] = -1 // -1表示未访问 } clusterID := 0 for i := 0; i < n; i++ { if d.labels[i] != -1 { continue } neighbors := d.rangeQuery(points, i) if len(neighbors) < d.MinPts { d.labels[i] = 0 // 标记为噪声 continue } // 扩展簇 d.expandCluster(points, i, neighbors, clusterID) clusterID++ } } func (d *DBSCAN) rangeQuery(points []Point, idx int) []int { neighbors := []int{} for i := 0; i < len(points); i++ { if i == idx { continue } dist := d.distance(points[idx], points[i]) if dist <= d.Epsilon { neighbors = append(neighbors, i) } } return neighbors } func (d *DBSCAN) expandCluster(points []Point, idx int, neighbors []int, clusterID int) { d.labels[idx] = clusterID queue := neighbors for len(queue) > 0 { current := queue[0] queue = queue[1:] if d.labels[current] == 0 { d.labels[current] = clusterID } if d.labels[current] != -1 { continue } d.labels[current] = clusterID currentNeighbors := d.rangeQuery(points, current) if len(currentNeighbors) >= d.MinPts { queue = append(queue, currentNeighbors...) } } } func (d *DBSCAN) distance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

四、层次聚类

4.1 算法原理

层次聚类有两种策略:

  • 凝聚式:从单个点开始,逐步合并相似的簇
  • 分裂式:从整个数据集开始,逐步分裂

4.2 Go语言实现

type HierarchicalClustering struct { linkage string // "single", "complete", "average" } func NewHierarchicalClustering(linkage string) *HierarchicalClustering { return &HierarchicalClustering{linkage: linkage} } func (hc *HierarchicalClustering) fit(points []Point) [][]Point { // 初始化每个点为一个簇 clusters := make([][]Point, len(points)) for i, point := range points { clusters[i] = []Point{point} } for len(clusters) > 1 { minDist := math.MaxFloat64 mergeI, mergeJ := 0, 1 // 找到距离最近的两个簇 for i := 0; i < len(clusters); i++ { for j := i + 1; j < len(clusters); j++ { dist := hc.clusterDistance(clusters[i], clusters[j]) if dist < minDist { minDist = dist mergeI, mergeJ = i, j } } } // 合并两个簇 merged := append(clusters[mergeI], clusters[mergeJ]...) clusters = append(clusters[:mergeJ], clusters[mergeJ+1:]...) clusters[mergeI] = merged } return clusters } func (hc *HierarchicalClustering) clusterDistance(c1, c2 []Point) float64 { switch hc.linkage { case "single": return hc.singleLinkage(c1, c2) case "complete": return hc.completeLinkage(c1, c2) case "average": return hc.averageLinkage(c1, c2) default: return hc.singleLinkage(c1, c2) } } func (hc *HierarchicalClustering) singleLinkage(c1, c2 []Point) float64 { minDist := math.MaxFloat64 for _, p1 := range c1 { for _, p2 := range c2 { dist := hc.pointDistance(p1, p2) if dist < minDist { minDist = dist } } } return minDist } func (hc *HierarchicalClustering) completeLinkage(c1, c2 []Point) float64 { maxDist := 0.0 for _, p1 := range c1 { for _, p2 := range c2 { dist := hc.pointDistance(p1, p2) if dist > maxDist { maxDist = dist } } } return maxDist } func (hc *HierarchicalClustering) averageLinkage(c1, c2 []Point) float64 { var sumDist float64 count := 0 for _, p1 := range c1 { for _, p2 := range c2 { sumDist += hc.pointDistance(p1, p2) count++ } } return sumDist / float64(count) } func (hc *HierarchicalClustering) pointDistance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += 
math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

五、高斯混合模型(GMM)

5.1 算法原理

GMM假设数据来自多个高斯分布的混合,使用EM算法进行参数估计。下面的实现针对一维数据。

5.2 Go语言实现

// Gaussian is a one-dimensional normal distribution.
type Gaussian struct {
	Mean   float64
	StdDev float64
}

// pdf returns the probability density of the distribution at x.
// StdDev must be positive; a zero StdDev yields NaN or Inf.
func (g *Gaussian) pdf(x float64) float64 {
	d := x - g.Mean
	return math.Exp(-(d*d)/(2*g.StdDev*g.StdDev)) / (g.StdDev * math.Sqrt(2*math.Pi))
}

// GMM is a one-dimensional Gaussian mixture model.
type GMM struct {
	Gaussians []Gaussian // mixture components
	Weights   []float64  // mixing weight of each component
}

// NewGMM returns a mixture with k components, all zero-valued until fit runs.
func NewGMM(k int) *GMM {
	return &GMM{
		Gaussians: make([]Gaussian, k),
		Weights:   make([]float64, k),
	}
}

// fit estimates the mixture parameters from data by running maxIter rounds of
// expectation-maximization.
//
// Component means are initialized from evenly spaced samples of data, with
// unit standard deviation and equal weights. If data is empty or the model
// has no components, fit returns without changing anything.
//
// To keep the parameters finite, a component's standard deviation is never
// allowed to drop below a small floor, a sample that has zero density under
// every component is shared equally between them, and a component that
// receives no responsibility keeps its previous mean and standard deviation.
func (gmm *GMM) fit(data []float64, maxIter int) {
	// Floor for the standard deviation: without it a component that ends up
	// covering identical samples collapses to StdDev == 0 and pdf returns NaN.
	const minStdDev = 1e-6

	n := len(data)
	k := len(gmm.Gaussians)
	if n == 0 || k == 0 {
		return
	}

	// Initialization.
	for i := range gmm.Gaussians {
		gmm.Gaussians[i] = Gaussian{Mean: data[i*n/k], StdDev: 1.0}
		gmm.Weights[i] = 1.0 / float64(k)
	}

	// resp[i][j] is the responsibility of component i for sample j. Every
	// entry is overwritten in each E-step, so one allocation is enough.
	resp := make([][]float64, k)
	for i := range resp {
		resp[i] = make([]float64, n)
	}

	for iter := 0; iter < maxIter; iter++ {
		// E-step: posterior probability of each component for each sample.
		for j, x := range data {
			var sum float64
			for i := range gmm.Gaussians {
				r := gmm.Weights[i] * gmm.Gaussians[i].pdf(x)
				resp[i][j] = r
				sum += r
			}
			if !(sum > 0) || math.IsInf(sum, 0) {
				// Densities underflowed (or are not finite): normalizing
				// would produce NaN, so share the sample equally instead.
				for i := range resp {
					resp[i][j] = 1.0 / float64(k)
				}
				continue
			}
			for i := range resp {
				resp[i][j] /= sum
			}
		}

		// M-step: re-estimate weight, mean and spread of every component.
		for i := range gmm.Gaussians {
			var weightSum, meanSum float64
			for j, x := range data {
				weightSum += resp[i][j]
				meanSum += resp[i][j] * x
			}
			gmm.Weights[i] = weightSum / float64(n)
			if !(weightSum > 0) {
				// No sample supports this component; dividing would give NaN.
				continue
			}

			mean := meanSum / weightSum
			var varSum float64
			for j, x := range data {
				d := x - mean
				varSum += resp[i][j] * d * d
			}
			gmm.Gaussians[i].Mean = mean
			gmm.Gaussians[i].StdDev = math.Max(math.Sqrt(varSum/weightSum), minStdDev)
		}
	}
}

六、聚类评估指标

func SilhouetteScore(points []Point, labels []int) float64 { n := len(points) silhouette := make([]float64, n) for i := 0; i < n; i++ { // 计算a(i):同一簇内其他点的平均距离 var a float64 sameCluster := []Point{} for j := 0; j < n; j++ { if i != j && labels[j] == labels[i] { sameCluster = append(sameCluster, points[j]) } } if len(sameCluster) > 0 { for _, p := range sameCluster { a += distance(points[i], p) } a /= float64(len(sameCluster)) } // 计算b(i):最近簇的平均距离 b := math.MaxFloat64 clusters := make(map[int][]Point) for j := 0; j < n; j++ { if j != i { clusters[labels[j]] = append(clusters[labels[j]], points[j]) } } for _, cluster := range clusters { if len(cluster) > 0 { var distSum float64 for _, p := range cluster { distSum += distance(points[i], p) } avgDist := distSum / float64(len(cluster)) if avgDist < b { b = avgDist } } } silhouette[i] = (b - a) / math.Max(a, b) } var score float64 for _, s := range silhouette { score += s } return score / float64(n) } func distance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

七、总结

本文介绍了四种经典的聚类算法及其Go语言实现:

  1. K-Means:简单高效,适合大规模数据
  2. DBSCAN:基于密度,能发现任意形状的簇
  3. 层次聚类:构建层次化结构,无需指定K值
  4. 高斯混合模型:概率模型,考虑数据分布

每种算法都有其适用场景,选择合适的聚类算法需要考虑数据特性和业务需求。Go语言的高性能特性使其成为处理大规模数据聚类的理想选择。

http://www.jsqmd.com/news/912866/

相关文章:

  • E图提取技术与e-boost框架在EDA中的高效应用
  • 豆包优化怎么选才稳妥?细数企业高频踩坑问题,三家服务商实测参考 - 玖叁鹿
  • 2026年节日限定盲盒毛绒玩具怎么挑:五家优选品牌解析 - 科技焦点
  • 打工人实用参考!优质项目汇报PPT制作工具汇总
  • 告别CentOS思维:在银河麒麟V10上用源码编译PHP的正确姿势
  • Kubernetes分布式追踪与链路分析:实现全链路可观测性
  • 并网逆变器开发实战:从PR控制器到GaN功率级的设计与爆炸复盘
  • 如何快速下载百度文库等30+平台文档:终极免费文档获取指南
  • FEMTO-ST轴承数据集深度使用指南:避开新手处理振动信号的5个常见坑
  • 名家字画回收,丰宝斋上门服务,让艺术瑰宝重焕光彩 - 深鉴新闻
  • GaiaNet Chat从零上手:去中心化AI聊天应用实战指南
  • 如何选择家用SUV车型?2026年5月推荐TOP5对比家庭出行案例评测价格 - 品牌推荐
  • Windows内存管理优化方案:Mem Reduct深度解析与实践指南
  • 十分钟掌握暗黑2存档修改:d2s-editor终极指南让游戏体验焕然一新
  • 基于树莓派Zero与Fusion 360的复古掌机DIY全流程指南
  • 2026年盲盒毛绒玩具收藏保值指南:五家优选品牌解析 - 科技焦点
  • 【Claude创新方案生成黄金法则】:基于237个真实项目验证的4维质量评估模型(含可复用评分表)
  • 从天气预报到股票预测:MA模型在真实业务场景中到底怎么用?(以销售预测为例)
  • 从Simulink仿真到SVM分类:电力故障数据生成与模型部署避坑指南
  • 2026年济南宣传片拍摄/山东宣传片制作榜单:企业影视制作与创意视觉深度推荐 - 品牌企业推荐师(官方)
  • 基于树莓派Zero W打造GTA风格车载FM发射器:硬件改造与Python控制
  • AI Agent如何考虑港口物流调度中的复杂变量?2026企业级智能体技术路径深度测评
  • 2026年薪酬设计公司推荐:这几家靠谱又专业
  • Forlinx OKMX93xx平台Linux 6.1.36下GPIO操作全解析:从设备树到libgpiod
  • 成都钣金折弯焊接技术解析与权威厂家实测指南:成都非标钣金定制加工、成都二分类垃圾箱、成都仿古垃圾箱、成都分类垃圾箱选择指南 - 优质品牌商家
  • Win11/Win10深度学习环境搭建:实测PyCharm远程连接WSL2下的CUDA,性能比虚拟机强多少?
  • 别再只盯着KL散度了!用Python手把手教你实现MMD,搞定迁移学习中的分布差异度量
  • Claude调用OR-Tools求解器的隐藏API文档(内部泄露版):5个未公开参数让求解速度提升3.2倍
  • 2026年物联网GEO优化公司哪家好?“全意图”占领AI心智 - GEO优化
  • 2026年工业控制GEO优化公司排行榜:谁在AI搜索时代真正掌握“工业品选型”的话语权? - GEO优化