当前位置: 首页 > news >正文

Go语言自然语言处理:文本处理与分析

Go语言自然语言处理:文本处理与分析

引言

自然语言处理(NLP)是人工智能的重要分支,它使计算机能够理解、处理和生成人类语言。Go语言以其高性能和并发能力,成为构建NLP应用的理想选择。本文将介绍如何使用Go语言进行自然语言处理。

一、文本处理基础

1.1 字符串操作

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// main demonstrates basic string operations on mixed ASCII/CJK text.
func main() {
	text := "Hello, World! 你好,世界!"

	// Length in characters. len(text) would report the number of UTF-8
	// bytes, which overcounts every non-ASCII rune, so count runes instead.
	fmt.Printf("长度: %d\n", utf8.RuneCountInString(text))

	// Lower-case conversion.
	fmt.Printf("小写: %s\n", strings.ToLower(text))

	// Upper-case conversion.
	fmt.Printf("大写: %s\n", strings.ToUpper(text))

	// Split on runs of white space.
	words := strings.Fields(text)
	fmt.Printf("分词: %v\n", words)

	// Replace every occurrence of a substring.
	replaced := strings.ReplaceAll(text, "World", "Go")
	fmt.Printf("替换后: %s\n", replaced)
}

1.2 Unicode处理

package main

import (
	"fmt"
	"unicode"
)

// runeCategory returns the label printed for r: "字母" for letters, "数字"
// for decimal digits, "空格" for white space and "其他" for everything else.
// The checks run in that order, so the first matching category wins.
func runeCategory(r rune) string {
	switch {
	case unicode.IsLetter(r):
		return "字母"
	case unicode.IsDigit(r):
		return "数字"
	case unicode.IsSpace(r):
		return "空格"
	default:
		return "其他"
	}
}

// main prints the category of every character in a sample string.
func main() {
	text := "Hello 世界 123 !"

	// Ranging over a string yields runes (decoded code points), not bytes,
	// so multi-byte characters such as "世" are visited once.
	for _, r := range text {
		fmt.Printf("%c - 类型: ", r)
		fmt.Println(runeCategory(r))
	}
}

二、分词处理

2.1 英文分词

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// nonWordRE matches runs of characters that are neither letters, marks,
// digits, underscores nor ASCII white space. It is compiled once at package
// scope instead of on every tokenize call. Unicode classes are used because
// \w is ASCII-only in Go's regexp syntax and would delete letters such as "é".
var nonWordRE = regexp.MustCompile(`[^\p{L}\p{M}\p{N}_\s]+`)

// tokenize strips punctuation from text, lower-cases it and splits it on
// white space. Punctuation is removed rather than replaced by a space, so
// "don't" becomes "dont". The result is empty when text holds no word
// characters.
func tokenize(text string) []string {
	cleaned := nonWordRE.ReplaceAllString(text, "")
	return strings.Fields(strings.ToLower(cleaned))
}

func main() {
	text := "Hello, World! This is a test sentence."
	tokens := tokenize(text)
	fmt.Printf("分词结果: %v\n", tokens)
}

2.2 中文分词

go get github.com/go-ego/gse
package main import ( "fmt" "github.com/go-ego/gse" ) func main() { seg := gse.New() // 加载词典 err := seg.LoadDict("zh") if err != nil { panic(err) } text := "我爱北京天安门" words := seg.Cut(text, true) fmt.Printf("中文分词结果: %v\n", words) }

三、词频统计

3.1 基础词频统计

package main

import (
	"fmt"
	"sort"
	"strings"
	"unicode"
)

// isEdgeNoise reports whether r is neither a letter nor a digit. It is used
// to trim punctuation and symbols from both ends of a token.
func isEdgeNoise(r rune) bool {
	return !unicode.IsLetter(r) && !unicode.IsDigit(r)
}

// wordFrequency returns how often each word occurs in text. Words are
// lower-cased and stripped of leading/trailing punctuation, so "Go!", "go."
// and "go" are all counted as "go". Punctuation inside a word ("don't") is
// kept. Tokens consisting only of punctuation are ignored.
func wordFrequency(text string) map[string]int {
	freq := make(map[string]int)
	for _, token := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(token, isEdgeNoise)
		if word == "" {
			continue
		}
		freq[word]++
	}
	return freq
}

// sortByFrequency returns the keys of freq ordered by descending count.
// Words with equal counts are ordered alphabetically so that the result is
// deterministic even though map iteration order is not.
func sortByFrequency(freq map[string]int) []string {
	words := make([]string, 0, len(freq))
	for word := range freq {
		words = append(words, word)
	}
	sort.Slice(words, func(i, j int) bool {
		ci, cj := freq[words[i]], freq[words[j]]
		if ci != cj {
			return ci > cj
		}
		return words[i] < words[j]
	})
	return words
}

func main() {
	text := "Hello world! Hello Go! Go is great. Go is fun."
	freq := wordFrequency(text)
	sortedWords := sortByFrequency(freq)

	fmt.Println("词频统计:")
	for _, word := range sortedWords {
		fmt.Printf("%s: %d\n", word, freq[word])
	}
}

3.2 TF-IDF计算

package main

import (
	"fmt"
	"math"
)

// computeTF returns the term frequency of term in doc: the number of
// occurrences divided by the document length. An empty document yields 0
// rather than NaN (0/0).
func computeTF(term string, doc []string) float64 {
	if len(doc) == 0 {
		return 0
	}
	count := 0
	for _, word := range doc {
		if word == term {
			count++
		}
	}
	return float64(count) / float64(len(doc))
}

// computeIDF returns the inverse document frequency of term over docs,
// computed as ln(N / (df + 1)), where N is the number of documents and df is
// the number of documents containing term. The +1 avoids division by zero
// for unseen terms; as a consequence the value is 0 or negative for terms
// that occur in nearly all documents. An empty corpus yields 0 rather than
// -Inf.
func computeIDF(term string, docs [][]string) float64 {
	if len(docs) == 0 {
		return 0
	}
	docCount := 0
	for _, doc := range docs {
		for _, word := range doc {
			if word == term {
				docCount++
				break // count each document at most once
			}
		}
	}
	return math.Log(float64(len(docs)) / float64(docCount+1))
}

// computeTFIDF returns computeTF(term, doc) * computeIDF(term, docs).
func computeTFIDF(term string, doc []string, docs [][]string) float64 {
	return computeTF(term, doc) * computeIDF(term, docs)
}

func main() {
	docs := [][]string{
		{"hello", "world", "go"},
		{"hello", "go", "lang"},
		{"world", "programming"},
	}

	term := "go"
	doc := docs[0]
	tfidf := computeTFIDF(term, doc, docs)
	fmt.Printf("TF-IDF for '%s': %.4f\n", term, tfidf)
}

四、文本分类

4.1 朴素贝叶斯分类器

package main

import (
	"fmt"
	"math"
	"sort"
	"strings"
)

// NaiveBayesClassifier is a multinomial naive Bayes text classifier with
// add-one (Laplace) smoothing. Create instances with NewNaiveBayesClassifier;
// the zero value has nil maps and cannot be trained.
type NaiveBayesClassifier struct {
	classCounts    map[string]int            // documents seen per class
	wordCounts     map[string]map[string]int // class -> word -> occurrences
	classWordTotal map[string]int            // total word occurrences per class
	vocabulary     map[string]struct{}       // every distinct word seen in training
	totalDocuments int                       // documents seen across all classes
}

// NewNaiveBayesClassifier returns an empty classifier ready for Train.
func NewNaiveBayesClassifier() *NaiveBayesClassifier {
	return &NaiveBayesClassifier{
		classCounts:    make(map[string]int),
		wordCounts:     make(map[string]map[string]int),
		classWordTotal: make(map[string]int),
		vocabulary:     make(map[string]struct{}),
	}
}

// Train adds the labelled documents to the model. docs[i] is labelled with
// labels[i]; if the slices differ in length only the first
// min(len(docs), len(labels)) pairs are used instead of panicking on an
// out-of-range index. Documents are lower-cased and split on white space.
// Train may be called repeatedly to add more data.
func (nb *NaiveBayesClassifier) Train(docs []string, labels []string) {
	n := len(docs)
	if len(labels) < n {
		n = len(labels)
	}
	for i := 0; i < n; i++ {
		label := labels[i]
		nb.classCounts[label]++
		nb.totalDocuments++

		counts, ok := nb.wordCounts[label]
		if !ok {
			counts = make(map[string]int)
			nb.wordCounts[label] = counts
		}
		for _, word := range strings.Fields(strings.ToLower(docs[i])) {
			counts[word]++
			nb.classWordTotal[label]++
			nb.vocabulary[word] = struct{}{}
		}
	}
}

// Predict returns the class with the highest log-posterior for doc, or ""
// when the classifier has not been trained. The per-word likelihood is
// (count(word, class) + 1) / (words in class + vocabulary size), i.e. proper
// add-one smoothing over the global vocabulary. Classes are evaluated in
// sorted order so that ties are resolved deterministically (the
// alphabetically first class wins).
func (nb *NaiveBayesClassifier) Predict(doc string) string {
	words := strings.Fields(strings.ToLower(doc))

	classes := make([]string, 0, len(nb.classCounts))
	for class := range nb.classCounts {
		classes = append(classes, class)
	}
	sort.Strings(classes)

	vocabSize := len(nb.vocabulary)
	bestClass := ""
	bestScore := math.Inf(-1)
	for _, class := range classes {
		// Log prior.
		score := math.Log(float64(nb.classCounts[class]) / float64(nb.totalDocuments))

		// Log likelihoods. The denominator is zero only when no word has
		// ever been seen in training; then the prior alone decides.
		denom := float64(nb.classWordTotal[class] + vocabSize)
		if denom > 0 {
			for _, word := range words {
				score += math.Log(float64(nb.wordCounts[class][word]+1) / denom)
			}
		}

		if score > bestScore {
			bestScore = score
			bestClass = class
		}
	}
	return bestClass
}

func main() {
	nb := NewNaiveBayesClassifier()

	docs := []string{
		"I love this movie",
		"Great film, highly recommend",
		"Terrible movie, waste of time",
		"Hated every minute of it",
		"Excellent performance",
		"Poor acting, bad script",
	}
	labels := []string{"positive", "positive", "negative", "negative", "positive", "negative"}

	nb.Train(docs, labels)

	testDoc := "This movie was amazing"
	prediction := nb.Predict(testDoc)
	fmt.Printf("预测结果: %s\n", prediction)
}

五、文本生成

5.1 马尔可夫链文本生成

package main

import (
	"fmt"
	"math/rand"
	"strings"
	"time"
)

// MarkovChain is a word-level Markov text generator. Each state is a
// sequence of `order` consecutive lower-cased words joined by single spaces.
// Create instances with NewMarkovChain.
type MarkovChain struct {
	transitions map[string][]string // state -> words observed after it (with repeats)
	order       int                 // number of words per state, always >= 1
	rng         *rand.Rand          // private source; the global source is left untouched
}

// NewMarkovChain returns an empty chain whose states consist of order words.
// Orders below 1 are raised to 1, because a state needs at least one word.
// Each chain gets its own time-seeded random source instead of reseeding the
// process-wide one through the deprecated rand.Seed.
func NewMarkovChain(order int) *MarkovChain {
	if order < 1 {
		order = 1
	}
	return &MarkovChain{
		transitions: make(map[string][]string),
		order:       order,
		rng:         rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// Train records every (state, next word) transition found in text. The text
// is lower-cased and split on white space; punctuation stays attached to
// words. Text with at most `order` words adds no transitions.
func (mc *MarkovChain) Train(text string) {
	words := strings.Fields(strings.ToLower(text))
	for i := 0; i+mc.order < len(words); i++ {
		key := strings.Join(words[i:i+mc.order], " ")
		mc.transitions[key] = append(mc.transitions[key], words[i+mc.order])
	}
}

// Generate returns up to length space-separated words produced by a random
// walk that starts in a randomly chosen trained state. Generation stops
// early when a state with no recorded successor is reached. It returns ""
// when length is not positive or the chain has not been trained, and never
// returns more than length words, even when length is smaller than the
// chain order.
func (mc *MarkovChain) Generate(length int) string {
	if length <= 0 || len(mc.transitions) == 0 {
		return ""
	}

	// Pick a random starting state.
	keys := make([]string, 0, len(mc.transitions))
	for key := range mc.transitions {
		keys = append(keys, key)
	}
	current := keys[mc.rng.Intn(len(keys))]
	result := strings.Split(current, " ")

	for len(result) < length {
		nextWords := mc.transitions[current]
		if len(nextWords) == 0 {
			break
		}
		result = append(result, nextWords[mc.rng.Intn(len(nextWords))])

		// The new state is the last `order` words generated so far.
		current = strings.Join(result[len(result)-mc.order:], " ")
	}

	// The starting state alone may already exceed the requested length.
	if len(result) > length {
		result = result[:length]
	}
	return strings.Join(result, " ")
}

func main() {
	text := `I love Go programming. Go is a great language. Go is fast and efficient. I love programming in Go.`

	mc := NewMarkovChain(2)
	mc.Train(text)

	generated := mc.Generate(10)
	fmt.Printf("生成文本: %s\n", generated)
}

六、情感分析

6.1 简单情感分析

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// positiveWords is the lexicon of words counted as positive.
var positiveWords = map[string]bool{
	"love": true, "great": true, "excellent": true, "amazing": true,
	"good": true, "best": true, "wonderful": true, "fantastic": true,
}

// negativeWords is the lexicon of words counted as negative.
var negativeWords = map[string]bool{
	"hate": true, "terrible": true, "bad": true, "awful": true,
	"worst": true, "poor": true, "horrible": true,
}

// isEdgeNoise reports whether r is neither a letter nor a digit. It is used
// to trim punctuation from both ends of a token.
func isEdgeNoise(r rune) bool {
	return !unicode.IsLetter(r) && !unicode.IsDigit(r)
}

// analyzeSentiment returns a lexicon-based polarity score in [-1, 1]:
// (positive hits - negative hits) / (positive hits + negative hits).
// It returns 0 when no lexicon word occurs. Words are lower-cased and
// stripped of leading/trailing punctuation before lookup, so "amazing!" and
// "bad." match their lexicon entries. Negation ("not great") is not handled.
func analyzeSentiment(text string) float64 {
	positiveCount := 0
	negativeCount := 0
	for _, token := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(token, isEdgeNoise)
		if positiveWords[word] {
			positiveCount++
		}
		if negativeWords[word] {
			negativeCount++
		}
	}

	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}

func main() {
	texts := []string{
		"I love this movie, it's amazing!",
		"Terrible experience, hated it.",
		"It was okay, not great but not bad.",
	}

	for _, text := range texts {
		score := analyzeSentiment(text)
		sentiment := "中性"
		if score > 0.3 {
			sentiment = "正面"
		} else if score < -0.3 {
			sentiment = "负面"
		}
		fmt.Printf("文本: %s\n情感得分: %.2f (%s)\n\n", text, score, sentiment)
	}
}

七、命名实体识别

7.1 基于规则的NER

package main

import (
	"fmt"
	"regexp"
)

// Entity is one span of text recognised by extractEntities.
type Entity struct {
	Text  string // the matched substring
	Type  string // "EMAIL", "PHONE" or "URL"
	Start int    // byte offset of the first byte of the match
	End   int    // byte offset just past the match
}

// entityRules lists the patterns applied by extractEntities, in the order
// their matches are reported. The expressions are compiled once at package
// scope instead of on every call.
var entityRules = []struct {
	kind string
	re   *regexp.Regexp
}{
	{"EMAIL", regexp.MustCompile(`[\w.-]+@[\w.-]+\.\w+`)},
	{"PHONE", regexp.MustCompile(`\d{3,4}[-.]?\d{4}[-.]?\d{4}`)},
	{"URL", regexp.MustCompile(`https?://[\w.-]+(?:/[\w./-]*)?`)},
}

// extractEntities returns every e-mail address, phone number and URL found
// in text. Results are grouped by type (all e-mails, then phones, then URLs)
// and ordered by position within each group. Start and End are byte offsets
// into text, not character indexes, so they differ from character positions
// when the text contains multi-byte characters. The result is nil when
// nothing matches.
func extractEntities(text string) []Entity {
	var entities []Entity
	for _, rule := range entityRules {
		for _, match := range rule.re.FindAllStringIndex(text, -1) {
			entities = append(entities, Entity{
				Text:  text[match[0]:match[1]],
				Type:  rule.kind,
				Start: match[0],
				End:   match[1],
			})
		}
	}
	return entities
}

func main() {
	text := `联系我们: support@example.com 或拨打 123-4567-8900 更多信息请访问 https://www.example.com/products`

	entities := extractEntities(text)

	fmt.Println("提取的实体:")
	for _, entity := range entities {
		fmt.Printf("类型: %s, 文本: %s, 位置: [%d-%d]\n",
			entity.Type, entity.Text, entity.Start, entity.End)
	}
}

八、文本相似度

8.1 余弦相似度

package main

import (
	"fmt"
	"math"
	"strings"
	"unicode"
)

// isEdgeNoise reports whether r is neither a letter nor a digit. It is used
// to trim punctuation from both ends of a token.
func isEdgeNoise(r rune) bool {
	return !unicode.IsLetter(r) && !unicode.IsDigit(r)
}

// tokenize converts text into a bag-of-words vector: a map from each
// lower-cased word to its number of occurrences. Leading and trailing
// punctuation is stripped so that "Go." and "go" are the same dimension;
// tokens that consist only of punctuation are dropped.
func tokenize(text string) map[string]int {
	freq := make(map[string]int)
	for _, token := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(token, isEdgeNoise)
		if word == "" {
			continue
		}
		freq[word]++
	}
	return freq
}

// dotProduct returns the inner product of two bag-of-words vectors. Words
// missing from v2 contribute 0 because a missing map key reads as zero.
func dotProduct(v1, v2 map[string]int) int {
	sum := 0
	for word, count := range v1 {
		sum += count * v2[word]
	}
	return sum
}

// magnitude returns the Euclidean norm of a bag-of-words vector.
func magnitude(v map[string]int) float64 {
	sum := 0
	for _, count := range v {
		sum += count * count
	}
	return math.Sqrt(float64(sum))
}

// cosineSimilarity returns the cosine of the angle between the bag-of-words
// vectors of text1 and text2. The value lies in [0, 1] up to floating-point
// rounding; it is 0 when either text contains no words.
func cosineSimilarity(text1, text2 string) float64 {
	v1 := tokenize(text1)
	v2 := tokenize(text2)

	mag1 := magnitude(v1)
	mag2 := magnitude(v2)
	if mag1 == 0 || mag2 == 0 {
		return 0
	}
	return float64(dotProduct(v1, v2)) / (mag1 * mag2)
}

func main() {
	text1 := "I love programming in Go"
	text2 := "Go is a great programming language"
	text3 := "Cats are cute animals"

	similarity12 := cosineSimilarity(text1, text2)
	similarity13 := cosineSimilarity(text1, text3)

	fmt.Printf("文本1与文本2相似度: %.4f\n", similarity12)
	fmt.Printf("文本1与文本3相似度: %.4f\n", similarity13)
}

九、实战:文本搜索引擎

package main import ( "fmt" "sort" "strings" ) type Document struct { ID int Title string Body string } type SearchEngine struct { documents []Document index map[string][]int } func NewSearchEngine() *SearchEngine { return &SearchEngine{ documents: make([]Document, 0), index: make(map[string][]int), } } func (se *SearchEngine) AddDocument(doc Document) { se.documents = append(se.documents, doc) docID := len(se.documents) - 1 words := strings.Fields(strings.ToLower(doc.Title + " " + doc.Body)) seen := make(map[string]bool) for _, word := range words { if !seen[word] { se.index[word] = append(se.index[word], docID) seen[word] = true } } } func (se *SearchEngine) Search(query string) []Document { queryWords := strings.Fields(strings.ToLower(query)) // 找到包含所有查询词的文档 var resultIDs []int for i, word := range queryWords { if docIDs, ok := se.index[word]; ok { if i == 0 { resultIDs = docIDs } else { // 求交集 resultIDs = intersect(resultIDs, docIDs) } } else { return []Document{} } } // 获取文档 results := make([]Document, 0, len(resultIDs)) for _, id := range resultIDs { results = append(results, se.documents[id]) } return results } func intersect(a, b []int) []int { result := make([]int, 0) i, j := 0, 0 for i < len(a) && j < len(b) { if a[i] == b[j] { result = append(result, a[i]) i++ j++ } else if a[i] < b[j] { i++ } else { j++ } } return result } func main() { se := NewSearchEngine() se.AddDocument(Document{Title: "Go Programming", Body: "Go is a programming language created by Google"}) se.AddDocument(Document{Title: "Machine Learning", Body: "Machine learning is a subset of AI"}) se.AddDocument(Document{Title: "Go and AI", Body: "Go can be used for AI and machine learning"}) results := se.Search("Go programming") fmt.Println("搜索结果:") for _, doc := range results { fmt.Printf("标题: %s\n内容: %s\n\n", doc.Title, doc.Body) } }

十、总结

本文介绍了如何使用Go语言进行自然语言处理,包括:

  1. 文本处理基础:字符串操作、Unicode处理
  2. 分词处理:英文分词和中文分词
  3. 词频统计:基础词频和TF-IDF计算
  4. 文本分类:朴素贝叶斯分类器
  5. 文本生成:马尔可夫链文本生成
  6. 情感分析:基于词典的情感分析
  7. 命名实体识别:基于规则的NER
  8. 文本相似度:余弦相似度计算
  9. 实战项目:简单文本搜索引擎

通过这些实现,你可以使用Go语言构建各种NLP应用,充分利用Go的性能优势处理大规模文本数据。

http://www.jsqmd.com/news/913210/

相关文章:

  • 光伏螺栓技术全解析:材质选型防腐与售后保障推荐 - 优质品牌商家
  • 基于树莓派5打造硬核便携电脑:从硬件选型到系统配置全攻略
  • 2026光伏螺栓选型推荐及靠谱厂家技术维度解析:河北10.9s钢结构螺栓/河北光伏螺栓/河北六角螺栓/排行一览 - 优质品牌商家
  • STM32F407标准库实战:串口+DMA收发数据,如何设计一个高效的环形缓冲区管理模块?
  • OpenCL GPU内存检测架构设计与实践指南
  • 云克隆多因子检测技术|标准曲线拟合实操教程
  • 惠普EliteDesk SFF主机硬盘位改造:安全扩展第三块3.5寸硬盘
  • 你想何出怎样的SRAM CIM
  • 2026贵阳初升高民办校评测:5校核心指标横向对比 - 优质品牌商家
  • 2026年Q2线上控价服务机构排行及联系方式汇总 - 优质品牌商家
  • 从SBM到超效率SBM:一篇讲清DEA模型家族的区别与Python选型指南
  • 量子视觉场技术:量子计算与计算机视觉的融合创新
  • 破局全厂数据孤岛:移动机器人统一调度与数字孪生演进指南
  • 2026年4g远传水表实测评测:四川超声波水表/四川铜阀门/四川闸阀/四川阀门/四川预付费水表/七大维度选型参考 - 优质品牌商家
  • 20年经验供应商揭秘:小型轧机如何做到高性价比
  • Python 函数完全指南:定义与调用
  • 探秘2026年当下漳州可靠的水果店运营源头公司:全链路赋能新零售 - 2026年企业资讯
  • 光OFDM系统中非线性效应及缓解方法解析【附数据】
  • AI 学习——多 Agent 协作入门
  • 网页切图工具,网格切图,非常方便
  • 基于Arduino与Visuino的线性执行器时序控制系统设计与实现
  • 2026年q2第三方控价选型推荐:线上控价/专业控价/京东控价/化妆品控价/品牌控价/技术与服务双维度解析 - 优质品牌商家
  • 无标识视觉感知下核电厂区外来人员轨迹建模与推演技术解析
  • 别再只懂LSH了:手把手拆解跨模态哈希中的矩阵分解与离散优化(附Python示例)
  • Hotkey Detective:3分钟精准定位Windows热键冲突的终极方案
  • D41: 多租户架构的 AI 服务设计
  • 2026年5月,专业儿童帽企业的硬核实力与深度服务解析 - 2026年企业资讯
  • 两个独立事件的联合概率
  • 下载 | Win10 2021官方精简版,预装应用极少!(5月更新、Win10 IoT LTSC 2021版、适合老电脑)
  • 收藏!AI时代,被淘汰的不是程序员,而是那些不懂“借力”的人!