NLP Sentiment Analysis: From Traditional Methods to Deep Learning
1. Technical Overview
1.1 Sentiment Analysis Tasks
| Type | Description | Typical Application |
|---|---|---|
| Binary | positive/negative | review analysis |
| Three-class | positive/neutral/negative | public opinion monitoring |
| Multi-label | mixture of multiple emotions | complex text |
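To make the distinction concrete, here is a minimal sketch of how the output formats differ (the logits, label set, and 0.5 threshold are illustrative assumptions, not values from a trained model): a multi-class head picks exactly one label via softmax, while a multi-label head scores each label independently via sigmoid.

```python
import torch

# Hypothetical logits for one text over three emotion labels
logits = torch.tensor([2.1, -0.7, 1.3])
labels = ['joy', 'anger', 'surprise']

# Multi-class (binary/three-way): softmax + argmax picks exactly one label
single = labels[torch.softmax(logits, dim=0).argmax().item()]           # 'joy'

# Multi-label: independent sigmoids; keep every label above a threshold
multi = [l for l, p in zip(labels, torch.sigmoid(logits)) if p > 0.5]   # ['joy', 'surprise']
```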
1.2 Method Comparison
| Method | Characteristics | Performance |
|---|---|---|
| Lexicon-based | sentiment lexicon lookup | moderate |
| Traditional ML | TF-IDF + SVM | good |
| Deep learning | Word2Vec + CNN/RNN | excellent |
| Pre-trained models | BERT, etc. | best |
2. Core Implementations
2.1 Lexicon-Based Method
```python
from nltk.sentiment import SentimentIntensityAnalyzer


class LexiconSentimentAnalyzer:
    def __init__(self):
        import nltk
        nltk.download('vader_lexicon', quiet=True)  # fetch the VADER lexicon once
        self.analyzer = SentimentIntensityAnalyzer()

    def analyze(self, text):
        scores = self.analyzer.polarity_scores(text)
        # Classify by the compound score using the standard ±0.05 cutoffs
        if scores['compound'] >= 0.05:
            return "positive", scores['compound']
        elif scores['compound'] <= -0.05:
            return "negative", scores['compound']
        else:
            return "neutral", scores['compound']


analyzer = LexiconSentimentAnalyzer()
result, score = analyzer.analyze("I love this product! It's amazing.")
print(f"Sentiment: {result}, Score: {score}")
```
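VADER's compound score is normalized to [-1, 1], and the ±0.05 cutoffs above follow the thresholds recommended by the VADER authors. Note that the VADER lexicon covers English text only.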
2.2 Traditional Machine Learning Method
```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


class TraditionalSentimentAnalyzer:
    def __init__(self):
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=10000,
                ngram_range=(1, 2),       # unigrams and bigrams
                stop_words='english'
            )),
            ('clf', LinearSVC(C=1.0))
        ])

    def train(self, texts, labels):
        self.pipeline.fit(texts, labels)

    def predict(self, texts):
        return self.pipeline.predict(texts)

    def predict_proba(self, texts):
        # LinearSVC has no predict_proba; squash the decision margin
        # through a sigmoid as a rough pseudo-probability (binary case)
        decision = self.pipeline.decision_function(texts)
        proba = 1 / (1 + np.exp(-decision))
        return np.column_stack([1 - proba, proba])


analyzer = TraditionalSentimentAnalyzer()
analyzer.train(train_texts, train_labels)   # train/test data prepared elsewhere
predictions = analyzer.predict(test_texts)
```
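The sigmoid trick above yields only a rough pseudo-probability. As an alternative sketch (assuming the same TF-IDF settings as above), scikit-learn's CalibratedClassifierCV fits a proper probability calibration on top of LinearSVC via cross-validation:

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

calibrated_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2),
                              stop_words='english')),
    # Platt scaling ('sigmoid') fitted with 5-fold CV on the training data
    ('clf', CalibratedClassifierCV(LinearSVC(C=1.0), method='sigmoid', cv=5)),
])

# calibrated_pipeline.fit(train_texts, train_labels)
# proba = calibrated_pipeline.predict_proba(test_texts)  # shape: (n_samples, n_classes)
```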
2.3 Deep Learning Method
```python
import torch
import torch.nn as nn


class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_classes=2,
                 num_filters=100, filter_sizes=[3, 4, 5]):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One convolution branch per n-gram window size
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim))
            for k in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)   # (batch, seq_len, embed_dim)
        x = x.unsqueeze(1)      # add channel dim: (batch, 1, seq_len, embed_dim)
        conv_outputs = []
        for conv in self.convs:
            conv_out = torch.relu(conv(x))
            # Max-pool over the entire sequence dimension
            pooled = torch.max_pool2d(conv_out, (conv_out.size(2), 1))
            pooled = pooled.squeeze(3).squeeze(2)   # (batch, num_filters)
            conv_outputs.append(pooled)
        concat = torch.cat(conv_outputs, dim=1)
        output = self.dropout(concat)
        return self.fc(output)


class DeepSentimentAnalyzer:
    def __init__(self, vocab_size):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = TextCNN(vocab_size).to(self.device)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0
        for texts, labels in dataloader:
            texts = texts.to(self.device)
            labels = labels.to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(texts)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        return total_loss / len(dataloader), 100. * correct / total

    def predict(self, texts):
        self.model.eval()
        with torch.no_grad():
            # Move inputs to the same device as the model
            outputs = self.model(texts.to(self.device))
            _, predicted = outputs.max(1)
        return predicted.cpu().numpy()
```
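The comparison table in section 1.2 mentions Word2Vec+CNN, while TextCNN above learns its embeddings from scratch. One way to bridge the two is sketched below, under the assumptions that a gensim-format Word2Vec file is available on disk and that `vocab` is a dict mapping each token to the row index used by the tokenizer (both are placeholders, not part of the code above):

```python
import numpy as np
import torch
from gensim.models import KeyedVectors


def build_embedding_matrix(vocab, w2v_path, embed_dim=300):
    kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    # Random init, then overwrite rows for words that have pretrained vectors
    matrix = np.random.normal(scale=0.1, size=(len(vocab), embed_dim))
    for word, idx in vocab.items():
        if word in kv:
            matrix[idx] = kv[word]
    return torch.tensor(matrix, dtype=torch.float)


# model = TextCNN(vocab_size=len(vocab), embed_dim=300)
# model.embedding.weight.data.copy_(build_embedding_matrix(vocab, 'word2vec.bin'))
```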
3. Pre-trained Model Method
```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments


class BertSentimentAnalyzer:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=3   # positive / neutral / negative
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def encode_texts(self, texts, max_length=128):
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

    def train(self, train_texts, train_labels):
        train_encodings = self.encode_texts(train_texts)

        # Wrap the tokenized encodings as a torch Dataset for Trainer
        class TextDataset(torch.utils.data.Dataset):
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx])
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = TextDataset(train_encodings, train_labels)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

    def predict(self, texts):
        self.model.eval()
        encodings = self.encode_texts(texts)
        with torch.no_grad():
            inputs = {k: v.to(self.device) for k, v in encodings.items()}
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        return predictions.cpu().numpy()
```
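For quick inference without any fine-tuning, the transformers pipeline API wraps tokenization and prediction in a single call. A minimal sketch; the checkpoint named below is one public sentiment model, and any sequence-classification checkpoint can be substituted:

```python
from transformers import pipeline

# Downloads the model on first use; returns label/score dicts per input
sentiment = pipeline('sentiment-analysis',
                     model='distilbert-base-uncased-finetuned-sst-2-english')
print(sentiment("I love this product! It's amazing."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```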
4. Performance Comparison
4.1 Method Comparison
| Method | Accuracy (indicative) | Training Time | Inference Speed |
|---|---|---|---|
| VADER | 0.78 | none (no training) | very fast |
| TF-IDF+SVM | 0.87 | minutes | fast |
| TextCNN | 0.91 | hours | fast |
| BERT | 0.94 | hours | millisecond-level |
4.2 Evaluation Metrics
```python
from sklearn.metrics import classification_report, confusion_matrix


def evaluate(y_true, y_pred):
    print("Classification report:")
    print(classification_report(y_true, y_pred))
    print("\nConfusion matrix:")
    print(confusion_matrix(y_true, y_pred))
```
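Example usage, assuming the predictions from the section 2.2 model and a matching held-out `test_labels` list:

```python
# `predictions` comes from TraditionalSentimentAnalyzer.predict(test_texts);
# `test_labels` is the corresponding ground truth (assumed to exist)
evaluate(test_labels, predictions)
```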
5. Summary
Choosing a sentiment analysis method:
- Quick analysis with no training data: the VADER lexicon method
- Medium-scale labeled data: TF-IDF + SVM
- High-accuracy requirements: pre-trained models such as BERT