NLP Sentiment Analysis: From Traditional Methods to Deep Learning
1. Technical Overview
1.1 Sentiment Analysis Tasks
| Type | Description | Typical Application |
|---|---|---|
| Binary | positive/negative | review analysis |
| Three-class | positive/neutral/negative | public opinion monitoring |
| Multi-label | mixture of multiple emotions | complex text |
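To make the distinction concrete, here is a minimal sketch of how the output formats differ (the logits, label set, and 0.5 threshold are illustrative assumptions, not values from a trained model): a multi-class head picks exactly one label via softmax, while a multi-label head scores each label independently via sigmoid.

```python
import torch

# Hypothetical logits for one text over three emotion labels
logits = torch.tensor([2.1, -0.7, 1.3])
labels = ['joy', 'anger', 'surprise']

# Multi-class (binary/three-way): softmax + argmax picks exactly one label
single = labels[torch.softmax(logits, dim=0).argmax().item()]           # 'joy'

# Multi-label: independent sigmoids; keep every label above a threshold
multi = [l for l, p in zip(labels, torch.sigmoid(logits)) if p > 0.5]   # ['joy', 'surprise']
```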
1.2 Method Comparison
| Method | Characteristics | Performance |
|---|---|---|
| Lexicon-based | sentiment lexicon lookup | moderate |
| Traditional ML | TF-IDF + SVM | good |
| Deep learning | Word2Vec + CNN/RNN | excellent |
| Pre-trained models | BERT, etc. | best |
2. Core Implementations
2.1 Lexicon-Based Method
```python
from nltk.sentiment import SentimentIntensityAnalyzer


class LexiconSentimentAnalyzer:
    def __init__(self):
        import nltk
        nltk.download('vader_lexicon', quiet=True)  # fetch the VADER lexicon once
        self.analyzer = SentimentIntensityAnalyzer()

    def analyze(self, text):
        scores = self.analyzer.polarity_scores(text)
        # Classify by the compound score using the standard ±0.05 cutoffs
        if scores['compound'] >= 0.05:
            return "positive", scores['compound']
        elif scores['compound'] <= -0.05:
            return "negative", scores['compound']
        else:
            return "neutral", scores['compound']


analyzer = LexiconSentimentAnalyzer()
result, score = analyzer.analyze("I love this product! It's amazing.")
print(f"Sentiment: {result}, Score: {score}")
```
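VADER's compound score is normalized to [-1, 1], and the ±0.05 cutoffs above follow the thresholds recommended by the VADER authors. Note that the VADER lexicon covers English text only.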
2.2 Traditional Machine Learning Method
```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


class TraditionalSentimentAnalyzer:
    def __init__(self):
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=10000,
                ngram_range=(1, 2),       # unigrams and bigrams
                stop_words='english'
            )),
            ('clf', LinearSVC(C=1.0))
        ])

    def train(self, texts, labels):
        self.pipeline.fit(texts, labels)

    def predict(self, texts):
        return self.pipeline.predict(texts)

    def predict_proba(self, texts):
        # LinearSVC has no predict_proba; squash the decision margin
        # through a sigmoid as a rough pseudo-probability (binary case)
        decision = self.pipeline.decision_function(texts)
        proba = 1 / (1 + np.exp(-decision))
        return np.column_stack([1 - proba, proba])


analyzer = TraditionalSentimentAnalyzer()
analyzer.train(train_texts, train_labels)   # train/test data prepared elsewhere
predictions = analyzer.predict(test_texts)
```
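The sigmoid trick above yields only a rough pseudo-probability. As an alternative sketch (assuming the same TF-IDF settings as above), scikit-learn's CalibratedClassifierCV fits a proper probability calibration on top of LinearSVC via cross-validation:

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

calibrated_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2),
                              stop_words='english')),
    # Platt scaling ('sigmoid') fitted with 5-fold CV on the training data
    ('clf', CalibratedClassifierCV(LinearSVC(C=1.0), method='sigmoid', cv=5)),
])

# calibrated_pipeline.fit(train_texts, train_labels)
# proba = calibrated_pipeline.predict_proba(test_texts)  # shape: (n_samples, n_classes)
```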
2.3 Deep Learning Method
```python
import torch
import torch.nn as nn


class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_classes=2,
                 num_filters=100, filter_sizes=[3, 4, 5]):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One convolution branch per n-gram window size
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim))
            for k in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)   # (batch, seq_len, embed_dim)
        x = x.unsqueeze(1)      # add channel dim: (batch, 1, seq_len, embed_dim)
        conv_outputs = []
        for conv in self.convs:
            conv_out = torch.relu(conv(x))
            # Max-pool over the entire sequence dimension
            pooled = torch.max_pool2d(conv_out, (conv_out.size(2), 1))
            pooled = pooled.squeeze(3).squeeze(2)   # (batch, num_filters)
            conv_outputs.append(pooled)
        concat = torch.cat(conv_outputs, dim=1)
        output = self.dropout(concat)
        return self.fc(output)


class DeepSentimentAnalyzer:
    def __init__(self, vocab_size):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = TextCNN(vocab_size).to(self.device)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0
        for texts, labels in dataloader:
            texts = texts.to(self.device)
            labels = labels.to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(texts)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        return total_loss / len(dataloader), 100. * correct / total

    def predict(self, texts):
        self.model.eval()
        with torch.no_grad():
            # Move inputs to the same device as the model
            outputs = self.model(texts.to(self.device))
            _, predicted = outputs.max(1)
        return predicted.cpu().numpy()
```
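The comparison table in section 1.2 mentions Word2Vec+CNN, while TextCNN above learns its embeddings from scratch. One way to bridge the two is sketched below, under the assumptions that a gensim-format Word2Vec file is available on disk and that `vocab` is a dict mapping each token to the row index used by the tokenizer (both are placeholders, not part of the code above):

```python
import numpy as np
import torch
from gensim.models import KeyedVectors


def build_embedding_matrix(vocab, w2v_path, embed_dim=300):
    kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    # Random init, then overwrite rows for words that have pretrained vectors
    matrix = np.random.normal(scale=0.1, size=(len(vocab), embed_dim))
    for word, idx in vocab.items():
        if word in kv:
            matrix[idx] = kv[word]
    return torch.tensor(matrix, dtype=torch.float)


# model = TextCNN(vocab_size=len(vocab), embed_dim=300)
# model.embedding.weight.data.copy_(build_embedding_matrix(vocab, 'word2vec.bin'))
```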
3. Pre-trained Model Method
```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments


class BertSentimentAnalyzer:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=3   # positive / neutral / negative
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def encode_texts(self, texts, max_length=128):
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

    def train(self, train_texts, train_labels):
        train_encodings = self.encode_texts(train_texts)

        # Wrap the tokenized encodings as a torch Dataset for Trainer
        class TextDataset(torch.utils.data.Dataset):
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx])
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = TextDataset(train_encodings, train_labels)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

    def predict(self, texts):
        self.model.eval()
        encodings = self.encode_texts(texts)
        with torch.no_grad():
            inputs = {k: v.to(self.device) for k, v in encodings.items()}
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        return predictions.cpu().numpy()
```
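For quick inference without any fine-tuning, the transformers pipeline API wraps tokenization and prediction in a single call. A minimal sketch; the checkpoint named below is one public sentiment model, and any sequence-classification checkpoint can be substituted:

```python
from transformers import pipeline

# Downloads the model on first use; returns label/score dicts per input
sentiment = pipeline('sentiment-analysis',
                     model='distilbert-base-uncased-finetuned-sst-2-english')
print(sentiment("I love this product! It's amazing."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```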
4. Performance Comparison
4.1 Method Comparison
| Method | Accuracy (indicative) | Training Time | Inference Speed |
|---|---|---|---|
| VADER | 0.78 | none (no training) | very fast |
| TF-IDF+SVM | 0.87 | minutes | fast |
| TextCNN | 0.91 | hours | fast |
| BERT | 0.94 | hours | millisecond-level |
4.2 Evaluation Metrics
```python
from sklearn.metrics import classification_report, confusion_matrix


def evaluate(y_true, y_pred):
    print("Classification report:")
    print(classification_report(y_true, y_pred))
    print("\nConfusion matrix:")
    print(confusion_matrix(y_true, y_pred))
```
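Example usage, assuming the predictions from the section 2.2 model and a matching held-out `test_labels` list:

```python
# `predictions` comes from TraditionalSentimentAnalyzer.predict(test_texts);
# `test_labels` is the corresponding ground truth (assumed to exist)
evaluate(test_labels, predictions)
```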
5. Summary
Choosing a sentiment analysis method:
- Quick analysis with no training data: the VADER lexicon method
- Medium-scale labeled data: TF-IDF + SVM
- High-accuracy requirements: pre-trained models such as BERT