04-进阶方向:自然语言处理(NLP)——spaCy入门
spaCy入门(工业级NLP管道、实体识别、依存分析)
一、spaCy概述
1.1 为什么选择spaCy?
```python
import spacy
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

print("=" * 60)
print("spaCy:工业级NLP工具")
print("=" * 60)

# spaCy vs NLTK对比
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# spaCy
ax1 = axes[0]
ax1.axis('off')
ax1.set_title('spaCy - 工业级', fontsize=11)
spacy_features = [
    '✓ 快速(Cython实现)',
    '✓ 生产就绪',
    '✓ 深度学习模型',
    '✓ 完整管道',
    '✓ 易于部署',
]
y_pos = 0.7
for feat in spacy_features:
    ax1.text(0.1, y_pos, feat, fontsize=9, color='green')
    y_pos -= 0.1

# NLTK
ax2 = axes[1]
ax2.axis('off')
ax2.set_title('NLTK - 学术/教育', fontsize=11)
nltk_features = [
    '○ 教学友好',
    '○ 算法丰富',
    '○ 语料库多',
    '○ 速度较慢',
    '○ 适合学习',
]
y_pos = 0.7
for feat in nltk_features:
    ax2.text(0.1, y_pos, feat, fontsize=9, color='blue')
    y_pos -= 0.1

plt.suptitle('spaCy vs NLTK', fontsize=12)
plt.tight_layout()
plt.show()

print("\n💡 spaCy特点:")
print("   - 工业级速度(Cython优化)")
print("   - 预训练模型支持多语言")
print("   - 端到端NLP管道")
print("   - 易于集成到生产环境")
```
二、spaCy基础

2.1 安装与加载模型
```python
def spacy_basics():
    """spaCy基础操作"""
    print("\n" + "=" * 60)
    print("spaCy基础操作")
    print("=" * 60)

    print("""
    # 安装
    pip install spacy

    # 下载模型
    python -m spacy download en_core_web_sm
    python -m spacy download zh_core_web_sm
    """)

    code = """
    import spacy

    # 加载模型
    nlp = spacy.load("en_core_web_sm")

    # 处理文本
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

    # 查看模型信息
    print(f"模型名称: {nlp.meta['name']}")
    print(f"模型版本: {nlp.meta['version']}")
    print(f"支持语言: {nlp.meta['lang']}")
    print(f"管道组件: {nlp.pipe_names}")

    # 输出:
    # 模型名称: core_web_sm
    # 模型版本: 3.7.1
    # 支持语言: en
    # 管道组件: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
    """
    print(code)

spacy_basics()
```
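补充一个可直接运行的加载示意:模型未安装时 `spacy.load` 会抛出 `OSError`,可以借助 `spacy.cli.download`(等价于命令行的 `python -m spacy download`)自动补齐。其中 `load_model` 是这里虚构的辅助函数名,仅作示意:

```python
import spacy

def load_model(name: str = "en_core_web_sm"):
    # 虚构的辅助函数:本地缺少模型时先下载再加载(假设网络可用)
    try:
        return spacy.load(name)
    except OSError:
        from spacy.cli import download  # 等价于 python -m spacy download <name>
        download(name)
        return spacy.load(name)

nlp = load_model()
print(nlp.pipe_names)
```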
2.2 Doc对象

```python
def doc_object():
    """Doc对象详解"""
    print("\n" + "=" * 60)
    print("Doc对象:NLP结果容器")
    print("=" * 60)

    code = """
    import spacy

    nlp = spacy.load("en_core_web_sm")
    text = "Steve Jobs founded Apple in Cupertino in 1976."
    doc = nlp(text)

    # 1. 句子分割
    print(f"句子数量: {len(list(doc.sents))}")
    for sent in doc.sents:
        print(f"  句子: {sent}")

    # 2. 分词
    print(f"\\nToken数量: {len(doc)}")
    for token in doc[:10]:
        print(f"  {token.text}")

    # 3. 词性标注
    print("\\n词性标注:")
    for token in doc:
        print(f"  {token.text:12} → {token.pos_:8} ({token.tag_})")

    # 4. 依存句法分析
    print("\\n依存关系:")
    for token in doc:
        print(f"  {token.text:12} → {token.dep_:10} ← {token.head.text}")

    # 5. 命名实体识别
    print("\\n命名实体:")
    for ent in doc.ents:
        print(f"  {ent.text:20} → {ent.label_}")

    # 6. 词形还原
    print("\\n词形还原:")
    for token in doc:
        print(f"  {token.text:12} → {token.lemma_:12}")

    # 7. 向量表示
    print(f"\\n向量维度: {doc.vector.shape}")
    print(f"第一个词向量维度: {doc[0].vector.shape}")
    """
    print(code)

doc_object()
```
三、实体识别(NER)

3.1 内置实体类型
```python
def ner_demo():
    """命名实体识别"""
    print("\n" + "=" * 60)
    print("命名实体识别(NER)")
    print("=" * 60)

    code = """
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")

    # 1. 基本NER
    text = ("Apple Inc. was founded by Steve Jobs, Steve Wozniak, "
            "and Ronald Wayne in April 1976. The company is "
            "headquartered in Cupertino, California.")
    doc = nlp(text)

    print("识别到的实体:")
    for ent in doc.ents:
        print(f"  {ent.text:25} → {ent.label_:10} ({spacy.explain(ent.label_)})")

    # 2. 实体类型说明
    entity_types = {
        'PERSON': '人名',
        'ORG': '组织/公司',
        'GPE': '地缘政治实体(国家、城市)',
        'LOC': '地理位置',
        'DATE': '日期',
        'TIME': '时间',
        'MONEY': '金额',
        'PERCENT': '百分比',
        'PRODUCT': '产品',
        'EVENT': '事件',
    }
    print("\\n常用实体类型:")
    for label, name in entity_types.items():
        print(f"  {label}: {name}")

    # 3. 可视化NER
    # displacy.render(doc, style="ent", jupyter=True)

    # 4. 提取特定类型实体
    persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    print(f"\\n人物: {persons}")
    print(f"组织: {orgs}")
    print(f"日期: {dates}")
    """
    print(code)

ner_demo()
```
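在语料层面统计实体分布时,可以把 `nlp.pipe` 的NER结果交给 `collections.Counter`。下面是一个最小示意,其中 `texts` 为虚构的示例数据:

```python
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

# 虚构的示例语料
texts = [
    "Apple hired Tim Cook in 1998.",
    "Google opened an office in Zurich.",
    "Tim Cook visited Google in California.",
]

# 批量处理并统计各实体标签出现次数
label_counts = Counter(
    ent.label_ for doc in nlp.pipe(texts) for ent in doc.ents
)
print(label_counts.most_common())  # 形如 [('ORG', ...), ('PERSON', ...), ...]
```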
3.2 可视化NER

```python
def ner_visualization():
    """NER可视化"""
    print("\n" + "=" * 60)
    print("NER可视化")
    print("=" * 60)

    code = """
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")

    # 1. 基本可视化
    text = "Elon Musk founded SpaceX in Hawthorne, California."
    doc = nlp(text)

    # HTML输出
    html = displacy.render(doc, style="ent", page=True)

    # 2. 自定义颜色
    colors = {
        "PERSON": "#FF6B6B",
        "ORG": "#4ECDC4",
        "GPE": "#45B7D1",
    }
    options = {
        "ents": ["PERSON", "ORG", "GPE"],
        "colors": colors,
    }
    html = displacy.render(doc, style="ent", options=options, page=True)

    # 3. 批量处理
    texts = [
        "Google is headquartered in Mountain View.",
        "Microsoft was founded by Bill Gates in Albuquerque.",
        "Amazon was founded by Jeff Bezos in Seattle."
    ]
    docs = list(nlp.pipe(texts))
    html = displacy.render(docs, style="ent", page=True)

    # 4. 保存为文件
    with open("ner_visualization.html", "w") as f:
        f.write(html)
    """
    print(code)

ner_visualization()
```
四、依存句法分析

4.1 依存关系
```python
def dependency_parsing():
    """依存句法分析"""
    print("\n" + "=" * 60)
    print("依存句法分析")
    print("=" * 60)

    code = """
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")
    text = "The quick brown fox jumps over the lazy dog."
    doc = nlp(text)

    # 1. 依存关系
    print("依存关系分析:")
    print(f"{'Token':<12} {'POS':<8} {'Dep':<12} {'Head':<12} {'Children'}")
    print("-" * 60)
    for token in doc:
        children = [child.text for child in token.children]
        print(f"{token.text:<12} {token.pos_:<8} {token.dep_:<12} "
              f"{token.head.text:<12} {children}")

    # 2. 常用依存关系
    dependency_types = {
        'nsubj': '名词性主语',
        'dobj': '直接宾语',
        'amod': '形容词修饰语',
        'det': '限定词',
        'prep': '介词修饰语',
        'pobj': '介词宾语',
        'aux': '助动词',
        'conj': '连词连接',
    }
    print("\\n常用依存关系:")
    for dep, meaning in dependency_types.items():
        print(f"  {dep}: {meaning}")

    # 3. 提取主语-谓语-宾语
    def extract_svo(doc):
        # 提取主谓宾结构
        triples = []
        for token in doc:
            if token.dep_ == "nsubj":
                subject = token.text
                verb = token.head.text
                # 找宾语
                obj = None
                for child in token.head.children:
                    if child.dep_ == "dobj":
                        obj = child.text
                        break
                triples.append((subject, verb, obj))
        return triples

    svo = extract_svo(doc)
    print(f"\\n主谓宾: {svo}")

    # 4. 可视化依存关系
    # displacy.render(doc, style="dep", jupyter=True)
    """
    print(code)

dependency_parsing()
```
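除了手写依存遍历,spaCy 还内置了基于依存分析的基础名词短语抽取 `doc.noun_chunks`,可作为SVO抽取的轻量补充。最小示意如下:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

# noun_chunks 基于依存分析产出基础名词短语
for chunk in doc.noun_chunks:
    # chunk.root 是短语中心词,root.dep_ 是其在句中的依存角色
    print(f"{chunk.text:22} root={chunk.root.text:6} dep={chunk.root.dep_}")
# 预期输出类似:
#   The quick brown fox   root=fox    dep=nsubj
#   the lazy dog          root=dog    dep=pobj
```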
4.2 依存关系可视化

```python
def dependency_viz():
    """依存关系可视化"""
    print("\n" + "=" * 60)
    print("依存关系可视化")
    print("=" * 60)

    code = """
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")

    # 1. 基本依存可视化
    text = "The cat sat on the mat."
    doc = nlp(text)

    # 2. 自定义选项
    options = {
        "compact": True,
        "color": "#4ECDC4",
        "bg": "#2C3E50",
        "font": "Arial",
    }
    html = displacy.render(doc, style="dep", options=options, page=True)

    # 3. 设置词间距
    options = {"distance": 120}
    html = displacy.render(doc, style="dep", options=options, page=True)

    # 4. 批量处理
    texts = [
        "I love natural language processing.",
        "The quick brown fox jumps over the lazy dog.",
    ]
    docs = list(nlp.pipe(texts))
    html = displacy.render(docs, style="dep", page=True)

    # 5. 保存为SVG
    svg = displacy.render(doc, style="dep", options={"fine_grained": True})
    with open("dependency.svg", "w") as f:
        f.write(svg)
    """
    print(code)

dependency_viz()
```
五、spaCy管道

5.1 管道组件
```python
def spacy_pipeline():
    """spaCy管道"""
    print("\n" + "=" * 60)
    print("spaCy管道组件")
    print("=" * 60)

    code = """
    import spacy

    nlp = spacy.load("en_core_web_sm")

    # 1. 查看管道组件
    print(f"管道组件: {nlp.pipe_names}")
    print(f"管道顺序: {nlp.pipeline}")

    # 2. 禁用组件(加速,spaCy 3.x 也可用 nlp.select_pipes)
    with nlp.disable_pipes("parser", "ner"):
        doc = nlp("This is a fast processing without parser and NER.")
        print(f"禁用后可用组件: {nlp.pipe_names}")

    # 3. 自定义组件
    from spacy.language import Language

    @Language.component("custom_component")
    def custom_component(doc):
        # 添加自定义处理逻辑
        print(f"处理文档: {doc.text[:50]}...")
        return doc

    # 添加组件
    nlp.add_pipe("custom_component", before="ner")

    # 4. 移除组件
    nlp.remove_pipe("custom_component")

    # 5. 替换组件(名称保留,工厂更换)
    nlp.replace_pipe("ner", "custom_component")

    # 6. 调整组件顺序:spaCy没有move_pipe,需先移除再重新添加
    #    (add_pipe 按工厂名新建组件)
    nlp.remove_pipe("ner")
    nlp.add_pipe("ner", last=True)
    """
    print(code)

spacy_pipeline()
```
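调试管道时,spaCy 3.x 还提供 `nlp.analyze_pipes`,用于检查各组件声明的产出(assigns)与依赖(requires)。下面是一个简单示意(返回值的键名以实际版本文档为准):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# pretty=True 直接打印汇总表;返回值同时包含分析结果字典
analysis = nlp.analyze_pipes(pretty=True)

# 以编程方式查看某个组件的分析摘要
print(analysis["summary"]["ner"])
```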
5.2 自定义管道组件

```python
def custom_pipeline_component():
    """自定义管道组件"""
    print("\n" + "=" * 60)
    print("自定义管道组件")
    print("=" * 60)

    code = """
    import spacy
    from spacy.language import Language
    from spacy.tokens import Doc

    # 1. 简单自定义组件
    @Language.component("entity_extractor")
    def entity_extractor(doc):
        # 提取特定模式
        entities = []
        for token in doc:
            if token.like_email:
                entities.append((token.text, "EMAIL"))
            elif token.like_url:
                entities.append((token.text, "URL"))
        # 添加到doc扩展
        doc._.custom_entities = entities
        return doc

    # 添加扩展属性
    Doc.set_extension("custom_entities", default=[])

    # 注册组件
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("entity_extractor", after="ner")

    # 2. 带参数的组件(spaCy只传入doc,参数通过默认值生效)
    @Language.component("keyword_extractor")
    def keyword_extractor(doc, min_length=3, top_k=5):
        # 统计词频
        freq = {}
        for token in doc:
            if (not token.is_stop and not token.is_punct
                    and len(token.text) >= min_length):
                freq[token.lemma_] = freq.get(token.lemma_, 0) + 1
        # 排序取top_k
        keywords = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:top_k]
        doc._.keywords = keywords
        return doc

    Doc.set_extension("keywords", default=[])

    # 3. 工厂组件(可配置)
    @Language.factory("sentiment_analyzer")
    class SentimentAnalyzer:
        def __init__(self, nlp, name, threshold=0.5):
            self.nlp = nlp
            self.threshold = threshold

        def __call__(self, doc):
            # 简单的词表情感分析
            positive_words = {"good", "great", "excellent", "amazing"}
            negative_words = {"bad", "terrible", "awful", "poor"}
            pos_count = sum(1 for token in doc if token.lemma_ in positive_words)
            neg_count = sum(1 for token in doc if token.lemma_ in negative_words)
            score = (pos_count - neg_count) / (len(doc) + 1)
            doc._.sentiment = score
            doc._.sentiment_label = "positive" if score > self.threshold else "negative"
            return doc

    # 自定义属性写入前需要先注册扩展
    Doc.set_extension("sentiment", default=0.0)
    Doc.set_extension("sentiment_label", default=None)

    # 注册工厂组件
    nlp.add_pipe("sentiment_analyzer", after="ner")

    # 使用
    doc = nlp("This movie is absolutely great and amazing!")
    print(f"情感分数: {doc._.sentiment}")
    print(f"情感标签: {doc._.sentiment_label}")
    """
    print(code)

custom_pipeline_component()
```
六、相似度计算

6.1 词向量相似度
```python
def similarity_demo():
    """相似度计算"""
    print("\n" + "=" * 60)
    print("词向量相似度")
    print("=" * 60)

    code = """
    import spacy

    # 加载带词向量的模型(需要中型及以上模型)
    nlp = spacy.load("en_core_web_md")  # 或 en_core_web_lg

    # 1. 词语相似度
    word1 = nlp("apple")
    word2 = nlp("orange")
    word3 = nlp("car")

    print(f"apple vs orange: {word1.similarity(word2):.3f}")
    print(f"apple vs car: {word1.similarity(word3):.3f}")

    # 2. 句子相似度
    text1 = nlp("I love programming")
    text2 = nlp("I enjoy coding")
    text3 = nlp("The weather is nice")

    print(f"\\n句子相似度:")
    print(f"  'I love programming' vs 'I enjoy coding': {text1.similarity(text2):.3f}")
    print(f"  'I love programming' vs 'The weather is nice': {text1.similarity(text3):.3f}")

    # 3. 文档相似度
    doc1 = nlp("Machine learning is fascinating.")
    doc2 = nlp("Deep learning is a subset of machine learning.")
    doc3 = nlp("I like to eat pizza.")

    print(f"\\n文档相似度:")
    print(f"  ML vs DL: {doc1.similarity(doc2):.3f}")
    print(f"  ML vs Pizza: {doc1.similarity(doc3):.3f}")

    # 4. 查找最相似的词
    def most_similar(word, top_n=5):
        # 在一个小词表中查找最相似的词(简化示例,需要完整词表时应遍历nlp.vocab)
        target = nlp(word)
        vocab = ["apple", "orange", "banana", "car", "truck", "bike",
                 "dog", "cat", "bird", "happy", "sad", "angry"]
        similarities = []
        for w in vocab:
            if w != word:
                sim = target.similarity(nlp(w))
                similarities.append((w, sim))
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_n]

    print(f"\\n与'apple'最相似的词:")
    for word, sim in most_similar("apple", 5):
        print(f"  {word}: {sim:.3f}")
    """
    print(code)

similarity_demo()
```
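`similarity()` 实质上是向量的余弦相似度;注意 sm 模型没有静态词向量(计算时会给出警告),可先用 `doc.has_vector` 检查。下面用 numpy 手工复现这一计算,假设已安装 en_core_web_md:

```python
import numpy as np
import spacy

nlp = spacy.load("en_core_web_md")

doc1 = nlp("I love programming")
doc2 = nlp("I enjoy coding")
assert doc1.has_vector and doc2.has_vector  # 确认向量可用

# similarity() 等价于两个doc向量的余弦相似度
v1, v2 = doc1.vector, doc2.vector
cosine = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

print(f"手工余弦: {cosine:.3f}")
print(f"similarity(): {doc1.similarity(doc2):.3f}")  # 两者应一致
```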
七、实战:信息抽取系统

7.1 完整信息抽取
```python
def information_extraction():
    """信息抽取系统"""
    print("\n" + "=" * 60)
    print("信息抽取系统")
    print("=" * 60)

    code = """
    import spacy
    from typing import List, Dict, Any

    class InformationExtractor:
        def __init__(self, model_name="en_core_web_sm"):
            self.nlp = spacy.load(model_name)

        def extract_entities(self, text: str) -> Dict[str, List[str]]:
            # 提取命名实体
            doc = self.nlp(text)
            entities = {
                "PERSON": [],
                "ORG": [],
                "GPE": [],
                "DATE": [],
                "MONEY": [],
            }
            for ent in doc.ents:
                if ent.label_ in entities:
                    entities[ent.label_].append(ent.text)
            return entities

        def extract_relations(self, text: str) -> List[Dict]:
            # 提取实体关系(主谓宾)
            doc = self.nlp(text)
            relations = []
            for token in doc:
                if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                    subject = token.text
                    predicate = token.head.text
                    # 找宾语
                    obj = None
                    for child in token.head.children:
                        if child.dep_ == "dobj":
                            obj = child.text
                            break
                    if obj:
                        relations.append({
                            "subject": subject,
                            "predicate": predicate,
                            "object": obj
                        })
            return relations

        def extract_keywords(self, text: str, top_k: int = 10) -> List[str]:
            # 提取关键词:统计词频(排除停用词和标点)
            doc = self.nlp(text)
            freq = {}
            for token in doc:
                if not token.is_stop and not token.is_punct and token.is_alpha:
                    lemma = token.lemma_.lower()
                    freq[lemma] = freq.get(lemma, 0) + 1
            # 排序取top_k
            keywords = sorted(freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, _ in keywords[:top_k]]

        def process(self, text: str) -> Dict[str, Any]:
            # 完整处理
            return {
                "entities": self.extract_entities(text),
                "relations": self.extract_relations(text),
                "keywords": self.extract_keywords(text),
            }

    # 使用示例
    extractor = InformationExtractor()
    text = ("Apple Inc. was founded by Steve Jobs in Cupertino, "
            "California in April 1976. The company is valued at "
            "over $2 trillion.")
    result = extractor.process(text)

    print("实体识别结果:")
    for entity_type, entities in result["entities"].items():
        if entities:
            print(f"  {entity_type}: {entities}")

    print("\\n关系抽取结果:")
    for relation in result["relations"]:
        print(f"  {relation['subject']} → {relation['predicate']} → {relation['object']}")

    print(f"\\n关键词: {result['keywords']}")
    """
    print(code)

information_extraction()
```
八、总结

| 功能 | 方法 | 应用 |
|---|---|---|
| 分词 | for token in doc | 基础处理 |
| 词性标注 | token.pos_ | 语法分析 |
| 依存分析 | token.dep_ | 句法结构 |
| NER | doc.ents | 信息抽取 |
| 相似度 | similarity() | 语义匹配 |
| 管道 | nlp.pipe() | 批量处理 |
spaCy最佳实践:

- 使用 nlp.pipe() 批量处理提高效率(见下方示例)
- 禁用不需要的组件以加速
- 使用小模型(sm)开发,大模型(lg)部署
- 自定义管道组件扩展功能
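下面是一个体现上述实践的批处理示意(`texts` 为虚构数据;`select_pipes` 是 spaCy 3.x 推荐的组件启停API):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# 虚构的批量文本
texts = ["First document.", "Second document."] * 1000

# 只保留NER所需组件,其余临时禁用以加速
with nlp.select_pipes(enable=["tok2vec", "ner"]):
    # nlp.pipe 流式批处理,比逐条调用 nlp() 更快
    for doc in nlp.pipe(texts, batch_size=256):
        entities = [(ent.text, ent.label_) for ent in doc.ents]
```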
