1.SpaCy库安装
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple spacy
2.下载标注模型
https://github.com/explosion/spacy-models/releases/tag/zh_core_web_trf-3.7.2
https://github.com/explosion/spacy-models/releases/tag/en_core_web_trf-3.7.3
3.模型安装
# 示例
pip install path/to/your/zh_core_web_trf-3.7.2-py3-none-any.whl
4.导入并加载模型
import spacy
# Load the English model.
# NOTE(review): step 2 above links the en_core_web_trf package, but this line
# loads 'en_core_web_lg' - confirm that package is installed as well.
nlp = spacy.load('en_core_web_lg')
5.基本功能
###############基本功能###############
## Text processing and tokenization
# Run the text through the loaded pipeline
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)
# Print the text of every token in the processed document
for token in doc:
    print(token.text)
## Part-of-speech tagging
# Run the text through the loaded pipeline
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)
# Print each token together with its part-of-speech tag (token.pos_)
for token in doc:
    print(f'{token.text}: {token.pos_}')
## Named entity recognition
# Run the text through the loaded pipeline
text = "Long live China"
doc = nlp(text)
# Print every entity span found in the document with its label
for ent in doc.ents:
    print(f'{ent.text}: {ent.label_}')
## Dependency parsing
# Run the text through the loaded pipeline
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)
# Print each token, its dependency label, and the text of its head token
for token in doc:
    print(f'{token.text}: {token.dep_} -> {token.head.text}')
6.高级功能
###############高级功能###############
## Word vectors
# Load a model that includes word vectors.
# NOTE(review): the original note called this a "larger" model, but
# 'en_core_web_lg' is loaded elsewhere in this file - confirm which model is
# intended here.
nlp_large = spacy.load('en_core_web_md')
# Print the first five components of each token's vector
doc_large = nlp_large("king queen man woman")
for token in doc_large:
    print(f'{token.text}: {token.vector[:5]}')
## Text similarity
# Load a model that includes word vectors
nlp_large = spacy.load('en_core_web_md')
# The two sentences to compare
sentence_a = "I love machine learning."
sentence_b = "I enjoy artificial intelligence."
doc1, doc2 = nlp_large(sentence_a), nlp_large(sentence_b)
# Score the first document against the second and report the result
similarity = doc1.similarity(doc2)
print(f'Similarity: {similarity}')
## Custom pipeline component
from spacy.language import Language


@Language.component("my_component")
def my_component(doc):
    """Print the document passing through the pipeline and return it unchanged.

    Args:
        doc: The document handed to this component by the pipeline.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Do something to the doc here
    print(f"Custom component processed:{doc}")
    return doc


nlp = spacy.load('en_core_web_lg')
# Add the registered component to the pipeline under the name "print_info"
nlp.add_pipe("my_component", name="print_info", last=True)
print(nlp.pipe_names)
# Processing a text runs the pipeline, including the component added above
doc = nlp("This is a sentence.")
## Training a custom model
import random
from spacy.training.example import Example

# Create a blank English pipeline
nlp = spacy.blank("en")
# Add an NER component to the pipeline
ner = nlp.add_pipe("ner")
# Register the custom entity label
ner.add_label("GADGET")
# Training data: (text, annotations) pairs. Entity offsets are character
# positions (start inclusive, end exclusive) and must line up exactly with the
# entity text: text[19:25] == "iPhone" and text[20:25] == "Pixel".
TRAIN_DATA = [
    ("Apple releases new iPhone.", {"entities": [(19, 25, "GADGET")]}),
    ("Google launches new Pixel phone.", {"entities": [(20, 25, "GADGET")]}),
]
# Start training. `initialize` is the spaCy v3 name for the deprecated
# `begin_training`; it returns the optimizer used for the updates below.
optimizer = nlp.initialize()
for i in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    # Report the accumulated losses once per pass over the data
    print(losses)
# Try the trained pipeline on an unseen sentence
doc = nlp("Amazon announces new Kindle.")
for ent in doc.ents:
    print(ent.text, ent.label_)
7.CSV文档数据标注
import pandas as pd
import spacy
# Read the CSV file; the dataset must contain a column whose header is "text"
df = pd.read_csv('ori_dataset.csv')
print(df.head())
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')


def mark_keywords(text):
    """Wrap every entity found in ``text`` in ``<KEY>...</KEY>`` tags.

    The result is rebuilt from each entity's character offsets rather than
    with ``str.replace``, so only the recognized spans are tagged and an
    entity whose text occurs more than once is never tagged twice or nested.

    Args:
        text: The cell value to annotate.

    Returns:
        The annotated string. Non-string values (for example NaN from an
        empty CSV cell) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    doc = nlp(text)
    pieces = []
    cursor = 0
    for ent in doc.ents:
        # Copy the untouched text before the entity, then the tagged entity
        pieces.append(text[cursor:ent.start_char])
        pieces.append(f"<KEY>{text[ent.start_char:ent.end_char]}</KEY>")
        cursor = ent.end_char
    # Copy whatever follows the last entity
    pieces.append(text[cursor:])
    return "".join(pieces)


# Annotate the text column of the CSV
df['marked_text'] = df['text'].apply(mark_keywords)
print(df.head())
# Save the result to a new CSV file
df.to_csv('train_dataset.csv', index=False)
print("标注结果已保存到: train_dataset.csv")
text
"Albert Einstein was a theoretical physicist who developed the theory of relativity."
"Marie Curie won the Nobel Prize in Physics and Chemistry."
text,marked_text
"Albert Einstein was a theoretical physicist who developed the theory of relativity.","<KEY>Albert Einstein</KEY> was a theoretical physicist who developed the <KEY>theory of relativity</KEY>."
"Marie Curie won the Nobel Prize in Physics and Chemistry.","<KEY>Marie Curie</KEY> won the <KEY>Nobel Prize</KEY> in Physics and Chemistry."
关注公众号“CrazyNET”,获取更多资源