Text-Generation任务数据标注工具-CSDN博客

本文链接：https://blog.csdn.net/FJCker/article/details/148932037

1.SpaCy库安装

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple spacy

2.下载标注模型

https://github.com/explosion/spacy-models/releases/tag/zh_core_web_trf-3.7.2
https://github.com/explosion/spacy-models/releases/tag/en_core_web_trf-3.7.3

3.模型安装

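# 示例（下文代码还会加载 en_core_web_sm / en_core_web_md / en_core_web_lg，需按同样方式安装，或执行 python -m spacy download <模型名>）
pip install path/to/your/zh_core_web_trf-3.7.2-py3-none-any.whl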

4.导入并加载模型

import spacy
 
# Load the English model
nlp = spacy.load('en_core_web_lg')

5.基本功能

############### Basic features ###############
## Text processing and tokenization
# Run the pipeline on a sample sentence
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)
# Print the tokenization result, one token per line
for token in doc:
    print(token.text)

## Part-of-speech tagging
# Run the pipeline on a sample sentence
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)
# Print each token together with its part-of-speech tag
for token in doc:
    print(f'{token.text}: {token.pos_}')

## Named entity recognition
# Run the pipeline on a sample sentence
text = "Long live China"
doc = nlp(text)

# Print each recognised entity span together with its label
for ent in doc.ents:
    print(f'{ent.text}: {ent.label_}')

## Dependency parsing

# Run the pipeline on a sample sentence
text = "Hello, welcome to the world of natural language processing."
doc = nlp(text)

# Print each token, its dependency relation, and the text of its head token
for token in doc:
    print(f'{token.text}: {token.dep_} -> {token.head.text}')

6.高级功能

############### Advanced features ###############
## Word vectors
# Load a model that ships with word vectors; it is loaded once and reused below
nlp_large = spacy.load('en_core_web_md')

# Print the first five dimensions of each token's vector
doc_large = nlp_large("king queen man woman")
for token in doc_large:
    print(f'{token.text}: {token.vector[:5]}')

## Text similarity
# Reuse nlp_large from above: loading the same model a second time only
# costs start-up time and memory and yields an identical pipeline.
doc1 = nlp_large("I love machine learning.")
doc2 = nlp_large("I enjoy artificial intelligence.")

# Compare the two documents
similarity = doc1.similarity(doc2)
print(f'Similarity: {similarity}')

##自定义管道组件
from spacy.language import Language
 
@Language.component("my_component")
def my_component(doc):
    """Print a notice containing the Doc, then return the Doc unchanged.

    Registered under the factory name "my_component" so it can be added to
    a pipeline by that name.
    """
    notice = f"Custom component processed:{doc}"
    print(notice)
    return doc
 
# Load the pipeline and append the custom component as its last step
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("my_component", name="print_info", last=True)
# Show the pipeline's component names after the addition
print(nlp.pipe_names)
# Run the pipeline, including the component just added, on a sample sentence
doc = nlp("This is a sentence.")

##训练自定义模型
import random
from spacy.training.example import Example
 
# Create a blank English pipeline
nlp = spacy.blank("en")

# Add an NER component
ner = nlp.add_pipe("ner")

# Register the custom entity label
ner.add_label("GADGET")

# Training data: (text, {"entities": [(start_char, end_char, label)]}).
# Offsets are character positions with an exclusive end and must fall on
# token boundaries; spaCy warns about misaligned spans and ignores them
# during training, so wrong offsets silently teach the model nothing.
TRAIN_DATA = [
    # text[19:25] == "iPhone"
    ("Apple releases new iPhone.", {"entities": [(19, 25, "GADGET")]}),
    # text[20:25] == "Pixel"
    ("Google launches new Pixel phone.", {"entities": [(20, 25, "GADGET")]}),
]

# Initialise the model weights. nlp.initialize() is the spaCy v3 replacement
# for the deprecated nlp.begin_training(); it returns the optimizer.
optimizer = nlp.initialize()
for i in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

# Try the trained pipeline on an unseen sentence
doc = nlp("Amazon announces new Kindle.")
for ent in doc.ents:
    print(ent.text, ent.label_)

7.CSV文档数据标注

import pandas as pd
import spacy

# Read the CSV file
df = pd.read_csv('ori_dataset.csv') # the dataset is expected to contain a column with the header `text`
print(df.head())

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def mark_keywords(text):
    """Wrap each named entity found in ``text`` in ``<KEY>...</KEY>`` tags.

    Entities are located with the module-level ``nlp`` pipeline and tagged
    by character offset, so every recognised span is wrapped exactly once.
    A per-entity ``str.replace`` would double-wrap text whenever an entity
    string repeats or is contained in another entity, and would also tag
    occurrences the model did not recognise.

    Args:
        text: The string to annotate. Non-string values (for example the
            NaN that pandas uses for an empty cell) are returned unchanged.

    Returns:
        The annotated string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    doc = nlp(text)
    pieces = []
    cursor = 0
    # doc.ents is ordered by position and spans do not overlap, so one
    # left-to-right pass can stitch the output together.
    for ent in doc.ents:
        pieces.append(text[cursor:ent.start_char])
        pieces.append(f"<KEY>{text[ent.start_char:ent.end_char]}</KEY>")
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return "".join(pieces)

# Annotate every value of the `text` column and store the result in `marked_text`
df['marked_text'] = df['text'].apply(mark_keywords)
print(df.head())

# Save to a new CSV file (without the index column)
df.to_csv('train_dataset.csv', index=False)
print("标注结果已保存到: train_dataset.csv")

text
"Albert Einstein was a theoretical physicist who developed the theory of relativity."
"Marie Curie won the Nobel Prize in Physics and Chemistry."

text,marked_text
"Albert Einstein was a theoretical physicist who developed the theory of relativity.","<KEY>Albert Einstein</KEY> was a theoretical physicist who developed the <KEY>theory of relativity</KEY>."
"Marie Curie won the Nobel Prize in Physics and Chemistry.","<KEY>Marie Curie</KEY> won the <KEY>Nobel Prize</KEY> in Physics and Chemistry."

关注公众号“CrazyNET”，获取更多资源