import re
from nltk import tokenize
# Split a 4-lines-per-record training file into its sentence lines and its
# relation lines, then pull the <e1>/<e2> entity mentions out of every
# sentence. Two files are written to the current working directory:
#   关系.txt - one relation label per line
#   实体.txt - "<sentence>\t<e1>\t<e2>" records, each preceded by a newline
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used for both reading and writing - confirm this is intended.


def _extract_entities(sentence):
    """Return the ``(e1, e2)`` strings tagged inside *sentence*.

    e1 is the text between the first ``<e1>`` and the next ``</e1>``.
    e2 is searched for only in the text that follows that ``</e1>``, between
    ``<e2>`` and ``</e2>``.

    Raises:
        ValueError: if ``<e1>``, ``</e1>`` or ``<e2>`` cannot be found in the
            expected order. The offending text is included in the message.
    """
    try:
        e1_parts = sentence.split('<e1>')[1].split('</e1>')
        first = e1_parts[0]
        second = e1_parts[1].split('<e2>')[1].split('</e2>')[0]
    except IndexError as err:
        raise ValueError(
            f'missing <e1>/<e2> entity tags in line: {sentence!r}'
        ) from err
    return first, second


# Raw string: every backslash (including the "\R" in "CNN\Relation...") is
# taken literally, so no invalid-escape warning is triggered. The resulting
# path is identical to the one the script used before.
with open(file=r'D:\pythonProject1\CNN\Relation-Classification-using-Bidirectional-LSTM-Tree-master\data\TRAIN_FILE.TXT') as f:
    lines = [line.strip() for line in f]

# Records span four lines: index 0 of each group is the sentence, index 1 the
# relation label; the remaining two lines are ignored.
relation = lines[1::4]
text = lines[::4]
print(text[:10])

with open(file='关系.txt', mode='w') as f:
    for rel in relation:
        f.write(rel + '\n')

e1 = []
e2 = []
with open(file='实体.txt', mode='w') as f:
    for line in text:
        first, second = _extract_entities(line)
        e1.append(first)
        e2.append(second)
        # The leading '\n' (rather than a trailing one) is kept on purpose so
        # the output stays byte-identical to what the script always produced.
        f.write('\n' + line + '\t' + first + '\t' + second)

print(f'文本的大小为:{len(text)},e1实体的数量是{len(e1)},e2实体的数量是{len(e2)}')
# SDP-LSTM语料中的实体和关系
# 最新推荐文章于 2021-11-18 18:51:35 发布