import nrekit
import numpy as np
import tensorflow as tf
import sys
import os
dataset_name = 'nyt'
if len(sys.argv) > 1:
dataset_name = sys.argv[1]
dataset_dir = os.path.join('./data', dataset_name)
if not os.path.isdir(dataset_dir):
raise Exception("[ERROR] Dataset dir %s doesn't exist!" % (dataset_dir))
# The first three parameters are the train/test data file, the word-embedding file and the relation-id mapping file, respectively (a sample entry is shown after the two loaders below).
train_loader = nrekit.data_loader.json_file_data_loader(os.path.join(dataset_dir, 'train.json'),
os.path.join(dataset_dir, 'word_vec.json'),
os.path.join(dataset_dir, 'rel2id.json'),
mode=nrekit.data_loader.json_file_data_loader.MODE_RELFACT_BAG,
shuffle=True)
test_loader = nrekit.data_loader.json_file_data_loader(os.path.join(dataset_dir, 'test.json'),
os.path.join(dataset_dir, 'word_vec.json'),
os.path.join(dataset_dir, 'rel2id.json'),
mode=nrekit.data_loader.json_file_data_loader.MODE_ENTPAIR_BAG,
shuffle=False)
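For reference, every entry in train.json / test.json follows the instance layout that the conversion script at the end of this post produces; the names, ids and relation below are purely illustrative:

{
    "sentence": "Barack Obama was born in Honolulu .",
    "head": {"word": "Barack Obama", "id": "m.02mjmr"},
    "tail": {"word": "Honolulu", "id": "m.03gh4"},
    "relation": "/people/person/place_of_birth"
}

word_vec.json is a list of {"word": ..., "vec": [...]} records, and rel2id.json is a plain {"relation name": id} mapping, with the NA relation conventionally at id 0.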
framework = nrekit.framework.re_framework(train_loader, test_loader)
class model(nrekit.framework.re_model): # definition of the model
encoder = "pcnn"
selector = "att"
def __init__(self, train_data_loader, batch_size, max_length=120):
nrekit.framework.re_model.__init__(self, train_data_loader, batch_size, max_length=max_length)
self.mask = tf.placeholder(dtype=tf.int32, shape=[None, max_length], name="mask")
# Embedding
x = nrekit.network.embedding.word_position_embedding(self.word, self.word_vec_mat, self.pos1, self.pos2)
# Encoder
if model.encoder == "pcnn":
x_train = nrekit.network.encoder.pcnn(x, self.mask, keep_prob=0.5)
x_test = nrekit.network.encoder.pcnn(x, self.mask, keep_prob=1.0)
elif model.encoder == "cnn":
x_train = nrekit.network.encoder.cnn(x, keep_prob=0.5)
x_test = nrekit.network.encoder.cnn(x, keep_prob=1.0)
elif model.encoder == "rnn":
x_train = nrekit.network.encoder.rnn(x, self.length, keep_prob=0.5)
x_test = nrekit.network.encoder.rnn(x, self.length, keep_prob=1.0)
elif model.encoder == "birnn":
x_train = nrekit.network.encoder.birnn(x, self.length, keep_prob=0.5)
x_test = nrekit.network.encoder.birnn(x, self.length, keep_prob=1.0)
else:
raise NotImplementedError
# Selector
if model.selector == "att":
self._train_logit, train_repre = nrekit.network.selector.bag_attention(x_train, self.scope, self.ins_label, self.rel_tot, True, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_attention(x_test, self.scope, self.ins_label, self.rel_tot, False, keep_prob=1.0)
elif model.selector == "ave":
self._train_logit, train_repre = nrekit.network.selector.bag_average(x_train, self.scope, self.rel_tot, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_average(x_test, self.scope, self.rel_tot, keep_prob=1.0)
self._test_logit = tf.nn.softmax(self._test_logit)
elif model.selector == "max":
self._train_logit, train_repre = nrekit.network.selector.bag_maximum(x_train, self.scope, self.ins_label, self.rel_tot, True, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_maximum(x_test, self.scope, self.ins_label, self.rel_tot, False, keep_prob=1.0)
self._test_logit = tf.nn.softmax(self._test_logit)
else:
raise NotImplementedError
# Classifier: softmax cross-entropy loss
self._loss = nrekit.network.classifier.softmax_cross_entropy(self._train_logit, self.label, self.rel_tot, weights_table=self.get_weights())
def loss(self):
return self._loss
def train_logit(self):
return self._train_logit
def test_logit(self):
return self._test_logit
def get_weights(self):
with tf.variable_scope("weights_table", reuse=tf.AUTO_REUSE):
print("Calculating weights_table...")
_weights_table = np.zeros((self.rel_tot), dtype=np.float32)
for i in range(len(self.train_data_loader.data_rel)):
_weights_table[self.train_data_loader.data_rel[i]] += 1.0
_weights_table = 1 / (_weights_table ** 0.05)
weights_table = tf.get_variable(name='weights_table', dtype=tf.float32, trainable=False, initializer=_weights_table)
print("Finish calculating")
return weights_table
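get_weights builds a per-relation weight 1 / count(r) ** 0.05 that the softmax_cross_entropy call above uses to mildly down-weight frequent relations such as NA. Because the count enters at the 0.05 power the effect is gentle: a relation with 100,000 training instances gets a weight of about 0.56, one with 100 instances about 0.79. A quick check of those numbers:

import numpy as np
print(1 / np.array([100000.0, 100.0]) ** 0.05)  # ≈ [0.562 0.794]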
if len(sys.argv) > 2:
model.encoder = sys.argv[2]
if len(sys.argv) > 3:
model.selector = sys.argv[3]
framework.train(model, ckpt_dir="checkpoint", model_name=dataset_name + "_" + model.encoder + "_" + model.selector, max_epoch=60, gpu_nums=1)
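Assuming the script above is saved as train_demo.py (the file name is only an assumption here), it can be launched as, for example, python train_demo.py nyt pcnn att: the optional positional arguments override the dataset name, the encoder and the selector, and checkpoints are written under ./checkpoint with names like nyt_pcnn_att.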
import nrekit
import numpy as np
import tensorflow as tf
import sys
import os
import json
import codecs
dataset_name = 'nyt'
if len(sys.argv) > 1:
dataset_name = sys.argv[1]
dataset_dir = os.path.join('./data', dataset_name)
if not os.path.isdir(dataset_dir):
raise Exception("[ERROR] Dataset dir %s doesn't exist!" % (dataset_dir))
# The first three parameters are the train/test data file, the word-embedding file and the relation-id mapping file, respectively.
test_loader = nrekit.data_loader.json_file_data_loader(os.path.join(dataset_dir, 'test.json'),
os.path.join(dataset_dir, 'word_vec.json'),
os.path.join(dataset_dir, 'rel2id.json'),
mode=nrekit.data_loader.json_file_data_loader.MODE_ENTPAIR_BAG,
shuffle=False)
framework = nrekit.framework.re_framework(None, test_loader)
class model(nrekit.framework.re_model):
encoder = "pcnn"
selector = "att"
def __init__(self, train_data_loader, batch_size, max_length=120):
nrekit.framework.re_model.__init__(self, train_data_loader, batch_size, max_length=max_length)
self.mask = tf.placeholder(dtype=tf.int32, shape=[None, max_length], name="mask")
# Embedding
x = nrekit.network.embedding.word_position_embedding(self.word, self.word_vec_mat, self.pos1, self.pos2)
# Encoder
if model.encoder == "pcnn":
x_train = nrekit.network.encoder.pcnn(x, self.mask, keep_prob=0.5)
x_test = nrekit.network.encoder.pcnn(x, self.mask, keep_prob=1.0)
elif model.encoder == "cnn":
x_train = nrekit.network.encoder.cnn(x, keep_prob=0.5)
x_test = nrekit.network.encoder.cnn(x, keep_prob=1.0)
elif model.encoder == "rnn":
x_train = nrekit.network.encoder.rnn(x, self.length, keep_prob=0.5)
x_test = nrekit.network.encoder.rnn(x, self.length, keep_prob=1.0)
elif model.encoder == "birnn":
x_train = nrekit.network.encoder.birnn(x, self.length, keep_prob=0.5)
x_test = nrekit.network.encoder.birnn(x, self.length, keep_prob=1.0)
else:
raise NotImplementedError
# Selector
if model.selector == "att":
self._train_logit, train_repre = nrekit.network.selector.bag_attention(x_train, self.scope, self.ins_label, self.rel_tot, True, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_attention(x_test, self.scope, self.ins_label, self.rel_tot, False, keep_prob=1.0)
elif model.selector == "ave":
self._train_logit, train_repre = nrekit.network.selector.bag_average(x_train, self.scope, self.rel_tot, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_average(x_test, self.scope, self.rel_tot, keep_prob=1.0)
self._test_logit = tf.nn.softmax(self._test_logit)
elif model.selector == "max":
self._train_logit, train_repre = nrekit.network.selector.bag_maximum(x_train, self.scope, self.ins_label, self.rel_tot, True, keep_prob=0.5)
self._test_logit, test_repre = nrekit.network.selector.bag_maximum(x_test, self.scope, self.ins_label, self.rel_tot, False, keep_prob=1.0)
self._test_logit = tf.nn.softmax(self._test_logit)
else:
raise NotImplementedError
# Classifier
self._loss = nrekit.network.classifier.softmax_cross_entropy(self._train_logit, self.label, self.rel_tot, weights_table=self.get_weights())
def loss(self):
return self._loss
def train_logit(self):
return self._train_logit
def test_logit(self):
return self._test_logit
def get_weights(self):
with tf.variable_scope("weights_table", reuse=tf.AUTO_REUSE):
print("Calculating weights_table...")
_weights_table = np.zeros((self.rel_tot), dtype=np.float32)
for i in range(len(self.train_data_loader.data_rel)):
_weights_table[self.train_data_loader.data_rel[i]] += 1.0
_weights_table = 1 / (_weights_table ** 0.05)
weights_table = tf.get_variable(name='weights_table', dtype=tf.float32, trainable=False, initializer=_weights_table)
print("Finish calculating")
return weights_table
if len(sys.argv) > 2:
model.encoder = sys.argv[2]
if len(sys.argv) > 3:
model.selector = sys.argv[3]
auc, pred_result = framework.test(model, ckpt="./checkpoint/" + dataset_name + "_" + model.encoder + "_" + model.selector, return_result=True)
# Load the id2entity and id2rel dictionaries.
id2entity_file_name = os.path.join('_processed_data', 'test_id2entity.json')
rel2id_file_name = os.path.join(dataset_dir, 'rel2id.json')
id2entity = json.load(codecs.open(id2entity_file_name, "r", "utf-8"))
rel2id = json.load(codecs.open(rel2id_file_name, "r", "utf-8"))
id2rel = {rel2id[item]: item for item in rel2id.keys()}
for list_item in pred_result:
for item in list_item:
item["entpair"] = "#".join(id2entity.get(i) for i in item["entpair"].split("#"))
relationid = str(item["relation"])
item["relation"] = id2rel.get(relationid)
with codecs.open('./test_result/' + dataset_name + "_" + model.encoder + "_" + model.selector + "_pred.json", 'w', 'utf-8') as outfile:
json.dump(pred_result, outfile, ensure_ascii=False, indent=2)
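After this post-processing, every dumped entry pairs human-readable entity names with a relation name; an illustrative entry is shown below (the confidence value comes from framework.test, and its exact field name depends on the nrekit version):

{
    "entpair": "Barack Obama#Honolulu",
    "relation": "/people/person/place_of_birth",
    "score": 0.93
}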
import sklearn.metrics
import matplotlib
# Use 'Agg' so this program can run on a headless remote server.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
result_dir = './test_result'
def main():
models = sys.argv[1:]
for model in models:
x = np.load(os.path.join(result_dir, model + '_x' + '.npy'))
y = np.load(os.path.join(result_dir, model + '_y' + '.npy'))
f1 = (2 * x * y / (x + y + 1e-20)).max()
auc = sklearn.metrics.auc(x=x, y=y)
# plt.plot(x, y, lw=2, label=model + '-auc=' + str(auc))
plt.plot(x, y, lw=2, label=model)
print(model + ' : ' + 'auc = ' + str(auc) + ' | ' + 'max F1 = ' + str(f1))
print(' P@100: {} | P@200: {} | P@300: {} | Mean: {}'.format(y[100], y[200], y[300], (y[100] + y[200] + y[300]) / 3))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.3, 1.0])
plt.xlim([0.0, 0.4])
plt.title('Precision-Recall')
plt.legend(loc="upper right")
plt.grid(True)
plt.savefig(os.path.join(result_dir, 'pr_curve'))
if __name__ == "__main__":
main()
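The _x.npy / _y.npy files read above are the recall and precision arrays that framework.test writes after ranking every bag-level prediction by its score, which is what lets the script report y[100], y[200] and y[300] as P@100/200/300. A minimal sketch of how such arrays are typically derived, assuming a list of (score, is_correct) pairs and the number of gold facts:

import numpy as np

def pr_arrays(scored_preds, total_facts):
    # scored_preds: one (score, is_correct) pair per bag-level prediction.
    scored_preds = sorted(scored_preds, key=lambda p: p[0], reverse=True)
    correct = 0
    precision, recall = [], []
    for k, (_, is_correct) in enumerate(scored_preds, start=1):
        correct += int(is_correct)
        precision.append(correct / k)          # precision among the top-k predictions
        recall.append(correct / total_facts)   # fraction of gold facts recovered so far
    return np.array(recall), np.array(precision)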
import codecs
import json
from uuid import uuid1
def txt2json_dict(readfile, writefile):
f = codecs.open(readfile, 'r', 'utf-8')
f2 = codecs.open(writefile, 'w', 'utf-8')
write_dict = {}
for line in f:
key, value = line.strip().split()
write_dict[key] = value
f2.write(json.dumps(write_dict, ensure_ascii=False, indent=2))
f.close()
f2.close()
def txt2json_vec(readfile, writefile, fromline):
f = codecs.open(readfile, 'r', 'utf-8')
f2 = codecs.open(writefile, 'w', 'utf-8')
write_data = []
lines = f.readlines()
for line in lines[fromline:]:
temp_dict = {}
splitdata = line.strip().split()
word, vec = splitdata[0], splitdata[1:]
temp_dict["word"] = word
temp_dict["vec"] = vec
write_data.append(temp_dict)
f2.write(json.dumps(write_data, ensure_ascii=False, indent=2))
f.close()
f2.close()
def txt2json_data(readfile, writefile):
f = codecs.open(readfile, 'r', 'utf-8')
f2 = codecs.open(writefile, 'w', 'utf-8')
write_data = []
for line in f:
temp_dict = {
"head": {},
"relation": "",
"sentence": "",
"tail": {}
}
splitdata = line.strip().split()
headword, tailword, relation, sentence = splitdata[0], splitdata[1], splitdata[2], splitdata[3:]
# Reuse the id of an entity that already appeared in an earlier instance, so that
# identical entities share one id across the dataset; otherwise mint a fresh uuid.
for item in write_data:
    for known in (item["head"], item["tail"]):
        if not temp_dict["head"] and known and known["word"] == headword:
            temp_dict["head"] = known
        if not temp_dict["tail"] and known and known["word"] == tailword:
            temp_dict["tail"] = known
if not temp_dict["head"]:
    temp_dict["head"] = {"word": headword, "id": str(uuid1())}
if not temp_dict["tail"]:
    temp_dict["tail"] = {"word": tailword, "id": str(uuid1())}
temp_dict["sentence"] = ''.join(sentence)
temp_dict["relation"] = relation
write_data.append(temp_dict)
f2.write(json.dumps(write_data, ensure_ascii=False, indent=2))
f.close()
f2.close()
txt2json_dict('./origindata/relation2id.txt', './data/cndata/rel2id.json')
txt2json_vec('./origindata/vec.txt', './data/cndata/word_vec.json', 1)
txt2json_data('./origindata/train.txt', './data/cndata/train.json')
txt2json_data('./origindata/test.txt', './data/cndata/test.json')
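For reference, the three helpers above expect whitespace-separated input files of the following shape (all contents are illustrative). relation2id.txt holds one "relation id" pair per line:

NA 0
capital_of 1

vec.txt starts with a header line (skipped via fromline=1), followed by "word v1 v2 ..." rows:

20000 50
北京 0.12 -0.03 0.55 ...

train.txt / test.txt hold one "head tail relation sentence-tokens..." instance per line; the sentence tokens are re-joined with ''.join(sentence), i.e. without spaces, which suits Chinese text:

北京 中国 capital_of 北京 是 中国 的 首都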
import tensorflow as tf
import os
import sklearn.metrics
import numpy as np
import sys
import time
def average_gradients(tower_grads):
“”"Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
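As a concrete illustration (the numbers are made up), if two towers report gradients [1., 2.] and [3., 5.] for the same shared variable, the stacked tensor has shape (2, 2), the mean over the tower axis is [2., 3.5], and that averaged gradient is returned together with the first tower's handle to the variable.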
class re_model:
def __init__(self, train_data_loader, batch_size, max_length=120):
self.word = tf.placeholder(dtype=tf.int32, shape=[None, max_length], name='word')
self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, max_length], name='pos1')
self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, max_length], name='pos2')
self.label = tf.placeholder(dtype=tf.int32, shape=[batch_size], name='label')
self.ins_label = tf.placeholder(dtype=tf.int32, shape=[None], name='ins_label')
self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='length')
self.scope = tf.placeholder(dtype=tf.int32, shape=[batch_size, 2], name='scope')
self.train_data_loader = train_data_loader
self.rel_tot = train_data_loader.rel_tot
self.word_vec_mat = train_data_loader.word_vec_mat
def loss(self):
raise NotImplementedError
def train_logit(self):
raise NotImplementedError
def test_logit(self):
raise NotImplementedError
class re_framework:
MODE_BAG = 0 # Train and test the model at bag level.
MODE_INS = 1 # Train and test the model at instance level
def __init__(self, train_data_loader, test_data_loader, max_length=120, batch_size=160):
self.train_data_loader = train_data_loader
self.test_data_loader = test_data_loader
self.sess = None
def one_step_multi_models(self, sess, models, batch_data_gen, run_array, return_label=True):
feed_dict = {}
batch_label = []
for model in models:
batch_data = batch_data_gen.next_batch(batch_data_gen.batch_size // len(models))
feed_dict.update({
model.word: batch_data['word'],
model.pos1: batch_data['pos1'],
model.pos2: batch_data['pos2'],
model.label: batch_data['rel'],
model.ins_label: batch_data['ins_rel'],
model.scope: batch_data['scope'],
model.length: batch_data['length'],
})
if 'mask' in batch_data and hasattr(model, "mask"):
feed_dict.update({model.mask: batch_data['mask']})
batch_label.append(batch_data['rel'])
result = sess.run(run_array, feed_dict)
batch_label = np.concatenate(batch_label)
if return_label:
result += [batch_label]
return result
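Note that one_step_multi_models splits each batch evenly across the per-GPU model copies via batch_data_gen.batch_size // len(models): with the default batch_size of 160 and two towers, each tower is fed 80 bags per step, and the per-tower labels are concatenated back so the caller can still evaluate the full batch.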
def one_step(self, sess, mo