'''
1-2. Word2Vec (Skip-gram) - Embedding Words and Showing the Graph
Paper - Distributed Representations of Words and Phrases and their Compositionality (2013)
'''
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

tf.reset_default_graph()
sentences = [ "i like dog", "i like cat", "i like animal",
"dog cat animal", "apple cat dog like", "dog fish milk like",
"dog cat eyes like", "i like apple", "apple i hate",
"apple i movie book music like", "cat dog hate", "cat dog like"]
word_sequence = " ".join(sentences).split()          # corpus as a flat list of tokens
word_list = list(set(word_sequence))                 # unique vocabulary
word_dict = {w: i for i, w in enumerate(word_list)}  # word -> index
print(word_dict)
batch_size = 20
embedding_size = 2  # 2 dimensions so the embeddings can be plotted directly
voc_size = len(word_list)
def random_batch(data, size):
    # sample `size` (target, context) pairs and one-hot encode both words
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size, replace=False)

    for i in random_index:
        random_inputs.append(np.eye(voc_size)[data[i][0]])  # target word
        random_labels.append(np.eye(voc_size)[data[i][1]])  # context word

    return random_inputs, random_labels
# build skip-gram pairs with a window of 1: (target, left neighbor), (target, right neighbor)
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]

    for w in context:
        skip_grams.append([target, w])
# model: one-hot input -> embedding (hidden layer) -> projection back to the vocabulary
inputs = tf.placeholder(tf.float32, shape=[None, voc_size])
labels = tf.placeholder(tf.float32, shape=[None, voc_size])

W = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))   # embedding matrix
WT = tf.Variable(tf.random_uniform([embedding_size, voc_size], -1.0, 1.0))  # output projection

hidden_layer = tf.matmul(inputs, W)         # [batch_size, embedding_size]
output_layer = tf.matmul(hidden_layer, WT)  # [batch_size, voc_size]

# full softmax cross-entropy over the (small) vocabulary
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=labels))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
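# --- Optional sketch (not in the original script): the 2013 paper trains Skip-gram with
# negative sampling / NCE rather than a full softmax. With this tiny vocabulary the full
# softmax above is fine; a rough TF1.x alternative could look like the commented lines
# below. `nce_labels`, `nce_weights`, and `nce_biases` are hypothetical names introduced
# only for illustration (labels would be integer word ids of shape [batch, 1]).
# nce_labels = tf.placeholder(tf.int64, shape=[None, 1])
# nce_weights = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
# nce_biases = tf.Variable(tf.zeros([voc_size]))
# cost_nce = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
#                                          labels=nce_labels, inputs=hidden_layer,
#                                          num_sampled=5, num_classes=voc_size))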
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss = sess.run([optimizer, cost], feed_dict={inputs: batch_inputs, labels: batch_labels})

        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    trained_embeddings = W.eval()  # [voc_size, embedding_size] numpy array
    print(trained_embeddings)
# plot each word at its 2-D embedding coordinates
for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
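# --- Optional usage sketch (not in the original script): with the trained 2-D embeddings,
# cosine similarity gives the words closest to a query word. `query` is an illustrative
# choice; any word from word_list works.
query = 'dog'
query_vec = trained_embeddings[word_dict[query]]
norms = np.linalg.norm(trained_embeddings, axis=1) * np.linalg.norm(query_vec)
cosine = trained_embeddings @ query_vec / norms
nearest = [word_list[i] for i in np.argsort(-cosine) if word_list[i] != query][:3]
print('Nearest to %s:' % query, nearest)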