9 Recurrent Neural Networks: Networks with Memory (2)

9-25 rnnwordtest
Train an RNN on a short passage of text to build a character-level model; the trained model can then generate new text of its own in the style it has learned.
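The training pairs that the listing builds are simply sliding windows of n_input characters mapped to the character that follows them. A minimal sketch of that pairing (illustration only, not part of the listing; the listing additionally starts at a random offset and advances it by n_input + 1 each step):

```
# Sketch: build (4-character context -> next character) training pairs.
n_input = 4
text = "在尘世的纷扰中,只要心头悬挂着远方的灯光"  # stand-in for wordstest.txt

pairs = [(text[i:i + n_input], text[i + n_input])
         for i in range(len(text) - n_input)]
print(pairs[0])  # ('在尘世的', '纷')
```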

wordstest.txt (sample file; place it in the same directory as the code):

在尘世的纷扰中,只要心头悬挂着远方的灯光,我们就会坚持不懈地走,理想为我们灌注了精神的蕴藉。所以,生活再平凡、再普通、再琐碎,我们都要坚持一种信念,默守一种精神,为自己积淀站立的信心,前行的气力。

Before running the program, create the log/rnnword directory (log, then rnnword inside it) under the code's working directory, or create it programmatically as sketched below.
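A small snippet like the following (a convenience addition, not part of the original listing) creates the checkpoint directory at startup:

```
import os

# Create the checkpoint directory used by the example if it does not exist yet.
os.makedirs("log/rnnword", exist_ok=True)
```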

Program:

#1 Define basic utility functions
```
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import time
from collections import Counter

start_time = time.time()


def elapsed(sec):
    if sec < 60:
        return str(sec) + " sec"
    elif sec < (60 * 60):
        return str(sec / 60) + " min"
    else:
        return str(sec / (60 * 60)) + " hr"


# Target log path
tf.reset_default_graph()
training_file = 'wordstest.txt'


# 处理多个中文文件
def readalltxt(txt_files):
    labels = []
    for txt_file in txt_files:
        target = get_ch_lable(txt_file)
        labels.append(target)
    return labels


# 处理汉字
def get_ch_lable(txt_file):
    labels = ""
    with open(txt_file, 'rb') as f:
        for label in f:
            # labels =label.decode('utf-8')
            labels = labels + label.decode('gb2312')

    return labels


# 优先转文件里的字符到向量
def get_ch_lable_v(txt_file, word_num_map, txt_label=None):
    words_size = len(word_num_map)
    to_num = lambda word: word_num_map.get(word, words_size)
    if txt_file != None:
        txt_label = get_ch_lable(txt_file)

    labels_vector = list(map(to_num, txt_label))
    return labels_vector

'''----------------------------------------------------------'''
#2 Preprocess the sample text
training_data = get_ch_lable(training_file)

print("Loaded training data...")

print(len(training_data))
counter = Counter(training_data)
words = sorted(counter)
words_size = len(words)
word_num_map = dict(zip(words, range(words_size)))

print('字表大小:', words_size)
wordlabel = get_ch_lable_v(training_file, word_num_map)

'''----------------------------------------------------------'''
#3 Set the training parameters
learning_rate = 0.001
training_iters = 10000
display_step = 1000
n_input = 4

n_hidden1 = 256
n_hidden2 = 512
n_hidden3 = 512
# tf Graph input
x = tf.placeholder("float", [None, n_input, 1])
wordy = tf.placeholder("float", [None, words_size])
'''----------------------------------------------------------'''
#4 Define the network structure
x1 = tf.reshape(x, [-1, n_input])
x2 = tf.split(x1, n_input, 1)
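# Note: static_rnn expects a Python list of per-time-step tensors, so the
# [batch, n_input, 1] input is reshaped to [batch, n_input] and then split
# into n_input tensors of shape [batch, 1], one per character position.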
# 3-layer stacked LSTM: 256, 512 and 512 hidden units per layer.
rnn_cell = rnn.MultiRNNCell([rnn.LSTMCell(n_hidden1), rnn.LSTMCell(n_hidden2), rnn.LSTMCell(n_hidden3)])

#通过RNN得到输出 generate prediction
outputs, states = rnn.static_rnn(rnn_cell, x2, dtype=tf.float32)

#通过全连接输出指定维度  last output
pred = tf.contrib.layers.fully_connected(outputs[-1], words_size, activation_fn=None)
'''----------------------------------------------------------'''
#5 Define the loss and optimizer
# Loss optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=wordy))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(wordy, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
'''----------------------------------------------------------'''
#6 Train the model
savedir = "log/rnnword/"
saver = tf.train.Saver(max_to_keep=1)  # 生成saver

# 启动session
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    step = 0
    offset = random.randint(0, n_input + 1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    kpt = tf.train.latest_checkpoint(savedir)
    print("kpt:", kpt)
    startepo = 0
    if kpt != None:
        saver.restore(session, kpt)
        ind = kpt.find("-")
        startepo = int(kpt[ind + 1:])
        print(startepo)
        step = startepo

    while step < training_iters:

        # 随机取一个位置偏移
        if offset > (len(training_data) - end_offset):
            offset = random.randint(0, n_input + 1)

        inwords = [[wordlabel[i]] for i in range(offset, offset + n_input)]  # 按照指定的位置偏移获取后面的4个文字向量当作输入

        inwords = np.reshape(np.array(inwords), [-1, n_input, 1])

        out_onehot = np.zeros([words_size], dtype=float)
        out_onehot[wordlabel[offset + n_input]] = 1.0
        out_onehot = np.reshape(out_onehot, [1, -1])  # 所有的字都变成onehot

        _, acc, lossval, onehot_pred = session.run([optimizer, accuracy, loss, pred],
                                                   feed_dict={x: inwords, wordy: out_onehot})
        loss_total += lossval
        acc_total += acc
        if (step + 1) % display_step == 0:
            print("Iter= " + str(step + 1) + ", Average Loss= " + \
                  "{:.6f}".format(loss_total / display_step) + ", Average Accuracy= " + \
                  "{:.2f}%".format(100 * acc_total / display_step))
            acc_total = 0
            loss_total = 0
            in2 = [words[wordlabel[i]] for i in range(offset, offset + n_input)]
            out2 = words[wordlabel[offset + n_input]]
            out_pred = words[int(tf.argmax(onehot_pred, 1).eval())]
            print("%s - [%s] vs [%s]" % (in2, out2, out_pred))
            saver.save(session, savedir + "rnnwordtest.cpkt", global_step=step)
        step += 1
        offset += (n_input + 1)  # 中间隔了一个,作为预测

    print("Finished!")
    saver.save(session, savedir + "rnnwordtest.cpkt", global_step=step)
    print("Elapsed time: ", elapsed(time.time() - start_time))
    print("-------------------------------------------------------------")

    #7 Run the model to generate sentences
    '''----------------------------------------------------------'''
    while True:
        prompt = "请输入%s个字: " % n_input
        sentence = input(prompt)
        inputword = sentence.strip()

        if len(inputword) != n_input:
            print("您输入的字符长度为:", len(inputword), "请输入4个字")
            continue
        try:
            inputword = get_ch_lable_v(None, word_num_map, inputword)

            for i in range(32):
                keys = np.reshape(np.array(inputword), [-1, n_input, 1])
                onehot_pred = session.run(pred, feed_dict={x: keys})
                onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
                sentence = "%s%s" % (sentence, words[onehot_pred_index])
                inputword = inputword[1:]
                inputword.append(onehot_pred_index)
            print(sentence)
        except:
            print("该字我还没学会")

```

Results:

Loaded training data...
98
字表大小: 69

kpt: None
Iter= 1000, Average Loss= 3.622068, Average Accuracy= 10.10%
['的', '蕴', '藉', '。'] - [所] vs [们]
Iter= 2000, Average Loss= 1.728836, Average Accuracy= 46.60%
['灯', '光', ',', '我'] - [们] vs [念]

Iter= 3000, Average Loss= 0.857182, Average Accuracy= 66.60%
['信', '念', ',', '默'] - [守] vs [守]
Iter= 4000, Average Loss= 0.460083, Average Accuracy= 84.00%
['的', '信', '心', ','] - [前] vs [前]
Iter= 5000, Average Loss= 0.307164, Average Accuracy= 87.90%
['再', '琐', '碎', ','] - [我] vs [我]
Iter= 6000, Average Loss= 0.367493, Average Accuracy= 87.70%
[',', '我', '们', '就'] - [会] vs [会]
Iter= 7000, Average Loss= 0.279600, Average Accuracy= 89.70%
['坚', '持', '不', '懈'] - [地] vs [地]
Iter= 8000, Average Loss= 0.079822, Average Accuracy= 96.10%
['行', '的', '气', '力'] - [。] vs [。]
Iter= 9000, Average Loss= 0.356419, Average Accuracy= 88.70%
['挂', '着', '远', '方'] - [的] vs [的]
Iter= 10000, Average Loss= 0.160459, Average Accuracy= 94.10%
['精', '神', '的', '蕴'] - [藉] vs [藉]
Finished!

Elapsed time:  13.827731307347616 min
-------------------------------------------------------------
请输入4个字: 生活平凡
生活平凡凡会坚持通远方的一自的气,藉的信心一种纷扰,灯光一种精神,为自己
请输入4个字: 平凡生活
平凡生活了精神的蕴藉。所以,生活蕴藉的信心一种纷扰,灯光一种精神,为自己
请输入4个字: 琐碎平凡
琐碎平凡都要坚持一种信念,默守一种精神,为自己积蕴站立为心以种注了精只们
请输入4个字: 

9-26 word2vect
Train word2vec with the CBOW model, visualize the learned word-vector relationships, and along the way practice the nce_loss function and word-embedding techniques to implement your own word2vec.
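Before the full listing, a small numpy sketch (illustration only, not the example's code) of what the embedding lookup amounts to: the embedding matrix holds one row per vocabulary word, and looking up a batch of word ids simply gathers those rows, which is what tf.nn.embedding_lookup does in the program below.

```
import numpy as np

words_size, embedding_size = 5, 3   # toy sizes for illustration
embeddings = np.random.uniform(-1, 1, (words_size, embedding_size))

word_ids = np.array([2, 0, 4])      # a "batch" of word indices
embed = embeddings[word_ids]        # row gather, like tf.nn.embedding_lookup
print(embed.shape)                  # (3, 3)
```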

人体阴阳与电能.txt (sample file; place it in the same directory as the code):

人体阴阳与电能

阴应该是身体里内在的精力储存,可以理解成电池里的电量。
阳是将阴分解后化成的精力。可以理解成电流或电压。当有电流时机器开始活动。当电压太高时,机器会被烧毁。人呢,就相当于机器了。举个最简单的例子,市面上的便宜遥控玩具车。在电量低时,走的慢。电量高时,走的快。使用南孚之类的高能电池,玩不了几次,就坏了。

对于运动对身体的影响,我的理解是这样的。人体在运动的同时,也是消耗体内的阴的,将其转化为阳来支配身体运动。而不断的负荷运动中,会产生肌肉的生长,肌肉可以理解成变压器,将身体传过来的电压和电流提升。使终端可以使用更大的电压与电流。所以多运动的人,尤其搏击类剧烈运动,身体有两方面不同,第一体内的电量输出速度会比别人大。第二肢体的电量消耗(也就是功率)要比别人的大。假如一般人比作一个配有300升的油箱的QQ,搏击运动员就相当于一个配有300升油箱的2.0排量汽车。排量不同,跑起来的速度也不同。更高体能或爆发力的人就相当于一个配有300升油箱的4.0排量汽车,甚至更高,拥有跑得更快的能力。

适量的运动完为什么会觉得精神,舒服。因为全身的器官接受的电压都提升了。电足了自然运转的就快,功能更加灵敏,就很舒服。

运动过度为什么会觉得疲惫。体内的阴转化为阳的过程也是有一定速度的。通过过运动,可以将这个速度不断的提升。这可以理解成电池输出的电流大小,运动越多,输出的电流就越大,单位时间来提供身体所需要的电就越多。当身体的各个器官加大用电量,身体就会加大电流的输出。相当于电池快速放电,会短时间内将电量放空,体内变成一个低电量状态,然后就会感到累,没有劲。这时候通过睡觉,休息,才可以恢复。这个过程就比如使用充电剃须刀,按住一直让它工作,可能1个多小时就没电了,然后不动它,放个一天,再用,又有电了。电池在快速放电过程中,会有暂时的放尽,但并不能把所有的都放完,事后还会恢复。

人体与电池的连系。电池也是化学反映,内部原理是一部分物质在转化成另一部分物质的时候,放出了电能,对外提供能量。而人体呢,也是如此。随着细胞的工作分解糖类的过程中放出能量,由神经传送出去。但是每个细胞都是有寿命的,于是身体需要细胞有自我复制分裂过程。可以理解为只要细胞不停的复制不停的分解糖,我们就有不停的电量来源,就能活下来。不幸的是,我们的细胞分裂次数是固定的,伴随着出生,由基因决定。每次分裂,端粒体都会缩短,当减小到没有,细胞复制就会出现错误,引发病变,身体的某些功能就会衰退。也就是我们体内的阴是固定的。电池的电量是固定的,并且不能更换。

运动为什么会年轻,运动会加大对应器官的电流,使其更高效的工作,为了产生电能分裂细胞也就得加速,从而使各个器官都有更新的细胞换上。新的细胞功能更好,整体器官性能更好,人会更年轻。


运动不能长寿,道理很简单,电用得越快,死得越早。一些搏击高手,通过长期训练可以瞬间调用体内大量电能,达到更好的反映与杀伤力。但是阳太盛而阴不足,故不能长寿。

打坐,修道可以长寿,也就是节约用电而已。另外在道家的长期摸索中,总结出的一些方法让人体的电池保养的更好而已。就相当于对着笔记本的电池友好说明书严格执行而已。可以延长电池寿命,但也无法解决电量的用尽。

站桩、马步、平板支撑等静挺类运动为什么会练完更加精神。静挺类运动,相当于开车时挂空挡,轰油门,车不走,不做功,不会消耗过多的电能。但发动机在加转相当于电池放电量在加大,让身体个各器官的电压增大,功能更灵敏感觉更舒服。当眼部的电压增大,更加灵敏,就会感觉看外界的事物更亮,自己更有神。所以静挺运动只是刺激电池加速放电,多做静挺运动会加速电池放电的本领,不会增加肢体器官的多消耗电量的本领。

Program:

    #1 Import the required modules
    import numpy as np
    import tensorflow as tf
    import random
    import collections
    from collections import Counter
    import jieba
    
    from sklearn.manifold import TSNE
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
    mpl.rcParams['font.family'] = 'STSong'
    mpl.rcParams['font.size'] = 20
    '''-----------------------------------------------------------------'''
    #2 Prepare the sample and build the dataset
    training_file = '人体阴阳与电能.txt'
    
    # 中文字
    def get_ch_lable(txt_file):
        labels = ""
        with open(txt_file, 'rb') as f:
            for label in f:
                # labels =label.decode('utf-8')
                labels = labels + label.decode('gb2312')
    
        return labels
    
    
    # 分词
    def fenci(training_data):
        seg_list = jieba.cut(training_data)  # 默认是精确模式  
        training_ci = " ".join(seg_list)
        training_ci = training_ci.split()
        # 以空格将字符串分开
        training_ci = np.array(training_ci)
        training_ci = np.reshape(training_ci, [-1, ])
        return training_ci
    
    
    def build_dataset(words, n_words):
        """Process raw inputs into a dataset."""
        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(n_words - 1))
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reversed_dictionary
    
    
    training_data = get_ch_lable(training_file)
    print("总字数", len(training_data))
    training_ci = fenci(training_data)
    # print(training_ci)
    print("总词数", len(training_ci))
    training_label, count, dictionary, words = build_dataset(training_ci, 350)
    
    words_size = len(dictionary)
    print("字典词数", words_size)
    # print(training_label)#将文本转为词向量
    # print(words)#每个编号对应的词
    # print(dictionary)#每个词对应的编号
    # print(count)#每个词对应的个数
    ####################################################
    print('Sample data', training_label[:10], [words[i] for i in training_label[:10]])
    print("----------------------------------------------------------------")
    
    #3 Generate batches of data
    data_index = 0
    
    def generate_batch(data, batch_size, num_skips, skip_window):
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
    
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        buffer = collections.deque(maxlen=span)
    
        if data_index + span > len(data):
            data_index = 0
    
        buffer.extend(data[data_index:data_index + span])
        data_index += span
    
        for i in range(batch_size // num_skips):
            target = skip_window  # target label at the center of the buffer
            targets_to_avoid = [skip_window]
            for j in range(num_skips):
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)
    
                targets_to_avoid.append(target)
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[target]
    
            if data_index == len(data):
                # print(data_index,len(data),span,len(data[:span]))
                # buffer[:] = data[:span]
                buffer = data[:span]
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
    
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index + len(data) - span) % len(data)
        return batch, labels
    
    
    batch, labels = generate_batch(training_label, batch_size=8, num_skips=2, skip_window=1)
    
    for i in range(8):  # 取第一个字,后一个是标签,再取其前一个字当标签,
        print(batch[i], words[batch[i]], '->', labels[i, 0], words[labels[i, 0]])
    '''---------------------------------------------------------------------'''
    
    print("----------------------------------------------------------------")
    #4 Set the sampling parameters
    batch_size = 128
    embedding_size = 128  #embedding vector的维度 Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  #1个input生成2个标签 How many times to reuse an input to generate a label.
    
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = np.int32(words_size / 2)  #取样数据的分布范围 Only pick dev samples in the head of the distribution.
    print("valid_window", valid_window)
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # 0-words_size/2,中的数取16个。不能重复。
    num_sampled = 64  # Number of negative examples to sample.
    
    '''----------------------------------------------------------------------'''
    #5 Define the model variables
    tf.reset_default_graph()
    
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    #CPU上执行 Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(tf.random_uniform([words_size, embedding_size], -1.0, 1.0))  # one 128-dim embedding vector per vocabulary word
    
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(tf.truncated_normal([words_size, embedding_size],
                                                      stddev=1.0 / tf.sqrt(np.float32(embedding_size))))
    
        nce_biases = tf.Variable(tf.zeros([words_size]))
        '''--------------------------------------------------------------------------'''
    #6 Define the loss function and optimizer
    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                       labels=train_labels, inputs=embed,
                       num_sampled=num_sampled, num_classes=words_size))
    
    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    print("________________________", similarity.shape)
    '''------------------------------------------------------------------------------'''
    #8 Start the session and train the model
    # Begin training.
    num_steps = 100001
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print('Initialized')
    
        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(training_label, batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    
            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val
    
            # 通过打印测试可以看到  embed的值在逐渐的被调节
            #        emv = sess.run(embed,feed_dict = {train_inputs: [37,18]})
            #        print("emv-------------------",emv[0])
    
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
            '''---------------------------------------------------------------------'''
            #9 Feed in validation data and show the nearest neighbors
            # Note that this is expensive (~20% slowdown if computed every 500 steps)
    
            if step % 10000 == 0:
                sim = similarity.eval(session=sess)
                # print(valid_size)
                for i in range(valid_size):
                    valid_word = words[valid_examples[i]]
                    # print("valid_word",valid_word)#16
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # argsort函数返回的是数组值从小到大的索引值
                    # print("nearest",nearest,top_k)
                    log_str = 'Nearest to %s:' % valid_word
    
                    for k in range(top_k):
                        close_word = words[nearest[k]]
                        log_str = '%s,%s' % (log_str, close_word)
                    print(log_str)
    
        print("-------------------------------------------------------")
    
        #10 Visualize the word vectors
        final_embeddings = normalized_embeddings.eval()
    
    
    def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
        assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                         ha='right', va='bottom')
        plt.savefig(filename)
    
    
    try:
        # pylint: disable=g-import-not-at-top
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        plot_only = 80  # plot only the first 80 words
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [words[i] for i in range(plot_only)]
        # print(labels)
        plot_with_labels(low_dim_embs, labels)
    
    except ImportError:
        print('Please install sklearn, matplotlib, and scipy to show embeddings.')

Results:

总字数 1567

总词数 961
字典词数 350
Sample data [25, 132, 32, 26, 27, 133, 8, 9, 80, 134] ['人体', '阴阳', '与', '电能', '阴', '应该', '是', '身体', '里', '内在']
----------------------------------------------------------------
132 阴阳 -> 32 与
132 阴阳 -> 25 人体
32 与 -> 26 电能
32 与 -> 132 阴阳
26 电能 -> 27 阴
26 电能 -> 32 与
27 阴 -> 26 电能
27 阴 -> 133 应该
----------------------------------------------------------------
valid_window 175

________________________ (16, 350)
Initialized
Average loss at step  0 :  105.77994537353516
Nearest to 例子:,精力,到,衰退,出生,能活,大小,体内,神经
Nearest to 南孚:,灵敏,感觉,玩具车,方法,市面上,人,精力,电池
Nearest to 人体:,呢,搏击,是,年轻,别人,和,方面,。
Nearest to 舒服:,300,会,一个,人会,的,内,道理,更换
Nearest to 复制:,单位,决定,高手,道家,死,玩,内在,后
Nearest to 不能:,搏击,杀伤力,用电量,电能,精力,分解,机器,到
Nearest to 提升:,理解,剧烈运动,寿命,灵敏,输出,和,电池,年轻
Nearest to 走:,多,各个,类,减小,反映,还会,放电,排量
Nearest to 阴阳:,但是,坏,终端,每次,为什么,杀伤力,其,不动
Nearest to 更加:,汽车,两,人体,运动过度,所有,说明书,快,到
Nearest to 阳是:,比,增大,适量,感到,不会,运动会,下来,能力
Nearest to 升:,时候,简单,不同,比作,才,某些,时,物质
Nearest to 快速:,为,但是,分解,感到,低电量,达到,消耗,2.0
Nearest to 300:,舒服,另外,一些,不断,并且,一部分,的,转化
Nearest to 遥控:,几次,打坐,传过来,比如,一个,需要,不足,)
Nearest to 大:,一天,多,更好,坏,尤其,增大,后,端粒
Average loss at step  2000 :  3.546946987092495
Average loss at step  4000 :  2.634342535763979
Average loss at step  6000 :  2.6235247813761235
Average loss at step  8000 :  2.59721028599143
Average loss at step  10000 :  2.6027329542636872
Nearest to 例子:,精力,到,出生,衰退,体内,运动,大小,传过来
Nearest to 南孚:,玩具车,灵敏,感觉,方法,传过来,市面上,人,每个
Nearest to 人体:,年轻,方面,和,剧烈运动,精力,搏击,这,按住
Nearest to 舒服:,人会,300,一个,会,长寿,道理,更换,的
Nearest to 复制:,单位,决定,道家,高手,死,玩,传过来,不停
Nearest to 不能:,搏击,用电量,过,到,开始,遥控,机器,阴是
Nearest to 提升:,输出,和,理解,寿命,我,电池,如此,电压
Nearest to 走:,多,我们,各个,反映,UNK,类,减小,大
Nearest to 阴阳:,提供,我们,然后,一个,不停,油箱,摸索,衰退
Nearest to 更加:,汽车,两,到,或,所有,状态,运动过度,1
Nearest to 阳是:,适量,增大,运动过度,UNK,能力,感到,第二,比
Nearest to 升:,时候,简单,物质,不同,某些,才,由,寿命
Nearest to 快速:,为,但是,加速,感到,分解,一定,UNK,低电量
Nearest to 300:,舒服,另外,一部分,一些,不断,并且,长寿,出生
Nearest to 遥控:,打坐,传过来,几次,比如,一个,需要,),不能
Nearest to 大:,多,一天,更好,新,QQ,走,增大,阴
Average loss at step  12000 :  2.5970189170241356
Average loss at step  14000 :  2.5994799932837487
Average loss at step  16000 :  2.5864335169792176
Average loss at step  18000 :  2.5891829232275487
Average loss at step  20000 :  2.58370506478101
Nearest to 例子:,精力,到,出生,运动,体内,衰退,大小,传过来
Nearest to 南孚:,玩具车,灵敏,方法,感觉,传过来,阴阳,市面上,每个
Nearest to 人体:,阴阳,年轻,精力,方面,和,剧烈运动,这,另外
Nearest to 舒服:,人会,300,阴阳,道理,一个,长寿,会,UNK
Nearest to 复制:,单位,道家,决定,高手,传过来,不停,死,玩
Nearest to 不能:,阴阳,过,用电量,开始,搏击,遥控,到,阴是
Nearest to 提升:,阴阳,和,输出,理解,如此,电压,寿命,我
Nearest to 走:,多,我们,阴阳,各个,UNK,反映,类,大
Nearest to 阴阳:,UNK,电压,电流,用电量,减小,配有,马步,保养
Nearest to 更加:,汽车,阴阳,两,状态,到,UNK,或,所有
Nearest to 阳是:,适量,UNK,运动过度,第二,能力,增大,年轻,下来
Nearest to 升:,时候,阴阳,物质,简单,寿命,某些,不同,有
Nearest to 快速:,为,加速,但是,感到,UNK,分解,阴阳,一定
Nearest to 300:,舒服,另外,一部分,阴阳,并且,不断,一些,长寿
Nearest to 遥控:,打坐,传过来,几次,比如,一个,不能,需要,不足
Nearest to 大:,多,更好,一天,阴阳,新,QQ,走,阴
Average loss at step  22000 :  2.576025656312704
Average loss at step  24000 :  2.5769059507846834
Average loss at step  26000 :  2.570913948327303
Average loss at step  28000 :  2.5659243043623863
Average loss at step  30000 :  2.5711384868621825
Nearest to 例子:,精力,到,出生,运动,体内,传过来,大小,衰退
Nearest to 南孚:,玩具车,灵敏,方法,传过来,感觉,市面上,可以,每个
Nearest to 人体:,年轻,精力,方面,和,另外,剧烈运动,这,连系
Nearest to 舒服:,人会,300,道理,长寿,就,打坐,一个,会
Nearest to 复制:,道家,单位,决定,高手,传过来,不停,玩,电
Nearest to 不能:,过,遥控,开始,用电量,阴是,搏击,到,再用
Nearest to 提升:,和,理解,如此,输出,另外,我,寿命,电压
Nearest to 走:,我们,多,各个,大,类,便宜,可能,反映
Nearest to 阴阳:,而,与,让,在,呢,用电量,马步,但是
Nearest to 更加:,汽车,两,状态,所有,到,或,同时,1
Nearest to 阳是:,适量,运动过度,第二,能力,下来,年轻,增大,比
Nearest to 升:,时候,物质,寿命,简单,某些,有,更好,不同
Nearest to 快速:,加速,为,但是,感到,分解,UNK,的,一定
Nearest to 300:,舒服,另外,一部分,并且,不断,一些,出生,长寿
Nearest to 遥控:,打坐,传过来,一个,几次,比如,不能,),不足
Nearest to 大:,多,更好,一天,QQ,新,走,阴,人
Average loss at step  32000 :  2.5729226052761076
Average loss at step  34000 :  2.58018412822485
Average loss at step  36000 :  2.586327002108097
Average loss at step  38000 :  2.5677035285234453
Average loss at step  40000 :  2.5527956836521626
Nearest to 例子:,精力,到,运动,出生,体内,端粒,生长,传过来
Nearest to 南孚:,玩具车,灵敏,方法,传过来,感觉,市面上,300,每个
Nearest to 人体:,阴阳,精力,年轻,杀伤力,另外,方面,连系,和
Nearest to 舒服:,人会,300,道理,就,长寿,打坐,更换,另外
Nearest to 复制:,道家,单位,决定,传过来,有,不停,高手,电
Nearest to 不能:,过,遥控,阴是,开始,用电量,多,越,到
Nearest to 提升:,和,如此,另外,我,理解,寿命,细胞,输出
Nearest to 走:,我们,多,各个,便宜,大,器官,可能,搏击
Nearest to 阴阳:,电压,衰退,新,减小,化学,时候,用电量,延长
Nearest to 更加:,汽车,两,更好,状态,到,所有,同时,工作
Nearest to 阳是:,适量,第二,运动过度,能力,下来,年轻,电量,增大
Nearest to 升:,时候,寿命,物质,简单,某些,更好,负荷,才
Nearest to 快速:,加速,为,但是,UNK,感到,分解,阴阳,一定
Nearest to 300:,另外,舒服,一部分,并且,阴阳,不断,一些,出生
Nearest to 遥控:,打坐,传过来,一个,比如,不能,几次,不足,)
Nearest to 大:,多,更好,QQ,新,一天,阴,人,走
Average loss at step  42000 :  2.567821709230542
Average loss at step  44000 :  2.5641708382368087
Average loss at step  46000 :  2.5830851689279077
Average loss at step  48000 :  2.561120181247592
Average loss at step  50000 :  2.5722832694500686
Nearest to 例子:,精力,阴阳,到,运动,出生,体内,生长,端粒
Nearest to 南孚:,玩具车,灵敏,传过来,方法,感觉,市面上,阴阳,可以
Nearest to 人体:,杀伤力,精力,年轻,电流,另外,连系,方面,和
Nearest to 舒服:,人会,300,长寿,道理,就,打坐,更换,阴阳
Nearest to 复制:,道家,单位,有,决定,传过来,不停,电,过程
Nearest to 不能:,过,遥控,越,阴是,用电量,开始,多,再用
Nearest to 提升:,和,如此,另外,细胞,寿命,我,理解,电压
Nearest to 走:,我们,阴阳,各个,UNK,便宜,多,器官,可能
Nearest to 阴阳:,快,就,体内,电量,可以,是,糖,输出
Nearest to 更加:,汽车,两,更好,状态,工作,同时,到,细胞
Nearest to 阳是:,适量,第二,电量,运动过度,能力,下来,年轻,增大
Nearest to 升:,时候,寿命,物质,简单,负荷,某些,更好,有
Nearest to 快速:,加速,为,但是,感到,UNK,分解,一定,高时
Nearest to 300:,另外,舒服,一部分,并且,不断,出生,再用,长寿
Nearest to 遥控:,阴阳,打坐,传过来,不能,一个,比如,几次,不足
Nearest to 大:,多,阴阳,更好,QQ,新,一天,人,阴
Average loss at step  52000 :  2.568149507522583
Average loss at step  54000 :  2.566859869718552
Average loss at step  56000 :  2.561292907476425
Average loss at step  58000 :  2.5703523154854775
Average loss at step  60000 :  2.5685375665426253
Nearest to 例子:,精力,到,体内,出生,运动,生长,端粒,传过来
Nearest to 南孚:,玩具车,灵敏,传过来,方法,可以,感觉,市面上,300
Nearest to 人体:,杀伤力,电流,精力,另外,年轻,连系,电压,和
Nearest to 舒服:,人会,道理,长寿,300,就,打坐,更换,累
Nearest to 复制:,道家,有,单位,不停,传过来,过程,决定,电
Nearest to 不能:,过,越,遥控,用电量,多,阴是,开始,再用
Nearest to 提升:,和,如此,另外,细胞,慢,寿命,我,理解
Nearest to 走:,我们,器官,便宜,各个,生长,可能,搏击,能量
Nearest to 阴阳:,其,更好,可以,能量,长期,内,使用,转化成
Nearest to 更加:,汽车,更好,两,状态,工作,细胞,同时,总结
Nearest to 阳是:,适量,电量,第二,运动过度,能力,下来,年轻,增大
Nearest to 升:,时候,寿命,物质,负荷,某些,简单,更好,由
Nearest to 快速:,加速,为,但是,感到,UNK,分解,一定,高时
Nearest to 300:,另外,舒服,一部分,并且,再用,出生,不断,生长
Nearest to 遥控:,不能,传过来,打坐,比如,一个,几次,不足,需要
Nearest to 大:,多,QQ,新,更好,人,阴,一天,走
Average loss at step  62000 :  2.5638422949910162
Average loss at step  64000 :  2.556031964302063
Average loss at step  66000 :  2.558420734167099
Average loss at step  68000 :  2.55505197802186
Average loss at step  70000 :  2.5635871703326703
Nearest to 例子:,精力,到,体内,运动,出生,生长,端粒,放尽
Nearest to 南孚:,玩具车,传过来,方法,灵敏,300,可以,市面上,比如
Nearest to 人体:,杀伤力,电流,精力,另外,连系,电压,年轻,方面
Nearest to 舒服:,人会,道理,长寿,打坐,就,300,更换,累
Nearest to 复制:,道家,有,传过来,单位,过程,不停,电,决定
Nearest to 不能:,越,过,遥控,多,阴是,开始,再用,用电量
Nearest to 提升:,和,如此,另外,慢,大,寿命,我,细胞
Nearest to 走:,我们,便宜,可能,器官,生长,搏击,大,各个
Nearest to 阴阳:,呢,让,与,而,在,衰退,对应,延长
Nearest to 更加:,更好,汽车,两,工作,状态,同时,细胞,通过
Nearest to 阳是:,适量,电量,第二,运动过度,能力,下来,年轻,增大
Nearest to 升:,时候,寿命,物质,负荷,某些,简单,更好,人
Nearest to 快速:,加速,为,UNK,但是,感到,分解,一定,成
Nearest to 300:,另外,舒服,一部分,并且,再用,出生,生长,不断
Nearest to 遥控:,打坐,传过来,不能,比如,一个,几次,不足,需要
Nearest to 大:,多,QQ,新,更好,人,一天,阴,快
Average loss at step  72000 :  2.567703256070614
Average loss at step  74000 :  2.558490141928196
Average loss at step  76000 :  2.560944205969572
Average loss at step  78000 :  2.5581597674190997
Average loss at step  80000 :  2.556728454440832
Nearest to 例子:,精力,体内,到,运动,出生,细胞,生长,端粒
Nearest to 南孚:,玩具车,阴阳,传过来,方法,灵敏,300,比如,市面上
Nearest to 人体:,杀伤力,阴阳,电流,精力,另外,电压,连系,年轻
Nearest to 舒服:,人会,长寿,道理,打坐,就,300,更换,不同
Nearest to 复制:,有,道家,过程,传过来,细胞,单位,不停,电
Nearest to 不能:,越,过,遥控,阴阳,多,再用,阴是,负荷
Nearest to 提升:,和,另外,如此,慢,细胞,大,电压,寿命
Nearest to 走:,我们,器官,生长,可能,便宜,搏击,能量,大
Nearest to 阴阳:,电压,人体,电流,但是,杀伤力,另外,道家,太盛
Nearest to 更加:,更好,汽车,细胞,工作,状态,两,同时,总结
Nearest to 阳是:,电量,适量,第二,运动过度,能力,下来,年轻,这个
Nearest to 升:,时候,寿命,物质,负荷,某些,体内,人,简单
Nearest to 快速:,加速,为,但是,阴阳,感到,UNK,一定,寿命
Nearest to 300:,阴阳,另外,舒服,一部分,并且,再用,出生,生长
Nearest to 遥控:,不能,打坐,传过来,比如,一个,几次,不足,需要
Nearest to 大:,多,QQ,新,人,更好,快,阴,提升
Average loss at step  82000 :  2.5498250132352114
Average loss at step  84000 :  2.5451384124159815
Average loss at step  86000 :  2.551382177114487
Average loss at step  88000 :  2.5645904213786124
Average loss at step  90000 :  2.5656556817889213
Nearest to 例子:,精力,体内,到,出生,运动,生长,更好,细胞
Nearest to 南孚:,阴阳,玩具车,传过来,方法,灵敏,比如,300,可以
Nearest to 人体:,杀伤力,电流,阴阳,电压,另外,连系,精力,年轻
Nearest to 舒服:,人会,阴阳,道理,长寿,打坐,就,不同,300
Nearest to 复制:,有,道家,阴阳,过程,传过来,都,细胞,电
Nearest to 不能:,阴阳,越,过,遥控,多,再用,阴是,开始
Nearest to 提升:,阴阳,和,另外,慢,如此,大,细胞,电足
Nearest to 走:,我们,生长,器官,便宜,可能,阴阳,大,甚至
Nearest to 阴阳:,打坐,另外,而已,电压,新,得,、,慢
Nearest to 更加:,更好,阴阳,汽车,工作,状态,细胞,两,同时
Nearest to 阳是:,电量,阴阳,适量,第二,运动过度,下来,能力,这个
Nearest to 升:,阴阳,时候,寿命,负荷,物质,某些,人,体内
Nearest to 快速:,加速,阴阳,为,但是,UNK,感到,分解,成
Nearest to 300:,阴阳,一部分,另外,舒服,再用,并且,出生,生长
Nearest to 遥控:,阴阳,不能,传过来,打坐,比如,一个,几次,不足
Nearest to 大:,QQ,多,新,人,阴阳,快,更好,阴
Average loss at step  92000 :  2.547520384281874
Average loss at step  94000 :  2.546669751137495
Average loss at step  96000 :  2.56224885520339
Average loss at step  98000 :  2.559152750506997
Average loss at step  100000 :  2.559117724120617
Nearest to 例子:,精力,体内,到,出生,细胞,运动,生长,更好
Nearest to 南孚:,玩具车,传过来,方法,灵敏,比如,300,加速,可以
Nearest to 人体:,杀伤力,电流,电压,另外,连系,精力,反映,阴阳
Nearest to 舒服:,人会,道理,长寿,打坐,就,增大,不同,影响
Nearest to 复制:,有,道家,过程,细胞,都,传过来,电,不停
Nearest to 不能:,越,过,遥控,多,再用,负荷,阴是,全身
Nearest to 提升:,和,另外,慢,如此,大,细胞,电足,器官
Nearest to 走:,我们,生长,可能,器官,便宜,搏击,大,电能
Nearest to 阴阳:,另外,友好,被,恢复,新,电压,短时间,慢
Nearest to 更加:,更好,细胞,工作,状态,汽车,两,同时,总结
Nearest to 阳是:,适量,电量,第二,运动过度,能力,下来,这个,年轻
Nearest to 升:,寿命,时候,负荷,物质,某些,人,体内,阴是
Nearest to 快速:,加速,为,UNK,但是,感到,成,寿命,一定
Nearest to 300:,另外,一部分,舒服,再用,并且,出生,生长,南孚
Nearest to 遥控:,传过来,不能,打坐,比如,一个,几次,不足,需要
Nearest to 大:,QQ,人,多,快,新,提升,更好,阴
-------------------------------------------------------

9-27 word2vect with custom candidate sampling

Prepare a passage of text as the training sample, compute word2vec on it with the CBOW model, and plot the relationships between the word vectors. The default candidate-word selection used by nce_loss is replaced with a sampler built from manually specified word frequencies.
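What frequency-based candidate sampling does, sketched in plain numpy (a simplified illustration only; the listing uses tf.nn.fixed_unigram_candidate_sampler, which additionally de-duplicates candidates when unique=True): negative candidates are drawn with probability proportional to the supplied word frequencies, so frequent words are proposed as negatives more often than rare ones.

```
import numpy as np

vocab_freqs = [90, 40, 10, 5, 5]            # per-word counts, index = word id
probs = np.array(vocab_freqs, dtype=np.float64)
probs /= probs.sum()

# Draw 8 negative candidates; frequent words are sampled more often.
sampled = np.random.choice(len(vocab_freqs), size=8, p=probs)
print(sampled)
```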

自定义候选样本——少年歌行.txt (sample file; place it in the same directory as the code):

青州是一个泛称,正确的称呼应该是青州九城,分别是:福泽、荷瑞、临远、长兴、金秀、云间、休宁、牧野以及白城。这九城单独自治,上无州府,直达天启,只因为这九座城池代表了整个天启的财富。此九城均乃商城,他们覆盖了北离近乎八成的商业,乃是整个北离的经济中心。

而在这青州九城之中,云间城的沐家被称为青州首富,但是沐家究竟有多少钱,究竟是不是青州最有钱的人,谁也不知道。但是沐家作为如今青州商会的掌舵人,他就是如今名义上的青州首富。沐家旗下产业众多,数不胜数,其中以药材产业发家,如今也依然是北离最大的药材商。

Program:

```
import numpy as np
import tensorflow as tf
import random
import collections
from collections import Counter
import jieba

from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
mpl.rcParams['font.family'] = 'STSong'
mpl.rcParams['font.size'] = 20

training_file = '自定义候选样本——少年歌行.txt'


# 中文字
def get_ch_lable(txt_file):
    labels = ""
    with open(txt_file, 'rb') as f:
        for label in f:
            # labels =label.decode('utf-8')
            labels = labels + label.decode('gb2312')

    return labels


# 分词
def fenci(training_data):
    seg_list = jieba.cut(training_data)  # 默认是精确模式
    training_ci = " ".join(seg_list)
    training_ci = training_ci.split()
    # 以空格将字符串分开
    training_ci = np.array(training_ci)
    training_ci = np.reshape(training_ci, [-1, ])
    return training_ci

'''--------------------------------------------------'''
#1 Modify the dictionary-building code to also produce word-frequency data
def build_dataset(words, n_words):#建立数据集
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    dictionary = dict()
    vocab_freqs = []#定义词频数据list
    for word, nvocab in count:
        dictionary[word] = len(dictionary)
        vocab_freqs.append(nvocab)#加入字典里的每个词频
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reversed_dictionary, vocab_freqs


training_data = get_ch_lable(training_file)
print("总字数", len(training_data))
training_ci = fenci(training_data)
# print(training_ci)
print("总词数", len(training_ci))
# use vocab_freqs to receive the word-frequency data returned by build_dataset
training_label, count, dictionary, words, vocab_freqs = build_dataset(training_ci, 350)
'''--------------------------------------------------'''

words_size = len(dictionary)
print("字典词数", words_size)
# print(training_label)#将文本转为词向量
# print(words)#每个编号对应的词
# print(dictionary)#每个词对应的编号
# print(count)#每个词对应的个数
####################################################
print('Sample data', training_label[:10], [words[i] for i in training_label[:10]])
data_index = 0


def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)

    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index:data_index + span])
    data_index += span

    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)

            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        if data_index == len(data):
            # print(data_index,len(data),span,len(data[:span]))
            # buffer[:] = data[:span]
            buffer = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


batch, labels = generate_batch(training_label, batch_size=8, num_skips=2, skip_window=1)

for i in range(8):  # 取第一个字,后一个是标签,再取其前一个字当标签,
    print(batch[i], words[batch[i]], '->', labels[i, 0], words[labels[i, 0]])

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = np.int32(words_size / 2)  # Only pick dev samples in the head of the distribution.
print("valid_window", valid_window)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # 0-words_size/2,中的数取16个。不能重复。
num_sampled = 64  # Number of negative examples to sample.

tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(tf.random_uniform([words_size, embedding_size], -1.0, 1.0))  # one 128-dim embedding vector per vocabulary word

    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    '''--------------------------------------------------'''
    #2 Use the word-frequency data to sample candidate words
    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.truncated_normal([words_size, embedding_size],
                                                  stddev=1.0 / tf.sqrt(np.float32(embedding_size))))

    nce_biases = tf.Variable(tf.zeros([words_size]))

vocab_freqs[0] = 90
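# Note: vocab_freqs[0] belongs to 'UNK' and is still the -1 placeholder written by
# build_dataset (the real unk_count is only written back into `count`), while
# fixed_unigram_candidate_sampler needs non-negative frequencies, so the line
# above assigns it an explicit, hand-picked value.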

sampled = tf.nn.fixed_unigram_candidate_sampler(
    true_classes=tf.cast(train_labels, tf.int64),
    num_true=1,
    num_sampled=num_sampled,
    unique=True,
    range_max=words_size,
    unigrams=vocab_freqs)
'''--------------------------------------------------'''
#3 Compute the sampled-softmax loss with the custom sampler
loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=nce_weights, biases=nce_biases,
                               labels=train_labels, inputs=embed,
                               num_sampled=num_sampled, num_classes=words_size, sampled_values=sampled))
'''--------------------------------------------------'''
#4 Run the model and generate the results
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
print("________________________", similarity.shape)

# Begin training.
num_steps = 100001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(training_label, batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # 通过打印测试可以看到  embed的值在逐渐的被调节
        #        emv = sess.run(embed,feed_dict = {train_inputs: [37,18]})
        #        print("emv-------------------",emv[0])

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)

        if step % 10000 == 0:
            sim = similarity.eval(session=sess)
            # print(valid_size)
            for i in range(valid_size):
                valid_word = words[valid_examples[i]]
                # print("valid_word",valid_word)#16
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # argsort函数返回的是数组值从小到大的索引值
                # print("nearest",nearest,top_k)
                log_str = 'Nearest to %s:' % valid_word

                for k in range(top_k):
                    close_word = words[nearest[k]]
                    log_str = '%s,%s' % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')
    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 80  # plot only the first 80 words
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [words[i] for i in range(plot_only)]
    # print(labels)
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')

```

Results:

总字数 257

总词数 156

字典词数 93

Sample data [3, 6, 22, 23, 1, 24, 2, 25, 26, 6] ['青州', '是', '一个', '泛称', ',', '正确', '的', '称呼', '应该', '是']

6 是 -> 3 青州

6 是 -> 22 一个
22 一个 -> 6 是
22 一个 -> 23 泛称

23 泛称 -> 1 ,

23 泛称 -> 22 一个
1 , -> 24 正确

1 , -> 23 泛称

valid_window 46

________________________ (16, 93)

Initialized
Average loss at step  0 :  4.367962837219238
Nearest to 九城:,他,以,临远,但是,数不胜数,休宁,商城,自治
Nearest to 直达:,以及,近乎,最大,商会,作为,掌舵人,白城,数不胜数
Nearest to ::,云间,掌舵人,整个,此,多少,代表,牧野,称为
Nearest to 一个:,他们,单独,之中,但是,代表,云间,乃,商
Nearest to 长兴:,自治,九座,只,他,城池,州府,发家,云间城
Nearest to 因为:,最,财富,众多,之中,福泽,商业,乃,是不是
Nearest to 北离:,他们,以及,代表,但是,人,乃,上,乃是
Nearest to 但是:,北离,此,他们,青州,一个,九城,这,以及
Nearest to 休宁:,究竟,荷瑞,商城,财富,数不胜数,。,最大,九城
Nearest to 白城:,福泽,最,多少,称呼,八成,只,直达,休宁
Nearest to 如今:,的,就是,知道,青州,旗下,之中,云间城,荷瑞
Nearest to 单独:,人,一个,商会,名义,依然,UNK,就是,均
Nearest to 只:,数不胜数,整个,长兴,就是,九座,经济,在,休宁
Nearest to 金秀:,其中,发家,他,以及,掌舵人,他们,临远,、
Nearest to 牧野:,此,应该,均,:,UNK,知道,数不胜数,自治
Nearest to 正确:,此,也,有,。,上,而,乃,产业
Average loss at step  2000 :  1.2564247762858867
Average loss at step  4000 :  1.1930320000350476
Average loss at step  6000 :  1.1876235606372356
Average loss at step  8000 :  1.1849836399182678
Average loss at step  10000 :  1.1775927014425398
Nearest to 九城:,自治,他,是不是,的,数不胜数,这,临远,但是
Nearest to 直达:,以及,近乎,数不胜数,最大,谁,掌舵人,作为,人
Nearest to ::,云间,掌舵人,整个,多少,此,牧野,代表,称为
Nearest to 一个:,他们,单独,之中,但是,代表,云间,八成,、
Nearest to 长兴:,自治,只,城池,荷瑞,九座,他,州府,福泽
Nearest to 因为:,最,财富,众多,在,九城,之中,福泽,是不是
Nearest to 北离:,代表,以及,他们,人,青州,但是,乃是,上
Nearest to 但是:,此,他们,九城,北离,乃是,这,究竟,一个
Nearest to 休宁:,究竟,荷瑞,商城,临远,财富,数不胜数,最大,依然
Nearest to 白城:,最,福泽,多少,称呼,九座,休宁,有钱,商会
Nearest to 如今:,的,首富,称为,就是,最,之中,旗下,知道
Nearest to 单独:,人,一个,商会,均,名义,UNK,依然,就是
Nearest to 只:,数不胜数,整个,长兴,就是,九座,经济,乃是,在
Nearest to 金秀:,临远,其中,发家,他,他们,以及,掌舵人,、
Nearest to 牧野:,此,应该,均,:,UNK,知道,数不胜数,自治
Nearest to 正确:,上,此,也,有,乃,而,究竟,产业
Average loss at step  12000 :  1.1766963436305522
Average loss at step  14000 :  1.1760558699816466
Average loss at step  16000 :  1.168885586500168
Average loss at step  18000 :  1.1715079602748155
Average loss at step  20000 :  1.1710579504221679
Nearest to 九城:,是不是,首富,的,商会,称为,这,如今,最
Nearest to 直达:,以及,数不胜数,谁,掌舵人,近乎,最大,作为,人
Nearest to ::,云间,掌舵人,整个,多少,此,牧野,代表,称为
Nearest to 一个:,他们,单独,之中,但是,代表,云间,北离,、
Nearest to 长兴:,自治,只,荷瑞,城池,九座,他,福泽,州府
Nearest to 因为:,最,九城,在,财富,众多,之中,是不是,福泽
Nearest to 北离:,代表,青州,以及,人,他们,乃是,上,乃
Nearest to 但是:,此,他们,究竟,乃是,九城,作为,这,一个
Nearest to 休宁:,荷瑞,究竟,商城,临远,财富,数不胜数,依然,最大
Nearest to 白城:,最,福泽,多少,称呼,九座,商会,有钱,休宁
Nearest to 如今:,首富,的,称为,最,这,九城,是不是,就是
Nearest to 单独:,人,一个,均,名义,商会,UNK,依然,就是
Nearest to 只:,数不胜数,整个,长兴,就是,九座,乃是,首富,经济
Nearest to 金秀:,临远,其中,发家,他,他们,以及,掌舵人,、
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,也,有,乃,究竟,而,数不胜数
Average loss at step  22000 :  1.1661443188786507
Average loss at step  24000 :  1.1733204437345266
Average loss at step  26000 :  1.1690986125469207
Average loss at step  28000 :  1.1682816864699126
Average loss at step  30000 :  1.171682315737009
Nearest to 九城:,首富,是不是,如今,商会,称为,的,这,最
Nearest to 直达:,数不胜数,谁,掌舵人,以及,近乎,最大,人,商城
Nearest to ::,云间,掌舵人,多少,整个,此,牧野,应该,代表
Nearest to 一个:,他们,单独,之中,北离,但是,代表,云间,、
Nearest to 长兴:,自治,荷瑞,只,城池,九座,他,福泽,州府
Nearest to 因为:,最,九城,在,财富,众多,青州,之中,是不是
Nearest to 北离:,代表,青州,人,以及,上,乃是,他们,商业
Nearest to 但是:,究竟,此,他们,乃是,作为,一个,以及,这
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,最大
Nearest to 白城:,最,福泽,多少,称呼,九座,这,商会,有钱
Nearest to 如今:,首富,的,称为,九城,这,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,名义,依然,商会,就是
Nearest to 只:,数不胜数,整个,长兴,就是,九座,乃是,首富,经济
Nearest to 金秀:,临远,其中,发家,他们,他,以及,、,掌舵人
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,也,有,究竟,乃,数不胜数,而
Average loss at step  32000 :  1.1653905573785306
Average loss at step  34000 :  1.1694961526244878
Average loss at step  36000 :  1.164877557501197
Average loss at step  38000 :  1.1677462163493038
Average loss at step  40000 :  1.1650465160384775
Nearest to 九城:,首富,如今,是不是,称为,商会,最,这,的
Nearest to 直达:,数不胜数,谁,掌舵人,以及,近乎,最大,商城,人
Nearest to ::,云间,掌舵人,多少,应该,此,牧野,代表,整个
Nearest to 一个:,他们,单独,之中,北离,但是,代表,云间,、
Nearest to 长兴:,自治,荷瑞,只,城池,九座,福泽,他,州府
Nearest to 因为:,最,九城,在,众多,财富,青州,是不是,之中
Nearest to 北离:,代表,青州,人,上,以及,乃是,天启,有钱
Nearest to 但是:,究竟,此,他们,乃是,作为,一个,以及,财富
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,最,福泽,多少,这,称呼,财富,九座,商会
Nearest to 如今:,首富,九城,的,称为,这,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,名义,依然,商会,就是
Nearest to 只:,数不胜数,整个,长兴,乃是,就是,九座,首富,其中
Nearest to 金秀:,临远,其中,发家,他们,他,以及,、,掌舵人
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,也,究竟,有,掌舵人,数不胜数,乃
Average loss at step  42000 :  1.1685177092552186
Average loss at step  44000 :  1.1661041075512766
Average loss at step  46000 :  1.1629066934511065
Average loss at step  48000 :  1.1682542145103216
Average loss at step  50000 :  1.1640924352556468
Nearest to 九城:,首富,如今,是不是,称为,最,商会,这,的
Nearest to 直达:,数不胜数,谁,掌舵人,以及,近乎,商城,人,最大
Nearest to ::,云间,掌舵人,应该,多少,此,牧野,代表,整个
Nearest to 一个:,他们,单独,之中,北离,代表,但是,云间,依然
Nearest to 长兴:,自治,荷瑞,只,城池,九座,福泽,他,州府
Nearest to 因为:,九城,最,在,众多,财富,青州,是不是,之中
Nearest to 北离:,代表,青州,上,人,天启,乃是,以及,药材
Nearest to 但是:,究竟,他们,此,乃是,作为,以及,一个,云间城
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,最,这,福泽,多少,称呼,财富,首富,九座
Nearest to 如今:,首富,九城,的,这,称为,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,名义,依然,商会,就是
Nearest to 只:,数不胜数,整个,长兴,乃是,就是,其中,九座,直达
Nearest to 金秀:,临远,其中,发家,他们,他,以及,、,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,究竟,掌舵人,天启,数不胜数,只,有
Average loss at step  52000 :  1.1681004599183797
Average loss at step  54000 :  1.1671880427896977
Average loss at step  56000 :  1.1644486174210906
Average loss at step  58000 :  1.1642334363609552
Average loss at step  60000 :  1.1682336192131042
Nearest to 九城:,首富,如今,是不是,称为,最,商会,这,的
Nearest to 直达:,数不胜数,谁,掌舵人,以及,商城,近乎,人,最大
Nearest to ::,云间,掌舵人,应该,多少,此,牧野,代表,称为
Nearest to 一个:,他们,单独,之中,北离,代表,但是,依然,云间
Nearest to 长兴:,荷瑞,自治,只,城池,福泽,九座,他,州府
Nearest to 因为:,九城,在,最,青州,众多,财富,是不是,福泽
Nearest to 北离:,代表,青州,上,天启,人,药材,八成,乃是
Nearest to 但是:,究竟,他们,此,乃是,作为,以及,云间城,一个
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,这,最,福泽,多少,财富,称呼,首富,沐家
Nearest to 如今:,首富,九城,这,称为,的,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,依然,名义,就是,商会
Nearest to 只:,数不胜数,整个,长兴,乃是,其中,直达,就是,九座
Nearest to 金秀:,临远,其中,发家,他们,他,以及,、,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,天启,掌舵人,究竟,数不胜数,只,有
Average loss at step  62000 :  1.165693889349699
Average loss at step  64000 :  1.1636382553800941
Average loss at step  66000 :  1.165245725825429
Average loss at step  68000 :  1.1643930895850063
Average loss at step  70000 :  1.1648004886806012
Nearest to 九城:,首富,如今,是不是,称为,最,这,商会,的
Nearest to 直达:,数不胜数,谁,掌舵人,以及,商城,人,近乎,只
Nearest to ::,云间,应该,掌舵人,此,多少,牧野,代表,称为
Nearest to 一个:,他们,单独,之中,北离,依然,代表,但是,云间
Nearest to 长兴:,荷瑞,自治,只,城池,福泽,九座,他,州府
Nearest to 因为:,九城,在,最,青州,众多,财富,是不是,福泽
Nearest to 北离:,代表,青州,上,天启,药材,人,了,八成
Nearest to 但是:,究竟,他们,此,乃是,作为,以及,云间城,。
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,这,最,财富,福泽,沐家,多少,首富,称呼
Nearest to 如今:,首富,九城,这,称为,的,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,依然,名义,青州,这
Nearest to 只:,数不胜数,整个,长兴,乃是,其中,直达,就是,发家
Nearest to 金秀:,临远,其中,发家,他们,他,以及,、,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,数不胜数
Nearest to 正确:,上,此,天启,掌舵人,究竟,数不胜数,只,最大
Average loss at step  72000 :  1.1621470819711686
Average loss at step  74000 :  1.1665577966570855
Average loss at step  76000 :  1.1659714679867028
Average loss at step  78000 :  1.167079231828451
Average loss at step  80000 :  1.1671546734571456
Nearest to 九城:,首富,如今,是不是,最,称为,这,商会,的
Nearest to 直达:,数不胜数,谁,掌舵人,商城,以及,人,只,近乎
Nearest to ::,云间,应该,掌舵人,此,多少,牧野,代表,称为
Nearest to 一个:,他们,单独,之中,北离,依然,代表,云间,但是
Nearest to 长兴:,荷瑞,自治,只,城池,福泽,九座,他,州府
Nearest to 因为:,九城,在,最,青州,众多,财富,是不是,福泽
Nearest to 北离:,青州,上,代表,天启,药材,八成,覆盖,人
Nearest to 但是:,究竟,他们,此,乃是,作为,。,云间城,以及
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,这,最,沐家,财富,首富,福泽,多少,称呼
Nearest to 如今:,首富,九城,这,称为,的,最,是不是,商会
Nearest to 单独:,人,一个,均,UNK,依然,青州,这,名义
Nearest to 只:,数不胜数,整个,其中,长兴,乃是,直达,发家,商城
Nearest to 金秀:,临远,其中,发家,他们,、,他,以及,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,金秀
Nearest to 正确:,上,天启,此,掌舵人,究竟,只,数不胜数,称呼
Average loss at step  82000 :  1.1630124755501747
Average loss at step  84000 :  1.1669416529908776
Average loss at step  86000 :  1.1645876120850445
Average loss at step  88000 :  1.1667308070361615
Average loss at step  90000 :  1.1636836918294429
Nearest to 九城:,首富,如今,是不是,最,称为,这,商会,的
Nearest to 直达:,数不胜数,谁,掌舵人,商城,人,以及,只,他
Nearest to ::,云间,应该,掌舵人,此,牧野,多少,代表,称为
Nearest to 一个:,他们,单独,之中,北离,依然,代表,应该,云间
Nearest to 长兴:,荷瑞,自治,只,福泽,城池,九座,他,州府
Nearest to 因为:,九城,在,最,青州,众多,财富,是不是,福泽
Nearest to 北离:,上,代表,青州,天启,药材,八成,覆盖,整个
Nearest to 但是:,究竟,他们,此,乃是,泛称,。,作为,被
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,这,沐家,最,财富,首富,福泽,多少,中心
Nearest to 如今:,首富,九城,这,称为,的,最,是不是,商会
Nearest to 单独:,人,均,一个,UNK,这,青州,依然,名义
Nearest to 只:,数不胜数,其中,乃是,整个,长兴,直达,发家,商城
Nearest to 金秀:,临远,其中,发家,他们,、,他,以及,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,金秀
Nearest to 正确:,上,天启,掌舵人,此,究竟,数不胜数,只,最大
Average loss at step  92000 :  1.162348118647933
Average loss at step  94000 :  1.165564891755581
Average loss at step  96000 :  1.1673970686942339
Average loss at step  98000 :  1.1639761181920767
Average loss at step  100000 :  1.163195191785693
Nearest to 九城:,首富,如今,是不是,最,称为,这,商会,的
Nearest to 直达:,数不胜数,谁,掌舵人,商城,人,以及,只,他
Nearest to ::,云间,应该,掌舵人,此,牧野,代表,多少,称为
Nearest to 一个:,他们,单独,之中,北离,依然,代表,应该,云间
Nearest to 长兴:,荷瑞,自治,只,福泽,城池,九座,他,州府
Nearest to 因为:,九城,在,最,青州,众多,财富,是不是,福泽
Nearest to 北离:,上,青州,天启,代表,药材,八成,整个,覆盖
Nearest to 但是:,究竟,他们,此,乃是,泛称,被,云间城,作为
Nearest to 休宁:,荷瑞,究竟,临远,商城,数不胜数,财富,依然,近乎
Nearest to 白城:,这,沐家,财富,首富,最,福泽,中心,多少
Nearest to 如今:,首富,九城,这,称为,的,最,是不是,商会
Nearest to 单独:,人,均,一个,UNK,青州,这,依然,名义
Nearest to 只:,数不胜数,其中,乃是,直达,长兴,发家,整个,商城
Nearest to 金秀:,临远,其中,发家,他们,、,他,以及,牧野
Nearest to 牧野:,应该,此,均,:,UNK,知道,自治,金秀
Nearest to 正确:,上,天启,掌舵人,究竟,此,只,数不胜数,最大

9-28
Replacing sampled_softmax_loss with nce_loss works just as well.
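Note that in the listing below the nce_loss call does not pass the custom sampler, so the default candidate selection is used. Assuming the TF 1.x API (tf.nn.nce_loss also accepts a sampled_values argument), the custom sampler from the previous example could be kept with a call like this sketch:

```
# Sketch only (not the listing below): keep the fixed_unigram sampler when
# switching to nce_loss by passing it through sampled_values.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=num_sampled, num_classes=words_size,
                   sampled_values=sampled))
```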

Program:

```
import numpy as np
import tensorflow as tf
import random
import collections
from collections import Counter
import jieba

from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
mpl.rcParams['font.family'] = 'STSong'
mpl.rcParams['font.size'] = 20

training_file = '自定义候选样本——少年歌行.txt'


# 中文字
def get_ch_lable(txt_file):
    labels = ""
    with open(txt_file, 'rb') as f:
        for label in f:
            # labels =label.decode('utf-8')
            labels = labels + label.decode('gb2312')

    return labels


# 分词
def fenci(training_data):
    seg_list = jieba.cut(training_data)  # 默认是精确模式
    training_ci = " ".join(seg_list)
    training_ci = training_ci.split()
    # 以空格将字符串分开
    training_ci = np.array(training_ci)
    training_ci = np.reshape(training_ci, [-1, ])
    return training_ci

'''--------------------------------------------------'''
#1 Modify the dictionary-building code to also produce word-frequency data
def build_dataset(words, n_words):#建立数据集
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    dictionary = dict()
    vocab_freqs = []#定义词频数据list
    for word, nvocab in count:
        dictionary[word] = len(dictionary)
        vocab_freqs.append(nvocab)#加入字典里的每个词频
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reversed_dictionary, vocab_freqs


training_data = get_ch_lable(training_file)
print("总字数", len(training_data))
training_ci = fenci(training_data)
# print(training_ci)
print("总词数", len(training_ci))
# use vocab_freqs to receive the word-frequency data returned by build_dataset
training_label, count, dictionary, words, vocab_freqs = build_dataset(training_ci, 350)
'''--------------------------------------------------'''

words_size = len(dictionary)
print("字典词数", words_size)
# print(training_label)#将文本转为词向量
# print(words)#每个编号对应的词
# print(dictionary)#每个词对应的编号
# print(count)#每个词对应的个数
####################################################
print('Sample data', training_label[:10], [words[i] for i in training_label[:10]])
data_index = 0


def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)

    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index:data_index + span])
    data_index += span

    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)

            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        if data_index == len(data):
            # print(data_index,len(data),span,len(data[:span]))
            # buffer[:] = data[:span]
            buffer = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


batch, labels = generate_batch(training_label, batch_size=8, num_skips=2, skip_window=1)

for i in range(8):  # 取第一个字,后一个是标签,再取其前一个字当标签,
    print(batch[i], words[batch[i]], '->', labels[i, 0], words[labels[i, 0]])

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = np.int32(words_size / 2)  # Only pick dev samples in the head of the distribution.
print("valid_window", valid_window)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # 0-words_size/2,中的数取16个。不能重复。
num_sampled = 64  # Number of negative examples to sample.

tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(tf.random_uniform([words_size, embedding_size], -1.0, 1.0))  # one 128-dim embedding vector per vocabulary word

    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    '''--------------------------------------------------'''
    #2 Use the word-frequency data to sample candidate words
    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.truncated_normal([words_size, embedding_size],
                                                  stddev=1.0 / tf.sqrt(np.float32(embedding_size))))

    nce_biases = tf.Variable(tf.zeros([words_size]))

vocab_freqs[0] = 90

sampled = tf.nn.fixed_unigram_candidate_sampler(
    true_classes=tf.cast(train_labels, tf.int64),
    num_true=1,
    num_sampled=num_sampled,
    unique=True,
    range_max=words_size,
    unigrams=vocab_freqs)
'''--------------------------------------------------'''
#3 Compute the loss with nce_loss instead of sampled_softmax_loss (works just as well)
# loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=nce_weights, biases=nce_biases,
#                  labels=train_labels, inputs=embed,
#                  num_sampled=num_sampled, num_classes=words_size, sampled_values=sampled))

loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                 labels=train_labels, inputs=embed,
                 num_sampled=num_sampled, num_classes=words_size))
'''--------------------------------------------------'''
#4 Run the model and generate the results
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
print("________________________", similarity.shape)

# Begin training.
num_steps = 100001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(training_label, batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # 通过打印测试可以看到  embed的值在逐渐的被调节
        #        emv = sess.run(embed,feed_dict = {train_inputs: [37,18]})
        #        print("emv-------------------",emv[0])

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)

        if step % 10000 == 0:
            sim = similarity.eval(session=sess)
            # print(valid_size)
            for i in range(valid_size):
                valid_word = words[valid_examples[i]]
                # print("valid_word",valid_word)#16
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # argsort函数返回的是数组值从小到大的索引值
                # print("nearest",nearest,top_k)
                log_str = 'Nearest to %s:' % valid_word

                for k in range(top_k):
                    close_word = words[nearest[k]]
                    log_str = '%s,%s' % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')
    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 80  # plot only the first 80 words
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [words[i] for i in range(plot_only)]
    # print(labels)
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')

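As a side note, the candidates produced in step #2 are drawn in proportion to the word frequencies passed through `unigrams`. The following standalone sanity-check sketch is not part of the original program (the tiny vocabulary and counts are made up): it draws candidates repeatedly and compares the empirical counts with the supplied frequencies. Because `unique=True` samples without replacement within each draw, the match is only approximate, but the ordering of frequencies should be preserved.

```python
import numpy as np
import tensorflow as tf

# Hypothetical unigram counts for a tiny 5-word vocabulary (made up for illustration).
freqs = [90, 50, 30, 20, 10]
true_labels = tf.constant([[1]], dtype=tf.int64)  # a single "true" class per draw

sampled_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(
    true_classes=true_labels, num_true=1, num_sampled=3,
    unique=True, range_max=len(freqs), unigrams=freqs)

with tf.Session() as sess:
    counts = np.zeros(len(freqs))
    for _ in range(2000):              # 2000 draws of 3 candidates each
        for idx in sess.run(sampled_ids):
            counts[idx] += 1
    print("empirical :", counts / counts.sum())
    print("unigrams  :", np.array(freqs) / np.sum(freqs))
```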
Result:

总字数 257

总词数 156
字典词数 93
Sample data [3, 6, 22, 23, 1, 24, 2, 25, 26, 6] ['青州', '是', '一个', '泛称', ',', '正确', '的', '称呼', '应该', '是']
6 是 -> 3 青州
6 是 -> 22 一个
22 一个 -> 23 泛称
22 一个 -> 6 是
23 泛称 -> 1 ,
23 泛称 -> 22 一个
1 , -> 24 正确
1 , -> 23 泛称

________________________ (16, 93)

Initialized
Average loss at step  0 :  64.99969482421875
Nearest to 。:,云间城,有钱,钱,了,称为,长兴,谁,名义
Nearest to 荷瑞:,商会,发家,泛称,、,而,城池,分别,云间
Nearest to 但是:,中心,州府,九城,究竟,自治,发家,:,代表
Nearest to 整个:,八成,,,长兴,如今,九座,最,休宁,称呼
Nearest to 这:,是不是,商,:,究竟,UNK,单独,但是,直达
Nearest to 直达:,:,有,长兴,有钱,休宁,旗下,金秀,白城
Nearest to 如今:,此,之中,其中,的,不,整个,代表,作为
Nearest to 休宁:,直达,是不是,被,之中,他,泛称,整个,自治
Nearest to 无:,自治,因为,的,九座,九城,正确,荷瑞,州府
Nearest to 临远:,一个,药材,最,只,州府,旗下,云间,乃
Nearest to 上:,九座,一个,他们,产业,以,最,谁,他
Nearest to 只:,财富,,,泛称,单独,金秀,临远,其中,众多
Nearest to 云间:,有钱,青州,旗下,临远,一个,覆盖,荷瑞,、
Nearest to 应该:,中心,乃是,近乎,而,九座,是不是,在,城池
Nearest to 牧野:,他们,了,云间城,九座,多少,众多,首富,,
Nearest to 一个:,临远,正确,商城,长兴,产业,云间,上,自治
Average loss at step  2000 :  2.582893608689308
Average loss at step  4000 :  2.390000852525234
Average loss at step  6000 :  2.3697808198332786
Average loss at step  8000 :  2.3618074927330017
Average loss at step  10000 :  2.3590228804349898
Nearest to 。:,有钱,了,云间城,长兴,谁,但是,旗下,钱
Nearest to 荷瑞:,发家,云间,而,商会,、,无,城池,作为
Nearest to 但是:,究竟,发家,中心,自治,州府,。,旗下,九城
Nearest to 整个:,,,八成,长兴,九座,代表,休宁,近乎,如今
Nearest to 这:,商,此,是不是,但是,单独,:,代表,究竟
Nearest to 直达:,有,有钱,长兴,:,众多,休宁,旗下,州府
Nearest to 如今:,之中,此,其中,不,的,九座,代表,作为
Nearest to 休宁:,直达,之中,被,整个,他,是不是,在,经济
Nearest to 无:,的,因为,,,自治,九座,覆盖,其中,荷瑞
Nearest to 临远:,云间,药材,旗下,只,州府,乃,最,覆盖
Nearest to 上:,他们,他,九座,药材,八成,人,产业,商城
Nearest to 只:,财富,,,众多,其中,单独,之中,金秀,临远
Nearest to 云间:,有钱,青州,临远,旗下,荷瑞,覆盖,、,有
Nearest to 应该:,中心,乃是,近乎,而,北离,九座,:,在
Nearest to 牧野:,他们,了,多少,云间城,,,众多,首富,九座
Nearest to 一个:,临远,商城,长兴,他,正确,云间,产业,数不胜数
Average loss at step  12000 :  2.352353787481785
Average loss at step  14000 :  2.3492748656868936
Average loss at step  16000 :  2.3482363398075106
Average loss at step  18000 :  2.345673495590687
Average loss at step  20000 :  2.3448493235111236
Nearest to 。:,沐家,但是,旗下,有钱,青州,,,长兴,了
Nearest to 荷瑞:,云间,发家,而,、,沐家,商会,无,作为
Nearest to 但是:,究竟,发家,中心,。,自治,州府,北离,旗下
Nearest to 整个:,八成,,,长兴,代表,近乎,休宁,九座,上
Nearest to 这:,此,商,是不是,单独,但是,代表,知道,:
Nearest to 直达:,有,有钱,长兴,:,众多,休宁,数不胜数,旗下
Nearest to 如今:,之中,其中,此,不,的,九城,是不是,九座
Nearest to 休宁:,直达,之中,整个,被,青州,、,他,在
Nearest to 无:,,,的,因为,自治,九座,覆盖,。,其中
Nearest to 临远:,云间,药材,旗下,只,州府,乃,覆盖,长兴
Nearest to 上:,药材,他们,他,沐家,人,九座,八成,商城
Nearest to 只:,财富,,,众多,其中,单独,之中,金秀,临远
Nearest to 云间:,青州,有钱,临远,旗下,荷瑞,覆盖,、,有
Nearest to 应该:,中心,北离,乃是,近乎,青州,而,:,在
Nearest to 牧野:,他们,了,,,多少,云间城,。,众多,九座
Nearest to 一个:,临远,商城,长兴,他,正确,产业,云间,上
Average loss at step  22000 :  2.3436482590436936
Average loss at step  24000 :  2.3446671962738037
Average loss at step  26000 :  2.3456308463811872
Average loss at step  28000 :  2.3387797303199767
Average loss at step  30000 :  2.337983342051506
Nearest to 。:,沐家,青州,但是,旗下,究竟,在,,,有钱
Nearest to 荷瑞:,云间,发家,、,而,沐家,休宁,无,商会
Nearest to 但是:,究竟,发家,。,中心,自治,州府,北离,旗下
Nearest to 整个:,八成,,,长兴,近乎,代表,休宁,上,九座
Nearest to 这:,此,是不是,商,如今,单独,称为,但是,代表
Nearest to 直达:,有,长兴,有钱,:,众多,休宁,数不胜数,州府
Nearest to 如今:,之中,其中,九城,不,首富,此,的,是不是
Nearest to 休宁:,直达,之中,青州,整个,、,被,沐家,他
Nearest to 无:,,,的,因为,自治,九座,。,覆盖,其中
Nearest to 临远:,云间,药材,旗下,只,乃,长兴,州府,覆盖
Nearest to 上:,沐家,药材,他,人,他们,八成,九座,商城
Nearest to 只:,财富,,,众多,其中,单独,之中,金秀,究竟
Nearest to 云间:,青州,有钱,临远,荷瑞,旗下,、,覆盖,有
Nearest to 应该:,北离,中心,青州,乃是,:,近乎,而,在
Nearest to 牧野:,他们,了,多少,。,云间城,,,众多,沐家
Nearest to 一个:,商城,临远,他,长兴,上,产业,正确,云间
Average loss at step  32000 :  2.340764605164528
Average loss at step  34000 :  2.3380323770046236
Average loss at step  36000 :  2.3404678611159326
Average loss at step  38000 :  2.3368434708714485
Average loss at step  40000 :  2.3358238710761072
Nearest to 。:,沐家,青州,,,但是,究竟,旗下,在,北离
Nearest to 荷瑞:,云间,、,发家,沐家,而,,,休宁,无
Nearest to 但是:,究竟,。,发家,自治,中心,北离,州府,而
Nearest to 整个:,八成,,,近乎,长兴,代表,上,休宁,、
Nearest to 这:,此,是不是,商,如今,称为,首富,单独,的
Nearest to 直达:,有,长兴,有钱,众多,:,数不胜数,休宁,,
Nearest to 如今:,首富,之中,九城,其中,不,的,此,这
Nearest to 休宁:,直达,整个,青州,、,之中,沐家,被,也
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,,,旗下,长兴,乃,只,了
Nearest to 上:,沐家,药材,人,他,他们,八成,掌舵人,北离
Nearest to 只:,财富,,,众多,其中,之中,单独,上,究竟
Nearest to 云间:,青州,有钱,临远,荷瑞,旗下,、,覆盖,,
Nearest to 应该:,北离,中心,青州,乃是,:,近乎,而,在
Nearest to 牧野:,,,他们,。,了,多少,云间城,首富,云间
Nearest to 一个:,商城,上,临远,他,长兴,产业,正确,数不胜数
Average loss at step  42000 :  2.337449945271015
Average loss at step  44000 :  2.335675438165665
Average loss at step  46000 :  2.3375511239171027
Average loss at step  48000 :  2.3372044395804403
Average loss at step  50000 :  2.3362441070079805
Nearest to 。:,沐家,青州,但是,究竟,旗下,在,,,、
Nearest to 荷瑞:,云间,、,发家,沐家,而,休宁,无,,
Nearest to 但是:,究竟,。,发家,自治,中心,北离,州府,而
Nearest to 整个:,八成,近乎,北离,长兴,、,,,上,代表
Nearest to 这:,此,是不是,商,如今,首富,称为,单独,的
Nearest to 直达:,众多,长兴,有钱,:,有,数不胜数,休宁,州府
Nearest to 如今:,首富,九城,之中,其中,不,这,的,谁
Nearest to 休宁:,、,整个,直达,青州,之中,沐家,被,也
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,长兴,旗下,乃,了,覆盖,只
Nearest to 上:,药材,沐家,人,他,北离,掌舵人,八成,他们
Nearest to 只:,财富,众多,,,其中,之中,上,单独,究竟
Nearest to 云间:,青州,有钱,临远,、,荷瑞,旗下,覆盖,有
Nearest to 应该:,北离,青州,中心,乃是,:,而,近乎,药材
Nearest to 牧野:,。,他们,了,多少,云间城,首富,云间,,
Nearest to 一个:,商城,上,他,临远,长兴,产业,正确,青州
Average loss at step  52000 :  2.3356817450523377
Average loss at step  54000 :  2.3339782693982123
Average loss at step  56000 :  2.332487169444561
Average loss at step  58000 :  2.3332248421311377
Average loss at step  60000 :  2.3317807205915453
Nearest to 。:,沐家,青州,但是,究竟,旗下,在,北离,药材
Nearest to 荷瑞:,云间,发家,、,沐家,而,休宁,无,作为
Nearest to 但是:,究竟,。,发家,自治,中心,北离,州府,而
Nearest to 整个:,八成,近乎,北离,长兴,上,代表,最大,休宁
Nearest to 这:,此,首富,是不是,如今,商,称为,单独,最
Nearest to 直达:,众多,长兴,:,有,有钱,数不胜数,州府,休宁
Nearest to 如今:,首富,九城,之中,这,其中,不,谁,的
Nearest to 休宁:,整个,直达,之中,青州,、,沐家,被,在
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,药材,云间,长兴,旗下,了,乃,覆盖,州府
Nearest to 上:,药材,沐家,人,北离,他,掌舵人,八成,商业
Nearest to 只:,财富,众多,,,其中,上,之中,究竟,单独
Nearest to 云间:,临远,有钱,青州,荷瑞,旗下,覆盖,、,有
Nearest to 应该:,北离,青州,中心,:,乃是,近乎,而,药材
Nearest to 牧野:,。,了,他们,多少,云间城,云间,首富,九座
Nearest to 一个:,商城,上,他,临远,长兴,产业,正确,数不胜数
Average loss at step  62000 :  2.3331904749870302
Average loss at step  64000 :  2.334769561469555
Average loss at step  66000 :  2.33237244284153
Average loss at step  68000 :  2.3310637676715853
Average loss at step  70000 :  2.3346348379850386
Nearest to 。:,沐家,青州,但是,究竟,北离,旗下,在,,
Nearest to 荷瑞:,云间,、,发家,沐家,而,休宁,无,城池
Nearest to 但是:,究竟,。,发家,自治,中心,北离,而,州府
Nearest to 整个:,北离,近乎,八成,上,长兴,代表,最大,,
Nearest to 这:,此,首富,是不是,如今,称为,商,单独,最
Nearest to 直达:,众多,长兴,:,有钱,数不胜数,有,州府,休宁
Nearest to 如今:,首富,九城,之中,这,不,其中,谁,是不是
Nearest to 休宁:,整个,直达,青州,之中,、,沐家,被,荷瑞
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,长兴,了,旗下,乃,覆盖,州府
Nearest to 上:,药材,沐家,北离,人,他,掌舵人,八成,商业
Nearest to 只:,财富,众多,上,其中,,,究竟,之中,单独
Nearest to 云间:,临远,有钱,青州,荷瑞,旗下,、,覆盖,有
Nearest to 应该:,北离,青州,中心,:,乃是,近乎,而,在
Nearest to 牧野:,。,了,云间城,他们,多少,云间,首富,九座
Nearest to 一个:,商城,上,他,临远,长兴,产业,青州,北离
Average loss at step  72000 :  2.3330719541311264
Average loss at step  74000 :  2.332368581235409
Average loss at step  76000 :  2.330422281563282
Average loss at step  78000 :  2.332304368853569
Average loss at step  80000 :  2.3310767450928687
Nearest to 。:,沐家,青州,但是,究竟,北离,在,旗下,,
Nearest to 荷瑞:,云间,、,发家,而,沐家,休宁,无,单独
Nearest to 但是:,究竟,。,发家,自治,中心,北离,而,州府
Nearest to 整个:,北离,近乎,八成,上,长兴,最大,了,代表
Nearest to 这:,此,首富,是不是,称为,如今,商,单独,最
Nearest to 直达:,众多,长兴,数不胜数,:,有钱,有,州府,究竟
Nearest to 如今:,首富,九城,这,之中,不,谁,其中,称为
Nearest to 休宁:,整个,直达,之中,、,青州,荷瑞,沐家,被
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,长兴,了,乃,旗下,覆盖,州府
Nearest to 上:,药材,北离,沐家,人,掌舵人,他,八成,商业
Nearest to 只:,财富,众多,上,其中,究竟,之中,,,。
Nearest to 云间:,临远,有钱,荷瑞,青州,旗下,、,覆盖,有
Nearest to 应该:,北离,青州,中心,:,乃是,近乎,而,在
Nearest to 牧野:,。,了,云间城,首富,云间,多少,他们,九座
Nearest to 一个:,商城,上,他,临远,长兴,北离,产业,青州
Average loss at step  82000 :  2.331249094367027
Average loss at step  84000 :  2.3317255604863165
Average loss at step  86000 :  2.330369430541992
Average loss at step  88000 :  2.3284329971075057
Average loss at step  90000 :  2.331051857173443
Nearest to 。:,沐家,青州,但是,究竟,在,北离,旗下,,
Nearest to 荷瑞:,云间,、,发家,而,沐家,休宁,无,单独
Nearest to 但是:,究竟,。,发家,自治,中心,而,北离,州府
Nearest to 整个:,北离,近乎,八成,最大,长兴,上,了,代表
Nearest to 这:,此,首富,是不是,称为,如今,最,单独,商
Nearest to 直达:,众多,数不胜数,长兴,:,有钱,究竟,州府,有
Nearest to 如今:,首富,九城,这,之中,谁,不,称为,其中
Nearest to 休宁:,整个,直达,之中,青州,、,荷瑞,金秀,被
Nearest to 无:,,,的,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,长兴,了,旗下,乃,覆盖,州府
Nearest to 上:,北离,药材,人,沐家,掌舵人,他,八成,商业
Nearest to 只:,财富,众多,上,究竟,其中,之中,,,单独
Nearest to 云间:,临远,有钱,荷瑞,青州,旗下,覆盖,、,有
Nearest to 应该:,北离,青州,中心,:,乃是,近乎,而,在
Nearest to 牧野:,首富,云间,云间城,。,了,多少,他们,九座
Nearest to 一个:,商城,上,他,北离,临远,长兴,青州,数不胜数
Average loss at step  92000 :  2.3291835794448854
Average loss at step  94000 :  2.3295749364495277
Average loss at step  96000 :  2.3292557646632193
Average loss at step  98000 :  2.328701540887356
Average loss at step  100000 :  2.327756262302399
Nearest to 。:,青州,沐家,但是,究竟,在,北离,旗下,,
Nearest to 荷瑞:,云间,发家,、,而,休宁,沐家,无,单独
Nearest to 但是:,究竟,。,发家,自治,中心,而,北离,商
Nearest to 整个:,近乎,北离,最大,八成,长兴,了,上,代表
Nearest to 这:,首富,此,是不是,称为,最,单独,如今,商
Nearest to 直达:,众多,数不胜数,长兴,:,究竟,整个,有钱,州府
Nearest to 如今:,首富,九城,谁,这,之中,不,称为,最
Nearest to 休宁:,整个,直达,之中,、,荷瑞,青州,金秀,天启
Nearest to 无:,的,,,因为,自治,。,九座,覆盖,其中
Nearest to 临远:,云间,药材,长兴,旗下,覆盖,乃,州府,了
Nearest to 上:,药材,北离,人,沐家,掌舵人,他,八成,商业
Nearest to 只:,财富,上,众多,究竟,其中,之中,,,数不胜数
Nearest to 云间:,临远,有钱,荷瑞,青州,旗下,覆盖,、,有
Nearest to 应该:,北离,中心,青州,:,乃是,近乎,而,在
Nearest to 牧野:,首富,云间,云间城,多少,。,了,他们,九座
Nearest to 一个:,商城,上,他,北离,长兴,临远,产业,依然
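The "Nearest to ..." lines above are produced from the cosine-similarity matrix: for every validation word, the similarities to all vocabulary words are negated and arg-sorted, and entries 1..top_k are printed (entry 0 is the word itself). A tiny numpy illustration with made-up similarity values:

```python
import numpy as np

# One row of the similarity matrix: cosine similarities of one validation word to
# every vocabulary word (made-up values; index 2 plays the validation word itself).
sim_row = np.array([0.10, 0.35, 1.00, 0.80, -0.20, 0.55])

top_k = 3
# argsort sorts ascending, so negate to get "most similar first";
# position 0 is the word itself, hence the [1:top_k + 1] slice.
nearest = (-sim_row).argsort()[1:top_k + 1]
print(nearest)  # [3 5 1] -> indices of the 3 nearest words
```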

9-29 word2vect with learned candidate sampling
Modify the example from 9-27 so that negative candidates are sampled according to the distribution in which classes appear in the training data, and see what effect this has.
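The substantive change relative to the previous example is the sampler: instead of passing explicit word counts to `tf.nn.fixed_unigram_candidate_sampler`, the labels themselves are handed to `tf.nn.learned_unigram_candidate_sampler`, which accumulates the class distribution as training proceeds. A minimal, self-contained sketch of just that call (dummy sizes; the full program below wires the result into `sampled_softmax_loss`):

```python
import tensorflow as tf

# Dummy sizes standing in for the values computed in the full program below.
words_size = 350
num_sampled = 64
train_labels = tf.placeholder(tf.int32, shape=[128, 1])

# learned_unigram_candidate_sampler keeps running counts of the labels it is shown,
# so negative candidates follow (approximately) the class distribution of the data.
sampled = tf.nn.learned_unigram_candidate_sampler(
    true_classes=tf.cast(train_labels, tf.int64),
    num_true=1,
    num_sampled=num_sampled,
    unique=True,
    range_max=words_size)
```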

Program:

import numpy as np
import tensorflow as tf
import random
import collections
from collections import Counter
import jieba

from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
mpl.rcParams['font.family'] = 'STSong'
mpl.rcParams['font.size'] = 20

training_file = '人体阴阳与电能.txt'


# 中文字
def get_ch_lable(txt_file):
    labels = ""
    with open(txt_file, 'rb') as f:
        for label in f:
            # labels =label.decode('utf-8')
            labels = labels + label.decode('gb2312')

    return labels


# 分词
def fenci(training_data):
    seg_list = jieba.cut(training_data)  # 默认是精确模式
    training_ci = " ".join(seg_list)
    training_ci = training_ci.split()
    # 以空格将字符串分开
    training_ci = np.array(training_ci)
    training_ci = np.reshape(training_ci, [-1, ])
    return training_ci


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    dictionary = dict()
    vocab_freqs = []
    for word, nvocab in count:
        dictionary[word] = len(dictionary)
        vocab_freqs.append(nvocab)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reversed_dictionary, vocab_freqs


training_data = get_ch_lable(training_file)
print("总字数", len(training_data))
training_ci = fenci(training_data)
# print(training_ci)
print("总词数", len(training_ci))
training_label, count, dictionary, words, vocab_freqs = build_dataset(training_ci, 350)

words_size = len(dictionary)
print("字典词数", words_size)
# print(training_label)#将文本转为词向量
# print(words)#每个编号对应的词
# print(dictionary)#每个词对应的编号
# print(count)#每个词对应的个数
####################################################
print('Sample data', training_label[:10], [words[i] for i in training_label[:10]])
data_index = 0


def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)

    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index:data_index + span])
    data_index += span

    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)

            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        if data_index == len(data):
            # print(data_index,len(data),span,len(data[:span]))
            # buffer[:] = data[:span]
            buffer = data[:span]
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


batch, labels = generate_batch(training_label, batch_size=8, num_skips=2, skip_window=1)

for i in range(8):  # 取第一个字,后一个是标签,再取其前一个字当标签,
    print(batch[i], words[batch[i]], '->', labels[i, 0], words[labels[i, 0]])

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = np.int32(words_size / 2)  # Only pick dev samples in the head of the distribution.
print("valid_window", valid_window)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # 0-words_size/2,中的数取16个。不能重复。
num_sampled = 64  # Number of negative examples to sample.

tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(tf.random_uniform([words_size, embedding_size], -1.0, 1.0))  # one row per vocabulary word, each a 128-dim vector

    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.truncated_normal([words_size, embedding_size],
                                                  stddev=1.0 / tf.sqrt(np.float32(embedding_size))))

    nce_biases = tf.Variable(tf.zeros([words_size]))

sampled = tf.nn.learned_unigram_candidate_sampler(tf.cast(train_labels, tf.int64), 1, num_sampled,
                                                  True, words_size)

loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=nce_weights, biases=nce_biases,
                               labels=train_labels, inputs=embed,
                               num_sampled=num_sampled, num_classes=words_size, sampled_values=sampled))

# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
print("________________________", similarity.shape)

# Begin training.
num_steps = 100001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(training_label, batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # 通过打印测试可以看到  embed的值在逐渐的被调节
        #        emv = sess.run(embed,feed_dict = {train_inputs: [37,18]})
        #        print("emv-------------------",emv[0])

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)

        if step % 10000 == 0:
            sim = similarity.eval(session=sess)
            # print(valid_size)
            for i in range(valid_size):
                valid_word = words[valid_examples[i]]
                # print("valid_word",valid_word)#16
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # argsort函数返回的是数组值从小到大的索引值
                # print("nearest",nearest,top_k)
                log_str = 'Nearest to %s:' % valid_word

                for k in range(top_k):
                    close_word = words[nearest[k]]
                    log_str = '%s,%s' % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')
    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 80  # plot only the first 80 words
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [words[i] for i in range(plot_only)]
    # print(labels)
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')

Result:

总字数 1563

总词数 961
字典词数 350
Sample data [25, 132, 32, 26, 27, 133, 8, 9, 80, 134] ['人体', '阴阳', '与', '电能', '阴', '应该', '是', '身体', '里', '内在']
132 阴阳 -> 25 人体
132 阴阳 -> 32 与
32 与 -> 26 电能
32 与 -> 132 阴阳
26 电能 -> 32 与
26 电能 -> 27 阴
27 阴 -> 26 电能
27 阴 -> 133 应该
valid_window 175

________________________ (16, 350)

Initialized
Average loss at step  0 :  4.330218315124512
Nearest to 内在:,支配,比,用电量,理解,烧毁,阴阳,排量,就
Nearest to 大:,充电,于是,分裂细胞,内,觉得,调用,过程,本领
Nearest to 南孚:,更加,这样,才,UNK,电足,跑,高手,被
Nearest to 我们:,减小,放出,物质,多,端粒,但是,化学,可能
Nearest to 电量:,而已,体能,一部分,细胞分裂,比作,大小,很,运动过度
Nearest to 举个:,器官,很,两,基因,高手,按住,阴,一直
Nearest to 增大:,总结,变成,出去,长寿,走,更换,一部分,整体
Nearest to 生长:,当,小时,全身,阴是,体内,精力,这个,QQ
Nearest to 一部分:,延长,比如,分裂细胞,UNK,电量,运动过度,由,出
Nearest to 配有:,再用,但,来源,当,人会,道理,一天,精神
Nearest to 觉得:,方面,出现,大,阴,这样,市面上,第一,长期
Nearest to 一个:,慢,睡觉,充电,休息,而已,最,没有,灵敏
Nearest to 对:,起来,4.0,变压器,放个,还会,由,举个,恢复
Nearest to 应该:,每次,训练,在,一部分,运动会,神经,人会,用尽
Nearest to 例子:,短时间,原理,化成,(,产生,而已,一个,随着
Nearest to 更好:,运动会,等静,并,电足,或,就,状态,引发
Average loss at step  2000 :  1.763382672920823
Average loss at step  4000 :  1.5275497647710146
Average loss at step  6000 :  1.5130859182327985
Average loss at step  8000 :  1.5073450089544058
Average loss at step  10000 :  1.4988866856545209
Nearest to 内在:,支配,理解,比,烧毁,传过来,2.0,排量,4.0
Nearest to 大:,本领,于是,充电,分裂细胞,走,内,过程,由
Nearest to 南孚:,更加,这样,才,跑,电足,被,高手,UNK
Nearest to 我们:,放出,减小,物质,端粒,多,由,才,放空
Nearest to 电量:,一部分,细胞分裂,而已,这样,运动过度,比作,体能,玩具车
Nearest to 举个:,很,基因,器官,两,按住,阴,一直,高手
Nearest to 增大:,总结,变成,长寿,出去,走,整体,一部分,更换
Nearest to 生长:,全身,阴是,QQ,当,体内,精力,小时,人会
Nearest to 一部分:,电量,比如,分裂细胞,延长,运动过度,应该,另外,出
Nearest to 配有:,但,再用,来源,道理,人会,当,充电,一天
Nearest to 觉得:,出现,方面,大,阴,长期,市面上,这样,方法
Nearest to 一个:,慢,充电,睡觉,休息,最,而已,2.0,换上
Nearest to 对:,起来,变压器,4.0,放个,阳,对于,完,由
Nearest to 应该:,训练,一部分,每次,运动会,人会,神经,高效,对应
Nearest to 例子:,短时间,原理,化成,(,同时,是,产生,反映
Nearest to 更好:,运动会,并,传过来,等静,电足,状态,剃须刀,市面上
Average loss at step  12000 :  1.4940771477073431
Average loss at step  14000 :  1.4915715685859323
Average loss at step  16000 :  1.489099360331893
Average loss at step  18000 :  1.4862490785717963
Average loss at step  20000 :  1.482955482378602
Nearest to 内在:,理解,支配,比,传过来,烧毁,2.0,4.0,排量
Nearest to 大:,本领,走,于是,分裂细胞,充电,内,由,过程
Nearest to 南孚:,更加,这样,才,跑,电足,被,高手,越
Nearest to 我们:,放出,物质,减小,端粒,多,由,放空,才
Nearest to 电量:,一部分,细胞分裂,而已,这样,运动过度,都,其,玩具车
Nearest to 举个:,很,基因,两,按住,器官,一直,阴,高手
Nearest to 增大:,总结,变成,长寿,走,出去,整体,一部分,尤其
Nearest to 生长:,全身,阴是,QQ,体内,人会,精力,适量,当
Nearest to 一部分:,电量,比如,分裂细胞,应该,延长,运动过度,另外,出
Nearest to 配有:,再用,但,来源,当,道理,充电,人会,一天
Nearest to 觉得:,出现,方面,阴,大,长期,市面上,这样,方法
Nearest to 一个:,充电,慢,睡觉,最,而已,休息,到,2.0
Nearest to 对:,起来,变压器,4.0,放个,阳,对于,电池,完
Nearest to 应该:,训练,一部分,每次,运动会,人会,神经,高效,对应
Nearest to 例子:,短时间,原理,化成,同时,是,(,反映,时候
Nearest to 更好:,传过来,并,运动会,市面上,电能,剃须刀,电足,道家
Average loss at step  22000 :  1.4807906688898802
Average loss at step  24000 :  1.4825783501863479
Average loss at step  26000 :  1.4787609797120094
Average loss at step  28000 :  1.4775283692777157
Average loss at step  30000 :  1.4792304769605398
Nearest to 内在:,理解,支配,传过来,比,烧毁,2.0,4.0,起来
Nearest to 大:,走,本领,于是,分裂细胞,充电,由,内,更新
Nearest to 南孚:,更加,这样,才,跑,电足,被,高手,就是
Nearest to 我们:,放出,物质,减小,端粒,由,放空,多,身体
Nearest to 电量:,一部分,细胞分裂,而已,都,这样,其,运动过度,物质
Nearest to 举个:,很,基因,两,按住,一直,站,器官,高手
Nearest to 增大:,总结,变成,长寿,走,出去,整体,尤其,达到
Nearest to 生长:,全身,QQ,阴是,人会,体内,精力,适量,固定
Nearest to 一部分:,电量,应该,比如,分裂细胞,延长,运动过度,另外,出
Nearest to 配有:,再用,但,当,来源,道理,充电,相当于,人会
Nearest to 觉得:,出现,方面,阴,长期,类,方法,大,市面上
Nearest to 一个:,充电,慢,睡觉,最,而已,休息,不动,到
Nearest to 对:,起来,变压器,4.0,对于,电池,放个,阳,完
Nearest to 应该:,训练,一部分,每次,运动会,人会,神经,而,对应
Nearest to 例子:,短时间,原理,化成,同时,反映,时候,(,是
Nearest to 更好:,传过来,市面上,并,电能,剃须刀,出生,节约用电,细胞
Average loss at step  32000 :  1.4767206180244685
Average loss at step  34000 :  1.4755764077603817
Average loss at step  36000 :  1.4746538476496935
Average loss at step  38000 :  1.47303012996912
Average loss at step  40000 :  1.4701586413681507
Nearest to 内在:,理解,传过来,支配,比,2.0,烧毁,4.0,起来
Nearest to 大:,走,本领,于是,分裂细胞,充电,更新,由,适量
Nearest to 南孚:,更加,这样,才,跑,电足,被,就是,高手
Nearest to 我们:,放出,端粒,减小,物质,复制,快,身体,由
Nearest to 电量:,一部分,细胞分裂,都,其,而已,这样,物质,运动过度
Nearest to 举个:,很,基因,两,按住,一直,站,而,高手
Nearest to 增大:,总结,变成,长寿,走,整体,出去,达到,尤其
Nearest to 生长:,全身,QQ,阴是,人会,适量,固定,精力,尤其
Nearest to 一部分:,电量,应该,比如,分裂细胞,延长,另外,运动过度,出
Nearest to 配有:,再用,但,当,来源,相当于,道理,充电,人会
Nearest to 觉得:,出现,方面,阴,长期,类,方法,更加,机器
Nearest to 一个:,充电,慢,睡觉,最,不动,到,而已,2.0
Nearest to 对:,起来,变压器,4.0,对于,电池,完,阳,放个
Nearest to 应该:,一部分,训练,每次,运动会,而,人会,神经,对应
Nearest to 例子:,短时间,原理,同时,化成,反映,时候,(,是
Nearest to 更好:,传过来,市面上,细胞,节约用电,电能,出生,并,剃须刀
Average loss at step  42000 :  1.471804656729102
Average loss at step  44000 :  1.468562864407897
Average loss at step  46000 :  1.4696577304564415
Average loss at step  48000 :  1.4698054756224155
Average loss at step  50000 :  1.4686057599633933
Nearest to 内在:,理解,传过来,支配,比,2.0,4.0,烧毁,起来
Nearest to 大:,走,本领,于是,适量,更新,分裂细胞,由,内
Nearest to 南孚:,更加,这样,才,跑,被,电足,就是,高手
Nearest to 我们:,放出,端粒,复制,减小,物质,快,身体,放空
Nearest to 电量:,一部分,细胞分裂,都,物质,这样,其,而已,阳是
Nearest to 举个:,很,基因,两,站,按住,一直,而,高手
Nearest to 增大:,总结,变成,长寿,走,整体,出去,达到,尤其
Nearest to 生长:,全身,QQ,阴是,人会,适量,固定,尤其,精力
Nearest to 一部分:,电量,应该,比如,分裂细胞,延长,另外,运动过度,出
Nearest to 配有:,再用,相当于,但,当,来源,道理,充电,人会
Nearest to 觉得:,出现,方面,阴,类,长期,机器,方法,这样
Nearest to 一个:,充电,慢,睡觉,最,不动,到,2.0,换上
Nearest to 对:,变压器,起来,4.0,对于,完,电池,阳,放个
Nearest to 应该:,一部分,训练,每次,而,人会,运动会,神经,对应
Nearest to 例子:,短时间,同时,原理,化成,反映,时候,本领,放尽
Nearest to 更好:,细胞,传过来,市面上,节约用电,出生,电能,剃须刀,对外
Average loss at step  52000 :  1.4684792654961347
Average loss at step  54000 :  1.4674723542258143
Average loss at step  56000 :  1.4681864749789237
Average loss at step  58000 :  1.4663644950687886
Average loss at step  60000 :  1.4664869887828826
Nearest to 内在:,理解,传过来,支配,2.0,比,4.0,起来,高能
Nearest to 大:,走,本领,适量,更新,于是,分裂细胞,影响,由
Nearest to 南孚:,更加,这样,才,跑,电足,被,就是,高手
Nearest to 我们:,放出,复制,端粒,快,减小,身体,第一,物质
Nearest to 电量:,一部分,细胞分裂,都,物质,这样,其,而已,阳是
Nearest to 举个:,很,基因,站,两,而,按住,一直,用尽
Nearest to 增大:,总结,变成,长寿,走,整体,出去,当,达到
Nearest to 生长:,全身,QQ,阴是,人会,适量,固定,尤其,甚至
Nearest to 一部分:,电量,应该,比如,分裂细胞,延长,另外,运动过度,出
Nearest to 配有:,相当于,再用,当,但,来源,道理,充电,一天
Nearest to 觉得:,出现,方面,机器,类,阴,更加,长期,方法
Nearest to 一个:,充电,慢,睡觉,最,不动,到,2.0,换上
Nearest to 对:,变压器,起来,对于,4.0,完,电池,阳,于是
Nearest to 应该:,一部分,训练,每次,而,人会,神经,对应,运动会
Nearest to 例子:,同时,短时间,原理,反映,化成,时候,放尽,本领
Nearest to 更好:,细胞,传过来,节约用电,市面上,出生,电能,对外,剃须刀
Average loss at step  62000 :  1.4655053863823413
Average loss at step  64000 :  1.4639758394956588
Average loss at step  66000 :  1.4638634560406207
Average loss at step  68000 :  1.4621722847670318
Average loss at step  70000 :  1.4651355121731757
Nearest to 内在:,理解,传过来,支配,2.0,4.0,起来,高能,输出
Nearest to 大:,走,本领,适量,更新,不同,于是,影响,分裂细胞
Nearest to 南孚:,更加,这样,才,跑,就是,被,电足,终端
Nearest to 我们:,复制,端粒,放出,快,第一,身体,减小,更好
Nearest to 电量:,一部分,物质,细胞分裂,这样,都,其,阳是,而已
Nearest to 举个:,很,基因,站,而,两,按住,一直,使用
Nearest to 增大:,总结,变成,走,长寿,整体,出去,当,达到
Nearest to 生长:,全身,QQ,阴是,人会,固定,适量,尤其,甚至
Nearest to 一部分:,电量,应该,比如,分裂细胞,另外,延长,运动过度,出
Nearest to 配有:,相当于,再用,当,但,来源,道理,充电,一天
Nearest to 觉得:,出现,方面,机器,类,长期,方法,阴,更加
Nearest to 一个:,充电,慢,睡觉,最,不动,到,机器,2.0
Nearest to 对:,变压器,起来,对于,4.0,完,于是,阳,电池
Nearest to 应该:,一部分,训练,每次,而,如此,人会,神经,对应
Nearest to 例子:,同时,短时间,反映,原理,化成,时候,放尽,本领
Nearest to 更好:,细胞,传过来,节约用电,市面上,出生,对外,剃须刀,时候
Average loss at step  72000 :  1.4661167548596858
Average loss at step  74000 :  1.463320901092142
Average loss at step  76000 :  1.4622772703021765
Average loss at step  78000 :  1.4636658035814762
Average loss at step  80000 :  1.4623175632357597
Nearest to 内在:,理解,传过来,2.0,支配,4.0,输出,起来,高能
Nearest to 大:,走,本领,适量,更新,不同,影响,于是,快
Nearest to 南孚:,更加,这样,才,跑,就是,电足,被,终端
Nearest to 我们:,复制,端粒,第一,快,放出,身体,更好,放空
Nearest to 电量:,一部分,物质,都,细胞分裂,这样,其,阳是,而已
Nearest to 举个:,很,基因,站,而,两,使用,按住,一直
Nearest to 增大:,总结,变成,走,长寿,整体,当,出去,达到
Nearest to 生长:,全身,QQ,阴是,人会,固定,尤其,适量,放尽
Nearest to 一部分:,电量,应该,比如,另外,分裂细胞,延长,运动过度,出
Nearest to 配有:,相当于,再用,当,但,来源,道理,充电,一天
Nearest to 觉得:,出现,方面,机器,类,方法,长期,更加,阴
Nearest to 一个:,充电,慢,最,睡觉,不动,到,机器,对
Nearest to 对:,变压器,对于,起来,4.0,UNK,完,于是,阳
Nearest to 应该:,一部分,训练,每次,而,如此,人会,神经,对应
Nearest to 例子:,同时,反映,短时间,时候,化成,原理,放尽,本领
Nearest to 更好:,细胞,节约用电,市面上,传过来,时候,某些,严格执行,我们
Average loss at step  82000 :  1.4621897186264396
Average loss at step  84000 :  1.462577383324504
Average loss at step  86000 :  1.4641863777190447
Average loss at step  88000 :  1.4599675855785608
Average loss at step  90000 :  1.4611568234860897
Nearest to 内在:,理解,传过来,2.0,输出,起来,4.0,高能,支配
Nearest to 大:,走,本领,适量,不同,更新,影响,人,快
Nearest to 南孚:,更加,这样,才,跑,就是,被,电足,终端
Nearest to 我们:,复制,第一,快,端粒,放出,身体,更好,放空
Nearest to 电量:,物质,一部分,细胞分裂,都,这样,其,阳是,固定
Nearest to 举个:,很,基因,站,而,使用,两,一直,按住
Nearest to 增大:,总结,变成,走,长寿,整体,当,出去,达到
Nearest to 生长:,全身,QQ,阴是,人会,放尽,尤其,固定,适量
Nearest to 一部分:,电量,应该,比如,另外,分裂细胞,延长,运动过度,后
Nearest to 配有:,相当于,再用,当,但,来源,道理,充电,低电量
Nearest to 觉得:,出现,方面,机器,类,方法,更加,长期,阴
Nearest to 一个:,充电,机器,最,慢,不动,睡觉,对,到
Nearest to 对:,变压器,对于,起来,4.0,完,一个,于是,阳
Nearest to 应该:,一部分,训练,每次,而,如此,人会,神经,对应
Nearest to 例子:,同时,反映,时候,短时间,化成,原理,放尽,本领
Nearest to 更好:,细胞,节约用电,市面上,传过来,时候,某些,我们,严格执行
Average loss at step  92000 :  1.4600442194938659
Average loss at step  94000 :  1.4629577462822199
Average loss at step  96000 :  1.460022892072797
Average loss at step  98000 :  1.4618379122763872
Average loss at step  100000 :  1.458584585454315
Nearest to 内在:,理解,传过来,2.0,输出,起来,高能,4.0,速度
Nearest to 大:,走,本领,适量,不同,影响,更新,快,人
Nearest to 南孚:,更加,这样,才,跑,被,就是,终端,电足
Nearest to 我们:,复制,第一,快,身体,端粒,更好,放出,我
Nearest to 电量:,物质,一部分,这样,其,都,细胞分裂,阳是,固定
Nearest to 举个:,很,基因,站,而,使用,两,因为,用尽
Nearest to 增大:,总结,变成,走,整体,长寿,当,出去,达到
Nearest to 生长:,QQ,全身,阴是,人会,放尽,反映,固定,尤其
Nearest to 一部分:,电量,应该,比如,另外,分裂细胞,延长,运动过度,后
Nearest to 配有:,相当于,再用,当,但,来源,道理,充电,低电量
Nearest to 觉得:,出现,方面,机器,类,方法,更加,长期,阴
Nearest to 一个:,充电,对,机器,不动,最,慢,睡觉,300
Nearest to 对:,变压器,对于,一个,起来,4.0,于是,完,UNK
Nearest to 应该:,一部分,训练,每次,而,如此,对应,神经,人会
Nearest to 例子:,同时,反映,时候,短时间,化成,原理,放尽,本领
Nearest to 更好:,细胞,节约用电,市面上,传过来,某些,严格执行,我们,时候

9-30 Basic Seq2Seq
Generate an irregular simulated curve by superimposing randomly shifted and scaled sin and cos waves, then train a Seq2Seq model on it to fit its features, so that the data at the next time step can be predicted.
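Each training sample is simply two randomly shifted and scaled waves added together, with the first half of the curve fed to the encoder and the second half used as the decoder target. A stripped-down sketch of one such sample (fixed constants here; `do_generate_x_y` in the program below randomizes the offsets, frequencies and amplitudes):

```python
import math
import numpy as np

seqlen = 15
t = np.linspace(0.0, 3.0 * math.pi, seqlen * 2)  # 2*seqlen points covering the whole curve

# One irregular curve: a sine plus a cosine (amplitude/frequency/phase fixed here,
# randomized in do_generate_x_y).
curve = 0.7 * np.sin(2.0 * t + 0.3) + 0.9 * np.cos(5.0 * t + 1.1)

x = curve[:seqlen]  # encoder input: the first seqlen values
y = curve[seqlen:]  # decoder target: the next seqlen values
print(x.shape, y.shape)  # (15,) (15,)
```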

Program:


import random
import math

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

'''-----------------------------------------------------------------------------------'''
#1 定义模拟样本函数
def do_generate_x_y(isTrain, batch_size, seqlen):
    batch_x = []
    batch_y = []
    for _ in range(batch_size):
        offset_rand = random.random() * 2 * math.pi
        freq_rand = (random.random() - 0.5) / 1.5 * 15 + 0.5
        amp_rand = random.random() + 0.1

        sin_data = amp_rand * np.sin(np.linspace(
            seqlen / 15.0 * freq_rand * 0.0 * math.pi + offset_rand,
            seqlen / 15.0 * freq_rand * 3.0 * math.pi + offset_rand, seqlen * 2))

        offset_rand = random.random() * 2 * math.pi
        freq_rand = (random.random() - 0.5) / 1.5 * 15 + 0.5
        amp_rand = random.random() * 1.2

        sig_data = amp_rand * np.cos(np.linspace(
            seqlen / 15.0 * freq_rand * 0.0 * math.pi + offset_rand,
            seqlen / 15.0 * freq_rand * 3.0 * math.pi + offset_rand, seqlen * 2)) + sin_data

        batch_x.append(np.array([sig_data[:seqlen]]).T)
        batch_y.append(np.array([sig_data[seqlen:]]).T)

    # 当前shape: (batch_size, seq_length, output_dim)
    batch_x = np.array(batch_x).transpose((1, 0, 2))
    batch_y = np.array(batch_y).transpose((1, 0, 2))
    # 转换后shape: (seq_length, batch_size, output_dim)

    return batch_x, batch_y


# Generate sequences of 15 consecutive points by superimposing randomly shifted/scaled cos and sin values
def generate_data(isTrain, batch_size):
    seq_length = 15
    if isTrain:
        return do_generate_x_y(isTrain, batch_size, seq_length)
    else:
        return do_generate_x_y(isTrain, batch_size, seq_length * 2)
'''-----------------------------------------------------------------------------------'''
#2 参数及网络结构

sample_now, sample_f = generate_data(isTrain=True, batch_size=3)
print("training examples : ")
print(sample_now.shape)
print("(seq_length, batch_size, output_dim)")

seq_length = sample_now.shape[0]
batch_size = 10

output_dim = input_dim = sample_now.shape[-1]
hidden_dim = 12
layers_num = 2

# Optimizer:
learning_rate = 0.04
nb_iters = 100

lambda_l2_reg = 0.003  # L2 regularization of weights - avoids overfitting

tf.reset_default_graph()

encoder_input = []
expected_output = []
decode_input = []
for i in range(seq_length):
    encoder_input.append(tf.placeholder(tf.float32, shape=(None, input_dim)))
    expected_output.append(tf.placeholder(tf.float32, shape=(None, output_dim)))
    decode_input.append(tf.placeholder(tf.float32, shape=(None, input_dim)))

tcells = []
for i in range(layers_num):
    tcells.append(tf.contrib.rnn.GRUCell(hidden_dim))
Mcell = tf.contrib.rnn.MultiRNNCell(tcells)

dec_outputs, dec_memory = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(encoder_input, decode_input, Mcell)

reshaped_outputs = []
for ii in dec_outputs:
    reshaped_outputs.append(tf.contrib.layers.fully_connected(ii, output_dim, activation_fn=None))
'''-----------------------------------------------------------------------------------'''
#3 定义loss函数及优化器

# L2 loss
output_loss = 0
for _y, _Y in zip(reshaped_outputs, expected_output):
    output_loss += tf.reduce_mean(tf.pow(_y - _Y, 2))

# L2 regularization on the weights (improves generalization capacity)
reg_loss = 0
for tf_var in tf.trainable_variables():
    if not ("fully_connected" in tf_var.name):
        # print(tf_var.name)
        reg_loss += tf.reduce_mean(tf.nn.l2_loss(tf_var))

loss = output_loss + lambda_l2_reg * reg_loss
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
'''-----------------------------------------------------------------------------------'''
#4 启用session开始训练

sess = tf.InteractiveSession()

def train_batch(batch_size):
    X, Y = generate_data(isTrain=True, batch_size=batch_size)
    feed_dict = {encoder_input[t]: X[t] for t in range(len(encoder_input))}
    feed_dict.update({expected_output[t]: Y[t] for t in range(len(expected_output))})

    c = np.concatenate(([np.zeros_like(Y[0])], Y[:-1]), axis=0)

    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})

    _, loss_t = sess.run([train_op, loss], feed_dict)
    return loss_t


def test_batch(batch_size):
    X, Y = generate_data(isTrain=True, batch_size=batch_size)
    feed_dict = {encoder_input[t]: X[t] for t in range(len(encoder_input))}
    feed_dict.update({expected_output[t]: Y[t] for t in range(len(expected_output))})
    c = np.concatenate(([np.zeros_like(Y[0])], Y[:-1]), axis=0)  # 来预测最后一个序列
    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})
    output_lossv, reg_lossv, loss_t = sess.run([output_loss, reg_loss, loss], feed_dict)
    print("-----------------")
    print(output_lossv, reg_lossv)
    return loss_t


# Training
train_losses = []
test_losses = []

sess.run(tf.global_variables_initializer())
for t in range(nb_iters + 1):
    train_loss = train_batch(batch_size)
    train_losses.append(train_loss)
    if t % 50 == 0:
        test_loss = test_batch(batch_size)
        test_losses.append(test_loss)
        print("Step {}/{}, train loss: {}, \tTEST loss: {}".format(t, nb_iters, train_loss, test_loss))
print("Fin. train loss: {}, \tTEST loss: {}".format(train_loss, test_loss))

# 输出loss图例 Plot loss over time:
plt.figure(figsize=(12, 6))
plt.plot(np.array(range(0, len(test_losses))) /
         float(len(test_losses) - 1) * (len(train_losses) - 1),
         np.log(test_losses), label="Test loss")

plt.plot(np.log(train_losses), label="Train loss")
plt.title("Training errors over time (on a logarithmic scale)")
plt.xlabel('Iteration')
plt.ylabel('log(Loss)')
plt.legend(loc='best')
plt.show()
print("******************************************************")
'''-----------------------------------------------------------------------------------'''
#5 准备可视化数据

# Test
nb_predictions = 5
print("visualize {} predictions data:".format(nb_predictions))

preout = []
X, Y = generate_data(isTrain=False, batch_size=nb_predictions)
print(np.shape(X), np.shape(Y))
for tt in range(seq_length):
    feed_dict = {encoder_input[t]: X[t + tt] for t in range(seq_length)}
    feed_dict.update({expected_output[t]: Y[t + tt] for t in range(len(expected_output))})
    c = np.concatenate(([np.zeros_like(Y[0])], Y[tt:seq_length + tt - 1]), axis=0)  # 从前15个的最后一个开始预测

    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})
    outputs = np.array(sess.run([reshaped_outputs], feed_dict)[0])
    preout.append(outputs[-1])

print(np.shape(preout))  # 将每个未知预测值收集起来准备显示出来。
preout = np.reshape(preout, [seq_length, nb_predictions, output_dim])
'''-----------------------------------------------------------------------------------'''
#6 画图显示数据

for j in range(nb_predictions):
    plt.figure(figsize=(12, 3))

    for k in range(output_dim):
        past = X[:, j, k]
        expected = Y[seq_length - 1:, j, k]  # 对应预测值的打印

        pred = preout[:, j, k]

        label1 = "past" if k == 0 else "_nolegend_"
        label2 = "future" if k == 0 else "_nolegend_"
        label3 = "Pred" if k == 0 else "_nolegend_"
        plt.plot(range(len(past)), past, "o--b", label=label1)
        plt.plot(range(len(past), len(expected) + len(past)),
                 expected, "x--b", label=label2)
        plt.plot(range(len(past), len(pred) + len(past)),
                 pred, "o--y", label=label3)

    plt.legend(loc='best')
    plt.title("Predictions vs. future")
    plt.show()
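Note that in `train_batch`/`test_batch` above the decoder is not fed its own predictions during training: `decode_input` is the expected output shifted right by one step, with an all-zero frame as the start symbol. A tiny numpy illustration of that shift (shapes shrunk for readability):

```python
import numpy as np

# Y has shape (seq_length, batch_size, output_dim); here 4 steps, batch of 1, 1 dim.
Y = np.arange(1, 5, dtype=np.float32).reshape(4, 1, 1)

# Same construction as in train_batch: prepend a zero frame and drop the last frame,
# so decode_input[t] receives the ground-truth value of step t-1.
c = np.concatenate(([np.zeros_like(Y[0])], Y[:-1]), axis=0)

print(Y.ravel())  # [1. 2. 3. 4.]
print(c.ravel())  # [0. 1. 2. 3.]
```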

Result:

training examples : 
(15, 3, 1)
(seq_length, batch_size, output_dim)
-----------------
8.529151 110.92636
Step 0/100, train loss: 8.096626281738281, 	TEST loss: 8.861929893493652
-----------------
1.4181571 91.12217
Step 50/100, train loss: 1.5739840269088745, 	TEST loss: 1.691523551940918
-----------------
0.59387666 80.93881
Step 100/100, train loss: 1.0527762174606323, 	TEST loss: 0.8366931080818176
Fin. train loss: 1.0527762174606323, 	TEST loss: 0.8366931080818176
******************************************************
visualize 5 predictions data:
(30, 5, 1) (30, 5, 1)
(15, 5, 1)

(Figures omitted: the log-scale training/test loss curve and the five "Predictions vs. future" plots.)
9-32 seq2seqstock
Train a seq2seq model on the price data of a single stock to fit its features, so that the next day's stock price can be predicted.
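`loadstock` in the program below turns the closing-price series into overlapping windows: each sample takes `window_size` consecutive prices as the encoder input and the following `window_size` prices as the target. A small sketch of that windowing on a made-up price list (the program itself uses `window_size = 40` and real CSV data):

```python
import numpy as np

window_size = 3                               # the program below uses 40
prices = np.arange(10, 20, dtype=np.float32)  # hypothetical closing prices

X, Y = [], []
for i in range(len(prices) - window_size * 2):
    X.append(prices[i:i + window_size])                    # "past" window -> encoder input
    Y.append(prices[i + window_size:i + window_size * 2])  # the window right after it -> target

X = np.reshape(X, [-1, window_size, 1])
Y = np.reshape(Y, [-1, window_size, 1])
print(X.shape, Y.shape)  # (4, 3, 1) (4, 3, 1)
```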

Program:



import random
import math

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
'''---------------------------------------------------------'''
#2 导入股票数据
pd.options.mode.chained_assignment = None  # default='warn'


def loadstock(window_size):
    names = ['date',
             'code',
             'name',
             'Close',
             'top_price',
             'low_price',
             'opening_price',
             'bef_price',
             'floor_price',
             'floor',
             'exchange',
             'Volume',
             'amount',
             '总市值',
             '流通市值']
    data = pd.read_csv('C:/Users/50633/Desktop/素材与样本/实例73 素材/600000.csv', names=names, header=None, encoding="gbk")
    # predictor_names = ["Close",'top_price',"low_price","opening_price"]
    predictor_names = ["Close"]
    training_features = np.asarray(data[predictor_names], dtype="float32")
    kept_values = training_features[1000:]

    X = []
    Y = []
    for i in range(len(kept_values) - window_size * 2):  # x: the first window_size values; y: the window_size values that follow
        X.append(kept_values[i:i + window_size])
        Y.append(kept_values[i + window_size:i + window_size * 2])

    X = np.reshape(X, [-1, window_size, len(predictor_names)])
    Y = np.reshape(Y, [-1, window_size, len(predictor_names)])
    print(np.shape(X))

    return X, Y

X_train = []
Y_train = []
X_test = []
Y_test = []
'''---------------------------------------------------------'''
#3 生成样本

def generate_data(isTrain, batch_size):
    # 40 past values for the encoder, the 40 values after them for the decoder's predictions.

    seq_length = 40
    seq_length_test = 80

    global Y_train
    global X_train
    global X_test
    global Y_test
    # First load, with memoization:
    if len(Y_train) == 0:
        X, Y = loadstock(window_size=seq_length)
        # X, Y = normalizestock(X, Y)

        # Split 80-20:
        X_train = X[:int(len(X) * 0.8)]
        Y_train = Y[:int(len(Y) * 0.8)]

    if len(Y_test) == 0:
        X, Y = loadstock(window_size=seq_length_test)
        # X, Y = normalizestock(X, Y)

        # Split 80-20:
        X_test = X[int(len(X) * 0.8):]
        Y_test = Y[int(len(Y) * 0.8):]

    if isTrain:
        return do_generate_x_y(X_train, Y_train, batch_size)
    else:
        return do_generate_x_y(X_test, Y_test, batch_size)


def do_generate_x_y(X, Y, batch_size):
    assert X.shape == Y.shape, (X.shape, Y.shape)
    idxes = np.random.randint(X.shape[0], size=batch_size)
    X_out = np.array(X[idxes]).transpose((1, 0, 2))
    Y_out = np.array(Y[idxes]).transpose((1, 0, 2))
    return X_out, Y_out


sample_now, sample_f = generate_data(isTrain=True, batch_size=3)
print("training examples : ")
print(sample_now.shape)
print("(seq_length, batch_size, output_dim)")
'''---------------------------------------------------------'''
#4 运行程序查看结果

seq_length = sample_now.shape[0]
batch_size = 100

output_dim = input_dim = sample_now.shape[-1]
hidden_dim = 12
layers_num = 2

# Optimizer:
learning_rate = 0.04
# nb_iters = 100
nb_iters = 100000
lambda_l2_reg = 0.003  # L2 regularization of weights - avoids overfitting

tf.reset_default_graph()

encoder_input = []
expected_output = []
decode_input = []
for i in range(seq_length):
    encoder_input.append(tf.placeholder(tf.float32, shape=(None, input_dim)))
    expected_output.append(tf.placeholder(tf.float32, shape=(None, output_dim)))
    decode_input.append(tf.placeholder(tf.float32, shape=(None, input_dim)))

tcells = []
for i in range(layers_num):
    tcells.append(tf.contrib.rnn.GRUCell(hidden_dim))
Mcell = tf.contrib.rnn.MultiRNNCell(tcells)

dec_outputs, dec_memory = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(encoder_input, decode_input, Mcell)

reshaped_outputs = []
for ii in dec_outputs:
    reshaped_outputs.append(tf.contrib.layers.fully_connected(ii, output_dim, activation_fn=None))

# L2 loss
output_loss = 0
for _y, _Y in zip(reshaped_outputs, expected_output):
    output_loss += tf.reduce_mean(tf.pow(_y - _Y, 2))

# L2 regularization on the weights (improves generalization capacity)
reg_loss = 0
for tf_var in tf.trainable_variables():
    if not ("fully_connected" in tf_var.name):
        # print(tf_var.name)
        reg_loss += tf.reduce_mean(tf.nn.l2_loss(tf_var))

loss = output_loss + lambda_l2_reg * reg_loss
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.InteractiveSession()


def train_batch(batch_size):
    X, Y = generate_data(isTrain=True, batch_size=batch_size)
    feed_dict = {encoder_input[t]: X[t] for t in range(len(encoder_input))}
    feed_dict.update({expected_output[t]: Y[t] for t in range(len(expected_output))})

    c = np.concatenate(([np.zeros_like(Y[0])], Y[:-1]), axis=0)

    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})

    _, loss_t = sess.run([train_op, loss], feed_dict)
    return loss_t


def test_batch(batch_size):
    X, Y = generate_data(isTrain=True, batch_size=batch_size)
    feed_dict = {encoder_input[t]: X[t] for t in range(len(encoder_input))}
    feed_dict.update({expected_output[t]: Y[t] for t in range(len(expected_output))})
    c = np.concatenate(([np.zeros_like(Y[0])], Y[:-1]), axis=0)  # 来预测最后一个序列
    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})
    output_lossv, reg_lossv, loss_t = sess.run([output_loss, reg_loss, loss], feed_dict)
    print("-----------------")
    print(output_lossv, reg_lossv)
    return loss_t


# Training
train_losses = []
test_losses = []

sess.run(tf.global_variables_initializer())
for t in range(nb_iters + 1):
    train_loss = train_batch(batch_size)
    train_losses.append(train_loss)
    if t % 1000 == 0:
        test_loss = test_batch(batch_size)
        test_losses.append(test_loss)
        print("Step {}/{}, train loss: {}, \tTEST loss: {}".format(t, nb_iters, train_loss, test_loss))
print("Fin. train loss: {}, \tTEST loss: {}".format(train_loss, test_loss))

# Plot loss over time:
plt.figure(figsize=(12, 6))
plt.plot(np.array(range(0, len(test_losses))) /
         float(len(test_losses) - 1) * (len(train_losses) - 1),
         np.log(test_losses), label="Test loss")

plt.plot(np.log(train_losses), label="Train loss")
plt.title("Training errors over time (on a logarithmic scale)")
plt.xlabel('Iteration')
plt.ylabel('log(Loss)')
plt.legend(loc='best')
plt.show()

# Test
nb_predictions = 5
print("visualize {} predictions data:".format(nb_predictions))

preout = []
X, Y = generate_data(isTrain=False, batch_size=nb_predictions)
print(np.shape(X), np.shape(Y))
for tt in range(seq_length):
    feed_dict = {encoder_input[t]: X[t + tt] for t in range(seq_length)}
    feed_dict.update({expected_output[t]: Y[t + tt] for t in range(len(expected_output))})
    c = np.concatenate(([np.zeros_like(Y[0])], Y[tt:seq_length + tt - 1]), axis=0)  # start predicting from the last of the first seq_length (here 40) values

    feed_dict.update({decode_input[t]: c[t] for t in range(len(c))})
    outputs = np.array(sess.run([reshaped_outputs], feed_dict)[0])
    preout.append(outputs[-1])

print(np.shape(preout))  # 将每个未知预测值收集起来准备显示出来。
preout = np.reshape(preout, [seq_length, nb_predictions, output_dim])

for j in range(nb_predictions):
    plt.figure(figsize=(12, 3))

    for k in range(output_dim):
        past = X[:, j, k]
        expected = Y[seq_length - 1:, j, k]  # 对应预测值的打印

        pred = preout[:, j, k]

        label1 = "past" if k == 0 else "_nolegend_"
        label2 = "future" if k == 0 else "_nolegend_"
        label3 = "Pred" if k == 0 else "_nolegend_"
        plt.plot(range(len(past)), past, "o--b", label=label1)
        plt.plot(range(len(past), len(expected) + len(past)),
                 expected, "x--b", label=label2)
        plt.plot(range(len(past), len(pred) + len(past)),
                 pred, "o--y", label=label3)

    plt.legend(loc='best')
    plt.title("Predictions vs. future")
    plt.show()

Result:
(Figures omitted: the log-scale training/test loss curve and the five "Predictions vs. future" plots.)

(3071, 40, 1)

(2991, 80, 1)
training examples : 
(40, 3, 1)
(seq_length, batch_size, output_dim)
  
-----------------
17317.17 118.2144
Step 0/10000, train loss: 13335.80078125, 	TEST loss: 17317.525390625
-----------------
182.19443 564.5218
Step 500/10000, train loss: 183.95169067382812, 	TEST loss: 183.88800048828125
-----------------
109.32391 748.95087
Step 1000/10000, train loss: 234.47450256347656, 	TEST loss: 111.57076263427734
-----------------
127.43893 777.31274
Step 1500/10000, train loss: 146.54627990722656, 	TEST loss: 129.77085876464844
-----------------
140.89859 862.653
Step 2000/10000, train loss: 152.479736328125, 	TEST loss: 143.48654174804688
-----------------
186.59155 1935.742
Step 2500/10000, train loss: 166.63821411132812, 	TEST loss: 192.39877319335938
-----------------
131.27348 1839.9236
Step 3000/10000, train loss: 186.79937744140625, 	TEST loss: 136.7932586669922
-----------------
186.177 1803.2905
Step 3500/10000, train loss: 178.34823608398438, 	TEST loss: 191.5868682861328
-----------------
147.16472 1690.1251
Step 4000/10000, train loss: 172.9777374267578, 	TEST loss: 152.23509216308594
 -----------------
155.39336 1593.2106
Step 4500/10000, train loss: 121.65401458740234, 	TEST loss: 160.17298889160156
-----------------
145.59406 1529.6902
Step 5000/10000, train loss: 140.6916046142578, 	TEST loss: 150.18312072753906
-----------------
113.53698 1934.22
Step 5500/10000, train loss: 186.61105346679688, 	TEST loss: 119.33963775634766
-----------------
143.49596 1809.5713
Step 6000/10000, train loss: 206.01654052734375, 	TEST loss: 148.92466735839844
-----------------
177.0319 1704.553
Step 6500/10000, train loss: 130.490966796875, 	TEST loss: 182.14556884765625
-----------------
151.47963 1659.7113
Step 7000/10000, train loss: 163.80068969726562, 	TEST loss: 156.45877075195312
-----------------
156.02988 1966.3689
Step 7500/10000, train loss: 179.2886962890625, 	TEST loss: 161.92898559570312
-----------------
107.89595 1814.5974
Step 8000/10000, train loss: 150.6245880126953, 	TEST loss: 113.3397445678711
-----------------
177.36761 1789.2323
Step 8500/10000, train loss: 126.08475494384766, 	TEST loss: 182.7353057861328
-----------------
162.47433 1727.0645
Step 9000/10000, train loss: 164.741455078125, 	TEST loss: 167.65553283691406
-----------------
159.67093 1695.32
Step 9500/10000, train loss: 162.73403930664062, 	TEST loss: 164.7568817138672
-----------------
154.89792 1658.6976
Step 10000/10000, train loss: 122.2144546508789, 	TEST loss: 159.87400817871094
Fin. train loss: 122.2144546508789, 	TEST loss: 159.87400817871094
visualize 5 predictions data:
(80, 5, 1) (80, 5, 1)
(40, 5, 1)
