import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle
import re
# Load the user data
users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine='python', nrows=150)  # nrows=150: small subset for a quick demo
users = users.filter(regex='UserID|Gender|Age|JobID')  # keep only the fields we use (drops Zip-code); pandas.DataFrame
users_orig = users.values #numpy.ndarray
gender_map = {'F': 0, 'M': 1}
users['Gender'] = users['Gender'].map(gender_map) # type pandas.Series
age_map = {val: ii for ii, val in enumerate(sorted(set(users['Age'])))}  # sorted so the mapping is stable across runs
users['Age'] = users['Age'].map(age_map)
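# Quick look at the encoded user table (illustrative; the exact Age ids depend
# on the mapping built above):
# print(users.head())   # Gender is now 0/1 and Age a small category id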
# Movie data
movies_title = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine='python', nrows=150)
movies_orig = movies.values
pattern = re.compile(r'^(.*)\((\d+)\)$')
title_map = {val: pattern.match(val).group(1) for val in set(movies['Title'])}  # strip the "(year)" suffix from each title
movies['Title'] = movies['Title'].map(title_map)
genres_set = set()  # collect all genres into a set, e.g. {'Sci-Fi', 'Mystery', 'Crime'}
for val in movies['Genres'].str.split('|'):
    genres_set.update(val)  # add each genre from the '|'-separated list
genres_set.add('<PAD>')  # padding token used to make the genre lists equal length
genres2int = {val: ii for ii, val in enumerate(sorted(genres_set))}  # genre -> integer id, sorted for stability, e.g. {'<PAD>': 0, 'Action': 1, ...}
genres_map = {val: [genres2int[row] for row in val.split('|')] for val in set(movies['Genres'])}  # e.g. {'Crime|Thriller': [15, 2], ...}
genres_count = 18  # fixed genre-list length; must match the movie_categories input shape below
for key in genres_map:  # pad every genre list to length 18 with <PAD>
    for cnt in range(genres_count - len(genres_map[key])):
        genres_map[key].append(genres2int['<PAD>'])
movies['Genres'] = movies['Genres'].map(genres_map)
title_set = set()  # build a vocabulary of title words
for val in movies['Title'].str.split():
title_set.update(val)
title_set.add('<PAD>')
title2int = {val: ii for ii, val in enumerate(sorted(title_set))}  # word -> integer id, sorted for stability
title_count = 15  # pad every title to a fixed length of 15 words
title_map = {val: [title2int[row] for row in val.split()] for val in set(movies['Title'])}
for key in title_map:
    for cnt in range(title_count - len(title_map[key])):
        title_map[key].append(title2int['<PAD>'])
movies['Title'] = movies['Title'].map(title_map)
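# Sanity check (illustrative): every encoded list should now have a fixed
# length, matching the model input shapes defined further down.
assert all(len(v) == 18 for v in movies['Genres'])          # genre lists padded to 18
assert all(len(v) == title_count for v in movies['Title'])  # title lists padded to 15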
# Load the ratings dataset
ratings_title = ['UserID','MovieID', 'ratings', 'timestamps']
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine='python', nrows=150)
ratings = ratings.filter(regex='UserID|MovieID|ratings')
# Merge the three tables on their shared keys (UserID and MovieID)
data = pd.merge(pd.merge(ratings, users), movies)  # pd.merge(df1, df2) joins on the columns the two frames share
# Split the data into a feature table X and a target table y
target_fields = ['ratings']
features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields] #pandas.DataFrame
features = features_pd.values # numpy.ndarray
targets_values = targets_pd.values # numpy.ndarray
with open('preprocess.p', 'wb') as f:
    pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), f)
with open('preprocess.p', 'rb') as f:
    title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(f)
import tensorflow as tf
def save_params(params):
    with open('params.p', 'wb') as f:
        pickle.dump(params, f)
def load_params():
    with open('params.p', 'rb') as f:
        return pickle.load(f)
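# Usage sketch for the helpers above (hypothetical values; the script itself
# never calls them):
# save_params((embed_dim, title_count))
# embed_dim, title_count = load_params()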
embed_dim = 32  # embedding dimension
uid_max = max(features.take(0, 1)) + 1  # number of user IDs: 6040 + 1 = 6041 on the full dataset
gender_max = max(features.take(2, 1)) + 1  # number of genders: 1 + 1 = 2
age_max = max(features.take(3, 1)) + 1  # number of age buckets: 6 + 1 = 7
job_max = max(features.take(4, 1)) + 1  # number of jobs: 20 + 1 = 21
movie_id_max = max(features.take(1, 1)) + 1  # number of movie IDs: 3952 + 1 = 3953 on the full dataset
movie_categories_max = max(genres2int.values()) + 1  # number of genres: 18 + 1 = 19
movie_title_max = len(title_set)  # title vocabulary size: 5216 on the full dataset
combiner = "sum"  # how the genre embeddings are merged; "mean" was considered but never implemented
sentences_size = title_count  # title length: 15
window_sizes = {2, 3, 4, 5}  # text-CNN window sizes: convolve over 2, 3, 4, and 5 words at a time
filter_num = 8  # number of convolution filters per window size
movieid2idx = {val[0]: i for i, val in enumerate(movies.values)}  # MovieID -> row index; IDs and row positions differ, e.g. row 5 is not necessarily MovieID 5
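# Illustrative lookup: movieid2idx maps a raw MovieID to its row in `movies`,
# which is what inference code needs because IDs and row positions differ:
# toy_story_row = movies.values[movieid2idx[1]]  # MovieID 1, not necessarily row 1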
num_epochs = 5 # Number of Epochs
batch_size = 256
dropout_keep = 0.5
learning_rate = 0.0001
show_every_n_batches = 20 # Show stats for every n number of batches
save_dir = './save'
def get_inputs():  # define the model inputs
uid = tf.keras.layers.Input(shape=(1,), dtype='int32', name='uid')
user_gender = tf.keras.layers.Input(shape=(1,), dtype='int32', name='user_gender')
user_age = tf.keras.layers.Input(shape=(1,), dtype='int32', name='user_age')
user_job = tf.keras.layers.Input(shape=(1,), dtype='int32', name='user_job')
movie_id = tf.keras.layers.Input(shape=(1,), dtype='int32', name='movie_id')
movie_categories = tf.keras.layers.Input(shape=(18,), dtype='int32', name='movie_categories')
movie_titles = tf.keras.layers.Input(shape=(15,), dtype='int32', name='movie_titles')
return uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles
def get_user_embedding(uid, user_gender, user_age, user_job):  # embedding layers for the user features
uid_embed_layer = tf.keras.layers.Embedding(uid_max, embed_dim, input_length=1, name='uid_embed_layer')(uid)
gender_embed_layer = tf.keras.layers.Embedding(gender_max, embed_dim // 2, input_length=1, name='gender_embed_layer')(user_gender)
age_embed_layer = tf.keras.layers.Embedding(age_max, embed_dim // 2, input_length=1, name='age_embed_layer')(user_age)
job_embed_layer = tf.keras.layers.Embedding(job_max, embed_dim // 2, input_length=1, name='job_embed_layer')(user_job)
return uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer
def get_user_feature_layer(uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer):
    # First fully connected layer: each user embedding gets its own dense projection
uid_fc_layer = tf.keras.layers.Dense(embed_dim, name="uid_fc_layer", activation='relu')(uid_embed_layer)
gender_fc_layer = tf.keras.layers.Dense(embed_dim, name="gender_fc_layer", activation='relu')(gender_embed_layer)
age_fc_layer = tf.keras.layers.Dense(embed_dim, name="age_fc_layer", activation='relu')(age_embed_layer)
job_fc_layer = tf.keras.layers.Dense(embed_dim, name="job_fc_layer", activation='relu')(job_embed_layer)
    # Second fully connected layer: concatenate everything into one 200-d user feature
user_combine_layer = tf.keras.layers.concatenate([uid_fc_layer, gender_fc_layer, age_fc_layer, job_fc_layer], 2) #(?, 1, 128)
user_combine_layer = tf.keras.layers.Dense(200, activation='tanh')(user_combine_layer) #(?, 1, 200)
user_combine_layer_flat = tf.keras.layers.Reshape([200], name="user_combine_layer_flat")(user_combine_layer)
return user_combine_layer, user_combine_layer_flat
def get_movie_id_embed_layer(movie_id):  # embedding layer for the movie ID
movie_id_embed_layer = tf.keras.layers.Embedding(movie_id_max, embed_dim, input_length=1, name='movie_id_embed_layer')(movie_id)
return movie_id_embed_layer
def get_movie_categories_layers(movie_categories):  # sum the genre embeddings into a single vector
movie_categories_embed_layer = tf.keras.layers.Embedding(movie_categories_max, embed_dim, input_length=18, name='movie_categories_embed_layer')(movie_categories)
movie_categories_embed_layer = tf.keras.layers.Lambda(lambda layer: tf.reduce_sum(layer, axis=1, keepdims=True))(movie_categories_embed_layer)
return movie_categories_embed_layer
def get_movie_cnn_layer(movie_titles):  # text CNN over the movie title
    # look up the embedding vector of each word in the title
movie_title_embed_layer = tf.keras.layers.Embedding(movie_title_max, embed_dim, input_length=15, name='movie_title_embed_layer')(movie_titles)
    sp = movie_title_embed_layer.shape
    movie_title_embed_layer_expand = tf.keras.layers.Reshape([sp[1], sp[2], 1])(movie_title_embed_layer)  # add a channel dim for Conv2D
    # convolve the title embeddings with several window sizes, then max-pool
pool_layer_lst = []
for window_size in window_sizes:
conv_layer = tf.keras.layers.Conv2D(filter_num, (window_size, embed_dim), 1, activation='relu')(movie_title_embed_layer_expand)
        maxpool_layer = tf.keras.layers.MaxPooling2D(pool_size=(sentences_size - window_size + 1, 1), strides=1)(conv_layer)
pool_layer_lst.append(maxpool_layer)
    # concatenate the pooled features and apply dropout
    pool_layer = tf.keras.layers.concatenate(pool_layer_lst, 3, name="pool_layer")
    max_num = len(window_sizes) * filter_num
    pool_layer_flat = tf.keras.layers.Reshape([1, max_num], name="pool_layer_flat")(pool_layer)
    dropout_layer = tf.keras.layers.Dropout(dropout_keep, name="dropout_layer")(pool_layer_flat)  # note: Keras Dropout takes a drop rate, not a keep probability; at 0.5 they coincide
return pool_layer_flat, dropout_layer
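# Standalone shape check for the text-CNN branch (a sketch, commented out like
# the driver lines at the bottom; the static output shape should be
# (None, 1, 32), i.e. 4 window sizes x 8 filters):
# _titles = tf.keras.layers.Input(shape=(sentences_size,), dtype='int32')
# _flat, _drop = get_movie_cnn_layer(_titles)
# print(_flat.shape)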
# Fully connect all the movie branches together to get the movie feature
def get_movie_feature_layer(movie_id_embed_layer, movie_categories_embed_layer, dropout_layer):
    # First fully connected layer
movie_id_fc_layer = tf.keras.layers.Dense(embed_dim, name="movie_id_fc_layer", activation='relu')(movie_id_embed_layer)
movie_categories_fc_layer = tf.keras.layers.Dense(embed_dim, name="movie_categories_fc_layer", activation='relu')(movie_categories_embed_layer)
    # Second fully connected layer
movie_combine_layer = tf.keras.layers.concatenate([movie_id_fc_layer, movie_categories_fc_layer, dropout_layer], 2)
movie_combine_layer = tf.keras.layers.Dense(200, activation='tanh')(movie_combine_layer)
movie_combine_layer_flat = tf.keras.layers.Reshape([200], name="movie_combine_layer_flat")(movie_combine_layer)
return movie_combine_layer, movie_combine_layer_flat
import time
MODEL_DIR = "./models"
class mv_network(object):
def __init__(self, batch_size=256):
self.batch_size = batch_size
self.best_loss = 9999
self.losses = {'train': [], 'test': []}
        uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles = get_inputs()  # model inputs
        uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer = get_user_embedding(uid, user_gender, user_age, user_job)  # the four user embeddings
        # user feature vector
        user_combine_layer, user_combine_layer_flat = get_user_feature_layer(uid_embed_layer, gender_embed_layer,
                                                                             age_embed_layer, job_embed_layer)
        movie_id_embed_layer = get_movie_id_embed_layer(movie_id)  # movie-ID embedding
        movie_categories_embed_layer = get_movie_categories_layers(movie_categories)  # summed genre embedding
        pool_layer_flat, dropout_layer = get_movie_cnn_layer(movie_titles)  # title feature from the text CNN
        movie_combine_layer, movie_combine_layer_flat = get_movie_feature_layer(movie_id_embed_layer,  # movie feature vector
                                                                                movie_categories_embed_layer, dropout_layer)
        # Predict the rating as the dot product of the user feature and the movie feature
        inference = tf.keras.layers.Lambda(lambda layer:
                                           tf.reduce_sum(layer[0] * layer[1], axis=1), name="inference")(
            (user_combine_layer_flat, movie_combine_layer_flat))
        inference = tf.keras.layers.Lambda(lambda layer: tf.expand_dims(layer, axis=1))(inference)
self.model = tf.keras.Model(
inputs=[uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles],
outputs=[inference])
self.model.summary()
self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        # MSE loss regresses the prediction onto the true rating
self.ComputeLoss = tf.keras.losses.MeanSquaredError()
self.ComputeMetrics = tf.keras.metrics.MeanAbsoluteError()
        if not tf.io.gfile.exists(MODEL_DIR):
            tf.io.gfile.makedirs(MODEL_DIR)
train_dir = os.path.join(MODEL_DIR, 'summaries', 'train')
test_dir = os.path.join(MODEL_DIR, 'summaries', 'eval')
checkpoint_dir = os.path.join(MODEL_DIR, 'checkpoints')
self.checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
self.checkpoint = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)
self.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
def compute_loss(self, labels, logits):
return tf.reduce_mean(tf.keras.losses.mse(labels, logits))
def compute_metrics(self, labels, logits):
        return tf.keras.metrics.mae(labels, logits)
@tf.function
def train_step(self, x, y):
with tf.GradientTape() as tape:
            logits = self.model([x[0], x[1], x[2], x[3], x[4], x[5], x[6]], training=True)
loss = self.ComputeLoss(y, logits)
self.ComputeMetrics(y, logits)
grads = tape.gradient(loss, self.model.trainable_variables)
self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
return loss, logits
def training(self, features, targets_values, epochs=5, log_freq=50):
for epoch_i in range(epochs):
            # Split into train and test sets; random_state is fixed, so the split is identical every epoch
            train_X, test_X, train_y, test_y = train_test_split(features, targets_values, test_size=0.2, random_state=0)
train_batches = get_batches(train_X, train_y, self.batch_size)
batch_num = (len(train_X) // self.batch_size)
train_start = time.time()
            start = time.time()
            avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
            for batch_i in range(batch_num):
                x, y = next(train_batches)
                # the genre and title columns hold Python lists; stack them into dense arrays
                categories = np.zeros([self.batch_size, 18])
                for i in range(self.batch_size):
                    categories[i] = x.take(6, 1)[i]
                titles = np.zeros([self.batch_size, sentences_size])
                for i in range(self.batch_size):
                    titles[i] = x.take(5, 1)[i]
                loss, logits = self.train_step([np.reshape(x.take(0, 1), [self.batch_size, 1]).astype(np.float32),
                                                np.reshape(x.take(2, 1), [self.batch_size, 1]).astype(np.float32),
                                                np.reshape(x.take(3, 1), [self.batch_size, 1]).astype(np.float32),
                                                np.reshape(x.take(4, 1), [self.batch_size, 1]).astype(np.float32),
                                                np.reshape(x.take(1, 1), [self.batch_size, 1]).astype(np.float32),
                                                categories.astype(np.float32),
                                                titles.astype(np.float32)],
                                               np.reshape(y, [self.batch_size, 1]).astype(np.float32))
                avg_loss(loss)
                self.losses['train'].append(loss)
                if tf.equal(self.optimizer.iterations % log_freq, 0):
                    rate = log_freq / (time.time() - start)
                    print('Step #{}\tEpoch {:>3} Batch {:>4}/{} Loss: {:0.6f} mae: {:0.6f} ({} steps/sec)'.format(
                        self.optimizer.iterations.numpy(), epoch_i, batch_i, batch_num, loss.numpy(), self.ComputeMetrics.result().numpy(), rate))
                    avg_loss.reset_states()
                    self.ComputeMetrics.reset_states()
                    start = time.time()
train_end = time.time()
print('\nTrain time for epoch #{} ({} total steps): {}'.format(epoch_i + 1, self.optimizer.iterations.numpy(),
train_end - train_start))
self.testing((test_X, test_y), self.optimizer.iterations)
self.export_path = os.path.join(MODEL_DIR, 'export')
tf.saved_model.save(self.model, self.export_path)
def testing(self, test_dataset, step_num):
test_X, test_y = test_dataset
test_batches = get_batches(test_X, test_y, self.batch_size)
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        self.ComputeMetrics.reset_states()  # clear any leftover training state so the test MAE is clean
batch_num = (len(test_X) // self.batch_size)
for batch_i in range(batch_num):
x, y = next(test_batches)
categories = np.zeros([self.batch_size, 18])
for i in range(self.batch_size):
categories[i] = x.take(6, 1)[i]
titles = np.zeros([self.batch_size, sentences_size])
for i in range(self.batch_size):
titles[i] = x.take(5, 1)[i]
logits = self.model([np.reshape(x.take(0, 1), [self.batch_size, 1]).astype(np.float32),
np.reshape(x.take(2, 1), [self.batch_size, 1]).astype(np.float32),
np.reshape(x.take(3, 1), [self.batch_size, 1]).astype(np.float32),
np.reshape(x.take(4, 1), [self.batch_size, 1]).astype(np.float32),
np.reshape(x.take(1, 1), [self.batch_size, 1]).astype(np.float32),
categories.astype(np.float32),
titles.astype(np.float32)], training=False)
test_loss = self.ComputeLoss(np.reshape(y, [self.batch_size, 1]).astype(np.float32), logits)
avg_loss(test_loss)
            # record the test loss
            self.losses['test'].append(test_loss)
            self.ComputeMetrics(np.reshape(y, [self.batch_size, 1]).astype(np.float32), logits)
        print('Model test set loss: {:0.6f} mae: {:0.6f}'.format(avg_loss.result().numpy(), self.ComputeMetrics.result().numpy()))
if avg_loss.result() < self.best_loss:
self.best_loss = avg_loss.result()
print("best loss = {}".format(self.best_loss))
self.checkpoint.save(self.checkpoint_prefix)
def forward(self, xs):
predictions = self.model(xs)
return predictions
def get_batches(Xs, ys, batch_size):  # yield successive (X, y) mini-batches; the last one may be short
for start in range(0, len(Xs), batch_size):
end = min(start + batch_size, len(Xs))
yield Xs[start:end], ys[start:end]
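# Usage sketch for get_batches (hypothetical toy arrays):
# for bx, by in get_batches(np.arange(10), np.arange(10), batch_size=4):
#     print(bx, by)  # batches of 4, 4, then a final short batch of 2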
#mv_net=mv_network()
#mv_net.training(features, targets_values, epochs=5)
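# After training, the recorded per-batch losses can be inspected (a sketch,
# commented out like the driver lines above; assumes matplotlib is available):
# import matplotlib.pyplot as plt
# plt.plot(mv_net.losses['train'], label='train')
# plt.plot(mv_net.losses['test'], label='test')
# plt.legend()
# plt.show()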