import os
import itertools
from collections import defaultdict

import numpy as np
import tensorflow as tf

from config import *  # expected to provide DATA_PATH and MODEL_PATH
embedding_size = 128
n_sampled = 500
batch_size = 1024
num_steps = 20001  # roughly data_size / batch_size * n_epoch
#num_steps = 180001
every_k_step = 10000
num_skips = 2  # must satisfy batch_size % num_skips == 0
window_size = 3
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
click_item = set()
def read_data(filename):
    """Read space-separated sessions of item ids and flatten them into one
    list, padding each session with window_size empty-string sentinels so
    skip-gram windows never cross session boundaries."""
    global click_item
    data = []
    with open(filename) as f:
        for line in f:
            items = [int(x) for x in line.strip().split(' ')]
            items.extend([''] * window_size)
            data.extend(items)
    click_item = set(filter(lambda x: x != '', data))
    print("click item size: {0}".format(len(click_item)))
    return data
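# Example (hypothetical input): a file holding the two sessions "1 2 3" and
# "4 5" yields, with window_size = 3,
#   [1, 2, 3, '', '', '', 4, 5, '', '', '']
# so skip-gram windows built by generate_batch never pair items from
# different sessions.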
def load_embedding():
    """Load pretrained title embeddings into a dense matrix, one row per item.

    Relies on side_info (item id in column 0) and id_map (raw id -> row
    index), which are expected to be defined by the surrounding training
    code."""
    rows = max(side_info[:, 0]) + 1
    emb = np.zeros((rows, embedding_size))
    hit = 0
    with open(os.path.join(DATA_PATH, 'product_title_embedding')) as f:
        for line in f:
            fields = line.strip().split('\t')
            product_id = fields[0]
            row = id_map.get(product_id, -1)
            if row >= 0:
                hit += 1
                emb[row] = [float(x) for x in fields[1:embedding_size + 1]]
    print("title embedding hit rate: {0}".format(hit / len(id_map)))
    return emb
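# File format assumed above (inferred from the parsing code): one item per
# line, tab-separated, raw id followed by embedding_size floats, e.g.
#   12345\t0.01\t-0.23\t...\t0.08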
data_index = 0
offset = -window_size

def generate_batch(batch_size, num_skip, skip_window):
    """Generate one skip-gram batch of (center, context) pairs.

    Walks the data list sequentially; the '' sentinels inserted by read_data
    keep windows from crossing session boundaries. data_index and offset
    persist across calls so successive batches resume where the previous one
    stopped. num_skip is kept for signature compatibility but is unused.
    """
    global data_index
    global offset
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    ind = 0
    done = False
    while ind < batch_size:
        for j in range(offset, skip_window + 1):
            # Skip the center itself, out-of-range contexts and session padding.
            if j == 0 or data_index + j < 0 or data[data_index] == '' or data[data_index + j] == '':
                continue
            batch[ind] = data[data_index]
            labels[ind, 0] = data[data_index + j]
            ind += 1
            if ind == batch_size:
                offset = j + 1  # resume mid-window on the next call
                done = True
                break
        if done:
            break
        data_index = (data_index + 1) % len(data)
        offset = -skip_window
    return batch, labels
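# Minimal usage sketch (assumes the module-level data list has been filled by
# read_data first):
#   batch, labels = generate_batch(batch_size=8, num_skip=2, skip_window=window_size)
#   # batch.shape == (8,) and labels.shape == (8, 1); each row pairs a center
#   # item with one of its in-session context items.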
def get_cold_start_embedding():
    """Compute merged embeddings for cold-start items (items that have side
    info but never appear in the click data).

    Relies on globals from the training code: side_info, sess,
    merged_embedding, train_input, train_label and ck_embedding.
    """
    global click_item
    global ck_embedding
    cold_start_item = set(side_info[:, 0]).difference(click_item)
    item_size = len(cold_start_item)
    cnt = item_size // batch_size
    remain = (cnt + 1) * batch_size - item_size
    # Pad to a whole number of batches; the padded rows are dropped below.
    cold_start_item = np.array(list(cold_start_item))
    cold_start_item = np.concatenate([cold_start_item, [0] * remain], axis=0)
    eval_embedding_list = []
    for i in range(cnt + 1):
        eval_input = cold_start_item[i * batch_size: (i + 1) * batch_size]
        eval_label = np.zeros((batch_size, 1))
        eval_embedding = sess.run(merged_embedding,
                                  feed_dict={train_input: eval_input, train_label: eval_label})
        eval_embedding_list.extend(eval_embedding.tolist())
    # Drop the embeddings computed for the padding rows.
    eval_embedding_list = eval_embedding_list[:-remain]
    ck_embedding.update({cold_start_item[k]: eval_embedding_list[k]
                         for k in range(len(eval_embedding_list))})
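# Sketch of the intended call order for the functions above (the file names
# are hypothetical; side_info, sess, merged_embedding, train_input,
# train_label and ck_embedding come from the training script):
#   data = read_data(os.path.join(DATA_PATH, 'click_sequences'))
#   ... build or restore the skip-gram graph ...
#   get_cold_start_embedding()
#   dump_embedding(ck_embedding, os.path.join(MODEL_PATH, 'ck_embedding'))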
def dump_embedding(embedding_result, output_file):
    """Write embeddings as 'raw_id v1 v2 ... vN' lines; reversed_id_map
    (row index -> raw item id) is expected from the training code."""
    with open(output_file, 'w') as f:
        for k, v in embedding_result.items():
            k = reversed_id_map[k]
            f.write("{0} {1}\n".format(k, " ".join(str(x) for x in v)))
def get_season_map(side_info_raw):
    """Normalize free-text season labels (column 2 of the raw side info).

    Canonical labels are built from the four seasons 春/夏/秋/冬
    (spring/summer/autumn/winter) plus 四季 ("all seasons"); quarter names
    such as 一季度 ("Q1") are first aliased to a single season. Labels that
    match nothing are bucketed under 无 ("none")."""
    season_list = ['春', '夏', '秋', '冬']
    li = []
    # All non-empty combinations of the four seasons, e.g. ('春',), ('春', '夏'), ...
    for i in range(1, len(season_list) + 1):
        li.extend(itertools.combinations(season_list, i))
    li.append('四季')
    mapping = {'春夏秋冬': '四季', '一季度': '春', '二季度': '夏', '三季度': '秋', '四季度': '冬'}
    d = defaultdict(set)
    for line in side_info_raw:
        season = line[2]
        best_cnt = 0
        best_match = ''
        # Exact aliases first ('春夏秋冬' and the quarter names).
        for m in mapping:
            if m in season:
                best_match = mapping[m]
                break
        if best_match == '':
            # Otherwise pick the season combination with the most characters
            # contained in the raw label.
            for ss in li:
                cnt = sum(1 for s in ss if s in season)
                if cnt > best_cnt:
                    best_cnt = cnt
                    best_match = "".join(ss)
        if best_match == '':
            d['无'].add(season)
        else:
            d[best_match].add(season)
    # Invert: raw label -> canonical label.
    season_mapping = {vv: k for k, v in d.items() for vv in v}
    return season_mapping
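# Example (hypothetical raw labels):
#   get_season_map([['id1', 'x', '春秋款'], ['id2', 'x', '二季度']])
#   returns {'春秋款': '春秋', '二季度': '夏'}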
if __name__ == '__main__':
    # Pretrained title embeddings: raw item id -> vector.
    title_emb = dict()
    with open(os.path.join(DATA_PATH, 'product_title_embedding')) as f:
        for line in f:
            fields = line.strip().split('\t')
            title_emb[fields[0]] = np.array([float(x) for x in fields[1:]])
    # Feature vocabulary: feature_dict[column][raw_value] -> embedding row index.
    feature_dict = {}
    with open(os.path.join(DATA_PATH, "feature_dict.old")) as f:
        for line in f:
            fields = line.strip().split('\t')
            ind = int(fields[0])
            if ind not in feature_dict:
                feature_dict[ind] = dict()
            feature_dict[ind][fields[1]] = int(fields[2])
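    # feature_dict.old format (inferred from the parsing above): tab-separated
    # <feature_column>\t<raw_value>\t<index> rows, e.g. "2\t春\t17" would map
    # the season value 春 in column 2 to embedding row 17 (hypothetical values).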
    graph = tf.Graph()
    # Raw side info: one tab-separated row per item, item id in column 0.
    today_item = []
    with open(os.path.join(DATA_PATH, 'side_info_raw')) as f:
        for line in f:
            today_item.append(line.strip().split("\t"))
    # Normalize the free-text season column (column 2).
    season_mapping = get_season_map(today_item)
    for cur in today_item:
        cur[2] = season_mapping.get(cur[2], '无')
    today_item = np.array(today_item)
    today_item_id = set(today_item[:, 0])
    # Items that already have an embedding from a previous run.
    with open(os.path.join(MODEL_PATH, "final_embedding")) as f:
        emb_item = set(x.strip().split(' ')[0] for x in f)
    new_item = today_item_id.difference(emb_item)
    idx = [i for i in range(today_item.shape[0]) if today_item[i, 0] in new_item]
    new_item_info = today_item[idx, :]
    # Replace raw feature values with their vocabulary indices (0 = unknown).
    row, col = new_item_info.shape
    for i in range(row):
        for j in range(1, col):
            new_item_info[i, j] = feature_dict[j].get(new_item_info[i, j], 0)
    feature_size = 9
    with graph.as_default():
        with tf.Session() as sess:
            # Restore the trained model, then pull out the per-feature
            # embedding matrices and the attention weights alpha.
            saver = tf.train.import_meta_graph(os.path.join(MODEL_PATH, 'model.meta'))
            saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
            embedding_list = []
            for i in range(feature_size):
                embedding = graph.get_tensor_by_name("embedding_{}:0".format(i))
                embedding_list.append(embedding)
            alpha = graph.get_tensor_by_name("alpha:0")
            embedding_list, alpha = sess.run([embedding_list, alpha])
            res = {}
            # Average the attention weights over all trained items, then build
            # each cold-start vector as the softmax-style weighted average of
            # its side-info embeddings (EGES-style aggregation); the item's own
            # embedding (index 0) does not exist for cold-start items, so it is
            # omitted from the numerator.
            avg_alpha = np.mean(alpha, axis=0)
            print("cold start item size: {}".format(new_item_info.shape[0]))
            for i in range(new_item_info.shape[0]):
                emb = []
                item_id = new_item_info[i, 0]
                for j in range(1, feature_size):
                    if j < feature_size - 1:
                        ind = int(new_item_info[i, j])
                        if ind >= len(embedding_list[j]):
                            feature_emb = np.zeros(embedding_size)
                        else:
                            feature_emb = embedding_list[j][ind] * np.exp(avg_alpha[j])
                    else:
                        # The last feature is the pretrained title embedding.
                        feature_emb = title_emb.get(item_id, np.zeros(embedding_size)) * np.exp(avg_alpha[j])
                    emb.append(feature_emb)
                emb = np.sum(emb, axis=0) / np.sum(np.exp(avg_alpha), axis=0)
                res[item_id] = emb
            with open(os.path.join(MODEL_PATH, "final_embedding"), "a") as f:
                for k, v in res.items():
                    f.write("{0} {1}\n".format(k, " ".join(str(x) for x in v)))
    # Optional sanity check, kept disabled: cosine similarity of each new item
    # against all stored embeddings (O(new_items * all_items)).
    '''
    final_emb = {}
    with open(os.path.join(MODEL_PATH, "final_embedding")) as f:
        for line in f:
            fields = line.strip().split(" ")
            final_emb[fields[0]] = np.array([float(x) for x in fields[1:]])
    for k in res:
        cand = dict()
        emb1 = res[k]
        norm1 = np.linalg.norm(emb1)
        for k2 in final_emb:
            emb2 = final_emb[k2]
            norm2 = np.linalg.norm(emb2)
            sim = np.dot(emb1, emb2) / (norm1 * norm2)
            cand[k2] = sim
        cand = sorted(cand.items(), key=lambda x: x[1], reverse=True)[:100]
        print(k)
        print(list(map(lambda x: x[0], cand)))
    '''