import os
import math
import time
import numpy as np
import tensorflow as tf
from sklearn.linear_model import LogisticRegression  # used by the commented-out sklearn baseline in __main__
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
LOG_DIR = './ops_logs/mlp_new2/'
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
feature_beg = 3  # default feature start column; the loaders below override this locally
TIMESTEPS = 353
layer_dimension = [1342, 1342, 1]  # widths for _layer(); the first entry is the input width
TRAINING_STEPS = 100000
# learn_rate = 0.01
learn_rate = 0.005
l2_regular = 0.001
split_num = 0.1545
BATCH_SIZE = 64
PRINT_STEPS = TRAINING_STEPS / 1000
# Placeholders for normalization statistics; computed inside get_csv_all.
train_ave = np.array([])
train_std = np.array([])
train_ave_sk = np.array([])
train_std_sk = np.array([])
train_max = np.array([])
train_min = np.array([])
input_size = 1638
n_class = 2
X = tf.placeholder(tf.float32, shape=(None, input_size))
Y_ = tf.placeholder(tf.float32, shape=(None, n_class))
# mlp_new 1342 * 1342 * 1342
# mlp_new1 1342 * 1342 dropout:0.5
# mlp_new2 1342 * 1342 * 1342 dropout:0.5 sigmoid l2
# mlp_new3 1342 * 1342 * 1342 dropout:0.5 sigmoid
# mlp_new3 test
n_hidden_1 = 64
n_hidden_2 = 1500  # only used if the commented-out h2/h3 layers are re-enabled
n_hidden_3 = 1500
weight = {
'h1': tf.Variable(tf.random_normal([input_size, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_class]))
}
bias = {
'h1': tf.Variable(tf.random_normal([n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_2])),
'h3': tf.Variable(tf.random_normal([n_hidden_3])),
'out': tf.Variable(tf.random_normal([n_class]))
}
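# Sanity-check sketch (assumption: only the h1 -> out path is active, which
# matches multiplayer_perceptron below where the h2/h3 layers are commented
# out). Not called anywhere; handy when editing the layer sizes above.
def _check_weight_shapes():
    chain = [('h1', input_size, n_hidden_1), ('out', n_hidden_1, n_class)]
    for name, fan_in, fan_out in chain:
        shape = weight[name].get_shape().as_list()
        assert shape == [fan_in, fan_out], (name, shape)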
def _start_shell(local_ns=None):
# An interactive shell is useful for debugging/development.
import IPython
user_ns = {}
if local_ns:
user_ns.update(local_ns)
user_ns.update(globals())
IPython.start_ipython(argv=[], user_ns=user_ns)
def log_loss(test_y, predict_y):
    # Mean squared difference in log10 space; zero values map to 0 directly
    # so we never take log(0).
    all_loss = 0.0
    for ii in range(len(predict_y)):
        test_res = math.log(test_y[ii], 10) if test_y[ii] else 0.0
        predict_res = math.log(predict_y[ii], 10) if predict_y[ii] else 0.0
        # print math.log(test_y[ii]),math.log(predict_y[ii])
        all_loss += (test_res - predict_res) * (test_res - predict_res)
    return all_loss / float(len(predict_y))
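# Worked example with hypothetical values: log_loss([10.0, 100.0], [10.0, 1000.0])
# compares log10 pairs (1, 1) and (2, 3), giving (0**2 + 1**2) / 2 = 0.5.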
def clf_auc_score(clf, Xtest, ytest, pos_label=1, with_auc=True):
    # Classifier-based variant: scores a fitted sklearn estimator directly.
    # (A probability-based auc_score is defined further down.)
    pytest = clf.predict(Xtest)
    pytestprob = clf.predict_proba(Xtest)
    pytestprob = [k[pos_label] for k in pytestprob]
    fpr, tpr, thresholds = roc_curve(ytest, pytestprob, pos_label=pos_label)
    header = "pos=" + str(pos_label) + "\tprecision\trecall\taccuracy\tf1_score"
    scores = (precision_score(ytest, pytest, pos_label=pos_label),
              recall_score(ytest, pytest, pos_label=pos_label),
              accuracy_score(ytest, pytest),
              f1_score(ytest, pytest, pos_label=pos_label))
    if with_auc:
        header += "\tauc"
        scores = scores + (auc(fpr, tpr), )
    print header
    print scores
    return scores
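# Usage sketch (hypothetical variables, mirroring the commented-out baseline
# in __main__):
#   clf = LogisticRegression().fit(train_x_sk, train_y_sk)
#   clf_auc_score(clf, val_x_sk, val_y_sk, pos_label=1)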
def get_one_hot(all_length):
    # Despite the name, this is not a one-hot vector: it returns a constant
    # padding vector of -1.0s used when a feature block is missing.
    return [-1.0] * all_length
def get_one_hot_orig(all_length):
    # Zero-filled padding vector for a missing feature block.
    return [0.0] * all_length
def get_one_hot_1(all_length):
    # One-filled padding vector for a missing feature block.
    return [1.0] * all_length
def get_label():
    # Builds {apply_id: "<label>,<is_train>"} from the two label files; label
    # is '0' (good) or '1' (bad), is_train is '1' for rows before the split date.
    file = './data/label_1101.csv'
    f = open(file, "r")
    result = {}
flag = -1
while True:
data = f.readline()
flag += 1
if not data:
break
datablock = data.strip('\n').replace('\r',"").split(',')
index_label = datablock[4] if (datablock[4] == '0') else '1'
index_tr_te = "1" if (datablock[2].split(" ")[0] < "2017-04-01") else "0"
result[datablock[3]] = index_label + "," + index_tr_te
f.close()
file = './data/black_label_remove_repeat.csv'
f = open(file, "r")
flag = -1
while True:
data = f.readline()
flag += 1
if not data:
break
if not flag:
continue
datablock = data.strip('\n').replace('\r',"").split(',')
index_label = datablock[4] if (datablock[4] == '0') else '1'
#print datablock
index_tr_te = "1" if (datablock[2].split(" ")[0].split("-")[1] < "08") else "0"
result[datablock[3]] = index_label + "," + index_tr_te
f.close()
return result
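# Assumed layout of the label CSVs, inferred from the indices above (not
# documented in the source): column 2 holds a "YYYY-MM-DD ..." timestamp used
# for the train/test split, column 3 the apply id, and column 4 the raw label
# ('0' = good, anything else collapsed to '1').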
def get_mob6():
    # Builds {apply_id: "<label>,<timestamp>"} from label_1010.csv.
    file = './data/label_1010.csv'
    f = open(file, "r")
    # apply_id = get_applyid()
    result = {}
    flag = -1
    while True:
        data = f.readline()
        if not data:
            break
        flag += 1
        if not flag:
            # skip the header line
            continue
datablock = data.strip('\n').replace('\r',"").split(',')
index_label = datablock[4] if (datablock[4] == '0') else '1'
# if index_label == '1':
# print datablock
if float(index_label) > 1:
print index_label
result[datablock[3]] = index_label + ',' + datablock[2]
f.close()
print 'result',len(result)
return result
def get_data(file):
    # Reads lines of "apply_id,feat1,feat2,..." into {apply_id: [float feats]}.
    f = open(file, "r")
    result = {}
while True:
data = f.readline()
if not data:
break
datablock = data.strip('\n').replace('\r',"").split(',')
index_res = []
# print datablock[1]
for ii in range(1,len(datablock)):
index_res.append(float(datablock[ii]))
result[datablock[0]] = index_res
f.close()
return result
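# Example with hypothetical file contents: a line "a123,0.5,1.0" yields
# result['a123'] == [0.5, 1.0].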
def get_csv_all(train_flag, all_data_file):
    # Loads features for every apply id listed in all_data_file and keeps only
    # rows whose train/test flag (from get_label) equals train_flag.
    mob_6_lable = get_label()
    f = open(all_data_file, "r")
    content = f.readlines()
    f.close()
    print "applyid length:", len(content)
train_x_res = []
train_y = []
test_x_res = []
test_y = []
apply_block = []
train_y_sk = []
train_x_sk = []
wandoujia_tr = get_data('./data/app_tr_wandoujia.txt')
wandoujia_te = get_data('./data/app_te_wandoujia.txt')
wandoujia_res = dict(wandoujia_tr.items() + wandoujia_te.items())
del wandoujia_tr,wandoujia_te
orig_file_tr = get_data('./data/lab_apps_lev2_lev3_tr.csv')
orig_file_te = get_data('./data/lab_apps_lev2_lev3_te.csv')
orig_data_all = dict(orig_file_tr.items() + orig_file_te.items())
del orig_file_tr,orig_file_te
log_info_tr = get_data("./data/feature_sdk_tr.txt")
log_info_te = get_data("./data/feature_sdk_te.txt")
log_info_all = dict(log_info_tr.items() + log_info_te.items())
del log_info_tr,log_info_te
input_tr = get_data("./data/input_feats/input_train_0912.csv")
input_te = get_data("./data/input_feats/input_test_0912.csv")
input_all = dict(input_tr.items() + input_te.items())
del input_tr,input_te
mix_tr = get_data("./data/mix_feats/mix_tr")
mix_te = get_data("./data/mix_feats/mix_te")
mix_all = dict(mix_tr.items() + mix_te.items())
del mix_tr,mix_te
    # Drop the sdk/input/mix feature groups for this run (their
    # index_value.extend calls below are commented out accordingly), so only
    # the 1638 lev2/lev3 features remain, matching input_size.
    log_info_all = {}
    input_all, mix_all = {}, {}
    flag = 0
for data in content:
flag += 1
if not (flag % 1000):
print flag
datablock = data.strip('\n').strip('\r').split(',')
apply_id = datablock[0]
index_value = []
index_value_sk = []
index_flag = -1
if wandoujia_res.has_key(apply_id):
index_input = wandoujia_res[apply_id]
else:
index_input = get_one_hot_orig(248)
#index_value.extend(index_input)
if orig_data_all.has_key(apply_id):
index_orig = orig_data_all[apply_id]
else:
index_orig = get_one_hot_orig(1638)
index_value.extend(index_orig)
if log_info_all.has_key(apply_id):
index_info = log_info_all[apply_id]
else:
index_info = get_one_hot(21)
#index_value.extend(index_info)
if input_all.has_key(apply_id):
index_input = input_all[apply_id]
else:
index_input = get_one_hot(178)
#index_value.extend(index_input)
if mix_all.has_key(apply_id):
index_mix = mix_all[apply_id]
else:
index_mix = get_one_hot_orig(18)
#index_value.extend(index_mix)
#print mob_6_lable[apply_id]
if mob_6_lable.has_key(apply_id):
index_label_content = mob_6_lable[apply_id]
index_flag = float(index_label_content.split(',')[1])
if index_label_content.split(',')[0] == '0':
index_label_class = [1,0]
# index_label_class = [0]
else:
index_label_class = [0,1]
if (index_flag != train_flag):
continue
train_x_res.append(index_value)
train_y.append(index_label_class)
train_x_sk.append(index_value)
train_y_sk.append(float(index_label_content.split(',')[0]))
apply_block.append(apply_id)
print "trainx: ",len(train_x_res)
    # print np.mean(train_x_res,axis=0)
    # Min-max scale the NN features and z-score the sklearn features, using
    # statistics from whichever split this call loaded.
    train_min = np.min(train_x_res, axis=0)
    train_max = np.max(train_x_res, axis=0)
    train_ave_sk = np.mean(train_x_sk, axis=0)
    train_std_sk = np.std(train_x_sk, axis=0)
    all_mul = train_x_res - train_min
    train_max_gap = train_max - train_min
    for ii in range(len(train_max_gap)):
        if not train_max_gap[ii]:
            # Constant column: zero it out and avoid division by zero.
            all_mul[:, ii] = 0.0
            train_max_gap[ii] = 1.0
    normalized_train_data = all_mul / train_max_gap
    all_mul = train_x_sk - train_ave_sk
    all_std = train_std_sk
    for ii in range(len(all_std)):
        if not all_std[ii]:
            all_mul[:, ii] = 0.0
            all_std[ii] = 1.0
    normalized_train_data_sk = all_mul / all_std
return np.array(normalized_train_data, dtype=np.float32),np.array(train_y, dtype=np.float32),normalized_train_data_sk,train_y_sk,apply_block
#return np.array(train_x_res, dtype=np.float32),np.array(train_y, dtype=np.float32),train_x_sk,train_y_sk,apply_block
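# Reference sketch (not used above): sklearn's MinMaxScaler performs the same
# column-wise (x - min) / (max - min) scaling and also maps constant columns
# to 0 by substituting a unit scale for zero ranges.
def _minmax_reference(train_x):
    from sklearn.preprocessing import MinMaxScaler
    return MinMaxScaler().fit_transform(train_x)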
def get_csv(file):
    # Legacy loader (not called from __main__). NOTE: get_input() is not
    # defined or imported in this file; it is assumed to exist elsewhere.
    f = open(file, "r")
    train_x_res = []
    train_x_id = []
    train_x_sk = []
    train_y = []
    input_res = get_input()
    flag = 0
    feature_beg = 1
    while True:
        data = f.readline()
        if not data:
            break
        flag += 1
        if flag == 1:
            # skip the header line
            continue
        if not (flag % 100):
            print flag
datablock = data.strip('\n').split(',')
index_value = []
index_value_sk = []
index_flag = -1
for ii in datablock[feature_beg:len(datablock)]:
index_flag += 1
# if (index_flag == 3) or (index_flag == 2):
# continue
index_value.append(float(ii))
if input_res.has_key(datablock[0]):
index_input = input_res[datablock[0]]
else:
            # 24 missing-value slots; flat floats so np.array stays rectangular
            # (the original nested one-element lists would produce a ragged array).
            index_input = [-1.0] * 24
index_value.extend(index_input)
index_label_class = [1,0]
if datablock[1] != '1':
# print 'not 000'
index_label_class = [0,1]
train_y.append(index_label_class)
train_x_res.append(index_value)
f.close()
return np.array(train_x_res, dtype=np.float32),np.array(train_y, dtype=np.float32)
def auc_score(pytestprob, ytest, pos_label=1, with_auc=True):
    # Probability-based variant: scores precomputed positive-class
    # probabilities against binary labels.
    # pytestprob = [k[pos_label] for k in pytestprob]
    # ytest = [k[pos_label] for k in ytest]
    fpr, tpr, thresholds = roc_curve(ytest, pytestprob, pos_label=pos_label)
    scores = None
    if with_auc:
        header = "\tauc"
        scores = auc(fpr, tpr)
        print header
        print scores
    return scores
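# Worked example with hypothetical values: auc_score([0.9, 0.2, 0.7], [1, 0, 1], 1)
# ranks both positives above the negative, so it prints an AUC of 1.0.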
def get_weight(shape, lambda1):
    # Creates a weight variable and registers its L2 penalty in the shared
    # 'losses' collection (summed into the objective in train_model).
    var = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(lambda1)(var))
return var
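# How the 'losses' collection is consumed (mirrors train_model/train_model2
# below): the data loss is appended with tf.add_to_collection('losses', ...)
# and the objective becomes loss = tf.add_n(tf.get_collection('losses')),
# i.e. data loss plus every registered L2 penalty.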
def _layer():
    # Fully connected stack sized by layer_dimension; layer_dimension[0] is
    # the input width and must equal input_size for the matmul against X.
    n_layers = len(layer_dimension)
    cur_layer = X
    in_dimension = layer_dimension[0]
    for i in range(1, n_layers):
        out_dimension = layer_dimension[i]
        weight = get_weight([in_dimension, out_dimension], 0.0003)
        bias = tf.Variable(tf.constant(0.1, shape=[out_dimension]))
        if i == n_layers - 1:
            # output layer: linear logits, no activation
            cur_layer = tf.matmul(cur_layer, weight) + bias
        else:
            cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias)
        in_dimension = layer_dimension[i]
    return cur_layer
def multiplayer_perceptron(x, weight, bias, dropout):
    # Single hidden layer (h1 -> out) with input and hidden dropout; the h2/h3
    # blocks below are kept commented out from earlier experiments.
    layer0 = tf.nn.dropout(x, dropout)
layer1 = tf.add(tf.matmul(layer0, weight['h1']), bias['h1'])
layer1 = tf.nn.relu(layer1)
layer1 = tf.nn.dropout(layer1, dropout)
# 0.005
tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(weight['h1']))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(bias['h1']))
#layer2 = tf.add(tf.matmul(layer1, weight['h2']), bias['h2'])
#layer2 = tf.nn.sigmoid(layer2)
#layer2 = tf.nn.dropout(layer2, dropout)
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(weight['h2']))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(bias['h2']))
#layer3 = tf.add(tf.matmul(layer2, weight['h3']), bias['h3'])
#layer3 = tf.nn.sigmoid(layer3)
#layer3 = tf.nn.dropout(layer3, dropout)
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(weight['h3']))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(bias['h3']))
out_layer = tf.add(tf.matmul(layer1, weight['out']), bias['out'])
tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(weight['out']))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(l2_regular)(bias['out']))
return out_layer
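# Usage sketch (mirrors train_model2 and prediction below): at train time the
# raw logits feed tf.nn.sigmoid_cross_entropy_with_logits directly; at
# inference they are squashed explicitly with dropout keep-prob 1.0, e.g.
#   probs = tf.nn.sigmoid(multiplayer_perceptron(X, weight, bias, 1.0))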
def train_model(train_x, train_y, batch_size=36):
    sample_size = len(train_x)
    # y = cur_layer
    hidd = _layer()
    y = tf.nn.softmax(hidd)
    # Cross-entropy data loss; clip to keep tf.log away from zero.
    mse_loss = tf.reduce_mean(-tf.reduce_sum(Y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), reduction_indices=[1]))
    # mse_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, Y_))
    # mse_loss = tf.reduce_sum(tf.pow(Y_ - y, 2)) / sample_size
    tf.add_to_collection('losses', mse_loss)
    loss = tf.add_n(tf.get_collection('losses'))
    train_op = tf.train.AdamOptimizer(learn_rate).minimize(loss)
saver=tf.train.Saver(tf.global_variables())
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(TRAINING_STEPS):
step=0
start=0
end=start+batch_size
while(end<len(train_x)):
_,loss_=sess.run([train_op,loss],feed_dict={X:train_x[start:end],Y_:train_y[start:end]})
start+=batch_size
end=start+batch_size
step+=1
if i%10==0:
print("train_step,loss: ",i,loss_,time.strftime("%I:%M:%S"))
print("save model: ",saver.save(sess,LOG_DIR + 'stock.model'))
def train_model2(train_x,train_y,batch_size,test_x1,test_y1):
sample_size = len(train_x)
y = multiplayer_perceptron(X, weight, bias, 0.5)
#y1 = tf.nn.softmax(y)
y1 = tf.nn.sigmoid(y)
mse_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y,labels=Y_))
    #mse_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.clip_by_value(y,1e-10,1.0),Y_))
    # For a hand-rolled cross-entropy, zero predictions must be clipped with
    # tf.clip_by_value(y, 1e-10, 1.0) to avoid log(0):
    # cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'))
    # learning-rate decay (currently disabled)
    global_step = tf.Variable(0)
    #learning_rate = tf.train.exponential_decay(0.1, global_step, 100, 0.8, staircase=True)
    train_op = tf.train.GradientDescentOptimizer(learn_rate).minimize(loss)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
#saver.restore(sess, LOG_DIR + 'stock.model')
total_batch_num = int((sample_size / float(batch_size)) + 1)
print total_batch_num
for i in range(TRAINING_STEPS):
avg_cost = 0.0
step=0
start=0
end=start+batch_size
while(end<len(train_x)):
_,loss_=sess.run([train_op,loss],feed_dict={X:train_x[start:end],Y_:train_y[start:end]})
start+=batch_size
end=start+batch_size
step+=1
avg_cost += loss_ / float(total_batch_num)
# if (step % 100):
# print("train_step,step,loss: ",i,step,loss_,time.strftime("%I:%M:%S"))
if (i%100==0):
print("train_step,loss: ",i,avg_cost,loss_,time.strftime("%I:%M:%S"))
print("save model: ",saver.save(sess,LOG_DIR + 'stock.model'))
loss_,pred_=sess.run([loss,y1],feed_dict={X:test_x1[0:1000],Y_:test_y1[0:1000]})
print loss_,pred_
prediction(LOG_DIR,test_x1,test_y1)
def prediction(model_file,val_x,val_y,apply_id=[],write_file=""):
# X = tf.placeholder(tf.float32, shape=(None, len(val_x[0])))
# Y_ = tf.placeholder(tf.float32, shape=(None, 1))
predict=[]
y = tf.nn.sigmoid(multiplayer_perceptron(X, weight, bias,1.0))
#y = tf.nn.softmax(multiplayer_perceptron(X, weight, bias,1.0))
saver=tf.train.Saver()
content = ""
with tf.Session() as sess:
module_file = tf.train.latest_checkpoint(model_file)
saver.restore(sess, module_file)
sess.run([weight,bias])
print "h1:",sess.run(weight['h1'])
print "h2:",sess.run(weight['h2'])
print "h3:",sess.run(weight['h3'])
print "out:",sess.run(weight['out'])
print "h1:",sess.run(bias['h1'])
print "h2:",sess.run(bias['h2'])
print "h3:",sess.run(bias['h3'])
print "out:",sess.run(bias['out'])
train_x_len = len(val_x)
for i in range(train_x_len):
next_seq=sess.run(y,feed_dict={X:val_x[i:i+1]})
predict.append(next_seq[0][1])
if write_file != "":
content += apply_id[i] + ',' + str(next_seq[0][1]) + '\n'
# print next_seq
# all_prob = sess.run(y,feed_dict={X:val_x[0:len(val_x)]})
y_val = [k[1] for k in val_y]
auc_score(predict,y_val,1)
if write_file != "":
fp = open('./data/' + write_file,'w')
fp.write(content)
fp.close()
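# Sketch for consuming the files written above (hypothetical helper, not
# called anywhere): each line is "<apply_id>,<probability>".
def read_scores(path):
    scores = {}
    f = open(path, 'r')
    for line in f:
        parts = line.strip('\n').split(',')
        if len(parts) == 2:
            scores[parts[0]] = float(parts[1])
    f.close()
    return scores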
if __name__=='__main__':
print time.strftime("%I:%M:%S")
index_x,index_y,train_x_sk,train_y_sk,train_apply = get_csv_all(1,'./data/applyid.csv')
test_x1,test_y1,val_x_sk,val_y_sk,val_apply = get_csv_all(0,'./data/applyid.csv')
print len(index_x),len(test_x1)
#print len(train_x_sk),len(val_x_sk)
#clf = LogisticRegression(C=0.05,class_weight={0: 0.01, 1: 0.99})
#clf.fit(train_x_sk, train_y_sk)
#y_prob = clf.predict_proba(train_x_sk)
#y_prob = [k[1] for k in y_prob]
#auc_score(y_prob,train_y_sk,1)
#y_prob = clf.predict_proba(val_x_sk)
#y_prob = [k[1] for k in y_prob]
#auc_score(y_prob,val_y_sk,1)
train_model2(index_x,index_y,BATCH_SIZE,test_x1,test_y1)
prediction(LOG_DIR,index_x,index_y,train_apply,"train_mlp_res.txt")
prediction(LOG_DIR,test_x1,test_y1,val_apply,"val_mlp_res.txt")