tree+lr


main_tkz.py
import pandas as pd
from tree_tkz import tree_tkz
from logistic_regression_tkz import logistic_regression_tkz
from logistic_regression_test import logistic_regression_test
import numpy as np

def _load_balanced_sample(path, columns):
    """Load the raw table at *path* and return a positive-enriched sample.

    The file is read without a header row, missing values are replaced by 0,
    *columns* is assigned as the column names, and ``deal_type`` is binarised
    (values > 0 become 1, everything else 0).

    The returned frame is every row whose ``deal_type`` is 1, concatenated
    with a 1% sample of the whole table drawn with replacement.
    """
    frame = pd.read_table(path, header=None)
    frame.fillna(0, inplace=True)

    frame.columns = columns
    frame['deal_type'] = np.where(frame['deal_type'] > 0, 1, 0)

    positives = frame[frame['deal_type'] == 1]
    return pd.concat([positives, frame.sample(frac=0.01, replace=True)])


if __name__ == '__main__':
    # TODO: both lists are empty placeholders here and must be filled in
    # before the script can run. `data_columns` names every column of the
    # raw file; `feature_columns` is the subset used for modelling, and
    # tree_tkz treats its LAST entry as the label column.
    data_columns = []
    feature_columns = []

    # Training stage: fit the tree (which also writes the cut points used for
    # binning), then fit and persist the logistic regression.
    data_train = _load_balanced_sample('../data/dz', data_columns)[feature_columns]
    tree_tkz(data_train, feature_columns)
    logistic_regression_tkz(data_train)

    # Evaluation stage.
    # NOTE(review): this re-samples the same '../data/dz' file used for
    # training, so it is not a held-out set - confirm this is intended.
    data_test = _load_balanced_sample('../data/dz', data_columns)[feature_columns]
    logistic_regression_test(data_test)

tree_tkz.py
# -*- coding: utf-8 -*-

import pandas as pd
from sklearn import tree
import graphviz
import numpy as np
from re_tkz import re_tkz

#if __name__ == '__main__':
def tree_tkz(data, data_columns):
    """Fit a decision tree on *data*, report importances and export the tree.

    Args:
        data: DataFrame holding the feature columns plus the label column.
        data_columns: sequence of column names whose LAST entry names the
            label column; every other column of *data* is used as a feature.

    Raises:
        ValueError: if *data_columns* is empty, so no label column is named.

    Side effects:
        Prints the feature -> importance mapping, renders the tree through
        graphviz to "../data/tree", then calls ``re_tkz()`` to extract the
        split thresholds from that output.
    """
    if len(data_columns) == 0:
        raise ValueError('data_columns must name at least the label column')

    label_column = data_columns[-1]
    features = data.drop(labels=label_column, axis=1)
    # Take the names from the frame that is actually fitted, so importances
    # and graph labels line up with the classifier's feature order even when
    # `data` is ordered differently from `data_columns`.
    feature_names = list(features.columns)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, data[label_column])

    feature_importances_dict = dict(zip(feature_names, clf.feature_importances_))

    # Single-argument print: same output under Python 2 and Python 3.
    print('feature_importances_dict: %s' % (feature_importances_dict,))

    # NOTE(review): class_names are '1'/'2' although the caller in this
    # project binarises the label to 0/1 - confirm the intended labelling.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature_names,
                                    class_names=np.array(['1', '2']),
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render("../data/tree")  # pass view=True to open the rendered PDF

    re_tkz()
re_tkz.py
# -*- coding: utf-8 -*-

import re
import json

#if __name__ == '__main__':
def re_tkz(tree_path='../data/tree', cut_path='../data/cut_data.json'):
    """Extract per-feature split thresholds from an exported tree file.

    Each line of *tree_path* is searched for a node label of the form
    ``<NAME &le; VALUE<br/>`` (e.g. ``<x1 &le; 1.5<br/>``). Only the first
    match on a line is used. For every feature name the distinct thresholds
    are sorted ascending and wrapped in the sentinels -9999999 and 9999999,
    and the resulting ``{name: [edges...]}`` mapping is written to *cut_path*
    as JSON.

    Args:
        tree_path: path of the text file to scan.
        cut_path: path of the JSON file to write.
    """
    split_pattern = re.compile(r'<(.+?) &le; (.+?)<br/>')

    # name -> set of thresholds; the set de-duplicates in O(1) per value.
    thresholds = {}
    with open(tree_path, 'r') as f:
        for data_line in f:
            find_list = split_pattern.findall(data_line)
            if not find_list:
                continue
            name, value = find_list[0]
            thresholds.setdefault(name, set()).add(float(value))

    result = {}
    for name, values in thresholds.items():
        # Outer sentinels give the bins an open-ended first and last interval.
        result[name] = [-9999999] + sorted(values) + [9999999]

    with open(cut_path, 'w') as f:
        json.dump(result, f)
logistic_regression_tkz.py
# -*- coding: utf-8 -*-

from sklearn.linear_model import LogisticRegression
from cut_data import cut_data
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV


#if __name__ == '__main__':
def logistic_regression_tkz(data):
    """Train a logistic regression on the binned features and persist it.

    *data* is passed through ``cut_data`` to obtain the design matrix and the
    label vector. The fitted model is dumped to '../data/lr.model', and its
    coefficients, intercept and training-set AUC are printed.
    """
    x, y = cut_data(data)

    lr = LogisticRegression()

    # A GridSearchCV over penalty ('l1', 'l2') and C (1.0, 1.1, 1.2) was tried
    # here previously and is disabled; the default estimator is fitted.
    lr.fit(x, y)
    joblib.dump(lr, '../data/lr.model')

    # Single-argument prints: same output under Python 2 and Python 3.
    print('权重: %s' % (lr.coef_,))  # weights
    print('截距: %s' % (lr.intercept_,))  # intercept

    # AUC needs a continuous score; hard 0/1 predictions collapse the ROC
    # curve to a single operating point. Column 1 of predict_proba is the
    # probability of the greater class label.
    y_score = lr.predict_proba(x)[:, 1]
    print('train_auc: %s' % (roc_auc_score(list(y), list(y_score)),))
cut_data.py
# -*- coding: utf-8 -*-

import pandas as pd
import json

def cut_data(data, cut_path='../data/cut_data.json'):
    """Bin the columns listed in the cut-point file and one-hot encode them.

    Args:
        data: DataFrame containing a 'deal_type' label column and every
            column named as a key in the cut-point file.
        cut_path: JSON file mapping column name -> ascending list of bin
            edges (as written by ``re_tkz``).

    Returns:
        ``(train_x, train_y)``: the dummy-encoded feature frame (columns that
        have no cut points are passed through unchanged) and the 'deal_type'
        series.

    The caller's frame is left untouched; binning is applied to a copy.
    """
    with open(cut_path, 'r') as f:
        cut_dict = json.load(f)

    # Work on a copy so the caller's numeric columns are not overwritten
    # with categorical bins.
    binned = data.copy()
    for column, edges in cut_dict.items():
        binned[column] = pd.cut(binned[column], edges)

    train_x = pd.get_dummies(binned.drop(labels='deal_type', axis=1))
    train_y = binned['deal_type']
    return train_x, train_y
logistic_regression_test.py
# -*- coding: utf-8 -*-

from cut_data import cut_data
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib

#if __name__ == '__main__':
def logistic_regression_test(data):
    """Score *data* with the persisted logistic regression and print its AUC.

    *data* is passed through ``cut_data`` to obtain the design matrix and the
    label vector; the model is loaded from '../data/lr.model'. The shape of
    the design matrix and the resulting AUC are printed.
    """
    x, y = cut_data(data)
    print(x.shape)

    # NOTE: joblib.load unpickles the file - only load model files you trust.
    lr = joblib.load('../data/lr.model')

    # AUC needs a continuous score; hard 0/1 predictions collapse the ROC
    # curve to a single operating point. Column 1 of predict_proba is the
    # probability of the greater class label.
    y_score = lr.predict_proba(x)[:, 1]
    # Single-argument print: same output under Python 2 and Python 3.
    print('test_auc: %s' % (roc_auc_score(list(y), list(y_score)),))

转载于:https://www.cnblogs.com/kayy/p/9963491.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值