1. Directory structure
data/
-- raw_data.txt              raw data downloaded from the Hive table
-- tr.txt                    randomly split training set
-- te.txt                    randomly split test set
-- tr_bins.txt               binned-feature file for the training set
-- te_bins.txt               binned-feature file for the test set
-- tr_with_rule.txt          training set with extracted rule ids
-- te_with_rule.txt          test set with extracted rule ids
models/
-- rule_max_leaves_8.txt     rule file for a tree with 8 leaf nodes
-- tr_bins.pkl               training-set binning dictionary
-- model.m                   model file
-- model.dot                 decision-tree graph (dot) file
reports/
-- tr_bins.xlsx              analysis report of the training-set binning
src/
  data/
  -- get_data.sql            SQL for downloading the Hive table
  -- split_dataset_random.py random train/test split script
  model/
  -- tr.conf                 training-set config file
  -- generate_dt_rule.py     main feature extraction and rule generation script
  -- eval_rule.py            rule evaluation script
  -- train_f2b.py            dataset binning conversion script
  utils/                     third-party helper package
  -- config.py
  -- eval.py
  -- feature_to_bins.py
  -- __init__.py
    __pycache__/
    -- config.cpython-38.pyc
    -- feature_to_bins.cpython-38.pyc
    -- __init__.cpython-38.pyc
2. Data acquisition
2.1 Download the dataset from the Hive table
get_data.sql
set hive.cli.print.header=true;
set hive.resultset.use.unique.column.names=false;
select *
from zz.table_name
where pt='2021-07-01'
;
Run
hive -f get_data.sql >../../data/raw_data.txt
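Since hive.cli.print.header=true writes a header row that the pandas steps below depend on, you can optionally sanity-check the download first (a small sketch, not part of the original pipeline):

import pandas as pd

# peek at the first rows to confirm the header row and the tab separator
print(pd.read_csv('../../data/raw_data.txt', sep='\t', nrows=3).columns.tolist())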
2.2 Randomly split the training and test sets
Set the random seed random_state=5 and a 70% training-set ratio.
split_dataset_random.py
import pandas as pd

if __name__ == '__main__':
    data = pd.read_csv('../../data/raw_data.txt', sep='\t')
    tr = data.sample(frac=.7, replace=False, random_state=5, axis=0)
    te = data[~data.index.isin(tr.index)]
    tr.to_csv('../../data/tr.txt', sep='\t', index=False)
    te.to_csv('../../data/te.txt', sep='\t', index=False)
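An optional check (a sketch; assumes the two files were just written) to confirm the split comes out at roughly 70/30:

import pandas as pd

tr = pd.read_csv('../../data/tr.txt', sep='\t')
te = pd.read_csv('../../data/te.txt', sep='\t')
print(len(tr), len(te), round(len(tr) / (len(tr) + len(te)), 3))  # expect ~0.7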
3. Feature binning
3.1 Training-set feature binning
3.1.1 Training-set binning parameters: tr.conf
criterion=GINI
# criterion=MEAN_SQUARED_ERROR
data_file=../../data/tr.txt
# label column
label=is_later_30d_loss
# features=push_1d_cnt,alert_push_1d_cnt,chat_user_1d_cnt
# features to ignore (the label must be ignored too)
ignore_features=uid,2d_retention,3d_retention,is_later_30d_loss,pt
# categorical features
categoricals=chat_prefer,chat_time_prefer,frd_prefer,is_upload_contact,life_cycle
# check whether the categorical values conflict with the fill value 0
# fill_cate_na_value=NULL
fill_cate_na_value=0
# fill missing numeric values with 0 (or with -1)
fill_numeric_na_value=0
# the more bins, the easier it is to overfit
max_bins=10
# max_bins=256
min_leaf=0.005
verbose=False
dump_file=../../reports/tr_bins.xlsx
dict_file=../../models/tr_bins.pkl
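utils.config.Config is an internal helper, but to make the config format concrete, here is a minimal sketch of a loader for the key=value format above (which keys are comma-separated lists is an assumption):

# minimal sketch; the real utils.config.Config may differ
class Config:
    LIST_KEYS = {'features', 'ignore_features', 'categoricals'}

    def load_config_file(self, path):
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue  # skip blank lines and comments
                key, _, value = line.partition('=')
                key, value = key.strip(), value.strip()
                # list-valued keys are split on commas (assumed)
                setattr(self, key, value.split(',') if key in self.LIST_KEYS else value)
        return self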
3.1.2 Training-set binning script
train_f2b.py
import pandas as pd
from utils.feature_to_bins import Bins
from utils.config import Config
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-c', '--config', default='tr.conf')
parser.add_argument('-t', '--type', default='train')
parser.add_argument('-o', '--out', default='../../data/tr_bins.txt')

def load_data(config):
    data_ = pd.read_csv(config.data_file, sep='\t')
    label_na_counts = data_[config.label].isnull().sum()
    if label_na_counts > 0:
        print('WARNING: Label column has %d NA values. They will be dropped.' % label_na_counts)
        data_.dropna(subset=[config.label], inplace=True)
    num_data, num_features = data_.shape
    num_categoricals = len(config.categoricals)
    cover_rate_ = data_.apply(lambda x: 1 - pd.isnull(x).sum() / len(x))
    count_ = data_.apply(lambda x: len(x) - pd.isnull(x).sum())
    print('[LOAD RAW DATA] samples = %d, features = %d, categoricals = %d' % (num_data, num_features, num_categoricals))
    return data_, cover_rate_, count_

if __name__ == "__main__":
    args = parser.parse_args()
    conf = Config().load_config_file(args.config)
    data, cover_rate_fun, count_fun = load_data(conf)
    b = Bins(config=conf, data=data)
    # fit: bin the training set
    if args.type == 'train':
        bins = b.data_to_bins()
        # compute feature coverage statistics and write them into the report
        bins['count'] = bins['feature_name'].map(count_fun)
        bins['cover_rate'] = bins['feature_name'].map(cover_rate_fun)
        bins.to_excel(conf.dump_file, index=False)
        print('----------- report file written -----------')
        out_data = b.bins_to_data(append=True)
        print(out_data.iloc[:, -1].max())
        print('[CART BINNING] file dumped, filename = %s' % conf.dump_file)
        print('[CART BINNING] finished')
        # write out the binned dataset
        out_data.to_csv(args.out, sep='\t', index=False)
    # transform: convert the test set
    else:
        feature_list = data.columns.tolist()
        drop_cols = ['uid', '2d_retention', '3d_retention', 'is_later_30d_loss', 'pt']
        for item in drop_cols:
            feature_list.remove(item)
        data_raw_new = data[feature_list]
        data_bins = b.data_transform(data)
        # concatenate the raw features with the binned features and label
        out = pd.concat([data_raw_new, data_bins], axis=1)
        out.to_csv('../../data/te_bins.txt', sep='\t', index=False)
Run
/data/zz/anaconda3/bin/python train_f2b.py
Output
(1) Analysis report file tr_bins.xlsx
For a continuous feature, the bins are ordered over the whole real line and the bin indices start at 0. A bins rule such as "< 1.5" in the report does not mean the raw value is below 1.5; it means the two bins with indices 0 and 1 are selected.
The same applies to categorical features. For example, "chat_prefer_bin > 1.5" selects the bin with index 2 ([-1.0, 5.0]).
When mapping a _bin feature back to its raw feature, be careful with categorical features: the data must be re-encoded manually according to the binning. For "chat_prefer_bin > 1.5", the raw feature chat_prefer has to be encoded by hand: values 1, 2, 3, 4 become 0, while nulls and 5 become 1. When producing the final rules, map the encoding back yourself.
Note: if the raw values on both sides of a bin split point are contiguous, the rule can be applied directly to the raw feature.
For example, life_cycle_bin <= 2.5 means bin index <= 2. The left side of the split contains the bins [6], [4], [3,5] and the right side contains [2], [1]; the values on each side are contiguous.
On the raw feature this corresponds to life_cycle > 2.5, i.e. values [3,4,5,6], which matches the left side above. So in most cases you can extract rules directly on the raw feature that corresponds to the bin feature.
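As a concrete illustration of the manual re-encoding described above, a minimal sketch for the chat_prefer example (the helper name is ours; the value mapping follows the report):

import pandas as pd

def encode_chat_prefer(s):
    # per the tr_bins.xlsx example: values 1, 2, 3, 4 -> 0; nulls and 5 -> 1
    return s.apply(lambda v: 1 if pd.isnull(v) or v == 5 else 0)

df = pd.DataFrame({'chat_prefer': [1, 2, None, 5, 3]})
df['chat_prefer_enc'] = encode_chat_prefer(df['chat_prefer'])
print(df)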
(2) Dictionary file tr_bins.pkl
Used later when binning the test set.
(3) Binned data file tr_bins.txt
This file contains both the raw features and the binned features. For example, for the "close friend count" feature it contains both shu_frd_cnt and shu_frd_cnt_bin.
3.2 Test-set feature binning
3.2.1 Test-set binning parameters: te.conf
criterion=GINI
# criterion=MEAN_SQUARED_ERROR
data_file=../../data/te.txt
# label column
label=is_later_30d_loss
# features to ignore (the label must be ignored too)
ignore_features=uid,2d_retention,3d_retention,is_later_30d_loss,pt
# categorical features
categoricals=chat_prefer,chat_time_prefer,frd_prefer,is_upload_contact,life_cycle
# fill missing values with 0 (or with -1)
fill_cate_na_value=0
fill_numeric_na_value=0
# the more bins, the easier it is to overfit
max_bins=10
# max_bins=256
min_leaf=0.005
verbose=False
dump_file=../../reports/te_bins.xlsx
dict_file=../../models/te_bins.pkl
3.2.2 Test-set binning script
te_f2b.py
import pandas as pd
from utils.feature_to_bins import Bins
from utils.config import Config
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('-c', '--config', default='te.conf')
parser.add_argument('-o', '--out', default='../../data/te_bins.txt')

if __name__ == "__main__":
    args = parser.parse_args()
    conf = Config().load_config_file(args.config)
    data = pd.read_csv(conf.data_file, sep='\t')
    num_data, num_features = data.shape
    num_categoricals = len(conf.categoricals)
    print('[LOAD RAW DATA] samples = %d, features = %d, categoricals = %d' % (num_data, num_features, num_categoricals))
    b = Bins(config=conf, data=data)
    # transform: bin the test set
    data_bins = b.data_transform(data)
    # transform: keep the raw test-set features
    feature_list = data.columns.tolist()
    drop_cols = ['uid', '2d_retention', '3d_retention', 'is_later_30d_loss', 'pt']
    for item in drop_cols:
        feature_list.remove(item)
    data_raw_new = data[feature_list]
    # concatenate the raw features with the binned features and label
    out = pd.concat([data_raw_new, data_bins], axis=1)
    # write out the binned dataset
    out.to_csv(args.out, sep='\t', index=False)
Run
/data/zz/anaconda3/bin/python te_f2b.py
Note: this step depends on the training set's binning dictionary file (tr_bins.pkl)!
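To make that dependency concrete, here is a hedged sketch of what the transform does with the stored dictionary. The actual layout of tr_bins.pkl inside utils.feature_to_bins is internal, so the {feature: cut points} structure below is an assumption:

import pickle
import pandas as pd

with open('../../models/tr_bins.pkl', 'rb') as f:
    bins_dict = pickle.load(f)  # assumed layout: {feature: sorted cut points}

def transform_numeric(df, bins_dict):
    out = pd.DataFrame(index=df.index)
    for feat, cuts in bins_dict.items():
        # bin index = how many training-set cut points the value exceeds
        edges = [-float('inf')] + list(cuts) + [float('inf')]
        out[feat + '_bin'] = pd.cut(df[feat].fillna(0), bins=edges, labels=False)
    return out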
4. Rule extraction
4.1 Extract rules using all _bin features
generate_dt_rule_bins.py
# coding:utf-8
from six import StringIO
import pydotplus
import joblib
from sklearn import tree
from sklearn.tree import _tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
import os
import sys

node_id = 0
# maximum number of leaf nodes; tune by hand, typically 8, 16, 32, ...
max_leaf_nodes = 8
data_path = '../../data/tr_bins.txt'
drop_cols = ['uid', '2d_retention', '3d_retention', 'is_later_30d_loss', 'pt']
label = 'is_later_30d_loss'

# draw the decision tree
def draw_tree(model, tree_dot_file_name, feature_list):
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data, feature_names=feature_list,
                         rounded=True, filled=True, proportion=True, precision=4)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_dot(tree_dot_file_name)

# print the decision-tree rules and append them to the rule file
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print('feature_name:', feature_name)
    print("def tree({}):".format(", ".join(feature_names)))
    with open('../../models/rule_max_leaves_' + str(max_leaf_nodes) + '.txt', 'a') as fw:
        fw.write("def tree({}):".format(", ".join(feature_names)) + '\n')

        def recurse(node, depth):
            global node_id
            indent = "    " * depth  # four spaces per tree depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print("{}if {} <= {}:".format(indent, name, threshold))
                fw.write("{}if {} <= {}:".format(indent, name, threshold) + '\n')
                recurse(tree_.children_left[node], depth + 1)
                print("{}else:  # if {} > {}".format(indent, name, threshold))
                fw.write("{}else:  # if {} > {}".format(indent, name, threshold) + '\n')
                recurse(tree_.children_right[node], depth + 1)
            else:
                # leaf node: emit the next rule id
                print("{}return {}".format(indent, node_id))
                fw.write("{}return {}".format(indent, node_id) + '\n')
                node_id += 1

        recurse(0, 1)

if __name__ == '__main__':
    dt = None
    data = pd.read_csv(data_path, sep='\t')
    feature_names = data.columns.tolist()
    for item in drop_cols:
        feature_names.remove(item)
    print(feature_names)
    # keep only the _bin features
    feature_names = [item for item in feature_names if '_bin' in item]
    model_file_path = sys.argv[1]
    if os.path.exists(model_file_path):
        dt = joblib.load(model_file_path)
    else:
        X = data.loc[:, feature_names]
        # fill missing values with 0
        for col in feature_names:
            X[col] = X[col].fillna(0)
        y = data.loc[:, label]
        dt = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes).fit(X, y)
        print('auc = %.4f' % roc_auc_score(y_true=y, y_score=dt.predict_proba(X)[:, 1]))
        # joblib.dump(dt, model_file_path)
    draw_tree(dt, model_file_path[:-2] + '.dot', feature_names)
    tree_to_code(dt, feature_names)
Run
/data/zz/anaconda3/bin/python generate_dt_rule_bins.py ../../models/model.m

rule_max_leaves_8.txt
    if active_days_30d_bin <= 3.5:
        if active_days_30d_bin <= 0.5:
            if active_frd_prefer_bin <= 2.5:
                return 0
            else:  # if active_frd_prefer_bin > 2.5
                return 1
        else:  # if active_days_30d_bin > 0.5
            if chat_frd_prefer_bin <= 1.5:
                return 2
            else:  # if chat_frd_prefer_bin > 1.5
                if active_days_30d_bin <= 1.5:
                    return 3
                else:  # if active_days_30d_bin > 1.5
                    return 4
    else:  # if active_days_30d_bin > 3.5
        if active_days_30d_bin <= 5.5:
            return 5
        else:  # if active_days_30d_bin > 5.5
            if active_days_30d_bin <= 7.5:
                return 6
            else:  # if active_days_30d_bin > 7.5
                return 7
Important features obtained from the rules above:
active_days_30d_bin
active_frd_prefer_bin
chat_frd_prefer_bin
Corresponding raw features:
active_days_30d
active_frd_prefer
chat_frd_prefer
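The same shortlist can also be read off the fitted tree directly instead of scanning the printed rules; a small sketch reusing dt and feature_names from generate_dt_rule_bins.py:

import numpy as np

# rank features by the tree's impurity-based importances
order = np.argsort(dt.feature_importances_)[::-1]
for i in order:
    if dt.feature_importances_[i] > 0:
        print(feature_names[i], round(float(dt.feature_importances_[i]), 4))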
4.2 Extract rules using the important raw features
generate_dt_rule.py
# coding:utf-8
from six import StringIO
import pydotplus
import joblib
from sklearn import tree
from sklearn.tree import _tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
import os
import sys

node_id = 0
# maximum number of leaf nodes; tune by hand, typically 8, 16, 32, ...
max_leaf_nodes = 8
data_path = '../../data/tr.txt'
drop_cols = ['uid', '2d_retention', '3d_retention', 'is_later_30d_loss', 'pt']
label = 'is_later_30d_loss'

# draw the decision tree
def draw_tree(model, tree_dot_file_name, feature_list):
    dot_data = StringIO()
    tree.export_graphviz(model, out_file=dot_data, feature_names=feature_list,
                         rounded=True, filled=True, proportion=True, precision=4)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_dot(tree_dot_file_name)

# print the decision-tree rules and append them to the rule file
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print('feature_name:', feature_name)
    print("def tree({}):".format(", ".join(feature_names)))
    with open('../../models/rule_max_leaves_' + str(max_leaf_nodes) + '.txt', 'a') as fw:
        fw.write("def tree({}):".format(", ".join(feature_names)) + '\n')

        def recurse(node, depth):
            global node_id
            indent = "    " * depth  # four spaces per tree depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print("{}if {} <= {}:".format(indent, name, threshold))
                fw.write("{}if {} <= {}:".format(indent, name, threshold) + '\n')
                recurse(tree_.children_left[node], depth + 1)
                print("{}else:  # if {} > {}".format(indent, name, threshold))
                fw.write("{}else:  # if {} > {}".format(indent, name, threshold) + '\n')
                recurse(tree_.children_right[node], depth + 1)
            else:
                # leaf node: emit the next rule id
                print("{}return {}".format(indent, node_id))
                fw.write("{}return {}".format(indent, node_id) + '\n')
                node_id += 1

        recurse(0, 1)

if __name__ == '__main__':
    data = pd.read_csv(data_path, sep='\t')
    # the important raw features identified in section 4.1
    feature_names = ["active_days_30d", "active_frd_prefer", "chat_frd_prefer"]
    X = data.loc[:, feature_names]
    # fill missing values with 0
    for col in feature_names:
        X[col] = X[col].fillna(0)
    y = data.loc[:, label]
    dt = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes).fit(X, y)
    print('auc = %.4f' % roc_auc_score(y_true=y, y_score=dt.predict_proba(X)[:, 1]))
    # model path passed on the command line, e.g. ../../models/model_rules.m
    model_file_path = sys.argv[1]
    draw_tree(dt, model_file_path[:-2] + '.dot', feature_names)
    tree_to_code(dt, feature_names)
Replace the features in the script, then rerun:
/data/zz/anaconda3/bin/python generate_dt_rule.py ../../models/model_rules.m
Output
rule_max_leaves_8.txt
def tree(active_days_30d, chat_frd_prefer, active_frd_prefer):
    if active_days_30d <= 4.5:
        if active_days_30d <= 1.5:
            if chat_frd_prefer <= 0.0:
                return 0
            else:  # if chat_frd_prefer > 0.0
                return 1
        else:  # if active_days_30d > 1.5
            if active_days_30d <= 2.5:
                if chat_frd_prefer <= 1.5:
                    return 2
                else:  # if chat_frd_prefer > 1.5
                    return 3
            else:  # if active_days_30d > 2.5
                return 4
    else:  # if active_days_30d > 4.5
        if active_days_30d <= 11.5:
            return 5
        else:  # if active_days_30d > 11.5
            if active_days_30d <= 20.5:
                return 6
            else:  # if active_days_30d > 20.5
                return 7
5. Rule evaluation
Paste the important features and the extracted rules into the evaluation script.
5.1 Evaluation on raw features
eval_rule.py
import pandas as pd
import sys
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=30, progress_bar=True)

data_path = '../../data/tr.txt'
# data_path = '../../data/tr_bins.txt'
feature_names = ["active_days_30d", "active_frd_prefer", "chat_frd_prefer"]
label = 'is_later_30d_loss'

def tree(active_days_30d, chat_frd_prefer, active_frd_prefer):
    if active_days_30d <= 4.5:
        if active_days_30d <= 1.5:
            if chat_frd_prefer <= 0.0:
                return 0
            else:  # if chat_frd_prefer > 0.0
                return 1
        else:  # if active_days_30d > 1.5
            if active_days_30d <= 2.5:
                if chat_frd_prefer <= 1.5:
                    return 2
                else:  # if chat_frd_prefer > 1.5
                    return 3
            else:  # if active_days_30d > 2.5
                return 4
    else:  # if active_days_30d > 4.5
        if active_days_30d <= 11.5:
            return 5
        else:  # if active_days_30d > 11.5
            if active_days_30d <= 20.5:
                return 6
            else:  # if active_days_30d > 20.5
                return 7

if __name__ == '__main__':
    if len(sys.argv) > 1:
        rule_data_file_path = sys.argv[1]
        # e.g. ../../data/tr_with_rule.txt
        data = pd.read_csv(rule_data_file_path, sep='\t')
    else:
        data = pd.read_csv(data_path, sep='\t')
    data = data[feature_names + [label]]
    for col in feature_names:
        data[col] = data[col].fillna(0)
    # pass the columns in the order of the tree() signature; adjust by hand
    # whenever the feature set changes
    data['rule_id'] = data.parallel_apply(
        lambda x: tree(x['active_days_30d'], x['chat_frd_prefer'], x['active_frd_prefer']),
        axis=1)
    data.to_csv('../../data/tr_with_rule.txt', sep='\t', index=False)
    print('total users\tchurned users\tchurn rate')
    print('data_cnt = %d, loss_cnt = %d, loss_rate = %.4f'
          % (len(data), data[label].sum(), data[label].sum() / len(data)))
    print('rule hits\tchurned users\tcoverage\tchurn rate (precision)\tchurn recall')
    df = data.groupby('rule_id')[label].agg(['count', 'sum', 'mean'])
    df.loc[:, 'mean'] = df.apply(lambda x: str(round(x['mean'] * 100, 2)) + '%', axis=1)
    df.loc[:, 'coverage'] = df.apply(lambda x: str(round(x['count'] * 100 / len(data), 2)) + '%', axis=1)
    df.loc[:, 'recall'] = df.apply(lambda x: str(round(x['sum'] * 100 / data[label].sum(), 2)) + '%', axis=1)
    df = df[['count', 'sum', 'coverage', 'mean', 'recall']]
    print(df)
Run
/data/zz/anaconda3/bin/python eval_rule.py
Output

Note: the rule ids here start from 0!
5.2 Evaluation on _bin features
Use the important _bin features and the corresponding rules directly, and evaluate on the _bin files for both the training set and the test set. After extracting the rules, map the bin rules back to raw-feature rules with the help of the binning file; pay close attention to what each bin means!
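A hedged sketch of that mapping step. The report column names 'feature_name', 'bin_index', and 'upper_bound' are assumptions; check the actual headers in tr_bins.xlsx:

import pandas as pd

report = pd.read_excel('../../reports/tr_bins.xlsx')

def bin_rule_to_raw(feature, bin_threshold):
    # "feature_bin <= 3.5" selects bins 0..3, so the raw rule becomes
    # "feature <= upper bound of bin 3" (only valid when the split sides
    # are contiguous in raw values; see section 3.1)
    idx = int(bin_threshold)
    row = report[(report['feature_name'] == feature) & (report['bin_index'] == idx)]
    return '%s <= %s' % (feature, row['upper_bound'].iloc[0])

print(bin_rule_to_raw('active_days_30d', 3.5))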
5.3 Evaluation summary
Data source = zz.table_name
Date = 2021/07/01
Missing feature values filled with -1
[All users]
Label = is_later_30d_loss
Whether the user churns in the following 30 days, i.e. whether the active-day count in the following 30 days is 0.
[Training set]

Where:
coverage = users hit by the rule / total samples (25924/2497635)
churn rate = churned users / users hit by the rule (11287/25924 = 43.5%), i.e. precision
churn recall = churned users / total churned users (11287/86171 = 13.1%)
[Test set]


This article walked through downloading data from a Hive table, randomly splitting it into training and test sets, binning the features, and extracting rules with decision trees built on different feature sets. It focused on the rule-evaluation process, covering both raw and binned features, and provided code for each key step.