res_DeepCrossing模型
DeepCrossing
4. 代码实现
# 导包
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from utils import SparseFeat, DenseFeat, VarLenSparseFeat
# from collections import namedtuple
# # 使用具名元组定义特征标记
# SparseFeat = namedtuple('SparseFeat', ['name', 'vocabulary_size', 'embedding_dim'])
# DenseFeat = namedtuple('DenseFeat', ['name', 'dimension'])
# VarLenSparseFeat = namedtuple('VarLenSparseFeat', ['name', 'vocabulary_size', 'embedding_dim', 'maxlen'])
1.读取并处理数据
- 读取数据
- 划分出数值型特征和类别型特征
- 数值型特征处理 – 空值填充(补0),取对数(归一化?)
- 类别型特征的处理 – 空值填充为"-1",再用LabelEncoder做整数编码(注意:并非one-hot编码)
# Load the Criteo sample file into a DataFrame and preview the first rows
data = pd.read_csv('./data/criteo_sample.txt')
data.head()
label | I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | ... | C17 | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | NaN | 3 | 260.0 | NaN | 17668.0 | NaN | NaN | 33.0 | NaN | ... | e5ba7672 | 87c6f83c | NaN | NaN | 0429f84b | NaN | 3a171ecb | c0d61a5c | NaN | NaN |
1 | 0 | NaN | -1 | 19.0 | 35.0 | 30251.0 | 247.0 | 1.0 | 35.0 | 160.0 | ... | d4bb7bd8 | 6fc84bfb | NaN | NaN | 5155d8a3 | NaN | be7c41b4 | ded4aac9 | NaN | NaN |
2 | 0 | 0.0 | 0 | 2.0 | 12.0 | 2013.0 | 164.0 | 6.0 | 35.0 | 523.0 | ... | e5ba7672 | 675c9258 | NaN | NaN | 2e01979f | NaN | bcdee96c | 6d5d1302 | NaN | NaN |
3 | 0 | NaN | 13 | 1.0 | 4.0 | 16836.0 | 200.0 | 5.0 | 4.0 | 29.0 | ... | e5ba7672 | 52e44668 | NaN | NaN | e587c466 | NaN | 32c7478e | 3b183c5c | NaN | NaN |
4 | 0 | 0.0 | 0 | 104.0 | 27.0 | 1990.0 | 142.0 | 4.0 | 32.0 | 37.0 | ... | e5ba7672 | 25c88e42 | 21ddcdc9 | b1252a9d | 0e8585d2 | NaN | 32c7478e | 0d4a6d1a | 001f3601 | 92c878de |
5 rows × 40 columns
# Print per-column non-null counts and dtypes of the raw data
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 200 non-null int64
1 I1 110 non-null float64
2 I2 200 non-null int64
3 I3 166 non-null float64
4 I4 165 non-null float64
5 I5 194 non-null float64
6 I6 149 non-null float64
7 I7 190 non-null float64
8 I8 200 non-null float64
9 I9 190 non-null float64
10 I10 110 non-null float64
11 I11 190 non-null float64
12 I12 43 non-null float64
13 I13 165 non-null float64
14 C1 200 non-null object
15 C2 200 non-null object
16 C3 191 non-null object
17 C4 191 non-null object
18 C5 200 non-null object
19 C6 168 non-null object
20 C7 200 non-null object
21 C8 200 non-null object
22 C9 200 non-null object
23 C10 200 non-null object
24 C11 200 non-null object
25 C12 191 non-null object
26 C13 200 non-null object
27 C14 200 non-null object
28 C15 200 non-null object
29 C16 191 non-null object
30 C17 200 non-null object
31 C18 200 non-null object
32 C19 118 non-null object
33 C20 118 non-null object
34 C21 191 non-null object
35 C22 41 non-null object
36 C23 200 non-null object
37 C24 191 non-null object
38 C25 118 non-null object
39 C26 118 non-null object
dtypes: float64(12), int64(2), object(26)
memory usage: 62.6+ KB
# Split the data into sparse_feature and dense_feature groups;
# start from the array of all column names
columns = data.columns.values
columns
array(['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16',
'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25',
'C26'], dtype=object)
# The column names printed above are 'label', I1..I13 (numeric) and C1..C26
# (categorical), so classify by prefix. A substring test ('I' in feat) would
# also pick up any other column that merely contains that letter.
dense_features = [feat for feat in columns if feat.startswith('I')]
sparse_features = [feat for feat in columns if feat.startswith('C')]
dense_features + sparse_features  # lists can be concatenated with +
['I1',
'I2',
'I3',
'I4',
'I5',
'I6',
'I7',
'I8',
'I9',
'I10',
'I11',
'I12',
'I13',
'C1',
'C2',
'C3',
'C4',
'C5',
'C6',
'C7',
'C8',
'C9',
'C10',
'C11',
'C12',
'C13',
'C14',
'C15',
'C16',
'C17',
'C18',
'C19',
'C20',
'C21',
'C22',
'C23',
'C24',
'C25',
'C26']
# 数据处理的函数
def data_process(data_df, dense_features, sparse_features):
    """Fill missing values and encode the dense and sparse feature columns.

    Dense columns: missing values become 0.0, then every value ``x > -1`` is
    replaced by ``log(x + 1)`` and every other value by the sentinel ``-1``.
    Sparse columns: missing values become the string ``"-1"``, then each
    column is mapped to integer codes by its own ``LabelEncoder`` (integer
    label encoding, not one-hot encoding).

    NOTE: ``data_df`` is modified in place -- the processed values overwrite
    the original columns of the frame passed in by the caller.

    param data_df: DataFrame holding the raw feature columns
    param dense_features: iterable of numeric feature column names
    param sparse_features: iterable of categorical feature column names
    return: DataFrame with only the processed columns, dense first, then sparse
    """
    # Materialise the names as lists: list + tuple concatenation would fail,
    # and pandas does not treat a tuple key as a list of column names.
    dense_cols = list(dense_features)
    sparse_cols = list(sparse_features)

    def _log_or_sentinel(value):
        # log(value + 1) is only defined for value > -1; flag the rest with -1
        return np.log(value + 1) if value > -1 else -1

    if dense_cols:
        data_df[dense_cols] = data_df[dense_cols].fillna(0.0)
        for col in dense_cols:
            data_df[col] = data_df[col].apply(_log_or_sentinel)

    if sparse_cols:
        # Missing categories become their own "-1" category before encoding
        data_df[sparse_cols] = data_df[sparse_cols].fillna("-1")
        for col in sparse_cols:
            # A fresh encoder per column: codes are independent across columns
            data_df[col] = LabelEncoder().fit_transform(data_df[col])

    return data_df[dense_cols + sparse_cols]
# Preprocess the features; data_process also overwrites these columns of
# `data` in place and returns only the dense + sparse feature columns
train_data = data_process(data, dense_features, sparse_features)
train_data
I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | I10 | ... | C17 | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 1.386294 | 5.564520 | 0.000000 | 9.779567 | 0.000000 | 0.000000 | 3.526361 | 0.000000 | 0.000000 | ... | 8 | 66 | 0 | 0 | 3 | 0 | 1 | 96 | 0 | 0 |
1 | 0.000000 | -1.000000 | 2.995732 | 3.583519 | 10.317318 | 5.513429 | 0.693147 | 3.583519 | 5.081404 | 0.000000 | ... | 7 | 52 | 0 | 0 | 47 | 0 | 7 | 112 | 0 | 0 |
2 | 0.000000 | 0.000000 | 1.098612 | 2.564949 | 7.607878 | 5.105945 | 1.945910 | 3.583519 | 6.261492 | 0.000000 | ... | 8 | 49 | 0 | 0 | 25 | 0 | 6 | 53 | 0 | 0 |
3 | 0.000000 | 2.639057 | 0.693147 | 1.609438 | 9.731334 | 5.303305 | 1.791759 | 1.609438 | 3.401197 | 0.000000 | ... | 8 | 37 | 0 | 0 | 156 | 0 | 0 | 32 | 0 | 0 |
4 | 0.000000 | 0.000000 | 4.653960 | 3.332205 | 7.596392 | 4.962845 | 1.609438 | 3.496508 | 3.637586 | 0.000000 | ... | 8 | 14 | 5 | 3 | 9 | 0 | 0 | 5 | 1 | 47 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 0.000000 | 0.000000 | 4.736198 | 1.386294 | 8.018625 | 6.356108 | 1.098612 | 1.386294 | 5.370638 | 0.000000 | ... | 0 | 74 | 5 | 1 | 30 | 5 | 0 | 118 | 17 | 48 |
196 | 0.000000 | 0.693147 | 0.693147 | 0.693147 | 7.382746 | 2.564949 | 0.693147 | 2.564949 | 2.772589 | 0.000000 | ... | 1 | 25 | 0 | 0 | 138 | 0 | 0 | 68 | 0 | 0 |
197 | 0.693147 | 0.000000 | 1.945910 | 1.386294 | 0.000000 | 0.000000 | 2.995732 | 1.386294 | 1.386294 | 0.693147 | ... | 4 | 40 | 17 | 2 | 41 | 0 | 0 | 12 | 16 | 11 |
198 | 0.000000 | 3.135494 | 1.945910 | 3.135494 | 5.318120 | 5.036953 | 4.394449 | 2.944439 | 6.232448 | 0.000000 | ... | 4 | 7 | 18 | 1 | 123 | 0 | 0 | 10 | 16 | 49 |
199 | 0.693147 | -1.000000 | 0.000000 | 0.000000 | 4.934474 | 0.000000 | 0.693147 | 0.000000 | 0.000000 | 0.693147 | ... | 7 | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
200 rows × 39 columns
# Re-attach the target column, which is not among the columns returned by data_process
train_data['label'] = data['label']
train_data
I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | I10 | ... | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 1.386294 | 5.564520 | 0.000000 | 9.779567 | 0.000000 | 0.000000 | 3.526361 | 0.000000 | 0.000000 | ... | 66 | 0 | 0 | 3 | 0 | 1 | 96 | 0 | 0 | 0 |
1 | 0.000000 | -1.000000 | 2.995732 | 3.583519 | 10.317318 | 5.513429 | 0.693147 | 3.583519 | 5.081404 | 0.000000 | ... | 52 | 0 | 0 | 47 | 0 | 7 | 112 | 0 | 0 | 0 |
2 | 0.000000 | 0.000000 | 1.098612 | 2.564949 | 7.607878 | 5.105945 | 1.945910 | 3.583519 | 6.261492 | 0.000000 | ... | 49 | 0 | 0 | 25 | 0 | 6 | 53 | 0 | 0 | 0 |
3 | 0.000000 | 2.639057 | 0.693147 | 1.609438 | 9.731334 | 5.303305 | 1.791759 | 1.609438 | 3.401197 | 0.000000 | ... | 37 | 0 | 0 | 156 | 0 | 0 | 32 | 0 | 0 | 0 |
4 | 0.000000 | 0.000000 | 4.653960 | 3.332205 | 7.596392 | 4.962845 | 1.609438 | 3.496508 | 3.637586 | 0.000000 | ... | 14 | 5 | 3 | 9 | 0 | 0 | 5 | 1 | 47 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 0.000000 | 0.000000 | 4.736198 | 1.386294 | 8.018625 | 6.356108 | 1.098612 | 1.386294 | 5.370638 | 0.000000 | ... | 74 | 5 | 1 | 30 | 5 | 0 | 118 | 17 | 48 | 0 |
196 | 0.000000 | 0.693147 | 0.693147 | 0.693147 | 7.382746 | 2.564949 | 0.693147 | 2.564949 | 2.772589 | 0.000000 | ... | 25 | 0 | 0 | 138 | 0 | 0 | 68 | 0 | 0 | 1 |
197 | 0.693147 | 0.000000 | 1.945910 | 1.386294 | 0.000000 | 0.000000 | 2.995732 | 1.386294 | 1.386294 | 0.693147 | ... | 40 | 17 | 2 | 41 | 0 | 0 | 12 | 16 | 11 | 1 |
198 | 0.000000 | 3.135494 | 1.945910 | 3.135494 | 5.318120 | 5.036953 | 4.394449 | 2.944439 | 6.232448 | 0.000000 | ... | 7 | 18 | 1 | 123 | 0 | 0 | 10 | 16 | 49 | 0 |
199 | 0.693147 | -1.000000 | 0.000000 | 0.000000 | 4.934474 | 0.000000 | 0.693147 | 0.000000 | 0.000000 | 0.693147 | ... | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
200 rows × 40 columns
# Print per-column non-null counts and dtypes after preprocessing
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 I1 200 non-null float64
1 I2 200 non-null float64
2 I3 200 non-null float64
3 I4 200 non-null float64
4 I5 200 non-null float64
5 I6 200 non-null float64
6 I7 200 non-null float64
7 I8 200 non-null float64