推荐算法:DeepCrossing模型

res_DeepCrossing模型

DeepCrossing

4. 代码实现

# 导包
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder
# from utils import SparseFeat, DenseFeat, VarLenSparseFeat
# from collections import namedtuple

# # 使用具名元组定义特征标记
# SparseFeat = namedtuple('SparseFeat', ['name', 'vocabulary_size', 'embedding_dim'])
# DenseFeat = namedtuple('DenseFeat', ['name', 'dimension'])
# VarLenSparseFeat = namedtuple('VarLenSparseFeat', ['name', 'vocabulary_size', 'embedding_dim', 'maxlen'])

1.读取并处理数据

  • 读取数据
  • 划分出数值型特征和类别型特征
  • 数值型特征处理 – 空值填充(补0),取对数(归一化?)
  • 类别型特征的处理 – 空值填充为"-1",再用LabelEncoder做整数标签编码(注意:并非one-hot编码)
# Load the Criteo sample file and preview its first rows
data = pd.read_csv('./data/criteo_sample.txt')
data.head()
label I1 I2 I3 I4 I5 I6 I7 I8 I9 ... C17 C18 C19 C20 C21 C22 C23 C24 C25 C26
0 0 NaN 3 260.0 NaN 17668.0 NaN NaN 33.0 NaN ... e5ba7672 87c6f83c NaN NaN 0429f84b NaN 3a171ecb c0d61a5c NaN NaN
1 0 NaN -1 19.0 35.0 30251.0 247.0 1.0 35.0 160.0 ... d4bb7bd8 6fc84bfb NaN NaN 5155d8a3 NaN be7c41b4 ded4aac9 NaN NaN
2 0 0.0 0 2.0 12.0 2013.0 164.0 6.0 35.0 523.0 ... e5ba7672 675c9258 NaN NaN 2e01979f NaN bcdee96c 6d5d1302 NaN NaN
3 0 NaN 13 1.0 4.0 16836.0 200.0 5.0 4.0 29.0 ... e5ba7672 52e44668 NaN NaN e587c466 NaN 32c7478e 3b183c5c NaN NaN
4 0 0.0 0 104.0 27.0 1990.0 142.0 4.0 32.0 37.0 ... e5ba7672 25c88e42 21ddcdc9 b1252a9d 0e8585d2 NaN 32c7478e 0d4a6d1a 001f3601 92c878de

5 rows × 40 columns

# Show per-column dtypes and non-null counts of the raw data
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   200 non-null    int64  
 1   I1      110 non-null    float64
 2   I2      200 non-null    int64  
 3   I3      166 non-null    float64
 4   I4      165 non-null    float64
 5   I5      194 non-null    float64
 6   I6      149 non-null    float64
 7   I7      190 non-null    float64
 8   I8      200 non-null    float64
 9   I9      190 non-null    float64
 10  I10     110 non-null    float64
 11  I11     190 non-null    float64
 12  I12     43 non-null     float64
 13  I13     165 non-null    float64
 14  C1      200 non-null    object 
 15  C2      200 non-null    object 
 16  C3      191 non-null    object 
 17  C4      191 non-null    object 
 18  C5      200 non-null    object 
 19  C6      168 non-null    object 
 20  C7      200 non-null    object 
 21  C8      200 non-null    object 
 22  C9      200 non-null    object 
 23  C10     200 non-null    object 
 24  C11     200 non-null    object 
 25  C12     191 non-null    object 
 26  C13     200 non-null    object 
 27  C14     200 non-null    object 
 28  C15     200 non-null    object 
 29  C16     191 non-null    object 
 30  C17     200 non-null    object 
 31  C18     200 non-null    object 
 32  C19     118 non-null    object 
 33  C20     118 non-null    object 
 34  C21     191 non-null    object 
 35  C22     41 non-null     object 
 36  C23     200 non-null    object 
 37  C24     191 non-null    object 
 38  C25     118 non-null    object 
 39  C26     118 non-null    object 
dtypes: float64(12), int64(2), object(26)
memory usage: 62.6+ KB
# Collect the column names; they are split into sparse and dense features below
columns = data.columns.values
columns
array(['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
       'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
       'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16',
       'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25',
       'C26'], dtype=object)
# The numeric columns are named I1..I13 and the categorical ones C1..C26.
# Match on the name prefix rather than on a substring, so that a column which
# merely contains the letter somewhere is not misclassified.
dense_features = [feat for feat in columns if feat.startswith('I')]
sparse_features = [feat for feat in columns if feat.startswith('C')]
dense_features + sparse_features  # lists can be concatenated with +
['I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26']
# 数据处理的函数
def data_process(data_df, dense_features, sparse_features):
    """
    Fill missing values, log-transform the numeric features and
    integer-encode the categorical features.

    The listed columns of ``data_df`` are overwritten in place with their
    processed values; columns that are not listed are left untouched.

    param data_df: DataFrame holding the raw features
    param dense_features: names of the numeric feature columns
    param sparse_features: names of the categorical feature columns
    return: a new DataFrame (an independent copy) containing the dense
            columns followed by the sparse columns
    """
    # Accept any sequence of names (list, tuple, Index, ndarray) and make sure
    # the final "+" is list concatenation rather than element-wise addition.
    dense_cols = list(dense_features)
    sparse_cols = list(sparse_features)

    for f in dense_cols:
        # Missing numeric values are treated as 0.0.
        col = data_df[f].fillna(0.0).astype("float64")
        # log(x + 1) is only defined for x > -1; everything else maps to -1.
        valid = col > -1
        # Replace invalid entries by 0 before taking the log so that no
        # warnings/NaNs are produced, then overwrite them with -1 afterwards.
        data_df[f] = np.log1p(col.where(valid, 0.0)).where(valid, -1.0)

    for f in sparse_cols:
        # Missing categories become their own "-1" category, then every
        # category is mapped to an integer id (label encoding, not one-hot).
        data_df[f] = LabelEncoder().fit_transform(data_df[f].fillna("-1"))

    # Return a copy so that callers can add columns (e.g. the label) to the
    # result without writing into a view of data_df.
    return data_df[dense_cols + sparse_cols].copy()
# Fill, transform and encode the features; the result holds only the feature columns
train_data = data_process(data, dense_features, sparse_features)
train_data
I1 I2 I3 I4 I5 I6 I7 I8 I9 I10 ... C17 C18 C19 C20 C21 C22 C23 C24 C25 C26
0 0.000000 1.386294 5.564520 0.000000 9.779567 0.000000 0.000000 3.526361 0.000000 0.000000 ... 8 66 0 0 3 0 1 96 0 0
1 0.000000 -1.000000 2.995732 3.583519 10.317318 5.513429 0.693147 3.583519 5.081404 0.000000 ... 7 52 0 0 47 0 7 112 0 0
2 0.000000 0.000000 1.098612 2.564949 7.607878 5.105945 1.945910 3.583519 6.261492 0.000000 ... 8 49 0 0 25 0 6 53 0 0
3 0.000000 2.639057 0.693147 1.609438 9.731334 5.303305 1.791759 1.609438 3.401197 0.000000 ... 8 37 0 0 156 0 0 32 0 0
4 0.000000 0.000000 4.653960 3.332205 7.596392 4.962845 1.609438 3.496508 3.637586 0.000000 ... 8 14 5 3 9 0 0 5 1 47
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195 0.000000 0.000000 4.736198 1.386294 8.018625 6.356108 1.098612 1.386294 5.370638 0.000000 ... 0 74 5 1 30 5 0 118 17 48
196 0.000000 0.693147 0.693147 0.693147 7.382746 2.564949 0.693147 2.564949 2.772589 0.000000 ... 1 25 0 0 138 0 0 68 0 0
197 0.693147 0.000000 1.945910 1.386294 0.000000 0.000000 2.995732 1.386294 1.386294 0.693147 ... 4 40 17 2 41 0 0 12 16 11
198 0.000000 3.135494 1.945910 3.135494 5.318120 5.036953 4.394449 2.944439 6.232448 0.000000 ... 4 7 18 1 123 0 0 10 16 49
199 0.693147 -1.000000 0.000000 0.000000 4.934474 0.000000 0.693147 0.000000 0.000000 0.693147 ... 7 72 0 0 0 0 0 0 0 0

200 rows × 39 columns

# Re-attach the target column, which data_process does not include in its result
train_data['label'] = data['label']
train_data
I1 I2 I3 I4 I5 I6 I7 I8 I9 I10 ... C18 C19 C20 C21 C22 C23 C24 C25 C26 label
0 0.000000 1.386294 5.564520 0.000000 9.779567 0.000000 0.000000 3.526361 0.000000 0.000000 ... 66 0 0 3 0 1 96 0 0 0
1 0.000000 -1.000000 2.995732 3.583519 10.317318 5.513429 0.693147 3.583519 5.081404 0.000000 ... 52 0 0 47 0 7 112 0 0 0
2 0.000000 0.000000 1.098612 2.564949 7.607878 5.105945 1.945910 3.583519 6.261492 0.000000 ... 49 0 0 25 0 6 53 0 0 0
3 0.000000 2.639057 0.693147 1.609438 9.731334 5.303305 1.791759 1.609438 3.401197 0.000000 ... 37 0 0 156 0 0 32 0 0 0
4 0.000000 0.000000 4.653960 3.332205 7.596392 4.962845 1.609438 3.496508 3.637586 0.000000 ... 14 5 3 9 0 0 5 1 47 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
195 0.000000 0.000000 4.736198 1.386294 8.018625 6.356108 1.098612 1.386294 5.370638 0.000000 ... 74 5 1 30 5 0 118 17 48 0
196 0.000000 0.693147 0.693147 0.693147 7.382746 2.564949 0.693147 2.564949 2.772589 0.000000 ... 25 0 0 138 0 0 68 0 0 1
197 0.693147 0.000000 1.945910 1.386294 0.000000 0.000000 2.995732 1.386294 1.386294 0.693147 ... 40 17 2 41 0 0 12 16 11 1
198 0.000000 3.135494 1.945910 3.135494 5.318120 5.036953 4.394449 2.944439 6.232448 0.000000 ... 7 18 1 123 0 0 10 16 49 0
199 0.693147 -1.000000 0.000000 0.000000 4.934474 0.000000 0.693147 0.000000 0.000000 0.693147 ... 72 0 0 0 0 0 0 0 0 0

200 rows × 40 columns

# Show per-column dtypes and non-null counts after processing
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   I1      200 non-null    float64
 1   I2      200 non-null    float64
 2   I3      200 non-null    float64
 3   I4      200 non-null    float64
 4   I5      200 non-null    float64
 5   I6      200 non-null    float64
 6   I7      200 non-null    float64
 7   I8      200 non-null    float64
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值