# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
import jieba
df = pd.read_csv('train_data.csv', encoding='utf-8')
# Map satisfaction_id to a binary label: id 1 -> 1, id 3 -> 0
df['label'] = 1
df.loc[df['satisfaction_id'] == 1, 'label'] = 1
df.loc[df['satisfaction_id'] == 3, 'label'] = 0
df['send_content_words'] = df['send_content'].apply(lambda s: list(jieba.cut(s)))  # segment the text with jieba
maxlen = 100    # truncate each document to this many words
min_count = 5   # drop words seen fewer than this many times (the simplest form of dimensionality reduction)
content = []
for i in df['send_content_words']:
    content.extend(i)
# --- Build the word-to-index dictionary ---
abc = pd.Series(content).value_counts()
abc = abc[abc >= min_count]          # keep only frequent words
abc[:] = range(1, len(abc) + 1)      # assign each word an integer id starting from 1
abc[''] = 0                          # reserve index 0 for the padding token
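# Sanity-check the dictionary (a minimal sketch; the tokens and counts depend
# on the corpus, so any printed values are only illustrative):
print('vocabulary size (incl. padding):', len(abc))
print('padding index:', abc[''])
print(abc.head())  # the most frequent words get the smallest ids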
def doc2num(s, maxlen):
    """Map a token list to a fixed-length list of integer ids."""
    s = [i for i in s if i in abc.index]             # drop out-of-vocabulary words
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))  # truncate, then pad with ''
    return list(abc[s])
df['doc2num'] = df['send_content_words'].apply(lambda s: doc2num(s, maxlen))
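# Illustrative usage of doc2num (the sentence below is a made-up example; the
# resulting ids depend entirely on the fitted dictionary abc):
sample = list(jieba.cut(u'服务很好'))
vec = doc2num(sample, maxlen)
assert len(vec) == maxlen  # every document becomes exactly maxlen ids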
# Shuffle the rows manually (a bare range object cannot be shuffled under Python 3)
idx = np.random.permutation(len(df))
all_ = df.iloc[idx]
# Assemble arrays in the shape Keras expects
x = np.array(list(all_['doc2num']))
y = np.array(list(all_['label']))
y = y.reshape((-1, 1))  # labels as a column vector
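# Quick shape check (the row count depends on the data; shown only as a sanity
# check): x is (num_samples, maxlen) integer ids, y is (num_samples, 1) labels.
print(x.shape, y.shape)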
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM,Bidirectional
# Build the model: embedding -> bidirectional LSTM -> sigmoid
model = Sequential()
model.add(Embedding(len(abc), 64, input_length=maxlen))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
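# Optionally inspect the layer stack and parameter counts before training
# (standard Keras Model API; its output is omitted here):
model.summary()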
batch_size = 128
train_num = 15000  # (defined but unused below)
model.fit(x, y, batch_size=batch_size, epochs=30)  # 'nb_epoch' was renamed 'epochs' in Keras 2
Epoch 1/30
69999/69999 [==============================] - 75s - loss: 0.3004 - acc: 0.8809
Epoch 2/30
69999/69999 [==============================] - 74s - loss: 0.2439 - acc: 0.9086
Epoch 3/30
69999/69999 [==============================] - 74s - loss: 0.2239 - acc: 0.9175
Epoch 4/30
69999/69999 [==============================] - 74s - loss: 0.2067 - acc: 0.9243
Epoch 5/30
69999/69999 [==============================] - 74s - loss: 0.1888 - acc: 0.9326
Epoch 6/30
69999/69999 [==============================] - 74s - loss: 0.1708 - acc: 0.9394
Epoch 7/30
69999/69999 [==============================] - 74s - loss: 0.1556 - acc: 0.9445
Epoch 8/30
69999/69999 [==============================] - 74s - loss: 0.1368 - acc: 0.9516
Epoch 9/30
69999/69999 [==============================] - 74s - loss: 0.1248 - acc: 0.9562
Epoch 10/30
69999/69999 [==============================] - 74s - loss: 0.1119 - acc: 0.9615
Epoch 11/30
69999/69999 [==============================] - 75s - loss: 0.1007 - acc: 0.9655
Epoch 12/30
69999/69999 [==============================] - 75s - loss: 0.0933 - acc: 0.9680
Epoch 13/30
69999/69999 [==============================] - 75s - loss: 0.0849 - acc: 0.9711
Epoch 14/30
69999/69999 [==============================] - 75s - loss: 0.0785 - acc: 0.9737
Epoch 15/30
69999/69999 [==============================] - 75s - loss: 0.0651 - acc: 0.9786
Epoch 16/30
69999/69999 [==============================] - 75s - loss: 0.0595 - acc: 0.9805
Epoch 17/30
69999/69999 [==============================] - 75s - loss: 0.0555 - acc: 0.9821
Epoch 18/30
69999/69999 [==============================] - 75s - loss: 0.0492 - acc: 0.9842
Epoch 19/30
69999/69999 [==============================] - 76s - loss: 0.0499 - acc: 0.9839
Epoch 20/30
69999/69999 [==============================] - 76s - loss: 0.0424 - acc: 0.9862
Epoch 21/30
69999/69999 [==============================] - 75s - loss: 0.0402 - acc: 0.9871
Epoch 22/30
69999/69999 [==============================] - 75s - loss: 0.0367 - acc: 0.9887
Epoch 23/30
69999/69999 [==============================] - 75s - loss: 0.0353 - acc: 0.9885
Epoch 24/30
69999/69999 [==============================] - 75s - loss: 0.0342 - acc: 0.9890
Epoch 25/30
69999/69999 [==============================] - 74s - loss: 0.0273 - acc: 0.9913
Epoch 26/30
69999/69999 [==============================] - 75s - loss: 0.0255 - acc: 0.9918
Epoch 27/30
69999/69999 [==============================] - 75s - loss: 0.0289 - acc: 0.9905
Epoch 28/30
69999/69999 [==============================] - 75s - loss: 0.0260 - acc: 0.9916
Epoch 29/30
69999/69999 [==============================] - 75s - loss: 0.0244 - acc: 0.9920
Epoch 30/30
69999/69999 [==============================] - 75s - loss: 0.0201 - acc: 0.9939
# --- Evaluate on the test set ---
df_test = pd.read_csv('test_data.csv', encoding='utf-8')
# Same binary label mapping as for the training data (indexing with df_test, not df)
df_test['label'] = 1
df_test.loc[df_test['satisfaction_id'] == 1, 'label'] = 1
df_test.loc[df_test['satisfaction_id'] == 3, 'label'] = 0
df_test['send_content_words'] = df_test['send_content'].apply(lambda s: list(jieba.cut(s)))  # segment with jieba
df_test['doc2num'] = df_test['send_content_words'].apply(lambda s: doc2num(s, maxlen))
x_test = np.array(list(df_test['doc2num']))
y_test = np.array(list(df_test['label']))
y_test = y_test.reshape((-1, 1))  # labels as a column vector
model.evaluate(x_test, y_test)
[4.7231967521905895, 0.49716666666666665]
model.evaluate(x_test[:3000],y_test[:3000])
[1.0248313174247741, 0.84699999999999998]
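# The gap between ~99% training accuracy at epoch 30 and ~50% test accuracy
# (84.7% on the first 3000 rows) indicates severe overfitting. A minimal sketch
# of one common remedy, not part of the original run (the patience value and
# validation fraction are assumptions; in practice, rebuild the model first so
# training restarts from scratch):
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model.fit(x, y, batch_size=batch_size, epochs=30,
          validation_split=0.1, callbacks=[early_stop])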
# --- CNN variant of the same classifier ---
from keras.layers import Flatten, Reshape, Conv2D, MaxPooling2D
print('Build model...')
model = Sequential()
model.add(Embedding(len(abc), 64, input_length=maxlen))
# Reshape the (maxlen, 64) embedding output into a square single-channel "image":
# 100 * 64 = 6400 = 80 * 80, so the element counts match
model.add(Reshape(target_shape=(80, 80, 1)))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(Conv2D(32,(3,3),activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Conv2D(64,(3,3),activation='relu'))
model.add(Conv2D(64,(3,3),activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
batch_size = 128
train_num = 3500  # (defined but unused below)
model.fit(x, y, batch_size=batch_size, epochs=30)
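# The original log does not record the CNN's scores; evaluating it would follow
# the same pattern as the BiLSTM model above:
model.evaluate(x_test, y_test)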