word2vec和常见CNN+RNN网络结构组成的文本分类模型

最新推荐文章于 2025-05-09 21:47:45 发布

南楚巫妖

最新推荐文章于 2025-05-09 21:47:45 发布

阅读量3.3k

点赞数 5

分类专栏： kaggle 深度学习自然语言处理

本文链接：https://blog.youkuaiyun.com/yingdajun/article/details/108712464

版权

自然语言处理同时被 3 个专栏收录

40 篇文章

订阅专栏

深度学习

24 篇文章

订阅专栏

kaggle

8 篇文章

订阅专栏

作者为了应付毕业，所以在补充深度学习相关知识，这是我尝试把word2vec和深度学习相互结合的一次记录。

数据集来源
数据集预处理
生成word2vec模型
搭建网络并且训练

数据集来源

本文的数据集源自Kaggle的NLP入门比赛“Natural Language Processing with Disaster Tweets”（灾难推文分类：判断一条推文是否在描述真实发生的灾难）。

数据集预处理

数据导入：

import numpy as np
import pandas as pd
# Read the training and test splits from the local data/ directory.
train_df, test_df = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

数据预处理：

import re
import os, sys
import string
# 停用词
from nltk.corpus import stopwords
# Lowercase
def text_to_lowercase(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered
# Strip punctuation
def text_remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` deleted."""
    # Three-argument maketrans: the third string lists the characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)
# Strip URLs
def text_remove_url(text):
    """Return *text* with each "http" plus the non-whitespace run after it removed."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
# Strip Twitter @handles
def text_remove_twitter_handle(text):
    """Return *text* with each "@" and the non-whitespace run after it removed.

    An "@" that is immediately followed by whitespace (or ends the string)
    is left in place, because the pattern requires at least one character
    after it.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about. The regex itself is unchanged.
    return re.sub(r'@[^\s]+', '', text)
# Trim leading and trailing whitespace (str.strip() with its default argument)
def text_remove_leadtrail_spaces(text):
    """Return *text* without leading or trailing whitespace."""
    trimmed = text.strip()
    return trimmed
def clean_text(text):
    """Run *text* through every cleaning step and return the result.

    Steps, in order: remove @handles, remove URLs, remove punctuation,
    lowercase, trim surrounding whitespace.
    """
    # Order matters: handles and URLs are matched by their "@" / "://"
    # characters, so they must be removed before punctuation is stripped.
    pipeline = (
        text_remove_twitter_handle,
        text_remove_url,
        text_remove_punctuation,
        text_to_lowercase,
        text_remove_leadtrail_spaces,
    )
    for step in pipeline:
        text = step(text)
    return text
# Clean every tweet of both splits and store the result in a new
# "text_processed" column.
train_df['text_processed'] = list(map(clean_text, train_df["text"]))
test_df['text_processed'] = list(map(clean_text, test_df["text"]))
# Model input (cleaned text) and label column of the training split.
feature, target = train_df['text_processed'], train_df['target']

生成word2vec模型

from gensim.models import Word2Vec
# Train skip-gram (sg=1) word vectors with negative sampling (hs=0):
# 500-dimensional vectors, context window 6, 8 passes over the corpus,
# dropping words that occur fewer than 7 times.
# Word2Vec expects an iterable of token lists. Given raw strings it iterates
# each string character by character and learns per-character vectors, so
# every cleaned tweet is split on whitespace first.
sentences = [text.split() for text in feature]
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs`.
w2v_model = Word2Vec(sentences, size=500, sg=1, hs=0, window=6, iter=8, min_count=7)
# NOTE: despite the ".pkl" suffix, this writes the binary word2vec format,
# not a pickle.
w2v_model.wv.save_word2vec_format("./word2Vec" + ".pkl", binary=True)

导入工具包

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.layers.merge import concatenate
# 搭建模型
from keras.models import Sequential, Model
# 这个是层的搭建
from keras.layers import Dense, Embedding, Activation, Input
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D
from keras.layers import  BatchNormalization
from keras.layers import Convolution1D, Conv1D,MaxPooling1D
from keras.layers import Dense, Embedding, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
# One-hot label encoder used when preparing the targets.
from keras.utils import to_categorical

数据集分割和转数字

# Number of target classes
NUM_CLASS = 2
# Length every token-id sequence is padded / truncated to
INPUT_SIZE = 64

# Tokenizer maps each word to an integer id; it lowercases the text, removes
# the characters listed in `filters` and splits on single spaces.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(feature)
vocab = tokenizer.word_index

# Texts -> id sequences -> fixed-length integer matrix.
x_ids = tokenizer.texts_to_sequences(feature)
pad_s = pad_sequences(x_ids, maxlen=INPUT_SIZE)

# One-hot encode the labels so they match the softmax output layers.
from keras.utils import to_categorical
target_u = to_categorical(target, NUM_CLASS)

# Hold out 20% of the samples for validation; fixed seed for repeatability.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    pad_s, target_u, test_size=0.2, random_state=22
)

搭建网络并且训练

给Embedding加入word2vec

# Initial weights for the Embedding layers: row i holds the word2vec vector
# of the word whose tokenizer id is i. The width (500) must match both the
# Word2Vec vector size and the Embedding output dimension used below.
# Row 0 (tokenizer ids start at 1) and rows of words that word2vec did not
# keep stay all-zero.
embeding_matrix = np.zeros((len(vocab) + 1, 500))
# Look vectors up through `.wv`: indexing the Word2Vec model object directly
# is deprecated in gensim 3.x and no longer supported in gensim 4.
word_vectors = w2v_model.wv
for word, i in vocab.items():
    if word in word_vectors:
        embeding_matrix[i] = word_vectors[word]

textCNN模型加入word2vec

from keras.layers import Flatten,Dropout
# TextCNN: one shared embedding feeds three parallel convolution branches
# (kernel widths 3, 4 and 5); each branch is max-pooled down to a single
# timestep, the branches are concatenated, flattened and classified.
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                     weights=[embeding_matrix], trainable=True)
embed = embedder(main_input)
# padding='same' with stride 1 keeps all INPUT_SIZE timesteps, so the pooling
# window must span INPUT_SIZE to see the whole sequence. The earlier pool
# sizes (38/37/36) covered only the first timesteps and dropped the rest;
# because pad_sequences pads at the front by default, the dropped tail is
# where the actual tokens sit. Output shape per branch is unchanged.
cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn1 = MaxPooling1D(pool_size=INPUT_SIZE)(cnn1)
cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
cnn2 = MaxPooling1D(pool_size=INPUT_SIZE)(cnn2)
cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
cnn3 = MaxPooling1D(pool_size=INPUT_SIZE)(cnn3)
# Join the three pooled feature maps along the channel axis.
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(NUM_CLASS, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.summary()

模型搭建结果：

# Train the TextCNN; labels are one-hot, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

模型训练结果：

其他的模型

加了word2vec的CNN模型

# Stacked CNN: embedding -> (conv, max-pool) x 2 -> conv -> flatten ->
# dropout -> batch norm -> dense -> dropout -> softmax.
model = Sequential()
# The Embedding layer turns each token id into its (word2vec-initialised) vector.
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Conv1D(256, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(128, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())  # (batch) normalisation layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

加入了word2vec的RNN模型

# LSTM classifier: embedding -> single LSTM (final state only) -> softmax.
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

加入了word2vec的Bi-GRU

# Architecture: embedding -> bidirectional GRU x 2 -> dense softmax
model = Sequential()
# Input sequences are INPUT_SIZE (64) token ids long.
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
# The first GRU returns the full sequence so the second one can consume it.
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

加入了word2vec的CNN+RNN 串联

# Architecture: embedding -> convolution + pooling -> GRU x 2 -> dense softmax
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Convolution1D(256, 3, padding='same', strides = 1))
model.add(Activation('relu'))
model.add(MaxPool1D(pool_size=2))
# The first GRU returns the full sequence so the second one can consume it.
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences = True))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))

加入了word2vec的 CNN+RNN 并联

# Architecture: embedding -> convolution + pooling -> dense --+-> concat -> dense softmax
#                         -> bidirectional GRU    -> dense --+
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embed = Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True)(main_input)
# CNN branch
cnn = Convolution1D(256, 3, padding='same', strides = 1, activation='relu')(embed)
cnn = MaxPool1D(pool_size=4)(cnn)
cnn = Flatten()(cnn)
cnn = Dense(256)(cnn)
# RNN branch, fed from the same embedding output
rnn = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embed)
rnn = Dense(256)(rnn)
# Merge both 256-unit branch outputs along the feature axis.
con = concatenate([cnn,rnn], axis=-1)
main_output = Dense(NUM_CLASS, activation='softmax')(con)
model = Model(inputs = main_input, outputs = main_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
# validation_data is passed as a tuple (x_val, y_val), the form documented
# for Model.fit; a list can be interpreted as multiple model inputs.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))