前言:
TextCNN 对长度较短(约20个字符以下)的文本分类效果较好。例如,要根据新闻标题对新闻类别进行分类,就可以使用 TextCNN。主要步骤如下:
1、导入数据 read_csv
2、jieba分词(中文)
3、建立词库 Tokenizer.fit_on_texts
4、把词转换成数字编号 Tokenizer.texts_to_sequences
5、将每个句子(sentence)补齐或截断为统一长度 pad_sequences
6、构建textcnn主函数并训练
7、predict出最后结果
注:我也没有数据集,可以自己爬豆瓣玩玩
import tensorflow as tf
import pandas as pd
import jieba
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Data loading and preprocessing
out_dimension = 300    # size of each word-embedding vector
sentence_length = 100  # fixed number of tokens per sample, passed as maxlen to pad_sequences

data = pd.read_csv('data.csv')
# Segment every document into a list of words with jieba.
data['content'] = data['content'].apply(jieba.lcut)

# Build the vocabulary from the segmented texts.
tk = Tokenizer()
tk.fit_on_texts(data['content'])
# Vocabulary size handed to the Embedding layer; the +1 presumably accounts
# for index 0, which the Tokenizer does not assign to any word.
feature_nums = 1 + len(tk.word_index)

train_x, test_x, train_y, test_y = train_test_split(
    data['content'],
    data['label'],
    test_size=0.2,
    random_state=10,
)

# Convert words to integer ids, then bring every sample to the same length.
train_x = pad_sequences(tk.texts_to_sequences(train_x), maxlen=sentence_length)
test_x = pad_sequences(tk.texts_to_sequences(test_x), maxlen=sentence_length)
# Model construction
def convolutions(seq_len=None, embed_dim=None, kernel_sizes=(3, 4, 5), filters=64):
    """Build the parallel convolution branches of TextCNN as a Keras model.

    Each branch applies a Conv2D whose kernel spans the full embedding width,
    followed by a max-pool over every remaining position, and the branch
    outputs are concatenated along the last axis.

    Args:
        seq_len: Number of tokens per sample. Defaults to the module-level
            ``sentence_length`` when omitted.
        embed_dim: Embedding vector size. Defaults to the module-level
            ``out_dimension`` when omitted.
        kernel_sizes: Heights (in tokens) of the convolution kernels, one
            branch per entry.
        filters: Number of filters in every branch.

    Returns:
        A ``tf.keras.Model`` taking input of shape
        ``(seq_len, embed_dim, 1)`` per sample.

    Raises:
        ValueError: If ``kernel_sizes`` is empty or contains a size larger
            than ``seq_len``.
    """
    # Fall back to the module-level settings so that a bare `convolutions()`
    # call keeps working exactly as before.
    if seq_len is None:
        seq_len = sentence_length
    if embed_dim is None:
        embed_dim = out_dimension

    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError("kernel_sizes must contain at least one size")
    if max(kernel_sizes) > seq_len:
        raise ValueError(
            "kernel size %d exceeds sequence length %d" % (max(kernel_sizes), seq_len)
        )

    inpt = layers.Input(shape=(seq_len, embed_dim, 1))
    pooled = []
    for size in kernel_sizes:
        conv = layers.Conv2D(
            filters=filters,
            kernel_size=(size, embed_dim),
            activation='relu',
            strides=1,
        )(inpt)
        # The pool window covers all seq_len - size + 1 rows left by the
        # convolution, so each filter is reduced to a single value.
        conv = layers.MaxPool2D(pool_size=(seq_len - size + 1, 1))(conv)
        pooled.append(conv)

    # A single branch needs no concatenation.
    outt = pooled[0] if len(pooled) == 1 else layers.concatenate(pooled)
    return tf.keras.Model(inputs=inpt, outputs=outt)


def textcnn(feature_nums, out_dimension, sentence_length, num_classes=3):
    """Build and compile the full TextCNN classifier.

    Args:
        feature_nums: Vocabulary size used as the Embedding ``input_dim``.
        out_dimension: Embedding vector size.
        sentence_length: Number of tokens per (padded) sample.
        num_classes: Number of output classes for the softmax layer.

    Returns:
        A compiled ``tf.keras.Sequential`` model using Adam and sparse
        categorical cross-entropy, reporting accuracy.
    """
    model = tf.keras.Sequential([
        layers.Embedding(
            input_dim=feature_nums,
            output_dim=out_dimension,
            input_length=sentence_length,
        ),
        # Add a trailing channel axis so the embeddings can feed Conv2D.
        layers.Reshape((sentence_length, out_dimension, 1)),
        # Pass the dimensions explicitly so the convolution branches always
        # match this model's arguments rather than the module-level globals.
        convolutions(sentence_length, out_dimension),
        layers.Flatten(),
        layers.Dense(100, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model
# Model training and prediction
model = textcnn(feature_nums, out_dimension, sentence_length)
model.fit(train_x, train_y, batch_size=64, epochs=10, validation_split=0.2)
# `Sequential.predict_classes` was removed in TensorFlow 2.6. The model ends in
# a softmax layer, so the predicted class is the argmax of the probabilities.
y_pred = model.predict(test_x).argmax(axis=-1)