Built-in datasets
Binary positive/negative movie-review classification (already encoded as integer IDs)
from keras.datasets import imdb # Internet Movie Database
# With num_words set small, you'll see that the most frequent words are mostly stop words
(x, y), _ = imdb.load_data(num_words=20)
print(x.shape, y.shape) # (25000,) (25000,)
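# x: 25000 reviews, each a Python list of word IDs; y: 0/1 sentiment labels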
# Mapping between words and integer IDs
# get_word_index() ranks words from 1 (most frequent); load_data() shifts each
# rank by 3 and reserves IDs 0/1/2 for padding, sequence start and out-of-vocabulary
word2id = imdb.get_word_index()
id2word = {i + 3: w for w, i in word2id.items()}
id2word.update({0: '<pad>', 1: '<start>', 2: '<unk>'})
print(x[0])
print(' '.join([id2word[i] for i in x[0]]))
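The stop-word claim can be checked directly against the rank order in get_word_index() (rank 1 = most frequent); a minimal sketch:
top10 = sorted(word2id, key=word2id.get)[:10]
print(top10) # e.g. 'the', 'and', 'a', 'of', 'to', ... (typical stop words)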
Truncating or padding sequences to a fixed length
from keras.preprocessing.sequence import pad_sequences
maxlen = 2
print(pad_sequences([[1, 2, 3], [1]], maxlen))
"""[[2 3] [0 1]]"""
print(pad_sequences([[1, 2, 3], [1]], maxlen, value=9))
"""[[2 3] [9 1]]"""
print(pad_sequences([[1, 2, 3], [1]], maxlen, padding='post'))
"""[[2 3] [1 0]]"""
print(pad_sequences([[1, 2, 3], [1]], maxlen, truncating='post'))
"""[[1 2] [0 1]]"""
Word embeddings
from keras.datasets.imdb import load_data # binary sentiment classification of movie reviews
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
"""配置"""
num_words = 10000 # 按词频大小取样本前10000个词
input_dim = num_words # 词库大小(必须>=num_words)
maxlen = 25 # 序列长度
output_dim = 40 # 词向量维度
batch_size = 128
epochs = 2
"""数据读取与处理"""
(x, y), _ = load_data(num_words=num_words)
x = pad_sequences(x, maxlen)
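# Sanity check: padding turns the list of variable-length reviews into a 2-D array
print(x.shape) # (25000, 25)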
"""建模"""
model = Sequential()
# Embedding layer: vocabulary size, word vector dimension, fixed sequence length
model.add(Embedding(input_dim, output_dim, input_length=maxlen))
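# Output shape per batch: (batch_size, maxlen, output_dim)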
# Flatten: maxlen * output_dim values per sample
model.add(Flatten())
# Output layer: binary classification
model.add(Dense(units=1, activation='sigmoid'))
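# A single sigmoid unit gives P(positive), which is sufficient for 2 classes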
# RMSprop optimizer, binary cross-entropy loss
model.compile('rmsprop',