I previously did text classification with classic machine-learning models (logistic regression and Naive Bayes), which felt a bit too simple, so today let's try a character-level CNN for text classification. The original paper is Character-level Convolutional Networks for Text Classification.
For machine-learning-based text classification, see my two earlier posts:
NLP in practice: THUCNews text classification with sklearn + logistic regression (Python)
NLP in practice: THUCNews text classification with sklearn + TfidfVectorizer/CountVectorizer + Naive Bayes (Python)
See those two articles for how to download the dataset; the complete code for this article is here.
Data Preparation
This step covers: reading the raw files, building a vocabulary of size 5000 with character-to-id mappings, vectorizing the labels, converting contents and labels to id form, and constructing data batches.
from collections import Counter
import numpy as np
import tensorflow.keras as kr
def open_file(filename, mode='r'):
    """
    mode: 'r' or 'w' (read/write)
    """
    return open(filename, mode, encoding='utf-8', errors='ignore')
def read_file(filename):
    """Read contents and labels from a data file; returns two lists."""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                # Each line is "<label>\t<content>"; split on the tab character
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))  # split the text into characters
                    labels.append(label)
            except ValueError:
                continue  # skip malformed lines
    return contents, labels
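As a quick sanity check, here is a minimal round trip through read_file with two made-up sample lines (not actual dataset content); the real cnews files follow the same one-sample-per-line, tab-separated format:
sample = '体育\t这是一条体育新闻示例\n科技\t这是一条科技新闻示例\n'
with open_file('demo.txt', mode='w') as f:  # uses the helper defined above
    f.write(sample)
contents, labels = read_file('demo.txt')
print(labels)           # ['体育', '科技']
print(contents[0][:3])  # first three characters of the first document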
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and store it to disk."""
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
    counter = Counter(all_data)  # character frequency counts (bag of characters)
    count_pairs = counter.most_common(vocab_size - 1)  # top-n most frequent characters
    words, _ = list(zip(*count_pairs))  # keep the characters, drop the counts
    # Prepend <PAD> (id 0) so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')
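For reference, a small sketch of building and inspecting the vocabulary; the paths data/cnews.train.txt and data/cnews.vocab.txt are assumptions, so adjust them to wherever you placed the dataset:
build_vocab('data/cnews.train.txt', 'data/cnews.vocab.txt', vocab_size=5000)  # assumed paths
with open_file('data/cnews.vocab.txt') as f:
    top = f.read().strip().split('\n')[:5]
print(top)  # '<PAD>' first, then the most frequent characters in the training set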
def read_vocab(vocab_dir):
    """Read the vocabulary file and build the word-to-id mapping."""
    with open_file(vocab_dir) as fp:
        words = [line.strip() for line in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
def read_category():
    """Return the (fixed) list of categories and their id mapping."""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
def to_words(content, words):
    """Convert id-encoded content back to text."""
    return ''.join(words[x] for x in content)
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file to its id representation."""
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        # Map each character to its vocabulary id, skipping out-of-vocabulary characters
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # Use Keras pad_sequences to pad/truncate every text to the same length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # One-hot encode the labels
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
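Putting it all together, here is a minimal end-to-end sketch of the data pipeline; again, the data/cnews.*.txt paths are assumptions based on the dataset from the earlier articles:
import os

train_dir = 'data/cnews.train.txt'  # assumed path, adjust to your setup
vocab_dir = 'data/cnews.vocab.txt'

if not os.path.exists(vocab_dir):  # build the vocabulary once from the training set
    build_vocab(train_dir, vocab_dir, vocab_size=5000)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, max_length=600)
print(x_train.shape)  # (num_samples, 600): character ids, zero-padded with <PAD>
print(y_train.shape)  # (num_samples, 10):  one-hot labels
# to_words reverses the mapping; drop the <PAD> ids (0) first
print(to_words([i for i in x_train[0] if i != 0], words))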