I previously did text classification with classic machine-learning models (logistic regression and Naive Bayes), which felt a bit too simple, so today let's try a character-level CNN for text classification. The original paper is Character-level Convolutional Networks for Text Classification.
For machine-learning-based text classification, see my two earlier posts:
NLP in practice: THUCNews text classification with sklearn + logistic regression (Python)
NLP in practice: THUCNews text classification with sklearn + TfidfVectorizer/CountVectorizer + Naive Bayes (Python)
See those two articles for how to download the dataset; the complete code for this article is here.
Data Preparation
This step covers: reading the raw files, building a vocabulary of size 5000 with character-to-id mappings, vectorizing the labels, converting contents and labels to id form, and constructing data batches.
from collections import Counter
import numpy as np
import tensorflow.keras as kr
def open_file(filename, mode='r'):
    """
    mode: 'r' or 'w' (read/write)
    """
    return open(filename, mode, encoding='utf-8', errors='ignore')
def read_file(filename):
    """Read contents and labels from a data file; returns two lists."""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                # Each line is "<label>\t<content>"; split on the tab character
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))  # split the text into characters
                    labels.append(label)
            except ValueError:
                continue  # skip malformed lines
    return contents, labels
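As a quick sanity check, here is a minimal round trip through read_file with two made-up sample lines (not actual dataset content); the real cnews files follow the same one-sample-per-line, tab-separated format:
sample = '体育\t这是一条体育新闻示例\n科技\t这是一条科技新闻示例\n'
with open_file('demo.txt', mode='w') as f:  # uses the helper defined above
    f.write(sample)
contents, labels = read_file('demo.txt')
print(labels)           # ['体育', '科技']
print(contents[0][:3])  # first three characters of the first document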
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and store it to disk."""
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
    counter = Counter(all_data)  # character frequency counts (bag of characters)
    count_pairs = counter.most_common(vocab_size - 1)  # top-n most frequent characters
    words, _ = list(zip(*count_pairs))  # keep the characters, drop the counts
    # Prepend <PAD> (id 0) so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')
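For reference, a small sketch of building and inspecting the vocabulary; the paths data/cnews.train.txt and data/cnews.vocab.txt are assumptions, so adjust them to wherever you placed the dataset:
build_vocab('data/cnews.train.txt', 'data/cnews.vocab.txt', vocab_size=5000)  # assumed paths
with open_file('data/cnews.vocab.txt') as f:
    top = f.read().strip().split('\n')[:5]
print(top)  # '<PAD>' first, then the most frequent characters in the training set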
def read_vocab(vocab_dir):
    """Read the vocabulary file and build the word-to-id mapping."""
    with open_file(vocab_dir) as fp:
        words = [line.strip() for line in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
def read_category():
    """Return the (fixed) list of categories and their id mapping."""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
def to_words(content, words):
    """Convert id-encoded content back to text."""
    return ''.join(words[x] for x in content)
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file to its id representation."""
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        # Map each character to its vocabulary id, skipping out-of-vocabulary characters
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # Use Keras pad_sequences to pad/truncate every text to the same length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # One-hot encode the labels
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
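Putting it all together, here is a minimal end-to-end sketch of the data pipeline; again, the data/cnews.*.txt paths are assumptions based on the dataset from the earlier articles:
import os

train_dir = 'data/cnews.train.txt'  # assumed path, adjust to your setup
vocab_dir = 'data/cnews.vocab.txt'

if not os.path.exists(vocab_dir):  # build the vocabulary once from the training set
    build_vocab(train_dir, vocab_dir, vocab_size=5000)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, max_length=600)
print(x_train.shape)  # (num_samples, 600): character ids, zero-padded with <PAD>
print(y_train.shape)  # (num_samples, 10):  one-hot labels
# to_words reverses the mapping; drop the <PAD> ids (0) first
print(to_words([i for i in x_train[0] if i != 0], words))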