# English text preprocessing.
#
# Tweet-aware tokenizer. The original note says the goal is to split on
# whitespace, drop @user handles and keep emoticons / special symbols.
# NOTE(review): presumably nltk's TweetTokenizer; with default arguments it
# does not strip handles by itself (strip_handles=True would) -- confirm.
tokenizer = TweetTokenizer()

# Report progress roughly every 10% of the corpus. Clamp to 1 so that a corpus
# with fewer than five reviews does not end up computing `counter % 0`.
progress_step = max(1, round(len(reviews) / 10))

# NOTE(review): `reviews`, `cleaned_reviews` and `path_to_IMDB` are not
# defined in this snippet; they are assumed to exist already.
for counter, rev in enumerate(reviews):
    # Strip HTML markup.
    temp = BeautifulSoup(rev)
    text = temp.get_text()
    # Collapse runs of spaces.
    text = re.sub(' +', ' ', text)
    # Replace punctuation with a space. The original line was missing the
    # closing quote of the pattern and stored the result in an unused `test`
    # variable, so the substitution never reached `text`.
    text = re.sub(r'[()\[\]{}.,;:!?\<=>?@_^#$%"&*-]', ' ', text)
    # Strip leading and trailing white space.
    text = text.strip()
    # Tokenize.
    tokens = tokenizer.tokenize(text)
    cleaned_reviews.append(tokens)
    if counter % progress_step == 0:
        print(counter, '/', len(reviews), 'reviews cleaned')

# Flatten the per-review token lists into a single list.
all_tokens = [token for sublist in cleaned_reviews for token in sublist]

# Rank words by corpus frequency, most frequent first.
counts = dict(Counter(all_tokens))
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

# Give every word an index based on its frequency rank; the most frequent
# word gets index 1.
word_to_index = {word: idx for idx, (word, _) in enumerate(sorted_counts, start=1)}

with open(path_to_IMDB + 'word_to_index_new.json', 'w') as my_file:
    json.dump(word_to_index, my_file, sort_keys=True, indent=4)
词共现矩阵的构建
https://github.com/urgedata/pythondata/blob/master/Text%20Analytics/ericbrown.ipynb
# Chinese text preprocessing
# jieba word segmentation and stop-word removal
# jieba can load a user-defined dictionary, one entry per line.
# NOTE(review): the original note gave the column order as
# "word, POS tag, frequency"; jieba's README documents it as
# "word [frequency] [POS tag]" (space-separated, last two optional) --
# verify data/userdict.txt against that.
jieba.load_userdict('data/userdict.txt')
#定义一个keyword类
class keyword(object):
    """Helpers for Chinese text: stop-word loading, cleaning + jieba
    segmentation, and noun extraction by part-of-speech tag."""

    # Characters deleted before segmentation: whitespace, ASCII and
    # full-width punctuation, and the digit / circled-digit characters listed
    # in the pattern. Raw string so the regex escapes are not (invalid)
    # string escapes; compiled once instead of on every call.
    _SYMBOLS = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——;!,”。《》,。:“?、~@#¥%……&*()1234567①②③④)]+")

    # Tokens that must never be kept: line break, plain space, full-width
    # space (U+3000) and non-breaking space (U+00A0).
    _BLANK_TOKENS = frozenset(['\r\n', ' ', '\u3000', '\xa0'])

    def Chinese_Stopwords(self, path='data/stopWord.txt'):
        """Load the stop-word list.

        Args:
            path: UTF-8 text file with whitespace-separated stop words
                (several words per line are allowed).

        Returns:
            list of stop words in file order.
        """
        stopword = []
        # The original passed 'utf-8' as the third positional argument of
        # open(), which is `buffering` and raises TypeError on Python 3; it
        # also opened the file read/write ('r+') although it is only read.
        with open(path, 'r', encoding='utf-8') as cfp:
            for line in cfp:
                stopword.extend(line.split())
        return stopword

    def Word_cut_list(self, word_str):
        """Clean `word_str`, segment it with jieba and drop stop words.

        Whitespace and the symbols in `_SYMBOLS` are deleted (not replaced by
        a space) before segmentation. The stop-word file is re-read on every
        call.

        Returns:
            list of remaining tokens in their original order.
        """
        # The original first normalised whitespace runs / newlines / tabs to a
        # single space; that was redundant because this substitution removes
        # every whitespace character anyway.
        word_str = self._SYMBOLS.sub('', word_str)
        # A set gives O(1) membership tests instead of scanning a list per token.
        chinese_stopwords = set(self.Chinese_Stopwords())
        blank_tokens = self._BLANK_TOKENS
        return [
            word
            for word in jieba.cut(word_str)
            if word not in chinese_stopwords and word not in blank_tokens
        ]

    # Noun extraction
    def Word_pseg(self, word_str):
        """Return the words of `word_str` that are user-dictionary entries or nouns.

        Uses `pseg.cut`, whose items expose `.word` and `.flag` (POS tag).
        """
        word_list = []
        for wds in pseg.cut(word_str):
            # Per the original note, user-dictionary words without an explicit
            # POS tag come back with flag 'x'.
            # NOTE(review): `wds.word != 'ns'` compares the *word* with the
            # string 'ns'; kept as in the original -- confirm it was not meant
            # to test the flag.
            is_custom_word = (
                wds.flag == 'x' and wds.word != ' ' and wds.word != 'ns'
            )
            # Any noun tag (starts with 'n') except those starting with 'nr'.
            is_noun = wds.flag.startswith('n') and not wds.flag.startswith('nr')
            # Same grouping as the original `A and B and C or D and E`,
            # made explicit.
            if is_custom_word or is_noun:
                word_list.append(wds.word)
        return word_list
import tensorflow.contrib.keras as kr
def read_file(filename):
    """Read a labelled text file.

    Each usable line has the form "label<TAB>content". Lines that do not
    split into exactly two tab-separated fields, or whose content is empty,
    are skipped.

    Returns:
        (contents, labels): `contents[i]` is the content split into a list of
        single characters, `labels[i]` is the matching label passed through
        `native_content`.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if not content:
                    continue
                # Convert the label before appending anything, so a failure
                # here cannot leave `contents` one element longer than `labels`.
                label = native_content(label)
            except ValueError:
                # Wrong number of fields, or the label could not be converted
                # (UnicodeError is a ValueError). Malformed lines are skipped
                # on purpose; anything else is no longer silently swallowed
                # as it was by the original bare `except`.
                continue
            contents.append(list(content))  # list() splits the text into characters
            labels.append(label)
    return contents, labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and write it to disk.

    Args:
        train_dir: path of the training file, read with `read_file`.
        vocab_dir: path of the vocabulary file to write (one entry per line).
        vocab_size: total number of entries including the leading '<PAD>'.
    """
    data_train, _ = read_file(train_dir)

    # Count characters sample by sample instead of first copying the whole
    # corpus into one flat list.
    counter = Counter()
    for content in data_train:
        counter.update(content)

    # Keep the most frequent characters, leaving one slot for '<PAD>'.
    count_pairs = counter.most_common(vocab_size - 1)
    # '<PAD>' comes first and is used to pad all texts to the same length.
    # A comprehension (rather than unpacking `zip(*count_pairs)`) also works
    # when the training data is empty.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    # Close the file deterministically; the original never closed the handle.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
    """Load the vocabulary file.

    Returns:
        (words, word_to_id): the stripped entries in file order, and a dict
        mapping each entry to its position in that list.
    """
    with open_file(vocab_dir) as vocab_file:
        # Per the original note, native_content converts each entry to
        # unicode when running under Python 2.
        words = [native_content(entry.strip()) for entry in vocab_file]
    word_to_id = {word: index for index, word in enumerate(words)}
    return words, word_to_id
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a labelled text file into id arrays.

    Args:
        filename: file read with `read_file`.
        word_to_id: mapping from character to integer id; characters that are
            not in the mapping are dropped.
        cat_to_id: mapping from label to integer id.
        max_length: length passed to `pad_sequences`.

    Returns:
        (x_pad, y_pad): the padded id sequences and the one-hot labels.
    """
    contents, labels = read_file(filename)

    data_id = []
    label_id = []
    for index, content in enumerate(contents):
        known_ids = [word_to_id[char] for char in content if char in word_to_id]
        data_id.append(known_ids)
        label_id.append(cat_to_id[labels[index]])

    # Pad every text to a fixed length with the Keras helper.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # Turn the label ids into a one-hot representation.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
# Build the character vocabulary.
# NOTE(review): `path`, `sentences`, `maxlen` and `next_chars` are not defined
# in this snippet; they are assumed to exist already.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
# Sort the distinct characters so the char <-> index maps are identical on
# every run; the iteration order of a set of strings is not stable across
# interpreter runs.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Input generation for an LSTM in Keras: with the vocabulary in place,
# one-hot encode the data.
print('Vectorization...')
# `np.bool` was only an alias of the builtin and has been removed from NumPy;
# the builtin `bool` yields the same dtype.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
print(X)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
# Filter by word length, drop stop words, keep Chinese-only words.
# Matches strings consisting only of characters in U+4E00..U+9FA5.
# NOTE: `$` also matches just before a single trailing newline.
_CHINESE_ONLY_RULE = re.compile(r"^[\u4e00-\u9fa5]+$")


def is_fine_word(word, min_length=2):
    """Return True when `word` should be kept.

    A word is kept when it has at least `min_length` characters, is not in
    the module-level `STOP_WORDS`, and matches `_CHINESE_ONLY_RULE`.
    """
    # The length test stays first so short words are rejected before the
    # stop-word lookup and the regex run.
    return (
        len(word) >= min_length
        and word not in STOP_WORDS
        and _CHINESE_ONLY_RULE.search(word) is not None
    )
#逐字切分的处理方式,同时去掉一些常见的虚词,如“之”、“乎”、“者”、“也”。
def singCut(text):
    """Split `text` item by item and drop the items that end up empty.

    Every item of `text` (each character when `text` is a string) has line
    breaks, '。' and the characters of the final strip set removed from both
    ends; note that '|' and the space are members of that set too.

    Returns:
        list of the non-empty cleaned items, in order.
    """
    pieces = []
    for item in text:
        cleaned = item.strip('\n').strip('\r').strip('。').strip(',|)|:|{|}|“|” |(|\n')
        if cleaned:  # skip items that were stripped down to ''
            pieces.append(cleaned)
    return pieces
text = '云横秦岭家何在,雪拥蓝关马不前'
# Common function words, stored as a single '|'-separated string.
stopwords = '而|何|乎|乃|且|其|若|所|为'
# Strip punctuation and drop function words.
# `text` is a plain string here, so the outer loop walks it character by
# character and every kept character ends up in its own one-element list.
# `i not in stopwords` is a substring test against the string above.
# NOTE(review): the strip set contains no comma, so the comma in `text`
# survives, and `singCut` is not used here -- confirm the intended input.
poem = [
    kept
    for kept in (
        [i.strip(') |: |?|{|}| “|” (| \n\n\r|。') for i in tex if i not in stopwords]
        for tex in text
    )
    if kept  # drop the empty lists left behind by removed function words
]
预处理(去特殊符号、去停用词、分词)
把词转成index(word to index), 把原文都变成数值
去掉词频排名前 N 的高频词,以及词频排名在前 M 名之外的低频词
对每篇进行 truncation and padding
用 word2vec 训练得到每个词的 embedding(w2v_model[word]),作为 CNN embedding 层的初始值(在 Keras 中训练时需要先把每个词转成 embedding)
训练CNN模型
https://github.com/Tixierae/deep_learning_NLP
构建词汇表
categories转成id, 读取词汇表,构建word_to_id字典(字符级别)
读入训练数据,预处理,将文本pad到固定长度
批次训练CNN(tensorflow内部会自动初始化embedding)
预测
https://github.com/gaussic/text-classification-cnn-rnn
引用链接:
https://www.jianshu.com/p/aea87adee163