# Learning exercise: reverse maximum-matching Chinese word segmentation in Python.
import xlrd
import codecs
import os
#读取所有需要分词的文件路径
# Collect the relative paths of all files that need to be segmented.
def eachFile(filepath):
    """Return a set of '/<name>' path suffixes for every entry in *filepath*.

    The leading '/' is kept so callers can concatenate the suffix onto a
    directory prefix (e.g. '分词文本' + '/a.txt').

    :param filepath: directory to list
    :return: set of strings of the form '/<entry name>'
    """
    # NOTE: the original called os.path.join with a single argument, which
    # is a no-op — building the string directly is equivalent and clearer.
    return {'/%s' % name for name in os.listdir(filepath)}
#读取文本内容
# Read the text content to be segmented.
def readtxt(filepath):
    """Read *filepath* as UTF-8 and return its lines (newlines preserved).

    :param filepath: path of the text file to read
    :return: list of lines as returned by file.readlines()
    """
    # The with-statement closes the file; the original's explicit
    # f.close() inside the block was redundant.
    with open(filepath, 'r', encoding='utf8') as f:
        return f.readlines()
#将分词的结果存储
# Append one segmentation result to the output file.
def writer_result(filepath, sentence):
    """Append *sentence* to *filepath* using UTF-8 encoding.

    Opens in append mode so successive calls accumulate output.

    :param filepath: output file path (created if missing)
    :param sentence: text to append (caller supplies any line terminator)
    """
    # The with-statement closes the file; the original's explicit
    # w.close() inside the block was redundant.
    with codecs.open(filepath, 'a', encoding='utf8') as w:
        w.write(sentence)
#读分词词典,词典中最长词长度
# Load the segmentation dictionary and the length of its longest word.
def get_seg_words(filepath):
    """Load the word dictionary from an Excel workbook.

    Reads column 1 (second column) of the first sheet, skipping the
    header row, exactly as the original did via col_values(1, 1).

    :param filepath: path to the .xlsx word list
    :return: (word_dir, max_index) — set of dictionary words and the
             length of the longest word (0 if the sheet is empty)
    """
    xl = xlrd.open_workbook(filepath)
    sheet = xl.sheet_by_index(0)
    words = sheet.col_values(1, 1)
    # set() + max(default=0) replace the original manual loop; behavior
    # is identical, including returning 0 for an empty word list.
    word_dir = set(words)
    max_index = max((len(word) for word in words), default=0)
    return word_dir, max_index
#读取停用词词典
# Load the stop-word dictionary.
def get_stop_words(filepath):
    """Load the stop-word list from an Excel workbook.

    Reads column 1 (second column) of the first sheet, skipping the
    header row — the same cells the original read via col_values(1, 1).

    :param filepath: path to the .xlsx stop-word list
    :return: set of stop words
    """
    xl = xlrd.open_workbook(filepath)
    sheet = xl.sheet_by_index(0)
    # set(...) replaces the original element-by-element loop.
    return set(sheet.col_values(1, 1))
# --- Driver: segment every file under 分词文本/ with reverse maximum matching ---
# (Indentation reconstructed; the scraped source had it stripped.)
paths_set = eachFile('分词文本')
seg_words, max_index = get_seg_words(r'词表/words.xlsx')
stop_words = get_stop_words(r'词表/stopwords.xlsx')
for path in paths_set:
    print('begain : %s' % path)
    sentences = readtxt('分词文本' + path)
    for sentence in sentences:
        sentence = sentence.strip()
        # BUG FIX: for a blank line the original while-loop never
        # terminated (the inner range was empty, so start_index stayed 1
        # and end_index never changed). Emit an empty result line instead.
        if not sentence:
            writer_result('result' + path, '\r\n')
            continue
        start_index = 1
        end_index = len(sentence)
        result_sentence = ''
        # Reverse maximum matching: repeatedly take the longest dictionary
        # word (or stop word, or single character) ending at end_index,
        # then continue leftward from its start.
        while start_index > 0:
            for start_index in range(max(end_index - max_index, 0), end_index, 1):
                candidate = sentence[start_index:end_index]
                if candidate in stop_words:
                    # Stop words are dropped from the output.
                    break
                elif candidate in seg_words or end_index == start_index + 1:
                    # Prepend, since we scan the sentence right-to-left.
                    # (Renamed from 'str', which shadowed the builtin.)
                    result_sentence = candidate + '/' + result_sentence
                    break
            end_index = start_index
        writer_result('result' + path, result_sentence)
        writer_result('result' + path, '\r\n')
# (end of script — trailing blog-page artifacts removed)



