# Learning exercise: reverse maximum-matching Chinese word segmentation in Python.
import xlrd
import codecs
import os
#读取所有需要分词的文件路径
# Collect the relative paths of all files that need to be segmented.
def eachFile(filepath):
    """Return a set of '/<name>' path suffixes for every entry in *filepath*.

    The leading '/' is kept so callers can concatenate the suffix onto a
    directory prefix (e.g. '分词文本' + '/a.txt').

    :param filepath: directory to list
    :return: set of strings of the form '/<entry name>'
    """
    # NOTE: the original called os.path.join with a single argument, which
    # is a no-op — building the string directly is equivalent and clearer.
    return {'/%s' % name for name in os.listdir(filepath)}
#读取文本内容
# Read the text content to be segmented.
def readtxt(filepath):
    """Read *filepath* as UTF-8 and return its lines (newlines preserved).

    :param filepath: path of the text file to read
    :return: list of lines as returned by file.readlines()
    """
    # The with-statement closes the file; the original's explicit
    # f.close() inside the block was redundant.
    with open(filepath, 'r', encoding='utf8') as f:
        return f.readlines()
#将分词的结果存储
# Append one segmentation result to the output file.
def writer_result(filepath, sentence):
    """Append *sentence* to *filepath* using UTF-8 encoding.

    Opens in append mode so successive calls accumulate output.

    :param filepath: output file path (created if missing)
    :param sentence: text to append (caller supplies any line terminator)
    """
    # The with-statement closes the file; the original's explicit
    # w.close() inside the block was redundant.
    with codecs.open(filepath, 'a', encoding='utf8') as w:
        w.write(sentence)
#读分词词典,词典中最长词长度
# Load the segmentation dictionary and the length of its longest word.
def get_seg_words(filepath):
    """Load the word dictionary from an Excel workbook.

    Reads column 1 (second column) of the first sheet, skipping the
    header row, exactly as the original did via col_values(1, 1).

    :param filepath: path to the .xlsx word list
    :return: (word_dir, max_index) — set of dictionary words and the
             length of the longest word (0 if the sheet is empty)
    """
    xl = xlrd.open_workbook(filepath)
    sheet = xl.sheet_by_index(0)
    words = sheet.col_values(1, 1)
    # set() + max(default=0) replace the original manual loop; behavior
    # is identical, including returning 0 for an empty word list.
    word_dir = set(words)
    max_index = max((len(word) for word in words), default=0)
    return word_dir, max_index
#读取停用词词典
# Load the stop-word dictionary.
def get_stop_words(filepath):
    """Load the stop-word list from an Excel workbook.

    Reads column 1 (second column) of the first sheet, skipping the
    header row — the same cells the original read via col_values(1, 1).

    :param filepath: path to the .xlsx stop-word list
    :return: set of stop words
    """
    xl = xlrd.open_workbook(filepath)
    sheet = xl.sheet_by_index(0)
    # set(...) replaces the original element-by-element loop.
    return set(sheet.col_values(1, 1))
# --- Driver: segment every file under 分词文本/ with reverse maximum matching ---
# (Indentation reconstructed; the scraped source had it stripped.)
paths_set = eachFile('分词文本')
seg_words, max_index = get_seg_words(r'词表/words.xlsx')
stop_words = get_stop_words(r'词表/stopwords.xlsx')
for path in paths_set:
    print('begain : %s' % path)
    sentences = readtxt('分词文本' + path)
    for sentence in sentences:
        sentence = sentence.strip()
        # BUG FIX: for a blank line the original while-loop never
        # terminated (the inner range was empty, so start_index stayed 1
        # and end_index never changed). Emit an empty result line instead.
        if not sentence:
            writer_result('result' + path, '\r\n')
            continue
        start_index = 1
        end_index = len(sentence)
        result_sentence = ''
        # Reverse maximum matching: repeatedly take the longest dictionary
        # word (or stop word, or single character) ending at end_index,
        # then continue leftward from its start.
        while start_index > 0:
            for start_index in range(max(end_index - max_index, 0), end_index, 1):
                candidate = sentence[start_index:end_index]
                if candidate in stop_words:
                    # Stop words are dropped from the output.
                    break
                elif candidate in seg_words or end_index == start_index + 1:
                    # Prepend, since we scan the sentence right-to-left.
                    # (Renamed from 'str', which shadowed the builtin.)
                    result_sentence = candidate + '/' + result_sentence
                    break
            end_index = start_index
        writer_result('result' + path, result_sentence)
        writer_result('result' + path, '\r\n')
# (end of script — trailing blog-page artifacts removed)



