def delete_blank_lines(sentences):
    """Return the entries of *sentences* that contain non-whitespace text.

    Args:
        sentences: Iterable of strings.

    Returns:
        A new list holding, in their original order, the strings that have
        at least one non-whitespace character. Empty and whitespace-only
        strings are dropped; kept strings are returned unmodified.
    """
    # strip() is empty exactly when split() would return [], but it does not
    # build a throwaway list of tokens for every sentence.
    return [s for s in sentences if s.strip()]
# Drop empty and whitespace-only entries from n_datas
# (n_datas is expected to be defined earlier in the script).
no_line_datas = delete_blank_lines(n_datas)
# 去除数字 (remove digits)
# Matches one or more consecutive digit characters.
DIGIT_RE = re.compile(r'\d+')


def delete_digit(sentences):
    """Return a copy of *sentences* with every run of digits removed.

    Args:
        sentences: Iterable of strings.

    Returns:
        A new list with one string per input string, in the same order,
        where each match of ``DIGIT_RE`` has been replaced by ``''``.
    """
    return [DIGIT_RE.sub('', s) for s in sentences]


# Pattern.sub() only accepts a single string, so the list has to be processed
# element by element; the helper is defined above so it exists when called.
no_digit_datas = delete_digit(no_line_datas)
# 判断句子形式(简单句或者复杂句) — decide whether a sentence is simple or complex
# Sentence-ending characters in both Chinese (full-width) and English (ASCII)
# forms. The full-width question and exclamation marks are written as escapes
# so they cannot be silently flattened to their ASCII look-alikes.
STOPS = [
    '。',        # ideographic full stop
    '.',
    '\uff1f',   # full-width question mark
    '?',
    '\uff01',   # full-width exclamation mark
    '!',
]
def is_sample_sentence(sentence):
count = 0
for word in sentence:
if word in STOPS:
count += 1
if count > 1:
return False