【Python小项目】文档字词整理

最新推荐文章于 2024-02-01 17:27:04 发布

原创最新推荐文章于 2024-02-01 17:27:04 发布 · 320 阅读

1 ·

CC 4.0 BY-SA版权

Python笔记与积累专栏收录该内容

14 篇文章

订阅专栏

博客围绕文档处理问题展开，先提出去除文档中多余空格、空行等符号，并按索引分段的需求。接着分解问题，阐述去除多余符号和按索引分段的步骤及优化方法，还提及应用到文档时需解决的索引范围和正则表达式编写问题，最后给出扩展思路，如用命令行参数获取路径和构建索引种类。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

问题提出： 一个文档中，有多余的空格、空行、Tab制表符、换行符，现在需要去除这些多余符号，并且根据该文档索引(A)(B)等分段，一个索引引导一个段落。
在这里插入图片描述
问题分解：

1、去除多余空格、空行、Tab制表符、换行符：
step1：需要先读取文件流open(file)
step2：将流转成字符串f.read()
step3：将字符串打散，分裂成列表split()，每个单词成为一个元素，剔除多余元素
step4：新列表拼接成新的字符串join
step5：写入文件write

# Read the document and break it into words.
# split() with no separator treats any run of spaces, tabs and newlines as a
# single delimiter, so blank lines disappear together with the extra spacing.
with open('cup.txt') as source:
    words = source.read().split()

# Re-join the words with exactly one space between neighbours.
new_str = ' '.join(words)

# Overwrite the document with the cleaned text.
with open('cup.txt', 'w') as target:
    target.write(new_str)

优化:
（参考https://blog.youkuaiyun.com/weixin_40844116/article/details/88419016）

#code1
# Open the document once in read/update mode so it can be rewritten in place.
with open('cup.txt', 'r+') as doc:
    # split() without a separator drops every run of spaces, tabs and
    # newlines (blank lines included); join() puts single spaces back.
    new_str = ' '.join(doc.read().split())
    # Rewind and empty the file before writing, so no stale tail is left
    # behind when the cleaned text is shorter than the original.
    doc.seek(0)
    doc.truncate()
    doc.write(new_str)

这样就将杂乱的文本整合成一整段纯净的字符串

2、找到索引，以索引为标志，把每个索引前一个位置的空格替换成换行符——需要用到re模块。
step1:确定正则表达式
step2:利用编组，找到序列所在的位置,定位到前一位
step3:将定位好的位置替换成换行符

先写一个小模型

import re

# Small model: put every index except the first one on a new line.
string = '(A) aa (B) bb (C)cc'
# Raw string, so \( and \) reach the regex engine as escaped parentheses
# instead of being invalid string escapes. Each index sits in its own group;
# the lazy .*? skips the paragraph text between two indexes.
compile_string = r'(\(A\)).*?(\(B\)).*?(\(C\))'
pat = re.compile(compile_string)
m = pat.match(string)
print(m.start(0), m.start(1), m.start(2), m.start(3))
# Positions to overwrite with a newline: the character just before each index,
# starting from group 2 because the first index needs no line break.
index = [m.start(i) - 1 for i in range(2, 4)]
print(index)
# Strings are immutable, so edit a list of characters and join it back.
file_list = list(string)
for i in index:
    file_list[i] = '\n'
new_string = ''.join(file_list)
new_string
----------------------------------------------------
0 0 7 14
[6, 13]
'(A) aa\n(B) bb\n(C)cc'

应用到文档，需要解决的问题：
1.具体索引从A到几？如果实际文档索引是A-F，而正则表达式只写到A-E，那么只会匹配到E，(F)之前不会换行；如果正则表达式写到A-G，而文档中没有(G)，则整体匹配失败，match()返回None，后续取位置时就会报错。比较稳妥的方法是通过input()由人工输入末尾索引。
2.索引不确定，正则表达式怎么写？

# Build the run of letters from 'A' to the entered letter using code points.
end_index = input('Please enter the ending index:')
first_code, last_code = ord('A'), ord(end_index)
# range() excludes its stop value, hence the + 1 to include the last letter.
index_list_orig = list(map(chr, range(first_code, last_code + 1)))
index_list_orig
-------------------------------------------------------------
Please enter the ending index:I
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']

参考：https://blog.youkuaiyun.com/weixin_40844116/article/details/84038411
既然索引已经确定了，那么根据这个索引，利用序列的方法，可以写出对应的正则表达式

end_index = input('Please enter the ending index:')
# One capturing group per index, modelled on the hand-written pattern.
# Raw strings keep \( and \) as regex escapes rather than invalid string escapes.
index_list = [r'(\(' + chr(i) + r'\))' for i in range(ord('A'), ord(end_index) + 1)]
# The lazy .*? between groups skips the paragraph text separating two indexes.
compile_string = '.*?'.join(index_list)
compile_string
---------------------------------------------------------------
Please enter the ending index:I
'(\\(A\\)).*?(\\(B\\)).*?(\\(C\\)).*?(\\(D\\)).*?(\\(E\\)).*?(\\(F\\)).*?(\\(G\\)).*?(\\(H\\)).*?(\\(I\\))'

依照模型，代码如下：

#code2
import re

end_index = input('Please enter the ending index:')
# One capturing group per index, (A) up to the entered letter. Raw strings keep
# \( and \) as regex escapes rather than invalid string escapes.
index_list = [r'(\(' + chr(i) + r'\))' for i in range(ord('A'), ord(end_index) + 1)]
compile_string = '.*?'.join(index_list)
pat = re.compile(compile_string)

# Read the document.
with open('mess.txt') as f:
    file_string = f.read()

# match() anchors at the start of the text and returns None when any index of
# the sequence is missing; fail with a clear message before touching the file.
m = pat.match(file_string)
if m is None:
    raise ValueError('index sequence (A)..(' + end_index + ') not found at the start of mess.txt')

# Group 1 is the first index and needs no line break; for groups 2..last the
# character just before the index is overwritten with a newline.
index = [m.start(i) - 1 for i in range(2, len(index_list) + 1)]
# Strings are immutable, so edit a list of characters and join it back.
file_list = list(file_string)
for i in index:
    file_list[i] = '\n'
new_string = ''.join(file_list)

# Write the document.
with open('mess.txt', 'w') as f:
    f.write(new_string)

依次执行code1和code2就可以实现所需要的功能（注意两段代码必须指向同一个文件，上文示例中分别写的是cup.txt和mess.txt），但这样文档会被多次打开，可以合并为只打开一次：

import re

end_index = input('Please enter the ending index:')
# Build the regular expression: one capturing group per index, joined by a lazy
# .*? that skips the paragraph text. Raw strings keep \( and \) as regex escapes.
index_list = [r'(\(' + chr(i) + r'\))' for i in range(ord('A'), ord(end_index) + 1)]
compile_string = '.*?'.join(index_list)
pat = re.compile(compile_string)

with open('cup.txt', 'r+') as f:
    file_string = f.read()
    # split() without a separator drops every run of spaces, tabs and newlines
    # (blank lines included); join() then yields one clean single-line text.
    str_list = file_string.split()
    new_str = ' '.join(str_list)

    # match() returns None when an index of the sequence is missing; stop with
    # a clear message before the file has been modified.
    m = pat.match(new_str)
    if m is None:
        raise ValueError('index sequence (A)..(' + end_index + ') not found at the start of cup.txt')

    # Groups 2..last get a line break in front of them; group 0 (whole match)
    # and group 1 (first index) do not.
    index = [m.start(i) - 1 for i in range(2, len(index_list) + 1)]
    # Strings are immutable, so edit a list of characters and join it back.
    file_list = list(new_str)
    for i in index:
        file_list[i] = '\n'
    new_string = ''.join(file_list)

    # Rewind, clear the old content, then write the re-segmented text.
    f.seek(0)
    f.truncate()
    f.write(new_string)

扩展：以上代码已经可以处理当前路径内指定文件的文档整理，实际业务中，文档并不固定，索引也不只(A)这一种
1.利用命令行参数实现交互，根据输入(ABPD_05)获得文档路径
目标文件…/ABPD/05/int.txt …/ABPD/05/prin.txt …/ABPD/05/eod.txt
(更复杂的情况参考os.walk()目录树生成器获取文档路径）
2.构建所有的索引种类，经过输入判别，确定完整索引

import os
import re
import sys

# Supported index families, each usable with or without parentheses:
# (A)/A, (a)/a, (I)/I, (i)/i. The FIRST index decides the family, because the
# last index alone is ambiguous ("I" is both a letter and a Roman numeral).
# Letter families run A..Z, AA..ZZ, AAA..ZZZ.
upper_list = [n * chr(i) for n in range(1, 4) for i in range(ord('A'), ord('Z') + 1)]
c = [n * chr(i) for n in range(1, 4) for i in range(ord('a'), ord('z') + 1)]
roman_upper = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']
# 40 Roman numerals built by prefixing 0-3 tens.
# NOTE: the last entry comes out as 'XXXX' rather than the canonical 'XL'.
roman_uppers = [i * 'X' + j for i in range(4) for j in roman_upper]
roman_lower = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
roman_lowers = [i * 'x' + j for i in range(4) for j in roman_lower]
# The lowercase-letter family `c` must be listed here; listing upper_list twice
# meant that lowercase indexes were never recognised.
index_type = [upper_list, c, roman_uppers, roman_lowers]

# Folder spec "<ticker>_<series>": taken from the first command-line argument
# when one is given, otherwise the default 'ABPD_05'. A spec without exactly
# one underscore fails the unpacking below with ValueError.
folder_spec = sys.argv[1] if len(sys.argv) > 1 else 'ABPD_05'
ticker, series = folder_spec.split('_')
scriptpath = os.getcwd()
path_to_text = os.path.join(scriptpath, ticker, series)

# Full file names; os.path.join picks the right separator on every platform.
int_file = os.path.join(path_to_text, 'int.txt')
prin_file = os.path.join(path_to_text, 'prin.txt')
eod_file = os.path.join(path_to_text, 'eod.txt')
files = [int_file, prin_file, eod_file]

for file in files:
    file_name = os.path.basename(file)
    # Ask for the first and last index of this document.
    begining_index_original = input('Please enter the begining index of ' + file_name + ':')
    ending_index_original = input('Please enter the ending index of ' + file_name + ':')
    # Strip the surrounding parentheses, if any, before looking the index up.
    begining_index = begining_index_original[1:-1] if '(' in begining_index_original else begining_index_original
    ending_index = ending_index_original[1:-1] if '(' in ending_index_original else ending_index_original

    # Find the family whose first element equals the first index.
    for family in index_type:
        if begining_index == family[0]:
            break
    else:
        # No family starts with this index: stop processing.
        print('not the first index')
        break
    # The last index must belong to the same family; stop with a message
    # instead of crashing inside list.index().
    if ending_index not in family:
        print('not a valid ending index')
        break
    index_list_original = family[:family.index(ending_index) + 1]

    # Put the parentheses back (escaped for the regex) when the document uses
    # them; one capturing group per index either way.
    # NOTE: without parentheses a group such as (B) matches that letter
    # anywhere in the text, so bare indexes are only reliable when the letters
    # do not also occur inside the paragraph text in sequence.
    if '(' in begining_index_original:
        index_list = [r'(\(' + i + r'\))' for i in index_list_original]
    else:
        index_list = ['(' + i + ')' for i in index_list_original]
    print(index_list)
    # The lazy .*? between groups skips the paragraph text between two indexes.
    compile_string = '.*?'.join(index_list)
    pat = re.compile(compile_string)

    with open(file, 'r+') as f:
        file_string = f.read()
        # split() without a separator drops every run of spaces, tabs and
        # newlines (blank lines included); join() yields one clean line.
        str_list = file_string.split()
        new_str = ' '.join(str_list)

        # match() returns None when an index of the sequence is missing; stop
        # with a clear message before the file has been modified.
        m = pat.match(new_str)
        if m is None:
            raise ValueError('index sequence not found at the start of ' + file)

        # Groups 2..last get a line break in front of them; group 0 (whole
        # match) and group 1 (first index) do not.
        index = [m.start(i) - 1 for i in range(2, len(index_list) + 1)]
        # Strings are immutable, so edit a list of characters and join it back.
        file_list = list(new_str)
        for i in index:
            file_list[i] = '\n'
        new_string = ''.join(file_list)

        # Rewind, clear the old content, then write the re-segmented text.
        f.seek(0)
        f.truncate()
        f.write(new_string)