Python3批量转换文本文件编码 |
002 | try: |
003 | from chardet.universaldetector import UniversalDetector |
004 | IsAuto = True |
005 | except ImportError: |
006 | IsAuto = False |
007 | import os |
008 | import os.path |
009 | import glob |
010 |
def Convert_Auto( filename,out_enc="utf-8" ):
    r''' Re-encode a text file in place, auto-detecting its current encoding.

    Needs the chardet library (uses the module-level UniversalDetector).
    Input Parameter:
        filename: full path and file name, e.g. c:\dir1\file.txt
        out_enc: new encoding. Default is 'utf-8'
    Output Parameter
        None. A "Success: ..." or "Error: ..." line is printed instead.

    The file is only rewritten after the whole content has been decoded and
    re-encoded in memory, so a failed conversion leaves it untouched.'''
    # Placeholder so the error message is printable even when the failure
    # happens before the encoding has been detected (e.g. file not found).
    in_enc = "unknown"
    try:
        with open(filename, 'rb') as src:
            raw = src.read()

        # The original prefixed a space to the sample, apparently so that an
        # empty file still gives the detector something to classify; keep
        # that behavior for empty input only.
        sample = raw or b' '
        detector = UniversalDetector()
        # Feed in chunks and stop as soon as the detector has made up its mind.
        for start in range(0, len(sample), 4096):
            detector.feed(sample[start:start + 4096])
            if detector.done:
                break
        detector.close()

        detected = detector.result['encoding']
        if detected is None:
            # The detector could not decide; decoding with None would raise TypeError.
            raise ValueError("encoding could not be detected")
        in_enc = detected

        # 'ignore' silently drops bytes that are invalid in in_enc (as the
        # original did). Encoding happens BEFORE the file is reopened so an
        # unknown/unsuitable out_enc cannot truncate the file.
        encoded = raw.decode(in_enc, 'ignore').encode(out_enc)

        # Binary write: no newline translation, so CRLF files keep their
        # line endings byte-for-byte on every platform.
        with open(filename, 'wb') as dst:
            dst.write(encoded)
    except (OSError, LookupError, ValueError):
        # OSError: read/write failure; LookupError: unknown codec name;
        # ValueError: undetectable encoding or UnicodeError while transcoding.
        print ("Error: "+filename+" FAIL to converted from "+ in_enc+" to "+out_enc+" !" )
        return
    print ("Success: "+filename+" converted from "+ in_enc+" to "+out_enc+" !")
037 |
def Convert_Manu( filename,in_enc='gbk', out_enc="utf-8" ):
    r''' Re-encode a text file in place using a manually specified input encoding.

    Input Parameter:
        filename: full path and file name, e.g. c:\dir1\file.txt
        in_enc: current encoding. Default is 'gbk'
        out_enc: new encoding. Default is 'utf-8'
    Output Parameter
        None. A "Success: ..." or "Error: ..." line is printed instead.

    The file is only rewritten after the whole content has been decoded and
    re-encoded in memory, so a failed conversion leaves it untouched.'''
    print ("convert " + filename)
    try:
        with open(filename, 'rb') as src:
            raw = src.read()

        # 'ignore' silently drops bytes that are invalid in in_enc (as the
        # original did). Encoding happens BEFORE the file is reopened so an
        # unknown/unsuitable out_enc cannot truncate the file.
        encoded = raw.decode(in_enc, 'ignore').encode(out_enc)

        # Binary write: no newline translation, so CRLF files keep their
        # line endings byte-for-byte on every platform.
        with open(filename, 'wb') as dst:
            dst.write(encoded)
    except (OSError, LookupError, UnicodeError):
        # OSError: read/write failure; LookupError: unknown codec name;
        # UnicodeError: text not representable in out_enc.
        print ("Error: "+filename+" FAIL to converted from "+ in_enc+" to "+out_enc+" !" )
        return
    print ("Success: "+filename+" converted from "+ in_enc+" to "+out_enc+" !")
058 |
059 |
def explore(dir, IsLoopSubDIR=True):
    '''Convert the encoding of every '.txt' file found in a folder to utf-8.

    Files are converted with Convert_Auto when the module-level IsAuto flag
    is true, otherwise with Convert_Manu assuming the input is 'gbk'.
    Input:
        dir : Folder handed to the file-listing helper
        IsLoopSubDIR: True -- list files with getSubFileList (sub folders too)
                      False-- list files with getCurrFileList (this level only)
    Output:
        NONE
    '''
    # Pick the listing helper once, then walk whatever it returns.
    list_files = getSubFileList if IsLoopSubDIR else getCurrFileList
    for path in list_files(dir, '.txt'):
        if not IsAuto:
            Convert_Manu(path, 'gbk', 'utf-8')
            continue
        Convert_Auto(path, 'utf-8')
078 |
079 | |
def getSubFileList(dir, suffix=''):
    '''Get all files with the specified suffix under a folder (sub folders included).
    Input:
        dir : Folder to search. An empty value falls back to the current
              working directory.
        suffix : default to blank, means select all files.
    Output:
        File list; each entry is the walked folder path joined with the file name.
    '''
    # The original ignored `dir` and always walked os.getcwd(); honor the
    # argument, keeping the working directory only as the fallback.
    top = dir or os.getcwd()
    flist = []
    for root, _subdirs, files in os.walk(top):
        flist.extend(os.path.join(root, name) for name in files if name.endswith(suffix))
    return flist
094 |
def getCurrFileList(dir, suffix=''):
    '''Get all files with the specified suffix directly inside a folder (no sub folders).
    Input:
        dir : Folder to search. An empty value falls back to the current
              working directory.
        suffix : default to blank, means select all files. It is appended
                 to a '*' glob pattern.
    Output:
        File list; each entry is the folder path joined with the file name.
        Names starting with '.' are not matched by the '*' pattern.
    '''
    # The original ignored `dir` and always globbed the working directory;
    # honor the argument, keeping the working directory only as the fallback.
    top = dir or os.getcwd()
    # Escape the folder part so characters like '[' in its name are literal;
    # '*' + '' is just '*', so no special case is needed for a blank suffix.
    pattern = os.path.join(glob.escape(top), '*' + suffix)
    # glob also matches directories; keep regular files only, like os.walk's
    # `files` in the recursive variant.
    return [path for path in glob.glob(pattern) if os.path.isfile(path)]
111 | |
112 | |
def main():
    '''Run explore on the current working directory, sub folders included.'''
    start_dir = os.getcwd()
    explore(start_dir, True)


if __name__ == "__main__":
    main()

本文提供了一种Python脚本,用于批量转换文本文件的编码,包括自动检测和手动指定编码两种方式。脚本适用于不同目录层级下的文件,并能够处理各种常见的编码问题。
431

被折叠的 条评论
为什么被折叠?



