Python3批量转换文本文件编码

最新推荐文章于 2021-03-15 16:08:10 发布

转载 · 最新推荐文章于 2021-03-15 16:08:10 发布 · 552 阅读

Python 同时被 2 个专栏收录

131 篇文章

订阅专栏

编码

79 篇文章

订阅专栏

本文提供了一种Python脚本，用于批量转换文本文件的编码，包括自动检测和手动指定编码两种方式。脚本适用于不同目录层级下的文件，并能够处理各种常见的编码问题。

Python3批量转换文本文件编码



# -*- coding: utf-8 -*-
import glob
import os
import os.path

# chardet is an optional dependency. IsAuto records whether it could be
# imported; the rest of the script uses the flag to choose between automatic
# encoding detection and a manually specified source encoding.
try:
    from chardet.universaldetector import UniversalDetector
    IsAuto = True
except ImportError:
    IsAuto = False

def Convert_Auto(filename, out_enc="utf-8"):
    r"""Re-encode a text file in place, auto-detecting its current encoding.

    Needs the chardet library (``UniversalDetector`` must be importable at
    module level).

    Input Parameter:
        filename: full path and file name, e.g. c:\dir1\file.txt
        out_enc: new encoding. Defaults to 'utf-8'.
    Output Parameter:
        None. The outcome is reported with print(). I/O errors are caught
        and reported instead of being raised.
    """
    # Bound before the try block so the error message can always be built,
    # even when the very first open() fails.
    in_enc = None
    try:
        with open(filename, 'rb') as f:
            raw = f.read()

        # Feed the detector the real file content in chunks and stop as soon
        # as it is confident. Sniffing only a short prefix can label a file
        # as ASCII when its non-ASCII text starts later, and the 'ignore'
        # decode below would then silently drop that text.
        detector = UniversalDetector()
        detector.reset()
        for start in range(0, len(raw), 4096):
            detector.feed(raw[start:start + 4096])
            if detector.done:
                break
        detector.close()

        in_enc = detector.result['encoding']
        if in_enc is None:
            # Nothing was detected (e.g. an empty file): leave it untouched.
            print("Error: " + filename
                  + " FAIL to detect current encoding, file left unchanged !")
            return

        # NOTE: 'ignore' drops bytes that are invalid in the detected
        # encoding; kept for compatibility with the existing behaviour.
        new_content = raw.decode(in_enc, 'ignore')

        # Encode in memory first so an unknown or unsuitable out_enc raises
        # before the file is truncated. Writing bytes also keeps the original
        # line endings (no text-mode newline translation).
        encoded = new_content.encode(out_enc)
        with open(filename, 'wb') as f:
            f.write(encoded)
        print("Success: " + filename + " converted from " + in_enc
              + " to " + out_enc + " !")
    except IOError:
        print("Error: " + filename + " FAIL to converted from "
              + (in_enc or "unknown") + " to " + out_enc + " !")

def Convert_Manu(filename, in_enc='gbk', out_enc="utf-8"):
    r"""Re-encode a text file in place from a manually specified encoding.

    Input Parameter:
        filename: full path and file name, e.g. c:\dir1\file.txt
        in_enc: current encoding. Defaults to 'gbk'.
        out_enc: new encoding. Defaults to 'utf-8'.
    Output Parameter:
        None. The outcome is reported with print(). I/O errors are caught
        and reported instead of being raised.
    """
    try:
        print("convert " + filename)
        with open(filename, 'rb') as f:
            raw = f.read()

        # NOTE: 'ignore' drops bytes that are invalid in in_enc; kept for
        # compatibility with the existing behaviour.
        new_content = raw.decode(in_enc, 'ignore')

        # Encode in memory first so an unknown or unsuitable out_enc raises
        # before the file is truncated. Writing bytes also keeps the original
        # line endings (no text-mode newline translation).
        encoded = new_content.encode(out_enc)
        with open(filename, 'wb') as f:
            f.write(encoded)
        print("Success: " + filename + " converted from " + in_enc
              + " to " + out_enc + " !")
    except IOError:
        print("Error: " + filename + " FAIL to converted from " + in_enc
              + " to " + out_enc + " !")

def explore(dir, IsLoopSubDIR=True, suffix='.txt', in_enc='gbk',
            out_enc='utf-8'):
    """Convert the encoding of the matching files found for a folder.

    Input:
        dir : folder handed to the file-listing helpers. (The name shadows
              the builtin but is kept so keyword callers keep working.)
        IsLoopSubDIR: True -- list files with getSubFileList (sub folders
                              included)
                      False-- list files with getCurrFileList (current
                              folder only)
        suffix : file name suffix to select. Defaults to '.txt'.
        in_enc : source encoding used when automatic detection is not
                 available (IsAuto is False). Defaults to 'gbk'.
        out_enc : target encoding. Defaults to 'utf-8'.
    Output:
        NONE
    """
    if IsLoopSubDIR:
        flist = getSubFileList(dir, suffix)
    else:
        flist = getCurrFileList(dir, suffix)

    for fname in flist:
        if IsAuto:
            Convert_Auto(fname, out_enc)
        else:
            Convert_Manu(fname, in_enc, out_enc)

def getSubFileList(dir, suffix=''):
    """Get all files with the given suffix under a folder, sub folders included.

    Input:
        dir : folder to walk. An empty value falls back to the current
              working directory.
        suffix : defaults to blank, which selects all files.
    Output:
        List of file paths, each built by joining the walked folder with the
        file name.
    """
    # Walk the requested folder rather than always the working directory.
    top = dir if dir else os.getcwd()
    flist = []
    for root, _subdirs, files in os.walk(top):
        flist.extend(
            os.path.join(root, name) for name in files if name.endswith(suffix)
        )
    return flist

def getCurrFileList(dir, suffix=''):
    """Get all files with the given suffix directly inside a folder.

    Input:
        dir : folder to list. An empty value falls back to the current
              working directory.
        suffix : defaults to blank, which selects all files. It is appended
                 to a '*' glob pattern, so glob wildcards in it are honoured.
    Output:
        List of file paths inside the folder. Sub folders are not descended
        into and directories are left out of the result.
    """
    # List the requested folder rather than always the working directory.
    base = dir if dir else os.getcwd()
    # Escape the folder part so characters such as '[' in its name are
    # matched literally; only the '*' + suffix part acts as a pattern.
    pattern = os.path.join(glob.escape(base), '*' + suffix)
    return [path for path in glob.glob(pattern) if os.path.isfile(path)]

def main():
    """Run the conversion on the current working directory, sub folders included."""
    explore(os.getcwd(), True)

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()