为了把 VS 项目下的文件全部从 GBK 转成 UTF-8,保证在 Xcode 下能正常显示,自己写了这个脚本。
0.2 版:增加文件类型过滤;先备份后转换。
import time
import os
import os.path
import shutil
import chardet
# Root of the directory tree whose files are examined.
rootdir = "D:\\tmp\\Classes"
# File extensions (without the leading dot) that are eligible for conversion.
filetype_limit = ['h', 'c', 'cpp']
# Single-argument parenthesised print behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print("rootdir=" + rootdir)
def get_time_for_filename():
    """Return the current local time formatted as ``MMDD-HHMMSS``.

    Returns:
        A string such as ``"0314-153000"`` suitable for use in a file name.
    """
    # strftime() formats the current local time when no time tuple is given,
    # so the explicit localtime(time.time()) round trip is not needed.
    return time.strftime("%m%d-%H%M%S")
print('read all encoding info...')
# One "encoding,confidence,path" row per examined file.
file_encs = []
# Full path -> chardet result, only for files whose encoding was detected.
file_encs_dict = {}
# Lower-case the allowed extensions once instead of once per file.
wanted_exts = set(ftype.lower() for ftype in filetype_limit)
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        # Extension without the leading dot, lower-cased for comparison.
        fext = os.path.splitext(filename)[1][1:].lower()
        if fext not in wanted_exts:
            continue
        fullname = os.path.join(parent, filename)
        # Read the raw bytes; the context manager closes the handle even
        # when read() raises.
        with open(fullname, 'rb') as tmpfile:
            tmpstr = tmpfile.read()
        enc_result = chardet.detect(tmpstr)
        if enc_result['encoding'] is not None:
            file_enc = enc_result['encoding'] + "," + str(enc_result['confidence']) + "," + fullname
            file_encs_dict[fullname] = enc_result
        else:
            # Nothing detected: still report the file, but do not register
            # it as a conversion candidate.
            file_enc = "None,None," + fullname
        file_encs.append(file_enc)
        print('read ' + file_enc)
# Save the per-file detection rows to a timestamped CSV file.
file_encs.sort()
# join() assembles the text in one pass instead of repeated concatenation.
allstr = ''.join(file_enc + "\n" for file_enc in file_encs)
encode_info_filename = get_time_for_filename() + '_all_encode_info.csv'
# The context manager closes the report even if the write fails.
with open(encode_info_filename, 'w') as outfile:
    outfile.write(allstr)
print('save to ' + encode_info_filename)
print('convert gb2312 to utf-8...')
# One log row per file that was handled by the conversion pass.
convert_lines = []
for key1, enc1 in file_encs_dict.items():
    if enc1['encoding'] != 'GB2312':
        continue
    # Binary mode keeps the bytes exactly as stored (no newline translation).
    with open(key1, 'rb') as file_convert:
        str_convert = file_convert.read()
    # Transcode BEFORE touching the file on disk: opening the target for
    # writing truncates it, so a decode failure after that point would leave
    # an empty file behind.
    try:
        # Files reported as GB2312 are decoded with the 'gbk' codec
        # (GBK extends GB2312).
        str_convert_utf8 = str_convert.decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        # The detection was wrong for this file; leave it untouched and
        # record the failure instead of aborting the whole batch.
        log1 = "decode failed," + key1
        print(log1)
        convert_lines.append(log1)
        continue
    # Back up the original bytes next to the file before overwriting it.
    backup_filename = key1 + '.gb2312'
    shutil.copy(key1, backup_filename)
    print('backup ' + backup_filename)
    with open(key1, 'wb') as file_convert:
        file_convert.write(str_convert_utf8)
    log1 = "gb2312 to utf8," + key1
    print(log1)
    convert_lines.append(log1)
convert_log = "".join(line + "\n" for line in convert_lines)
convert_result_filename = get_time_for_filename() + '_convert_result.csv'
with open(convert_result_filename, 'w') as convert_log_file:
    convert_log_file.write(convert_log)
print('save to ' + convert_result_filename)
print("all completed")
----------------------------------------------------------------------------------------------------------
0.1版
import os
import os.path
import chardet
#chardet 字符编码检测组件
# Root of the directory tree whose files are examined.
rootdir = "D:\\pj\\testpj\\frameworks\\runtime-src\\Classes"
# Single-argument parenthesised print behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print("rootdir=" + rootdir)
# Read the detected encoding of every file under rootdir.
# One "encoding,confidence,path" row per file.
file_encs = []
# Full path -> chardet result, only for files whose encoding was detected.
file_encs_dict = {}
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        fullname = os.path.join(parent, filename)
        # Read the raw bytes; the context manager closes the handle even
        # when read() raises.
        with open(fullname, 'rb') as tmpfile:
            tmpstr = tmpfile.read()
        enc_result = chardet.detect(tmpstr)
        if enc_result['encoding'] is not None:
            # 'confidence' is how trustworthy/accurate the detection result is.
            file_enc = enc_result['encoding'] + "," + str(enc_result['confidence']) + "," + fullname
            file_encs_dict[fullname] = enc_result
        else:
            # Nothing detected: still report the file, but do not register
            # it as a conversion candidate.
            file_enc = "None,None," + fullname
        file_encs.append(file_enc)
# Save the per-file detection rows to out.csv.
file_encs.sort()
# join() assembles the text in one pass instead of repeated concatenation.
allstr = ''.join(file_enc + "\n" for file_enc in file_encs)
# The context manager closes the report even if the write fails.
with open('out.csv', 'w') as outfile:
    outfile.write(allstr)
# Convert every file detected as GB2312 to UTF-8, in place.
# One log row per file that was handled by the conversion pass.
convert_lines = []
for key1, enc1 in file_encs_dict.items():
    if enc1['encoding'] != 'GB2312':
        continue
    # Binary mode keeps the bytes exactly as stored (no newline translation).
    with open(key1, 'rb') as file_convert:
        str_convert = file_convert.read()
    # Transcode BEFORE reopening the file for writing: opening with a write
    # mode truncates it, and this version keeps no backup, so a decode failure
    # after that point would destroy the file's content.
    try:
        # decode() turns text in another encoding into Python's unicode;
        # encode() turns unicode into another encoding.
        str_convert_unicode = str_convert.decode('gbk')
        str_convert_utf8 = str_convert_unicode.encode('utf-8')
    except UnicodeDecodeError:
        # The detection was wrong for this file; leave it untouched and
        # record the failure instead of aborting the whole batch.
        log1 = "decode failed," + key1
        print(log1)
        convert_lines.append(log1)
        continue
    with open(key1, 'wb') as file_convert:
        file_convert.write(str_convert_utf8)
    log1 = "gb2312 to utf8," + key1
    print(log1)
    convert_lines.append(log1)
convert_log = "".join(line + "\n" for line in convert_lines)
with open('convert_result.csv', 'w') as convert_log_file:
    convert_log_file.write(convert_log)
print("all completed")