批量检测文件字符编码集并转换编码的python脚本

本文介绍了一个用于将Visual Studio项目中特定类型的文件从GBK编码转换为UTF-8编码的Python脚本。该脚本可以自动检测文件编码,并仅对指定类型的文件进行转换,同时记录操作过程。

为了将 Visual Studio 项目下的文件都从 GBK 转换为 UTF-8,保证在 Xcode 下能正常使用,自己写了这个脚本。

0.2版 增加文件类型过滤;先备份后转换;

import time
import os
import os.path
import shutil
import chardet

# Root directory of the source tree to scan.
rootdir = "D:\\tmp\\Classes"
# File extensions (without the dot) that are eligible for conversion.
filetype_limit = ['h', 'c', 'cpp']
# A parenthesised single-argument print behaves the same on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print("rootdir=" + rootdir)

def get_time_for_filename():
    """Return the current local time formatted as ``MMDD-HHMMSS``.

    The result is used as a prefix for the report file names.
    """
    # strftime() formats the current local time when no time tuple is given.
    stamp_format = "%m%d-%H%M%S"
    return time.strftime(stamp_format)

print('read all encoding info...')
# One "encoding,confidence,path" row per examined file.
file_encs = []
# Maps full path -> chardet result dict, only for files whose encoding
# chardet could determine.
file_encs_dict = {}
# Lower-case the extension whitelist once instead of once per file.
wanted_exts = set(ftype.lower() for ftype in filetype_limit)
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        # Extension without the leading dot, lower-cased for comparison.
        fext = os.path.splitext(filename)[1][1:].lower()
        if fext not in wanted_exts:
            continue
        fullname = os.path.join(parent, filename)
        # Read the raw bytes; the context manager closes the handle even
        # when reading raises.
        with open(fullname, 'rb') as tmpfile:
            tmpstr = tmpfile.read()
        enc_result = chardet.detect(tmpstr)
        if enc_result['encoding'] is not None:
            file_enc = enc_result['encoding'] + "," + str(enc_result['confidence']) + "," + fullname
            file_encs_dict[fullname] = enc_result
        else:
            # chardet could not determine an encoding; report it but do not
            # register the file for conversion.
            file_enc = "None,None," + fullname
        file_encs.append(file_enc)
        print('read ' + file_enc)

# Save the per-file detection results as a time-stamped CSV report.
file_encs.sort()
# join() builds the report in a single pass instead of repeated string
# concatenation.
allstr = ''.join(row + "\n" for row in file_encs)
encode_info_filename = get_time_for_filename() + '_all_encode_info.csv'
# The context manager closes the report even if the write fails.
with open(encode_info_filename, 'w') as outfile:
    outfile.write(allstr)
print('save to ' + encode_info_filename)

print('convert gb2312 to utf-8...')
# One log row per file that was processed in this pass.
convert_log_lines = []
for key1 in file_encs_dict:
    enc1 = file_encs_dict[key1]
    if enc1['encoding'] != 'GB2312':
        continue

    # Read raw bytes so no newline translation happens on any platform.
    with open(key1, 'rb') as file_convert:
        str_convert = file_convert.read()

    # Transcode BEFORE the file is opened for writing: opening with 'w'
    # truncates it, so a decode failure after that point would leave the
    # source file empty. GBK is a superset of GB2312, hence the codec.
    try:
        str_convert_utf8 = str_convert.decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        # Detection was wrong for this file; leave it untouched and record it.
        log1 = "decode failed," + key1
        print(log1)
        convert_log_lines.append(log1)
        continue

    # Back up the original next to it before overwriting.
    backup_filename = key1 + '.gb2312'
    shutil.copy(key1, backup_filename)
    print('backup ' + backup_filename)

    # Write the UTF-8 bytes back in binary mode to keep line endings as-is.
    with open(key1, 'wb') as file_convert:
        file_convert.write(str_convert_utf8)

    log1 = "gb2312 to utf8," + key1
    print(log1)
    convert_log_lines.append(log1)

# Persist the conversion log as a time-stamped CSV report.
convert_log = ''.join(line + "\n" for line in convert_log_lines)
convert_result_filename = get_time_for_filename() + '_convert_result.csv'
with open(convert_result_filename, 'w') as convert_log_file:
    convert_log_file.write(convert_log)
print('save to ' + convert_result_filename)

print("all completed")





----------------------------------------------------------------------------------------------------------

0.1版

import os
import os.path
import chardet
#chardet 字符编码检测组件

# Root directory of the source tree to scan.
rootdir = "D:\\pj\\testpj\\frameworks\\runtime-src\\Classes"
# A parenthesised single-argument print behaves the same on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print("rootdir=" + rootdir)

# Read the detected encoding of every file under rootdir.
# file_encs holds one "encoding,confidence,path" row per file;
# file_encs_dict maps full path -> chardet result for files whose encoding
# chardet could determine.
file_encs = []
file_encs_dict = {}
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        fullname = os.path.join(parent, filename)
        # Read the raw bytes; the context manager closes the handle even
        # when reading raises.
        with open(fullname, 'rb') as tmpfile:
            tmpstr = tmpfile.read()
        enc_result = chardet.detect(tmpstr)
        if enc_result['encoding'] is not None:
            # confidence: how reliable chardet considers its detection result.
            file_enc = enc_result['encoding'] + "," + str(enc_result['confidence']) + "," + fullname
            file_encs_dict[fullname] = enc_result
        else:
            # No encoding detected; report the file but do not register it
            # for conversion.
            file_enc = "None,None," + fullname
        file_encs.append(file_enc)

# Save the per-file detection results to out.csv.
file_encs.sort()
# join() builds the report in a single pass instead of repeated string
# concatenation.
allstr = ''.join(row + "\n" for row in file_encs)
# The context manager closes the report even if the write fails.
with open('out.csv', 'w') as outfile:
    outfile.write(allstr)

# Convert every file detected as GB2312 to UTF-8, in place.
convert_log_lines = []
for key1 in file_encs_dict:
    enc1 = file_encs_dict[key1]
    if enc1['encoding'] != 'GB2312':
        continue

    # Read raw bytes so no newline translation happens on any platform.
    with open(key1, 'rb') as file_convert:
        str_convert = file_convert.read()

    # decode() converts from another encoding to Python unicode; encode()
    # converts unicode to another encoding. Both run BEFORE the file is
    # opened for writing: opening with 'w' truncates it, and this version
    # keeps no backup, so a decode failure after that point would destroy
    # the file. GBK is a superset of GB2312, hence the codec.
    try:
        str_convert_utf8 = str_convert.decode('gbk').encode('utf-8')
    except UnicodeDecodeError:
        # Detection was wrong for this file; leave it untouched and record it.
        log1 = "decode failed," + key1
        print(log1)
        convert_log_lines.append(log1)
        continue

    # Write the UTF-8 bytes back in binary mode to keep line endings as-is.
    with open(key1, 'wb') as file_convert:
        file_convert.write(str_convert_utf8)

    log1 = "gb2312 to utf8," + key1
    print(log1)
    convert_log_lines.append(log1)

# Persist the conversion log.
convert_log = ''.join(line + "\n" for line in convert_log_lines)
with open('convert_result.csv', 'w') as convert_log_file:
    convert_log_file.write(convert_log)

print("all completed")



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值