批量检测文件字符编码集并转换编码的python脚本

最新推荐文章于 2025-06-25 09:06:06 发布

klyhssrs

最新推荐文章于 2025-06-25 09:06:06 发布

阅读量779

点赞数

CC 4.0 BY-SA版权

分类专栏： cocos2d-x

本文链接：https://blog.youkuaiyun.com/klyhssrs/article/details/50636109

cocos2d-x 专栏收录该内容

15 篇文章

订阅专栏

本文介绍了一个用于将Visual Studio项目中特定类型的文件从GBK编码转换为UTF-8编码的Python脚本。该脚本可以自动检测文件编码，并仅对指定类型的文件进行转换，同时记录操作过程。

为了将 VS（Visual Studio）项目下的源文件全部从 GBK 编码转换为 UTF-8 编码，保证它们在 Xcode 下能正常显示和编译，自己写了这个脚本。

0.2版：增加文件类型过滤；转换前先备份原文件，再进行转换。

import time
import os
import os.path
import shutil
import chardet

# Root directory that is walked recursively for files to inspect.
rootdir = "D:\\tmp\\Classes"
# File extensions (without the leading dot) that are eligible for
# detection and conversion.
filetype_limit = ['h', 'c', 'cpp']
# A parenthesized single-argument print produces the same output on
# Python 2 and Python 3, whereas the bare print statement is a
# SyntaxError on Python 3.
print("rootdir=" + rootdir)

def get_time_for_filename():
	"""Return the current local time formatted as ``MMDD-HHMMSS``.

	The string contains only digits and a single dash, and is used by
	this script as a prefix for the report file names.
	"""
	stamp_format = "%m%d-%H%M%S"
	# localtime() with no argument uses the current time.
	now = time.localtime()
	return time.strftime(stamp_format, now)

print('read all encoding info...')
# One "encoding,confidence,path" line per inspected file.
file_encs = []
# Maps full path -> chardet result dict; only files whose encoding was
# detected are recorded here.
file_encs_dict = {}
# Normalize the extension whitelist once instead of once per file.
allowed_exts = set(ftype.lower() for ftype in filetype_limit)
for parent, dirnames, filenames in os.walk(rootdir):
	for filename in filenames:
		# Extension without the leading dot, lower-cased so the
		# comparison is case-insensitive.
		fext = os.path.splitext(filename)[1][1:].lower()
		if fext not in allowed_exts:
			continue
		fullname = os.path.join(parent, filename)
		# 'with' closes the handle even if read() raises.
		with open(fullname, 'rb') as tmpfile:
			tmpstr = tmpfile.read()
		enc_result = chardet.detect(tmpstr)
		if enc_result['encoding'] is not None:
			file_enc = ",".join([enc_result['encoding'], str(enc_result['confidence']), fullname])
			file_encs_dict[fullname] = enc_result
		else:
			# No encoding detected: report the file but do not
			# register it as a conversion candidate.
			file_enc = "None,None," + fullname
		file_encs.append(file_enc)
		print('read ' + file_enc)

# Save the detection report, sorted, to a timestamped CSV file.
file_encs.sort()
# Build the text with a single join instead of repeated concatenation.
allstr = ''.join(line + "\n" for line in file_encs)
encode_info_filename = get_time_for_filename() + '_all_encode_info.csv'
# 'with' closes the report file even if write() fails.
with open(encode_info_filename, 'w') as outfile:
	outfile.write(allstr)
print('save to ' + encode_info_filename)

print('convert gb2312 to utf-8...')
# One log line per processed file; joined into convert_log below.
convert_log_lines = []
for key1 in file_encs_dict:
	enc1 = file_encs_dict[key1]
	if enc1['encoding'] != 'GB2312':
		continue

	# Read raw bytes: binary mode avoids platform newline translation
	# and is required for bytes.decode() on Python 3.
	with open(key1, 'rb') as file_convert:
		str_convert = file_convert.read()

	# Transcode in memory BEFORE touching the file on disk. The original
	# opened the file for writing (truncating it) first, so a decode
	# error left an emptied source file behind.
	# The 'gbk' codec is used because it also decodes GB2312 text.
	try:
		str_convert_utf8 = str_convert.decode('gbk').encode('utf-8')
	except UnicodeDecodeError:
		# Leave the file untouched, record the failure and keep going
		# so one bad file does not abort the whole batch.
		log1 = "convert failed," + key1
		print(log1)
		convert_log_lines.append(log1)
		continue

	# Back up the original next to the source file.
	backup_filename = key1 + '.gb2312'
	shutil.copy(key1, backup_filename)
	print('backup ' + backup_filename)

	with open(key1, 'wb') as file_convert:
		file_convert.write(str_convert_utf8)

	log1 = "gb2312 to utf8," + key1
	print(log1)
	convert_log_lines.append(log1)

convert_log = ''.join(line + "\n" for line in convert_log_lines)
convert_result_filename = get_time_for_filename() + '_convert_result.csv'
with open(convert_result_filename, 'w') as convert_log_file:
	convert_log_file.write(convert_log)
print('save to ' + convert_result_filename)

print("all completed")

----------------------------------------------------------------------------------------------------------

0.1版

import os
import os.path
import chardet
#chardet 字符编码检测组件

# Root directory that is walked recursively for files to inspect.
rootdir = "D:\\pj\\testpj\\frameworks\\runtime-src\\Classes"
# A parenthesized single-argument print produces the same output on
# Python 2 and Python 3, whereas the bare print statement is a
# SyntaxError on Python 3.
print("rootdir=" + rootdir)

# Read the encoding info of every file below rootdir.
# One "encoding,confidence,path" line per inspected file.
file_encs = []
# Maps full path -> chardet result dict; only files whose encoding was
# detected are recorded here.
file_encs_dict = {}
for parent, dirnames, filenames in os.walk(rootdir):
	for filename in filenames:
		fullname = os.path.join(parent, filename)
		# 'with' closes the handle even if read() raises.
		with open(fullname, 'rb') as tmpfile:
			tmpstr = tmpfile.read()
		enc_result = chardet.detect(tmpstr)
		if enc_result['encoding'] is not None:
			# confidence: how reliable chardet considers its detection.
			file_enc = ",".join([enc_result['encoding'], str(enc_result['confidence']), fullname])
			file_encs_dict[fullname] = enc_result
		else:
			# No encoding detected: report the file but do not
			# register it as a conversion candidate.
			file_enc = "None,None," + fullname
		file_encs.append(file_enc)

# Save the detection report, sorted, to out.csv.
file_encs.sort()
# Build the text with a single join instead of repeated concatenation.
allstr = ''.join(line + "\n" for line in file_encs)
# 'with' closes the report file even if write() fails.
with open('out.csv', 'w') as outfile:
	outfile.write(allstr)

# Convert every file detected as GB2312 to UTF-8, in place.
# One log line per processed file; joined into convert_log below.
convert_log_lines = []
for key1 in file_encs_dict:
	enc1 = file_encs_dict[key1]
	if enc1['encoding'] != 'GB2312':
		continue

	# Read raw bytes: binary mode avoids platform newline translation
	# and is required for bytes.decode() on Python 3.
	with open(key1, 'rb') as file_convert:
		str_convert = file_convert.read()

	# decode() turns bytes in some encoding into a unicode string;
	# encode() turns a unicode string into bytes in the target encoding.
	# Transcode in memory BEFORE opening the file for writing: the
	# original truncated the file first, so a decode error left an
	# emptied file behind (and this version keeps no backup).
	try:
		str_convert_utf8 = str_convert.decode('gbk').encode('utf-8')
	except UnicodeDecodeError:
		# Leave the file untouched, record the failure and keep going.
		log1 = "convert failed," + key1
		print(log1)
		convert_log_lines.append(log1)
		continue

	with open(key1, 'wb') as file_convert:
		file_convert.write(str_convert_utf8)

	log1 = "gb2312 to utf8," + key1
	print(log1)
	convert_log_lines.append(log1)

convert_log = ''.join(line + "\n" for line in convert_log_lines)
with open('convert_result.csv', 'w') as convert_log_file:
	convert_log_file.write(convert_log)

print("all completed")