文件夹下的文件:

每个文件中的内容:

代码:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
def findFiles(path):
    """Return the list of filesystem paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches
# Load every name file into memory and print the resulting mapping.
#
# NOTE: readLines() and unicodeToAscii() are only defined further down in
# this script, so calling readLines() at this point raised NameError when the
# file was executed top to bottom. The read and the ASCII normalisation are
# therefore done inline here, with the same rules: NFD-decompose, drop
# combining marks ('Mn'), keep only ASCII letters and " .,;".
_preview_alphabet = string.ascii_letters + " .,;"

categorty_lines = {}  # category (file name without extension) -> list of names
all_categories = []   # category names, in the order the files were found
for filename in findFiles('E:\\data\\surname\\names\\*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    # The context manager guarantees the handle is closed after reading.
    with open(filename, encoding='utf-8') as names_file:
        raw_lines = names_file.read().strip().split('\n')
    lines = [
        ''.join(
            ch for ch in unicodedata.normalize("NFD", raw_line)
            if unicodedata.category(ch) != 'Mn' and ch in _preview_alphabet
        )
        for raw_line in raw_lines
    ]
    categorty_lines[category] = lines
print(categorty_lines)
# Alphabet of characters that survive normalisation: ASCII letters plus
# space, period, comma and semicolon.
all_leters = string.ascii_letters + " .,;"
n_letters = len(all_leters)


def unicodeToAscii(s):
    """Return ``s`` reduced to the characters listed in ``all_leters``.

    The string is NFD-decomposed so that accented letters split into a base
    letter followed by combining marks; combining marks (category 'Mn') and
    any character outside ``all_leters`` are then dropped.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent left over from the decomposition.
            continue
        if ch not in all_leters:
            continue
        kept.append(ch)
    return ''.join(kept)


print(unicodeToAscii('Ślusàrski'))
# Start from empty containers before the name files are (re)loaded below.
categorty_lines = dict()  # category name -> list of names
all_categories = list()   # category names in load order
def readLines(filename):
    """Read a UTF-8 text file and return its lines passed through unicodeToAscii.

    The file content is stripped of leading/trailing whitespace and split on
    '\\n'; every resulting line is converted with ``unicodeToAscii``. Because
    of how ``str.split`` works, an empty file produces a single entry for the
    empty string rather than an empty list.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # Use a context manager so the handle is closed even if reading fails;
    # the previous version never closed the file it opened.
    with open(filename, encoding='utf-8') as names_file:
        content = names_file.read()
    return [unicodeToAscii(line) for line in content.strip().split('\n')]
# Fill the category -> names mapping: each .txt file is one category, named
# after the file (without directory or extension).
for filename in findFiles('E:\\data\\surname\\names\\*.txt'):
    base_name = os.path.basename(filename)
    category = os.path.splitext(base_name)[0]
    lines = readLines(filename)
    all_categories.append(category)
    categorty_lines[category] = lines

# Number of categories that were loaded.
n_categories = len(all_categories)
结果:

将categorty_lines字典类型数据存入csv文件中:
import csv
# Flatten the {nationality: [surname, ...]} mapping into one CSV row per
# surname, with the columns 'surname' and 'nationality'.
#
# The path is a raw string, so backslashes must NOT be doubled; the previous
# literal contained two backslashes between every path component.
fpath = r'E:\data\surname\names\merge_surname.csv'
fieldnames = ['surname', 'nationality']

# One dict per (surname, nationality) pair, in mapping order.
res_list = [
    {'surname': v, 'nationality': key}
    for key, value in categorty_lines.items()
    for v in value
]

# newline='' is what the csv module documents for files handed to a writer:
# the writer then controls line endings itself (default '\r\n', which is
# what the old text-mode + lineterminator='\n' combination produced on
# Windows). The with-block closes the file even if writing fails.
with open(fpath, 'w', newline='') as f:
    csvw = csv.DictWriter(f, fieldnames=fieldnames)
    csvw.writeheader()
    csvw.writerows(res_list)
解析:
首先将数据解析为"列名: 值"的形式,即每条记录是一个字典,例如 {'surname': 'Abbas', 'nationality': 'Arabic'}(示例值仅作说明)。

最终形式为:

这段代码读取 'E:\data\surname\names' 目录下所有 .txt 文件的内容,将文件按类别(基于文件名)组织成字典,并对姓名进行 Unicode 到 ASCII 的转换。然后,将结果写入 CSV 文件,包含两列:'surname' 和 'nationality',记录了姓氏和国籍信息。

1017

被折叠的 条评论
为什么被折叠?



