3. 数据处理阶段
分析
为方便后面进行回归分析,需要将所有数据转换为数字的形式。在这里我们需要完成的功能如下:
- 将被判年限(如“四年”)转换为 4*12(以月份为单位)
- 将赔偿金额(如“五万八千元”)转换为 58000(以元为单位)
- 将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字,如:{'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}
- 将被告生日转为当前年龄
- 将性别转为 0|1 数字
代码实现
先说一下“将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字”这一功能的实现。因为我当时没有反应过来 numpy 有一个专门的去重方法,所以自己将它实现了一遍:
import csv
import pandas as pd
# Normalise every defendant's place of origin ("home") into numeric codes.
#
# ChineseAdminiDivisionsDict.py
# Chinese administrative-division codes.

# Running total of rows appended to location.csv by save_data_csv.
count = 0

# Province name -> integer code; filled in by init_locations.
ProvinceCode = dict()

# City name -> integer code; filled in by init_locations.
CityCode = dict()

# Raw location string -> "<province code>.<city code>"; filled in by
# parseLocation and read by ETLHome.
LOCATIONDICT = dict()
def ETLHome(home):
    """Look up the code previously recorded for a location string.

    Returns the LOCATIONDICT entry stored under ``home``, or the string
    "NaN" when ``home`` has no entry.
    """
    return LOCATIONDICT.get(home, "NaN")
def getLocationCode(homes):
    """Build the location code table for a collection of location strings.

    De-duplicates ``homes``, lets init_locations populate the province and
    city dictionaries from the distinct values, then lets parseLocation
    assign every distinct value its combined code.

    Returns the list filled in by parseLocation: one
    ``{"location": ..., "location_id": ...}`` dict per distinct location.
    """
    # Drop duplicate entries.
    distinct = set(homes)
    # Initialise the province / city dictionaries.
    init_locations(distinct)
    # Use those dictionaries to give each location its province/city code.
    coded = []
    parseLocation(distinct, coded)
    return coded
def parseLocation(locations, location_list):
    """Assign a "<province>.<city>" code to every location.

    For each entry of ``locations`` the results of parseProvince and
    parseCity are converted to strings and joined with a dot. A
    ``{"location": ..., "location_id": ...}`` record is appended to
    ``location_list`` and the same code is stored in LOCATIONDICT under the
    location string. Nothing is returned; both containers are updated in
    place.
    """
    for place in locations:
        province_code = parseProvince(place)
        city_code = parseCity(place)
        code = ".".join((str(province_code), str(city_code)))
        location_list.append({"location": place, "location_id": code})
        LOCATIONDICT[place] = code
    # Disabled: dump the dictionary to CSV as a reference table for later.
    # save_title_csv()
    # for x in location_list:
    #     save_data_csv(x)
def parseProvince(home):
    """Return the code of the first ProvinceCode key contained in ``home``.

    A location that mentions "龙岩市" has "福建省" prepended before the keys
    are checked. Keys are tried in the dictionary's iteration order; when
    none of them is a substring of ``home`` the result is None.
    """
    if "龙岩市" in home:
        home = "福建省" + home
    matches = (code for name, code in ProvinceCode.items() if name in home)
    return next(matches, None)
def parseCity(home):
    """Return the code of the first CityCode key contained in ``home``.

    Keys are tried in the dictionary's iteration order; when none of them
    is a substring of ``home`` the result is None.
    """
    return next(
        (code for name, code in CityCode.items() if name in home),
        None,
    )
def init_locations(locations):
    """Populate ProvinceCode and CityCode from a collection of locations.

    Each location string is split into a province part and a city part:

    * contains "省"      -> text before / after the first "省"
    * contains "自治区"  -> text before / after the first "自治区"
    * contains "重庆市"  -> text before / after the first "市"
    * contains "龙岩市"  -> province "福建", city "龙岩市"
    * anything else      -> the whole string is the province, no city

    The distinct province and city names are then numbered 0, 1, 2, ... in
    sorted order and written into the module-level ProvinceCode and
    CityCode dictionaries. Nothing is returned.

    Args:
        locations: iterable of location strings.
    """
    province_set = set()
    city_set = set()
    for x in locations:
        # partition() always yields exactly one head and one tail, so a
        # delimiter that occurs more than once (e.g. a second "市" later in
        # the address) can no longer break the two-name unpacking.
        if "省" in x:
            province, _, city = x.partition("省")
        elif "自治区" in x:
            province, _, city = x.partition("自治区")
        elif "重庆市" in x:
            province, _, city = x.partition("市")
        elif "龙岩市" in x:
            # Use "福建" (no suffix), the same form the "省" branch yields,
            # so the province does not end up with two separate keys.
            province, city = "福建", "龙岩市"
        else:
            province, city = x, ""
        # Never register an empty name: the lookups are substring tests and
        # "" is a substring of every string, so it would match everything.
        if province:
            province_set.add(province)
        if city:
            city_set.add(city)
    # Number the names in sorted order; iterating the raw sets would give a
    # different numbering on every interpreter run (string hash randomization).
    for index, province in enumerate(sorted(province_set)):
        ProvinceCode[province] = index
    for index, city in enumerate(sorted(city_set)):
        CityCode[city] = index
def save_title_csv():
    """Append the header row (location, location_id) to location.csv.

    The file is opened in append mode with the utf-8-sig encoding, so
    existing content is kept and the header is added after it.
    """
    columns = ['location', 'location_id']
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.DictWriter(out, columns).writeheader()
def save_data_csv(data):
    """Append one record to location.csv and report progress.

    The values of ``data`` are written, in the mapping's own order, as a
    single CSV row. The module-level ``count`` is then incremented and a
    banner containing the new total is printed.
    """
    global count
    rule = '=' * 20
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.writer(out).writerow(list(data.values()))
        count += 1
        print(rule + '第{}条csv写入成功'.format(count) + rule)
if __name__ == '__main__':
    # Load the ETL output and take its "home" column.
    homes = pd.read_csv("../ETL.csv")['home']
    # Build the location dictionaries; the returned list is not used here.
    getLocationCode(homes)
    # Print every row's location next to the code assigned to it.
    for place in homes:
        print(place, ETLHome(place))
而这么多代码,用 numpy 的 unique 函数就可以实现,好气啊!
import csv
import pandas as pd
import numpy as np
def get_court_map(court_df):
    """Map each distinct value of ``court_df`` to an integer code.

    The distinct values are taken from ``np.unique``, which returns them
    sorted, and are numbered 0, 1, 2, ... in that order, e.g.
    ``{'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}``.

    Returns:
        dict whose keys are the distinct values and whose values are their
        integer codes.
    """
    return {name: code for code, name in enumerate(np.unique(court_df))}
def savecourtmap(court_df):
save_title_csv