xlsx文件转为csv
import pandas as pd#需要用到的包
import numpy as np#需要用到的包
# Convert the Excel workbook to a CSV file.
path = "/home/public/GFQ/math_model/"  # directory that holds the input workbook
filepath_poi = "{}{}".format(path, "data.xlsx")  # directory + file name
data1 = pd.read_excel(filepath_poi)  # load the workbook into the DataFrame data1
# Write the frame to select.csv without the index column, encoded as UTF-8.
data1.to_csv('select.csv', encoding='utf-8', index=False)
按概率统计
import csv  # this snippet uses csv.reader / csv.writer; imported here so it runs on its own

# Frequency-encode column 8: every data row's value in that column is replaced
# by (number of rows sharing that value) / total_rows, and the result is
# written to pre.csv.
# NOTE(review): `csv_file` is not defined in this snippet; it must hold the
# path of the input CSV before this runs (presumably the 'select.csv' written
# by the conversion step -- confirm).
country_col = 8
# Hard-coded denominator kept from the original notes; presumably the number
# of data rows in the input file -- confirm before reusing on other data.
total_rows = 114184
# data[code] = number of rows whose column 8 equals `code`.
# Codes must be integers in [0, 1004]; anything larger raises IndexError.
data = np.zeros((1005))

# Pass 1: count how often each code occurs.
# newline='' is what the csv module requires so that embedded newlines are
# parsed correctly and no extra '\r' is emitted on platforms that translate
# line endings.
with open(csv_file, newline='') as f:
    reader = csv.reader(f)
    first_row = next(reader)  # first row is skipped (treated as the header)
    for row in reader:
        data[int(row[country_col])] += 1

# Pass 2: copy the file to pre.csv, replacing the code by its relative frequency.
with open('pre.csv', mode="w", newline='') as f, open(csv_file, newline='') as f01:
    writer = csv.writer(f)
    reader = csv.reader(f01)
    first_row = next(reader)
    writer.writerow(first_row)  # first row is copied through unchanged
    for row in reader:
        row[country_col] = data[int(row[country_col])] * 1.0 / total_rows
        writer.writerow(row)
或者
import csv  # this snippet uses csv.reader / csv.writer; imported here so it runs on its own

# Alternative approach: frequency-encode columns 31 and 32 of one1.csv using
# dictionaries instead of a fixed-size array, writing the result to one2.csv.
# Hard-coded denominator kept from the original notes; presumably the number
# of data rows in the input file -- confirm before reusing on other data.
total_rows = 114184

d = {}  # column-31 value -> [occurrence count]
e = {}  # column-32 value -> [occurrence count]
dis_area = []   # every column-31 value, in file order
dis_area2 = []  # every column-32 value, in file order

# one1.csv is written as UTF-8 elsewhere in these notes, so read it as UTF-8;
# newline='' is what the csv module requires for correct newline handling.
with open('one1.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    first_row = next(reader)  # first row is skipped (treated as the header)
    for row in reader:
        dis_area.append(row[31])
        dis_area2.append(row[32])

# Collapse the raw values into per-value counts. The one-element-list shape
# ({value: [count]}) of the original dictionaries is kept on purpose.
for one_geo in dis_area:
    d.setdefault(one_geo, [0])[0] += 1
for one_geo in dis_area2:
    e.setdefault(one_geo, [0])[0] += 1

# Second pass: copy one1.csv to one2.csv with both columns replaced by their
# relative frequency.
# (Original author's note here read roughly "city control, fill from the
# front"; its meaning is unclear.)
with open('one2.csv', mode="w", newline='', encoding='utf-8') as f, \
        open('one1.csv', newline='', encoding='utf-8') as f01:
    writer = csv.writer(f)
    reader = csv.reader(f01)
    first_row = next(reader)
    writer.writerow(first_row)  # first row is copied through unchanged
    for row in reader:
        row[31] = d[row[31]][0] * 1.0 / total_rows
        row[32] = e[row[32]][0] * 1.0 / total_rows
        writer.writerow(row)
缺失值处理(处理完成后,记得检查缺失值是否已全部补上,例如 np.isnan(train['doubtterr']).any() 的结果应为 False)
import pandas as pd
import numpy as np
import csv
path = "/home/public/GFQ/math_model/"

# Columns whose missing values are replaced by the column mean.
_MEAN_FILLED = (
    'nkill',
    'nkillus',
    'nkillter',
    'nwound',
    'nwoundus',
    'nwoundte',
    'nperpcap',
)

# Columns whose missing values are replaced by the most frequent value (mode).
_MODE_FILLED = (
    'guncertain1',
    'doubtterr',
    'gname',
    'claimmode',
    'ishostkid',
)


def fill_missing_values(frame):
    """Fill missing values in *frame* in place and return it.

    * Columns in ``_MEAN_FILLED`` get the column mean.
    * ``multiple`` gets 0.
    * Columns in ``_MODE_FILLED`` get the column's most frequent value.

    Raises ``KeyError`` if any of the expected columns is absent.
    """
    for col in _MEAN_FILLED:
        frame[col] = frame[col].fillna(frame[col].mean())

    # Fill 'multiple' from itself; the original overwrote it with the
    # 'nwoundte' column by mistake.
    frame['multiple'] = frame['multiple'].fillna(0)

    for col in _MODE_FILLED:
        # Series.mode() returns a Series (there may be ties). Passing that
        # Series to fillna() aligns on the index, so only the row labelled 0
        # could ever be filled; use the first mode as a scalar instead.
        modes = frame[col].mode()
        if not modes.empty:  # an all-missing column has no mode; leave it as is
            frame[col] = frame[col].fillna(modes.iloc[0])
    return frame


if __name__ == "__main__":
    train = fill_missing_values(pd.read_excel(path + 'data2.xlsx'))
数据归一化(此处实际是对 propextent 列做独热编码 one-hot)
import pandas as pd
import numpy as np
import csv
# One-hot encode the 'propextent' column and save the widened table as CSV.
path = "/home/public/GFQ/math_model/"
train = pd.read_excel("{}{}".format(path, 'one.xlsx'))
# One indicator column per distinct propextent value, named 'propextent_<value>'.
dum_propextent = pd.get_dummies(train['propextent'], prefix='propextent')
dum_propextent.head(3)  # preview of the first rows (only displayed in an interactive session)
# Append the indicator columns to the right of the original columns.
df = pd.concat((train, dum_propextent), axis="columns")
df.to_csv('one1.csv', encoding='utf-8', index=False)