老师布置了个作业,要进行成绩分析。先看看数据长啥样
嗯,难看(@_@😉
给了一大堆,能直接拿来用的也就只有总分了,memo一项里有平时分期中分期末分总评分,但是有缺失值,有的课程没有期末分或者期中,这样的话没办法批量处理,只能遍历,一个个处理。
先准备好正则表达式
#2019-1-14 15:13:52_XXX_提交_平时:85/期末:69/总评:75;2019-1
p1 = re.compile('.*平时:([0-9]*\.?[0-9]+)/期')
p2 = re.compile('.*期中:([0-9]*\.?[0-9]+)/期')
p3 = re.compile('.*期末:([0-9]*\.?[0-9]+)/总')
p4 = re.compile('.*总评:([0-9]*\.?[0-9]+);.*')
然后读取数据,取出我们想要的,顺便把缺失的补上,但是有的没有期中或者期末,所以还是需要我们自己遍历进行处理
data_import=pd.read_csv('18大数据.CSV',encoding='GBK')#读取源数据
info=pd.DataFrame(data_import)[['index','StuId','StuName','StuClass','CurId','CurName','总评','Memo']]#取出来这几个
info.set_index('index',inplace=True)#把index设为索引
info['平时']=''#新建列
info['qizhong'] = ''
info['期末'] = ''
info['zongping'] = ''
info['Memo'].fillna('XXX_提交_平时:0/期中:0/期末:0/总评:0',inplace=True)#memo里空白补全
遍历处理
for index,row in info.iterrows():#遍历,用正则拿东西
#print(index)
#print(type(row['Memo']))
#if(type(row['Memo'])==type('str')):
if (p1.search(row['Memo']) != None):
info.loc[index, '平时'] = p1.search(row['Memo']).group(1)
else:
info.loc[index, '平时'] = '0'
if (p2.search(row['Memo']) != None):
info.loc[index, 'qizhong'] = p2.search(row['Memo']).group(1)
else:
info.loc[index, 'qizhong'] = '0'
if (p3.search(row['Memo']) != None):
info.loc[index, '期末'] = p3.search(row['Memo']).group(1)
else:
info.loc[index, '期末'] = '0'
if (p4.search(row['Memo']) != None):
info.loc[index, 'zongping'] = p4.search(row['Memo']).group(1)
else:
info.loc[index, 'zongping'] = '0'
info.sort_values(by=['StuClass',"StuId","CurId"],axis=0,ascending=[True,True,True],inplace=True)
#排序,现拍班级,班级一样看id,id一样看课程id
数据处理完之后就可以保存辽
def save(name,value):#保存函数
file_name=name+'_output.csv'
value.to_csv('output/'+file_name,index=None,encoding='GBK')
print(file_name+'成功')
data_group = info.groupby(info["StuClass"])#按班级分
for key, value in data_group:
save(key,value)
data_group = info.groupby(info["CurName"])#按课程分
for key, value in data_group:
save(key,value)
NICE( ̄y▽, ̄)╭