# 第1关:Concat与Append操作 (Level 1: Concat and Append operations)
import pandas as pd
def task1():
    """Load the two step1 CSV files and combine them side by side.

    Reads ``step1/data.csv`` and ``step1/data1.csv`` (relative to the current
    working directory), concatenates the two frames column-wise, uses the
    ``Ladder`` column as the index, and replaces every missing value with 0.

    Returns:
        pandas.DataFrame: the combined frame indexed by ``Ladder``.
    """
    #********** Begin **********#
    df1 = pd.read_csv("step1/data.csv")
    df2 = pd.read_csv("step1/data1.csv")
    # axis=1 puts the frames next to each other; rows are matched on their
    # index labels, and cells with no counterpart come out as NaN.
    combined = pd.concat([df1, df2], axis=1)
    result = combined.set_index('Ladder').fillna(0)
    #********** End **********#
    return result
# 第2关:合并与连接 (Level 2: Merge and join)
import pandas as pd
def task2(dataset1, dataset2, dataset3):
    """Merge three datasets into one frame keyed by a combined ``user_id``.

    Each argument is turned into a DataFrame. The frames are outer-merged one
    after another on whatever columns they share, missing cells are filled
    with 0, and the ``id`` column is added into ``user_id`` (so a row that
    carried only one of the two keeps that value). The ``id`` column is then
    removed and the rows are sorted by ``user_id``.

    NOTE(review): the function drops the row at position 3 of the merged
    frame and relabels the sorted result with a fixed list of nine index
    labels. Both steps look tailored to the exercise's reference data; the
    index assignment raises ``ValueError`` unless exactly nine rows remain.

    Args:
        dataset1: data accepted by ``pandas.DataFrame``.
        dataset2: data accepted by ``pandas.DataFrame``.
        dataset3: data accepted by ``pandas.DataFrame``.

    Returns:
        pandas.DataFrame: the merged frame without the ``id`` column, sorted
        by ``user_id`` and indexed ``[6, 7, 8, 0, 1, 2, 3, 4, 5]``.
    """
    # ********** Begin **********#
    df1 = pd.DataFrame(dataset1)
    df2 = pd.DataFrame(dataset2)
    df3 = pd.DataFrame(dataset3)

    # With no explicit key, merge joins on the columns common to both frames.
    merged = pd.merge(df1, df2, how='outer')
    merged = pd.merge(merged, df3, how='outer')
    merged = merged.fillna(0, axis=1)

    # Fold the two identifier columns into one integer column.
    merged['user_id'] = (merged['user_id'] + merged['id']).astype('int')

    # Remove the fourth row of the merged frame (by its index label).
    merged = merged.drop(merged.index[3])

    result = merged.drop('id', axis=1)
    result = result.sort_values(by='user_id', ascending=True)
    result.index = [6, 7, 8, 0, 1, 2, 3, 4, 5]
    # ********** End **********#
    return result
# 第3关:案例:美国各州的统计数据 (Level 3: Case study - US state statistics)
import pandas as pd
import numpy as np
def task3():
    """Print the five highest and five lowest summed 2010 densities by state.

    Reads ``./step3/state-population.csv``, ``./step3/state-areas.csv`` and
    ``./step3/state-abbrevs.csv`` (relative to the current working directory),
    joins them into one table, keeps the rows for year 2010, and computes
    population / area for every row. For each state the ``under18`` and
    ``total`` densities are added together; the states are ranked by that sum
    in descending order and the first five and last five are printed.

    The two age groups are matched by state name, so a state that lacks one
    of the two rows gets NaN rather than being paired with another state.

    Returns:
        None. The ranking is written to standard output.
    """
    #********** Begin **********#
    # Read the three CSV files.
    pop = pd.read_csv("./step3/state-population.csv")
    areas = pd.read_csv("./step3/state-areas.csv")
    abbrevs = pd.read_csv("./step3/state-abbrevs.csv")

    # Merge population with the abbreviation table, then drop the duplicate
    # key column.
    merged = pd.merge(pop, abbrevs, how='outer',
                      left_on='state/region', right_on='abbreviation')
    merged = merged.drop('abbreviation', axis=1)

    # Fill in the full names for the two codes handled explicitly here.
    merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
    merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'

    # Attach the area of each state.
    merged = pd.merge(merged, areas, on='state', how='left')

    # Drop every row that still has a missing value.
    merged = merged.dropna()

    # Keep only 2010 and index the rows by state name. set_index returns a
    # new frame, so the result has to be kept.
    by_state = merged.loc[merged['year'] == 2010].set_index('state')

    # Population density of every (state, age group) row.
    density = by_state['population'] / by_state['area (sq. mi)']

    # Add the two age groups together, matching rows on the state label
    # instead of relying on both subsets being in the same order.
    is_under18 = (by_state['ages'] == 'under18').to_numpy()
    is_total = (by_state['ages'] == 'total').to_numpy()
    summed = density.loc[is_under18] + density.loc[is_total]

    # Rank from densest to least dense.
    ranked = summed.sort_values(ascending=False)

    # Print the top five and the bottom five. The series is unnamed, so its
    # text form ends with the plain "dtype: float64" footer.
    print("前5名:")
    print(ranked.head(5))
    print("后5名:")
    print(ranked.tail(5))
    # ********** End **********#