#coding=utf-8
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
#导入文件
abb=pd.read_csv('./state-abbrevs.csv')
area=pd.read_csv('./state-areas.csv')
pop=pd.read_csv('./state-population.csv')
#将人口数据和各州简称数据进行合并
abb_pop=pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
#print(abb_pop_merge.head())
#将合并的数据重复的一列进行删除
abb_pop=abb_pop.drop(labels='abbreviation',axis=1,inplace=False)
#print(c)
#查看存在缺失数据的列
#print(abb_pop.isnull().any(axis=1))
#在drop中,0是列,1是行,在其他中0是行,1是列
#print(abb_pop.info())
#去重操作
#思路:将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
#将state的空值定位到
abb_pop['state'].isnull()
a=abb_pop.loc[abb_pop['state'].isnull()]#作为一个行索引
b=a['state/region']
#print(b)
c=b.unique()
print(c)
2:找出state全称里面的空值,进行赋值。
abb_pop['state'].isnull()
a=abb_pop.loc[abb_pop['state'].isnull()]#作为一个行索引
b=a['state/region']
#print(b)
c=b.unique()#代表空值有哪些:‘PR'和’USA'
#print(c)
#将USA的全称进行赋值
#(05:50)将USA对应的行索引求出来
date=abb_pop.loc[abb_pop['state/region']=='USA']#行数据求出来
#将行索引取出
indexes=abb_pop.loc[abb_pop['state/region']=='USA'].index
abb_pop.loc[indexes,'state']='United States'
#将PR里的空值赋值
date2=abb_pop.loc[abb_pop['state/region']=='PR']
indexes2=date2.index
abb_pop.loc[indexes2,'state']='pprrrr'
3:合并各州的面积数据
abb_pop_area=pd.merge(abb_pop,area,how='outer')
print(abb_pop_area.head())
4:area(sq.mi)这一列中有缺失数据
abb_pop_area=pd.merge(abb_pop,area,how='outer')
#print(abb_pop_area.info())
a=abb_pop_area['area (sq. mi)'].isnull()
area_is_null=abb_pop_area.loc[a]
indexes3=area_is_null.index
print(indexes3)
5:去除含有缺失数据的行
abb_pop_area.drop(labels=indexes3,inplace=True,axis=0)
6:找出2010年全民人口的数据
abb_pop_area=pd.merge(abb_pop,area,how='outer')
#print(abb_pop_area.info())
a=abb_pop_area['area (sq. mi)'].isnull()
area_is_null=abb_pop_area.loc[a]
indexes3=area_is_null.index
#print(indexes3)
abb_pop_area.drop(labels=indexes3,inplace=True,axis=0)
#print(abb_pop_area.head())
#abb_pop_area.query('age==''and year==2010')
#print(abb_pop_area.query('ages =="total" and year==2010.0'))
#print(abb_pop_area.query('ages ==‘total’and year==2010.0'))
print(abb_pop_area.query('ages =="total" and year==2010.0'))
7:计算各州的人口密度
abb_pop_area['midu']=abb_pop_area.population /abb_pop_area['area (sq. mi)']
print(abb_pop_area)
8:排序,找出人口密度最高的一个州
abb_pop_area['midu']=abb_pop_area.population /abb_pop_area['area (sq. mi)']
print(abb_pop_area.sort_values(by='midu',axis=0,ascending=False).iloc[0] [0] )