import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# Load the per-state land-area table from the current working directory.
# Per the captured output below, it has the columns 'state' and
# 'area (sq. mi)' and 52 rows (index 0-51), including District of Columbia
# and Puerto Rico.
areas = pd.read_csv('./state-areas.csv')
# Bare expression: renders the frame when this runs as a notebook cell.
areas
|
state |
area (sq. mi) |
| 0 |
Alabama |
52423 |
| 1 |
Alaska |
656425 |
| 2 |
Arizona |
114006 |
| 3 |
Arkansas |
53182 |
| 4 |
California |
163707 |
| 5 |
Colorado |
104100 |
| 6 |
Connecticut |
5544 |
| 7 |
Delaware |
1954 |
| 8 |
Florida |
65758 |
| 9 |
Georgia |
59441 |
| 10 |
Hawaii |
10932 |
| 11 |
Idaho |
83574 |
| 12 |
Illinois |
57918 |
| 13 |
Indiana |
36420 |
| 14 |
Iowa |
56276 |
| 15 |
Kansas |
82282 |
| 16 |
Kentucky |
40411 |
| 17 |
Louisiana |
51843 |
| 18 |
Maine |
35387 |
| 19 |
Maryland |
12407 |
| 20 |
Massachusetts |
10555 |
| 21 |
Michigan |
96810 |
| 22 |
Minnesota |
86943 |
| 23 |
Mississippi |
48434 |
| 24 |
Missouri |
69709 |
| 25 |
Montana |
147046 |
| 26 |
Nebraska |
77358 |
| 27 |
Nevada |
110567 |
| 28 |
New Hampshire |
9351 |
| 29 |
New Jersey |
8722 |
| 30 |
New Mexico |
121593 |
| 31 |
New York |
54475 |
| 32 |
North Carolina |
53821 |
| 33 |
North Dakota |
70704 |
| 34 |
Ohio |
44828 |
| 35 |
Oklahoma |
69903 |
| 36 |
Oregon |
98386 |
| 37 |
Pennsylvania |
46058 |
| 38 |
Rhode Island |
1545 |
| 39 |
South Carolina |
32007 |
| 40 |
South Dakota |
77121 |
| 41 |
Tennessee |
42146 |
| 42 |
Texas |
268601 |
| 43 |
Utah |
84904 |
| 44 |
Vermont |
9615 |
| 45 |
Virginia |
42769 |
| 46 |
Washington |
71303 |
| 47 |
West Virginia |
24231 |
| 48 |
Wisconsin |
65503 |
| 49 |
Wyoming |
97818 |
| 50 |
District of Columbia |
68 |
| 51 |
Puerto Rico |
3515 |
# Load the lookup table that maps full state names to two-letter codes.
# Per the captured output below, it has the columns 'state' and
# 'abbreviation' and 51 rows (index 0-50); the displayed rows contain no
# entry for Puerto Rico, unlike the areas table.
ab = pd.read_csv('./state-abbrevs.csv')
# Bare expression: renders the frame when this runs as a notebook cell.
ab
|
state |
abbreviation |
| 0 |
Alabama |
AL |
| 1 |
Alaska |
AK |
| 2 |
Arizona |
AZ |
| 3 |
Arkansas |
AR |
| 4 |
California |
CA |
| 5 |
Colorado |
CO |
| 6 |
Connecticut |
CT |
| 7 |
Delaware |
DE |
| 8 |
District of Columbia |
DC |
| 9 |
Florida |
FL |
| 10 |
Georgia |
GA |
| 11 |
Hawaii |
HI |
| 12 |
Idaho |
ID |
| 13 |
Illinois |
IL |
| 14 |
Indiana |
IN |
| 15 |
Iowa |
IA |
| 16 |
Kansas |
KS |
| 17 |
Kentucky |
KY |
| 18 |
Louisiana |
LA |
| 19 |
Maine |
ME |
| 20 |
Montana |
MT |
| 21 |
Nebraska |
NE |
| 22 |
Nevada |
NV |
| 23 |
New Hampshire |
NH |
| 24 |
New Jersey |
NJ |
| 25 |
New Mexico |
NM |
| 26 |
New York |
NY |
| 27 |
North Carolina |
NC |
| 28 |
North Dakota |
ND |
| 29 |
Ohio |
OH |
| 30 |
Oklahoma |
OK |
| 31 |
Oregon |
OR |
| 32 |
Maryland |
MD |
| 33 |
Massachusetts |
MA |
| 34 |
Michigan |
MI |
| 35 |
Minnesota |
MN |
| 36 |
Mississippi |
MS |
| 37 |
Missouri |
MO |
| 38 |
Pennsylvania |
PA |
| 39 |
Rhode Island |
RI |
| 40 |
South Carolina |
SC |
| 41 |
South Dakota |
SD |
| 42 |
Tennessee |
TN |
| 43 |
Texas |
TX |
| 44 |
Utah |
UT |
| 45 |
Vermont |
VT |
| 46 |
Virginia |
VA |
| 47 |
Washington |
WA |
| 48 |
West Virginia |
WV |
| 49 |
Wisconsin |
WI |
| 50 |
Wyoming |
WY |
# Load the population records. Per the captured output below, the frame is
# 2544 rows x 4 columns: 'state/region' (abbreviation such as 'AL', plus an
# aggregate 'USA'), 'ages' ('under18' / 'total'), 'year' and 'population'.
pop = pd.read_csv('./state-population.csv')
# Bare expression: renders the (truncated) frame when run as a notebook cell.
pop
|
state/region |
ages |
year |
population |
| 0 |
AL |
under18 |
2012 |
1117489.0 |
| 1 |
AL |
total |
2012 |
4817528.0 |
| 2 |
AL |
under18 |
2010 |
1130966.0 |
| 3 |
AL |
total |
2010 |
4785570.0 |
| 4 |
AL |
under18 |
2011 |
1125763.0 |
| ... |
... |
... |
... |
... |
| 2539 |
USA |
total |
2010 |
309326295.0 |
| 2540 |
USA |
under18 |
2011 |
73902222.0 |
| 2541 |
USA |
total |
2011 |
311582564.0 |
| 2542 |
USA |
under18 |
2012 |
73708179.0 |
| 2543 |
USA |
total |
2012 |
313873685.0 |
2544 rows × 4 columns
# Preview the first rows of the population table (head() defaults to 5 rows).
pop.head()
|
state/region |
ages |
year |
population |
| 0 |
AL |
under18 |
2012 |
1117489.0 |
| 1 |
AL |
total |
2012 |
4817528.0 |
| 2 |
AL |
under18 |
2010 |
1130966.0 |
| 3 |
AL |
total |
2010 |
4785570.0 |
| 4 |
AL |
under18 |
2011 |
1125763.0 |
# (rows, columns) of the population table; the captured output is (2544, 4).
pop.shape
(2544, 4)
将地名全称与人口相对应
# Attach each state's full name to the population records by matching the
# code in pop['state/region'] against ab['abbreviation'].  The outer join
# keeps rows from both tables even when no counterpart exists, so unmatched
# codes show up with missing values in 'state' rather than being dropped.
n1 = pop.merge(
    ab,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)
# Preview the first rows of the merged frame.
n1.head()
|
state/region |
ages |
year |
population |
state |
abbreviation |
| 0 |
AL |
under18 |
2012 |
1117489.0 |
Alabama |
AL |
| 1 |
AL |
total |
2012 |
4817528.0 |
Alabama |
AL |
| 2 |
AL |
under18 |
2010 |
1130966.0 |
Alabama |
AL |
| 3 |
AL |
total |
2010 |
4785570.0 |
Alabama |
AL |
| 4 |
AL |
under18 |
2011 |
1125763.0 |
Alabama |
AL |
# (rows, columns) after the merge. The captured output (2544, 6) has the same
# row count as pop, i.e. the outer join did not add any rows.
n1.shape
(2544, 6)
# Summarize dtypes and non-null counts per column. In the captured output,
# 'population' (2524) and 'state' / 'abbreviation' (2448) fall short of the
# 2544 total rows, so those columns contain missing values.
n1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state/region 2544 non-null object
1 ages 2544 non-null object
2 year 2544 non-null int64
3 population 2524 non-null float64
4 state 2448 non-null object
5 abbreviation 2448 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 139.1+ KB
# 'abbreviation' only repeats the code already held in 'state/region' for the
# matched rows, so remove that column from n1 in place.
n1.drop(columns='abbreviation', inplace=True)
# Preview the frame to confirm the column is gone.
n1.head()
|
state/region |
ages |
year |
population |
state |
| 0 |
AL |
under18 |
2012 |
1117489.0 |
Alabama |
| 1 |
AL |
total |
2012 |
4817528.0 |
Alabama |
| 2 |
AL |
under18 |
2010 |
1130966.0 |
Alabama |
| 3 |
AL |
total |
2010 |
4785570.0 |
Alabama |
| 4 |
AL |
under18 |
2011 |
1125763.0 |
Alabama |
查找空数据并填充
# Re-check the per-column non-null counts after dropping 'abbreviation' to
# locate the columns that still hold missing values (output not shown here).
n1.info()