import pandas as pd
from pandas import Series, DataFrame
import numpy as np
Series基础
obj = pd.Series([4 , 7 , -5 , 3 ], index=['d' , 'b' , 'a' , 'c' ])
obj
d 4
b 7
a -5
c 3
dtype: int64
obj.name = 'Series Name'
obj.index.name = 'index'
obj
index
d 4
b 7
a -5
c 3
Name: Series Name, dtype: int64
print('左边的index:' , obj.index)
print('右边的values:' , obj.values)
print('索引a:' , obj['a' ])
print('判断:' , 'b' in obj)
print('\n索引b/d/c: \n' , obj[['b' ,'d' ,'c' ]])
print('\n大于0的:\n' ,obj[obj > 0 ])
左边的index: Index(['d', 'b', 'a', 'c'], dtype='object', name='index')
右边的values: [ 4 7 -5 3]
索引a: -5
判断: True
索引b/d/c:
index
b 7
d 4
c 3
Name: Series Name, dtype: int64
大于0的:
index
d 4
b 7
c 3
Name: Series Name, dtype: int64
obj[obj>0 ].isnull()
index
d False
b False
c False
Name: Series Name, dtype: bool
DataFrame基础
data = {'state' : ['Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' , 'Nevada' ],
'year' : [2000 , 2001 , 2002 , 2001 , 2002 , 2003 ],
'pop' : [1.5 , 1.7 , 3.6 , 2.4 , 2.9 , 3.2 ]}
frame = pd.DataFrame(data)
frame
pop state year 0 1.5 Ohio 2000 1 1.7 Ohio 2001 2 3.6 Ohio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002 5 3.2 Nevada 2003
df = pd.DataFrame(frame, columns=['year' , 'state' , 'pop' ])
df.head(3 )
year state pop 0 2000 Ohio 1.5 1 2001 Ohio 1.7 2 2002 Ohio 3.6
df.tail(3 )
year state pop 3 2001 Nevada 2.4 4 2002 Nevada 2.9 5 2003 Nevada 3.2
df2 = pd.DataFrame(data, columns=['year' , 'state' , 'pop' , 'debt' ],
index=['one' , 'two' , 'three' , 'four' , 'five' , 'six' ])
df2
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN six 2003 Nevada 3.2 NaN
df2['year' ].head(3 )
one 2000
two 2001
three 2002
Name: year, dtype: int64
df2.year
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
注意:df2[column]能应对任何列名,但df2.column的情况下,列名必须是有效的python变量名才行。
df2.loc['three' ]
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
df2.iloc[2 :5 ]
year state pop debt three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN
df2.iloc[::2 ]
year state pop debt one 2000 Ohio 1.5 NaN three 2002 Ohio 3.6 NaN five 2002 Nevada 2.9 NaN
df2['debt' ] = np.arange(6. )
df2
year state pop debt one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0 six 2003 Nevada 3.2 5.0
df2['eastern' ] = df2.state == 'Ohio'
df2
year state pop debt eastern one 2000 Ohio 1.5 0.0 True two 2001 Ohio 1.7 1.0 True three 2002 Ohio 3.6 2.0 True four 2001 Nevada 2.4 3.0 False five 2002 Nevada 2.9 4.0 False six 2003 Nevada 3.2 5.0 False
del df2['eastern' ]
df.columns
Index(['year', 'state', 'pop'], dtype='object')
df2.T
one two three four five six year 2000 2001 2002 2001 2002 2003 state Ohio Ohio Ohio Nevada Nevada Nevada pop 1.5 1.7 3.6 2.4 2.9 3.2 debt 0 1 2 3 4 5
df2.index.name = '索引名'
df2.columns.name = '行名'
df2
行名 year state pop debt 索引名 one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0 six 2003 Nevada 3.2 5.0
print('全部列名:' , df2.columns)
print('判断:' , 'debt' in df2.columns)
全部列名: Index(['year', 'state', 'pop', 'debt'], dtype='object', name='行名')
判断: True
print('全部索引:' ,df2.index)
print('索引切片:' ,df2.index[1 :3 ])
全部索引: Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object', name='索引名')
索引切片: Index(['two', 'three'], dtype='object', name='索引名')
df2.values
array([[2000, 'Ohio', 1.5, 0.0],
[2001, 'Ohio', 1.7, 1.0],
[2002, 'Ohio', 3.6, 2.0],
[2001, 'Nevada', 2.4, 3.0],
[2002, 'Nevada', 2.9, 4.0],
[2003, 'Nevada', 3.2, 5.0]], dtype=object)
与python里的set不同,pandas的index可以有重复的labels,在这种重复的标签中选择的话,会选中所有相同的标签。
dup_labels = pd.Index(['foo' , 'foo' , 'bar' , 'bar' ])
dup_labels
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')
Index还有一些方法和属性:
重复值
values = pd.Series(['apple' , 'orange' , 'apple' , 'apple' ] * 2 )
values
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
dtype: object
values.is_unique
False
pd.unique(values)
array(['apple', 'orange'], dtype=object)
pd.value_counts(values)
apple 6
orange 2
dtype: int64
values = pd.Series([0 , 1 , 0 , 0 ] * 2 )
dim = pd.Series(['apple' , 'orange' ])
dim.take(values)
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
汇总和描述性统计
df = pd.DataFrame([[1.4 , np.nan], [7.1 , -4.5 ],
[np.nan, np.nan], [0.75 , -1.3 ]],
index=['a' , 'b' , 'c' , 'd' ],
columns=['one' , 'two' ])
df
one two a 1.40 NaN b 7.10 -4.5 c NaN NaN d 0.75 -1.3
df.sum()
one 9.25
two -5.80
dtype: float64
df.sum(axis='columns' )
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
计算的时候,NA(即缺失值)会被除外,除非整个切片全是NA。我们可以用skipna来跳过计算NA:
df.mean(axis='columns' , skipna=False )
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
df.info()
df.describe()
one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000
obj = pd.Series(['a' , 'a' , 'b' , 'c' ] * 4 )
obj.describe()
count 16
unique 3
top a
freq 8
dtype: object
下表是一些描述和汇总统计数据:
相关性与协方差
price = pd.read_pickle('../examples/yahoo_price.pkl' )
price.head()
AAPL GOOG IBM MSFT Date 2010-01-04 27.990226 313.062468 113.304536 25.884104 2010-01-05 28.038618 311.683844 111.935822 25.892466 2010-01-06 27.592626 303.826685 111.208683 25.733566 2010-01-07 27.541619 296.753749 110.823732 25.465944 2010-01-08 27.724725 300.709808 111.935822 25.641571
returns = price.pct_change()
returns.tail()
AAPL GOOG IBM MSFT Date 2016-10-17 -0.000680 0.001837 0.002072 -0.003483 2016-10-18 -0.000681 0.019616 -0.026168 0.007690 2016-10-19 -0.002979 0.007846 0.003583 -0.002255 2016-10-20 -0.000512 -0.005652 0.001719 -0.004867 2016-10-21 -0.003930 0.003011 -0.012474 0.042096
returns.corr()
AAPL GOOG IBM MSFT AAPL 1.000000 0.407919 0.386817 0.389695 GOOG 0.407919 1.000000 0.405099 0.465919 IBM 0.386817 0.405099 1.000000 0.499764 MSFT 0.389695 0.465919 0.499764 1.000000
returns.corrwith(returns['IBM' ])
AAPL 0.386817
GOOG 0.405099
IBM 1.000000
MSFT 0.499764
dtype: float64
returns.cov()
AAPL GOOG IBM MSFT AAPL 0.000277 0.000107 0.000078 0.000095 GOOG 0.000107 0.000251 0.000078 0.000108 IBM 0.000078 0.000078 0.000146 0.000089 MSFT 0.000095 0.000108 0.000089 0.000215
唯一值,值计数,会员
obj = pd.Series(['c' , 'a' , 'd' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' ])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()
c 3
a 3
b 2
d 1
dtype: int64
pd.value_counts(obj.values, sort=False )
b 2
a 3
c 3
d 1
dtype: int64
mask = obj.isin(['b' , 'c' ])
mask
0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
dtype: bool
obj[obj.isin(['a' , 'd' ])]
1 a
2 d
3 a
4 a
dtype: object
a = pd.Series(['c' , 'a' , 'b' , 'b' , 'c' , 'a' ])
b = pd.Series(['c' , 'b' , 'a' ])
pd.Index(b).get_indexer(a)
array([0, 2, 1, 1, 0, 2], dtype=int64)
主要功能
1、Reindexing(重新索引)
Series中应用
obj = pd.Series([4.5 , 7.2 , -5.3 , 3.6 ], index=['d' , 'b' , 'a' , 'c' ])
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
obj2 = obj.reindex(['a' , 'b' , 'c' , 'd' , 'e' ])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj3 = pd.Series(['bule' , 'purple' , 'yellow' ], index=[0 , 2 , 4 ])
obj3
0 bule
2 purple
4 yellow
dtype: object
obj3.reindex(range(6 ), method='ffill' )
0 bule
1 bule
2 purple
3 purple
4 yellow
5 yellow
dtype: object
对于DataFrame,reindex能更改row index,或column index。reindex the rows
df = pd.DataFrame(np.arange(9 ).reshape(3 , 3 ),
index=['a' , 'c' , 'd' ],
columns=['Ohio' , 'Texas' , 'California' ])
df
Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8
df2 = df.reindex(index=['a' , 'b' , 'c' , 'd' ], columns=['Texas' , 'Utah' , 'California' ], fill_value=0 )
df2
Texas Utah California a 1 0 2 b 0 0 0 c 4 0 5 d 7 0 8
df.loc[['a' , 'b' , 'c' , 'd' ], ['Texas' , 'Utah' , 'California' ]]
Texas Utah California a 1.0 NaN 2.0 b NaN NaN NaN c 4.0 NaN 5.0 d 7.0 NaN 8.0
2、drop() 按轴删除记录
对于series,drop回返回一个新的object,并删去你制定的axis的值
obj = pd.Series(np.arange(5. ), index=['a' , 'b' , 'c' , 'd' , 'e' ])
print('删除后:\n' , obj.drop(['b' ,'d' ]))
print('原来的:\n' , obj)
删除后:
a 0.0
c 2.0
e 4.0
dtype: float64
原来的:
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
对于DataFrame,index能按行或列的axis来删除:
df = pd.DataFrame(np.arange(16 ).reshape(4 , 4 ),
index=['Ohio' , 'Colorado' , 'Utah' , 'New York' ],
columns=['one' , 'two' , 'three' , 'four' ])
df
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
df.drop(['Colorado' , 'Ohio' ])
one two three four Utah 8 9 10 11 New York 12 13 14 15
df.drop('two' , axis=1 )
one three four Ohio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15
df.drop(['one' ,'four' ], axis='columns' , inplace=True )
df
two three Ohio 1 2 Colorado 5 6 Utah 9 10 New York 13 14
3、Indexing, Selection, and Filtering(索引,选择,过滤)
data = pd.DataFrame(np.arange(16 ).reshape((4 , 4 )),
index=['Ohio' , 'Colorado' , 'Utah' , 'New York' ],
columns=['one' , 'two' , 'three' , 'four' ])
data
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data[data['three' ] > 5 ]
one two three four Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data[data < 5 ] = 0
data
one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data.loc['Colorado' , ['two' , 'three' ]]
two 5
three 6
Name: Colorado, dtype: int32
data.iloc[2 , [3 , 0 , 1 ]]
four 11
one 8
two 9
Name: Utah, dtype: int32
data.iloc[:, :3 ][data.three > 5 ]
one two three Colorado 0 5 6 Utah 8 9 10 New York 12 13 14
pandas中有很多用于选择和重新选择数据的方法:
ser2 = pd.Series(np.arange(3. ), index=['a' , 'b' , 'c' ])
ser2
a 0.0
b 1.0
c 2.0
dtype: float64
4、算数和数据对齐
df1 = pd.DataFrame(np.arange(12. ).reshape((3 , 4 )), columns=list('abcd' ))
df2 = pd.DataFrame(np.arange(20. ).reshape((4 , 5 )), columns=list('abcde' ))
df2
a b c d e 0 0.0 1.0 2.0 3.0 4.0 1 5.0 6.0 7.0 8.0 9.0 2 10.0 11.0 12.0 13.0 14.0 3 15.0 16.0 17.0 18.0 19.0
df1
a b c d 0 0.0 1.0 2.0 3.0 1 4.0 5.0 6.0 7.0 2 8.0 9.0 10.0 11.0
df2.loc[1 , 'b' ] = np.nan
df2
a b c d e 0 0.0 1.0 2.0 3.0 4.0 1 5.0 NaN 7.0 8.0 9.0 2 10.0 11.0 12.0 13.0 14.0 3 15.0 16.0 17.0 18.0 19.0
df1 + df2
a b c d e 0 0.0 2.0 4.0 6.0 NaN 1 9.0 11.0 13.0 15.0 NaN 2 18.0 20.0 22.0 24.0 NaN 3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0 )
a b c d e 0 0.0 2.0 4.0 6.0 4.0 1 9.0 11.0 13.0 15.0 9.0 2 18.0 20.0 22.0 24.0 14.0 3 15.0 16.0 17.0 18.0 19.0
下表中就有很多这样灵活的算数方法:
每一个都有一个配对的,以 r 开头,意思是反转:
1 / df1
a b c d 0 inf 1.000000 0.500000 0.333333 1 0.250000 0.200000 0.166667 0.142857 2 0.125000 0.111111 0.100000 0.090909
df1.rdiv(1 )
a b c d 0 inf 1.000000 0.500000 0.333333 1 0.250000 0.200000 0.166667 0.142857 2 0.125000 0.111111 0.100000 0.090909
5、DataFrame和Series之间的操作
df = pd.DataFrame(np.arange(12. ).reshape((4 , 3 )),
columns=list('bde' ),
index=['Utah' , 'Ohio' , 'Texas' , 'Oregon' ])
series = df.iloc[0 ]
df
b d e Utah 0.0 1.0 2.0 Ohio 3.0 4.0 5.0 Texas 6.0 7.0 8.0 Oregon 9.0 10.0 11.0
series
b 0.0
d 1.0
e 2.0
Name: Utah, dtype: float64
df - series
b d e Utah 0.0 0.0 0.0 Ohio 3.0 3.0 3.0 Texas 6.0 6.0 6.0 Oregon 9.0 9.0 9.0
df.sub(series, axis='columns' )
b d e Utah 0.0 0.0 0.0 Ohio 3.0 3.0 3.0 Texas 6.0 6.0 6.0 Oregon 9.0 9.0 9.0
如果想要广播列,去匹配行,必须要用到算数方法:
series2 = df['d' ]
series2
Utah 1.0
Ohio 4.0
Texas 7.0
Oregon 10.0
Name: d, dtype: float64
df.sub(series2, axis=0 )
b d e Utah -1.0 0.0 1.0 Ohio -1.0 0.0 1.0 Texas -1.0 0.0 1.0 Oregon -1.0 0.0 1.0
6、DataFrame转为Numpy数组
要想把一个DataFrame变为Numpy数组,使用.values属性
data = pd.DataFrame({'x0' : [1 , 2 , 3 , 4 , 5 ],
'x1' : [0.01 , -0.01 , 0.25 , -4.1 , 0. ],
'y' : [-1.5 , 0. , 3.6 , 1.3 , -2. ]})
data
x0 x1 y 0 1 0.01 -1.5 1 2 -0.01 0.0 2 3 0.25 3.6 3 4 -4.10 1.3 4 5 0.00 -2.0
data.values
array([[ 1. , 0.01, -1.5 ],
[ 2. , -0.01, 0. ],
[ 3. , 0.25, 3.6 ],
[ 4. , -4.1 , 1.3 ],
[ 5. , 0. , -2. ]])
7、函数应用和映射
numpy的ufuncs(element-wise数组方法)也能用在pandas的object上
frame = pd.DataFrame(np.random.randn(4 , 3 ), columns=list('bde' ),
index=['Utah' , 'Ohio' , 'Texas' , 'Oregon' ])
frame
b d e Utah -1.062629 -0.501572 -1.797577 Ohio -0.143117 -0.177578 -0.738473 Texas 0.317834 -0.632825 1.258225 Oregon 2.541097 -0.951197 1.021166
np.abs(frame)
b d e Utah 1.062629 0.501572 1.797577 Ohio 0.143117 0.177578 0.738473 Texas 0.317834 0.632825 1.258225 Oregon 2.541097 0.951197 1.021166
另一个常用的操作是把一个用在一维数组上的函数,应用在一行或一列上。要用到DataFrame中的apply函数:
f = lambda x: x.max() - x.min()
frame.apply(f, axis=0 )
b 3.603726
d 0.773618
e 3.055802
dtype: float64
frame.apply(f, axis=1 )
Utah 1.296006
Ohio 0.595356
Texas 1.891050
Oregon 3.492293
dtype: float64
像是sum, mean这样的数组统计方法,DataFrame中已经集成了,所以没必要用apply。
apply不会返回标量,只会返回一个含有多个值的series:
def f (x) :
return pd.Series([x.min(), x.max()], index=['min' , 'max' ])
frame
b d e Utah -1.062629 -0.501572 -1.797577 Ohio -0.143117 -0.177578 -0.738473 Texas 0.317834 -0.632825 1.258225 Oregon 2.541097 -0.951197 1.021166
frame.apply(f)
b d e min -1.062629 -0.951197 -1.797577 max 2.541097 -0.177578 1.258225
element-wise的python函数也能用。假设想要格式化frame中的浮点数,变为string。可以用apply map:
format = lambda x: '%.2f' % x
frame.applymap(format)
b d e Utah -1.06 -0.50 -1.80 Ohio -0.14 -0.18 -0.74 Texas 0.32 -0.63 1.26 Oregon 2.54 -0.95 1.02
applymap的做法是,series有一个map函数,能用来实现element-wise函数:
frame['e' ].map(format)
Utah -1.80
Ohio -0.74
Texas 1.26
Oregon 1.02
Name: e, dtype: object
8、排序
按row或column index来排序的话,可以用sort_index方法,会返回一个新的object
(1)、sort_index 按索引值排序
obj = pd.Series(range(4 ), index=['d' , 'a' , 'b' , 'c' ])
obj.sort_index()
a 1
b 2
c 3
d 0
dtype: int64
在DataFrame,可以用index或其他axis来排序
frame = pd.DataFrame(np.arange(8 ).reshape((2 , 4 )),
index=['three' , 'one' ],
columns=['d' , 'a' , 'b' , 'c' ])
frame
frame.sort_index()
frame.sort_index(axis=1 , ascending=False )
(2)、按values值排序
frame.sort_values(by='three' , axis=1 , ascending=True )
(3)、rank()
ranking(排名)是给有效的数据分配数字。rank方法能用于series和DataFrame,rank方法默认会给每个group一个mean rank(平均排名)。rank 表示在这个数在原来的Series中排第几名,有相同的数,取其排名平均(默认)作为值
obj = pd.Series([7 , -5 , 7 , 4 , 2 , 0 , 4 ])
obj
0 7
1 -5
2 7
3 4
4 2
5 0
6 4
dtype: int64
obj.sort_values()
1 -5
5 0
4 2
3 4
6 4
0 7
2 7
dtype: int64
obj.rank()
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
在obj中,4和4的排名是第4名和第五名,取平均得4.5。7和7的排名分别是第六名和第七名,则其排名取平均得6.5。
rank也可以根据数据被观测到的顺序来设定:
obj.rank(method='first' )
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
这里没有给0和2(指两个数字7)赋予average rank 6.5,而是给第一个看到的7(label 0)设置rank为6,第二个看到的7(label 2)设置rank为7。
也可以设置降序:
obj.rank(ascending=False , method='max' )
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
dataframe 可以根据行或列来计算rank:
frame = pd.DataFrame({'b' : [4.3 , 7 , -3 , 2 ],
'a' : [0 , 1 , 0 , 1 ],
'c' : [-2 , 5 , 8 , -2.5 ]})
frame
a b c 0 0 4.3 -2.0 1 1 7.0 5.0 2 0 -3.0 8.0 3 1 2.0 -2.5
frame.rank(axis='columns' )
a b c 0 2.0 3.0 1.0 1 1.0 3.0 2.0 2 2.0 1.0 3.0 3 2.0 3.0 1.0
9、有重复label的轴索引
我们看到的所有例子都有unique axis labels(index values),唯一的轴标签(索引值)。一些pandas函数(reindex),需要label是唯一的,但这并是不强制的。比如下面有一个重复的索引:
obj = pd.Series(range(5 ), index=['a' , 'a' , 'b' , 'b' , 'c' ])
obj
a 0
a 1
b 2
b 3
c 4
dtype: int64
obj.index.is_unique
False
数据选择对于重复label则表现有点不同。如果一个label有多个值,那么就会返回一个series, 如果是label只对应一个值的话,会返回一个标量:
obj['a' ]
a 0
a 1
dtype: int64
这个选择的逻辑也应用于DataFrame:
df = pd.DataFrame(np.random.randn(4 , 3 ), index=['a' , 'a' , 'b' , 'b' ])
df
0 1 2 a -1.241237 -1.036060 0.943747 a -0.429217 -1.029268 1.979792 b 0.095657 -0.876478 0.096081 b 0.034921 0.703786 0.293833
df.loc['b' ]
0 1 2 b 0.095657 -0.876478 0.096081 b 0.034921 0.703786 0.293833