这里我们会再一次的通过《利用Python进行数据分析》这本书的第五章部分,对Pandas做一些简单的介绍,主要都是比较naive的东西,不过对于初学者是很好的,通过这篇博文大家可以看到pandas许多有趣强大的数据表格处理能力;这里主要通过代码记录的形式记录下来了作者学习的过程,一些比较难懂的地方都做了一些说明,如果有问题欢迎批评指正~~~(*^__^*) 咯咯……
In [2]: import pandas
In [4]: from pandas import Series,DataFrame
In [5]: obj=Series([4,7,-5,3]) #一种基本数据结构Series
In [6]: obj
Out[6]:
0 4
1 7
2 -5
3 3
dtype: int64
In [7]: obj.values #访问里面的数值
Out[7]: array([ 4, 7, -5, 3], dtype=int64)
In [8]: obj.index #访问索引
Out[8]: Int64Index([0, 1, 2, 3], dtype='int64')
In [9]: obj2=Series([4,7,-5,3],index=['d','b','a','c'])
In [10]: obj2
Out[10]:
d 4
b 7
a -5
c 3
dtype: int64
In [11]: obj2.index
Out[11]: Index([u'd', u'b', u'a', u'c'], dtype='object')
In [12]: obj2['a']
Out[12]: -5
In [13]: obj2['d']=6 #修改数据
In [14]: obj2[['c','a','d']] #查看数据 两个中括号 给出列表的形式
Out[14]:
c 3
a -5
d 6
dtype: int64
In [15]: obj2
Out[15]:
d 6
b 7
a -5
c 3
dtype: int64
In [16]: obj2[obj2>0] #返回大于0的
Out[16]:
d 6
b 7
c 3
dtype: int64
In [17]: obj2*2
Out[17]:
d 12
b 14
a -10
c 6
dtype: int64
In [18]: import numpy as np
In [19]: np.exp(obj2) #自动的一个个处理
Out[19]:
d 403.428793
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
In [20]: 'b' in obj2
Out[20]: True
In [21]: 'e' in obj2
Out[21]: False
In [22]: sdata={'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
In [23]: obj3=Series(sdata) #将字典转化为pandas中的Series数据结构
In [24]: obj3
Out[24]:
Ohio 35000
Oregon 16000
Texas 71000
Utah 5000
dtype: int64
In [25]: states=['California','Ohio','Oregon','Texas']
In [28]: obj4=Series(sdata,index=states) #注意我们始终使用中括号或者说列表的形式
In [29]: obj4
Out[29]:
California NaN
Ohio 35000
Oregon 16000
Texas 71000
dtype: float64
In [31]: import pandas as pd
In [32]: pd.isnull(obj4) # 方法一
Out[32]:
California True
Ohio False
Oregon False
Texas False
dtype: bool
In [33]: pd.notnull(obj4)
Out[33]:
California False
Ohio True
Oregon True
Texas True
dtype: bool
In [34]: obj4.isnull() # 方法二
Out[34]:
California True
Ohio False
Oregon False
Texas False
dtype: bool
In [34]:
In [35]: obj3
Out[35]:
Ohio 35000
Oregon 16000
Texas 71000
Utah 5000
dtype: int64
In [36]: obj4
Out[36]:
California NaN
Ohio 35000
Oregon 16000
Texas 71000
dtype: float64
In [37]: obj3+obj4 #会自动对齐不同索引的数据
Out[37]:
California NaN
Ohio 70000
Oregon 32000
Texas 142000
Utah NaN
dtype: float64
In [38]: obj4.name='population' #赋名
In [39]: obj4.index.name='state'
In [40]: obj4
Out[40]:
state
California NaN
Ohio 35000
Oregon 16000
Texas 71000
Name: population, dtype: float64
In [41]: obj
Out[41]:
0 4
1 7
2 -5
3 3
dtype: int64
In [42]: obj.index=['Bob','Steve','jeff','Ryan'] #如何加入索引
In [43]: obj
Out[43]:
Bob 4
Steve 7
jeff -5
Ryan 3
dtype: int64
In [45]: data={'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
...: 'year':[2000,2001,2002,2001,2001],
...: 'pop':[1.5,1.7,3.6,2.4,2.9]}
In [46]: frame=DataFrame(data) #DataFrame数据结构
In [47]: frame
Out[47]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2001
In [49]: DataFrame(data,columns=['year','state','pop']) #按列要求显示
Out[49]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2001 Nevada 2.9
In [50]: frame2=DataFrame(data,columns=['year','state','pop','debt'],
...: index=['one','two','three','four','five'])
In [51]: frame2
Out[51]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2001 Nevada 2.9 NaN
In [53]: frame2.columns
Out[53]: Index([u'year', u'state', u'pop', u'debt'], dtype='object')
In [54]: frame2['state'] #获取方法一
Out[54]:
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
Name: state, dtype: object
In [55]: frame2.year #获取方法二
Out[55]:
one 2000
two 2001
three 2002
four 2001
five 2001
Name: year, dtype: int64
In [56]: frame2.ix['three'] # 行索引
Out[56]:
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
In [57]: frame2['debt']=16.5 # 通常的列索引方法
In [58]: frame2
Out[58]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2001 Nevada 2.9 16.5
In [59]: frame2['debt']=np.arange(5.) #给“debt”赋值
In [60]: frame2
Out[60]:
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2001 Nevada 2.9 4
In [61]: val=Series([-1.2,-1.5,-1.7],index=['two','four','five'])
In [62]: frame2['debt']=val # 自动匹配
In [63]: frame2
Out[63]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2001 Nevada 2.9 -1.7
In [64]: frame2['eastern']=frame2.state=='Ohio' # 集成了两种定位列的方法
In [65]: frame2
Out[65]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2001 Nevada 2.9 -1.7 False
In [66]: del frame2['eastern'] #删除列
In [67]: frame2.columns
Out[67]: Index([u'year', u'state', u'pop', u'debt'], dtype='object')
In [68]: pop={'Nevada':{2001:2.4,2002:2.9} #字典内嵌套字典
...: 'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
In [69]: frame3=DataFrame(pop) #转化为DataFrame
In [70]: frame3
Out[70]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
In [72]: frame3.T #转置
Out[72]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
In [73]: DataFrame(pop,index=[2001,2002,2003])
Out[73]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN
In [74]: frame3.Ohio[:-1]
Out[74]:
2000 1.5
2001 1.7
Name: Ohio, dtype: float64
In [77]: pdata={'Ohio':frame3['Ohio'][:-1],
...: 'Nevada':frame3['Nevada'][:2]} #默认state为列名
In [78]: pdata
Out[78]:
{'Nevada': 2000 NaN
2001 2.4
Name: Nevada, dtype: float64, 'Ohio': 2000 1.5
2001 1.7
Name: Ohio, dtype: float64}
In [80]: DataFrame(pdata)
Out[80]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
frame3
Out[82]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.index.name='year'; # 给索引起名字
frame3.columns.name='state';
frame3
Out[85]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.values
Out[86]:
array([[ nan, 1.5],
[ 2.4, 1.7],
[ 2.9, 3.6]])
frame2
Out[87]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2001 Nevada 2.9 -1.7
frame2.values
Out[88]:
array([[2000L, 'Ohio', 1.5, nan],
[2001L, 'Ohio', 1.7, -1.2],
[2002L, 'Ohio', 3.6, nan],
[2001L, 'Nevada', 2.4, -1.5],
[2001L, 'Nevada', 2.9, -1.7]], dtype=object)
obj=Series(range(3),index=['a','b','c'])
index=obj.index
index
Out[91]: Index([u'a', u'b', u'c'], dtype='object')
index[1:]
Out[92]: Index([u'b', u'c'], dtype='object')
index[1]='d' #index对象是不能修改的
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-4d3850bb4691> in <module>()
----> 1 index[1]='d' #index对象是不能修改的
C:\Anaconda\lib\site-packages\pandas\core\base.pyc in _disabled(self, *args, **kwargs)
180 """This method will not function because object is immutable."""
181 raise TypeError("'%s' does not support mutable operations." %
--> 182 self.__class__)
183
184 __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
TypeError: '<class 'pandas.core.index.Index'>' does not support mutable operations.
index=pd.Index(np.arange(3))
obj2=Series([1.5,-2.5,0],index=index)
obj2.index is index
Out[96]: True
frame3
Out[97]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
'Ohio' in frame3.columns
Out[98]: True
2003 in frame3.index
Out[99]: False
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj
Out[101]:
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
obj2=obj.reindex(['a','b','c','d','e'])
obj2 #重新定义索引
Out[103]:
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj.reindex(['a','b','c','d','e'],fill_value=0) #重新索引,插值处理
Out[105]:
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
obj3=Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill') #根据前项填充
Out[107]:
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
obj3.reindex(range(6),method='ffill') #前向填充
Out[108]:
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],
columns=['Ohio','Texas','California'])
frame
Out[110]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
frame2=frame.reindex(['a','b','c','d'])
frame2
Out[112]:
Ohio Texas California
a 0 1 2
b NaN NaN NaN
c 3 4 5
d 6 7 8
states=['Texas','Utah','California']
frame.reindex(columns=states)
Out[114]:
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
frame.reindex(index=['a','b','c','d'],method='ffill',
columns=states)
Out[115]:
Texas Utah California
a 1 NaN 2
b 1 NaN 2
c 4 NaN 5
d 7 NaN 8
frame.reindex(index=['a','b','c','d'],method='ffill',
columns=states)
Out[116]:
Texas Utah California
a 1 NaN 2
b 1 NaN 2
c 4 NaN 5
d 7 NaN 8
frame.ix[['a','b','c','d'],states] <span style="font-family: Arial, Helvetica, sans-serif;">#可以同时对行列进行索引 </span>
Out[117]:
Texas Utah California
a 1 NaN 2
b NaN NaN NaN
c 4 NaN 5
d 7 NaN 8
obj=Series(np.arange(5.),index=['a','b','c','d','e'])
new_obj=obj.drop('c')
new_obj
Out[120]:
a 0
b 1
d 3
e 4
dtype: float64
obj.drop(['d','c']) #丢弃指定轴上的项
Out[121]:
a 0
b 1
e 4
dtype: float64
data=DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
Out[123]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15
data.drop('two',axis=1)
Out[124]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
data.drop(['two','four'],axis=1)
Out[125]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
#索引、选取和过滤
obj=Series(np.arange(4.),index=['a','b','c','d'])
obj
Out[128]:
a 0
b 1
c 2
d 3
dtype: float64
obj['b']
Out[129]: 1.0
obj[1]
Out[130]: 1.0
obj[2:4]
Out[131]:
c 2
d 3
dtype: float64
obj[['b','a','d']]
Out[132]:
b 1
a 0
d 3
dtype: float64
obj[[1,3]]
Out[133]:
b 1
d 3
dtype: float64 # 取行的方法
obj[obj<2]
Out[134]:
a 0
b 1
dtype: float64
obj['b':'c'] #利用标签的切片运算其末端是包含的
Out[135]:
b 1
c 2
dtype: float64
obj['b':'c']=5
obj
Out[137]:
a 0
b 5
c 5
d 3
dtype: float64
data=DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
data
Out[139]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data['two']
Out[140]:
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int32
data[['three','one']] #以列表的形式告诉要获取的哪些数据
Out[141]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
data[:2]
Out[142]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
data[data['three']>5] # 按条件截取
Out[143]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data[data<5]=0
data
Out[145]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data.ix['Colorado',['two','three']]
Out[146]:
two 5
three 6
Name: Colorado, dtype: int32
data.ix[['Colorado','Utah'],[3,0,1]] # ix注明了行与列
Out[147]:
four one two
Colorado 7 0 5
Utah 11 8 9
data.ix[2]
Out[148]:
one 8
two 9
three 10
four 11
Name: Utah, dtype: int32
data.ix[:'Utah','two']
Out[149]:
Ohio 0
Colorado 5
Utah 9
Name: two, dtype: int32
# ix直接同时定位行与列
data.ix[data.three>5,:3]
Out[151]:
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14
#算术运算和数据对齐
s1=Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2=Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1+s2
Out[156]:
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),
index=['Ohio','Texas','Colorado'])
df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),
index=['Utah','Ohio','Texas','Oregon'])
df1
Out[159]:
b c d
Ohio 0 1 2
Texas 3 4 5
Colorado 6 7 8
df2
Out[160]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
df1+df2
Out[161]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3 NaN 6 NaN
Oregon NaN NaN NaN NaN
Texas 9 NaN 12 NaN
Utah NaN NaN NaN NaN
df1=DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2=DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
df1
Out[164]:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df2
Out[165]:
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
df1+df2
Out[166]:
a b c d e
0 0 2 4 6 NaN
1 9 11 13 15 NaN
2 18 20 22 24 NaN
3 NaN NaN NaN NaN NaN
df1.add(df2,fill_value=0)
Out[167]:
a b c d e
0 0 2 4 6 4
1 9 11 13 15 9
2 18 20 22 24 14
3 15 16 17 18 19
df1.reindex(columns=df2.columns,fill_value=0) # df1重索引
Out[168]:
a b c d e
0 0 1 2 3 0
1 4 5 6 7 0
2 8 9 10 11 0
#DataFrame与Series之间的运算
arr=np.arange(12.).reshape((3,4))
arr
Out[172]:
array([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]])
arr[0]
Out[173]: array([ 0., 1., 2., 3.])
arr-arr[0]
Out[174]:
array([[ 0., 0., 0., 0.],
[ 4., 4., 4., 4.],
[ 8., 8., 8., 8.]])
#广播
<pre name="code" class="python">frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])</span>
series=frame.ix[0]
frame
Out[178]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
series
Out[179]:
b 0
d 1
e 2
Name: Utah, dtype: float64
frame-series
#将Series的索引匹配到DataFrame的列 然后沿着行一直向下传播
Out[180]:
b d e
Utah 0 0 0
Ohio 3 3 3
Texas 6 6 6
Oregon 9 9 9
series2=Series(range(3),index=['b','e','f'])
frame+series2
#向下传播相加
Out[182]:
b d e f
Utah 0 NaN 3 NaN
Ohio 3 NaN 6 NaN
Texas 6 NaN 9 NaN
Oregon 9 NaN 12 NaN
series3=frame['d']
frame
Out[184]:
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
series3
Out[185]:
Utah 1
Ohio 4
Texas 7
Oregon 10
Name: d, dtype: float64
frame.sub(series3,axis=0)
Out[186]:
b d e
Utah -1 0 1
Ohio -1 0 1
Texas -1 0 1
<span style="font-family: Arial, Helvetica, sans-serif;">Oregon -1 0 1</span>
<span style="font-family: Arial, Helvetica, sans-serif;">#axis=0表示一列列的减</span>
<pre name="code" class="python">#函数应用和映射
frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
frame
Out[190]: b d e
Utah -1.755094 2.300596 2.196608
Ohio 1.342909 0.913712 0.345515
Texas -0.630440 -0.755394 -1.060220
Oregon -0.735883 -2.286866 1.428040
np.abs(frame)
Out[191]: b d e
Utah 1.755094 2.300596 2.196608
Ohio 1.342909 0.913712 0.345515
Texas 0.630440 0.755394 1.060220
Oregon 0.735883 2.286866 1.428040
f=lambda x:x.max()-x.min()
frame.apply(f)
Out[193]:
b 3.098004
d 4.587462
e 3.256828
dtype: float64
frame.apply(f,axis=1)
Out[194]:
Utah 4.055690
Ohio 0.997394
Texas 0.429779
Oregon 3.714906
dtype: float64
<pre name="code" class="python">def f(x):
return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)
Out[196]: b d e
min -1.755094 -2.286866 -1.060220
max 1.342909 2.300596 2.196608
#默认按列
format=lambda x:'%.2f' %x
frame.applymap(format)
Out[199]: b d e
Utah -1.76 2.30 2.20
Ohio 1.34 0.91 0.35
Texas -0.63 -0.76 -1.06
Oregon -0.74 -2.29 1.43
#格式化字符串
frame['e'].map(format)
Out[201]:
Utah 2.20
Ohio 0.35
Texas -1.06
Oregon 1.43
Name: e, dtype: object
obj=Series(range(4),index=['d','a','b','c'])
obj.sort_index()
Out[203]:
a 1
b 2
c 3
d 0
dtype: int64
frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()
Out[205]:
d a b c
one 4 5 6 7
three 0 1 2 3
frame.sort_index(axis=1)
Out[206]:
a b c d
three 1 2 3 0
one 5 6 7 4
#默认升序排序
frame.sort_index(axis=1,ascending=False)
Out[208]:
d c b a
three 0 3 2 1
one 4 7 6 5
obj=Series([4,7,-3,2])
# Series排序
obj.order()
Out[210]:
2 -3
3 2
0 4
1 7
dtype: int64
obj=Series([4,np.nan,7,np.nan,-3,2])
obj.order()
Out[212]:
4 -3
5 2
0 4
2 7
1 NaN
3 NaN
dtype: float64
frame=DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame
Out[214]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2
frame.sort_index(by='b')
Out[215]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7
frame.sort_index(by=['a','b'])
Out[216]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7
obj=Series([7,-5,7,4,2,0,4])
obj.rank()
Out[218]:
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
#rank通过分配一个平均排名来破坏平级关系
obj.rank(method='first')
Out[220]:
0 6
1 1
2 7
3 4
4 3
5 2
6 5
dtype: float64
#给出 以在原数据中出现的顺序 排名
frame=DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame
Out[223]:
a b c
0 0 4.3 -2.0
1 1 7.0 5.0
2 0 -3.0 8.0
3 1 2.0 -2.5
frame.rank(axis=1)
Out[224]: a b c
0 2 3 1
1 1 3 2
2 2 1 3
3 2 3 1
#按行给出排名
#带有重复值的轴索引
obj=Series(range(5),index=['a','a','b','b','c'])
obj
Out[228]:
a 0
a 1
b 2
b 3
c 4
dtype: int64
obj.index.is_unique
Out[229]: False
obj['a']
Out[230]:
a 0
a 1
dtype: int64
obj['c']
Out[231]: 4
<pre name="code" class="python">df=DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df
Out[233]: 0 1 2
a 0.163283 0.563385 -0.474972
a -0.988967 0.293592 -1.800715
b 0.517782 0.499097 0.289553
b -0.272931 2.072591 -0.352391
df.ix['b']
Out[234]:
0 1 2
b 0.517782 0.499097 0.289553
b -0.272931 2.072591 -0.352391
#汇总和计算描述统计
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
df=DataFrame([[1.4,np.nan],[7.1,-4.5],
[np.nan,np.nan],[0.75,-1.3]],
index=['a','b','c','d'],
columns=['one','two'])
df
Out[5]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum()
Out[6]:
one 9.25
two -5.80
dtype: float64
df.sum(axis=1)
Out[7]:
a 1.40
b 2.60
c NaN
d -0.55
dtype: float64
#按行求和
df.mean(axis=1,skipna=False)
Out[9]:
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
df.idxmax() #间接索引
Out[10]:
one b
two d
dtype: object
df.cumsum() #累计求和
Out[11]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
df.describe()
Out[12]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
obj=Series(['a','a','b','c']*4)
obj
Out[14]:
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
obj.describe()
Out[15]:
count 16
unique 3
top a
freq 8
dtype: object
<pre name="code" class="python">import pandas.io.data as web
#相关系数与协方差
all_data={}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
for ticker in ['AAPL','IBM','MSFT','GOOG']:
all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.iteritems()})
volume=DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
returns=price.pct_change()
returns.tail()
Out[22]: AAPL GOOG IBM MSFT
Date
2009-12-24 0.034339 0.011117 0.004385 0.002587
2009-12-28 0.012294 0.007098 0.013326 0.005484
2009-12-29 -0.011861 -0.005571 -0.003477 0.007058
2009-12-30 0.012147 0.005376 0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
<pre name="code" class="python">#corr方法用于计算两个Series中对齐的值的相关系数,cov用于计算协方差
returns.MSFT.corr(returns.IBM)
Out[24]:
0.49597968454928959
returns.MSFT.cov(returns.IBM)
Out[25]:
0.00021595764843398299
returns.corr() #相关系数矩阵
Out[26]: <span style="font-family: Arial, Helvetica, sans-serif;">AAPL GOOG IBM MSFT</span>
AAPL 1.000000 0.470676 0.410011 0.424305
GOOG 0.470676 1.000000 0.390689 0.443587
IBM 0.410011 0.390689 1.000000 0.495980
MSFT 0.424305 0.443587 0.495980 1.000000
returns.cov() #协方差矩阵
Out[27]: AAPL GOOG IBM MSFT
AAPL 0.001027 0.000303 0.000252 0.000309
GOOG 0.000303 0.000580 0.000142 0.000205
IBM 0.000252 0.000142 0.000367 0.000216
MSFT 0.000309 0.000205 0.000216 0.000516
returns.corrwith(returns.IBM)
Out[28]:
AAPL 0.410011
GOOG 0.390689
IBM 1.000000
MSFT 0.495980
dtype: float64
returns.corrwith(volume) #百分比变化与成交量的相关系数
Out[29]:
AAPL -0.057549
GOOG 0.062647
IBM -0.007892
MSFT -0.014245
dtype: float64
obj=Series(['c','a','d','a','a','b','b','c','c'])
uniques=obj.unique()
uniques
Out[32]:
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts() #计数个数
Out[33]:
c 3
a 3
b 2
d 1
dtype: int64
pd.value_counts(obj.values,sort=False)
Out[34]:
a 3
c 3
b 2
d 1
dtype: int64
mask=obj.isin(['b','c'])
mask
Out[36]:
0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
dtype: bool
obj[mask]
Out[37]:
0 c
5 b
6 b
7 c
8 c
dtype: object
#柱形统计
data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
data
Out[40]: Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
result=data.apply(pd.value_counts).fillna(0) #计数12345各自的个数,
result
Out[42]:
Qu1 Qu2 Qu3
1 1 1 1
2 0 2 1
3 2 2 0
4 2 0 2
5 0 0 1
#处理缺失数据
string_data=Series(['aardvark','artichoke',np.nan,'avocado'])
string_data
Out[46]:
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
string_data.isnull()
Out[47]:
0 False
1 False
2 True
3 False
dtype: bool
<pre name="code" class="python">string_data[0]=None
string_data.isnull()
Out[49]:
0 True
1 False
2 True
3 False
dtype: bool
#滤除缺失数据
from numpy import nan as NA
data=Series([1,NA,3.5,NA,7])
data.dropna()
Out[53]:
0 1.0
2 3.5
4 7.0
dtype: float64
data[data.notnull()]
Out[54]:
0 1.0
2 3.5
4 7.0
dtype: float64
# DataFrame丢弃NA
data=DataFrame([[1.,6.5,3.],[1.,NA,NA],
[NA,NA,NA],[NA,6.5,3.]])
cleaned=data.dropna()
data
Out[58]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3
cleaned
Out[59]:
0 1 2
0 1 6.5 3
data.dropna(how='all') #丢弃全是NaN的
Out[60]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
3 NaN 6.5 3
data[4]=NA
data
Out[62]:
0 1 2 4
0 1 6.5 3 NaN
1 1 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3 NaN
data.dropna(axis=1,how='all')
Out[63]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3
df=DataFrame(np.random.randn(7,3))
df.ix[:4,1]=NA
df.ix[:2,2]=NA
df
Out[67]:
0 1 2
0 1.001896 NaN NaN
1 0.157002 NaN NaN
2 -0.531478 NaN NaN
3 0.553903 NaN -0.854074
4 0.902825 NaN 1.355097
5 -0.372478 -0.840326 0.586763
6 0.728256 -1.042467 -0.705461
df.dropna(thresh=3)
Out[68]:
0 1 2
5 -0.372478 -0.840326 0.586763
6 0.728256 -1.042467 -0.705461
#填充缺失数据
df.fillna(0)
Out[70]:
0 1 2
0 1.001896 0.000000 0.000000
1 0.157002 0.000000 0.000000
2 -0.531478 0.000000 0.000000
3 0.553903 0.000000 -0.854074
4 0.902825 0.000000 1.355097
5 -0.372478 -0.840326 0.586763
6 0.728256 -1.042467 -0.705461
df.fillna({1:0.5,3:-1}) #对不同的列进行填充
Out[71]:
0 1 2
0 1.001896 0.500000 NaN
1 0.157002 0.500000 NaN
2 -0.531478 0.500000 NaN
3 0.553903 0.500000 -0.854074
4 0.902825 0.500000 1.355097
5 -0.372478 -0.840326 0.586763
6 0.728256 -1.042467 -0.705461
#对现有对象进行就地修改
df=df.fillna(0,inplace=True)
df
Out[74]:
0 1 2
0 1.001896 0.000000 0.000000
1 0.157002 0.000000 0.000000
2 -0.531478 0.000000 0.000000
3 0.553903 0.000000 -0.854074
4 0.902825 0.000000 1.355097
5 -0.372478 -0.840326 0.586763
6 0.728256 -1.042467 -0.705461
df=DataFrame(np.random.randn(6,3))
df.ix[2:,1]=NA
df.ix[4:,2]=NA
df
Out[78]:
0 1 2
0 -0.554156 0.197443 0.293045
1 0.694153 0.220753 0.397910
2 1.222762 NaN 1.914798
3 -0.330765 NaN -0.077081
4 0.693529 NaN NaN
5 -0.458041 NaN NaN
df.fillna(method='ffill')
Out[79]:
0 1 2
0 -0.554156 0.197443 0.293045
1 0.694153 0.220753 0.397910
2 1.222762 0.220753 1.914798
3 -0.330765 0.220753 -0.077081
4 0.693529 0.220753 -0.077081
5 -0.458041 0.220753 -0.077081
df.fillna(method='ffill',limit=2)
Out[80]:
0 1 2
0 -0.554156 0.197443 0.293045
1 0.694153 0.220753 0.397910
2 1.222762 0.220753 1.914798
3 -0.330765 0.220753 -0.077081
4 0.693529 NaN -0.077081
5 -0.458041 NaN -0.077081
11.5/3
Out[81]: 3.8333333333333335
data=Series([1.,NA,3.5,NA,7])
data.fillna(data.mean()) # 填充值为均值
Out[83]:
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
#层次化索引
data=Series(np.random.randn(10),
index=[['a','a','a','b','b','b','c','c','d','d'],
[1,2,3,1,2,3,1,2,2,3]])
data
Out[86]:
a 1 -0.736490
2 -0.159564
3 0.571495
b 1 -1.124832
2 -0.213716
3 0.780868
c 1 0.681764
2 -0.786278
d 2 -0.222474
3 1.339298
dtype: float64
#两层索引
data.index
Out[88]:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data['b']
Out[89]:
1 -1.124832
2 -0.213716
3 0.780868
dtype: float64
data['b':'c']
Out[90]:
b 1 -1.124832
2 -0.213716
3 0.780868
c 1 0.681764
2 -0.786278
dtype: float64
data.ix[['b','d']]
Out[91]:
b 1 -1.124832
2 -0.213716
3 0.780868
d 2 -0.222474
3 1.339298
dtype: float64
data[:,2] #在内层中选取
Out[92]:
a -0.159564
b -0.213716
c -0.786278
d -0.222474
dtype: float64
data.unstack() #被放到一个DataFrame当中
Out[93]:
1 2 3
a -0.736490 -0.159564 0.571495
b -1.124832 -0.213716 0.780868
c 0.681764 -0.786278 NaN
d NaN -0.222474 1.339298
data.unstack().stack()
Out[94]:
a 1 -0.736490
2 -0.159564
3 0.571495
b 1 -1.124832
2 -0.213716
3 0.780868
c 1 0.681764
2 -0.786278
d 2 -0.222474
3 1.339298
dtype: float64
#分层索引
frame=DataFrame(np.arange(12).reshape((4,3)),
index=[['a','a','b','b'],[1,2,1,2]],
columns=[['Ohio','Ohio','Colorado'],
['Green','Red','Green']])
frame
Out[97]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame
Out[100]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
frame['Ohio']
Out[101]:
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
frame.swaplevel('key1','key2')
Out[102]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
#交换
frame.sortlevel(1)
Out[104]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
frame.swaplevel(0,1).sortlevel(0)
Out[105]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
frame.sortlevel(1) #根据 key2进行了排序
Out[106]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
frame.swaplevel(0,1).sortlevel(0) # key1 key2 交换以后根据 key2进行排序
Out[107]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
#根据级别汇总统计
frame.sum(level='key2')
Out[109]:
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
frame
Out[110]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
frame.sum(level='color',axis=1)
Out[111]:
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10
#将DataFrame的行 列 交换索引
frame=DataFrame({'a':range(7),'b':range(7,0,-1),
'c':['one','one','one','two','two','two','two'],
'd':[0,1,2,0,1,2,3]})
frame
Out[114]:
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
frame2=frame.set_index(['c','d']) #以c d作为索引
frame2
Out[116]:
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
frame.set_index(['c','d'],drop=False)
Out[117]:
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
frame2.reset_index()
Out[118]:
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1