Series
基本特征:类似一维数组的对象,由数据和索引组成
相当于定长有序的字典,index和values相互独立
import pandas as pd
from pandas import Series
a=pd.Series([1,2.0,'a'])
print(a)
0 1
1 2
2 a
dtype: object
自定义Series的index
>>> #指定Series的索引
>>> b=pd.Series(['aa','bb','cc'],index=[1,2,3])
>>> b
1 aa
2 bb
3 cc
dtype: object
>>> b.index
Int64Index([1, 2, 3], dtype='int64')
>>> b.values
array(['aa', 'bb', 'cc'], dtype=object)
Series的数据对齐功能
判断哪些索引值没有在字典里
>>> data={'aa':'92','bb':'99','cc':'96'}
>>> sinindex=['aa','bb','cc','dd']
>>> pd.Series(data,index=sinindex)
aa 92
bb 99
cc 96
dd NaN
dtype: object
>>> #判断哪些索引值是空的
>>> pd.isnull(pd.Series(data,index=sinindex))
aa False
bb False
cc False
dd True
dtype: bool
>>>
>>> #在算术运算中自动对齐不同索引的数据
>>> aSer=pd.Series(data,index=sinindex)
>>> aSer
aa 92
bb 99
cc 96
dd NaN
dtype: object
>>> bSer={'aa':'86','dd':'122','ee':'11'}
>>> cSer=pd.Series(bSer)
>>> aSer+cSer
aa 9286
bb NaN
cc NaN
dd NaN
ee NaN
dtype: object
Series的name属性
Series对象和索引具有name属性
>>> aSer
aa 92
bb 99
cc 96
dd NaN
dtype: object
>>> aSer.name='winycg'
>>> aSer.index.name='students'
>>> aSer
students
aa 92
bb 99
cc 96
dd NaN
Name: winycg, dtype: object
DataFrame
基本特征:
(1)含有一个有序的列
(2)一个表格型的数据结构
(3)大致可看成一个共享index的Series集合
创建DataFrame
>>> #1.用字典
>>> data={'name':['aa','bb','cc'],'pay':[4000,5000,6000]}
>>> frame=pd.DataFrame(data)
>>> frame
name pay
0 aa 4000
1 bb 5000
2 cc 6000
>>> #上述索引为默认索引
>>> import numpy as np
>>> frame=pd.DataFrame(data,index=range(1,4),columns=['name','pay'])
>>> frame
name pay
1 aa 4000
2 bb 5000
3 cc 6000
>>> #索引
>>> frame.index
RangeIndex(start=1, stop=4, step=1)
>>> #字段名称
>>> frame.columns
Index(['name', 'pay'], dtype='object')
>>> frame.values
array([['aa', '4000'],
['bb', '5000'],
['cc', '6000']], dtype=object)
>>> #2.使用数组array创建
>>> import pandas as pd
>>> df1=pd.DataFrame(np.arange(12).reshape((3,4)))
>>> df1
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
>>> #上述为默认索引值和字段名
>>> df2=pd.DataFrame(np.arange(12).reshape((3,4)),index=np.arange(1,4),columns=['a','b','c','d'])
>>> df2
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> #上述为指定索引和字段名
>>>#产生index为时间序列
>>> dates=pd.date_range('20170520',periods=7)
>>> dates
DatetimeIndex(['2017-05-20', '2017-05-21', '2017-05-22', '2017-05-23',
'2017-05-24', '2017-05-25', '2017-05-26'],
dtype='datetime64[ns]', freq='D')
>>> dates=pd.date_range('20170520',periods=3)
>>> dates
DatetimeIndex(['2017-05-20', '2017-05-21', '2017-05-22'], dtype='datetime64[ns]', freq='D')
>>> df=pd.DataFrame(np.arange(12).reshape((3,4)),index=dates)
>>> df
0 1 2 3
2017-05-20 0 1 2 3
2017-05-21 4 5 6 7
2017-05-22 8 9 10 11
取得DataFrame的行和列可以获得Series
>>> df
a b c d
x 0 1 2 3
y 4 5 6 7
z 8 9 10 11
>>> #选择行,以下标的形式,只能是序列
>>> df[:2]
a b c d
x 0 1 2 3
y 4 5 6 7
>>> df[0:1]
a b c d
x 0 1 2 3
>>>#选择行,以标签名的形式,只能是序列,包含最后一个元素
>>> df['x':'z']
a b c d
x 0 1 2 3
y 4 5 6 7
z 8 9 10 11
>>>#------------------------------------------------------------
>>> #选择列,若是一列则类型为Series
>>> df.a
x 0
y 4
z 8
Name: a, dtype: int32
>>> df['a']
x 0
y 4
z 8
Name: a, dtype: int32
>>> df[['a','b']]
a b
x 0 1
y 4 5
z 8 9
>>>#-------------------------------------------------------------
>>>用标签选择:loc
>>>#选择行
>>> df.loc['x']
a 0
b 1
c 2
d 3
Name: x, dtype: int32
>>> type(df.loc['x'])
<class 'pandas.core.series.Series'
>>>#上述为Series类型
>>>#选择行和列
>>> df.loc[['x','y'],['a','b']]
a b
x 0 1
y 4 5
>>> type(df.loc[['x','y'],['a','b']])
<class 'pandas.core.frame.DataFrame'>
>>> df.loc['x',['a','b']]
a 0
b 1
Name: x, dtype: int32
>>> type(df.loc['x',['a','b']])
<class 'pandas.core.series.Series'>
>>>#-------------------------------------------------------------
>>>#用位置选择(和numpy的二维array切片操作类似):
>>> df.iloc[1:3,2:4]
c d
y 6 7
z 10 11
>>> #间隔行输出
>>> df.iloc[[0,2],2]
x 2
z 10
Name: c, dtype: int32
>>> df.iloc[:,1]
x 1
y 5
z 9
Name: b, dtype: int32
>>> df.iloc[1,:]
a 4
b 5
c 6
d 7
Name: y, dtype: int32
>>> df.iloc[1,1]
5
>>>#-------------------------------------------------------------
>>> #混合标签和位置:ix
>>>#选择行,可以转化为Series格式
>>> df.ix[1]
a 4
b 5
c 6
d 7
Name: y, dtype: int32
>>> df.ix[:2,['a','b']]
a b
x 0 1
y 4 5
>>> df.ix[1:2]
a b c d
y 4 5 6 7
>>>#--------------------------------------------------------------
>>>使用布尔参数
>>> df>5
a b c d
x False False False False
y False False True True
z True True True True
>>> df[df>5]=99
>>> df
a b c d
x 0 1 2 3
y 4 5 99 99
z 99 99 99 99
DataFrame对象的添加、删除和修改
>>> dff
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>>#修改某一列
>>> dff['a']=5
>>> dff
a b c d
1 5 1 2 3
2 5 5 6 7
3 5 9 10 11
>>> # 使用apply+lambda对某一列进行修改
>>> dff.b.apply(lambda x: x+1)
>>> dff
a b c d
1 5 1 2 4
2 5 5 6 8
3 5 9 10 12
>>> #添加某个列
>>> data=[2,3,10]
>>> dff['e']=data
>>> dff
a b c e
1 5 1 2 2
2 5 5 6 3
3 5 9 10 10
>>> #删除某个列
>>> del dff['d']
>>> dff
a b c
1 5 1 2
2 5 5 6
3 5 9 10
DataFrame的排序功能
>>> df2
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> df2.sort_index(axis=1,ascending=False)
d c b a
1 3 2 1 0
2 7 6 5 4
3 11 10 9 8
>>> #对columns进行排序
>>> df2.sort_index(axis=0,ascending=False)
a b c d
3 8 9 10 11
2 4 5 6 7
1 0 1 2 3
>>> #对index进行排序
>>> df2.sort_values(by='b')
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> #对b列的数据进行排序
DataFrame统计和筛选
>>> df
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> # 查看某一列的属性值以及对应的数量
>>> df.a.value_counts()
3 1
11 1
7 1
Name: 3, dtype: int64
>>> #找寻最小值
>>> df.a.min()
0
>>>#求列的平均值
>>> df.a.mean()
4.0
>>>#-----------------------------------------------
>>> #选择符合条件的行
>>> df[df.a>=4]
a b c d
2 4 5 6 7
3 8 9 10 11
>>>#找寻index在(1,3)之间的行
>>> df[(df.index>1) & (df.index<3)]
a b c d
2 4 5 6 7
Grouping分组
>>> df=pd.DataFrame({'a':[1,2,2,3,3,4],'b':[2,3,4,4,1,4],'c':[2,3,2,2,3,8]})
>>> df
a b c
0 1 2 2
1 2 3 3
2 2 4 2
3 3 4 2
4 3 1 3
5 4 4 8
>>>#按照某一列进行分组,显示每个组内行的个数
>>> df.groupby('a').count()
b c
a
1 1 1
2 2 2
3 2 2
4 1 1
>>> df.groupby('b').count()
a c
b
1 1 1
2 1 1
3 1 1
4 3 3
>>>#----------------------------------------------------------
>>> #对组内的每一列求和
>>> df.groupby('a').sum()
b c
a
1 2 2
2 7 5
3 5 5
4 4 8
>>> #显示组内某一列的和
>>> df.groupby('a').b.sum()
a
1 2
2 7
3 5
4 4
Name: b, dtype: int64
合并
方法1:Append
>>> df
a b c
0 1 2 2
1 2 3 3
2 2 4 2
3 3 4 2
4 3 1 3
5 4 4 8
>>> dfx=pd.DataFrame(np.arange(6).reshape((2,3)),columns=['a','b','c'])
>>> dfx
a b c
0 0 1 2
1 3 4 5
>>>#合并DataFrame
>>> df.append(dfx)
a b c
0 1 2 2
1 2 3 3
2 2 4 2
3 3 4 2
4 3 1 3
5 4 4 8
0 0 1 2
1 3 4 5
>>> dfx=pd.DataFrame(np.arange(6).reshape((2,3)),columns=['Q','b','c'])
>>> df.append(dfx)
Q a b c
0 NaN 1.0 2 2
1 NaN 2.0 3 3
2 NaN 2.0 4 2
3 NaN 3.0 4 2
4 NaN 3.0 1 3
5 NaN 4.0 4 8
0 0.0 NaN 1 2
1 3.0 NaN 4 5
方法2.concat
>>> df=pd.DataFrame(np.arange(12).reshape((3,4)),index=
[1,2,3],columns=['a','b','c','d'])
>>> df
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> dfx=pd.DataFrame({'d':[3,7,6],'e':[1,2,3]})
>>> dfx
d e
0 3 1
1 7 2
2 6 3
>>>#-------------------------------------------------------
>>>#上下合并
>>> pd.concat([df,dfx])
a b c d e
1 0.0 1.0 2.0 3 NaN
2 4.0 5.0 6.0 7 NaN
3 8.0 9.0 10.0 11 NaN
0 NaN NaN NaN 3 1.0
1 NaN NaN NaN 7 2.0
2 NaN NaN NaN 6 3.0
>>>#重新设置默认坐标
>>> pd.concat([df,dfx],ignore_index=True)
a b c d e
0 0.0 1.0 2.0 3 NaN
1 4.0 5.0 6.0 7 NaN
2 8.0 9.0 10.0 11 NaN
3 NaN NaN NaN 3 1.0
4 NaN NaN NaN 7 2.0
>>>#内连接,只连接公共的列
>>> pd.concat([df,dfx],join='inner')
d
1 3
2 7
3 11
0 3
1 7
2 6
>>>#---------------------------------------------------------
>>>#左右合并
>>> pd.concat([df,dfx],axis=1)
a b c d d e
0 NaN NaN NaN NaN 3.0 1.0
1 0.0 1.0 2.0 3.0 7.0 2.0
2 4.0 5.0 6.0 7.0 6.0 3.0
3 8.0 9.0 10.0 11.0 NaN NaN
>>>#内连接,只连接公共的index
>>> pd.concat([df,dfx],axis=1,join='inner')
a b c d d e
1 0 1 2 3 7 2
2 4 5 6 7 6 3
>>>#----------------------------------------------------------
>>>#指定在某些轴进行索引
>>> pd.concat([df,dfx],axis=1,join_axes=[df.index])
a b c d d e
1 0 1 2 3 7.0 2.0
2 4 5 6 7 6.0 3.0
3 8 9 10 11 NaN NaN
方法3.类似于数据库中的连接
>>> df1=pd.DataFrame({'key':['a','b','c','e'],'data1':range(4)})
>>> df1
data1 key
0 0 a
1 1 b
2 2 c
3 3 e
>>> df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
>>> df2
data2 key
0 0 a
1 1 b
2 2 d
>>>#------------------------------------------------------------
>>>#df1和df2根据key进行内连接
>>> pd.merge(df1,df2,on='key')
data1 key data2
0 0 a 0
1 1 b 1
>>>#若要连接的列名不同,可以显式指定
>>> df3=pd.DataFrame({'key3':['a','b','c','e'],'data1':range(4)})
>>> df3
data1 key3
0 0 a
1 1 b
2 2 c
3 3 e
>>> df4=pd.DataFrame({'key4':['a','b','d'],'data2':range(3)})
>>> df4
data2 key4
0 0 a
1 1 b
2 2 d
>>> pd.merge(df3,df4,left_on='key3',right_on='key4')
data1 key3 data2 key4
0 0 a 0 a
1 1 b 1 b
>>>#-----------------------------------------------------------
>>>#外连接
>>> pd.merge(df1,df2,on='key',how='outer')
data1 key data2
0 0.0 a 0.0
1 1.0 b 1.0
2 2.0 c NaN
3 3.0 e NaN
4 NaN d 2.0
>>>#显示连接的情况
>>> pd.merge(df1,df2,on='key',how='outer',indicator=True)
data1 key data2 _merge
0 0.0 a 0.0 both
1 1.0 b 1.0 both
2 2.0 c NaN left_only
3 3.0 e NaN left_only
4 NaN d 2.0 right_only
>>>#-----------------------------------------------------------
>>>#利用多个键进行合并
>>> left_df=pd.DataFrame({'key1':['a','b','c'],'key2':['d','e','f'],'lval':[1,2,3]})
>>> left_df
key1 key2 lval
0 a d 1
1 b e 2
2 c f 3
>>> right_df=pd.DataFrame({'key1':['a','r','c'],'key2':['d','e','f'],'rval':[4,5,6]})
>>> right_df
key1 key2 rval
0 a d 4
1 r e 5
2 c f 6
>>> pd.merge(left_df,right_df,on=['key1','key2'])
key1 key2 lval rval
0 a d 1 4
1 c f 3 6
>>> pd.merge(left_df,right_df,on=['key1','key2'],how='outer')
key1 key2 lval rval
0 a d 1.0 4.0
1 b e 2.0 NaN
2 c f 3.0 6.0
3 r e NaN 5.0
>>>#----------------------------------------------------------
处理缺失数据
>>> dates=pd.date_range('20171101',periods=4)
>>> df=pd.DataFrame(np.arange(12).reshape((4,3)),index=dates,columns=['a','b','c'])
>>> df
a b c
2017-11-01 0 1 2
2017-11-02 3 4 5
2017-11-03 6 7 8
2017-11-04 9 10 11
>>> df.iloc[0,1]=np.nan
>>> df.iloc[1,2]=np.nan
>>> df
a b c
2017-11-01 0 NaN 2.0
2017-11-02 3 4.0 NaN
2017-11-03 6 7.0 8.0
2017-11-04 9 10.0 11.0
>>>#-----------------------------------------------------------
>>>#去除存在nan的行
>>> df.dropna()
a b c
2017-11-03 6 7.0 8.0
2017-11-04 9 10.0 11.0
>>>#去除存在nan的列
>>> df.dropna(axis=1)
a
2017-11-01 0
2017-11-02 3
2017-11-03 6
2017-11-04 9
>>>#去除全为nan的行
>>> df.dropna(how='all')
a b c
2017-11-01 0 NaN 2.0
2017-11-02 3 4.0 NaN
2017-11-03 6 7.0 8.0
2017-11-04 9 10.0 11.0
>>>#------------------------------------------------------------
>>>#填充缺失数据
>>> dfx=df.copy()
>>> dfx.fillna(value=0.0)
a b c
2017-11-01 0 0.0 2.0
2017-11-02 3 4.0 0.0
2017-11-03 6 7.0 8.0
2017-11-04 9 10.0 11.0
>>>#-------------------------------------------------------------
>>>#判断数据是否缺失
>>> df.isnull()
a b c
2017-11-01 False True False
2017-11-02 False False True
2017-11-03 False False False
2017-11-04 False False False
>>>#判断DataFrame是否存在数据缺失
>>> np.any(df.isnull())==True
True
>>> np.all(df.isnull())==True
False
pandas的绘图功能
pandas主要是基于Series和DataFrame的数据进行绘图
绘制一列数据:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df=pd.DataFrame(np.arange(12).reshape((3,4)),index=[1,2,3],columns=['a','b','c','d'])
print(df)
'''
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
'''
df.b.plot()
plt.show()
绘制柱形图:
df=pd.DataFrame(np.arange(12).reshape((3,4)),index=[1,2,3],columns=['a','b','c','d'])
print(df)
'''
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
'''
df.plot(kind='bar')
#绘制横向的柱形图
df.plot(kind='barh')
#产生堆积效果的柱状图
df.plot(kind='bar',stacked=True)
plt.show()
绘制饼状图
df.b.plot(kind='pie',autopct='%.2f')
plt.show()
将DataFrame数据存取到csv格式的文件
>>> df=pd.DataFrame(np.arange(12).reshape((3,4)),index=[1,2,3],columns=['a','b','c','d'])
>>> df
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>>#将df存入csv文件
>>>df.to_csv('df.csv')
>>>#忽略行标签(index),忽略列标签(header)
>>> df.to_csv('df.csv',index=False)
>>>#将csv文件中的DataFrame读取
>>>pd.read_csv('df.csv')
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
存取到excel格式文件
>>> df
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11
>>> df.to_excel('df.xlsx',sheet_name='df1')
>>> pd.read_excel('df.xlsx')
a b c d
1 0 1 2 3
2 4 5 6 7
3 8 9 10 11