numpy基础
import numpy as np
定义array
In [156]: np.ones(3)
Out[156]: array([1., 1., 1.])
In [157]: np.ones((3,5))
Out[157]:
array([[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]])
In [158]:
In [158]: np.zeros(4)
Out[158]: array([0., 0., 0., 0.])
In [159]: np.zeros((2,5))
Out[159]:
array([[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
In [160]:
In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
In [147]: print(a)
[[1 3 5 2]
[4 2 6 1]]
In [148]:
In [161]: np.arange(10)
Out[161]: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [162]: np.arange(3,13)
Out[162]: array([ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
In [163]: np.arange(3,13).reshape((2,5))
Out[163]:
array([[ 3, 4, 5, 6, 7],
[ 8, 9, 10, 11, 12]])
In [164]:
In [169]: np.arange(2,25,2)
Out[169]: array([ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24])
In [170]: np.arange(2,25,2).reshape(3,4)
Out[170]:
array([[ 2, 4, 6, 8],
[10, 12, 14, 16],
[18, 20, 22, 24]])
In [171]:
In [176]: np.linspace(1,10,4)
Out[176]: array([ 1., 4., 7., 10.])
In [177]:
array基本运算
In [7]: a = np.array([[1,2],[3,4]])
In [8]: b = np.arange(5,9).reshape((2,2))
In [10]: print(a)
[[1 2]
[3 4]]
In [11]: print(b)
[[5 6]
[7 8]]
In [12]:
In [12]: a+b
Out[12]:
array([[ 6, 8],
[10, 12]])
In [13]: a-b
Out[13]:
array([[-4, -4],
[-4, -4]])
In [14]: a*b # 对应元素相乘
Out[14]:
array([[ 5, 12],
[21, 32]])
In [17]: a/b
Out[17]:
array([[0, 0],
[0, 0]])
In [18]:
In [18]: a**2
Out[18]:
array([[ 1, 4],
[ 9, 16]])
In [19]:
In [15]: np.dot(a,b) # 矩阵乘法
Out[15]:
array([[19, 22],
[43, 50]])
In [16]: a.dot(b)
Out[16]:
array([[19, 22],
[43, 50]])
In [17]:
In [54]: print(a)
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
In [55]: np.sum(a)
Out[55]: 90
In [56]: np.min(a)
Out[56]: 2
In [57]: np.max(a)
Out[57]: 13
In [58]:
In [58]: np.sum(a,axis=1)
Out[58]: array([14, 30, 46])
In [59]: np.sum(a,axis=0)
Out[59]: array([18, 21, 24, 27])
In [60]:
# 三角函数结合random生成一组随机数据
In [74]: N = 10
In [75]: t = np.linspace(0, 2*np.pi, N)
In [76]: print(t)
[0. 0.6981317 1.3962634 2.0943951 2.7925268 3.4906585
4.1887902 4.88692191 5.58505361 6.28318531]
In [77]: y = np.sin(t) + 0.02*np.random.randn(N)
In [78]: print(y)
[-0.00947902 0.64196198 0.96567468 0.89394571 0.33830193 -0.3015316
-0.86943758 -0.95954123 -0.62526393 0.02872202]
In [79]: M = 3
In [80]: for ii, vv in zip(np.random.rand(M)*N, np.random.randn(M)):
...: y[int(ii):] += vv
...:
In [81]: print(y)
[-0.00947902 0.64196198 1.47685437 1.55309848 0.99745469 0.35762117
-0.21028481 -0.30038846 -0.29746375 0.35652221]
In [82]:
In [101]: a = np.arange(2,14).reshape((3,4))
In [102]: print(a)
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
In [103]: print(np.argmin(a)) # 最小值的索引
0
In [104]: print(np.argmax(a)) # 最大值的索引
11
In [105]: np.cumsum(a) # 从0元素开始的累计和
Out[105]: array([ 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
In [106]: np.cumprod(a) # 从1元素开始的累计乘
Out[106]:
array([ 2, 6, 24, 120, 720,
5040, 40320, 362880, 3628800, 39916800,
479001600, 6227020800])
In [107]:
In [129]: a
Out[129]:
array([[ 2, 3, 4, 5],
[ 6, 7, 8, 9],
[10, 11, 12, 13]])
In [130]: np.cumsum(a,axis=1)
Out[130]:
array([[ 2, 5, 9, 14],
[ 6, 13, 21, 30],
[10, 21, 33, 46]])
In [131]: np.cumsum(a,axis=0)
Out[131]:
array([[ 2, 3, 4, 5],
[ 8, 10, 12, 14],
[18, 21, 24, 27]])
In [132]:
In [133]: np.cumprod(a,axis=1)
Out[133]:
array([[ 2, 6, 24, 120],
[ 6, 42, 336, 3024],
[ 10, 110, 1320, 17160]])
In [134]: np.cumprod(a,axis=0)
Out[134]:
array([[ 2, 3, 4, 5],
[ 12, 21, 32, 45],
[120, 231, 384, 585]])
In [135]:
In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
In [147]: print(a)
[[1 3 5 2]
[4 2 6 1]]
In [148]: a.shape
Out[148]: (2, 4)
In [149]: a.ndim
Out[149]: 2
In [150]: a.size
Out[150]: 8
In [151]: np.diff(a) # 累差运算
Out[151]:
array([[ 2, 2, -3],
[-2, 4, -5]])
In [152]: np.diff(a,axis=1)
Out[152]:
array([[ 2, 2, -3],
[-2, 4, -5]])
In [153]: np.diff(a,axis=0)
Out[153]: array([[ 3, -1, 1, -1]])
In [154]:
In [108]: a = np.array([10,7,11,9,8,13,12,9])
In [109]: a.ndim
Out[109]: 1
In [110]: a.shape
Out[110]: (8,)
In [111]: a.size
Out[111]: 8
In [112]: a.mean() # 均值
Out[112]: 9.875
In [113]: a.var() # 方差
Out[113]: 3.609375
In [114]: a.std() # 标准差
Out[114]: 1.899835519196333
In [115]:
In [117]: np.median(a) # 中位数
Out[117]: 9.5
In [118]:
In [138]: z = (a-a.mean())/a.std() # z-score
In [139]: print(z)
[ 0.06579517 -1.5132889 0.59215653 -0.46056619 -0.98692754 1.64487924
1.11851788 -0.46056619]
In [140]:
In [198]: a = np.arange(-3,3).reshape((2,3))
In [199]: a
Out[199]:
array([[-3, -2, -1],
[ 0, 1, 2]])
In [200]: np.nonzero(a) # 查找非0元素
Out[200]: (array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
In [201]: print(np.nonzero(a))
(array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
In [202]:
In [207]: a = np.arange(14,2,-1).reshape((3,4))
In [208]: print(a)
[[14 13 12 11]
[10 9 8 7]
[ 6 5 4 3]]
In [209]: np.sort(a) # 排序
Out[209]:
array([[11, 12, 13, 14],
[ 7, 8, 9, 10],
[ 3, 4, 5, 6]])
In [210]:
In [210]: np.sort(a,axis=1)
Out[210]:
array([[11, 12, 13, 14],
[ 7, 8, 9, 10],
[ 3, 4, 5, 6]])
In [211]: np.sort(a,axis=0)
Out[211]:
array([[ 6, 5, 4, 3],
[10, 9, 8, 7],
[14, 13, 12, 11]])
In [212]:
# 矩阵的转置
In [212]: a = np.arange(14,2,-1).reshape((3,4))
In [213]: print(a)
[[14 13 12 11]
[10 9 8 7]
[ 6 5 4 3]]
In [214]:
In [215]: print(np.transpose(a))
[[14 10 6]
[13 9 5]
[12 8 4]
[11 7 3]]
In [216]: a.T
Out[216]:
array([[14, 10, 6],
[13, 9, 5],
[12, 8, 4],
[11, 7, 3]])
In [217]:
In [220]: a.T.dot(a) # 先转置,再进行矩阵乘法
Out[220]:
array([[332, 302, 272, 242],
[302, 275, 248, 221],
[272, 248, 224, 200],
[242, 221, 200, 179]])
In [221]:
# 矩阵的clip,处理最大值和最小值
In [221]: print(a)
[[14 13 12 11]
[10 9 8 7]
[ 6 5 4 3]]
In [222]: np.clip(a,5,11)
Out[222]:
array([[11, 11, 11, 11],
[10, 9, 8, 7],
[ 6, 5, 5, 5]])
In [223]:
卷积运算
numpy.convolve(weights,array)
weight = [a,b,c]
array = [i,j,k,m,n]
Result:[ai, bi+aj, ci+bj+ak, cj+bk+am, ck+bm+an, cm+bn, cn][N-1:-N+1] (其中 N 为 weight 的长度,此处 N=3,切片后只保留完全重叠的部分)
针对移动平均算法来预测下一个数据,越接近待预测点的数据权重越大,
那么就需要让 i, j, k, m, n 的系数逐渐增大即可;即让 a > b > c ,并且 a+b+c=1 。
示例:
In [223]: weight = np.ones(3)/3
In [224]: print(weight)
[0.33333333 0.33333333 0.33333333]
In [225]: arr = np.array([8,11,9,7,10])
In [226]: np.convolve(weight,arr)
Out[226]:
array([2.66666667, 6.33333333, 9.33333333, 9. , 8.66666667,
5.66666667, 3.33333333])
In [227]:
In [227]: weight = np.array([0.8,0.1,0.1])
In [228]: np.convolve(weight,arr)
Out[228]: array([6.4, 9.6, 9.1, 7.6, 9.6, 1.7, 1. ])
In [229]:
random常用操作
# 生成随机浮点数,范围是在0.0~1.0之间
In [19]: a = np.random.random((2,3))
In [20]: print(a)
[[0.02185901 0.69585563 0.04555439]
[0.37331857 0.32903986 0.62448246]]
In [21]:
# 生成随机整数,可指定起止范围
In [48]: np.random.randint(3)
Out[48]: 2
In [49]: np.random.randint(low=3,high=9)
Out[49]: 6
In [50]: np.random.randint(low=3,high=9,size=(3,4))
Out[50]:
array([[5, 6, 7, 8],
[8, 7, 3, 8],
[5, 4, 5, 5]])
In [51]:
In [68]: np.random.randint(low=-5,high=2,size=(3,4))
Out[68]:
array([[-4, -4, -2, 1],
[ 1, 0, 0, 1],
[-4, -3, 1, -5]])
In [69]:
# 生成正态分布,又名高斯分布(Gaussian distribution)随机数
In [64]: np.random.normal()
Out[64]: -0.5399414561419419
In [65]: np.random.normal(loc=0,scale=1,size=(2,3))
Out[65]:
array([[-0.50318082, -0.38614219, 0.30450427],
[ 0.41711087, 0.29990928, -0.7843322 ]])
In [66]:
In [66]: np.random.normal(loc=2,scale=3,size=(2,3))
Out[66]:
array([[ 3.37067379, 6.23517315, 2.3267659 ],
[ 6.46832646, -2.76363304, 5.77883853]])
In [67]:
# 生成标准正态分布("standard normal" distribution)随机数,标准正态分布的平均值为0,方差为1,即服从N(0,1)分布。
In [83]: np.random.randn()
Out[83]: 0.502482341264108
In [84]: np.random.randn(3,4)
Out[84]:
array([[ 0.34507555, -0.26868132, -0.56103417, 0.86176617],
[-0.16535555, -0.38045904, 0.48176385, -1.09005206],
[-0.60780266, 1.74113117, -0.72427329, -0.51232408]])
In [85]:
# 生成[0, 1)间随机数
In [99]: np.random.rand()
Out[99]: 0.607701127768974
In [100]: np.random.rand(3,4)
Out[100]:
array([[0.73020695, 0.53993878, 0.46693879, 0.82611629],
[0.76117076, 0.16522599, 0.85129611, 0.74448772],
[0.6450236 , 0.49994053, 0.04115063, 0.30081311]])
In [101]:
array索引
# 一维数组的索引和list类似
略
# 二维数组的索引
In [13]: import numpy as np
In [14]: a = np.arange(3,15).reshape((3,4))
In [15]: print(a)
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
In [16]: a[1]
Out[16]: array([ 7, 8, 9, 10])
In [17]: a[1,2]
Out[17]: 9
In [18]: a[1][2] # 等价于 a[1,2]
Out[18]: 9
In [19]:
In [19]: a[1,1:-1] # 获取第二行,除去首尾元素
Out[19]: array([8, 9])
In [20]: a[1,1:2] # 获取第二行第二个元素
Out[20]: array([8])
In [21]:
In [24]: a[1:-1,2] # 获取第三列,除去首尾元素
Out[24]: array([9])
In [26]: a[:,2] # 获取第三列元素
Out[26]: array([ 5, 9, 13])
In [27]:
迭代array
# 迭代行
In [27]: print(a)
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
In [28]: for row in a:
...: print(row)
...:
[3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]
In [29]:
# 迭代列
In [29]: print(a.T)
[[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
[ 6 10 14]]
In [30]: for column in a.T:
...: print(column)
...:
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
[ 6 10 14]
In [31]:
# 二维矩阵,多行转换成一行,迭代每一个item
In [31]: print(a)
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
In [32]: print(a.flat)
<numpy.flatiter object at 0x7f392e3545c0>
In [33]: print(a.flatten())
[ 3 4 5 6 7 8 9 10 11 12 13 14]
In [34]: for item in a.flat:
...: print(item)
...:
3
4
5
6
7
8
9
10
11
12
13
14
In [35]:
合并array
In [39]: a = np.array([1,2,3])
In [40]: b = np.array([2,2,2])
In [41]: c = np.vstack((a,b)) # vertical stack,上下合并
In [42]: print(c)
[[1 2 3]
[2 2 2]]
In [43]: c.shape
Out[43]: (2, 3)
In [44]: c.ndim
Out[44]: 2
In [45]: c.size
Out[45]: 6
In [46]:
In [47]: d = np.hstack((a,b)) # horizontal stack,左右合并
In [48]: print(d)
[1 2 3 2 2 2]
In [49]: d.shape
Out[49]: (6,)
In [50]: d.ndim
Out[50]: 1
In [51]: d.size
Out[51]: 6
In [52]:
# newaxis改变数组维度
In [54]: print(a)
[1 2 3]
In [55]: e = a[np.newaxis,:]
In [56]: print(e)
[[1 2 3]]
In [57]: f = a[:,np.newaxis]
In [58]: print(f)
[[1]
[2]
[3]]
In [59]:
In [59]: a = np.array([1,2,3])[:,np.newaxis]
In [60]: b = np.array([2,2,2])[:,np.newaxis]
In [61]: print(a)
[[1]
[2]
[3]]
In [62]: print(b)
[[2]
[2]
[2]]
In [63]: c = np.vstack((a,b))
In [64]: print(c)
[[1]
[2]
[3]
[2]
[2]
[2]]
In [65]: d = np.hstack((a,b)) # 合并两个array
In [66]: print(d)
[[1 2]
[2 2]
[3 2]]
In [67]:
In [74]: d = np.hstack((a,b,b,a)) # 合并多个array
In [75]: print(d)
[[1 2 2 1]
[2 2 2 2]
[3 2 2 3]]
In [76]:
# concatenate 常用来合并多个矩阵或序列,axis可以方便的指定维度
In [76]: a = np.array([1,2,3])
In [77]: b = np.array([2,2,2])
In [78]: a = a[:,np.newaxis]
In [79]: b = b[:,np.newaxis]
In [80]: c = np.concatenate((a,b,b,a),axis=0)
In [81]: print(c)
[[1]
[2]
[3]
[2]
[2]
[2]
[2]
[2]
[2]
[1]
[2]
[3]]
In [82]: c = np.concatenate((a,b,b,a),axis=1)
In [83]: print(c)
[[1 2 2 1]
[2 2 2 2]
[3 2 2 3]]
In [84]:
分割array
In [92]: a = np.arange(12).reshape((3,4))
In [93]: print(a)
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
In [94]: c = np.split(a,2,axis=1) # 等项分割
In [95]: len(c)
Out[95]: 2
In [96]: c[0]
Out[96]:
array([[0, 1],
[4, 5],
[8, 9]])
In [97]: c[1]
Out[97]:
array([[ 2, 3],
[ 6, 7],
[10, 11]])
In [98]:
In [98]: print(c)
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
In [99]:
In [99]: d = np.array_split(a,3,axis=1) # 不等项分割
In [100]: len(d)
Out[100]: 3
In [101]: print(d)
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
In [102]: d[0]
Out[102]:
array([[0, 1],
[4, 5],
[8, 9]])
In [103]: d[1]
Out[103]:
array([[ 2],
[ 6],
[10]])
In [104]: d[2]
Out[104]:
array([[ 3],
[ 7],
[11]])
In [105]:
In [111]: print(a)
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
In [112]: b = np.hsplit(a,2) # horizontal split,水平分割
In [113]: print(b)
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
In [114]: b[0]
Out[114]:
array([[0, 1],
[4, 5],
[8, 9]])
In [115]: b[1]
Out[115]:
array([[ 2, 3],
[ 6, 7],
[10, 11]])
In [116]:
In [116]: c = np.vsplit(a,3) # vertical split,垂直分割
In [117]: len(c)
Out[117]: 3
In [118]: print(c)
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
In [119]: c[0]
Out[119]: array([[0, 1, 2, 3]])
In [120]: c[1]
Out[120]: array([[4, 5, 6, 7]])
In [121]: c[2]
Out[121]: array([[ 8, 9, 10, 11]])
In [122]:
Numpy.copy()
In [150]: a = np.arange(4)
In [151]: print(a)
[0 1 2 3]
In [152]: b = a
In [153]: b is a
Out[153]: True
In [154]: a[0] = 99
In [155]: print(b)
[99 1 2 3]
In [156]:
In [156]: c = a.copy() # deep copy
In [157]: c is a
Out[157]: False
In [159]: print(a)
[99 1 2 3]
In [160]: a[1:3] = [7,8]
In [161]: print(a)
[99 7 8 3]
In [163]: print(b)
[99 7 8 3]
In [164]: print(c)
[99 1 2 3]
In [165]:
Numpy其他
In [169]: a = np.array([-9,7,12,-4,-3,6,2])
In [170]: print(a)
[-9 7 12 -4 -3 6 2]
In [171]: np.abs(a)
Out[171]: array([ 9, 7, 12, 4, 3, 6, 2])
In [172]: np.where(np.abs(a)>6)
Out[172]: (array([0, 1, 2]),)
In [173]:
numpy参考:http://pda.readthedocs.io/en/latest/chp4.html
Pandas基础
import pandas as pd
Series
In [173]: import pandas as pd
In [174]: import numpy as np
In [175]: s = pd.Series([1,3,6,np.nan,44,1]) # 定义pandas.Series
In [176]: print(s)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
In [177]:
Base Time Series Frequencies
Aggregate for duplicate Indices
In [157]: dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000','1/3/2000','1/3/2000'])
In [158]: dates
Out[158]:
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-02',
'2000-01-03', '2000-01-03'],
dtype='datetime64[ns]', freq=None)
In [159]: dup_ts = pd.Series(np.arange(6), index=dates)
In [160]: dup_ts
Out[160]:
2000-01-01 0
2000-01-02 1
2000-01-02 2
2000-01-02 3
2000-01-03 4
2000-01-03 5
dtype: int64
In [161]: dup_ts.index.is_unique
Out[161]: False
In [162]: dup_ts['2000-01-01']
Out[162]: 0
In [163]: dup_ts['2000-01-02']
Out[163]:
2000-01-02 1
2000-01-02 2
2000-01-02 3
dtype: int64
In [164]: dup_ts['2000-01-03']
Out[164]:
2000-01-03 4
2000-01-03 5
dtype: int64
In [165]:
In [165]: grouped = dup_ts.groupby(level=0)
In [166]: grouped.mean()
Out[166]:
2000-01-01 0.0
2000-01-02 2.0
2000-01-03 4.5
dtype: float64
In [167]: grouped.count()
Out[167]:
2000-01-01 1
2000-01-02 3
2000-01-03 2
dtype: int64
In [168]: grouped.sum()
Out[168]:
2000-01-01 0
2000-01-02 6
2000-01-03 9
dtype: int64
In [169]:
Group by month or weekday by passing a function that accesses those fields on the time series’s index.
In [90]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
In [91]: ts = pd.Series(np.arange(100), index=rng)
In [92]: ts.groupby(lambda x: x.month).mean()
Out[92]:
1 15
2 45
3 75
4 95
dtype: int64
In [93]: ts.groupby(lambda x: x.month).sum()
Out[93]:
1 465
2 1305
3 2325
4 855
dtype: int64
In [94]: ts.groupby(lambda x: x.month).max()
Out[94]:
1 30
2 59
3 90
4 99
dtype: int64
In [95]: ts.groupby(lambda x: x.weekday).mean()
Out[95]:
0 47.5
1 48.5
2 49.5
3 50.5
4 51.5
5 49.0
6 50.0
dtype: float64
In [96]: ts.groupby(lambda x: x.weekday).sum()
Out[96]:
0 665
1 679
2 693
3 707
4 721
5 735
6 750
dtype: int64
In [97]:
Resample method arguments
Resampling and Frequency Conversion
In [50]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
In [51]: ts = pd.Series(np.random.randn(len(rng)), index=rng)
In [52]: ts
Out[52]:
2000-01-01 0.030631
2000-01-02 -2.087034
2000-01-03 1.238687
2000-01-04 -1.297059
2000-01-05 -1.341296
2000-01-06 -0.353311
2000-01-07 -0.854693
2000-01-08 0.426789
...
2000-03-27 1.262705
2000-03-28 -0.646236
2000-03-29 -0.349658
2000-03-30 -1.093438
2000-03-31 -0.254758
2000-04-01 0.146417
2000-04-02 1.774502
2000-04-03 -0.712635
2000-04-04 -1.552352
2000-04-05 0.303172
2000-04-06 -0.023492
2000-04-07 -1.418930
2000-04-08 0.789877
2000-04-09 1.767594
Freq: D, Length: 100, dtype: float64
In [53]:
In [53]: ts.resample('M').mean()
Out[53]:
2000-01-31 0.003531
2000-02-29 0.030067
2000-03-31 -0.106783
2000-04-30 0.119350
Freq: M, dtype: float64
In [54]: ts.resample('M',kind='period').mean()
Out[54]:
2000-01 0.003531
2000-02 0.030067
2000-03 -0.106783
2000-04 0.119350
Freq: M, dtype: float64
In [55]:
Aggregate this data into five-minute chunks or bars by taking the sum of each group.
In [71]: rng = pd.date_range('1/1/2000', periods=24, freq='T')
In [72]: rng
Out[72]:
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
'2000-01-01 00:02:00', '2000-01-01 00:03:00',
'2000-01-01 00:04:00', '2000-01-01 00:05:00',
'2000-01-01 00:06:00', '2000-01-01 00:07:00',
'2000-01-01 00:08:00', '2000-01-01 00:09:00',
'2000-01-01 00:10:00', '2000-01-01 00:11:00',
'2000-01-01 00:12:00', '2000-01-01 00:13:00',
'2000-01-01 00:14:00', '2000-01-01 00:15:00',
'2000-01-01 00:16:00', '2000-01-01 00:17:00',
'2000-01-01 00:18:00', '2000-01-01 00:19:00',
'2000-01-01 00:20:00', '2000-01-01 00:21:00',
'2000-01-01 00:22:00', '2000-01-01 00:23:00'],
dtype='datetime64[ns]', freq='T')
In [73]: ts = pd.Series(np.arange(24), index=rng)
In [74]: ts
Out[74]:
2000-01-01 00:00:00 0
2000-01-01 00:01:00 1
2000-01-01 00:02:00 2
2000-01-01 00:03:00 3
2000-01-01 00:04:00 4
2000-01-01 00:05:00 5
2000-01-01 00:06:00 6
2000-01-01 00:07:00 7
2000-01-01 00:08:00 8
2000-01-01 00:09:00 9
2000-01-01 00:10:00 10
2000-01-01 00:11:00 11
2000-01-01 00:12:00 12
2000-01-01 00:13:00 13
2000-01-01 00:14:00 14
2000-01-01 00:15:00 15
2000-01-01 00:16:00 16
2000-01-01 00:17:00 17
2000-01-01 00:18:00 18
2000-01-01 00:19:00 19
2000-01-01 00:20:00 20
2000-01-01 00:21:00 21
2000-01-01 00:22:00 22
2000-01-01 00:23:00 23
Freq: T, dtype: int64
In [75]: ts.resample('5min').sum()
Out[75]:
2000-01-01 00:00:00 10
2000-01-01 00:05:00 35
2000-01-01 00:10:00 60
2000-01-01 00:15:00 85
2000-01-01 00:20:00 86
Freq: 5T, dtype: int64
In [76]: ts.resample('5min',closed='left').sum()
Out[76]:
2000-01-01 00:00:00 10
2000-01-01 00:05:00 35
2000-01-01 00:10:00 60
2000-01-01 00:15:00 85
2000-01-01 00:20:00 86
Freq: 5T, dtype: int64
In [77]:
In [77]: ts.resample('5min').max()
Out[77]:
2000-01-01 00:00:00 4
2000-01-01 00:05:00 9
2000-01-01 00:10:00 14
2000-01-01 00:15:00 19
2000-01-01 00:20:00 23
Freq: 5T, dtype: int64
In [78]:
In [78]: ts.resample('5min',closed='right').sum()
Out[78]:
1999-12-31 23:55:00 0
2000-01-01 00:00:00 15
2000-01-01 00:05:00 40
2000-01-01 00:10:00 65
2000-01-01 00:15:00 90
2000-01-01 00:20:00 66
Freq: 5T, dtype: int64
In [79]:
In [79]: ts.resample('5min',loffset='-1s').sum()
Out[79]:
1999-12-31 23:59:59 10
2000-01-01 00:04:59 35
2000-01-01 00:09:59 60
2000-01-01 00:14:59 85
2000-01-01 00:19:59 86
Freq: 5T, dtype: int64
In [80]:
# Open-High-Low-Close (OHLC) resampling
In [81]: ts.resample('5min').ohlc()
Out[81]:
open high low close
2000-01-01 00:00:00 0 4 0 4
2000-01-01 00:05:00 5 9 5 9
2000-01-01 00:10:00 10 14 10 14
2000-01-01 00:15:00 15 19 15 19
2000-01-01 00:20:00 20 23 20 23
In [82]:
Resampling with Periods
In [118]: frame = pd.DataFrame(np.random.randn(24, 4),
...: index=pd.period_range('1-2000', '12-2001', freq='M'),
...: columns=['Beijing', 'Luoyang', 'New York', 'Tokyo'])
In [119]: frame
Out[119]:
Beijing Luoyang New York Tokyo
2000-01 1.120268 -1.120345 -1.154800 0.443861
2000-02 0.611443 0.200576 -1.163600 -1.137567
2000-03 0.658112 2.332235 -1.718285 1.589246
2000-04 -0.863050 1.890877 2.046202 0.410414
2000-05 0.710052 -0.041623 0.122719 -1.141112
2000-06 0.299393 1.227689 0.718627 1.004851
2000-07 1.287335 -0.179045 -0.476422 0.949235
2000-08 -2.140590 0.433699 -0.783202 1.073706
2000-09 -0.149710 -0.580780 0.755274 0.514259
2000-10 0.190940 -0.187451 1.710803 -1.631272
2000-11 0.419288 0.565235 0.470381 0.599020
2000-12 0.951111 0.464671 -0.854858 -0.009189
2001-01 -1.383493 -0.147035 -0.379006 0.472686
2001-02 1.803475 -1.628368 -0.896757 -0.508827
2001-03 0.575910 -0.528299 1.182473 0.159452
2001-04 -1.056161 -0.475357 0.861852 1.168667
2001-05 -1.316565 0.354719 1.354205 -0.369083
2001-06 0.497406 -1.799904 -0.512882 -0.092718
2001-07 0.896944 -1.276022 0.137365 0.087199
2001-08 -0.046908 -0.650024 0.958182 -0.048369
2001-09 0.085401 1.067235 0.541318 0.853376
2001-10 1.165047 -0.794425 1.137002 0.064595
2001-11 -0.438006 0.706564 1.464403 0.278069
2001-12 -0.094644 0.666789 0.220349 -0.386617
In [120]: frame[:5]
Out[120]:
Beijing Luoyang New York Tokyo
2000-01 1.120268 -1.120345 -1.154800 0.443861
2000-02 0.611443 0.200576 -1.163600 -1.137567
2000-03 0.658112 2.332235 -1.718285 1.589246
2000-04 -0.863050 1.890877 2.046202 0.410414
2000-05 0.710052 -0.041623 0.122719 -1.141112
In [121]: annual_frame = frame.resample('A-DEC').mean()
In [122]: annual_frame
Out[122]:
Beijing Luoyang New York Tokyo
2000 0.257883 0.417145 -0.027263 0.222121
2001 0.057367 -0.375344 0.505709 0.139869
In [123]:
In [123]: annual_frame_max = frame.resample('A-DEC').max()
In [124]: annual_frame_max
Out[124]:
Beijing Luoyang New York Tokyo
2000 1.287335 2.332235 2.046202 1.589246
2001 1.803475 1.067235 1.464403 1.168667
In [125]:
DataFrame
# 第一种定义pandas.DataFrame方式:直接导入numpy的数据
In [186]: df1 = pd.DataFrame(np.arange(12).reshape((3,4))) # 定义pandas.DataFrame
In [187]: print(df1)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
In [188]:
In [178]: dates = pd.date_range('20160101',periods=6)
In [179]: print(dates)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
In [180]:
# 定义pandas.DataFrame,并指定列名和行名
In [184]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
In [185]: print(df)
a b c d
2016-01-01 1.193589 0.165348 1.598806 -0.478980
2016-01-02 1.188886 -1.232185 -0.633066 0.594805
2016-01-03 2.707996 -0.116420 1.622761 0.399708
2016-01-04 0.416469 1.593061 -0.044390 -0.031153
2016-01-05 -0.637080 1.680110 1.371026 0.821549
2016-01-06 -0.079359 1.421577 0.042537 1.058749
In [186]:
# 第二种定义pandas.DataFrame方式:把参数当做字典传入DataFrame
In [188]: df2 = pd.DataFrame({'A' : 1.,
...: 'B' : pd.Timestamp('20130102'),
...: 'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
...: 'D' : np.array([3] * 4,dtype='int32'),
...: 'E' : pd.Categorical(["test","train","test","train"]),
...: 'F' : 'foo'})
In [189]: print(df2)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [190]:
In [190]: print(df2.dtypes) # 查看DataFrame内容的类型
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
In [191]:
In [191]: print(df2.index) # 打印DataFrame行的名字(行索引)
Int64Index([0, 1, 2, 3], dtype='int64')
In [192]:
In [192]: print(df2.columns) # 打印DataFrame列的名字
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
In [193]:
In [194]: print(df2.values) # 打印DataFrame的内容
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
In [195]:
In [196]: print(df2)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [197]:
In [197]: print(df2.describe()) # 打印出DataFrame的数学运算的相关数据
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
In [198]:
In [200]: print(df2.T) # 把DataFrame进行transpose,即转置
0 1 2 3
A 1 1 1 1
B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
C 1 1 1 1
D 3 3 3 3
E test train test train
F foo foo foo foo
In [201]:
# 对DataFrame排序
In [203]: print(df2)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [204]: df2.sort_index(axis=1, ascending=False) # 按照index(列名)排序
Out[204]:
F E D C B A
0 foo test 3 1.0 2013-01-02 1.0
1 foo train 3 1.0 2013-01-02 1.0
2 foo test 3 1.0 2013-01-02 1.0
3 foo train 3 1.0 2013-01-02 1.0
In [205]:
In [205]: df2.sort_index(axis=0, ascending=False) # 按照行名排序
Out[205]:
A B C D E F
3 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
0 1.0 2013-01-02 1.0 3 test foo
In [206]:
In [207]: df2.sort_values(by='E') # 指定value进行排序
Out[207]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
3 1.0 2013-01-02 1.0 3 train foo
In [208]:
Pandas筛选数据
In [212]: dates = pd.date_range('20160101',periods=6)
In [213]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
In [214]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [215]:
In [215]: print(df['A']) # 选取指定列
2016-01-01 0
2016-01-02 4
2016-01-03 8
2016-01-04 12
2016-01-05 16
2016-01-06 20
Freq: D, Name: A, dtype: int64
In [216]: print(df.A) # 等价于 df['A']
2016-01-01 0
2016-01-02 4
2016-01-03 8
2016-01-04 12
2016-01-05 16
2016-01-06 20
Freq: D, Name: A, dtype: int64
In [217]:
In [217]: print(df[0:3]) # 切片方式选取某些行
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
In [218]: print(df['2016-01-01':'2016-01-03']) # 等价于 df[0:3]
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
In [219]:
# select by label : loc
In [220]: print(df.loc['2016-01-02'])
A 4
B 5
C 6
D 7
Name: 2016-01-02 00:00:00, dtype: int64
In [221]:
In [221]: print(df.loc['2016-01-02']['B'])
5
In [222]:
In [227]: print(df.loc[:,['A','B']])
A B
2016-01-01 0 1
2016-01-02 4 5
2016-01-03 8 9
2016-01-04 12 13
2016-01-05 16 17
2016-01-06 20 21
In [228]:
In [228]: print(df.loc['2016-01-03',['A','B']])
A 8
B 9
Name: 2016-01-03 00:00:00, dtype: int64
In [229]:
In [232]: print(df.loc['2016-01-03':'2016-01-05',['A','B']])
A B
2016-01-03 8 9
2016-01-04 12 13
2016-01-05 16 17
In [233]:
# select by position : iloc
In [235]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [236]: print(df.iloc[3])
A 12
B 13
C 14
D 15
Name: 2016-01-04 00:00:00, dtype: int64
In [237]: print(df.iloc[3,1])
13
In [238]:
In [238]: print(df.iloc[3:5,1:3])
B C
2016-01-04 13 14
2016-01-05 17 18
In [239]:
In [240]: print(df.iloc[[1,3,5],1:3])
B C
2016-01-02 5 6
2016-01-04 13 14
2016-01-06 21 22
In [241]:
# mixed selection : ix
In [243]: print(df.ix[:3,['A','C']])
/usr/local/anaconda2/bin/ipython2:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
#!/usr/local/anaconda2/bin/python
A C
2016-01-01 0 2
2016-01-02 4 6
2016-01-03 8 10
In [244]:
# Boolean indexing
In [9]: print(df[df.A>8])
A B C D
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [10]:
df.head(n) # 返回DataFrame前n行
df.tail(n) # 返回DataFrame后n行
Pandas设置值
# 给DataFrame设置值
In [1]: import numpy as np
In [2]: import pandas as pd
In [3]: dates = pd.date_range('20160101',periods=6)
In [4]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
In [5]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [6]:
In [7]: df.iloc[2,2] = 99
In [10]: df.loc['2016-01-02','B'] = 100
In [11]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 100 6 7
2016-01-03 8 9 99 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [12]:
In [17]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [18]: df.A[df.A>4] = 0
In [19]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 0 9 10 11
2016-01-04 0 13 14 15
2016-01-05 0 17 18 19
2016-01-06 0 21 22 23
In [20]:
In [21]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [22]: df[df.A>4] = 0
In [23]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 0 0 0 0
2016-01-04 0 0 0 0
2016-01-05 0 0 0 0
2016-01-06 0 0 0 0
In [24]:
In [30]: df['E'] = np.nan # 增加一列,赋值为NaN
In [31]: print(df)
A B C D E
2016-01-01 0 1 2 3 NaN
2016-01-02 4 5 6 7 NaN
2016-01-03 8 9 10 11 NaN
2016-01-04 12 13 14 15 NaN
2016-01-05 16 17 18 19 NaN
2016-01-06 20 21 22 23 NaN
In [32]:
# 增加一列,需要指定行名
In [46]: df['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20160101',periods=6))
In [47]: print(df)
A B C D E F
2016-01-01 0 1 2 3 NaN 1
2016-01-02 4 5 6 7 NaN 2
2016-01-03 8 9 10 11 NaN 3
2016-01-04 12 13 14 15 NaN 4
2016-01-05 16 17 18 19 NaN 5
2016-01-06 20 21 22 23 NaN 6
In [48]:
Pandas删除DataFrame数据
In [1]: import numpy as np
In [2]: import pandas as pd
In [3]: values = np.arange(12).reshape((3,4))
In [4]: print(values)
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
In [5]:
In [8]: df = pd.DataFrame(values,index=['row1','row2','row3'],columns=['A','B','C','D'])
In [9]: print(df)
A B C D
row1 0 1 2 3
row2 4 5 6 7
row3 8 9 10 11
In [10]:
In [10]: print(df.shape)
(3, 4)
In [11]:
In [11]: df.drop(columns='A',axis=1)
Out[11]:
B C D
row1 1 2 3
row2 5 6 7
row3 9 10 11
In [12]: df.drop(columns=['A','C'],axis=1)
Out[12]:
B D
row1 1 3
row2 5 7
row3 9 11
In [13]:
In [13]: df.drop(index='row2',axis=0)
Out[13]:
A B C D
row1 0 1 2 3
row3 8 9 10 11
In [14]: df.drop(index=['row2','row3'],axis=0)
Out[14]:
A B C D
row1 0 1 2 3
In [15]:
如果index用的是 “pd.date_range('20160101',periods=6)”
In [43]: print(df)
a b c d
2016-01-01 1.273748 0.949407 -0.446053 -0.126789
2016-01-02 -0.770801 1.641150 0.840216 -0.991219
2016-01-03 -0.164625 -1.459954 1.214388 0.281621
2016-01-04 1.863281 1.163653 0.319549 -1.545655
2016-01-05 0.452804 0.203472 -1.232536 0.681963
2016-01-06 0.171324 0.353359 1.674004 -2.026071
In [44]: print(df.index)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
In [45]:
In [45]: df.drop(index=pd.datetime(2016,1,4),axis=0)
Out[45]:
a b c d
2016-01-01 1.273748 0.949407 -0.446053 -0.126789
2016-01-02 -0.770801 1.641150 0.840216 -0.991219
2016-01-03 -0.164625 -1.459954 1.214388 0.281621
2016-01-05 0.452804 0.203472 -1.232536 0.681963
2016-01-06 0.171324 0.353359 1.674004 -2.026071
In [46]: df.drop(index=[pd.datetime(2016,1,2),pd.datetime(2016,1,5)],axis=0)
Out[46]:
a b c d
2016-01-01 1.273748 0.949407 -0.446053 -0.126789
2016-01-03 -0.164625 -1.459954 1.214388 0.281621
2016-01-04 1.863281 1.163653 0.319549 -1.545655
2016-01-06 0.171324 0.353359 1.674004 -2.026071
In [47]:
Pandas处理丢失的数据
# 处理丢失数据
In [7]: print(df)
A B C D
2016-01-01 0 1 2 3
2016-01-02 4 5 6 7
2016-01-03 8 9 10 11
2016-01-04 12 13 14 15
2016-01-05 16 17 18 19
2016-01-06 20 21 22 23
In [8]: df.iloc[0,1] = np.nan
In [9]: df.iloc[1,2] = np.nan
In [10]: print(df)
A B C D
2016-01-01 0 NaN 2.0 3
2016-01-02 4 5.0 NaN 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
In [11]: print(df.dropna(axis=1,how='any')) # 删除NaN数据所在列,how = {'any','all'}
A D
2016-01-01 0 3
2016-01-02 4 7
2016-01-03 8 11
2016-01-04 12 15
2016-01-05 16 19
2016-01-06 20 23
In [12]: print(df.dropna(axis=0,how='any')) # 删除NaN数据所在行,how = {'any','all'}
A B C D
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
In [13]:
In [13]: print(df.dropna(axis=0,how='all'))
A B C D
2016-01-01 0 NaN 2.0 3
2016-01-02 4 5.0 NaN 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
In [14]:
In [14]: print(df.dropna(axis=1,how='all'))
A B C D
2016-01-01 0 NaN 2.0 3
2016-01-02 4 5.0 NaN 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
In [15]:
In [15]: df.fillna(value=0) # 把NaN填充为指定数值
Out[15]:
A B C D
2016-01-01 0 0.0 2.0 3
2016-01-02 4 5.0 0.0 7
2016-01-03 8 9.0 10.0 11
2016-01-04 12 13.0 14.0 15
2016-01-05 16 17.0 18.0 19
2016-01-06 20 21.0 22.0 23
In [16]:
In [19]: print(df.isnull()) # 把数值为NaN的位置标识出来
A B C D
2016-01-01 False True False False
2016-01-02 False False True False
2016-01-03 False False False False
2016-01-04 False False False False
2016-01-05 False False False False
2016-01-06 False False False False
In [20]:
In [22]: print(np.any(df.isnull()) == True) # 检查DataFrame是否含有NaN值
True
In [23]:
Pandas导入导出示例
In [33]: import pandas as pd
In [34]: data = pd.read_csv('student.csv')
In [35]: print(data)
Student ID name age gender
0 1100 Kelly 22 Female
1 1101 Clo 21 Female
2 1102 Tilly 22 Female
3 1103 Tony 24 Male
4 1104 David 20 Male
5 1105 Catty 22 Female
6 1106 M 3 Female
7 1107 N 43 Male
8 1108 A 13 Male
9 1109 S 12 Male
10 1110 David 33 Male
11 1111 Dw 3 Female
12 1112 Q 23 Male
13 1113 W 21 Female
In [36]: print(type(data))
<class 'pandas.core.frame.DataFrame'>
In [37]: data.to_pickle('student.pickle')
In [38]: data.to_json('student.json')
In [39]:
更多IO Tools参考:官方介绍
Pandas concat合并
# pandas 合并
# concatenating
In [40]: import numpy as np
In [41]: import pandas as pd
In [42]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
In [43]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
In [44]: df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
In [45]: print(df1)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
In [46]: print(df2)
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
In [47]: print(df3)
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
In [48]: result = pd.concat([df1,df2,df3],axis=0) # vertical 垂直合并
In [49]: print(result)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
In [50]:
In [50]: result = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 序号重新排列
In [51]: print(result)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
In [52]:
# join合并 ['inner','outer']
In [63]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
In [64]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
In [65]: print(df1)
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
In [66]: print(df2)
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
In [67]:
In [67]: result = pd.concat([df1,df2]) # 即 pd.concat([df1,df2],join='outer') , 默认就是outer模式
/usr/local/anaconda2/bin/ipython2:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
#!/usr/local/anaconda2/bin/python
In [68]:
In [68]: print(result)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
In [69]:
In [70]: result = pd.concat([df1,df2],join='inner') # inner模式
In [71]: print(result)
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
In [72]:
In [72]: result = pd.concat([df1,df2],join='inner',ignore_index=True)
In [73]: print(result)
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
In [74]:
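# join_axes合并(注意:join_axes 参数自 pandas 0.25 起弃用,并已在 pandas 1.0 中移除;新版本请改用 pd.concat([df1, df2], axis=1).reindex(df1.index))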
In [78]: res = pd.concat([df1, df2], axis=1)
In [79]: print(res)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
In [80]:
In [74]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
In [75]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
In [76]: res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
In [77]: print(res)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
In [78]:
In [80]: res = pd.concat([df1, df2], axis=1, join_axes=[df2.index])
In [81]: print(res)
a b c d b c d e
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
In [82]:
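# append合并(注意:DataFrame.append 自 pandas 1.4 起弃用,并已在 pandas 2.0 中移除;新版本请改用 pd.concat([df1, df2], ignore_index=True))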
In [87]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
In [88]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
In [89]: df1.append(df2,ignore_index=True)
Out[89]:
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
In [90]: df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
In [91]: df1.append([df2,df3],ignore_index=True)
Out[91]:
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 1.0 1.0 1.0 1.0
7 1.0 1.0 1.0 1.0
8 1.0 1.0 1.0 1.0
In [92]:
# 添加一行数据到DataFrame
In [92]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
In [93]: s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
In [94]: res = df1.append(s1,ignore_index=True)
In [95]: print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
In [96]:
Pandas merge合并
# merge合并
In [99]: import pandas as pd
In [100]: left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
...: 'A': ['A0', 'A1', 'A2', 'A3'],
...: 'B': ['B0', 'B1', 'B2', 'B3']})
In [101]: right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
...: 'C': ['C0', 'C1', 'C2', 'C3'],
...: 'D': ['D0', 'D1', 'D2', 'D3']})
In [102]:
In [102]: print(left)
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
In [103]: print(right)
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
In [104]:
In [104]: res = pd.merge(left,right,on='key')
In [105]: print(res)
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
In [106]:
# consider two keys
In [106]: left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
...: 'key2': ['K0', 'K1', 'K0', 'K1'],
...: 'A': ['A0', 'A1', 'A2', 'A3'],
...: 'B': ['B0', 'B1', 'B2', 'B3']})
In [107]: right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
...: 'key2': ['K0', 'K0', 'K0', 'K0'],
...: 'C': ['C0', 'C1', 'C2', 'C3'],
...: 'D': ['D0', 'D1', 'D2', 'D3']})
In [108]: print(left)
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
In [109]: print(right)
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
In [110]: res = pd.merge(left,right,on=['key1','key2'])
In [111]: print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
# how={'left','right','inner','outer'}
In [112]: res = pd.merge(left,right,on=['key1','key2'],how='inner') # 默认就是inner模式
In [113]: print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
In [114]: res = pd.merge(left,right,on=['key1','key2'],how='outer')
In [115]: print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
In [116]:
In [116]: res = pd.merge(left,right,on=['key1','key2'],how='left')
In [117]: print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
In [118]: res = pd.merge(left,right,on=['key1','key2'],how='right')
In [119]: print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
In [120]:
# indicator
In [121]: df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
In [122]: df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
In [123]: print(df1)
col1 col_left
0 0 a
1 1 b
In [124]: print(df2)
col1 col_right
0 1 2
1 2 2
2 2 2
In [125]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=True) # 给一个提示
In [126]: print(res)
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
In [127]:
In [129]: res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') # 指定提示的列名
In [130]: print(res)
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
In [131]:
In [127]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=False)
In [128]: print(res)
col1 col_left col_right
0 0 a NaN
1 1 b 2.0
2 2 NaN 2.0
3 2 NaN 2.0
In [129]:
In [131]: left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
...: 'B': ['B0', 'B1', 'B2']},
...: index=['K0', 'K1', 'K2'])
In [132]: right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
...: 'D': ['D0', 'D2', 'D3']},
...: index=['K0', 'K2', 'K3'])
In [133]: print(left)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
In [134]: print(right)
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
In [135]: res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
In [136]: print(res)
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
In [137]: res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
In [138]: print(res)
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
In [139]:
# handle overlapping
In [139]: boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
In [140]: girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
In [141]: print(boys)
age k
0 1 K0
1 2 K1
2 3 K2
In [142]: print(girls)
age k
0 4 K0
1 5 K0
2 6 K3
In [143]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
In [144]: print(res)
age_boy k age_girl
0 1 K0 4
1 1 K0 5
In [145]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
In [146]: print(res)
age_boy k age_girl
0 1.0 K0 4.0
1 1.0 K0 5.0
2 2.0 K1 NaN
3 3.0 K2 NaN
4 NaN K3 6.0
In [147]:
Pandas Moving Window Functions
Pandas plot可视化
#!/usr/bin/python2.7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Series: accumulate 1000 standard-normal samples into a random walk and plot it.
samples = np.random.randn(1000)
data = pd.Series(samples, index=np.arange(1000)).cumsum()
data.plot()
plt.show()
#!/usr/bin/python2.7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# DataFrame: four columns (A-D) of standard-normal samples, accumulated
# column-wise so that each column becomes its own random walk.
steps = np.random.randn(1000, 4)
data = pd.DataFrame(steps, index=np.arange(1000), columns=list("ABCD")).cumsum()
# To inspect the first rows, uncomment: print(data.head(6))
# One line per column is drawn on a shared Axes.
data.plot()
plt.show()
#!/usr/bin/python2.7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# DataFrame: four columns (A-D) of standard-normal samples, accumulated column-wise.
walk = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
data = walk.cumsum()
# To inspect the first rows, uncomment: print(data.head(6))
# Available plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'pie', 'hexbin', ...
# Draw A-vs-B first, then pass the returned Axes to the second call so that
# A-vs-C is overlaid on the same plot.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class AB')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class AC', ax=ax)
plt.show()
补充:Matplotlib 3D图像
#!/usr/bin/python2.7
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Create the figure and attach a 3D axes to it.
# add_subplot(..., projection='3d') is used instead of Axes3D(fig): constructing
# Axes3D directly is deprecated since Matplotlib 3.4 and no longer adds the axes
# to the figure in newer releases, which leaves the window empty. The Axes3D
# import at the top of this script is still what registers the '3d' projection
# on older Matplotlib versions.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# X, Y values: sample [-4, 4) in steps of 0.25 along each axis, then expand
# both into 2D coordinate grids.
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
# Distance of every grid point from the origin.
R = np.sqrt(X**2 + Y**2)
# Height value: sine of the radial distance.
Z = np.sin(R)
# rstride/cstride = 1 uses every grid row and column for the surface.
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'))
plt.show()
#!/usr/bin/python2.7
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Create the figure and attach a 3D axes to it.
# add_subplot(..., projection='3d') is used instead of Axes3D(fig): constructing
# Axes3D directly is deprecated since Matplotlib 3.4 and no longer adds the axes
# to the figure in newer releases, which leaves the window empty. The Axes3D
# import at the top of this script is still what registers the '3d' projection
# on older Matplotlib versions.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# X, Y values: sample [-4, 4) in steps of 0.25 along each axis, then expand
# both into 2D coordinate grids.
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
# Distance of every grid point from the origin.
R = np.sqrt(X**2 + Y**2)
# Height value: sine of the radial distance.
Z = np.sin(R)
# rstride/cstride = 1 uses every grid row and column for the surface.
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'))
# Add filled contours of Z, projected along the z axis onto the plane z = -2.
ax.contourf(X, Y, Z, zdir='z', offset=-2, cmap='rainbow')
# Extend the z range down to -2 so the contour plane sits inside the axes.
ax.set_zlim(-2, 2)
plt.show()
参考:https://github.com/MorvanZhou
参考:https://morvanzhou.github.io/tutorials/