In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [7]:
s = pd.Series([1,2,3,np.nan,6,9])
s
Out[7]:
0 1.0
1 2.0
2 3.0
3 NaN
4 6.0
5 9.0
dtype: float64
In [ ]:
dates = pd.date_range('20170101', periods=6)
dates
Out[ ]:
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03','2017-01-04',
'2017-01-05','2017-01-06'],
dtype='datetime64[ns]', freq='D')
In [16]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df
Out[16]:
A | B | C | D | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 |
In [19]:
df2 = pd.DataFrame({'A':1,
'B':pd.Timestamp('20130102'),
'C':pd.Series(1, index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(['test', 'train', 'test', 'train']),
'F':'foo'})
df2
Out[19]:
A | B | C | D | E | F | |
0 | 1 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1 | 2013-01-02 | 1.0 | 3 | train | foo |
2 | 1 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1 | 2013-01-02 | 1.0 | 3 | train | foo |
In [20]:
df2.dtypes
Out[20]:
A int64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
In [22]:
df.head()
Out[22]:
A | B | C | D | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 |
In [24]:
df.tail(3)
Out[24]:
A | B | C | D | |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 |
In [25]:
df.index
Out[25]:
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03','2017-01-04',
'2017-01-05','2017-01-06'],
dtype='datetime64[ns]', freq='D')
In [27]:
df.columns
Out[27]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [28]:
df.values
Out[28]:
array([[-0.1077083 , -1.02418217, -0.67825098, -2.22939769],
[ 0.92230625, 0.81741758, 0.15266857, -0.00999563],
[-2.15156187,-0.39679118, -1.03003536, -0.56338102],
[-0.6566828 ,-0.8778064 , 0.56461162, 0.84630068],
[-0.12957523,-0.70444375, -0.69742558, -0.32805333],
[ 1.03385071, 0.35553587, 0.3263864 , -0.47715656]])
In [30]:
df.describe()
Out[30]:
A | B | C | D | |
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | -0.181562 | -0.305045 | -0.227008 | -0.460281 |
std | 1.167095 | 0.736130 | 0.655260 | 1.006785 |
min | -2.151562 | -1.024182 | -1.030035 | -2.229398 |
25% | -0.524906 | -0.834466 | -0.692632 | -0.541825 |
50% | -0.118642 | -0.550617 | -0.262791 | -0.402605 |
75% | 0.664803 | 0.167454 | 0.282957 | -0.089510 |
max | 1.033851 | 0.817418 | 0.564612 | 0.846301 |
In [31]:
df.T
Out[31]:
2017-01-01 00:00:00 | 2017-01-02 00:00:00 | 2017-01-03 00:00:00 | 2017-01-04 00:00:00 | 2017-01-05 00:00:00 | 2017-01-06 00:00:00 | |
A | -0.107708 | 0.922306 | -2.151562 | -0.656683 | -0.129575 | 1.033851 |
B | -1.024182 | 0.817418 | -0.396791 | -0.877806 | -0.704444 | 0.355536 |
C | -0.678251 | 0.152669 | -1.030035 | 0.564612 | -0.697426 | 0.326386 |
D | -2.229398 | -0.009996 | -0.563381 | 0.846301 | -0.328053 | -0.477157 |
In [46]:
df.sort_index(axis=1, ascending=False)
Out[46]:
D | C | B | A | |
2017-01-01 | -2.229398 | -0.678251 | -1.024182 | -0.107708 |
2017-01-02 | -0.009996 | 0.152669 | 0.817418 | 0.922306 |
2017-01-03 | -0.563381 | -1.030035 | -0.396791 | -2.151562 |
2017-01-04 | 0.846301 | 0.564612 | -0.877806 | -0.656683 |
2017-01-05 | -0.328053 | -0.697426 | -0.704444 | -0.129575 |
2017-01-06 | -0.477157 | 0.326386 | 0.355536 | 1.033851 |
In [64]:
df.sort_values(by='B')
Out[64]:
A | B | C | D | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 |
In [65]:
df['A']
Out[65]:
2017-01-01 -0.107708
2017-01-02 0.922306
2017-01-03 -2.151562
2017-01-04 -0.656683
2017-01-05 -0.129575
2017-01-06 1.033851
Freq: D, Name: A, dtype: float64
In [66]:
df[0:3]
Out[66]:
A | B | C | D | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 |
In [69]:
df.loc[dates[0]]
Out[69]:
A -0.107708
B -1.024182
C -0.678251
D -2.229398
Name: 2017-01-01 00:00:00, dtype: float64
In [70]:
df.loc[:, 'A':'B']
Out[70]:
A | B | |
2017-01-01 | -0.107708 | -1.024182 |
2017-01-02 | 0.922306 | 0.817418 |
2017-01-03 | -2.151562 | -0.396791 |
2017-01-04 | -0.656683 | -0.877806 |
2017-01-05 | -0.129575 | -0.704444 |
2017-01-06 | 1.033851 | 0.355536 |
In [71]:
df.loc['20170101':'20170103', 'A':'B']
Out[71]:
A | B | |
2017-01-01 | -0.107708 | -1.024182 |
2017-01-02 | 0.922306 | 0.817418 |
2017-01-03 | -2.151562 | -0.396791 |
In [72]:
df.loc['20170104', 'A':'B']
Out[72]:
A -0.656683
B -0.877806
Name: 2017-01-04 00:00:00, dtype: float64
In [73]:
df.loc['20170105','C']
Out[73]:
-0.69742558177888991
In [76]:
df.iloc[3]
Out[76]:
A -0.656683
B -0.877806
C 0.564612
D 0.846301
Name: 2017-01-04 00:00:00, dtype: float64
In [77]:
df.iloc[3:5, 0:2]
Out[77]:
A | B | |
2017-01-04 | -0.656683 | -0.877806 |
2017-01-05 | -0.129575 | -0.704444 |
In [79]:
df.iloc[[1,3,5],[0,3]]
Out[79]:
A | D | |
2017-01-02 | 0.922306 | -0.009996 |
2017-01-04 | -0.656683 | 0.846301 |
2017-01-06 | 1.033851 | -0.477157 |
In [80]:
df.iloc[:,1:3]
Out[80]:
B | C | |
2017-01-01 | -1.024182 | -0.678251 |
2017-01-02 | 0.817418 | 0.152669 |
2017-01-03 | -0.396791 | -1.030035 |
2017-01-04 | -0.877806 | 0.564612 |
2017-01-05 | -0.704444 | -0.697426 |
2017-01-06 | 0.355536 | 0.326386 |
In [81]:
df.iloc[1,1]
Out[81]:
0.81741758295028633
In [82]:
df.iat[1,1]
Out[82]:
0.81741758295028633
In [83]:
df[df.A>0]
Out[83]:
A | B | C | D | |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 |
In [84]:
df[df>0]
Out[84]:
A | B | C | D | |
2017-01-01 | NaN | NaN | NaN | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | NaN |
2017-01-03 | NaN | NaN | NaN | NaN |
2017-01-04 | NaN | NaN | 0.564612 | 0.846301 |
2017-01-05 | NaN | NaN | NaN | NaN |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | NaN |
In [86]:
df3=df.copy()
df3['E']=['one','one', 'two','three','four','five']
df3
Out[86]:
A | B | C | D | E | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 | one |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 | one |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 | two |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 | three |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 | four |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 | five |
In [88]:
df3[df3['E'].isin(['two', 'four'])]
Out[88]:
A | B | C | D | E | |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 | two |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 | four |
- 设置 #### 1、设置一个新的列:
In [93]:
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20170102',periods=6))
s1
Out[93]:
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
2017-01-06 5
2017-01-07 6
Freq: D, dtype: int64
2、通过标签设置新的值:¶
In [94]:
df['F'] = s1
df
Out[94]:
A | B | C | D | F | |
2017-01-01 | -0.107708 | -1.024182 | -0.678251 | -2.229398 | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 | 2.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 | 3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 | 4.0 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 | 5.0 |
In [95]:
df.at[dates[0], 'A'] = 0
df
Out[95]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | -1.024182 | -0.678251 | -2.229398 | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 | 2.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 | 3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 | 4.0 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 | 5.0 |
In [97]:
df.iat[0,1] = 0
df
Out[97]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | -2.229398 | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | -0.009996 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -0.563381 | 2.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 0.846301 | 3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -0.328053 | 4.0 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | -0.477157 | 5.0 |
4、通过numpy数组设置一组新值:¶
In [99]:
df.loc[:,'D'] = np.array([5]*len(df))
df
Out[99]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | 5 | 2.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 5 | 3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | 5 | 4.0 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | 5 | 5.0 |
5、通过where操作来设置新的值:¶
In [101]:
df4 = df.copy()
df4[df4 >0] = -df4
df4
Out[101]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | -5 | NaN |
2017-01-02 | -0.922306 | -0.817418 | -0.152669 | -5 | -1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | -5 | -2.0 |
2017-01-04 | -0.656683 | -0.877806 | -0.564612 | -5 | -3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | -5 | -4.0 |
2017-01-06 | -1.033851 | -0.355536 | -0.326386 | -5 | -5.0 |
四、缺失值处理¶
在pandas中,使用np.nan来代替缺失值,这些值将默认不会包含在计算中,详细请参阅:MissingData Section
1、reindex()方法可以对指定轴上的索引进行改变/增加/删除操作,这将返回原始数据的一个拷贝:¶
In [105]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) +['E'])
df1
Out[105]:
A | B | C | D | F | E | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | NaN | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 | NaN |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | 5 | 2.0 | NaN |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 5 | 3.0 | NaN |
In [106]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
Out[106]:
A | B | C | D | F | E | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | NaN | 1.0 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | 5 | 2.0 | NaN |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 5 | 3.0 | NaN |
2、去掉包含缺失值的行:¶
In [107]:
df1.dropna(how='any')
Out[107]:
A | B | C | D | F | E | |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 | 1.0 |
3、对缺失值进行填充:¶
In [108]:
df1.fillna(value=5)
Out[108]:
A | B | C | D | F | E | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | 5.0 | 1.0 |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | 5 | 2.0 | 5.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 5 | 3.0 | 5.0 |
4、对数据进行布尔填充:¶
In [109]:
pd.isnull(df)
Out[109]:
A | B | C | D | F | |
2017-01-01 | False | False | False | False | True |
2017-01-02 | False | False | False | False | False |
2017-01-03 | False | False | False | False | False |
2017-01-04 | False | False | False | False | False |
2017-01-05 | False | False | False | False | False |
2017-01-06 | False | False | False | False | False |
五、 相关操作¶
详情请参阅BasicSection OnBinary Ops
- 统计(相关操作通常情况不包括缺失值)
1、执行描述性统计:¶
In [112]:
df.mean()
Out[112]:
A -0.163610
B -0.134348
C -0.227008
D 5.000000
F 3.000000
dtype: float64
2、在其它轴上进行相同操作:¶
In [113]:
df.mean(1)
Out[113]:
2017-01-01 1.080437
2017-01-02 1.578478
2017-01-03 0.684322
2017-01-04 1.406024
2017-01-05 1.493711
2017-01-06 2.343155
Freq: D, dtype: float64
3、对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行广播:¶
In [120]:
s3 =pd.Series([1,3,4,np.nan,6,8], index=dates).shift(1)
s3
Out[120]:
2017-01-01 NaN
2017-01-02 1.0
2017-01-03 3.0
2017-01-04 4.0
2017-01-05 NaN
2017-01-06 6.0
Freq: D, dtype: float64
In [123]:
df.sub(s3, axis='index')
Out[123]:
A | B | C | D | F | |
2017-01-01 | NaN | NaN | NaN | NaN | NaN |
2017-01-02 | -0.077694 | -0.182582 | -0.847331 | 4.0 | 0.0 |
2017-01-03 | -5.151562 | -3.396791 | -4.030035 | 2.0 | -1.0 |
2017-01-04 | -4.656683 | -4.877806 | -3.435388 | 1.0 | -1.0 |
2017-01-05 | NaN | NaN | NaN | NaN | NaN |
2017-01-06 | -4.966149 | -5.644464 | -5.673614 | -1.0 | -1.0 |
In [122]:
df
Out[122]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | NaN |
2017-01-02 | 0.922306 | 0.817418 | 0.152669 | 5 | 1.0 |
2017-01-03 | -2.151562 | -0.396791 | -1.030035 | 5 | 2.0 |
2017-01-04 | -0.656683 | -0.877806 | 0.564612 | 5 | 3.0 |
2017-01-05 | -0.129575 | -0.704444 | -0.697426 | 5 | 4.0 |
2017-01-06 | 1.033851 | 0.355536 | 0.326386 | 5 | 5.0 |
In [126]:
df.apply(np.cumsum)
Out[126]:
A | B | C | D | F | |
2017-01-01 | 0.000000 | 0.000000 | -0.678251 | 5 | NaN |
2017-01-02 | 0.922306 | 0.817418 | -0.525582 | 10 | 1.0 |
2017-01-03 | -1.229256 | 0.420626 | -1.555618 | 15 | 3.0 |
2017-01-04 | -1.885938 | -0.457180 | -0.991006 | 20 | 6.0 |
2017-01-05 | -2.015514 | -1.161624 | -1.688432 | 25 | 10.0 |
2017-01-06 | -0.981663 | -0.806088 | -1.362045 | 30 | 15.0 |
In [127]:
df.apply(lambda x:x.max() - x.min())
Out[127]:
A 3.185413
B 1.695224
C 1.594647
D 0.000000
F 4.000000
dtype: float64
In [128]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[128]:
0 6
1 1
2 3
3 6
4 5
5 3
6 3
7 3
8 5
9 4
dtype: int32
In [129]:
s.value_counts()
Out[129]:
3 4
6 2
5 2
4 1
1 1
dtype: int64
In [130]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA','dog', 'cat'])
s.str.lower()
Out[130]:
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
In [131]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[131]:
0 | 1 | 2 | 3 | |
0 | 0.112665 | 0.048688 | 0.202798 | 0.025471 |
1 | -1.390846 | -0.538326 | -0.078290 | -0.995754 |
2 | 2.089888 | 0.442472 | 0.038945 | -1.785647 |
3 | -0.063596 | -0.104295 | 1.330084 | 0.673432 |
4 | -1.090966 | -1.106314 | 1.203585 | 0.752954 |
5 | 0.650855 | -0.551656 | -0.476231 | 0.874108 |
6 | 0.515420 | -0.099656 | -1.028804 | 0.163295 |
7 | -1.587624 | -1.178458 | 0.869725 | 0.553387 |
8 | -1.547900 | -0.387638 | 0.458020 | -0.131574 |
9 | 0.696191 | -0.826334 | 0.482097 | -1.577040 |
In [135]:
pieces = [df[:3], df[3:7], df[7:]]
pieces
Out[135]:
[ 0 1 2 3
0 0.112665 0.048688 0.202798 0.025471
1 -1.390846 -0.538326-0.078290 -0.995754
2 2.089888 0.442472 0.038945 -1.785647,
0 1 2 3
3 -0.063596 -0.104295 1.330084 0.673432
4 -1.090966 -1.106314 1.203585 0.752954
5 0.650855 -0.551656 -0.476231 0.874108
6 0.515420 -0.099656 -1.028804 0.163295,
0 1 2 3
7 -1.587624 -1.178458 0.869725 0.553387
8 -1.547900 -0.387638 0.458020 -0.131574
9 0.696191 -0.826334 0.482097 -1.577040]
In [136]:
pd.concat(pieces)
Out[136]:
0 | 1 | 2 | 3 | |
0 | 0.112665 | 0.048688 | 0.202798 | 0.025471 |
1 | -1.390846 | -0.538326 | -0.078290 | -0.995754 |
2 | 2.089888 | 0.442472 | 0.038945 | -1.785647 |
3 | -0.063596 | -0.104295 | 1.330084 | 0.673432 |
4 | -1.090966 | -1.106314 | 1.203585 | 0.752954 |
5 | 0.650855 | -0.551656 | -0.476231 | 0.874108 |
6 | 0.515420 | -0.099656 | -1.028804 | 0.163295 |
7 | -1.587624 | -1.178458 | 0.869725 | 0.553387 |
8 | -1.547900 | -0.387638 | 0.458020 | -0.131574 |
9 | 0.696191 | -0.826334 | 0.482097 | -1.577040 |
In [137]:
left = pd.DataFrame({'key':['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo', 'foo'], 'rval':[4,5]})
left
Out[137]:
key | lval | |
0 | foo | 1 |
1 | foo | 2 |
In [138]:
right
Out[138]:
key | rval | |
0 | foo | 4 |
1 | foo | 5 |
In [140]:
pd.merge(left, right, on='key')
Out[140]:
key | lval | rval | |
0 | foo | 1 | 4 |
1 | foo | 1 | 5 |
2 | foo | 2 | 4 |
3 | foo | 2 | 5 |
In [141]:
df = pd.DataFrame(np.random.randn(8,4),columns=['A','B','C','D'])
df
Out[141]:
A | B | C | D | |
0 | -0.277390 | -0.004491 | 0.328250 | -0.752631 |
1 | 1.246361 | -0.560859 | 0.140553 | -2.784473 |
2 | -0.777135 | -1.778617 | 1.686014 | -0.227608 |
3 | -0.016098 | 0.530940 | -1.249833 | 1.977533 |
4 | -1.018847 | 0.540397 | -0.241101 | -1.287702 |
5 | 1.836941 | 1.422687 | 0.376302 | -0.804313 |
6 | 0.279718 | 0.694741 | 0.363781 | 1.292481 |
7 | 0.575892 | -2.357896 | 0.781150 | 0.789141 |
In [142]:
s = df.loc[3]
df.append(s, ignore_index=True)
Out[142]:
A | B | C | D | |
0 | -0.277390 | -0.004491 | 0.328250 | -0.752631 |
1 | 1.246361 | -0.560859 | 0.140553 | -2.784473 |
2 | -0.777135 | -1.778617 | 1.686014 | -0.227608 |
3 | -0.016098 | 0.530940 | -1.249833 | 1.977533 |
4 | -1.018847 | 0.540397 | -0.241101 | -1.287702 |
5 | 1.836941 | 1.422687 | 0.376302 | -0.804313 |
6 | 0.279718 | 0.694741 | 0.363781 | 1.292481 |
7 | 0.575892 | -2.357896 | 0.781150 | 0.789141 |
8 | -0.016098 | 0.530940 | -1.249833 | 1.977533 |
In [143]:
df = pd.DataFrame({'A':['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'bar'],
'B':['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C':np.random.randn(8),
'D':np.random.randn(8)})
df
Out[143]:
A | B | C | D | |
0 | foo | one | -0.200720 | -2.172299 |
1 | bar | one | 0.701414 | 1.142280 |
2 | foo | two | -0.384221 | -0.842057 |
3 | bar | three | -0.883506 | -0.683085 |
4 | foo | two | 1.438267 | 0.485135 |
5 | bar | two | 0.252307 | 1.145261 |
6 | foo | one | 0.572104 | 0.869513 |
7 | bar | three | 0.760147 | 0.056735 |
In [146]:
df.groupby('A').sum()
Out[146]:
C | D | |
A | ||
bar | 0.830361 | 1.661191 |
foo | 1.425430 | -1.659709 |
In [147]:
df.groupby(['A', 'B']).sum()
Out[147]:
C | D | ||
A | B | ||
bar | one | 0.701414 | 1.142280 |
three | -0.123359 | -0.626350 | |
two | 0.252307 | 1.145261 | |
foo | one | 0.371384 | -1.302786 |
two | 1.054046 | -0.356923 |
In [158]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
tuples
Out[158]:
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
In [159]:
index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
index
Out[159]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1,1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first','second'])
In [163]:
df = pd.DataFrame(np.random.randn(8, 2), index=index,columns=['A', 'B'])
df
Out[163]:
A | B | ||
first | second | ||
bar | one | 2.496134 | -0.893512 |
two | 2.286660 | -0.248709 | |
baz | one | 0.147845 | 0.985195 |
two | -0.529439 | -0.095119 | |
foo | one | 0.618096 | -0.364225 |
two | -0.090616 | 0.032860 | |
qux | one | -0.912772 | -0.715831 |
two | -1.164991 | -1.050507 |
In [164]:
stacked = df.stack()
stacked
Out[164]:
first second
bar one A 2.496134
B -0.893512
two A 2.286660
B -0.248709
baz one A 0.147845
B 0.985195
two A -0.529439
B -0.095119
foo one A 0.618096
B -0.364225
two A -0.090616
B 0.032860
qux one A -0.912772
B -0.715831
two A -1.164991
B -1.050507
dtype: float64
In [166]:
stacked.unstack()
Out[166]:
A | B | ||
first | second | ||
bar | one | 2.496134 | -0.893512 |
two | 2.286660 | -0.248709 | |
baz | one | 0.147845 | 0.985195 |
two | -0.529439 | -0.095119 | |
foo | one | 0.618096 | -0.364225 |
two | -0.090616 | 0.032860 | |
qux | one | -0.912772 | -0.715831 |
two | -1.164991 | -1.050507 |
In [167]:
stacked.unstack(1)
Out[167]:
second | one | two | |
first | |||
bar | A | 2.496134 | 2.286660 |
B | -0.893512 | -0.248709 | |
baz | A | 0.147845 | -0.529439 |
B | 0.985195 | -0.095119 | |
foo | A | 0.618096 | -0.090616 |
B | -0.364225 | 0.032860 | |
qux | A | -0.912772 | -1.164991 |
B | -0.715831 | -1.050507 |
In [168]:
stacked.unstack(0)
Out[168]:
first | bar | baz | foo | qux | |
second | |||||
one | A | 2.496134 | 0.147845 | 0.618096 | -0.912772 |
B | -0.893512 | 0.985195 | -0.364225 | -0.715831 | |
two | A | 2.286660 | -0.529439 | -0.090616 | -1.164991 |
B | -0.248709 | -0.095119 | 0.032860 | -1.050507 |
In [ ]: