Python数据科学入门(Pandas玩转数据)
慕课网数据科学入门课程学习笔记
一、Series和DataFrame的简单数学运算
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.Series 相加
s1 = Series([1,2,3],index=['A','B','C'])
s1
A 1
B 2
C 3
dtype: int64
s2 = Series([4,5,6,7],index=['B','C','D','E'])
s2
B 4
C 5
D 6
E 7
dtype: int64
s1 + s2
A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64
2.Dataframe运算
df1 = DataFrame(np.arange(4).reshape(2,2),index=['A','B'],columns=['AA','BB'])
df1
df2 = DataFrame(np.arange(9).reshape(3,3),index=['A','B','C'],columns=['AA','BB','CC'])
df2
df1 + df2
| AA | BB | CC |
---|
A | 0.0 | 2.0 | NaN |
---|
B | 5.0 | 7.0 | NaN |
---|
C | NaN | NaN | NaN |
---|
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]],
index=['A','B','C'],columns=['c1','c2','c3'])
df3
df3.sum()
c1 12.0
c2 15.0
c3 12.0
dtype: float64
df3.min()
c1 1.0
c2 2.0
c3 3.0
dtype: float64
df3.describe()
| c1 | c2 | c3 |
---|
count | 3.0 | 3.0 | 2.000000 |
---|
mean | 4.0 | 5.0 | 6.000000 |
---|
std | 3.0 | 3.0 | 4.242641 |
---|
min | 1.0 | 2.0 | 3.000000 |
---|
25% | 2.5 | 3.5 | 4.500000 |
---|
50% | 4.0 | 5.0 | 6.000000 |
---|
75% | 5.5 | 6.5 | 7.500000 |
---|
max | 7.0 | 8.0 | 9.000000 |
---|
二、Series和DataFrame的排序
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.Series排序
s1 = Series(np.random.randn(10))
s1
0 0.773184
1 -0.403266
2 0.980296
3 -0.631971
4 0.636405
5 0.732921
6 0.756932
7 -0.597943
8 -1.152224
9 0.771753
dtype: float64
s1.sort_values()
8 -1.152224
3 -0.631971
7 -0.597943
1 -0.403266
4 0.636405
5 0.732921
6 0.756932
9 0.771753
0 0.773184
2 0.980296
dtype: float64
s1.sort_index(ascending=False)
9 0.771753
8 -1.152224
7 -0.597943
6 0.756932
5 0.732921
4 0.636405
3 -0.631971
2 0.980296
1 -0.403266
0 0.773184
dtype: float64
2.DataFrame排序
df1 = DataFrame(np.random.randn(20).reshape(4,5),columns=['a','b','c','d','e'])
df1
| a | b | c | d | e |
---|
0 | 0.173908 | -0.978290 | -0.791529 | -1.769024 | -1.971549 |
---|
1 | 0.553871 | -1.436033 | -1.348179 | 0.244897 | 1.465071 |
---|
2 | 1.378478 | -0.869294 | 0.797711 | -0.539044 | -1.211413 |
---|
3 | 2.739525 | -0.111984 | -1.468831 | 1.418846 | 0.182972 |
---|
df1.sort_values('a',ascending=False)
| a | b | c | d | e |
---|
3 | 2.739525 | -0.111984 | -1.468831 | 1.418846 | 0.182972 |
---|
2 | 1.378478 | -0.869294 | 0.797711 | -0.539044 | -1.211413 |
---|
1 | 0.553871 | -1.436033 | -1.348179 | 0.244897 | 1.465071 |
---|
0 | 0.173908 | -0.978290 | -0.791529 | -1.769024 | -1.971549 |
---|
三、重命名DataFrame的index
import numpy as np
import pandas as pa
from pandas import Series,DataFrame
1.重命名 index
df1 = DataFrame(np.arange(9).reshape(3,3),index=['A','B','C'],columns=['AA','BB','CC'])
df1
df1.index=Series(['a','b','c'])
df1
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df1.index = df1.index.map(str.upper)
df1
df1.rename(index=str.lower,columns=str.lower)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df1
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
df1.rename(index={'A':'g'})
2.复习
list1 = [1,2,3,4]
list2 = ['1','2','3','4']
[str(x) for x in list1]
[‘1’, ‘2’, ‘3’, ‘4’]
list(map(str,list1))
[‘1’, ‘2’, ‘3’, ‘4’]
3.写一个自己的 map
def test_map(x):
return x + '_ABC'
df1.index.map(test_map)
Index([‘A_ABC’, ‘B_ABC’, ‘C_ABC’], dtype=’object’)
df1.rename(index=test_map)
| AA | BB | CC |
---|
A_ABC | 0 | 1 | 2 |
---|
B_ABC | 3 | 4 | 5 |
---|
C_ABC | 6 | 7 | 8 |
---|
四、DataFrame 的merge(合并)操作
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({'key':['X','Y','Z'],'data_set_1':[1,2,3]})
df1
df2 = DataFrame({'key':['X','X','C'],'data_set_2':[4,5,6]})
df2
pd.merge(df1,df2)
| key | data_set_1 | data_set_2 |
---|
0 | X | 1 | 4 |
---|
1 | X | 1 | 5 |
---|
pd.merge(df1,df2,on='key',how='left')
| key | data_set_1 | data_set_2 |
---|
0 | X | 1 | 4.0 |
---|
1 | X | 1 | 5.0 |
---|
2 | Y | 2 | NaN |
---|
3 | Z | 3 | NaN |
---|
pd.merge(df1,df2,on='key',how='right')
| key | data_set_1 | data_set_2 |
---|
0 | X | 1.0 | 4 |
---|
1 | X | 1.0 | 5 |
---|
2 | C | NaN | 6 |
---|
pd.merge(df1,df2,on='key',how='outer')
| key | data_set_1 | data_set_2 |
---|
0 | X | 1.0 | 4.0 |
---|
1 | X | 1.0 | 5.0 |
---|
2 | Y | 2.0 | NaN |
---|
3 | Z | 3.0 | NaN |
---|
4 | C | NaN | 6.0 |
---|
五、Concatenaten和Combine
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1>Concatenaten
1.Numpy array
arr1 = np.arange(9).reshape(3,3)
arr1
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
arr2 = np.arange(9).reshape(3,3)
arr2
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
np.concatenate([arr1,arr2])
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
2.Series
s1 = Series([1,2,3],index=['X','Y','Z'])
s2 = Series([4,5],index=['A','B'])
pd.concat([s1,s2])
X 1
Y 2
Z 3
A 4
B 5
dtype: int64
pd.concat([s1,s2],axis=1)
| 0 | 1 |
---|
A | NaN | 4.0 |
---|
B | NaN | 5.0 |
---|
X | 1.0 | NaN |
---|
Y | 2.0 | NaN |
---|
Z | 3.0 | NaN |
---|
3.DataFrame
df1 = DataFrame(np.random.randn(4,3),columns=['X','Y,','Z'])
df1
| X | Y, | Z |
---|
0 | 0.542921 | 1.710888 | 1.027810 |
---|
1 | 0.624385 | -0.825916 | 0.893589 |
---|
2 | -1.000241 | 2.551461 | 1.541975 |
---|
3 | 0.020806 | 0.635956 | 0.573629 |
---|
df2 = DataFrame(np.random.randn(3,3),columns=['X','Y,','A'])
df2
| X | Y, | A |
---|
0 | -0.721702 | -0.106360 | -1.701335 |
---|
1 | 0.591009 | 0.233578 | 0.212189 |
---|
2 | 0.855895 | -1.831944 | 0.247578 |
---|
pd.concat([df1,df2])
| A | X | Y, | Z |
---|
0 | NaN | 0.542921 | 1.710888 | 1.027810 |
---|
1 | NaN | 0.624385 | -0.825916 | 0.893589 |
---|
2 | NaN | -1.000241 | 2.551461 | 1.541975 |
---|
3 | NaN | 0.020806 | 0.635956 | 0.573629 |
---|
0 | -1.701335 | -0.721702 | -0.106360 | NaN |
---|
1 | 0.212189 | 0.591009 | 0.233578 | NaN |
---|
2 | 0.247578 | 0.855895 | -1.831944 | NaN |
---|
2> Combine
1.Series
s1 = Series([2,np.nan,4,np.nan],index=['A','B','C','D'])
s1
A 2.0
B NaN
C 4.0
D NaN
dtype: float64
s2 = Series([1,2,3,4],index=['A','B','C','D'])
s2
A 1
B 2
C 3
D 4
dtype: int64
s1.combine_first(s2)
A 2.0
B 2.0
C 4.0
D 4.0
dtype: float64
2.DataFrame
df1 = DataFrame({'X':[1, np.nan, 3, np.nan],
'Y':[5, np.nan, 7, np.nan],
'Z':[9, np.nan, 11, np.nan]})
df1
| X | Y | Z |
---|
0 | 1.0 | 5.0 | 9.0 |
---|
1 | NaN | NaN | NaN |
---|
2 | 3.0 | 7.0 | 11.0 |
---|
3 | NaN | NaN | NaN |
---|
df2 = DataFrame({'Z':[np.nan, 10, np.nan, 12],
'A':[1,2,3,4],})
df2
df1.combine_first(df2)
| A | X | Y | Z |
---|
0 | 1.0 | 1.0 | 5.0 | 9.0 |
---|
1 | 2.0 | NaN | NaN | 10.0 |
---|
2 | 3.0 | 3.0 | 7.0 | 11.0 |
---|
3 | 4.0 | NaN | NaN | 12.0 |
---|
六、通过apply进行数据预处理
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('data.csv')
df.head()
| time | data |
---|
0 | 1124352.0 | AA: BB: CC: DD |
---|
1 | 1124352.0 | AA: BB: CC: DD |
---|
2 | 1124352.0 | AA: BB: CC: DD |
---|
3 | 1124353.0 | AA: BB: CC: DD |
---|
4 | 1124353.0 | AA: BB: CC: DD |
---|
df.size
190
s1 = Series(['a']*190)
df['A'] = s1
df.head()
| time | data | A |
---|
0 | 1124352.0 | AA: BB: CC: DD | a |
---|
1 | 1124352.0 | AA: BB: CC: DD | a |
---|
2 | 1124352.0 | AA: BB: CC: DD | a |
---|
3 | 1124353.0 | AA: BB: CC: DD | a |
---|
4 | 1124353.0 | AA: BB: CC: DD | a |
---|
df['A'] = df['A'].apply(str.upper)
df.head()
| time | data | A |
---|
0 | 1124352.0 | AA: BB: CC: DD | A |
---|
1 | 1124352.0 | AA: BB: CC: DD | A |
---|
2 | 1124352.0 | AA: BB: CC: DD | A |
---|
3 | 1124353.0 | AA: BB: CC: DD | A |
---|
4 | 1124353.0 | AA: BB: CC: DD | A |
---|
将data分割多列
df['data'][0].split(' ')
[”, ‘AA:’, ‘BB:’, ‘CC:’, ‘DD’]
list_1 = df['data'][0].strip().split(' ')
list_1
[‘AA:’, ‘BB:’, ‘CC:’, ‘DD’]
要求:将第2列和第4列取出重新放入data列
使用 head() 需要注意,如果对 df 操作时使用1了 head()会使只操作了前几行
使用 head() 来查看就好
list_1[1],list_1[3]
(‘BB:’, ‘DD’)
def foo(line):
items = line.strip().split(' ')
return Series([items[1],items[3]])
df_tmp = df['data'].apply(foo)
df_tmp.head()
| 0 | 1 |
---|
0 | BB: | DD |
---|
1 | BB: | DD |
---|
2 | BB: | DD |
---|
3 | BB: | DD |
---|
4 | BB: | DD |
---|
df_tmp = df_tmp.rename(columns={0:"B",1:"D"})
df_tmp.head()
| B | D |
---|
0 | BB: | DD |
---|
1 | BB: | DD |
---|
2 | BB: | DD |
---|
3 | BB: | DD |
---|
4 | BB: | DD |
---|
df.head()
| time | data | A |
---|
0 | 1124352.0 | AA: BB: CC: DD | A |
---|
1 | 1124352.0 | AA: BB: CC: DD | A |
---|
2 | 1124352.0 | AA: BB: CC: DD | A |
---|
3 | 1124353.0 | AA: BB: CC: DD | A |
---|
4 | 1124353.0 | AA: BB: CC: DD | A |
---|
df_new = df.combine_first(df_tmp)
df_new.head()
| A | B | D | data | time |
---|
0 | A | BB: | DD | AA: BB: CC: DD | 1124352.0 |
---|
1 | A | BB: | DD | AA: BB: CC: DD | 1124352.0 |
---|
2 | A | BB: | DD | AA: BB: CC: DD | 1124352.0 |
---|
3 | A | BB: | DD | AA: BB: CC: DD | 1124353.0 |
---|
4 | A | BB: | DD | AA: BB: CC: DD | 1124353.0 |
---|
del df_new['A']
del df_new['data']
df_new.head()
| B | D | time |
---|
0 | BB: | DD | 1124352.0 |
---|
1 | BB: | DD | 1124352.0 |
---|
2 | BB: | DD | 1124352.0 |
---|
3 | BB: | DD | 1124353.0 |
---|
4 | BB: | DD | 1124353.0 |
---|
df_new.to_csv('df_new.csv')
——–
七、通过去重清洗数据
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('data.csv')
df.head()
| time | data |
---|
0 | 1124352 | AA: BB: CC: DD |
---|
1 | 1124352 | AA: BB: CC: DD |
---|
2 | 1124352 | AA: BB: CC: DD |
---|
3 | 1124353 | AA: BB: CC: DD |
---|
4 | 1124353 | FF: BB: CC: DD |
---|
df.size
190
len(df)
95
len(df['time'].unique())
22
去重
df['time'].duplicated().head()
0 False
1 True
2 True
3 False
4 True
Name: time, dtype: bool
df_new = df.drop_duplicates()
df_new.head()
| time | data |
---|
0 | 1124352 | AA: BB: CC: DD |
---|
3 | 1124353 | AA: BB: CC: DD |
---|
4 | 1124353 | FF: BB: CC: DD |
---|
6 | 1124354 | AA: BB: CC: DD |
---|
7 | 1124354 | CC: BB : CC: DD |
---|
df_new = df.drop_duplicates({'data'},keep='last')
df_new.head()
| time | data |
---|
4 | 1124353 | FF: BB: CC: DD |
---|
7 | 1124354 | CC: BB : CC: DD |
---|
94 | 1124373 | AA: BB: CC: DD |
---|
八、时间序列的操作基础
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from datetime import datetime
t1 = datetime(2009,10,20)
t1
datetime.datetime(2009, 10, 20, 0, 0)
date_list = [
datetime(2016,9,1),
datetime(2016,9,10),
datetime(2017,9,1),
datetime(2017,9,10),
datetime(2017,10,1)
]
date_list
[datetime.datetime(2016, 9, 1, 0, 0),
datetime.datetime(2016, 9, 10, 0, 0),
datetime.datetime(2017, 9, 1, 0, 0),
datetime.datetime(2017, 9, 10, 0, 0),
datetime.datetime(2017, 10, 1, 0, 0)]
s1 = Series(np.random.rand(5),index=date_list)
s1
2016-09-01 0.497374
2016-09-10 0.686276
2017-09-01 0.788562
2017-09-10 0.765383
2017-10-01 0.546197
dtype: float64
s1[1]
0.6862758320157275
s1[datetime(2016,9,10)]
0.6862758320157275
s1['2016-9-10']
0.6862758320157275
s1['20160910']
0.6862758320157275
s1['2016-9']
2016-09-01 0.497374
2016-09-10 0.686276
dtype: float64
s1['2017']
2017-09-01 0.788562
2017-09-10 0.765383
2017-10-01 0.546197
dtype: float64
date_list_new = pd.date_range(start='2017-01-01',periods=10)
date_list_new
DatetimeIndex([‘2017-01-01’, ‘2017-01-02’, ‘2017-01-03’, ‘2017-01-04…)
s2 = Series(np.random.rand(10),index=date_list_new)
s2
2017-01-01 0.152626
2017-01-02 0.590219
…
Freq: D, dtype: float64
——
九、时间序列数据的采样和画图
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
t_range = pd.date_range('2016-01-01','2016-12-01')
t_range
DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-
….)
s1 = Series(np.random.randn(len(t_range)),index=t_range)
s1.head()
2016-01-01 0.145550
2016-01-02 2.444428
...
Freq: D, dtype: float64
采样
s1['2016-01'].mean()
-0.14446619115276657
s1_month = s1.resample('M').mean()
s1_month.head()
2016-01-31 -0.144466
2016-02-29 0.083245
2016-03-31 0.204149
Freq: M, dtype: float64
数据填充
s1_test = s1.resample('H').ffill()
s1_test.head()
2016-01-01 00:00:00 0.14555
2016-01-01 01:00:00 0.14555
2016-01-01 02:00:00 0.14555
2016-01-01 03:00:00 0.14555
2016-01-01 04:00:00 0.14555
Freq: H, dtype: float64
s1_test = s1.resample('H').bfill()
s1_test.head()
2016-01-01 00:00:00 0.145550
2016-01-01 01:00:00 2.444428
2016-01-01 02:00:00 2.444428
2016-01-01 03:00:00 2.444428
2016-01-01 04:00:00 2.444428
Freq: H, dtype: float64
简单画图
t_range = pd.date_range('2016-01-01','2016-12-31',freq='H')
t_range
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
'2016-01-01 02:00:00', '2016-01-01 03:00:00',
'2016-01-01 04:00:00', '2016-01-01 05:00:00',
'2016-01-01 06:00:00', '2016-01-01 07:00:00',
'2016-01-01 08:00:00', '2016-01-01 09:00:00',
...
)
stock_df = DataFrame(index=t_range)
stock_df.head(10)
|
---|
2016-01-01 00:00:00 |
---|
2016-01-01 01:00:00 |
---|
2016-01-01 02:00:00 |
---|
2016-01-01 03:00:00 |
---|
2016-01-01 04:00:00 |
---|
2016-01-01 05:00:00 |
---|
stock_df['data1'] = np.random.randint(80,160,size=len(t_range))
stock_df.head(10)
| data1 |
---|
2016-01-01 00:00:00 | 147 |
---|
2016-01-01 01:00:00 | 147 |
---|
2016-01-01 02:00:00 | 159 |
---|
2016-01-01 03:00:00 | 99 |
---|
2016-01-01 04:00:00 | 119 |
---|
2016-01-01 05:00:00 | 136 |
---|
2016-01-01 06:00:00 | 102 |
---|
stock_df['data2'] = np.random.randint(40,140,size=len(t_range))
stock_df.head(10)
| data1 | data2 |
---|
2016-01-01 00:00:00 | 147 | 44 |
---|
2016-01-01 01:00:00 | 147 | 93 |
---|
2016-01-01 02:00:00 | 159 | 81 |
---|
2016-01-01 03:00:00 | 99 | 108 |
---|
2016-01-01 04:00:00 | 119 | 55 |
---|
2016-01-01 05:00:00 | 136 | 106 |
---|
stock_df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x249d416d438>

import matplotlib.pyplot as plt
plt.show()
按周取样再绘图
weekly_df = DataFrame()
weekly_df['data1'] = stock_df['data1'].resample('W').mean()
weekly_df['data2'] = stock_df['data2'].resample('W').mean()
weekly_df.head()
| data1 | data2 |
---|
2016-01-03 | 119.000000 | 85.111111 |
---|
2016-01-10 | 119.095238 | 90.345238 |
---|
weekly_df.plot()
plt.show()

十、数据分箱技术-Binning
什么是数据分箱?
例如 不同大小的苹果分装在不同的箱子
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
score_list = np.random.randint(0,100,size=20)
score_list
array([87, 83, 40, 14, 4, 31, 46, 17, 96, 99, 39, 4, 85, 8, 64, 82, 32,
28, 36, 40])
bins = [0,59,70,80,100]
数据分箱
score_cut = pd.cut(score_list,bins)
score_cut
[(80, 100], (80, 100], (0, 59], (0, 59], (0, 59], …, (80, 100], (0, 59], (0, 59], (0, 59], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59]
pd.value_counts(score_cut)
(0, 59] 13
(80, 100] 6
(59, 70] 1
(70, 80] 0
dtype: int64
df = DataFrame()
df['score'] = score_list
df['student'] = [pd.util.testing.rands(3) for i in range(20)]
df.head()
| score | student |
---|
0 | 87 | w5S |
---|
1 | 83 | cQX |
---|
2 | 40 | JYH |
---|
3 | 14 | Cs3 |
---|
4 | 4 | niX |
---|
数据分箱 cut()函数
cut(数据,区间-list,labels=[‘各区间的表示标签’,’…’])
df['Categories'] = pd.cut(df['score'],bins,labels=['Low','OK','Good','Great'])
df.head()
| score | student | Categories |
---|
0 | 87 | w5S | Great |
---|
1 | 83 | cQX | Great |
---|
2 | 40 | JYH | Low |
---|
3 | 14 | Cs3 | Low |
---|
4 | 4 | niX | Low |
---|
十一、数据分组技术-GroupBy
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
原理类似SQL
df = pd.read_csv('weather.csv')
df.head()
| date | city | temperature | wind |
---|
0 | 2017/1/1 | GZ | 12 | 3 |
---|
1 | 2017/1/8 | BJ | 2 | 5 |
---|
2 | 2017/1/15 | SH | -4 | 6 |
---|
3 | 2017/1/22 | GZ | 6 | 1 |
---|
4 | 2017/1/29 | BJ | 13 | 3 |
---|
分组
g = df.groupby(df['city'])
g
g.groups
{‘BJ’: Int64Index([1, 4, 7, 10, 13, 16, 19, 22, 25, 28], dtype=’int64’),
‘GZ’: Int64Index([0, 3, 6, 9, 12, 15, 18, 21, 24, 27], dtype=’int64’),
‘SH’: Int64Index([2, 5, 8, 11, 14, 17, 20, 23, 26], dtype=’int64’)}
g_gz = g.get_group('GZ')
g_gz.head()
| date | city | temperature | wind |
---|
0 | 2017/1/1 | GZ | 12 | 3 |
---|
3 | 2017/1/22 | GZ | 6 | 1 |
---|
6 | 2017/2/12 | GZ | 12 | 5 |
---|
9 | 2017/3/5 | GZ | 6 | 9 |
---|
12 | 2017/3/26 | GZ | 12 | 6 |
---|
g_gz.mean()
temperature 9.0
wind 4.5
dtype: float64
g.mean()
| temperature | wind |
---|
city | | |
---|
BJ | 7.500000 | 4.5 |
---|
GZ | 9.000000 | 4.5 |
---|
SH | -1.333333 | 4.0 |
---|
十二、数据聚合技术Aggregation
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('weather.csv')
g = df.groupby('city')
聚合
mean() count() max() 等都是聚合
g.agg('min')
| date | temperature | wind |
---|
city | | | |
---|
BJ | 2017/1/29 | 2 | 1 |
---|
GZ | 2017/1/1 | 6 | 1 |
---|
SH | 2017/1/15 | -4 | 1 |
---|
自定义函数 来聚合
def foo(attr):
return attr.max() - attr.min()
g.agg(foo)
| temperature | wind |
---|
city | | |
---|
BJ | 11 | 8 |
---|
GZ | 6 | 8 |
---|
SH | 6 | 6 |
---|
多columns分组
g_new = df.groupby(['city','wind'])
g_new.groups
{(‘BJ’, 1): Int64Index([13], dtype=’int64’),
(‘BJ’, 2): Int64Index([25], dtype=’int64’),
(‘BJ’, 3): Int64Index([4, 10], dtype=’int64’),
(‘BJ’, 4): Int64Index([7], dtype=’int64’),
……}
g_new.get_group(('BJ',3))
| date | city | temperature | wind |
---|
4 | 2017/1/29 | BJ | 13 | 3 |
---|
10 | 2017/3/12 | BJ | 13 | 3 |
---|
对多 columns 遍历需要注意
for (name_1,name_2),group in g_new:
print(name_1,name_2)
print(group)
BJ 1
date city temperature wind
13 2017/4/2 BJ 2 1
BJ 2
date city temperature wind
25 2017/6/25 BJ 2 2
…..
十三、透视表
透视表概念来自Excel,为了更好展示数据,对原来的行列进行了一些变化,根据这些变化,相应的数据也变化
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
无数据源 不演示了
df = pd.read_excel()
pd.pivot_table(df,index=[''],...)