频数分布
2.1汇总定性数据
柱状图
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%pylab
# Load the soft-drink purchase data; the "Brand Purchased" column is the
# qualitative variable summarized below.
df = pd.read_csv("E:\\data\\SoftDrink.csv")
df
df.describe()

# Frequency distribution: number of rows per brand.
grouped_df = df.groupby("Brand Purchased")
group2 = grouped_df.size()
# The original assigned this result to `group1` but every later statement reads
# `group2`; keep `group1` as an alias so both names are defined.
group1 = group2
group2.index
group2.values

# Bar chart: one bar per brand, bar height = frequency.
plt.bar(group2.index, group2.values, color=['b', 'r', 'y', 'c', 'g'])
plt.title('frequency of soft drink')
plt.xlabel("soft drink")
plt.ylabel("frequency")
# Plain-Python equivalent of the IPython-only `?plt.bar` help lookup.
help(plt.bar)
plt.ylim(ymax=25)  # set the upper limit of the y axis
# Write each bar's frequency just above the bar.
for a, b in zip(group2.index, group2.values):
    plt.text(a, b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=10)
饼状图
# Pie chart of the brand frequencies held in `group2`: one wedge per brand,
# labelled with the brand name and its percentage share.
plt.pie(
    group2.values,
    labels=group2.index,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90,
)
# Equal axis scaling keeps the pie circular.
plt.axis("equal")
plt.title("pie", loc='right')
2.2汇总定量数据
为定义定量数据中组的概念,需要下列三个步骤
1.确定互不重叠的分组数目
2.确定分组区间的宽度(近似值):(最大值-最小值)/组数
3.确定组限
# Load the audit data set; `df` is rebound and now refers to this table.
df = pd.read_csv("E:\\data\\Audit.csv")
In [124]:
# Show the raw 'Audit Time' observations as a NumPy array.
df['Audit Time'].values
Out[124]:
array([12, 15, 20, 22, 14, 14, 15, 27, 21, 18, 19, 18, 22, 33, 16, 18, 17,
23, 28, 13], dtype=int64)
In [117]:
# Summary statistics (count, mean, std, min, quartiles, max) of the audit data.
df.describe()
Out[117]:
Audit Time
count 20.000000
mean 19.250000
std 5.437443
min 12.000000
25% 15.000000
50% 18.000000
75% 22.000000
max 33.000000
In [126]:
# Class limits for the audit-time frequency distribution (width 5).
bins = [10, 15, 20, 25, 30, 35]
# Assign every audit time to its class; with the default settings pd.cut
# produces right-closed intervals such as (10, 15].
# NOTE(review): plt.hist uses left-closed bins, so its counts for the same
# `bins` differ from these classes; pass right=False here if the two summaries
# are meant to agree - confirm the intended class definition.
AT = pd.cut(df['Audit Time'], bins)
AT
Out[126]:
0 (10, 15]
1 (10, 15]
2 (15, 20]
3 (20, 25]
4 (10, 15]
5 (10, 15]
6 (10, 15]
7 (25, 30]
8 (20, 25]
9 (15, 20]
10 (15, 20]
11 (15, 20]
12 (20, 25]
13 (30, 35]
14 (15, 20]
15 (15, 20]
16 (15, 20]
17 (20, 25]
18 (25, 30]
19 (10, 15]
Name: Audit Time, dtype: category
Categories (5, interval[int64]): [(10, 15] < (15, 20] < (20, 25] < (25, 30] < (30, 35]]
In [130]:
# Frequency of each audit-time class. The Series method is used because the
# top-level pd.value_counts function is deprecated since pandas 2.1.
freqDisofAT = AT.value_counts()
In [135]:
# Order the frequency table by class interval instead of by count.
freqDisofAT = freqDisofAT.sort_index()
In [151]:
# Display the audit-time frequency distribution.
freqDisofAT
Out[151]:
(10, 15] 6
(15, 20] 7
(20, 25] 4
(25, 30] 2
(30, 35] 1
Name: Audit Time, dtype: int64
plt.style.use('ggplot')  # use the ggplot style sheet
# Histogram of audit time over the class limits in `bins`, with black bar
# edges. The call is made once: a second identical call would draw another
# histogram on top of the first.
plt.hist(df['Audit Time'].values, bins, edgecolor='k')
Out[65]:
(array([ 4., 8., 5., 2., 1.]),
array([10, 15, 20, 25, 30, 35]),
<a list of 5 Patch objects>)
In [12]:
import matplotlib
In [18]:
plt.ylabel ("频数",fontproperties = 'SimHei',fontsize = 14) # SimHei font so the Chinese label text is displayed
Out[18]:
Text(55.8472,0.5,'频数')
In [19]:
# X-axis label; the SimHei font is requested so the Chinese text is displayed.
# (A stray ")" that preceded this statement was a syntax error and is removed.)
plt.xlabel("审计时间(天)", fontproperties='SimHei', fontsize=14)
Out[19]:
Text(0.5,23.1922,'审计时间(天)')
# Cumulative distribution of audit time drawn as a step curve.
# `density=True` replaces the `normed` keyword, which was removed in
# Matplotlib 3.1; combined with cumulative=True the curve is normalized so
# that the last bin equals 1.
plt.hist(df['Audit Time'].values, bins, density=True, histtype='step', cumulative=True)
2.3 茎叶图
# The typographic quotes around the column name were a syntax error; plain
# ASCII quotes are used instead.
# NOTE(review): no "Correct" column appears in the audit data shown earlier in
# this file - presumably another data set should be loaded into `df` first.
# NOTE(review): plt.stem draws a stem plot (markers on vertical lines), not a
# stem-and-leaf display - confirm this is the intended figure.
plt.stem(df.sort_values("Correct"))
2.4 交叉分组列表和散点图
# Load the restaurant data set; `df` is rebound and now refers to this table.
df = pd.read_csv("E:\\data\\Restaurant.csv")
In [18]:
# Summary statistics of the numeric restaurant columns.
# (A stray, undefined name `c` that preceded this call is removed.)
df.describe()
Out[18]:
Restaurant Meal Price ($)
count 300.000000 300.000000
mean 150.500000 25.896667
std 86.746758 9.287636
min 1.000000 10.000000
25% 75.750000 19.000000
50% 150.500000 25.000000
75% 225.250000 32.000000
max 300.000000 48.000000
In [19]:
# Display the restaurant table (the statement had been duplicated).
df
Out[19]:
Restaurant Quality Rating Meal Price ($)
0 1 Good 18
1 2 Very Good 22
2 3 Good 28
3 4 Excellent 38
4 5 Very Good 33
5 6 Good 28
6 7 Very Good 19
7 8 Very Good 11
8 9 Very Good 23
9 10 Good 13
10 11 Very Good 33
11 12 Very Good 44
12 13 Excellent 42
13 14 Excellent 34
14 15 Good 25
15 16 Good 22
16 17 Good 26
17 18 Excellent 17
18 19 Very Good 30
19 20 Good 19
20 21 Very Good 33
21 22 Very Good 22
22 23 Excellent 32
23 24 Excellent 33
24 25 Very Good 34
25 26 Very Good 38
26 27 Good 27
27 28 Good 27
28 29 Very Good 26
29 30 Very Good 34
... ... ... ...
270 271 Excellent 48
271 272 Very Good 33
272 273 Very Good 25
273 274 Very Good 34
274 275 Very Good 20
275 276 Excellent 36
276 277 Excellent 40
277 278 Good 13
278 279 Very Good 12
279 280 Very Good 27
280 281 Very Good 20
281 282 Excellent 30
282 283 Good 29
283 284 Very Good 22
284 285 Good 27
285 286 Very Good 20
286 287 Excellent 37
287 288 Very Good 27
288 289 Good 23
289 290 Good 16
290 291 Very Good 23
291 292 Very Good 24
292 293 Excellent 45
293 294 Good 14
294 295 Good 18
295 296 Good 17
296 297 Good 16
297 298 Good 15
298 299 Very Good 38
299 300 Very Good 31
300 rows × 3 columns
In [20]:
# Class limits for meal price: four classes of width 10 covering 10 to 50.
# (A stray fragment "30,40,50" that preceded this assignment is removed.)
bins = (10, 20, 30, 40, 50)
In [31]:
# Assign each meal price to its class; right=False makes the intervals
# left-closed, e.g. [10, 20).
cut_pd = pd.cut(x=df['Meal Price ($)'], bins=bins, right=False)
In [32]:
# Display the price class of every restaurant (the statement had been duplicated).
cut_pd
Out[32]:
0 [10, 20)
1 [20, 30)
2 [20, 30)
3 [30, 40)
4 [30, 40)
5 [20, 30)
6 [10, 20)
7 [10, 20)
8 [20, 30)
9 [10, 20)
10 [30, 40)
11 [40, 50)
12 [40, 50)
13 [30, 40)
14 [20, 30)
15 [20, 30)
16 [20, 30)
17 [10, 20)
18 [30, 40)
19 [10, 20)
20 [30, 40)
21 [20, 30)
22 [30, 40)
23 [30, 40)
24 [30, 40)
25 [30, 40)
26 [20, 30)
27 [20, 30)
28 [20, 30)
29 [30, 40)
...
270 [40, 50)
271 [30, 40)
272 [20, 30)
273 [30, 40)
274 [20, 30)
275 [30, 40)
276 [40, 50)
277 [10, 20)
278 [10, 20)
279 [20, 30)
280 [20, 30)
281 [30, 40)
282 [20, 30)
283 [20, 30)
284 [20, 30)
285 [20, 30)
286 [30, 40)
287 [20, 30)
288 [20, 30)
289 [10, 20)
290 [20, 30)
291 [20, 30)
292 [40, 50)
293 [10, 20)
294 [10, 20)
295 [10, 20)
296 [10, 20)
297 [10, 20)
298 [30, 40)
299 [30, 40)
Name: Meal Price ($), Length: 300, dtype: category
Categories (4, interval[int64]): [[10, 20) < [20, 30) < [30, 40) < [40, 50)]
In [109]:
# Cross-tabulation: quality rating (rows) by meal-price class (columns);
# margins=True appends the "All" row and column totals.
cross = pd.crosstab(
    index=df["Quality Rating"],
    columns=cut_pd,
    margins=True,
)
In [110]:
# Display the cross-tabulation (the statement had been duplicated).
cross
Out[110]:
Meal Price ($) [10, 20) [20, 30) [30, 40) [40, 50) All
Quality Rating
Excellent 2 14 28 22 66
Good 42 40 2 0 84
Very Good 34 64 46 6 150
All 78 118 76 28 300
In [112]:
# Show the rows in rating order (Good < Very Good < Excellent) instead of
# alphabetical order, keeping the "All" totals row last.
cross.reindex(index=['Good', 'Very Good', 'Excellent', 'All'])
Out[112]:
Meal Price ($) [10, 20) [20, 30) [30, 40) [40, 50) All
Quality Rating
Good 42 40 2 0 84
Very Good 34 64 46 6 150
Excellent 2 14 28 22 66
All 78 118 76 28 300
# Same cross-tabulation expressed as relative frequencies: normalize=True
# divides every cell by the grand total.
pd.crosstab(
    index=df["Quality Rating"],
    columns=cut_pd,
    margins=True,
    normalize=True,
)
###
Meal Price ($) [10, 20) [20, 30) [30, 40) [40, 50) All
Quality Rating
Excellent 0.006667 0.046667 0.093333 0.073333 0.22
Good 0.140000 0.133333 0.006667 0.000000 0.28
Very Good 0.113333 0.213333 0.153333 0.020000 0.50
All 0.260000 0.393333 0.253333 0.093333 1.00
散点图
# Load the scatter-diagram data set into its own frame and display it.
df2 = pd.read_csv("E:\\data\\Scatter.csv")
df2
# Observation x y
0 1 -22 22
1 2 -33 49
2 3 2 8
3 4 29 -16
4 5 -13 10
5 6 21 -28
6 7 -13 27
7 8 -23 35
8 9 14 -5
9 10 3 -3
10 11 -37 48
11 12 34 -29
12 13 9 -18
13 14 -33 31
14 15 20 -16
15 16 -3 14
16 17 -15 18
17 18 12 17
18 19 -20 -11
19 20 -7 -22
# Scatter diagram of the paired observations: column "x" against column "y".
plt.scatter(x=df2["x"], y=df2["y"])