第九章 分类数据
import numpy as np
import pandas as pd
cat对象
cat对象的属性
`category` 类型用于处理分类类型变量;将普通序列转化成分类变量可使用 `astype` 方法。
df = pd.read_csv('data/learn_pandas.csv',usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight'])
df.head()
| Grade | Name | Gender | Height | Weight |
---|
0 | Freshman | Gaopeng Yang | Female | 158.9 | 46.0 |
---|
1 | Freshman | Changqiang You | Male | 166.5 | 70.0 |
---|
2 | Senior | Mei Sun | Male | 188.9 | 89.0 |
---|
3 | Sophomore | Xiaojuan Sun | Female | NaN | 41.0 |
---|
4 | Sophomore | Gaojuan You | Male | 174.0 | 74.0 |
---|
s = df.Grade.astype('category')
s.head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']
s.cat.categories
Index(['Freshman', 'Junior', 'Senior', 'Sophomore'], dtype='object')
s.cat.ordered
False
s.cat.codes.head()
0 0
1 0
2 2
3 3
4 3
dtype: int8
类别的增加、删除和修改
s = s.cat.add_categories('Graduate')
s.cat.categories
Index(['Freshman', 'Junior', 'Senior', 'Sophomore', 'Graduate'], dtype='object')
s = s.cat.remove_categories('Freshman')
s.cat.categories
Index(['Junior', 'Senior', 'Sophomore', 'Graduate'], dtype='object')
s = s.cat.set_categories(['Sophomore','PhD'])
s.cat.categories
s.head()
0 NaN
1 NaN
2 NaN
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (2, object): ['Sophomore', 'PhD']
s = s.cat.remove_unused_categories()
s.cat.categories
Index(['Sophomore'], dtype='object')
s = s.cat.rename_categories({'Sophomore':'本科二年级学生'})
s.head()
有序分类
序的建立
s = df.Grade.astype('category')
s.head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']
s = s.cat.reorder_categories(['Freshman', 'Sophomore', 'Junior', 'Senior'],ordered=True)
s.head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman' < 'Sophomore' < 'Junior' < 'Senior']
s.cat.as_unordered().head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Sophomore', 'Junior', 'Senior']
排序和比较
df.Grade = df.Grade.astype('category')
df.Grade = df.Grade.cat.reorder_categories(['Freshman', 'Sophomore', 'Junior', 'Senior'],ordered=True)
df.sort_values('Grade').head()
| Grade | Name | Gender | Height | Weight |
---|
0 | Freshman | Gaopeng Yang | Female | 158.9 | 46.0 |
---|
105 | Freshman | Qiang Shi | Female | 164.5 | 52.0 |
---|
96 | Freshman | Changmei Feng | Female | 163.8 | 56.0 |
---|
88 | Freshman | Xiaopeng Han | Female | 164.1 | 53.0 |
---|
81 | Freshman | Yanli Zhang | Female | 165.1 | 52.0 |
---|
df.set_index('Grade').sort_index().head()
| Name | Gender | Height | Weight |
---|
Grade | | | | |
---|
Freshman | Gaopeng Yang | Female | 158.9 | 46.0 |
---|
Freshman | Qiang Shi | Female | 164.5 | 52.0 |
---|
Freshman | Changmei Feng | Female | 163.8 | 56.0 |
---|
Freshman | Xiaopeng Han | Female | 164.1 | 53.0 |
---|
Freshman | Yanli Zhang | Female | 165.1 | 52.0 |
---|
df.head()
| Grade | Name | Gender | Height | Weight |
---|
0 | Freshman | Gaopeng Yang | Female | 158.9 | 46.0 |
---|
1 | Freshman | Changqiang You | Male | 166.5 | 70.0 |
---|
2 | Senior | Mei Sun | Male | 188.9 | 89.0 |
---|
3 | Sophomore | Xiaojuan Sun | Female | NaN | 41.0 |
---|
4 | Sophomore | Gaojuan You | Male | 174.0 | 74.0 |
---|
res1 = df.Grade == 'Sophomore'
res1
0 False
1 False
2 False
3 True
4 True
...
195 False
196 False
197 False
198 False
199 True
Name: Grade, Length: 200, dtype: bool
res2 = df.Grade == ['PhD']*df.shape[0]
res2.head()
0 False
1 False
2 False
3 False
4 False
Name: Grade, dtype: bool
res3 = df.Grade <= 'Sophomore'
res3.head()
0 True
1 True
2 False
3 True
4 True
Name: Grade, dtype: bool
df_demo = df.Grade.sample(frac=1).reset_index(drop=True)
df_demo.head()
0 Freshman
1 Sophomore
2 Sophomore
3 Junior
4 Senior
Name: Grade, dtype: category
Categories (4, object): ['Freshman' < 'Sophomore' < 'Junior' < 'Senior']
res4 = df.Grade <= df_demo
res4.head()
0 True
1 True
2 False
3 True
4 True
Name: Grade, dtype: bool
区间类别
利用cut和qcut进行区间构造
区间是特殊的类别,通过 `cut`/`qcut` 可以实现将原序列的数值特征装箱,即用区间位置代替原来的数值位置。
- 区间默认左开右闭;为了把最小值包含进去,最小区间的左端点会被调整为 min - 0.001*(max-min)
- 需要左闭右开时,需指定 `right=False`,此时最大区间的右端点会被调整为 max + 0.001*(max-min)
s = pd.Series([1,2])
pd.cut(s, bins=2)
0 (0.999, 1.5]
1 (1.5, 2.0]
dtype: category
Categories (2, interval[float64]): [(0.999, 1.5] < (1.5, 2.0]]
pd.cut(s, bins=2, right=False)
0 [1.0, 1.5)
1 [1.5, 2.001)
dtype: category
Categories (2, interval[float64]): [[1.0, 1.5) < [1.5, 2.001)]
pd.cut(s, bins=[-np.inf, 1.2, 1.8, 2.2, np.inf])
0 (-inf, 1.2]
1 (1.8, 2.2]
dtype: category
Categories (4, interval[float64]): [(-inf, 1.2] < (1.2, 1.8] < (1.8, 2.2] < (2.2, inf]]
s = pd.Series([1,2])
res = pd.cut(s, bins=2, labels=['small', 'big'], retbins=True)
res
(0 small
1 big
dtype: category
Categories (2, object): ['small' < 'big'],
array([0.999, 1.5 , 2. ]))
s = df.Weight
pd.qcut(s, q=3).head()
0 (33.999, 48.0]
1 (55.0, 89.0]
2 (55.0, 89.0]
3 (33.999, 48.0]
4 (55.0, 89.0]
Name: Weight, dtype: category
Categories (3, interval[float64]): [(33.999, 48.0] < (48.0, 55.0] < (55.0, 89.0]]
pd.qcut(s, q=[0,0.2,0.8,1]).head()
0 (44.0, 69.4]
1 (69.4, 89.0]
2 (69.4, 89.0]
3 (33.999, 44.0]
4 (69.4, 89.0]
Name: Weight, dtype: category
Categories (3, interval[float64]): [(33.999, 44.0] < (44.0, 69.4] < (69.4, 89.0]]
一般区间的构造
my_interval = pd.Interval(0,1,'right')
1 in my_interval
0 in my_interval
False
my_interval_2 = pd.Interval(0.5, 1.5, 'left')
my_interval_2.overlaps(my_interval)
True
pd.IntervalIndex.from_breaks([1,3,6,10], closed='both')
IntervalIndex([[1, 3], [3, 6], [6, 10]],
closed='both',
dtype='interval[int64]')
pd.IntervalIndex.from_arrays(left = [1,3,6,10], right = [5,4,9,11], closed = 'neither')
IntervalIndex([(1, 5), (3, 4), (6, 9), (10, 11)],
closed='neither',
dtype='interval[int64]')
pd.IntervalIndex.from_tuples([(1,5),(3,4),(6,9),(10,11)],closed='neither')
IntervalIndex([(1, 5), (3, 4), (6, 9), (10, 11)],
closed='neither',
dtype='interval[int64]')
pd.interval_range(start=1, end=5, periods=8)
IntervalIndex([(1.0, 1.5], (1.5, 2.0], (2.0, 2.5], (2.5, 3.0], (3.0, 3.5], (3.5, 4.0], (4.0, 4.5], (4.5, 5.0]],
closed='right',
dtype='interval[float64]')
pd.interval_range(end=5,periods=8,freq=0.5)
IntervalIndex([(1.0, 1.5], (1.5, 2.0], (2.0, 2.5], (2.5, 3.0], (3.0, 3.5], (3.5, 4.0], (4.0, 4.5], (4.5, 5.0]],
closed='right',
dtype='interval[float64]')
区间的属性与方法
id_interval = pd.IntervalIndex(pd.cut(s, 3))
id_demo = id_interval[:5]
id_demo
IntervalIndex([(33.945, 52.333], (52.333, 70.667], (70.667, 89.0], (33.945, 52.333], (70.667, 89.0]],
closed='right',
name='Weight',
dtype='interval[float64]')
id_demo.left
Float64Index([33.945, 52.333, 70.667, 33.945, 70.667], dtype='float64')
id_demo.right
Float64Index([52.333, 70.667, 89.0, 52.333, 89.0], dtype='float64')
id_demo.mid
Float64Index([43.138999999999996, 61.5, 79.8335, 43.138999999999996, 79.8335], dtype='float64')
id_demo.length
Float64Index([18.387999999999998, 18.334000000000003, 18.333,
18.387999999999998, 18.333],
dtype='float64')
id_demo.contains(4)
array([False, False, False, False, False])
id_demo.contains(34)
array([ True, False, False, True, False])
id_demo.overlaps(pd.Interval(40,60))
array([ True, True, False, True, False])
练一练
Ex1:统计未出现的类别
Ex2:钻石数据集
df = pd.read_csv('data/diamonds.csv')
df.head()
| carat | cut | clarity | price |
---|
0 | 0.23 | Ideal | SI2 | 326 |
---|
1 | 0.21 | Premium | SI1 | 326 |
---|
2 | 0.23 | Good | VS1 | 327 |
---|
3 | 0.29 | Premium | VS2 | 334 |
---|
4 | 0.31 | Good | SI2 | 335 |
---|
%timeit -n 30 df.cut.nunique()
%timeit -n 30 df.cut.astype('category').nunique()
object_demo = df.cut
category_demo = df.cut.astype('category')
%timeit -n 30 object_demo.nunique()
%timeit -n 30 category_demo.nunique()
3.17 ms ± 294 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)
5.39 ms ± 610 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)
3.82 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 30 loops each)
1.09 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)
df.cut = df.cut.astype('category').cat.reorder_categories(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], ordered=True)
df.clarity = df.clarity.astype('category').cat.reorder_categories(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],ordered=True)
res = df.sort_values(['cut','clarity'], ascending=[False, True])
res
| carat | cut | clarity | price |
---|
315 | 0.96 | Ideal | I1 | 2801 |
---|
535 | 0.96 | Ideal | I1 | 2826 |
---|
551 | 0.97 | Ideal | I1 | 2830 |
---|
653 | 1.01 | Ideal | I1 | 2844 |
---|
718 | 0.97 | Ideal | I1 | 2856 |
---|
... | ... | ... | ... | ... |
---|
41242 | 0.30 | Fair | IF | 1208 |
---|
43778 | 0.37 | Fair | IF | 1440 |
---|
47407 | 0.52 | Fair | IF | 1849 |
---|
49683 | 0.52 | Fair | IF | 2144 |
---|
50126 | 0.47 | Fair | IF | 2211 |
---|
53940 rows × 4 columns
df.cut = df.cut.cat.reorder_categories(df.cut.cat.categories[::-1])
df.clarity = df.clarity.cat.reorder_categories(df.clarity.cat.categories[::-1])
df.cut = df.cut.cat.codes
clarity_cat = df.clarity.cat.categories
df.clarity = df.clarity.replace(dict(zip(clarity_cat, np.arange(len(clarity_cat)))))
df.head(3)
| carat | cut | clarity | price |
---|
0 | 0.23 | 0 | 6 | 326 |
---|
1 | 0.21 | 1 | 5 | 326 |
---|
2 | 0.23 | 3 | 3 | 327 |
---|