import numpy as np
import pandas as pd
cat对象
cat对象的属性
pandas 提供 category 类型来处理分类变量;使用 astype('category') 方法可以将普通序列转换为分类变量。
# Load only the five columns used in this walkthrough, then preview the frame.
df = pd.read_csv(
    'data/learn_pandas.csv',
    usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight'],
)
df.head()
| | Grade | Name | Gender | Height | Weight |
|---|---|---|---|---|---|
| 0 | Freshman | Gaopeng Yang | Female | 158.9 | 46.0 |
| 1 | Freshman | Changqiang You | Male | 166.5 | 70.0 |
| 2 | Senior | Mei Sun | Male | 188.9 | 89.0 |
| 3 | Sophomore | Xiaojuan Sun | Female | NaN | 41.0 |
| 4 | Sophomore | Gaojuan You | Male | 174.0 | 74.0 |
# Cast the Grade column to the category dtype and preview the result.
s = df['Grade'].astype('category')
s.head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']
# Component one: the categories themselves, stored as an Index
s.cat.categories
Index(['Freshman', 'Junior', 'Senior', 'Sophomore'], dtype='object')
# Component two: whether the categories are ordered
s.cat.ordered
False
# Each category is assigned a unique integer code; the code is determined
# by the category's position in cat.categories.
# These codes are accessed through the codes attribute.
s.cat.codes.head()
0 0
1 0
2 2
3 3
4 3
dtype: int8
类别的增加、删除和修改
# Add a category
s = s.cat.add_categories('Graduate')
s.cat.categories
Index(['Freshman', 'Junior', 'Senior', 'Sophomore', 'Graduate'], dtype='object')
# Remove a category
s = s.cat.remove_categories('Freshman')
s.cat.categories
Index(['Junior', 'Senior', 'Sophomore', 'Graduate'], dtype='object')
# Set the series' categories directly; any element whose original category
# is not among the new categories becomes missing (NaN).
s = s.cat.set_categories(['Sophomore','PhD'])
s.cat.categories
s.head()
0 NaN
1 NaN
2 NaN
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (2, object): ['Sophomore', 'PhD']
# Drop categories that do not occur in the series
s = s.cat.remove_unused_categories() # the unused 'PhD' category is removed
s.cat.categories
Index(['Sophomore'], dtype='object')
# Rename a category by mapping its old label to the new one.
s = s.cat.rename_categories({'Sophomore': '本科二年级学生'})
s.head()
有序分类
序的建立
s = df.Grade.astype('category')
s.head
<bound method NDFrame.head of 0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
...
195 Junior
196 Senior
197 Senior
198 Senior
199 Sophomore
Name: Grade, Length: 200, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']>
# Reorder the categories and mark them as ordered, giving
# Freshman < Sophomore < Junior < Senior.
s = s.cat.reorder_categories(['Freshman', 'Sophomore', 'Junior', 'Senior'],ordered=True)
s.head()
0 Freshman
1 Freshman
2 Senior
3 Sophomore
4 Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman' < 'Sophomore' < 'Junior' < 'Senior']
# as_unordered is a method: call it to get the series with the category
# ordering removed (a bare attribute reference does nothing).
s.cat.as_unordered()

最低0.47元/天 解锁文章
771

被折叠的 条评论
为什么被折叠?



