I had come across one-hot encoding before but never really looked into it. Now I happen to face a problem: I have a list of people's occupations that I need to vectorize. The simplest, crudest approach is to number the occupations in order and use that number as the feature value. This is actually quite unreasonable, because occupation is not a continuous variable, so assigning it consecutive numeric values is meaningless in itself. For example, suppose I have three occupations: driver, nutritionist, and coach. Encoding them with consecutive integers would give:
Driver: 1
Nutritionist: 2
Coach: 3
The obvious problem with this is that the pairwise distances between occupations become artificially unbalanced: the distance between driver and nutritionist, or between nutritionist and coach, is 1, but the distance between driver and coach is 2.
A widely used and effective way to deal with this is one-hot encoding, which, as the name suggests, is an encoding in which only one bit is active.
For the same example, the new encoding would be:
Driver: [0,0,1]
Nutritionist: [0,1,0]
Coach: [1,0,0]
Now the distance between any two occupations is the same.
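To make the distance claim concrete, here is a minimal sketch (the pair loop and the use of Euclidean distance are my own illustration, not part of the original post) that prints the pairwise distances under both encodings:

import numpy as np

# integer codes assign each occupation a single number
integer_codes = {'driver': np.array([1.0]),
                 'nutritionist': np.array([2.0]),
                 'coach': np.array([3.0])}
# one-hot codes give each occupation its own dimension
one_hot_codes = {'driver': np.array([0.0, 0.0, 1.0]),
                 'nutritionist': np.array([0.0, 1.0, 0.0]),
                 'coach': np.array([1.0, 0.0, 0.0])}

for name, codes in [('integer', integer_codes), ('one-hot', one_hot_codes)]:
    print(name + ' encoding:')
    for a in codes:
        for b in codes:
            if a < b:  # visit each unordered pair once
                dist = np.linalg.norm(codes[a] - codes[b])
                print('  {0} <-> {1}: {2:.3f}'.format(a, b, dist))

The integer encoding gives distances 1, 1 and 2, while every one-hot pair sits sqrt(2) away from the others. sklearn already ships a ready-made interface for this; the script below is a simple example of calling it: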
#!/usr/bin/env python
# encoding: utf-8
'''
__Author__: 沂水寒城
Purpose: one-hot encoding demo
'''
from sklearn import preprocessing


def int_list(one_list):
    '''
    Convert every element of a list to int
    '''
    return [int(one) for one in one_list]


def one_hot_encoder_test(data='encode_data.txt'):
    '''
    One-hot encode integer features read from a comma-separated text file
    '''
    data_list = []
    with open(data) as f:
        content = f.readlines()
    for one in content:
        data_list.append(int_list(one.split(',')))
    print(data_list)
    # fit the encoder on all rows, then transform each row individually
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(data_list)
    print('encoder model is:')
    print(encoder)
    for one_list in data_list:
        one_array = encoder.transform([one_list]).toarray()
        print('{0}--------->{1}'.format(one_list, one_array))


if __name__ == '__main__':
    one_hot_encoder_test()
[[2, 4, 5, 0, 0, 1, 3, 7, 9], [0, 1, 0, 4, 5, 3, 5, 5, 6], [0, 0, 0, 2, 0, 4, 5, 1, 0], [5, 6, 0, 0, 1, 0, 0, 0, 0], [9, 3, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 4, 7, 8, 9, 1], [0, 3, 4, 1, 0, 3, 4, 1, 2], [0, 2, 3, 1, 4, 5, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1], [2, 1, 1, 1, 0, 0, 0, 0, 3], [0, 9, 9, 9, 1, 2, 5, 3, 1], [0, 6, 0, 3, 0, 5, 0, 2, 0], [0, 5, 6, 7, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 2, 2, 0, 0, 3, 3, 3]]
encoder model is:
OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
[2, 4, 5, 0, 0, 1, 3, 7, 9]--------->[[ 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1.]]
[0, 1, 0, 4, 5, 3, 5, 5, 6]--------->[[ 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 1. 0.]]
[0, 0, 0, 2, 0, 4, 5, 1, 0]--------->[[ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[5, 6, 0, 0, 1, 0, 0, 0, 0]--------->[[ 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[9, 3, 1, 0, 0, 0, 1, 0, 0]--------->[[ 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[0, 0, 0, 1, 4, 7, 8, 9, 1]--------->[[ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0.]]
[0, 3, 4, 1, 0, 3, 4, 1, 2]--------->[[ 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 0.]]
[0, 2, 3, 1, 4, 5, 1, 0, 0]--------->[[ 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[0, 0, 0, 0, 0, 0, 0, 1, 1]--------->[[ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 0. 0.]]
[2, 1, 1, 1, 0, 0, 0, 0, 3]--------->[[ 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0.]]
[0, 9, 9, 9, 1, 2, 5, 3, 1]--------->[[ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
0. 0. 0.]]
[0, 6, 0, 3, 0, 5, 0, 2, 0]--------->[[ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[0, 5, 6, 7, 1, 0, 0, 0, 0]--------->[[ 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0.]]
[0, 0, 0, 0, 0, 1, 1, 1, 1]--------->[[ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
0. 0. 0.]]
[0, 0, 2, 2, 0, 0, 3, 3, 3]--------->[[ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
1. 0. 0.]]
The sample data is simply the 15 comma-separated rows that appear in the data_list printed above; feel free to try it yourself.
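If you want to reproduce the run, encode_data.txt only needs one comma-separated row of integers per line. A minimal sketch for regenerating such a file from the rows printed above (the helper is my own addition, not part of the original script; only the first few rows are spelled out, the remaining rows from data_list can be appended the same way):

# write a few of the rows shown in data_list into encode_data.txt
rows = [
    [2, 4, 5, 0, 0, 1, 3, 7, 9],
    [0, 1, 0, 4, 5, 3, 5, 5, 6],
    [0, 0, 0, 2, 0, 4, 5, 1, 0],
    [5, 6, 0, 0, 1, 0, 0, 0, 0],
]
with open('encode_data.txt', 'w') as f:
    for row in rows:
        f.write(','.join(str(v) for v in row) + '\n')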
There are already plenty of good explanations of one-hot encoding online, so I won't repeat them here; this is just a simple hands-on exercise.
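One caveat: the encoder repr printed above (showing categorical_features and n_values) comes from an older scikit-learn release, and those constructor parameters have since been removed. With a recent scikit-learn, a rough equivalent, which can also encode the occupation strings from the beginning of the post directly, might look like the sketch below (the exact column layout of the output can differ from the run shown above):

from sklearn.preprocessing import OneHotEncoder

# each inner list is one sample with a single categorical feature
jobs = [['driver'], ['nutritionist'], ['coach']]

encoder = OneHotEncoder(handle_unknown='ignore')
encoded = encoder.fit_transform(jobs).toarray()

print(encoder.categories_)  # categories inferred for the occupation column
print(encoded)              # one row of 0/1 values per occupation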
