1. sklearn包
1.1 LabelEncoder(标签编码)
from sklearn import preprocessing
# Build the encoder and learn its label vocabulary from Col1 in one step
# (fit() returns the encoder itself).
le = preprocessing.LabelEncoder().fit(df['Col1'])
# Replace Col3 with its integer codes.
# NOTE(review): the encoder is fitted on Col1 but applied to Col3; transform()
# raises ValueError for any Col3 value that never appeared in Col1 -- confirm
# that both columns are meant to share one vocabulary.
df['Col3'] = le.transform(df['Col3'])
再来一个示例
###
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# Minimal LabelEncoder demo on a small list of city names.
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])
# classes_ holds the distinct labels; tolist() converts the NumPy strings to
# plain Python str so the printed list matches the comment below on every
# NumPy version.
print(le.classes_.tolist())
# ['amsterdam', 'paris', 'tokyo']
# transform() maps each label to its position in classes_.
print(le.transform(["tokyo", "tokyo", "paris"]))
# [2 2 1]
这里结合读取 Excel 文件,对字符串类型的列进行标签编码(LabelEncoder)。
import numpy as np
import pandas as pd
import xlrd
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
#### obtain cols of XX type
def obtain_x(train_df, xtype):
    """Return the names of the columns of *train_df* whose dtype equals *xtype*.

    Args:
        train_df: DataFrame whose column dtypes are inspected.
        xtype: dtype to match, e.g. ``'object'`` or ``'float64'``.

    Returns:
        Array of matching column names, in their original column order.

    Side effects:
        Prints the intermediate column/dtype table (before its columns are
        renamed) to stdout.
    """
    # One row per column of train_df: its name and its dtype.
    dtype_df = train_df.dtypes.reset_index()
    print('dtype_df\n', dtype_df)
    dtype_df.columns = ['col', 'type']
    # Bracket/.loc access instead of attribute access, so the column names
    # can never be confused with DataFrame attributes.
    return dtype_df.loc[dtype_df['type'] == xtype, 'col'].values
# Load the sample sheet and label-encode its first string column.
train_df = pd.read_excel(r'G:\test_onehot.xlsx')

# Names of the columns whose dtype is 'object' (the string columns).
str_col = obtain_x(train_df, 'object')
print('str_col\n', str_col)
str_col_list = str_col.tolist()
print('str_list\n', str_col_list)
print('train_df[str_col_list]\n', train_df[str_col_list])

# Label encoding.
le = preprocessing.LabelEncoder()
# Collect the first two string columns. The indexes 0 and 1 are hard-coded, so
# the sheet must contain at least two string columns.
# Named `str_columns` rather than `list` so the builtin is not shadowed; the
# printed labels are kept as they were.
str_columns = [train_df[str_col_list[0]], train_df[str_col_list[1]]]
print('list\n', str_columns[1][0])
# Only the first string column is fitted and encoded.
le.fit(str_columns[0])
print('le.transform(list[0])\n', le.transform(str_columns[0]))
2. 使用pandas包处理
2.1 独热编码
import pandas as pd
# One-hot encode every string column of the sample sheet with pandas.
train_df = pd.read_excel(r'G:\test_onehot.xlsx')

# Names of the columns whose dtype is 'object' (the string columns).
str_col = obtain_x(train_df, 'object')

# Indicator (dummy) columns built from the string columns only.
train_df_dummy = pd.get_dummies(train_df[str_col])

# Swap the original string columns for their indicator columns.
train_df = train_df.drop(str_col, axis=1).join(train_df_dummy)
print('train_df\n', train_df)
参考: