import numpy as np
from sklearn import preprocessing
# Standardization: remove each feature's mean and scale by its variance.
data = np.array([
    [1.0, 100.0, -7.0],
    [2.0, -50.0, 10.0],
    [0.0, 200.0, 200.0],
])
model = preprocessing.StandardScaler()
# Fit on the data, then transform that same data.
res = model.fit(data).transform(data)
res
将特征缩放到指定范围(默认是 0-1;下面的例子通过 feature_range 指定为 -1 到 1)
import numpy as np
from sklearn import preprocessing
# Min-max scaling of every feature into the requested range, here [-1, 1].
data = np.array([
    [1.0, 100.0, -7.0],
    [2.0, -50.0, 10.0],
    [0.0, 200.0, 200.0],
])
model = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# Fit on the data, then transform that same data.
res = model.fit(data).transform(data)
res
import numpy as np
from sklearn import preprocessing
data = np.array([
    [1.0, 100.0, -7.0],
    [2.0, -50.0, 10.0],
    [0.0, 200.0, 200.0],
])

# Method 1: the functional API ('l2' is the default norm).
res = preprocessing.normalize(data)
res

# Method 2: the transformer class, with the L2 norm spelled out.
# transform() is called directly here, without a preceding fit().
model = preprocessing.Normalizer(norm='l2')
res = model.transform(data)
res
方向相同(各分量成比例)的样本在归一化后会得到相同的单位向量,因此归一化结果可用于比较样本之间的相似度(例如余弦相似度)
3.二值化(可以指定阈值)
import numpy as np
from sklearn import preprocessing
# Binarization with an explicit threshold.
data = np.array([
    [1.0, 100.0, -7.0],
    [2.0, -50.0, 10.0],
    [0.0, 200.0, 200.0],
])
# Only values strictly greater than 10 map to 1; everything else maps to 0.
model = preprocessing.Binarizer(threshold=10)
# transform() is called directly here, without a preceding fit().
res = model.transform(data)
res
import numpy as np
from sklearn import preprocessing
# One-hot encoding of integer categories: every column is encoded separately,
# using the distinct values seen in that column during fit.
data = np.array([
    [0, 0, 3],
    [1, 1, 0],
    [0, 2, 1],
    [1, 0, 2],
])
model = preprocessing.OneHotEncoder().fit(data)
# Encode one new sample; the result is sparse, so densify it for display.
res = model.transform(np.array([[1, 2, 3]]))
res.toarray()
OneHotEncoder 也可以用于自定义类别(例如字符串)的 one-hot 编码
import numpy as np
from sklearn import preprocessing
# One-hot encoding of string categories: a single feature with three labels.
c = [
    ['one'],
    ['two'],
    ['three'],
]
model = preprocessing.OneHotEncoder()
res9_1 = model.fit(c)  # keeps whatever fit() returns
# Encode two of the known labels and densify the sparse result for display.
res9_2 = model.transform([['one'], ['two']])
res9_2.toarray()
5.缺失值插补
# Missing-value imputation.
# NOTE: the original first line fused the comment with `import numpy as np`,
# which silently commented the import out; it is restored here.
import numpy as np
from sklearn import preprocessing
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` (available since 0.20) is its replacement.
from sklearn.impute import SimpleImputer

# SimpleImputer always imputes column by column, which is what the old
# `axis=0` argument selected. Missing entries are marked with np.nan
# rather than the string 'NaN'.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# The NaN in the first column is replaced by that column's mean, (1 + 7) / 2 = 4.
imp.fit_transform([[1, 2, 5], [np.nan, 3, 7], [7, 6, 10]])