# Theory and code reference (Zhihu): https://www.zhihu.com/question/29316149/answer/110159647
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 18 09:53:43 2018
@author: Administrator
"""
'''
Feature Engineering
'''
import numpy as np
from sklearn.datasets import load_iris  # load the iris dataset
'''
Data preprocessing
'''
from sklearn.preprocessing import StandardScaler  # preprocessing: standardization
from sklearn.preprocessing import MinMaxScaler  # preprocessing: scaling to [0, 1]
from sklearn.preprocessing import Normalizer  # preprocessing: per-sample normalization
from sklearn.preprocessing import Binarizer  # preprocessing: binarization
from sklearn.preprocessing import OneHotEncoder  # preprocessing: one-hot (dummy) encoding
from sklearn.preprocessing import Imputer  # preprocessing: missing-value imputation (removed in sklearn >= 0.22; use SimpleImputer there)
from sklearn.preprocessing import PolynomialFeatures  # preprocessing: polynomial feature expansion
from sklearn.preprocessing import FunctionTransformer  # preprocessing: custom function transform
'''
Feature selection
'''
from sklearn.feature_selection import VarianceThreshold  # variance-threshold selection
from sklearn.feature_selection import SelectKBest  # univariate selection, e.g. by correlation with the labels
from scipy.stats import pearsonr  # Pearson correlation coefficient
from sklearn.feature_selection import chi2  # chi-squared test
from minepy import MINE  # mutual information (MIC)
from sklearn.feature_selection import RFE  # recursive feature elimination
from sklearn.linear_model import LogisticRegression  # base model for RFE / penalty-based selection
from sklearn.feature_selection import SelectFromModel  # model-based (penalty) feature selection
from sklearn.ensemble import GradientBoostingClassifier  # GBDT as the base model for selection
'''
Dimensionality reduction (the L1-penalty selection above also reduces dimensionality).
PCA projects the samples so that their variance (spread) is maximized; LDA projects them
so that class separability is maximized. PCA is unsupervised, while LDA is supervised.
'''
from sklearn.decomposition import PCA  # PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA  # linear discriminant analysis
# Note: `from sklearn.lda import LDA` fails on sklearn >= 0.19 (this script was
# written against 0.19.x), and sklearn.decomposition.LatentDirichletAllocation
# is a topic model, not linear discriminant analysis -- it is not a substitute.
'''
Data preprocessing
'''
iris = load_iris()
data = iris.data  # feature matrix
target = iris.target  # class labels
data1 = StandardScaler().fit_transform(data)  # standardization: zero mean, unit variance
data2 = MinMaxScaler().fit_transform(data)  # min-max scaling to [0, 1]
data3 = Normalizer().fit_transform(data)  # scale each sample to unit norm
data4 = Binarizer(threshold=3).fit_transform(data)  # binarize with threshold 3
data5 = OneHotEncoder().fit_transform(target.reshape(-1, 1))  # one-hot (dummy) encode the labels
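# data5 is a sparse matrix (OneHotEncoder's default output); densify it to
# inspect the encoding -- one column per class.
data5_dense = data5.toarray()
print(data5_dense.shape)  # (150, 3)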
# data = np.vstack((np.array([np.nan, np.nan, np.nan, np.nan]), data))  # prepend a sample of missing values
# data6 = Imputer().fit_transform(data)
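# A runnable sketch of the commented lines above, kept separate so `data` stays
# intact: prepend an all-NaN sample, then fill it with the per-column means.
data_with_nan = np.vstack((np.array([np.nan, np.nan, np.nan, np.nan]), data))
data6 = Imputer(strategy='mean').fit_transform(data_with_nan)
print(data6.shape)  # (151, 4): the NaN row is imputed, not dropped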
data7 = PolynomialFeatures(degree=2).fit_transform(data)  # degree-2 polynomial expansion
data8 = FunctionTransformer(np.log1p).fit_transform(data)  # univariate custom transform: log(1 + x)
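# Sanity check on data7: a degree-2 expansion of 4 features yields
# 1 bias + 4 linear + 10 quadratic terms = 15 columns.
print(data7.shape)  # (150, 15)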
'''
Feature selection
'''
data9 = VarianceThreshold(threshold=3).fit_transform(data)  # keep features whose variance exceeds the threshold
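# On iris only petal length has variance above 3 (about 3.1), so a single
# column survives; a quick check:
print(np.var(data, axis=0))  # per-feature variances
print(data9.shape)  # (150, 1)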
# Correlation method: score each feature by its Pearson correlation with the labels
data10 = SelectKBest(lambda X, Y: np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T[0], k=2).fit_transform(data, target)
data11 = SelectKBest(chi2, k=2).fit_transform(data, target)  # chi-squared test
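# Optional inspection (a sketch): the raw chi2 statistics behind the selection;
# larger values indicate stronger dependence between a feature and the labels.
scores, pvalues = chi2(data, target)
print(scores)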
def mic(x, y):
    # MINE is not written in the functional style SelectKBest expects, so wrap it:
    # return a (score, p-value) pair, with the p-value fixed at 0.5.
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
# Mutual-information (MIC) method
data12 = SelectKBest(lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T[0], k=2).fit_transform(data, target)
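# Standalone sanity check (a sketch, requires minepy): the MIC score between
# petal length (column 2) and the labels, ignoring the placeholder p-value.
print(mic(data[:, 2], target)[0])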
data13 = RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(data, target)
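# Optional inspection (a sketch): fit RFE separately to see which two features
# it keeps and how it ranks the rest.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2).fit(data, target)
print(rfe.support_)  # boolean mask of the selected features
print(rfe.ranking_)  # 1 = selected; larger = eliminated earlier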
data14 = SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(data, target)  # L1-penalized logistic regression as the selector
# Combining L1 and L2 penalties: L1 keeps only one representative of each group of
# correlated features, and the L2 weights are used to recover the rest of the group.
class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold under which two L2 weights count as "similar"
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                                    class_weight=class_weight, random_state=random_state, solver=solver,
                                    max_iter=max_iter, multi_class=multi_class, verbose=verbose,
                                    warm_start=warm_start, n_jobs=n_jobs)
        # An L2-penalized logistic regression with the same hyperparameters
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                                     class_weight=class_weight, random_state=random_state, solver=solver,
                                     max_iter=max_iter, multi_class=multi_class, verbose=verbose,
                                     warm_start=warm_start, n_jobs=n_jobs)
    def fit(self, X, y, sample_weight=None):
        # Fit the L1-penalized model
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Fit the L2-penalized model
        self.l2.fit(X, y, sample_weight=sample_weight)
        cntOfRow, cntOfCol = self.coef_.shape
        # The coefficient matrix has one row per class
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # Feature j survived the L1 penalty
                if coef != 0:
                    idx = [j]
                    # Its weight in the L2 model
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Feature k was zeroed out by L1 but has a similar L2
                        # weight, so treat j and k as one correlated group
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the surviving L1 weight evenly across the group
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self
data15 = SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(data, target)
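# Optional inspection (a sketch): refit the selector to see which features the
# combined L1/L2 model kept.
selector = SelectFromModel(LR(threshold=0.5, C=0.1)).fit(data, target)
print(selector.get_support())  # boolean mask over the 4 iris features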
data16 = SelectFromModel(GradientBoostingClassifier()).fit_transform(data, target)  # GBDT as the base model for selection
'''
Dimensionality reduction
'''
data17 = PCA(n_components=2).fit_transform(data)  # n_components: the target dimensionality
data18 = LDA(n_components=2).fit_transform(data, target)  # LDA is supervised, so it also takes the labels
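# Optional check (a sketch): the share of variance the first two principal
# components retain; on iris this is roughly 98%.
pca = PCA(n_components=2).fit(data)
print(pca.explained_variance_ratio_.sum())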