# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import os
import glob
# sklearn models
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import GradientBoostingClassifier as gbdt
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
# sklearn feature engineering, data preparation, and evaluation
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate, KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import BernoulliRBM
from sklearn.datasets import make_blobs
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin
from sklearn import clone
# keras data preparation
from keras.models import load_model
from keras.utils import to_categorical
# keras neural networks
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# DataFrame display options
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 30)
1. Data Preparation
The data is stored in a CSV file of less than 10 MB, so pandas is the main tool for processing it.
The basic processing steps are: scaling (standardizing) the numerical variables and converting the categorical variables into numeric (one-hot) encodings.
path = r'C:/Users/Administrator/Documents/ls/data/pima-indians-diabetes.data.csv'
data_set = pd.read_csv(filepath_or_buffer=path,encoding='utf-8',sep=',',index_col=False, header=None)
use_data = pd.get_dummies(data_set, columns=[0,7])
use_data.head()
|   | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 0_0 | 0_1 | 0_2 | 0_3 | 0_4 | 0_5 | 0_6 | 0_7 | ... | 7_58 | 7_59 | 7_60 | 7_61 | 7_62 | 7_63 | 7_64 | 7_65 | 7_66 | 7_67 | 7_68 | 7_69 | 7_70 | 7_72 | 7_81 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 76 columns
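Before the numeric columns are standardized, the outcome label still has to be separated from the features and the data split into training and test sets. A minimal sketch (the variable names below are illustrative, not from the original notebook; column 8 holds the 0/1 diabetes outcome in this dataset):
# Illustrative sketch: split off the outcome label (column 8), then make a train/test split
labels = use_data[8]
features = use_data.drop(8, axis=1)
train_x, test_x, train_y, test_y = train_test_split(
    features, labels, test_size=0.2, random_state=42)
print(train_x.shape, test_x.shape)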
# Standardize the numerical columns (columns 1-6 by default)
def normolized_col(train_df, test_df, tranform_colname=list(range(1, 7, 1))):
    stand = StandardScaler()
    # fit the scaler on the training columns only to avoid test-set leakage
    stand.fit(train_df.loc[:, tranform_colname])
    # apply the same fitted scaler to both the training and the test frame
    train_df.loc[:, tranform_colname] = stand.transform(train_df.loc[:, tranform_colname])
    test_df.loc[:, tranform_colname] = stand.transform(test_df.loc[:, tranform_colname])
    return train_df, test_df
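A quick usage sketch, assuming a split like the one above (train_x / test_x are illustrative names): the scaler is fitted on the training frame only, and the same transform is applied to both frames.
# Illustrative usage: standardize columns 1-6 on copies of the split frames
train_x, test_x = normolized_col(train_x.copy(), test_x.copy())
train_x[list(range(1, 7))].describe()  # the scaled columns now have mean ~ 0 and std ~ 1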