# 完整代码 (Complete code)

# 二元Logistic回归算法
#  数据准备
# 1  载入分析所需要的模块和函数
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import simplefilter


simplefilter(action='ignore', category=FutureWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score


# 2  Load the data set and inspect its structure
data = pd.read_csv(r'数据5.1.csv')  # an absolute path is recommended
data.info()  # prints a per-column summary to stdout
# NOTE: the bare expressions in this section only display a value when run
# interactively (REPL / notebook); run as a script their results are discarded.
data.shape[1]  # number of columns
data.columns
data.shape
data.dtypes
missing_mask = data.isna()
missing_mask.to_numpy().any()  # is there any missing value at all?
missing_mask.sum()  # missing-value count per column
data.head()
# 3  Descriptive analysis
# Lift pandas' display limits so the printed tables are not truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
print(data.describe())
# Describe every other variable separately for each value of V1.
summary_by_v1 = data.groupby('V1').describe()
summary_by_v1.unstack()
print(summary_by_v1)
# Cross-tabulate V3 against V1: raw counts first, then row proportions.
pd.crosstab(index=data['V3'], columns=data['V1'])
pd.crosstab(index=data['V3'], columns=data['V1'], normalize='index')




# 4  数据处理
# 4.1  区分分类特征和连续特征并进行处理
def data_encoding(data):
    """Return a model-ready copy of *data* restricted to columns V1..V9.

    The categorical column V3 is expanded into dummy (one-hot) columns via
    ``pd.get_dummies``. The continuous columns V2 and V4..V9 are standardised
    by subtracting the column mean and dividing by the column standard
    deviation, both computed with pandas' defaults. V1 is carried through
    without transformation.

    Args:
        data: DataFrame containing at least the columns V1 through V9.

    Returns:
        A new DataFrame holding V1, the standardised continuous columns and
        the ``V3_<value>`` dummy columns.
    """
    categorical_cols = ["V3"]
    continuous_cols = ['V2', "V4", "V5", "V6", "V7", "V8", "V9"]

    # Keep only the variables used by the model; any other column is dropped.
    selected = data[["V1", 'V2', "V3", "V4", "V5", "V6", "V7", "V8", "V9"]]
    encoded = pd.get_dummies(selected, columns=categorical_cols)

    # z-score standardisation of the continuous features.
    continuous = encoded[continuous_cols]
    centered = continuous - continuous.mean()
    encoded[continuous_cols] = centered / continuous.std()

    # V1 is in neither feature list, so it is written back as-is.
    encoded["V1"] = selected[["V1"]]
    return encoded




data = data_encoding(data)
# 4.2  Split the full sample into a training set and a test set
y = data['V1']  # response variable
# Features: every column except the response V1 and the dummy column V3_5
# (presumably dropped to serve as the reference category -- confirm).
X = data.drop(columns=['V1', 'V3_5'])
X.shape
X['intercept'] = [1] * len(X)  # extra all-ones column acting as the constant term
print(data["V1"].value_counts())
# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)
X_train.head()
y_train.head()




# 5  Fit a binary logistic regression model with scikit-learn
# C is the inverse regularisation strength, so C=1e10 makes the penalty negligible.
# NOTE(review): X already carries an all-ones 'intercept' column while
# fit_intercept=True also fits a constant term -- confirm the duplication is intended.
model = LogisticRegression(C=1e10, fit_intercept=True)
model.fit(X_train, y_train)
# model.score returns accuracy: correctly predicted samples / all samples.
print("训练样本预测准确率: {:.3f}".format(model.score(X_train, y_train)))
print("测试样本预测准确率: {:.3f}".format(model.score(X_test, y_test)))
model.coef_


# Predicted class labels for the test set.
predict_target = model.predict(X_test)
predict_target
# Predicted class probabilities for the test set. The original referenced
# predict_target_prob without ever assigning it, which raised NameError.
predict_target_prob = model.predict_proba(X_test)
predict_target_prob