# 根据本章所学内容,选取任一机器学习算法(包括神经网络),编写对应代码。并简述梯度下降算法的原理和流程、什么是过拟合及其解决方案、如何根据混淆矩阵计算准确率等指标。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, confusion_matrix, recall_score, precision_score, accuracy_score, \
f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from 逻辑回归 import List_score, AUC
# Lift pandas' column display limit so wide frames print every column
# instead of being elided with "...".
pd.set_option('display.max_columns', None)
def data_read(path=r"C:\Users\zhang\Desktop\附件133.csv", encoding="gbk"):
    """Load the data set from a CSV file and shorten the label column to ``Y``.

    Args:
        path: Location of the CSV file. The default is the path the script
            was originally written against, so calling ``data_read()`` with
            no arguments behaves as before.
        encoding: Text encoding passed to ``pandas.read_csv``.

    Returns:
        A DataFrame holding the file's contents, with the column
        ``'Y(1=default, 0=non-default)'`` renamed to ``'Y'``. If that column
        is absent, the frame is returned with its columns unchanged.
    """
    df = pd.read_csv(path, encoding=encoding)
    # Return the renamed frame rather than mutating in place; the caller
    # sees the same result either way.
    return df.rename(columns={'Y(1=default, 0=non-default)': 'Y'})
def get_category_bin(df, col, binsnum, labels, qcut=False):
    """Replace a column with a binned version named ``<col>_bin``.

    Args:
        df: Source DataFrame. It is not modified; a new frame is returned.
        col: Name of the column to discretise.
        binsnum: Bin specification. Passed as ``q`` to ``pandas.qcut`` when
            ``qcut`` is true, otherwise as ``bins`` to ``pandas.cut``.
        labels: Bin labels, forwarded unchanged to ``pandas.cut`` /
            ``pandas.qcut``.
        qcut: Use quantile-based binning (``pandas.qcut``) instead of
            ``pandas.cut``.

    Returns:
        A DataFrame without ``col`` and with an extra object-dtype column
        ``<col>_bin`` appended as the last column.
    """
    if qcut:
        binned = pd.qcut(df[col], q=binsnum, labels=labels)
    else:
        binned = pd.cut(df[col], bins=binsnum, labels=labels)

    name = col + '_' + 'bin'
    # Store the bins as plain objects rather than a pandas Categorical.
    binned = binned.rename(name).astype(object)

    # Drop the raw column by the `col` argument itself; the function must not
    # depend on any variable defined outside its own scope.
    return df.drop(col, axis=1).join(binned)
def category_continue_separation(df, feature_names):
    """Return the numeric feature columns as both result lists.

    ``'Y'`` and ``'编号'`` are removed from ``feature_names`` in place
    (first occurrence only) before the remaining columns are filtered by
    dtype.

    Args:
        df: DataFrame containing every column named in ``feature_names``.
        feature_names: Mutable list of candidate column names; modified by
            this call as described above.

    Returns:
        A ``(categorical_var, numerical_var)`` tuple. Both entries are the
        same list object: the names of the remaining columns whose dtype is
        one of the listed int/float dtypes.
    """
    for excluded in ('Y', '编号'):
        if excluded in feature_names:
            feature_names.remove(excluded)

    numeric_dtypes = ['int', 'float', 'int32', 'float32', 'int64', 'float64']
    numeric_frame = df[feature_names].select_dtypes(include=numeric_dtypes)
    numerical_var = list(numeric_frame.columns.values)

    # NOTE(review): the categorical result is deliberately kept as an alias of
    # the numeric list, so non-numeric columns appear in neither result.
    # Confirm with callers whether this is intended.
    return numerical_var, numerical_var
def data_analysis_plot(categorical_var, df, start, mid, row, col, showlabel=False):
p = 1
plt.figure(figsize=(20, 18))
for i in categorical_var[start:mid]:
df_var = "df_" + i
df_var = pd.crosstab(df[i], df["Y"])
df_var["bad_rate"] = round(df_var[1] / (df_var[1] + df_var[0]), 4)
ax1 = plt.subplot(row, col, p)
year = df_var.index
data1 = df_var[0]
data2 = df_var[1]
data3 = df_var["bad_rate"]
x = range(len(year))
bar_width = 0.3
ax1.bar(x, data1, width=bar_width, color='#3A669C', label="好用户")
ax1.bar([i + bar_width for i in x], data2, width=bar_width, color='#C0504D', label="违约用户")
plt.xticks([i + bar_width / 2 for i in x], year)
if p == 1 or p == col + 1:
ax1.set_ylabel('用户数量', size=10)
ax1.set_xlabel(i, size=10)
text_heiht = 3
for x1, y1 in enumerate(data1):
ax1.text(x1, y1 + text_heiht, y1, ha='center', fontsize=8)
for x2, y2 in enumerate(data2):
ax1.text(x2 + bar_width, y2 + text_heiht, y2, ha='center', fontsize=8)
ax2 = ax1.twinx()
p3 = ax2.plot([i + bar_width / 2 for i in x], data3, color="gray", linestyle='--', label="坏用户占比")
if p == 3 or p == 6:
ax2.set_ylabel("坏用户占比"