import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the data set from "creditcard.csv" in the current working directory.
DATA = pd.read_csv("creditcard.csv")
# Check class balance ----------------------------------------------------------
# Count the rows for each value of the "Class" column, order the counts by
# class label, and show them as a bar chart.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which pandas has deprecated; the resulting counts are the same.
count_classes = DATA["Class"].value_counts(sort=True).sort_index()
count_classes.plot(kind="bar")  # pandas' built-in plotting is enough for a simple chart
plt.show()
# Standardize the transaction amount -------------------------------------------
from sklearn.preprocessing import StandardScaler

# A StandardScaler instance standardizes the data it is fitted on. The scaler
# is given a single-column 2-D array: reshape(-1, 1) fixes one column and lets
# NumPy infer the number of rows (that is what the -1 means).
DATA["normAmount"] = StandardScaler().fit_transform(
    DATA["Amount"].values.reshape(-1, 1)
)
# "normAmount" now stands in for the raw "Amount"; "Time" is removed as well.
DATA = DATA.drop(columns=["Time", "Amount"])
# Undersampling ----------------------------------------------------------------
# Undersampling randomly draws, from the class with more rows, as many rows as
# the class with fewer rows has, so that both classes end up the same size.
DATA_matrix = DATA.values
# Boolean mask over the columns: True only for the "Class" (label) column.
_label_mask = DATA.columns == "Class"
X = DATA_matrix[:, ~_label_mask]  # every column except "Class"
y = DATA_matrix[:, _label_mask]   # the "Class" column, kept 2-D (one column)

# Index labels of the rows with Class == 1 (fraud) and Class == 0 (normal).
fraud_indices = DATA.index[DATA["Class"] == 1].to_numpy()
norm_indices = DATA.index[DATA["Class"] == 0].to_numpy()
number_records_fraud = len(fraud_indices)

# np.random.choice(pool, size, replace=False) draws `size` entries from `pool`
# without replacement; it already returns an ndarray, so no extra np.array()
# wrap is needed.
# NOTE(review): the draw is not seeded here, so the selected rows differ from
# run to run. The variable name keeps its original spelling ("indics") in case
# code further down the file refers to it.
random_norm_indics = np.random.choice(norm_indices, number_records_fraud, replace=False)

# Fraud rows first, followed by the randomly chosen normal rows.
under_sample_indices = np.concatenate([fraud_indices, random_norm_indics])
# These values are index *labels* taken from DATA.index, so the rows are
# selected by label with .loc. Positional .iloc only gives the same rows while
# DATA still has its default 0..n-1 index.
under_sample = DATA.loc[under_sample_indices, :]
x_under_sample = under_sample.values[:, ~_label_mask]
y_under_sample = under_sample.values[:, _label_mask]
# Train/test split (hold-out data for validation) --------------------------------
from sklearn.model_selection import train_test_split

# train_test_split receives the feature matrix first and the label matrix
# second and returns four pieces; the train features pair up row by row with
# the train labels, and likewise for the test pieces. test_size=0.3 sends 30%
# of the rows to the test set and random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample = train_test_split(x_under_sample,
逻辑回归案例:信用卡欺诈检测
最新推荐文章于 2022-06-20 16:42:00 发布