前言:本文为VV观看唐宇迪老师《Python数据分析与机器学习实战》视频课程的学生学习笔记,五星推荐风趣幽默,讲课清晰易懂的唐宇迪老师!
数据:creditcard.csv
百度云https://pan.baidu.com/s/1fzqeieHOrBmV5TJ1RfhBbw 提取码: buiu(文件比较大)
数据来源为CSDN博主「三猪」的原创文章。原文链接:https://blog.csdn.net/weixin_39739342/article/details/98475240
VV笔记
案例实战——信用卡欺诈检测
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the credit-card transactions from creditcard.csv in the working directory.
data = pd.read_csv('creditcard.csv')
# Bare expressions: they only show output in an interactive / notebook session.
data.head()  # preview of the first rows
data.shape  # (number of rows, number of columns)
(284807, 31)
data['Class'].value_counts()
# value_counts tallies each distinct value and its frequency. Two spellings:
#   1) data['Class'].value_counts()
#   2) pd.value_counts(data['Class'])  (NOTE: deprecated in recent pandas releases)
# Class 0 = normal (legitimate) record; Class 1 = abnormal (fraudulent) record.
0 284315
1 492
Name: Class, dtype: int64
# First check whether the two classes are balanced.
# Series.value_counts is used instead of the top-level pd.value_counts helper,
# which is deprecated in recent pandas releases; the result is the same.
#   sort=True     -> order the counts by frequency
#   .sort_index() -> then re-order by class label so the bars appear as 0, 1
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
# Result: the data is extremely imbalanced.
# Class 0 vastly outnumbers class 1, so the imbalance has to be handled with
# one of two resampling strategies:
#   undersampling ("equally few"): keep only about as many class-0 rows as
#                                  there are class-1 rows
#   oversampling ("equally many"): generate class-1 samples until there are
#                                  as many as class-0 rows
数据预处理
from sklearn.preprocessing import StandardScaler  # preprocessing: standardisation

# Add a standardised copy of Amount as a new column.
# fit_transform(X) learns the mean and standard deviation of X and then applies
# the scaling to that same X.  The scaler expects a 2-D input, so the column is
# selected with double brackets to obtain an (n_rows, 1) array.
data['normAmount'] = StandardScaler().fit_transform(
    data[['Amount']].values
)

# Drop the columns that are no longer needed.
data = data.drop(labels=['Time', 'Amount'], axis='columns')
data.head()
下采样
# Split the full data set into features and label.
# .loc with a boolean column mask replaces .ix, which was removed in pandas 1.0.
X = data.loc[:, data.columns != 'Class']  # every column except 'Class'
y = data.loc[:, data.columns == 'Class']  # one-column frame holding only 'Class'

number_records_fraud = len(data[data['Class'] == 1])  # number of fraud rows
fraud_indices = np.array(data[data.Class == 1].index)  # index labels of all fraud rows
normal_indices = data[data.Class == 0].index  # index labels of all normal rows

# Randomly pick, without replacement, as many normal rows as there are fraud rows.
# np.random.choice already returns an ndarray, so no extra np.array() copy is needed.
# NOTE: no seed is set here, so the sample differs from run to run.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# np.concatenate([a, b]) joins the two label arrays into one.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The arrays above hold index *labels*, so select with .loc rather than the
# positional .iloc (the two only coincide while the index is the default 0..n-1).
under_sample_data = data.loc[under_sample_indices, :]  # the undersampled data set

# Build the masks from the undersampled frame's own columns.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# showing ratio
print("Percentage of normal transactions: ",len(under_sample_data[under_sample_data['Class'] == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ",len(under_sample_data[under_sample_data['Class'] != 0])/len(under_sample_data))
print("Total number of transactions in resample data: ",len(under_sample_data))