import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the dataset once; the plots below all read from this DataFrame.
data = pd.read_csv(r'data.csv')

# Box plot of annual income (English labels).
# The figure has to exist *before* seaborn draws: plotting first puts the
# box plot on an implicit default-size figure, and the later plt.figure()
# call would then attach the title/label to a new, empty figure.
plt.figure(figsize=(6, 4))
sns.boxplot(x=data['Annual Income'])
plt.title('annual income photo')
plt.xlabel('annual income')
# tight_layout must actually be called (a bare `plt.tight_layout` is a
# no-op), and it runs last so the title and label are included in the fit.
plt.tight_layout()
# The figure is left open; it is rendered by a later plt.show() call.
# Switch to a CJK-capable font so the Chinese titles and labels below render.
# NOTE(review): 'STHeiti' must be installed on the machine running this
# script; confirm, or add fallbacks, for other environments.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# Use an ASCII hyphen for the minus sign. CJK fonts commonly lack the
# Unicode minus glyph, and True (the previous value) is just matplotlib's
# default, so that setting had no effect.
plt.rcParams['axes.unicode_minus'] = False

# Box plot of annual income (Chinese labels).
# Create the figure first so the plot, title and label share one figure
# instead of the title landing on a new, empty one.
plt.figure(figsize=(6, 4))
sns.boxplot(x=data['Annual Income'])
plt.title('年收入箱线图')
plt.xlabel('年收入')
# Called (not merely referenced) and placed last so the layout accounts for
# the title and axis label.
plt.tight_layout()
# The figure is left open; it is rendered by a later plt.show() call.
# Print the available column names for reference.
print(data.columns)

# Histogram of the 'Years in current job' column.
plt.figure(figsize=(6, 4))
sns.histplot(data['Years in current job'])
# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.title('在当前工作年限直方图')
plt.xlabel('在当前工作年限')
plt.ylabel('员工数量')
# tight_layout has to be called to take effect; doing it after the rotated
# ticks, title and labels are in place lets it reserve room for all of them.
plt.tight_layout()
# The figure is left open; it is rendered by a later plt.show() call.
# Box plot of annual income, one box per 'Credit Default' value.
plt.figure(figsize=(6, 4))
sns.boxplot(x='Credit Default', y='Annual Income', data=data)
plt.title('Annual income vs.credit default')
plt.xlabel('credit default')
plt.ylabel('annual income')
# tight_layout has to be called to take effect (the bare attribute reference
# did nothing); it runs last so the title and labels are included in the fit.
plt.tight_layout()
# The figure is left open; it is rendered by a later plt.show() call.
# Histogram of annual income, coloured by 'Credit Default', drawn as step
# outlines with a KDE curve overlaid.
plt.figure(figsize=(6, 4))
sns.histplot(
    x='Annual Income',
    hue='Credit Default',
    data=data,
    kde=True,
    element='step',
)
plt.title('Annual Income vs.credit default')
plt.xlabel('annual income')
plt.ylabel('count')
# tight_layout has to be called to take effect (the bare attribute reference
# did nothing); it runs last so the title and labels are included in the fit.
plt.tight_layout()
# The figure is left open; it is rendered by a later plt.show() call.
# Count plot of 'Number of Open Accounts', split by 'Credit Default'.
plt.figure(figsize=(6, 4))
# x and hue are column names; data=data tells seaborn which DataFrame to
# look those columns up in, so the Series do not have to be passed directly.
sns.countplot(x='Number of Open Accounts', hue='Credit Default', data=data)
# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.title('number of open account vs.credit default')
plt.xlabel('number of open account')
plt.ylabel('count')
# tight_layout has to be called to take effect; it runs after the rotated
# ticks, title and labels are set so it can make room for them.
plt.tight_layout()
# `plt.show` without parentheses only references the function and displays
# nothing; call it so the figures created so far are actually rendered.
plt.show()
# Bucket the number of open accounts into coarse ranges so the count plot
# stays readable.
# include_lowest=True closes the first bin on the left: with the default
# (left-open) bins a value of exactly 0 falls outside (0, 5] and becomes NaN,
# even though the first label is '0-5'.
data['Open Accounts Group'] = pd.cut(
    data['Number of Open Accounts'],
    bins=[0, 5, 10, 15, 20, float('inf')],
    labels=['0-5', '6-10', '11-15', '16-20', '20+'],
    include_lowest=True,
)

# Count plot of the grouped open-account ranges, split by 'Credit Default'.
plt.figure(figsize=(6, 4))
sns.countplot(x='Open Accounts Group', hue='Credit Default', data=data)
plt.title('Number of Open Accounts (Grouped) vs.Credit Default')
plt.xlabel('number of open accounts group')
plt.ylabel('count')
# Fit the title and labels inside the figure, as for the other plots.
plt.tight_layout()
# Render every figure that is still open.
plt.show()
import pandas as pd
# --- Preprocessing: impute missing values, then one-hot encode -------------
# Reload the raw CSV so preprocessing starts from the file's own columns
# (the plotting code above added a derived 'Open Accounts Group' column).
data = pd.read_csv(r'data.csv')

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
data.info()
print(data.isnull().sum())
print(data.head())

# Fill missing values column by column: the mean for non-object (numeric)
# columns, the most frequent value for object (string) columns.
for column in data.columns:
    if data[column].dtype != 'object':
        fill_value = data[column].mean()
    else:
        modes = data[column].mode()
        if modes.empty:
            # Nothing to impute from: the column has no non-missing values.
            continue
        fill_value = modes[0]
    # Assign the result back rather than using fillna(inplace=True) on
    # data[column]: that is chained assignment, which pandas deprecates and
    # which does not update `data` under Copy-on-Write.
    data[column] = data[column].fillna(fill_value)

# One-hot encode the object columns; drop_first=True drops the first level
# of each encoded column.
data = pd.get_dummies(data, drop_first=True)

# Columns that exist after encoding but not in the raw file are the newly
# created dummy columns.
data2 = pd.read_csv(r'data.csv')
discrete_list = [column for column in data.columns if column not in data2.columns]

# Cast the dummy columns to plain integers.
for column in discrete_list:
    data[column] = data[column].astype(int)
print(data.head())