import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Global matplotlib font: SimHei at size 20. The plot titles further down are
# written in Chinese, which is presumably why a CJK-capable typeface is chosen.
font = dict(family='SimHei', size='20')
plt.rc('font', **font)

# Load the raw records from the local CSV and preview the first five rows.
df = pd.read_csv('data.csv')
df.head(n=5)
|
CustomerID |
Gender |
Age |
Annual Income (k$) |
Spending Score (1-100) |
0 |
1 |
Male |
19 |
15 |
39 |
1 |
2 |
Male |
21 |
15 |
81 |
2 |
3 |
Female |
20 |
16 |
6 |
3 |
4 |
Female |
23 |
16 |
77 |
4 |
5 |
Female |
31 |
17 |
40 |
# Show the column labels as loaded from the CSV, before they are renamed.
df.columns
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
'Spending Score (1-100)'],
dtype='object')
# Relabel the five columns positionally with Chinese names, in order:
# user ID, gender, age, annual income, spending.
# The assignment is by position, so the list length must equal the column count.
df.columns = ['用户ID', '性别', '年龄', '年收入', '支出']
# Preview the frame again to confirm the new labels.
df.head()
|
用户ID |
性别 |
年龄 |
年收入 |
支出 |
0 |
1 |
Male |
19 |
15 |
39 |
1 |
2 |
Male |
21 |
15 |
81 |
2 |
3 |
Female |
20 |
16 |
6 |
3 |
4 |
Female |
23 |
16 |
77 |
4 |
5 |
Female |
31 |
17 |
40 |
# Count the missing values in each column.
df.isnull().sum()
用户ID 0
性别 0
年龄 0
年收入 0
支出 0
dtype: int64
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
|
用户ID |
年龄 |
年收入 |
支出 |
count |
200.000000 |
200.000000 |
200.000000 |
200.000000 |
mean |
100.500000 |
38.850000 |
60.560000 |
50.200000 |
std |
57.879185 |
13.969007 |
26.264721 |
25.823522 |
min |
1.000000 |
18.000000 |
15.000000 |
1.000000 |
25% |
50.750000 |
28.750000 |
41.500000 |
34.750000 |
50% |
100.500000 |
36.000000 |
61.500000 |
50.000000 |
75% |
150.250000 |
49.000000 |
78.000000 |
73.000000 |
max |
200.000000 |
70.000000 |
137.000000 |
99.000000 |
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
用户ID 200 non-null int64
性别 200 non-null object
年龄 200 non-null int64
年收入 200 non-null int64
支出 200 non-null int64
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
查看数据分布
# One wide figure laid out as a 2x2 grid of panels under a shared heading.
fig = plt.figure(figsize=(20, 8))
fig.suptitle('各指标数据分布')

# Panel 1 of the grid: histogram of the age column.
ax1 = fig.add_subplot(2, 2, 1)
age_values = df['年龄']
ax1.hist(age_values)
ax1.title.set_text('年龄分布')

# Panel 2 of the grid.
ax2 = fig.add_subplot(2, 2, 2)
male, female = (df['性别'] == 'Male').sum(), (df