"""Exploratory clustering of senators by their roll-call votes.

Reads ``114_congress.csv``, prints basic summaries, measures the Euclidean
distance between the first two rows' vote vectors, clusters all rows into
two groups with KMeans, and ranks rows by an "extremism" score derived from
their distances to the two cluster centres.
"""
import pandas as pda
import numpy as np
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import euclidean_distances

# Load the data set.
data = pda.read_csv("114_congress.csv")

# Show the first few rows.
print(data.head())

# Visualise missing values per column.
missingno.bar(data, labels=True, color="b")
plt.show()

# Inspect the data. ``DataFrame.info`` writes its report itself and returns
# None, so it is called directly rather than wrapped in ``print``.
data.info()
print(data.describe())
print(data.shape)  # observed: (100, 18)

# How many members per party? Observed: R 54, D 44, I 2.
print(data["party"].value_counts())
# Number of distinct names; observed 100, i.e. no duplicates.
print(len(data["name"].value_counts()))

# Feature matrix: every column from position 3 onward. The same frame feeds
# both the distance computation and KMeans so the two agree on the features.
# NOTE(review): assumes the first three columns are non-vote metadata -
# confirm against the CSV header.
votes = data.iloc[:, 3:]

# Similarity via Euclidean distance between the first two rows.
print("=====================欧式距离=================================")
# ``iloc[[i]]`` keeps a 2-D (1, n_features) frame, which is what
# ``euclidean_distances`` expects; ``Series.reshape`` no longer exists.
print(euclidean_distances(votes.iloc[[0]], votes.iloc[[1]]))

# Cluster the rows into two groups.
model = KMeans(n_clusters=2, random_state=1)
# ``fit_transform`` returns each row's distance to each cluster centre.
distances = model.fit_transform(votes)
labels = model.labels_
print(labels)
print(pda.crosstab(labels, data["party"]))

# Democrats that landed in cluster 1.
# NOTE(review): KMeans label numbering is arbitrary; this presumes cluster 1
# is the Republican-majority cluster - verify against the crosstab above.
democratic_outliers = data[(labels == 1) & (data["party"] == "D")]
print(democratic_outliers)

# Plot each row by its distance to the two cluster centres.
plt.scatter(x=distances[:, 0], y=distances[:, 1], c=labels)
plt.show()

print("==============离群点================")
# Cubing the distances before summing exaggerates the gap between rows that
# are far from a centre and rows that are close to both.
extremism = (distances ** 3).sum(axis=1)
print(extremism)
data["extremism"] = extremism
print(data.head())

print("==============================")
# Highest extremism scores first.
data.sort_values("extremism", inplace=True, ascending=False)
print(data.head())