import pandas as pd
# Load the vote table. A raw string keeps every backslash in the Windows path
# literal; the previous spelling relied on "\m" being passed through, which is
# an invalid escape sequence that newer interpreters warn about.
votes = pd.read_csv(r"D:\test\machineLearning\114_congress.csv")

# Number of rows for each value of the "party" column.
print(votes["party"].value_counts())

# Per-column means. numeric_only=True skips the non-numeric columns explicitly;
# recent pandas versions raise instead of dropping them silently.
print(votes.mean(numeric_only=True))
R 54
D 44
I 2
Name: party, dtype: int64
00001 0.325
00004 0.575
00005 0.535
00006 0.945
00007 0.545
00008 0.415
00009 0.545
00010 0.985
00020 0.525
00026 0.545
00032 0.410
00038 0.480
00039 0.510
00044 0.460
00047 0.370
dtype: float64
from sklearn.metrics.pairwise import euclidean_distances
# Euclidean distance between rows 0 and 1, using the columns from position 3
# onward. Series.reshape is no longer available in pandas, and a row sliced out
# of a mixed-dtype frame has object dtype, so each row is converted to a float
# ndarray and reshaped to a single-row 2-D array before being passed on.
print(
    euclidean_distances(
        votes.iloc[0, 3:].values.astype(float).reshape(1, -1),
        votes.iloc[1, 3:].values.astype(float).reshape(1, -1),
    )
)

# Same measurement between rows 0 and 2; stored in `distance`, not printed.
distance = euclidean_distances(
    votes.iloc[0, 3:].values.astype(float).reshape(1, -1),
    votes.iloc[2, 3:].values.astype(float).reshape(1, -1),
)
[[ 1.73205081]]
import pandas as pd
from sklearn.cluster import KMeans
# Split the rows into two clusters based on the columns from position 3 onward.
# n_init is pinned to 10 (the long-standing default) so the result does not
# change with scikit-learn versions that use a different default.
kmeans_model = KMeans(n_clusters=2, n_init=10, random_state=1)

# fit_transform fits the model and returns one column per cluster for each row.
senator_distances = kmeans_model.fit_transform(votes.iloc[:, 3:])
labels = kmeans_model.labels_

# Cross-tabulate the assigned cluster label against the "party" column.
print(pd.crosstab(labels, votes["party"]))

# Rows placed in cluster 1 whose party is "D". The comparison was previously
# "!=", which selected the opposite set from what the variable name says.
democratic = votes[(labels == 1) & (votes["party"] == "D")]
party D I R
row_0
0 41 2 0
1 3 0 54
# Rows assigned to cluster 1 whose "party" value is "D".
democratic = votes[(labels == 1) & (votes["party"] == "D")]
# print() call form runs on Python 3 and matches the other print calls in this file.
print(democratic)
name party state 00001 00004 00005 00006 00007 00008 00009 \
42 Heitkamp D ND 0.0 1.0 0.0 1.0 0.0 0.0 1.0
56 Manchin D WV 0.0 1.0 0.0 1.0 0.0 0.0 1.0
74 Reid D NV 0.5 0.5 0.5 0.5 0.5 0.5 0.5
00010 00020 00026 00032 00038 00039 00044 00047
42 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
56 1.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0
74 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
import matplotlib.pyplot as plt
# Scatter plot of the two columns of senator_distances against each other,
# with each point coloured by its cluster label.
x_coords = senator_distances[:, 0]
y_coords = senator_distances[:, 1]
plt.scatter(x=x_coords, y=y_coords, c=labels)
plt.show()

# Score each row by summing the cubes of its values in senator_distances;
# cubing makes large values dominate the total.
extremism = (senator_distances ** 3).sum(axis=1)
votes["extremism"] = extremism

# Rank rows from highest to lowest score.
# NOTE: this reorders `votes` in place, so the positional arrays `labels` and
# `senator_distances` no longer line up with its rows after this point.
votes.sort_values("extremism", inplace=True, ascending=False)

# print() call form runs on Python 3 and matches the other print calls in this file.
print(votes.head(2))
name party state 00001 00004 00005 00006 00007 00008 00009 \
98 Wicker R MS 0.0 1.0 1.0 1.0 1.0 0.0 1.0
53 Lankford R OK 0.0 1.0 1.0 0.0 1.0 0.0 1.0
00010 00020 00026 00032 00038 00039 00044 00047 extremism
98 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 46.250476
53 1.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 46.046873