#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 17 03:01:06 2018
@author: vicky
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy import stats
from pandas.core.frame import DataFrame
#-------------- Feature extraction
#df = pd.read_excel('/Users/vicky/Desktop/建模/附件1.xlsx')
result=pd.read_excel('/Users/vicky/Desktop/建模/第一问/结果.xlsx')
data = result[(result.gname == 'Unknown')]  # select incidents with an unknown perpetrator
data = data[(data.iyear >= 2015)]  # last 3 years
data1 = data[['iyear', 'imonth', 'country', 'region', 'suicide', 'attacktype1',
              'targtype1', 'weaptype1', 'ransom']]  # 9 feature variables, all discrete
#------------- Handle missing and negative values
#data1[data1.columns[8]].value_counts()  # ransom is the last (9th) column
data1.isnull().sum()  # only ransom contains NaNs and negative values
# set missing and negative values to 0 (treat unknown ransom status as no ransom)
data1.loc[data1.ransom < 0, 'ransom'] = 0  # use .loc so only the ransom column is overwritten
data1['ransom'] = data1['ransom'].fillna(0)
#data1['ransom'].value_counts()
#-------------- Correlation check
corr = data1.corr()  # correlations between the variables are low, so no decorrelation step is needed
sns.heatmap(corr)
plt.show()
#--------------- Standardization ----------------
scaler = StandardScaler().fit(data1)
data2 = pd.DataFrame(scaler.transform(data1))  # reuse the fitted scaler instead of fitting a second time
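# Quick sanity check (an addition, not in the original workflow): after StandardScaler
# every column should have mean ~0 and standard deviation ~1.
print(data2.mean().round(3))
print(data2.std().round(3))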
#---------------- K-means clustering ---------------
#---- Choosing K ------
kmax = 15  # upper bound of the search range for K
# Elbow method: plot SSE against K; the best K lies at the elbow of the curve
SSE = []  # sum of squared errors for each K
for k in range(1, kmax):
    estimator = KMeans(n_clusters=k, init='k-means++', random_state=1234)  # build the clusterer
    estimator.fit(data2)
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(range(1,kmax),SSE,'o-',color='black')
plt.grid()
plt.xticks(np.arange(1, kmax, 1))
plt.show()
# Not informative here: no clear elbow is visible
# Silhouette method: the k with the largest silhouette coefficient is best
Scores = []  # silhouette coefficients for each k
for k in range(2, kmax):
    estimator = KMeans(n_clusters=k, init='k-means++', random_state=1234)  # build the clusterer
    estimator.fit(data2)
    Scores.append(silhouette_score(data2, estimator.labels_, metric='euclidean'))
plt.xlabel('k')
plt.ylabel('Silhouette Coefficient')
plt.plot(range(2,kmax),Scores,'o-',color='black')
plt.grid()
plt.xticks(np.arange(1, kmax, 1))
plt.show()
# Local maxima at k=3 and k=9; the elbow plot shows the SSE is still large at k=3, so k=9 gives the best clustering
score = Scores[7]  # silhouette coefficient for k=9 (Scores[i] corresponds to k = i + 2)
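# Cross-check (an addition, not in the original script): read the best k off programmatically.
# Scores[i] corresponds to k = i + 2, so the index of the maximum maps back to k.
best_k = int(np.argmax(Scores)) + 2
print('k with the highest silhouette coefficient:', best_k, 'score:', max(Scores))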
#--- Cluster with the chosen k -----
num = 9  # chosen number of clusters
clf = KMeans(n_clusters=num,init='k-means++',random_state=1234)
model = clf.fit(data2)
# cluster centers
centers=clf.cluster_centers_
print(centers)
# K-means SSE
print(clf.inertia_)
# cluster assignment for each sample
label2 = list(clf.labels_)
set2 = {k: label2.count(k) for k in set(label2)}  # cluster sizes
print(set2)
result2=DataFrame(copy.copy(data.eventid))
result2.insert(0,'label2',label2)
result2.to_csv('/Users/vicky/Desktop/建模/第二问/result2.csv', sep=',', header=True, index=False,encoding="utf_8_sig")
#-------- Rank the clusters by harmfulness, using the result of Question 1 --------------
dic = {'最高': 5, '较高': 4, '中等': 3, '较低': 2, '最低': 1}  # level labels (highest..lowest) -> weights 5..1
lev = list(data.level)
wh = []  # harmfulness: Question 1 levels weighted 5, 4, 3, 2, 1 from highest to lowest
for i in range(len(data)):
    wh.append(dic[lev[i]])
wh = DataFrame(wh, columns=['weihai'])
wh.insert(0, 'label2', label2)
wh_label = wh['weihai'].groupby(wh['label2']).sum()  # sum the weights per cluster
# clusters ranked by harmfulness, high to low: 8, 4, 2, 7, 5, 3, 6, 1, 0
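# Added check: sorting the per-cluster sums should reproduce the ranking stated above.
print(wh_label.sort_values(ascending=False))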
#------ Extract the incidents given in the problem statement
eid=[201701090031,201702210037,201703120023,201705050009,201705050010,201707010028,201707020006,
201708110018,201711010006,201712010003]
data3=copy.copy(data)
data3.insert(0,'label2',label2)
test=DataFrame()
for x in eid:
    test = test.append(data3[(data3.eventid == x)])  # DataFrame.append is deprecated in newer pandas; pd.concat is the modern equivalent
test1=test[['iyear', 'imonth', 'country', 'region', 'suicide','attacktype1',
'targtype1', 'weaptype1', 'ransom']]
test1.loc[test1.ransom < 0, 'ransom'] = 0  # only overwrite the ransom column, as above
test1['ransom']=test1['ransom'].fillna(0)
# Standardize with the scaler fitted on data1, so the test events sit in the same scaled
# space as the K-means centers (refitting on only 10 events would give a different scale)
test2 = pd.DataFrame(scaler.transform(test1))
#------ Distances from each test event to the centers of clusters 2, 4, 5, 7 and 8
k = [2, 4, 5, 7, 8]
distance = np.zeros((10, 5))  # distance matrix
for i in range(len(test1)):
    for j in range(len(k)):
        distance[i][j] = np.sqrt(sum(np.power(test2.iloc[i] - centers[k[j], :], 2)))  # Euclidean distance
rank_dis = DataFrame(distance).rank(axis=1)  # rank within each row; the smallest distance ranks first
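# Vectorized alternative (a sketch, not used for the results above): scipy's cdist computes
# all test-to-center Euclidean distances in one call and should match the loop above.
from scipy.spatial.distance import cdist
distance_alt = cdist(test2.values, centers[k, :], metric='euclidean')
assert np.allclose(distance, distance_alt)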
# Other clustering methods tried
#----------------MeanShift-----------
from sklearn.cluster import MeanShift, estimate_bandwidth
# Compute clustering with MeanShift
ms = MeanShift(bin_seeding=True)
ms.fit(data2)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
# MeanShift automatically found 8 clusters
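# estimate_bandwidth is imported above but never used; as a sketch (quantile and n_samples
# are assumed values, not from the original run), it can set the kernel width explicitly
# instead of relying on MeanShift's default bandwidth estimation:
bandwidth = estimate_bandwidth(data2, quantile=0.2, n_samples=500, random_state=1234)
ms2 = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms2.fit(data2)
print("clusters with explicitly estimated bandwidth:", len(np.unique(ms2.labels_)))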
#-------------- Spectral clustering -----------
from sklearn.cluster import SpectralClustering
from sklearn import metrics
ch_scores = []  # new name so the `result` DataFrame loaded above is not overwritten
for k in range(2, kmax):
    y_pred = SpectralClustering(n_clusters=k).fit_predict(data2)
    ch_scores.append([k, metrics.calinski_harabasz_score(data2, y_pred)])  # spelled calinski_harabaz_score in older scikit-learn
    print("Calinski-Harabasz Score with n_clusters=", k, "score:", ch_scores[-1][1])
# Runs very slowly
y_pred = SpectralClustering(n_clusters=8).fit_predict(data2)
print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(data2, y_pred))
#--------------- Hierarchical clustering -------------
from scipy.cluster import hierarchy
import scipy.spatial.distance as dist
# pairwise distance matrix between the points (condensed form)
distMatrix = dist.pdist(data1)
# run hierarchical clustering:
Z = hierarchy.linkage(distMatrix, method = 'complete')
# visualize the hierarchy as a dendrogram
hierarchy.dendrogram(Z)
plt.show()
# extract flat clusters from the linkage matrix Z:
cluster= hierarchy.fcluster(Z, 1, 'inconsistent')
num_clusters = cluster.max()
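# Added check: fcluster labels start at 1, so cluster sizes can be read from np.bincount
# (position 0 is unused).
print('number of hierarchical clusters:', num_clusters)
print('cluster sizes:', np.bincount(cluster)[1:])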
#-------------DBSCAN---------------
from sklearn.cluster import DBSCAN
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(data2)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
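# Added check: DBSCAN marks noise points with the label -1; counting them shows how much of
# the data falls outside any cluster at eps=0.3, min_samples=10.
n_noise_ = list(labels).count(-1)
print('Estimated number of noise points: %d' % n_noise_)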