Detecting Anomalous Operations with the K-Nearest Neighbors Algorithm
1. Data cleaning
Read the operation commands in the file line by line and group every 100 commands into one operation sequence (each sequence has a corresponding label). While reading, collect all commands so that the 50 most frequently used and the 50 least frequently used commands can be determined:
with open(filename) as f:
    i = 0
    x = []
    for line in f:
        line = line.strip('\n')
        x.append(line)
        dist.append(line)
        i += 1
        if i == 100:              # every 100 commands form one operation sequence
            cmd_list.append(x)
            x = []
            i = 0
# order the overall command vocabulary by frequency (most frequent first),
# so that slicing really picks the most / least frequently used commands
fdist = [cmd for cmd, count in FreqDist(dist).most_common()]
dist_max = set(fdist[0:50])       # 50 most frequent commands
dist_min = set(fdist[-50:])       # 50 least frequent commands
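The snippet assumes the Schonlau masquerade-detection data (the MasqueradeDat directory used throughout this post), where each user file holds 15,000 commands, one per line, so slicing into blocks of 100 yields 150 operation sequences. A minimal sanity check of that layout, assuming the same User3 path used later in this post:

# Quick sanity check of the raw file layout; the path and the 15,000-command
# figure follow the MasqueradeDat dataset used in this post.
with open("G:/data/MasqueradeDat/User3") as f:
    commands = [line.strip('\n') for line in f]
print(len(commands))          # expected 15000 commands in total
print(len(commands) // 100)   # expected 150 blocks of 100 commands
print(len(set(commands)))     # size of this user's command vocabulary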
2. Feature extraction
(1) f1: the number of distinct commands in the sequence (each sequence is a block of 100 operation commands)
(2) f2: the 10 most frequently used commands in the sequence
(3) f3: the 10 least frequently used commands in the sequence
f2 and f3 are then turned into scalars by measuring their overlap with the dist_max and dist_min sets computed above (a toy worked example follows the snippet below):
f1 = len(set(cmd_block))           # number of distinct commands in the block
fdist = [cmd for cmd, count in FreqDist(cmd_block).most_common()]
f2 = fdist[0:10]                   # 10 most frequent commands in the block
f3 = fdist[-10:]                   # 10 least frequent commands in the block
f2 = len(set(f2) & set(dist_max))  # overlap with the global top-50 set
f3 = len(set(f3) & set(dist_min))  # overlap with the global bottom-50 set
x = [f1, f2, f3]
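To make the scalarization concrete, here is a toy example; the command block and the two global sets are made up purely for illustration and are much smaller than the real ones:

from nltk.probability import FreqDist

cmd_block = ['ls', 'cat', 'ls', 'vi', 'grep', 'ls', 'cat', 'ps']   # made-up block
dist_max = {'ls', 'cat', 'cd', 'vi'}        # pretend "top 50" set
dist_min = {'ps', 'xinit', 'netstat'}       # pretend "bottom 50" set

f1 = len(set(cmd_block))                    # 5 distinct commands
fdist = [cmd for cmd, count in FreqDist(cmd_block).most_common()]
f2 = len(set(fdist[0:10]) & set(dist_max))  # 3 commands overlap with dist_max
f3 = len(set(fdist[-10:]) & set(dist_min))  # 1 command overlaps with dist_min
print([f1, f2, f3])                         # -> [5, 3, 1]

With a real 100-command block the top-10 and bottom-10 slices would generally differ; here the block has fewer than 10 distinct commands, so both slices see the whole set.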
3. Training the model
Load the per-sequence labels from the label file, where 0 marks a normal operation sequence and 1 marks an anomalous one:
with open(filename) as f:
    for line in f:
        line = line.strip('\n')
        print(line)
        x.append(int(line.split()[index]))
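For reference, each row of label.txt contains one whitespace-separated 0/1 flag per user, and `index` picks the column of the user being analyzed; the rows cover operation sequences 51 to 150. A tiny illustration with a made-up row:

line = "0 0 1 0 0"                   # made-up row: one flag per user
index = 2                            # third column, i.e. User3 with 0-based indexing
print(int(line.split()[index]))      # -> 1: this sequence is anomalous for User3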
Load the operation data and use the first 100 sequences for training and the last 50 for testing. Because the label file only covers sequences 51–150 and the first 50 sequences are normal, 50 zeros are prepended to the loaded labels:
user_cmd_list,user_cmd_dist_max,user_cmd_dist_min=load_user_cmd("G:/data/MasqueradeDat/User3")
user_cmd_feature=get_user_cmd_feature(user_cmd_list,user_cmd_dist_max,user_cmd_dist_min)
labels=get_label("G:/data/MasqueradeDat/label.txt",2)
y=[0]*50+labels
x_train=user_cmd_feature[0:N]
y_train=y[0:N]
x_test=user_cmd_feature[N:150]
y_test=y[N:150]
Create a KNN classifier (k = 3) and train it; a quick sketch for choosing k follows the snippet:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)
y_predict=neigh.predict(x_test)
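k = 3 is used here without any tuning. If you want to see how other values behave, a quick exploratory sketch (reusing x_train, y_train, x_test and y_test from above) could look like this:

from sklearn.neighbors import KNeighborsClassifier

# Compare a few values of k by their mean accuracy on the held-out blocks.
for k in (1, 3, 5, 7, 9):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(x_train, y_train)
    print(k, clf.score(x_test, y_test))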
4. Evaluating the results
Compute the prediction accuracy on the test set:
score=np.mean(y_test==y_predict)*100
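This works because NumPy broadcasts the comparison between the label list and the prediction array. The same number can also be obtained directly from scikit-learn, for example:

from sklearn import metrics

print(metrics.accuracy_score(y_test, y_predict) * 100)   # same accuracy as above
print(neigh.score(x_test, y_test) * 100)                 # score() also reports mean accuracy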
Complete code:
import sys
import urllib
import re
import numpy as np
import nltk
import csv
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
N=100
def load_user_cmd(filename):
    """Read a user's command file into blocks of 100 commands and collect
    the 50 most / least frequently used commands overall."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:              # every 100 commands form one operation sequence
                cmd_list.append(x)
                x = []
                i = 0
    # order the vocabulary by frequency so the slices below are meaningful
    fdist = [cmd for cmd, count in FreqDist(dist).most_common()]
    dist_max = set(fdist[0:50])
    dist_min = set(fdist[-50:])
    return cmd_list, dist_max, dist_min
def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):
    """Turn every 100-command block into a 3-dimensional feature vector."""
    user_cmd_feature = []
    for cmd_block in user_cmd_list:
        f1 = len(set(cmd_block))           # number of distinct commands
        fdist = [cmd for cmd, count in FreqDist(cmd_block).most_common()]
        f2 = fdist[0:10]                   # 10 most frequent commands in the block
        f3 = fdist[-10:]                   # 10 least frequent commands in the block
        f2 = len(set(f2) & set(dist_max))  # overlap with the global top-50 set
        f3 = len(set(f3) & set(dist_min))  # overlap with the global bottom-50 set
        user_cmd_feature.append([f1, f2, f3])
    return user_cmd_feature
def get_label(filename, index=0):
    x = []
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            print(line)
            x.append(int(line.split()[index]))
    return x
if __name__ == '__main__':
    user_cmd_list, user_cmd_dist_max, user_cmd_dist_min = load_user_cmd("G:/data/MasqueradeDat/User3")
    user_cmd_feature = get_user_cmd_feature(user_cmd_list, user_cmd_dist_max, user_cmd_dist_min)
    labels = get_label("G:/data/MasqueradeDat/label.txt", 2)   # column 2 = User3 (0-based)
    y = [0] * 50 + labels          # the first 50 sequences are normal and carry no label
    x_train = user_cmd_feature[0:N]
    y_train = y[0:N]
    x_test = user_cmd_feature[N:150]
    y_test = y[N:150]
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(x_train, y_train)
    y_predict = neigh.predict(x_test)
    score = np.mean(y_test == y_predict) * 100
    # print(y)
    # print(y_train)
    print(y_test)
    print(y_predict)
    print(score)
    print(classification_report(y_test, y_predict))
    print(np.array(labels).shape)
The approach above only compares the most and least frequent commands, so the test results may not be ideal. Next we compare against the full command vocabulary, and for validation we use 10-fold cross-validation (10 rounds of sampling and validation) to make the result more trustworthy.
import sys
import urllib
import re
import numpy as np
import nltk
import csv
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score
N = 90
def load_user_cmd(filename):
    """Read a user's command file into blocks of 100 commands and collect
    the full command vocabulary."""
    cmd_list = []
    dist = []
    with open(filename) as f:
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            i += 1
            dist.append(line)
            if i == 100:          # every 100 commands form one operation sequence
                cmd_list.append(x)
                x = []
                i = 0
    # only the vocabulary itself matters here, not the frequency order
    fdist = list(FreqDist(dist).keys())
    return cmd_list, fdist
def get_user_cmd_feature(user_cmd_list, dist):
    """Encode each block as a presence vector over the full command vocabulary."""
    user_cmd_feature = []
    dist = list(dist)                 # accept either a list or a dict_keys view
    for cmd_list in user_cmd_list:
        v = [0] * len(dist)
        for i in range(0, len(dist)):
            if dist[i] in cmd_list:   # 1 if the i-th vocabulary command occurs in this block
                v[i] += 1
        user_cmd_feature.append(v)
    return user_cmd_feature
def get_label(filename, index=0):
    x = []
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            x.append(int(line.split()[index]))
    return x
if __name__ == '__main__':
    user_cmd_list, dist = load_user_cmd("G:/data/MasqueradeDat/User3")
    print("Dist:(%s)" % dist)
    user_cmd_feature = get_user_cmd_feature(user_cmd_list, dist)
    labels = get_label("G:/data/MasqueradeDat/label.txt", 2)   # column 2 = User3 (0-based)
    y = [0] * 50 + labels
    x_train = user_cmd_feature[0:N]
    y_train = y[0:N]
    x_test = user_cmd_feature[N:150]
    y_test = y[N:150]
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(x_train, y_train)
    y_predict = neigh.predict(x_test)
    # 10-fold cross-validation over all 150 feature vectors
    print(cross_val_score(neigh, user_cmd_feature, y, n_jobs=-1, cv=10))
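cross_val_score returns one accuracy value per fold; summarizing with the mean (and optionally the standard deviation) gives a single, more readable number. Reusing neigh, user_cmd_feature and y from the program above:

scores = cross_val_score(neigh, user_cmd_feature, y, n_jobs=-1, cv=10)
print(scores.mean(), scores.std())   # average accuracy over the 10 folds and its spread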