K近邻算法的思路:如果一个样本在空间上最近的K邻居大多数都属于M类,则该样本属于M类。在本章中,使用K近邻算法识别用户操作序列中的异常命令。
分析数据集url:http://www.schonlau.net/
数据集说明:
50个用户的linux操作日志
以User开头的文件为用户命令,总共有50个用户,每个文件记录了用户的15000条命令;其中前5000条是正常操作,而后10000条则包含部分异常操作
label.txt是一个100行、50列的文件,每一列代表一个用户,而每一行则代表对该用户每100条命令的标注(异常命令只会出现在第5001~15000条共10000条命令内;每100条命令内只要出现一次异常操作,则认为该段异常)

方法一:
#对于每100个操作序列,选取以下特征
#特征1:不重复命令个数
#特征2:操作最频繁的前10个命令,与数据集内操作最频繁的前50个命令,计算重合度
#特征3:操作最不频繁的前10个命令,与数据集内操作最不频繁的前50个命令,计算重合度
代码如下:
#coding:utf-8
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
#对于每100个操作序列,选取以下特征
#特征1:不重复命令个数
#特征2:操作最频繁的前10个命令,与数据集内操作最频繁的前50个命令,计算重合度
#特征3:操作最不频繁的前10个命令,与数据集内操作最不频繁的前50个命令,计算重合度
# Directory named "data" located next to this script; every input file
# (User1..User50 and label.txt) is read from here.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATAPATH = os.path.normpath(_SCRIPT_DIR + "/data")
def parse_data(data_path=None, user_count=50):
    """Count how often each command occurs in every user's log file.

    Args:
        data_path: Directory holding the ``User<N>`` files. Defaults to the
            module-level ``DATAPATH``.
        user_count: Number of user files to read (``User1`` up to
            ``User<user_count>``).

    Returns:
        A list with one entry per user. Each entry is a list of
        ``(command, count)`` tuples sorted by descending count.
    """
    if data_path is None:
        data_path = DATAPATH
    rtn = []
    for i in range(1, user_count + 1):
        full_path = data_path + "/User" + str(i)
        counts = {}
        with open(full_path, "r") as f:
            # Stream the file line by line; dict.get avoids the per-line
            # scan that `line in counts.keys()` costs on Python 2.
            for line in f:
                cmd = line.strip()
                counts[cmd] = counts.get(cmd, 0) + 1
        rtn.append(sorted(counts.items(), key=lambda x: x[1], reverse=True))
    return rtn
def parse_all_data(ls):
    """Merge per-user command counts into one global frequency ranking.

    Args:
        ls: A list of per-user lists of ``(command, count)`` pairs, as
            produced by ``parse_data``.

    Returns:
        A list of ``(command, total_count)`` tuples sorted by descending
        total count.
    """
    totals = {}
    for user_counts in ls:
        for item in user_counts:
            # Start each command at 0. Seeding a new command with 1 and then
            # adding its count inflated every total by one.
            totals[item[0]] = totals.get(item[0], 0) + item[1]
    return sorted(totals.items(), key=lambda x: x[1], reverse=True)
def parse_user_data(data_path=None, user_count=50):
    """Load every user's command sequence in file order.

    Args:
        data_path: Directory holding the ``User<N>`` files. Defaults to the
            module-level ``DATAPATH``.
        user_count: Number of user files to read (``User1`` up to
            ``User<user_count>``).

    Returns:
        A list with one entry per user. Each entry is the list of that
        user's commands (one per input line, surrounding whitespace
        stripped).
    """
    if data_path is None:
        data_path = DATAPATH
    rtn = []
    for i in range(1, user_count + 1):
        full_path = data_path + "/User" + str(i)
        with open(full_path, "r") as f:
            rtn.append([line.strip() for line in f])
    return rtn
def parse_label_data(label_path=None, user_count=50):
    """Read the label matrix and return it as one label list per user.

    Each line of the label file holds one whitespace-separated integer per
    user, so user ``i``'s labels are column ``i`` of the file.

    Args:
        label_path: Path of the label file. Defaults to ``label.txt`` inside
            the module-level ``DATAPATH``.
        user_count: Number of columns (users) to read from every line.

    Returns:
        A list of ``user_count`` lists. Entry ``i`` holds the integer labels
        of user ``i`` in file order.

    Raises:
        IndexError: If a non-blank line has fewer than ``user_count`` fields.
        ValueError: If a field is not an integer.
    """
    if label_path is None:
        label_path = DATAPATH + "/label.txt"
    ls = [[] for _ in range(user_count)]
    with open(label_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no labels; skip it
                # instead of failing on fields[0].
                continue
            for i in range(user_count):
                ls[i].append(int(fields[i]))
    return ls
if __name__ == '__main__':
    # Per-user command counts, then the global ranking (descending count)
    # from which the 50 most and 50 least frequent commands are taken.
    data = parse_data()
    all_data = parse_all_data(data)
    top_cmd_50 = [item[0] for item in all_data[0:50]]
    last_cmd_50 = [item[0] for item in all_data[-50:]]
    all_user_data = parse_user_data()
    all_label_data = parse_label_data()
    # Train and evaluate a separate model for each of the 50 users.
    for m in range(0, 50):
        testdata = all_user_data[m]
        train_data = list()
        # One feature vector per window of 100 consecutive commands.
        for i in range(0,15000,100):
            cmds = testdata[i:i+100]
            # Feature 1: number of distinct commands in the window.
            feature1 = len(set(cmds))
            cmd_dict = dict()
            for cmd in cmds:
                if cmd not in cmd_dict.keys():
                    cmd_dict[cmd] = 1
                else:
                    cmd_dict[cmd] = cmd_dict[cmd] + 1
            cmd_list = sorted(cmd_dict.items(), key=lambda x:x[1], reverse=True)
            top_cmd_10 = [item[0] for item in cmd_list[0:10]]
            last_cmd_10 = [item[0] for item in cmd_list[-10:]]
            # Feature 2: overlap of the window's 10 most frequent commands
            # with the 50 globally most frequent commands.
            feature2 = len(set(top_cmd_10)&set(top_cmd_50))
            # Feature 3: overlap of the window's 10 least frequent commands
            # with the 50 globally least frequent commands.
            feature3 = len(set(last_cmd_10)&set(last_cmd_50))
            train_data.append([feature1, feature2, feature3])
        # Labels: 50 zeros are prepended to this user's labels from label.txt
        # (the first 5000 commands, i.e. 50 windows, are described as normal).
        label_data = [0]*50 + all_label_data[m]
        # Fit the KNN model on the first 120 windows and validate on the last 30.
        model = KNeighborsClassifier(n_neighbors = 3)
        model.fit(train_data[0:120], label_data[0:120])
        test_result = model.predict(train_data[-30:])
        # The printed "precision" is the percentage of predictions equal to
        # the label. NOTE(review): relies on predict() returning a NumPy
        # array so that == compares element-wise against the list - confirm.
        print "user:", m+1, "precision:", np.mean(test_result==label_data[-30:])*100
        print "predict", test_result
        print "correct result", label_data[-30:]

运行效果如下:

方法二:
在50个命令文件中,取出所有命令(不重复)组成词集。然后对于每100个操作序列,根据它们在词集向量空间上的分布情况得到特征。
代码如下:
#coding:utf-8
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Directory named "data" located next to this script; the User<N> files and
# label.txt are read from here.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATAPATH = os.path.normpath(_SCRIPT_DIR + "/data")
#得到命令词集
def parse_word_dict(data_path=None, user_count=50):
    """Collect the vocabulary of distinct commands across all user files.

    Args:
        data_path: Directory holding the ``User<N>`` files. Defaults to the
            module-level ``DATAPATH``.
        user_count: Number of user files to read (``User1`` up to
            ``User<user_count>``).

    Returns:
        A sorted list of the distinct commands (one per input line,
        surrounding whitespace stripped).
    """
    if data_path is None:
        data_path = DATAPATH
    words = set()
    for i in range(1, user_count + 1):
        full_path = data_path + "/User" + str(i)
        with open(full_path, "r") as f:
            for line in f:
                words.add(line.strip())
    # Sort so that the feature-column order is the same on every run; plain
    # set iteration order is arbitrary.
    return sorted(words)
def parse_all_data(words, data_path=None, user_count=50, total_cmds=15000,
                   window=100):
    """Turn every window of commands into a 0/1 vector over the vocabulary.

    For each user file the first ``total_cmds`` commands are cut into
    consecutive windows of ``window`` commands. Each window becomes a list
    of ``len(words)`` integers where position ``n`` is 1 if ``words[n]``
    occurs in the window and 0 otherwise. Commands missing from ``words``
    are ignored.

    Args:
        words: The vocabulary, a sequence of command strings.
        data_path: Directory holding the ``User<N>`` files. Defaults to the
            module-level ``DATAPATH``.
        user_count: Number of user files to read (``User1`` up to
            ``User<user_count>``).
        total_cmds: Number of leading commands of every file to use.
        window: Number of commands per window.

    Returns:
        A flat list of vectors: all windows of user 1, then user 2, etc.

    Raises:
        IndexError: If a user file holds fewer than ``total_cmds`` commands.
    """
    if data_path is None:
        data_path = DATAPATH
    # Map each command to its first position in `words` so that a lookup is
    # O(1) instead of a scan of the whole vocabulary per command.
    index = {}
    for pos, word in enumerate(words):
        index.setdefault(word, pos)
    words_len = len(words)
    ls = []
    for i in range(1, user_count + 1):
        full_path = data_path + "/User" + str(i)
        with open(full_path, "r") as f:
            cmds = [line.strip() for line in f]
        if len(cmds) < total_cmds:
            raise IndexError(
                "%s holds %d commands, expected at least %d"
                % (full_path, len(cmds), total_cmds))
        for start in range(0, total_cmds, window):
            end = min(start + window, total_cmds)
            # Set-of-words vector for this window.
            clist = [0] * words_len
            for cmd in cmds[start:end]:
                pos = index.get(cmd)
                if pos is not None:
                    clist[pos] = 1
            ls.append(clist)
    return ls
def parse_label_data(label_path=None, user_count=50, normal_windows=50):
    """Read the label matrix and return one flat label list for all users.

    Each line of the label file holds one whitespace-separated integer per
    user, so user ``i``'s labels are column ``i``. Every user's column is
    prefixed with ``normal_windows`` zeros and the per-user lists are then
    concatenated in user order.

    Args:
        label_path: Path of the label file. Defaults to ``label.txt`` inside
            the module-level ``DATAPATH``.
        user_count: Number of columns (users) to read from every line.
        normal_windows: Number of 0 labels prepended to every user's column.

    Returns:
        A flat list of integer labels: all labels of user 1, then user 2,
        and so on.

    Raises:
        IndexError: If a non-blank line has fewer than ``user_count`` fields.
        ValueError: If a field is not an integer.
    """
    if label_path is None:
        label_path = DATAPATH + "/label.txt"
    columns = [[] for _ in range(user_count)]
    with open(label_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on fields[0].
                continue
            for i in range(user_count):
                # Convert to int so the labels have the same type as the
                # integer 0 padding added below.
                columns[i].append(int(fields[i]))
    rtnls = []
    for column in columns:
        rtnls.extend([0] * normal_windows + column)
    return rtnls
if __name__ == '__main__':
    # Build the vocabulary of distinct commands, one 0/1 vector per window
    # of 100 commands, and the label list passed alongside the vectors.
    words = parse_word_dict()
    test_data = parse_all_data(words)
    label_data = parse_label_data()
    neigh = KNeighborsClassifier(n_neighbors = 3)
    # 10-fold cross-validation
    scores = cross_val_score(neigh, test_data, label_data, cv=10)
    print scores
    # NOTE(review): no `scoring` argument is passed, so the value printed as
    # "precision" is presumably the estimator's default score (accuracy) -
    # confirm against the scikit-learn documentation.
    print "precision:",np.mean(scores)*100
10轮交叉验证的准确度如下:

本文介绍如何利用K近邻算法识别用户操作序列中的异常命令。通过分析50个用户的Linux操作日志,提取不重复命令数量、常用命令重合度等特征,并采用两种方法实现异常检测。
3643

被折叠的 条评论
为什么被折叠?



