# coding: utf-8
import numpy as np
from math import sqrt
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.mllib.clustering import KMeans

# Training set: http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
trainFileName = "kddcup.data_10_percent_corrected.data"
# Test set: http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz
testFileName = "corrected.data"
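
# Each record in the KDD Cup 99 files is a comma-separated line of 41 features
# followed by an attack/normal label in the last field. Fields 1-3
# (protocol_type, service, flag) are categorical strings, which is why the
# parsing below keeps only numeric columns via the [4:-1] slice (this also
# drops the numeric `duration` column at index 0).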

conf = SparkConf().setAppName('anomalyDetect')
sc = SparkContext(conf=conf)

def ReadData(fileName):
    # Read the raw CSV lines into an RDD with 10 partitions
    rdd = sc.textFile(fileName, 10)
    return rdd

if __name__ == '__main__':
    print('run..')
    inData = ReadData(trainFileName).cache()
    totalNum = inData.count()
    print('totalNum =', totalNum)
    # Parse the numeric feature columns; the last comma-separated field is the label
    trainData = inData.map(lambda line: np.array([float(x) for x in line.split(',')[4:-1]]))
    labelsAndData = inData.map(lambda line: (line.split(',')[-1], np.array([float(x) for x in line.split(',')[4:-1]])))
    trainData.cache()
    labelsAndData.cache()
    # One pass over the data collects the per-column sum and sum of squares,
    # from which the mean and population standard deviation follow
    sum1 = trainData.reduce(lambda x, y: x + y)
    sum2 = trainData.map(lambda line: line * line).reduce(lambda x, y: x + y)
    means = sum1 / totalNum
    stdevs = np.maximum(0.0000001, np.sqrt(sum2 * totalNum - sum1 * sum1) / totalNum)
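    # Sanity check of the one-pass formula above (illustrative, not part of the job):
    # sqrt(n * sum(x^2) - sum(x)^2) / n equals the population std of x.
    # For x = [1, 2, 3, 4]: sum1 = 10, sum2 = 30, n = 4,
    # so sqrt(30*4 - 10*10) / 4 = sqrt(20) / 4 ≈ 1.118, matching np.std([1, 2, 3, 4]).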
    # Standardize each feature to zero mean and unit variance
    norData = trainData.map(lambda line: (line - means) / stdevs)
    norLabelsData = labelsAndData.mapValues(lambda line: (line - means) / stdevs)
    # Training iterates over the training set repeatedly, so caching here is essential
    norData.cache()
    norLabelsData.cache()
    numK = 0
    # range(5, 10, 5) yields only k = 5, so this loop (and the break below) runs once;
    # widen the range to sweep more values of k
    for numK in range(5, 10, 5):
        # The `runs` argument used with older MLlib is deprecated and ignored in
        # newer Spark releases, so it is omitted here
        model = KMeans.train(norData, numK, maxIterations=10, initializationMode='random')
        ansRdd = norLabelsData.map(lambda line: (model.predict(line[1]), line[0]))

        def CalError(line):
            # Euclidean distance from a point to its assigned cluster center
            kCenter = np.array(model.clusterCenters[model.predict(line)])
            dis = kCenter - line
            return sqrt(np.sum(dis * dis))

        totalErr = norData.map(lambda line: CalError(line)).reduce(lambda x, y: x + y) / totalNum
        print('numK =', numK, ' and the error is ', totalErr)
        break
    # Group the training labels by the cluster they were assigned to
    clusterRdd = ansRdd.groupByKey().mapValues(lambda line: list(line))
    clusterLabels = [''] * numK
    for (cid, line) in clusterRdd.collect():
        dics = sc.parallelize(line).map(lambda line: (line, 1)).countByKey()
        labels = sorted(dics.items(), key=lambda x: x[1], reverse=True)
        # Use the most frequent label within the cluster as the cluster's label
        clusterLabels[cid] = labels[0][0]
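    # Note: the countByKey above re-parallelizes an already-collected Python list;
    # a purely local count gives the same result, e.g. (illustrative alternative):
    #     from collections import Counter
    #     clusterLabels[cid] = Counter(line).most_common(1)[0][0]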
    inTestData = ReadData(testFileName).cache()
    testData = inTestData.map(lambda line: (line.split(',')[-1], np.array([float(x) for x in line.split(',')[4:-1]])))
    # Standardize the test set with the means/stdevs computed on the training set
    norTestData = testData.mapValues(lambda line: (line - means) / stdevs)
    tn = norTestData.count()
    # A prediction counts as correct when the test label matches the majority
    # label of the cluster the point falls into
    accuracy = norTestData.map(lambda line: line[0] == clusterLabels[model.predict(line[1])]).reduce(lambda x, y: x + y) * 100.0 / tn
    print('the number of test data =', tn, ' and accuracy is ', accuracy)
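
# A minimal way to launch this script locally (the script file name and data
# paths are assumptions; adjust them for your setup):
#     spark-submit --master "local[4]" anomaly_detect.py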