#!/usr/bin/env python

#coding:utf-8


import pyspark
import numpy as np
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.mllib.clustering import KMeans
from math import sqrt
from matplotlib.delaunay.testfuncs import TestData


#http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
trainFileName   = "kddcup.data_10_percent_corrected.data";
#http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz
testFileName    = "corrected.data";
conf    = SparkConf().setAppName('anormalyDetect');
sc      = SparkContext(conf=conf); 


def ReadData(fileName):
    """Load a text file into an RDD with 10 partitions.

    Uses the module-level SparkContext ``sc``. The original body mixed
    2-space and 4-space indentation, which raises IndentationError at
    import time; the body is normalized to a consistent 4-space indent.

    :param fileName: path/URI understood by ``SparkContext.textFile``.
    :return: RDD of raw text lines (one element per line).
    """
    rdd = sc.textFile(fileName, 10);
    return rdd;


if __name__ == '__main__':
    
    print 'run..'
    
    # Load the KDD Cup 99 10% training set and cache it: it is scanned once
    # for count() and again to build the feature RDDs below.
    inData      = ReadData(trainFileName).cache();  
    totalNum    = inData.count();
    
    print 'totalNum = ', totalNum;
    
    # Feature vector = columns 4..second-to-last parsed as floats; the last
    # column is the class label. NOTE(review): this skips the first four
    # columns — presumably because some of them are categorical (protocol,
    # service, flag) — TODO confirm against the KDD field list.
    trainData       = inData.map(lambda line: np.array([float(x) for x in line.split(',')[4:-1]]));
    # (label, feature-vector) pairs, built from the same split.
    labelsAndData   = inData.map(lambda line: (line.split(',')[-1], np.array([float(x) for x in line.split(',')[4:-1]])));
    
    trainData.cache();
    labelsAndData.cache();
    
    # Elementwise sums over all vectors: sum1 = sum(x), sum2 = sum(x^2).
    sum1 = trainData.reduce(lambda x, y: x + y);
    sum2 = trainData.map(lambda line: line * line).reduce(lambda x,  y: x + y);
    
    # Per-feature mean and population standard deviation:
    # sqrt(n*sum2 - sum1^2)/n == sqrt(E[x^2] - E[x]^2); the small floor
    # avoids division by zero for constant features during normalization.
    means   = sum1 / totalNum;
    stdevs  = np.maximum(0.0000001, np.sqrt(sum2 * totalNum - sum1 * sum1) / totalNum);
    
    # Z-score normalization of both the unlabeled and labeled RDDs.
    norData         = trainData.map(lambda line: (line - means) / stdevs);
    norLabelsData   = labelsAndData.mapValues(lambda line: (line - means) / stdevs);
    
    # Training iterates over the training set repeatedly, so caching here is essential
    norData.cache();
    norLabelsData.cache();
    
    # NOTE(review): range(5, 10, 5) yields only [5], and the loop body ends
    # with `break` anyway, so exactly one model (k=5) is trained; `numK`,
    # `model` and `ansRdd` deliberately leak out of the loop for use below.
    numK = 0;
    for numK in range(5, 10, 5):
        model   = KMeans.train(norData, numK, maxIterations=10, runs=3, initializationMode='random');
        # (cluster-id, true-label) pairs for every training point.
        ansRdd  = norLabelsData.map(lambda line: (model.predict(line[1]), line[0]));
        
        def CalError(line):
            # Euclidean distance from a point to its assigned cluster center.
            kCenter = np.array(model.centers[model.predict(line)]);
            dis     = kCenter - line;
            return sqrt(np.sum(dis * dis));
        
        # Mean distance-to-center over the training set (clustering cost).
        totalErr = norData.map(lambda line: CalError(line)).reduce(lambda x, y: x + y) / totalNum;
        print 'numK = ', numK, ' and ths error is ', totalErr;
        break;
    
    # Group the true labels by cluster id to vote on each cluster's label.
    clusterRdd   = ansRdd.groupByKey().mapValues(lambda line: list(line));
    clusterLabels = [''] * numK;
    for (cid, line) in clusterRdd.collect():
        # Count occurrences of each label within this cluster.
        dics    = sc.parallelize(line).map(lambda line: (line, 1)).countByKey();
        labels  = sorted(dics.iteritems(), key=lambda x: x[1], reverse=True);
        # Use the most frequent label within this cluster as the cluster's label
        clusterLabels[cid] = labels[0][0];
    
    # Evaluate on the held-out "corrected" test set, normalized with the
    # TRAINING means/stdevs (as it must be).
    inTestData  = ReadData(testFileName).cache();
    testData    = inTestData.map(lambda line: (line.split(',')[-1], np.array([float(x) for x in line.split(',')[4:-1]])));
    norTestData = testData.mapValues(lambda line: (line - means) / stdevs);
    tn          = norTestData.count();
    # Accuracy: fraction of test points whose true label matches the voted
    # label of their predicted cluster (booleans sum as 0/1 in the reduce).
    accaurcy    = norTestData.map(lambda line: line[0] == clusterLabels[model.predict(line[1])]).reduce(lambda x, y: x + y) * 100.0 / tn;
    
    print 'the number of test data = ', tn, ' and accaurcy is ', accaurcy;
    
    
    
    
        
        
    
     
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
# (removed: CSDN blog-page payment/boilerplate text accidentally pasted into the source)