今天 Canopy 算法的大部分内容都差不多理解了,但是还有许多疑问,尤其是算法过程中 T1 的作用还不是太清楚。今天主要是对通过
// Calls CanopyDriver.buildClusters and keeps the returned Path in clustersOut. Note that t1 and t2 are each passed twice.
// NOTE(review): presumably the second t1/t2 pair feeds a separate pair of thresholds — confirm against the CanopyDriver.buildClusters signature.
Path clustersOut = CanopyDriver.buildClusters(new Configuration(), directoryContainingConvertedInput, ouput, measure, t1, t2, t1, t2, 0, false);
确定中心点后,如何对各个 Vector 进行分类的 MapReduce 代码进行了分析。它只用了一个 mapper,其 setup 方法如下:
protected void setup(Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String clustersIn = conf.get("clusters_in"); this.threshold = (double)conf.getFloat("pdf_threshold", 0.0F); this.emitMostLikely = conf.getBoolean("emit_most_likely", false); this.clusterModels = new ArrayList();//获得族中心点 if (clustersIn != null && !clustersIn.isEmpty()) { Path clustersInPath = new Path(clustersIn); this.clusterModels = populateClusterModels(clustersInPath, conf); ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(clustersInPath));//获得策略 this.clusterClassifier = new ClusterClassifier(this.clusterModels, policy); } this.clusterId = new IntWritable(); }
下面是该 mapper 的 map 方法:
/**
 * Classifies one input vector against the loaded cluster models and emits the result.
 *
 * <p>Nothing is emitted when no cluster models were loaded. A vector that is not already a
 * {@code NamedVector} is wrapped in one, named after the key, when the key is a {@code Text}
 * or an {@code IntWritable}; with any other key type the vector is used as-is.
 *
 * @param key     the input key; used only to name the vector
 * @param vw      the wrapped input vector
 * @param context the Hadoop mapper context that receives the output
 * @throws IOException          propagated from the write helpers
 * @throws InterruptedException propagated from the write helpers
 */
protected void map(WritableComparable<?> key, VectorWritable vw, Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context context) throws IOException, InterruptedException {
    if (this.clusterModels.isEmpty()) {
        return;
    }

    boolean alreadyNamed = vw.get().getClass().equals(NamedVector.class);
    Vector point = vw.get();
    if (!alreadyNamed) {
        boolean textKey = key.getClass().equals(Text.class);
        boolean intKey = key.getClass().equals(IntWritable.class);
        if (textKey) {
            point = new NamedVector(point, key.toString());
        } else if (intKey) {
            point = new NamedVector(point, Integer.toString(((IntWritable) key).get()));
        }
    }

    Vector pdfPerCluster = this.clusterClassifier.classify(point);

    // Decide whether this vector should be assigned to a cluster at all.
    if (!this.shouldClassify(pdfPerCluster)) {
        return;
    }

    if (this.emitMostLikely) {
        // Emit only the cluster with the highest value, passing 1.0 as the last argument
        // (presumably the weight — confirm against write()).
        int bestCluster = pdfPerCluster.maxValueIndex();
        this.write(new VectorWritable(point), context, bestCluster, 1.0D);
    } else {
        this.writeAllAboveThreshold(new VectorWritable(point), context, pdfPerCluster);
    }
}
InpoutDriver的map很简单就是根据空格将stir
/**
 * Converts one line of text into a vector and writes it out.
 *
 * <p>The line is split with {@code SPACE}; empty tokens are skipped and every other token is
 * parsed as a double. If at least one value was parsed, a vector of that size is created
 * through the reflective {@code constructor}, filled in order, and written. The output key is
 * the number of values in the vector, rendered as text.
 *
 * @param key     the input key (unused)
 * @param values  the text line holding the numbers
 * @param context the Hadoop mapper context that receives the output
 * @throws IOException           propagated from {@code context.write}
 * @throws InterruptedException  propagated from {@code context.write}
 * @throws IllegalStateException if the vector cannot be instantiated reflectively
 */
protected void map(LongWritable key, Text values, Mapper<LongWritable, Text, Text, VectorWritable>.Context context) throws IOException, InterruptedException {
    Collection<Double> parsed = new ArrayList<>();
    for (String token : SPACE.split(values.toString())) {
        if (!token.isEmpty()) {
            parsed.add(Double.valueOf(token));
        }
    }

    if (parsed.isEmpty()) {
        return;
    }

    try {
        Vector result = (Vector) this.constructor.newInstance(parsed.size());
        int position = 0;
        for (Double component : parsed) {
            result.set(position++, component.doubleValue());
        }
        VectorWritable wrapped = new VectorWritable(result);
        // After the loop, position equals the element count; that count is the output key.
        context.write(new Text(String.valueOf(position)), wrapped);
    } catch (IllegalAccessException | InvocationTargetException | InstantiationException e) {
        throw new IllegalStateException(e);
    }
}