读代码-RandomSeedGenerator

最新推荐文章于 2024-03-24 10:17:29 发布

最新推荐文章于 2024-03-24 10:17:29 发布 · 357 阅读

文章标签：

#人工智能 #数据结构与算法

mahout 专栏收录该内容

11 篇文章

订阅专栏

package org.apache.mahout.clustering.kmeans;
public final class RandomSeedGenerator
完成中心点随机取样的过程

hdfs操作,比较普遍,先删除再新建


    // Resolve the file system that owns the output location.
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    // Remove whatever is currently at the output path before recreating it
    // (delete-then-create; presumably HadoopUtil.delete is a no-op when nothing exists -- confirm).
    HadoopUtil.delete(conf, output);
    Path outFile = new Path(output, "part-randomSeed");
    // The returned flag gates the sampling step: seeds are only generated under `if (newFile)`.
    boolean newFile = fs.createNewFile(outFile);

遍历hdfs路径框架
fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
globStatus返回了匹配pattern的所有路径
logsCRCFilter 会过滤掉三类路径：以“_”开头的日志文件、以“.”开头的隐藏文件，以及 .crc 校验文件
循环时滤掉文件夹,只处理文件


    if (newFile) {
      Path inputPathPattern;

      if (fs.getFileStatus(input).isDir()) {
        inputPathPattern = new Path(input, "*");
      } else {
        inputPathPattern = input;
      }

      FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
      for (FileStatus fileStatus : inputFiles) {
        if (fileStatus.isDir()) {
          continue;
        }
        //process file
      }

    }

初始化writer
准备k个容量的list存储选出的值


      // Sequence-file writer targeting outFile, declared with Text keys and Cluster values.
      SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
      // Source of randomness for the sampling step.
      Random random = RandomUtils.getRandom();
      // Parallel lists holding the current sample: entry i of chosenTexts is the key text
      // for entry i of chosenClusters. Both are pre-sized for k entries.
      List<Text> chosenTexts = new ArrayList<Text>(k);
      List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
      // Id handed to the next Cluster that gets constructed; starts at 0.
      int nextClusterId = 0;

随机的核心——蓄水池抽样（reservoir sampling）算法：只遍历一次输入，前 k 条记录直接放入样本，之后的每条记录按一定概率替换样本中随机选出的一个元素


        // Reservoir sampling (Algorithm R): keep a uniform random sample of k records
        // while reading the input exactly once. The first k records fill the reservoir;
        // the n-th record (n > k) must then be admitted with probability k/n, replacing
        // a uniformly chosen slot. The previous code admitted every later record with the
        // constant probability 1/(k+1), which biased the sample, and it called
        // random.nextInt(0) (IllegalArgumentException) when k == 0.
        for (Pair<Writable,VectorWritable> record
             : new SequenceFileIterable<Writable,VectorWritable>(fileStatus.getPath(), true, conf)) {
          Writable key = record.getFirst();
          VectorWritable value = record.getSecond();
          Cluster newCluster = new Cluster(value.get(), nextClusterId++, measure);
          newCluster.observe(value.get(), 1);
          Text newText = new Text(key.toString());

          // nextClusterId starts at 0 and was just incremented for this record, so it now
          // equals the number of records read so far (including this one), across all
          // input files. NOTE(review): assumes nothing else advances nextClusterId.
          int recordsSeen = nextClusterId;

          if (chosenTexts.size() < k) {
            // Reservoir not full yet: take the record unconditionally.
            chosenTexts.add(newText);
            chosenClusters.add(newCluster);
          } else {
            // Draw a slot in [0, recordsSeen); it lands inside the reservoir with
            // probability k/recordsSeen, and is then a uniformly random slot to evict.
            int slot = random.nextInt(recordsSeen);
            if (slot < k) {
              // Replace in place so the two parallel lists stay index-aligned.
              chosenTexts.set(slot, newText);
              chosenClusters.set(slot, newCluster);
            }
          }
        }