继续向下看,第二个Job
.................... .................... .................... // read the subdirectories generated in the temp // output and turn them into segments List<Path> generatedSegments = new ArrayList<Path>(); //读取上个job生成的多个fetchlist的segment FileStatus[] status = fs.listStatus(tempDir); try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); //过滤掉不是以fetchlist开头的文件 if (!subfetchlist.getName().startsWith("fetchlist-")) continue; // start a new partition job for this segment //一个partition Job 对segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); generatedSegments.add(newSeg); } } catch (Exception e) { LOG.warn("Generator: exception while partitioning segments, exiting ..."); fs.delete(tempDir, true); return null; } if (generatedSegments.size() == 0) { LOG.warn("Generator: 0 records selected for fetching, exiting ..."); LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); return null; } .................... .................... ....................
// 这里的分区主要是通过 URLPartitioner 来完成的;具体按哪一种方式分区,是通过参数来配置的,
// 可选的有 PARTITION_MODE_DOMAIN、PARTITION_MODE_IP 等;默认按 URL 所属 host 的 hashCode 来分。
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists) throws IOException { // invert again, partition by host/domain/IP, sort by url hash if (LOG.isInfoEnabled()) { LOG.info("Generator: Partitioning selected urls for politeness."); } //产生一个新的目录,以当前时间明明 Path segment = new Path(segmentsDir, generateSegmentName()); //在上面的目录下,再产生一个特定的crawl_generate目录 Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); LOG.info("Generator: segment: " + segment); NutchJob job = new NutchJob(getConf()); job.setJobName("generate: partition " + segment); job.setInt("partition.url.seed", new Random().nextInt()); FileInputFormat.addInputPath(job, inputDir); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(SelectorInverseMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(SelectorEntry.class); job.setPartitionerClass(URLPartitioner.class); job.setReducerClass(PartitionReducer.class); job.setNumReduceTasks(numLists); FileOutputFormat.setOutputPath(job, output); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); JobClient.runJob(job); return segment; }