//1 CustomerPartitioner.java
package com.example.mapreduce;

// Custom partitioner
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// 1. Extend the Partitioner class
// 2. Override the getPartition method
public class CustomerPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        // The return value is a partition number: 0, 1, 2, ...
        // If two keys get the same partition number, they go to the same
        // partition and end up in the same output file.
        // Words starting with a~m go to the first partition (0);
        // everything else goes to the second partition (1).

        // 1. Get the first character of the word
        char firstChar = key.toString().charAt(0);
        // 2. Check the first character
        if (firstChar >= 'a' && firstChar <= 'm') {
            return 0;
        } else {
            return 1;
        }
    }
}
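A quick way to sanity-check the partition logic without running a full job is to call getPartition directly. The sketch below is a hypothetical helper (the CustomerPartitionerCheck class and the sample words are not part of the project); note that words starting with an uppercase letter or a digit fall through to partition 1, because only lowercase 'a' to 'm' is matched.

// Minimal sketch for inspecting the partitioner in isolation (hypothetical helper,
// not part of the project). Assumes CustomerPartitioner above is on the classpath.
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class CustomerPartitionerCheck {
    public static void main(String[] args) {
        CustomerPartitioner partitioner = new CustomerPartitioner();
        String[] samples = {"apple", "mouse", "night", "zebra", "Apple"};
        for (String word : samples) {
            int partition = partitioner.getPartition(new Text(word), new LongWritable(1), 2);
            // "apple" and "mouse" -> 0; "night", "zebra" and "Apple" -> 1
            System.out.println(word + " -> partition " + partition);
        }
    }
}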
//2 WordCountDriver.java
package com.example.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {
    // The MapReduce driver: configures and submits the job
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Set the user name to root when operating against the cluster
        System.setProperty("HADOOP_USER_NAME", "root");

        // 1. Get the configuration and create the Job object
        Configuration conf = new Configuration();
        // Set the cluster address
        //conf.set("fs.defaultFS", "hdfs://douyin:8020");
        Job job = Job.getInstance(conf);

        // 2. Set the jar by the driver class
        job.setJarByClass(WordCountDriver.class);

        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 6. Set the input and output paths. Make sure the input directory exists
        //    and contains the text files whose words should be counted.
        //    The output directory is created automatically; if it already exists, the job fails.
        // FileInputFormat.setInputPaths(job, new Path("E://vm//wordcount"));
        // FileOutputFormat.setOutputPath(job, new Path("E://vm//wordcount1"));
        // Change these to cluster addresses when running against the cluster
        FileInputFormat.setInputPaths(job, new Path("D:\\vm\\wcinput"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\vm\\wcoutput1"));

        // Set the number of reduce tasks (matches the two partitions produced above)
        job.setNumReduceTasks(2);
        // Set the custom partitioner
        job.setPartitionerClass(CustomerPartitioner.class);

        // 7. Submit the job and wait for completion
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
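For reference, below is a rough sketch of what the driver could look like when submitting against HDFS instead of the local filesystem. The hdfs://douyin:8020 address comes from the commented-out line above; the class name WordCountClusterDriver and the /wcinput and /wcoutput1 HDFS paths are assumptions for illustration only.

// Hypothetical cluster variant of the driver (hostname taken from the commented-out
// line above; class name and HDFS paths are assumed).
package com.example.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountClusterDriver {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://douyin:8020");      // NameNode address
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountClusterDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path("/wcinput"));    // assumed HDFS input dir
        FileOutputFormat.setOutputPath(job, new Path("/wcoutput1")); // must not exist yet
        job.setNumReduceTasks(2);
        job.setPartitionerClass(CustomerPartitioner.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Packaged into a jar, such a driver would typically be submitted with something like `hadoop jar wordcount.jar com.example.mapreduce.WordCountClusterDriver` (jar name assumed).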
//3 WordCountMapper.java
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// 1. Extend Mapper
// 2. Override the map method
// LongWritable, Text: the input key/value types. LongWritable is the key type
//   (the byte offset of the line) and Text is the value type (the line itself).
// Text, LongWritable: the output key/value types of the map function. Text is
//   the key type (the word) and LongWritable is the value type (the count).
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Take one line of input and split it into individual words
        String[] words = value.toString().split(" ");
        // 2. For each word, emit a <word, 1> key/value pair
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
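To make the map step concrete: for a line such as "hello world hello", split(" ") yields three tokens and the mapper emits three pairs. The stand-alone trace below is purely illustrative (the MapSplitTrace class and the sample line are not part of the project); note that split(" ") would also produce empty tokens if the input contained consecutive spaces.

// Hypothetical trace of the splitting done in map() (sample line assumed):
public class MapSplitTrace {
    public static void main(String[] args) {
        String line = "hello world hello";          // stands in for one Text value
        for (String word : line.split(" ")) {
            // map() emits <hello,1>, <world,1>, <hello,1> for this line
            System.out.println("<" + word + ", 1>");
        }
    }
}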
//4 WordCountReducer.java
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// 1. Extend Reducer
// 2. Override the reduce method
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Aggregate the values for this key: sum the counts
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        // 2. Emit <word, total count>
        context.write(key, new LongWritable(sum));
    }
}
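The shuffle phase groups all <word,1> pairs by word, so reduce() sees each key once together with an iterable of its counts. The stand-alone sketch below illustrates only the summing logic (the ReduceSumTrace class and the sample counts are assumed, not part of the project).

// Hypothetical illustration of the reduce-side summation (sample counts assumed):
import java.util.Arrays;
import java.util.List;

public class ReduceSumTrace {
    public static void main(String[] args) {
        List<Long> values = Arrays.asList(1L, 1L, 1L);  // what the key "hello" might receive
        long sum = 0;
        for (long v : values) {
            sum += v;
        }
        // With the custom partitioner and two reduce tasks, "hello" ('h' <= 'm')
        // would land in part-r-00000 as the line "hello<TAB>3".
        System.out.println("<hello, " + sum + ">");
    }
}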