//1.从Mapper继承 //2.重写map方法 //LongWritable,Text:表示初始输入的键值对格式。LongWritable是键的数据类型,Text是值的数据类型 //Text,LongWritable:表示map函数输出的数据的格式。Text是键的数据类型,LongWritable是值的数据类型 public class WeblogMapper extends Mapper<LongWritable,Text, Text, NullWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { System.out.println(key); //1.获取一行数据,使用空格拆分,得到字段组 String[] words= value.toString().split("\\s+"); System.out.println(words.length); System.out.println("=============================================================="); //2.如果有6个字段,我们就保留这条数据 if (words.length == 9) { context.write(value, NullWritable.get()); } } }
package com.example.weblog; //它要做七件事。核心是提交任务给hadoop import com.example.mapreduce.WordCountMapper; import com.example.mapreduce.WordCountReducer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; public class WeblogDriver { public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { // 设置操作集群时的用户名为root System.setProperty("HADOOP_USER_NAME", "root"); // 1.获取配置信息,初始化Job Configuration conf = new Configuration(); Job job = Job.getInstance(conf); // 2.关联Driver类 job.setJarByClass(WeblogDriver.class); // 3.关联Mapper和Reducer类 job.setMapperClass(WeblogMapper.class); // 4.设置Mapper输出的key和value类型 job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(NullWritable.class); 6.设置输入和输出路径:我们要处理的单词所在的文件在哪里?要把最终的结果保存到哪里? FileInputFormat.setInputPaths(job, new Path("D://vm//weblogs")); FileOutputFormat.setOutputPath(job, new Path("D://vm//weblogs_out123456")); // 改成集群上的地址。 /cinput:/表示集群的根目 // FileInputFormat.setInputPaths(job, new Path("/weblog")); // FileOutputFormat.setOutputPath(job, new Path("/weblog_out")); // 7.提交任务 boolean result = job.waitForCompletion(true); System.exit(result ? 0 : 1); } }