//1 package com.example.weblog; import com.example.mapreduce.WordCountReducer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; public class WebLogDriver { // mapreduce的Driver // 提交job public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { //设置操作集群时的用户名为root System.setProperty("HADOOP_USER_NAME", "root"); // 1. 获取配置信息以及获取job对象 Configuration conf = new Configuration(); Job job = Job.getInstance(conf); // 2. 关联本地的jar job.setJarByClass(WebLogDriver.class); // 3. 关联Mapper和Reducer job.setMapperClass(WebLogMapper.class); // 4. 设置Mapper输出的KV类型 job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(NullWritable.class); // 5. 设置最终输出KV类型 // // 6. 设置输入和输出路径。请确保wcinput是存在的,并且下面有待统计词频的单词文件。 // // output1会自动被创建,如果它已经存在,程序会报错! // FileInputFormat.setInputPaths(job, new Path("E://vm//wordcount")); // FileOutputFormat.setOutputPath(job, new Path("E://vm//wordcount1")); //改成集群的地址 FileInputFormat.setInputPaths(job, new Path("D:\\vm\\weblogs")); FileOutputFormat.setOutputPath(job, new Path("D:\\vm\\weblog_us12")); // 7. 提交job boolean b = job.waitForCompletion(true); System.exit(b ? 0 : 1); } }
//2
package com.example.weblog; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; //1.继承Mapper类 //2.重写map方法 public class WebLogMapper extends Mapper<LongWritable, Text, Text, NullWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { System.out.println(value); System.out.println("========================================="); //1.获取一行数据,使用空格拆分,得到字段数组 String[] words = value.toString().split("\\s+"); System.out.println(words.length); System.out.println("========================================="); //2.如果有6个字段,我们就保留这条数据 if (words.length == 8) { context.write(value, NullWritable.get()); } } }