大数据（数据清洗）

原创于 2025-05-14 14:46:41 发布 · 719 阅读

CC 4.0 BY-SA版权

文章标签：

一、先修改pom.xml，在 <project> 节点内（如已有 <dependencies> 节点，则只需把其中的 <dependency> 加入该节点）添加下面的依赖配置

<!-- Declares the org.apache.hadoop:hadoop-client artifact (version 3.1.3) as a project dependency. -->
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
</dependencies>

二、创建WebLogMapper类

package org.example;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that filters web-log lines by field count.
 *
 * <p>Each input line is split on a single space character. A line is passed
 * through unchanged (as the output key, with a {@link NullWritable} value) only
 * when the split yields more than 7 fields, i.e. at least 8. All other lines
 * are dropped.
 */
public class WebLogMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
    /**
     * Filters one input line.
     *
     * @param key     input key supplied by the framework; not used here
     * @param value   the line of text to examine
     * @param context used to emit lines that pass the filter
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on a single space; consecutive spaces therefore produce empty
        // fields, which still count towards the total.
        final String line = value.toString();
        final String[] columns = line.split(" ");

        // Guard clause: a line with 7 or fewer fields is considered
        // meaningless and is discarded without emitting anything.
        if (columns.length <= 7) {
            return;
        }

        // The line is meaningful: print its first field, then emit the
        // original line unchanged.
        System.out.println(columns[0]);
        context.write(value, NullWritable.get());
    }
}

三、创建WebLogDriver类

package org.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the web-log cleaning job.
 *
 * <p>The job runs {@link WebLogMapper} with the number of reduce tasks set to
 * zero and declares {@link Text} / {@link NullWritable} as its output key and
 * value classes.
 *
 * <p>Usage: {@code WebLogDriver [inputPath [outputPath]]}. Either path may be
 * omitted, in which case the built-in default for that path is used, so running
 * with no arguments behaves exactly as the original hard-coded version did.
 */
public class WebLogDriver {
    /** Input file used when no input path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "E:\\vm\\weblogs\\web3.log";

    /**
     * Output directory used when no output path is given on the command line.
     * The spelling "ouput3" is kept as-is so the default location is unchanged.
     */
    private static final String DEFAULT_OUTPUT_PATH = "E:\\vm\\ouput3";

    /**
     * Builds the job, waits for it to finish, and exits the JVM with status 0
     * on success or 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries
     *             fall back to the defaults declared in this class
     * @throws Exception if job creation, configuration, or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Resolve paths: command-line arguments override the defaults so the
        // job is not tied to one machine's directory layout.
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1. Obtain the job instance.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Locate the jar by this driver class.
        job.setJarByClass(WebLogDriver.class);

        // 3. Attach the mapper.
        job.setMapperClass(WebLogMapper.class);

        // 4. Declare the final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Set the number of reduce tasks to 0.
        job.setNumReduceTasks(0);

        // 5. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 6. Submit the job and block until it completes.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}