Problem with the previous post's approach: every MR job re-reads the full set of raw logs it needs to process; once the data volume gets large, this falls apart.
ETL: the full raw data is not convenient to compute on directly. It is better to run one extra processing pass first, and only then do the per-dimension statistics you actually want:
Parse out the fields you need: IP ==> country/province/city information
Drop the fields you don't need: there are far too many of them
Fields kept after ETL: ip / time / url / page_id / country / province / city
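ETLApp below writes each cleaned record as a single tab-separated line in the following fixed field order, which is exactly what LogParser.parseV2 later splits on "\t" and reads back by position:

ip \t country \t province \t city \t url \t time \t pageId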
1. ETLApp
import com.imooc.bigdata.hadoop.mr.project.utils.ContentUtils;
import com.imooc.bigdata.hadoop.mr.project.utils.LogParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Map;

public class ETLApp {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path(args[1]);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(ETLApp.class);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

        private LogParser logParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String log = value.toString();
            Map<String, String> info = logParser.parse(log);

            String ip = info.get("ip");
            String country = info.get("country");
            String province = info.get("province");
            String city = info.get("city");
            String url = info.get("url");
            String time = info.get("time");
            String pageId = ContentUtils.getPageId(url);

            StringBuilder builder = new StringBuilder();
            builder.append(ip).append("\t");
            builder.append(country).append("\t");
            builder.append(province).append("\t");
            builder.append(city).append("\t");
            builder.append(url).append("\t");
            builder.append(time).append("\t");
            builder.append(pageId);

            context.write(NullWritable.get(), new Text(builder.toString()));
        }
    }
}
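Note that ETLApp registers no reducer, so Hadoop still runs a default identity reduce; adding job.setNumReduceTasks(0) in the driver would make it an explicit map-only job and skip that unnecessary shuffle.

ContentUtils.getPageId is not listed in this post. A minimal sketch of what it could look like, assuming the page id is carried in the URL as a topicId=<digits> query parameter; the regex is an assumption and must be adapted to the real URL format in your logs:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ContentUtils {

    // Assumption: the page id appears in the URL as "topicId=<digits>"; adjust to your log format.
    private static final Pattern PAGE_ID_PATTERN = Pattern.compile("topicId=(\\d+)");

    public static String getPageId(String url) {
        String pageId = "";
        if (url == null) {
            return pageId;
        }
        Matcher matcher = PAGE_ID_PATTERN.matcher(url);
        if (matcher.find()) {
            pageId = matcher.group(1);
        }
        return pageId;
    }
}

Next, parseV2 is added to LogParser so the stat jobs can read the ETL output back: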
import org.apache.commons.lang.StringUtils;

import java.util.HashMap;
import java.util.Map;

public class LogParser {

    // parseV2 parses the ETL output: one tab-separated record in the order
    // ip, country, province, city, url, time, pageId
    public Map<String, String> parseV2(String log) {
        Map<String, String> info = new HashMap<>();
        if (StringUtils.isNotBlank(log)) {
            String[] splits = log.split("\t");

            String ip = splits[0];
            String country = splits[1];
            String province = splits[2];
            String city = splits[3];
            info.put("ip", ip);
            info.put("country", country);
            info.put("province", province);
            info.put("city", city);

            String url = splits[4];
            info.put("url", url);

            String time = splits[5];
            info.put("time", time);

            String pageId = splits[6];
            info.put("pageId", pageId);
        }
        return info;
    }
}
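ETLApp calls logParser.parse(log), which parses the original raw access log; only parseV2 (for the ETL output) is listed above. The piece the ETL really adds is the IP ==> region lookup. A rough sketch of parse(), to sit alongside parseV2 inside LogParser, assuming the raw log is also tab-separated with the IP in the 14th field (index 13) and that IPParser exposes an analyse(ip) method returning a RegionInfo with getCountry/getProvince/getCity; those positions and method names are assumptions that must match your actual project code:

public Map<String, String> parse(String log) {
    Map<String, String> info = new HashMap<>();
    if (StringUtils.isNotBlank(log)) {
        String[] splits = log.split("\t");

        String ip = splits[13];  // assumption: the IP is the 14th tab-separated field of the raw log
        info.put("ip", ip);

        // Assumed IPParser API: resolve the IP to country/province/city.
        IPParser.RegionInfo regionInfo = IPParser.getInstance().analyse(ip);
        if (regionInfo != null) {
            info.put("country", regionInfo.getCountry());
            info.put("province", regionInfo.getProvince());
            info.put("city", regionInfo.getCity());
        }

        // url and time are copied through from their positions in the raw log
        // (format-specific, so not shown here).
    }
    return info;
}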
2. PageStatV2App
import com.imooc.bigdata.hadoop.mr.project.utils.ContentUtils;
import com.imooc.bigdata.hadoop.mr.project.utils.LogParser;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Map;

/**
 * Page-view count per page (pageId), computed from the ETL output.
 */
public class PageStatV2App {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path(args[1]);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(PageStatV2App.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private LongWritable ONE = new LongWritable(1);
        private LogParser logParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String log = value.toString();
            Map<String, String> info = logParser.parseV2(log);

            String url = info.get("url");
            if (StringUtils.isNotBlank(url)) {
                String pageId = ContentUtils.getPageId(url); // use the utility class to extract the pageId from the url
                context.write(new Text(pageId), ONE);
            }
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long count = 0;
            for (LongWritable value : values) {
                count++;
            }
            context.write(key, new LongWritable(count));
        }
    }
}
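The reducer above counts records with count++, which is correct only because every incoming value is exactly 1 and no combiner is registered. If you want to cut shuffle traffic by reusing the reducer as a combiner, the reduce logic has to sum the values instead of counting them, roughly like this (the same applies to the reducers in the province and PV jobs below):

static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();   // sum partial counts, so a combiner can pre-aggregate safely
        }
        context.write(key, new LongWritable(count));
    }
}

With that change, job.setCombinerClass(MyReducer.class) can be added next to job.setReducerClass(MyReducer.class) in the driver.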
3. ProvinceStatV2App
import com.imooc.bigdata.hadoop.mr.project.utils.LogParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Map;

/**
 * Page-view count per province, computed from the ETL output
 * (the IP was already resolved to a province during ETL, so no IPParser is needed here).
 */
public class ProvinceStatV2App {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path(args[1]);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(ProvinceStatV2App.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private LongWritable ONE = new LongWritable(1);
        private LogParser logParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String log = value.toString();
            Map<String, String> info = logParser.parseV2(log);
            context.write(new Text(info.get("province")), ONE);
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long count = 0;
            for (LongWritable value : values) {
                count++;
            }
            context.write(key, new LongWritable(count));
        }
    }
}
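One thing to watch in this mapper: on a blank or malformed line, info.get("province") comes back null and new Text(null) throws a NullPointerException. A guarded version of map, mirroring the StringUtils.isNotBlank check that PageStatV2App applies to the url (requires import org.apache.commons.lang.StringUtils):

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String log = value.toString();
    Map<String, String> info = logParser.parseV2(log);

    String province = info.get("province");
    if (StringUtils.isNotBlank(province)) {
        context.write(new Text(province), ONE);
    }
    // lines without a province (blank or bad records) are simply skipped
}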
4. PVStatV2App
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PVStatV2App {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path(args[1]);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(PVStatV2App.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private Text KEY = new Text("key");
        private LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(KEY, ONE);
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, NullWritable, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long count = 0;
            for (LongWritable value : values) {
                count++;
            }
            context.write(NullWritable.get(), new LongWritable(count));
        }
    }
}
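To tie the section together: the fix for the problem stated at the top is that ETLApp reads the raw logs exactly once, and every stat job then reads the much smaller ETL output. The jobs are submitted one after another; a minimal sketch of that run order, assuming each class is invoked with its own (input, output) argument pair as in the main() methods above and that this class sits in the same package as the four apps (the paths are placeholders):

public class PipelineRunner {
    public static void main(String[] args) throws Exception {
        // Placeholder paths: replace with the real HDFS locations.
        String rawInput  = "input/raw";   // original access logs
        String etlOutput = "output/etl";  // cleaned, tab-separated records

        // Step 1: run the ETL once over the full raw logs.
        ETLApp.main(new String[]{rawInput, etlOutput});

        // Step 2: every stat job reads the ETL output instead of the raw logs.
        PVStatV2App.main(new String[]{etlOutput, "output/pv"});
        PageStatV2App.main(new String[]{etlOutput, "output/page"});
        ProvinceStatV2App.main(new String[]{etlOutput, "output/province"});
    }
}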