MapReduce清洗日志数据统计PV量

本文介绍一个基于 Hadoop MapReduce 的网站页面访问量(PV)统计程序。Mapper 按制表符切分每行日志,跳过 URL 为空、字段数少于 30、省份 ID 为空或省份 ID 不是数字的记录(并分别用计数器记录被跳过的条数),对其余记录输出(省份ID, 1);Reducer 对同一省份 ID 的计数求和,得到每个省份的 PV。Driver 通过 Tool/ToolRunner 配置并提交作业。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

 

  1 package mapreduce.webpv;
  2 
  3 import java.io.IOException;
  4 import org.apache.commons.lang.StringUtils;
  5 import org.apache.hadoop.conf.Configuration;
  6 import org.apache.hadoop.conf.Configured;
  7 import org.apache.hadoop.fs.Path;
  8 import org.apache.hadoop.io.IntWritable;
  9 import org.apache.hadoop.io.LongWritable;
 10 import org.apache.hadoop.io.Text;
 11 import org.apache.hadoop.mapreduce.Job;
 12 import org.apache.hadoop.mapreduce.Mapper;
 13 import org.apache.hadoop.mapreduce.Reducer;
 14 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 16 import org.apache.hadoop.util.Tool;
 17 import org.apache.hadoop.util.ToolRunner;
 18 
 19 public class WebPvMapReduce extends Configured implements Tool {
 20 
 21     // step 1: Mapper
 22     public static class WebPvMapper extends
 23             Mapper<LongWritable, Text, IntWritable, IntWritable> {
 24         private IntWritable mapOutputKey = new IntWritable();
 25         private IntWritable mapOutputValue = new IntWritable(1);
 26 
 27         @Override
 28         public void map(LongWritable key, Text value, Context context)
 29                 throws IOException, InterruptedException {
 30 
 31             // line value
 32             String lineValue = value.toString();
 33 
 34             // spilt
 35             String[] values = lineValue.split("\t");
 36 
 37             // url
 38             String urlValue = values[1];
 39 
 40             if (StringUtils.isBlank(urlValue)) {
 41                 // conuter
 42                 context.getCounter("WEBPVMAPPER_CUUNTERS", "URL_BLANK")
 43                         .increment(1L);
 44                 return;
 45             }
 46 
 47             if (30 > values.length) {
 48 
 49                 // conuter
 50                 context.getCounter("WEBPVMAPPER_CUUNTERS", "LENGTH_LT_30")
 51                         .increment(1L);
 52 
 53                 return;
 54             }
 55 
 56             // province id
 57             String provinceIdValue = values[23];
 58 
 59             if (StringUtils.isBlank(provinceIdValue)) {
 60                 // conuter
 61                 context.getCounter("WEBPVMAPPER_CUUNTERS", "PROVINCEID_BLANK")
 62                         .increment(1L);
 63                 return;
 64             }
 65 
 66             Integer provinceId = Integer.MAX_VALUE;
 67             try {
 68                 provinceId = Integer.valueOf(provinceIdValue);
 69             } catch (Exception e) {
 70                 // conuter
 71                 context.getCounter("WEBPVMAPPER_CUUNTERS",
 72                         "PROVINCEID_NOT_NUMBER").increment(1L);
 73                 return;
 74             }
 75 
 76             // map outpu key
 77             mapOutputKey.set(provinceId);
 78 
 79             context.write(mapOutputKey, mapOutputValue);
 80         }
 81     }
 82 
 83     // step 2: Reducer
 84     public static class WebPvReducer extends
 85             Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
 86         private IntWritable outputValue = new IntWritable();
 87 
 88         @Override
 89         protected void reduce(IntWritable key, Iterable<IntWritable> values,
 90                 Context context) throws IOException, InterruptedException {
 91             // temp sum
 92             int sum = 0;
 93 
 94             // iterator
 95             for (IntWritable value : values) {
 96                 sum += value.get();
 97             }
 98 
 99             // set output
100             outputValue.set(sum);
101 
102             context.write(key, outputValue);
103         }
104     }
105 
106     // step 3: Driver
107     public int run(String[] args) throws Exception {
108 
109         Configuration configuration = this.getConf();
110 
111         Job job = Job.getInstance(configuration, this.getClass()
112                 .getSimpleName());
113         job.setJarByClass(WebPvMapReduce.class);
114 
115         // set job
116         // input
117         Path inpath = new Path(args[0]);
118         FileInputFormat.addInputPath(job, inpath);
119 
120         // output
121         Path outPath = new Path(args[1]);
122         FileOutputFormat.setOutputPath(job, outPath);
123 
124         // Mapper
125         job.setMapperClass(WebPvMapper.class);
126         job.setMapOutputKeyClass(IntWritable.class);
127         job.setMapOutputValueClass(IntWritable.class);
128 
129         // Reducer
130         job.setReducerClass(WebPvReducer.class);
131         job.setOutputKeyClass(IntWritable.class);
132         job.setOutputValueClass(IntWritable.class);
133 
134         // submit job -> YARN
135         boolean isSuccess = job.waitForCompletion(true);
136         return isSuccess ? 0 : 1;
137     }
138 
139     public static void main(String[] args) throws Exception {
140 
141         Configuration configuration = new Configuration();
142 
143         args = new String[] {
144                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/input/testdata/2015082818",
145                 "hdfs://beifeng01:8020//user/beifeng01/mapreduce/output1" };
146 
147         int status = ToolRunner.run(configuration, new WebPvMapReduce(), args);
148 
149         // exit program
150         System.exit(status);
151     }
152 }

 查看结果

 1 $ bin/hdfs dfs -text /user/beifeng01/mapreduce/output1/pa*
 2 1       3527
 3 2       1672
 4 3       511
 5 4       325
 6 5       776
 7 6       661
 8 7       95
 9 8       80
10 9       183
11 10      93
12 11      135
13 12      289
14 13      264
15 14      374
16 15      163
17 16      419
18 17      306
19 18      272
20 19      226
21 20      2861
22 21      124
23 22      38
24 23      96
25 24      100
26 25      20
27 26      157
28 27      49
29 28      21
30 29      85
31 30      42
32 32      173

 

转载于:https://www.cnblogs.com/perfectdata/p/10103171.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值