WebPvMapReduce

This article describes an approach to analyzing web page views (Web PV) with Hadoop. Using the MapReduce programming model, the province ID is extracted from each log line and the number of visits per province is counted. Invalid records are first filtered out and the fields are cleaned; the Map phase then extracts the province ID, and the Reduce phase sums the visit counts for each province.
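
As a rough sketch (not part of the original code; the 30-field layout and the sample values below are assumptions inferred from the indices used later), each log line is tab-separated, with the URL in field 2 and the province ID in field 24, and the job simply counts lines per province:

```java
// Standalone sketch of the per-line parsing that both mappers perform.
// The 30-field tab-separated layout and the sample values are assumptions;
// only field 2 (url, index 1) and field 24 (provinceId, index 23) are read.
import java.util.Arrays;

public class ParseSketch {
    public static void main(String[] args) {
        String[] fields = new String[30];
        Arrays.fill(fields, "-");                    // placeholder fields
        fields[1] = "http://www.example.com/index";  // url
        fields[23] = "11";                           // provinceId
        String logLine = String.join("\t", fields);

        String[] splits = logLine.split("\t");
        if (splits.length >= 30 && !splits[23].isEmpty()) {
            int provinceId = Integer.parseInt(splits[23]);
            // A mapper would emit (provinceId, 1); a reducer sums the 1s.
            System.out.println(provinceId + "\t" + 1);
        }
    }
}
```
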
### WebPvMapReduce 1

package com.myblue.myhdfs;

import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WebPvMapReduce extends Configured implements Tool {

    // map
    public static class ModuleMapper extends
            Mapper<LongWritable, Text, IntWritable, IntWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            String lineValue = value.toString();
            String[] splits = lineValue.split("\t");
            // Filter invalid data: a line with fewer than 30 fields is
            // treated as invalid and skipped.
            if (splits.length < 30) {
                // Arguments: counter group, counter name
                context.getCounter("Web Pv Counter", "Length limit 30").increment(1L);
                return;
            }

            String url = splits[1]; // field 2 is the url
            if (StringUtils.isBlank(url)) {
                context.getCounter("Web Pv Counter", "Url is Blank").increment(1L);
                return;
            }
            String provinceIdValue = splits[23]; // field 24 is the provinceId
            if (StringUtils.isBlank(provinceIdValue)) {
                context.getCounter("Web Pv Counter", "Province is Blank").increment(1L);
                return;
            }

            int provinceId = 0;
            try {
                provinceId = Integer.parseInt(provinceIdValue);
            } catch (Exception e) {
                System.out.println(e);
                return;
            }

            IntWritable mapOutputKey = new IntWritable();
            mapOutputKey.set(provinceId);
            IntWritable mapOutputValue = new IntWritable(1); // the output value is always 1 in this example
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    // reduce
    public static class ModuleReducer extends
            Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }

            IntWritable outputValue = new IntWritable();
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }

    public int run(String[] args) throws Exception {

        // Create the job, using the configuration injected by ToolRunner
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());

        // Input and output paths; delete the output path if it already exists
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        FileSystem dfs = FileSystem.get(conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // mapper
        job.setMapperClass(ModuleMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // reducer
        job.setReducerClass(ModuleReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Submit the job and wait for completion
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Hard-coded HDFS input/output paths for testing
        args = new String[] { "/input2", "/output2" };
        // Run the job through ToolRunner
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new WebPvMapReduce(), args);
        System.exit(status);
    }
}
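
The mapper above can be exercised off-cluster. The following test is not from the original post: it assumes the Apache MRUnit and JUnit 4 libraries are on the classpath (the test class name is made up), feeds ModuleMapper one synthetic 30-field line, and expects a single (provinceId, 1) pair:

```java
// Hypothetical MRUnit test for ModuleMapper; MRUnit and JUnit 4 are assumed
// to be available as test dependencies.
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WebPvMapperTest {

    @Test
    public void emitsProvinceIdForValidLine() throws Exception {
        // Build a synthetic 30-field record; only indices 1 and 23 matter.
        String[] fields = new String[30];
        Arrays.fill(fields, "-");
        fields[1] = "http://www.example.com/index"; // url
        fields[23] = "11";                          // provinceId
        String line = String.join("\t", fields);

        MapDriver.<LongWritable, Text, IntWritable, IntWritable>newMapDriver(
                new WebPvMapReduce.ModuleMapper())
            .withInput(new LongWritable(0), new Text(line))
            .withOutput(new IntWritable(11), new IntWritable(1))
            .runTest();
    }
}
```
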

### WebPvMapReduce 2

package com.myblue.myhdfs;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WebPvMapReduce2 extends Configured implements Tool {

    // Map
    public static class Map extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

        private IntWritable mapOutputKey = new IntWritable();
        private static final IntWritable mapOutPutValue = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // 1. Convert the Hadoop Text value to a Java String
            String lineValue = value.toString();
            // 2. Split on tab
            String[] wordValues = lineValue.split("\t");
            // 3. Clean the data
            if (wordValues.length < 30) {
                //context.getCounter("Web Pv Count", "Length limit 30").increment(1L);
                return;
            }
            if (StringUtils.isEmpty(wordValues[1])) {
                //context.getCounter("Web Pv Count", "Url is Blank").increment(1L);
                return;
            }
            String provinceIdValue = wordValues[23];
            if (StringUtils.isEmpty(provinceIdValue)) {
                //context.getCounter("Web Pv Count", "Province is Blank").increment(1L);
                return;
            }
            int provinceId = 0;
            try {
                provinceId = Integer.parseInt(provinceIdValue);
            } catch (Exception e) {
                System.out.println(e);
                return;
            }
            // 4. Hand the pair to the framework
            mapOutputKey.set(provinceId);
            context.write(mapOutputKey, mapOutPutValue);
        }
    }

    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        private IntWritable reduceOutputValue = new IntWritable();

        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the values
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // Write the total to the framework
            reduceOutputValue.set(sum);
            context.write(key, reduceOutputValue);
        }
    }

    // run
    public int run(String[] args) throws Exception {
        // 1. Get the Configuration
        Configuration conf = getConf();
        // 2. Get the job
        Job job = Job.getInstance(conf);
        // Set the jar that contains this class
        job.setJarByClass(this.getClass());
        // 3. Configure the job
        // 3.1 input
        Path inputPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inputPath);
        // 3.2 map
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 3.3 reduce
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        // 3.4 output: delete the output path if it already exists
        Path outputPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // 4. Submit the job
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // 1. Hard-coded HDFS input/output paths for testing
        args = new String[] { "/input", "/output" };
        Configuration conf = new Configuration();
        // 2. Run the job through ToolRunner
        int status = ToolRunner.run(conf, new WebPvMapReduce2(), args);
        // 3. Exit with the job status
        System.exit(status);
    }
}
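
With the default TextOutputFormat, both jobs write one `provinceId<TAB>count` line per province into part-r-* files under the output directory. The helper below is hypothetical (not in the original post; the class name is made up and the /output path mirrors the hard-coded argument in WebPvMapReduce2) and simply prints that result:

```java
// Hypothetical helper: dump the reducer output ("provinceId<TAB>count" lines)
// from the part-r-* files under /output on HDFS.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrintWebPvOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Iterate over the part files produced by the reducer(s)
        for (FileStatus status : fs.listStatus(new Path("/output"))) {
            if (!status.getPath().getName().startsWith("part-")) {
                continue; // skip _SUCCESS and other marker files
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fs.open(status.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line); // e.g. "11\t2345"
                }
            }
        }
    }
}
```
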
