PV (page view) is the number of page views. It is usually the main metric for measuring a news channel, a website, or even a single online news item, and it is one of the most commonly used indicators of site traffic. Monitoring the trend of a site's PV and analyzing the reasons behind its changes is routine work for many webmasters. The "page" in page views generally refers to an ordinary HTML page, but it also includes HTML content generated dynamically by PHP, JSP, and the like. Each request for HTML content coming from a browser is counted as one PV, and these requests accumulate into the total PV count.
1. Writing the WebLogPVMapper class
package com.huadian.weblogpv;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WebLogPVMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text outputKey = new Text();
    private IntWritable outputValue = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split each line into tab-separated fields.
        String line = value.toString();
        String[] items = line.split("\t");
        /*
         * (1) A complete record has 36 fields; if the split produces fewer than 36,
         *     the record is dirty data and is discarded.
         * (2) If the URL (index 1) is empty ("", null, "null"), the record is discarded.
         * The province id is at index 23.
         * Output: (provinceId, 1)
         */
        if (items.length >= 36) {
            if (StringUtils.isBlank(items[1])) {
                return;
            }
            outputKey.set(items[23]);
            context.write(outputKey, outputValue);
        } else {
            return;
        }
    }
}
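To sanity-check the mapper logic locally before submitting a real job, a small MRUnit test can be used. The sketch below is only an illustration and assumes the Apache MRUnit and JUnit dependencies are on the classpath; the test class name and the field values (the URL at index 1 and the province id "20" at index 23) are made up for the example.

package com.huadian.weblogpv;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WebLogPVMapperTest {

    @Test
    public void testValidRecord() throws Exception {
        // Build a fake 36-field, tab-separated record: index 1 holds a non-empty URL,
        // index 23 holds the province id "20". All other fields are placeholders.
        String[] fields = new String[36];
        for (int i = 0; i < fields.length; i++) {
            fields[i] = "f" + i;
        }
        fields[1] = "http://example.com/news/1.html";
        fields[23] = "20";
        String line = String.join("\t", fields);

        // The mapper should emit exactly one (provinceId, 1) pair for this record.
        MapDriver.newMapDriver(new WebLogPVMapper())
                .withInput(new LongWritable(0), new Text(line))
                .withOutput(new Text("20"), new IntWritable(1))
                .runTest();
    }
}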
2. Writing the WebLogPVMapReduce driver class
package com.huadian.weblogpv;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WebLogPVMapReduce extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Create the job and set the class used to locate the jar.
        Job job = Job.getInstance(this.getConf(), "WebLogPVMapReduce");
        job.setJarByClass(WebLogPVMapReduce.class);

        // a. input
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath);

        // b. map
        job.setMapperClass(WebLogPVMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);

        // c. reduce
        job.setReducerClass(WebLogPvReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // d. output: delete the output directory first if it already exists.
        Path outputPath = new Path(args[1]);
        FileSystem hdfs = FileSystem.get(this.getConf());
        if (hdfs.exists(outputPath)) {
            hdfs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit the job and wait for completion.
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // ToolRunner.run(Configuration conf, Tool tool, String[] args)
        try {
            int status = ToolRunner.run(configuration, new WebLogPVMapReduce(), args);
            System.exit(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
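Because the reduce logic is a plain sum, which is both associative and commutative, the same reducer class could also be registered as a combiner to pre-aggregate (provinceId, 1) pairs on the map side and reduce the data shuffled over the network. This is an optional tweak, not part of the original driver; if used, the line below would sit alongside the other job settings in run():

        // Optional: reuse the sum reducer as a combiner for map-side pre-aggregation.
        job.setCombinerClass(WebLogPvReducer.class);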
3. Writing the WebLogPvReducer class
package com.huadian.weblogpv;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WebLogPvReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable outputValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // key: province id; values: <1, 1, 1, ...>
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        outputValue.set(sum);
        context.write(key, outputValue);
    }
}
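Once the three classes are packaged into a jar, the job can be submitted through the driver's ToolRunner entry point. The jar name and HDFS paths below are placeholders, not values from the original text; because the driver sets two reduce tasks, the output directory will contain two part files (part-r-00000 and part-r-00001), each holding part of the per-province PV counts.

hadoop jar weblogpv.jar com.huadian.weblogpv.WebLogPVMapReduce /data/weblog/input /data/weblog/output
hdfs dfs -text /data/weblog/output/part-r-*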