用Hadoop的MapReduce计算框架实战URL流量分析

输入的数据格式:

123.13.17.13 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/icon_v120/apk_80111_1.jpg HTTP/1.1" 206 51934 "http://img.xxx.com:8080/AppFiles/apk/studynet/icon_v120/apk_80111_1.jpg" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S100 Build/KOT49H)"
120.210.166.150 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/products/product_lc01.zip HTTP/1.1" 206 16631 "http://img.xxx.com:8080/AppFiles/apk/studynet/products/product_lc01.zip" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S908 Build/KVT49L)"
123.13.17.13 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/icon_v120/apk_80111_0.jpg HTTP/1.1" 206 53119 "http://img.xxx.com:8080/AppFiles/apk/studynet/icon_v120/apk_80111_0.jpg" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S100 Build/KOT49H)"
219.137.119.16 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/gamenet/icon/icon_0_506_0.jpg HTTP/1.1" 404 1035 "-" "Dalvik/v3.3.110_update3 (Linux; U; Android 2.2.1-R-20151127.1131; ET_35 Build/KTU84Q)"
120.210.166.150 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/products/product_lc01.zip HTTP/1.1" 206 40719 "http://img.xxx.com:8080/AppFiles/apk/studynet/products/product_lc01.zip" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S908 Build/KVT49L)"

字段含义:

    1、客户端IP
    2、空白(远程登录名称)
    3、空白(认证的远程用户)
    4、请求时间
    5、时区(相对UTC的偏移量,例如 +0800)
    6、请求方法
    7、请求资源
    8、http协议
    9、状态码
    10、发送字节数
    11、访问来源
    12、客户端浏览器信息(User-Agent,不具体拆分)

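目标输出格式:

每行输出"请求方法 请求资源"及其出现的总次数,两者之间以制表符分隔。例如,对上面的示例数据:

GET /AppFiles/apk/studynet/products/product_lc01.zip	2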

代码示例:


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

/**
 * MapReduce job that counts how many times each "METHOD URL" pair
 * (GET or POST requests made over HTTP/1.1) occurs in access-log files.
 *
 * FileName: URLLog
 * Author:   hadoop
 * Email:    3165845957@qq.com
 * Date:     18-10-6 10:23 PM
 */
public class URLLog {
    /**
     * Emits {@code ("METHOD URL", 1)} for every log line whose request is a
     * GET or POST over HTTP/1.1; all other lines are skipped.
     */
    public static class URLLogMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /** Protocol token (with its leading separator) that ends an accepted request. */
        private static final String PROTOCOL_SUFFIX = " HTTP/1.1";

        // Writable instances are reused across map() calls to avoid per-record allocation.
        private final LongWritable resultValue = new LongWritable(1); // every matching line counts once
        private final Text text = new Text(); // holds the "METHOD URL" output key

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String result = handleLine(value.toString());
            if (!result.isEmpty()) {
                text.set(result);
                context.write(text, resultValue); // emit ("METHOD URL", 1)
            }
        }

        /**
         * Extracts the request method and resource from one access-log line.
         *
         * <p>Only the first double-quoted field of the line is inspected, which is
         * where the request ({@code "GET /path HTTP/1.1"}) sits in the sample input.
         * Searching the whole line for "GET"/"POST" would also match text inside the
         * URL, referrer or user agent, and could make {@code substring} throw when
         * that match lies after "HTTP/1.1".
         *
         * @param line one raw log line
         * @return {@code "GET <resource>"} or {@code "POST <resource>"}, or an empty
         *         string when the line has no such HTTP/1.1 request
         */
        private static String handleLine(String line) {
            if (line == null || line.isEmpty()) {
                return "";
            }
            int open = line.indexOf('"');
            if (open < 0) {
                return "";
            }
            int close = line.indexOf('"', open + 1);
            if (close < 0) {
                return "";
            }
            String request = line.substring(open + 1, close);
            boolean supportedMethod = request.startsWith("GET ") || request.startsWith("POST ");
            if (!supportedMethod || !request.endsWith(PROTOCOL_SUFFIX)) {
                return "";
            }
            // Drop the trailing protocol token, keeping "METHOD resource".
            return request.substring(0, request.length() - PROTOCOL_SUFFIX.length()).trim();
        }
    }

    /**
     * Sums the counts received for each "METHOD URL" key and writes
     * {@code (key, total)}.
     */
    public static class URLLogReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private final LongWritable totalResult = new LongWritable(); // reused output value

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Accumulate in a long: with an int, "+=" silently narrows the long
            // operand and the total wraps around past Integer.MAX_VALUE.
            long total = 0L;
            for (LongWritable item : values) {
                total += item.get();
            }
            totalResult.set(total);
            context.write(key, totalResult);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by one or more input paths and,
     *             last, the output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: URLLog <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf);
        job.setJarByClass(URLLog.class);
        job.setJobName("URLLog");

        job.setMapperClass(URLLogMapper.class);
        // The reducer only sums its values and has identical input and output
        // types, so it can also pre-aggregate map output as a combiner.
        job.setCombinerClass(URLLogReducer.class);
        job.setReducerClass(URLLogReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Every argument except the last is an input path; the last is the output path.
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值