web日志解析清洗

测试web日志解析清洗复杂版

1:需求

对web访问日志中的各字段识别切分

去除日志中不合法的记录(字段数小于等于11,或状态码大于等于400的记录均视为不合法)

根据统计需求,生成各类访问请求过滤数据

2:定义一个类bean,用来记录日志数据中的各数据字段

package LogClean_02;
/**
 * Holds the fields extracted from one web access-log line, plus a flag that
 * says whether the record is considered valid.
 *
 * <p>Example of a raw log line:
 * {@code 194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"}
 */
public class LogBean {
    /** Separator written between fields by {@link #toString()}. */
    private static final String FIELD_SEPARATOR = "\001";

    /** Client IP address. */
    private String addr;
    /** Client user name; "-" means the attribute is absent. */
    private String user;
    /** Time of the access. */
    private String time;
    /** Requested URL and HTTP protocol. */
    private String request;
    /** Response status code; 400 and above is treated as a failure. */
    private String status;
    /** Size of the content sent back to the client. */
    private String size;
    /** Page the client came from (referer). */
    private String refener;
    /** Client browser information. */
    private String user_agent;
    /** Whether the record is valid; a fresh bean starts out valid. */
    private boolean vail = true;

    // ---- client address ----
    public String getAddr() { return addr; }
    public void setAddr(String addr) { this.addr = addr; }

    // ---- user name ----
    public String getUser() { return user; }
    public void setUser(String user) { this.user = user; }

    // ---- access time ----
    public String getTime() { return time; }
    public void setTime(String time) { this.time = time; }

    // ---- request ----
    public String getRequest() { return request; }
    public void setRequest(String request) { this.request = request; }

    // ---- status code ----
    public String getStatus() { return status; }
    public void setStatus(String status) { this.status = status; }

    // ---- response size ----
    public String getSize() { return size; }
    public void setSize(String size) { this.size = size; }

    // ---- referer ----
    public String getRefener() { return refener; }
    public void setRefener(String refener) { this.refener = refener; }

    // ---- user agent ----
    public String getUser_agent() { return user_agent; }
    public void setUser_agent(String user_agent) { this.user_agent = user_agent; }

    // ---- validity flag ----
    public boolean isVail() { return vail; }
    public void setVail(boolean vail) { this.vail = vail; }

    /**
     * Renders the record as a single line: addr, user, time, request, status,
     * referer, user agent and the validity flag, in that order, separated by
     * the {@code \001} character. The {@code size} field is not part of the
     * rendered line. Unset fields appear as the text {@code null}.
     *
     * @return the separator-joined representation of this record
     */
    @Override
    public String toString() {
        return addr
                + FIELD_SEPARATOR + user
                + FIELD_SEPARATOR + time
                + FIELD_SEPARATOR + request
                + FIELD_SEPARATOR + status
                + FIELD_SEPARATOR + refener
                + FIELD_SEPARATOR + user_agent
                + FIELD_SEPARATOR + vail;
    }
}

3:定义LogMap类

package LogClean_02;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * Map-only cleaning step: parses each access-log line into a {@link LogBean}
 * and emits the bean's string form only when the record is valid.
 */
public class LogMap extends Mapper<LongWritable, Text, NullWritable, Text>{
    /** A line must split into at least this many space-separated tokens to be parsed. */
    private static final int MIN_FIELDS = 12;
    /** Status codes at or above this value mark the record as invalid. */
    private static final int FIRST_ERROR_STATUS = 400;

    /**
     * Parses one input line and writes it out if it is a valid record.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of the raw access log
     * @param context output collector
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        LogBean log = preaaLog(line);
        // Invalid records are dropped silently.
        if (log.isVail()) {
            context.write(NullWritable.get(), new Text(log.toString()));
        }
    }

    /**
     * Splits a log line on single spaces and fills a {@link LogBean}.
     *
     * <p>The record is marked invalid when the line has fewer than
     * {@value #MIN_FIELDS} tokens, when the time token is empty, when the
     * status token is not an integer, or when the status is
     * {@value #FIRST_ERROR_STATUS} or higher. Malformed lines never throw.
     *
     * <p>Example line:
     * {@code 194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"}
     *
     * @param line one raw access-log line
     * @return the populated bean, with its validity flag set accordingly
     */
    private LogBean preaaLog(String line) {
        LogBean logBean = new LogBean();
        String[] fields = line.split(" ");

        if (fields.length < MIN_FIELDS) {
            logBean.setVail(false);
            return logBean;
        }

        // Consecutive spaces make split(" ") yield empty tokens; an empty time
        // token would make substring(1) throw, so treat the line as invalid.
        String rawTime = fields[3];
        if (rawTime.isEmpty()) {
            logBean.setVail(false);
            return logBean;
        }

        logBean.setAddr(fields[0]);
        logBean.setUser(fields[1]);
        // Drop the first character of the time token (the leading '[' in the example line).
        logBean.setTime(rawTime.substring(1));
        logBean.setRequest(fields[6]);
        logBean.setStatus(fields[8]);
        logBean.setSize(fields[9]);
        logBean.setRefener(fields[10]);

        // NOTE(review): only the first two user-agent tokens are kept and they are
        // joined without the space that separated them; any further tokens are
        // dropped. Kept as-is so the output format does not change.
        if (fields.length > MIN_FIELDS) {
            logBean.setUser_agent(fields[11] + fields[12]);
        } else {
            logBean.setUser_agent(fields[11]);
        }

        if (isRejectedStatus(logBean.getStatus())) {
            logBean.setVail(false);
        }
        return logBean;
    }

    /**
     * Tells whether a status token should invalidate its record.
     *
     * @param status the raw status token from the log line
     * @return {@code true} if the token is not an integer or is
     *         {@value #FIRST_ERROR_STATUS} or higher
     */
    private static boolean isRejectedStatus(String status) {
        try {
            return Integer.parseInt(status) >= FIRST_ERROR_STATUS;
        } catch (NumberFormatException ignored) {
            // A non-numeric status means the line is malformed; reject the record
            // instead of letting the exception fail the whole map task.
            return true;
        }
    }
}

4:定义LogMain 类

package LogClean_02;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class LogMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        
        args = new String[]{"E:\\bigdata_code\\web.txt","E:\\bigdata_code\\out"};
        //1:获取job信息
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        //2:加载jar包
        job.setJarByClass(LogClean_02.LogMain.class);

        //3:关联Map
        job.setMapperClass(LogClean_02.LogMap.class);

        //4:设置最终输出类型
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        //5:设置Reducetask个数为0
        job.setNumReduceTasks(0);

        //6:设置输入和输出路径
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        //7:提交
        job.waitForCompletion(true);
    }
}

清洗后的日志如下:

194.237.142.21-18/Sep/2013:06:49:18/wp-content/uploads/2013/07/rstudio-git3.png304"-""Mozilla/4.0(compatible;)"true
163.177.71.12-18/Sep/2013:06:49:33/200"-""DNSPod-Monitor/1.0"true
163.177.71.12-18/Sep/2013:06:49:36/200"-""DNSPod-Monitor/1.0"true
101.226.68.137-18/Sep/2013:06:49:42/200"-""DNSPod-Monitor/1.0"true
101.226.68.137-18/Sep/2013:06:49:45/200"-""DNSPod-Monitor/1.0"true
60.208.6.156-18/Sep/2013:06:49:48/wp-content/uploads/2013/07/rcassandra.png200"http://cos.name/category/software/packages/""Mozilla/5.0(Windowstrue
222.68.172.190-18/Sep/2013:06:49:57/images/my.jpg200"http://www.angularjs.cn/A00n""Mozilla/5.0(Windowstrue

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值