测试web日志解析清洗复杂版
1:需求
对web访问日志中的各字段识别切分
去除日志中不合法的记录(字段数小于等于11,或状态码大于等于400)
根据统计需求,生成各类访问请求过滤数据
2:定义一个类bean,用来记录日志数据中的各数据字段
package LogClean_02; public class LogBean { //客户端的IP地址 private String addr; //客户端的用户名,属性忽略“-” private String user; //用户访问时间 private String time; //用户的url与http协议 private String request; //是否访问成功的状态码:大于等于400 失败 private String status; //发送给客户端文件大下 private String size; //记录用户从哪个网页访问的 private String refener; //记录客户端游览器信息 private String user_agent; //判断是否合法 private boolean vail = true; public String getAddr() { return addr; } public void setAddr(String addr) { this.addr = addr; } public String getUser() { return user; } public void setUser(String user) { this.user = user; } public String getTime() { return time; } public void setTime(String time) { this.time = time; } public String getRequest() { return request; } public void setRequest(String request) { this.request = request; } public String getStatus() { return status; } public void setStatus(String status) { this.status = status; } public String getSize() { return size; } public void setSize(String size) { this.size = size; } public String getRefener() { return refener; } public void setRefener(String refener) { this.refener = refener; } public String getUser_agent() { return user_agent; } public void setUser_agent(String user_agent) { this.user_agent = user_agent; } public boolean isVail() { return vail; } public void setVail(boolean vail) { this.vail = vail; } @Override public String toString() { //194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)" StringBuilder s = new StringBuilder(); s.append(this.addr); s.append("\001").append(this.user); s.append("\001").append(this.time); s.append("\001").append(this.request); s.append("\001").append(this.status); s.append("\001").append(this.refener); s.append("\001").append(this.user_agent); s.append("\001").append(this.vail); return s.toString(); } }
3:定义LogMap类
package LogClean_02;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper of the log-cleaning job: parses each input line into a {@link LogBean} and writes the
 * bean's string form, keyed by {@link NullWritable}, for every record still flagged valid.
 */
public class LogMap extends Mapper<LongWritable, Text, NullWritable, Text> {

    /** A line must split into MORE than this many space-separated fields to be parsed. */
    private static final int MAX_INVALID_FIELD_COUNT = 11;

    /** Status codes at or above this value mark a record as invalid. */
    private static final int FIRST_INVALID_STATUS = 400;

    /**
     * Parses one log line and emits it if it is valid.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of the access log
     * @param context used to write the cleaned record
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        LogBean log = parseLog(value.toString());
        if (log.isVail()) {
            context.write(NullWritable.get(), new Text(log.toString()));
        }
    }

    /**
     * Splits a raw log line on single spaces and copies the fields into a bean.
     *
     * <p>Field positions, for a line such as
     * {@code 194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /a.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"}:
     * 0 = address, 1 = user, 3 = "[" + time, 6 = request path, 8 = status, 9 = size,
     * 10 = referer, 11 and 12 = user agent pieces.
     *
     * <p>The record is flagged invalid when the line has 11 or fewer fields, when the time field
     * is empty, when the status is not an integer, or when the status is 400 or above.
     * Malformed lines never throw; they are only flagged so that one dirty record cannot
     * fail the whole task.
     *
     * @param line raw log line
     * @return the populated bean; unpopulated and flagged invalid if the line is too short
     */
    private LogBean parseLog(String line) {
        LogBean logBean = new LogBean();
        String[] fields = line.split(" ");

        // Too few fields, or an empty time field (e.g. caused by consecutive spaces, which would
        // make substring(1) throw): reject the line without populating the bean.
        if (fields.length <= MAX_INVALID_FIELD_COUNT || fields[3].isEmpty()) {
            logBean.setVail(false);
            return logBean;
        }

        logBean.setAddr(fields[0]);
        logBean.setUser(fields[1]);
        // Drop the first character of the time field (the leading "[" in the sample format).
        logBean.setTime(fields[3].substring(1));
        logBean.setRequest(fields[6]);
        logBean.setStatus(fields[8]);
        logBean.setSize(fields[9]);
        logBean.setRefener(fields[10]);

        // NOTE(review): only the first two user-agent pieces are kept and they are joined without
        // the space that separated them; anything after index 12 is dropped. Kept as-is because
        // consumers of the existing output may depend on this format.
        if (fields.length > 12) {
            logBean.setUser_agent(fields[11] + fields[12]);
        } else {
            logBean.setUser_agent(fields[11]);
        }

        try {
            if (Integer.parseInt(logBean.getStatus()) >= FIRST_INVALID_STATUS) {
                logBean.setVail(false);
            }
        } catch (NumberFormatException e) {
            // A non-numeric status means the line is malformed: filter it out instead of crashing.
            logBean.setVail(false);
        }
        return logBean;
    }
}
4:定义LogMain 类
package LogClean_02;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver of the map-only log-cleaning job: wires {@link LogMap} into a job with zero reducers
 * and runs it over the given input and output paths.
 */
public class LogMain {

    /** Input path used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\bigdata_code\\web.txt";

    /** Output path used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT = "E:\\bigdata_code\\out";

    /**
     * Configures and submits the job, waiting for it to finish.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]} the output
     *             path; each falls back to its default when missing
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Honour paths passed on the command line; the hard-coded ones are only defaults.
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // 1: obtain the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2: locate the jar through the driver class
        job.setJarByClass(LogClean_02.LogMain.class);

        // 3: attach the mapper
        job.setMapperClass(LogClean_02.LogMap.class);

        // 4: final output types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // 5: no reduce phase; mapper output is written directly
        job.setNumReduceTasks(0);

        // 6: input and output paths
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 7: submit and wait; report a failed job through a non-zero exit status
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
// The cleaned log output looks like this:
194.237.142.21-18/Sep/2013:06:49:18/wp-content/uploads/2013/07/rstudio-git3.png304"-""Mozilla/4.0(compatible;)"true
163.177.71.12-18/Sep/2013:06:49:33/200"-""DNSPod-Monitor/1.0"true
163.177.71.12-18/Sep/2013:06:49:36/200"-""DNSPod-Monitor/1.0"true
101.226.68.137-18/Sep/2013:06:49:42/200"-""DNSPod-Monitor/1.0"true
101.226.68.137-18/Sep/2013:06:49:45/200"-""DNSPod-Monitor/1.0"true
60.208.6.156-18/Sep/2013:06:49:48/wp-content/uploads/2013/07/rcassandra.png200"http://cos.name/category/software/packages/""Mozilla/5.0(Windowstrue
222.68.172.190-18/Sep/2013:06:49:57/images/my.jpg200"http://www.angularjs.cn/A00n""Mozilla/5.0(Windowstrue