1、需求
对web访问日志中的各字段识别切分,去除日志中不合法的记录,数据如下:
124.42.13.230 - - [18/Sep/2013:06:57:55 +0000] "GET /wp-content/themes/silesia/images/natty-logo.png HTTP/1.1" 200 1438 "http://blog.fens.me/mongodb-replica-set/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)"
124.42.13.230 - - [18/Sep/2013:06:58:01 +0000] "GET /wp-content/uploads/2013/05/favicon.ico HTTP/1.1" 200 1150 "-" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)"
124.42.13.230 - - [18/Sep/2013:06:58:04 +0000] "GET /mongodb-replica-set/?cf_action=sync_comments&post_id=343 HTTP/1.1" 200 28 "http://blog.fens.me/mongodb-replica-set/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)"
163.177.71.12 - - [18/Sep/2013:06:58:18 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2013:06:58:21 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:58:25 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:58:28 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
61.135.216.104 - - [18/Sep/2013:06:58:30 +0000] "GET /feed/ HTTP/1.1" 304 0 "-" "Mozilla/5.0 (compatible;YoudaoFeedFetcher/1.0;http://www.youdao.com/help/reader/faq/topic006/;2 subscribers;)"
183.195.232.138 - - [18/Sep/2013:06:59:01 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
2、实现代码
2.1、 定义一个bean,用来记录日志数据中的各数据字段
public class WebLogBean {
private String remote_addr;// 记录客户端的ip地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的url与http协议
private String status;// 记录请求状态;成功是200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从那个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息
private boolean valid = true;// 判断数据是否合法
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.valid);
sb.append("\001").append(this.remote_addr);
sb.append("\001").append(this.remote_user);
sb.append("\001").append(this.time_local);
sb.append("\001").append(this.request);
sb.append("\001").append(this.status);
sb.append("\001").append(this.body_bytes_sent);
sb.append("\001").append(this.http_referer);
sb.append("\001").append(this.http_user_agent);
return sb.toString();
}
}
2.2、定义一个parser用来解析过滤web访问日志原始记录
public class WebLogParser {
static SimpleDateFormat sd1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
static SimpleDateFormat sd2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
public static WebLogBean parser(String line) {
WebLogBean webLogBean = new WebLogBean();
String[] arr = line.split(" ");
if (arr.length > 11) {
webLogBean.setRemote_addr(arr[0]);
webLogBean.setRemote_user(arr[1]);
webLogBean.setTime_local(parseTime(arr[3].substring(1)));
webLogBean.setRequest(arr[6]);
webLogBean.setStatus(arr[8]);
webLogBean.setBody_bytes_sent(arr[9]);
webLogBean.setHttp_referer(arr[10]);
if (arr.length > 12) {
webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
webLogBean.setHttp_user_agent(arr[11]);
}
if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// 大于400,HTTP错误
webLogBean.setValid(false);
}
} else {
webLogBean.setValid(false);
}
return webLogBean;
}
public static String parseTime(String dt) {
String timeString = "";
try {
Date parse = sd1.parse(dt);
timeString = sd2.format(parse);
} catch (ParseException e) {
e.printStackTrace();
}
return timeString;
}
}
2.3、mapreduce程序
public class WeblogPreProcess {
static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
Text k = new Text();
NullWritable v = NullWritable.get();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
WebLogBean webLogBean = WebLogParser.parser(line);
if (!webLogBean.isValid())
return;
k.set(webLogBean.toString());
context.write(k, v);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(WeblogPreProcess.class);
job.setMapperClass(WeblogPreProcessMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:/wordcount/weblog/input"));
FileOutputFormat.setOutputPath(job, new Path("C:/wordcount/weblog/output"));
job.waitForCompletion(true);
}
}