输入的数据格式:
123.13.17.13 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/icon_v120/apk_80111_1.jpg HTTP/1.1" 206 51934 "http://img.xxx.com:8080/AppFiles/apk/studynet/icon_v120/apk_80111_1.jpg" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S100 Build/KOT49H)"
120.210.166.150 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/products/product_lc01.zip HTTP/1.1" 206 16631 "http://img.xxx.com:8080/AppFiles/apk/studynet/products/product_lc01.zip" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S908 Build/KVT49L)"
123.13.17.13 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/icon_v120/apk_80111_0.jpg HTTP/1.1" 206 53119 "http://img.xxx.com:8080/AppFiles/apk/studynet/icon_v120/apk_80111_0.jpg" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S100 Build/KOT49H)"
219.137.119.16 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/gamenet/icon/icon_0_506_0.jpg HTTP/1.1" 404 1035 "-" "Dalvik/v3.3.110_update3 (Linux; U; Android 2.2.1-R-20151127.1131; ET_35 Build/KTU84Q)"
120.210.166.150 - - [25/Aug/2016:00:00:01 +0800] "GET /AppFiles/apk/studynet/products/product_lc01.zip HTTP/1.1" 206 40719 "http://img.xxx.com:8080/AppFiles/apk/studynet/products/product_lc01.zip" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; S908 Build/KVT49L)"
字段含义:
1、客户端IP
2、空白(远程登录名称)
3、空白(认证的远程用户)
4、请求时间
5、时区(相对UTC的偏移量,例如 +0800)
6、请求方法
7、请求资源
8、http协议
9、状态码
10、发送字节数
11、访问来源
12、客户端浏览器信息(User-Agent,不具体拆分)
目标输出格式:
每行一条记录,格式为"请求方法 请求资源<TAB>访问次数",例如:
GET /AppFiles/apk/studynet/products/product_lc01.zip	2

代码示例:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
/**
 * FileName: URLLog
 * Author: hadoop
 * Email: 3165845957@qq.com
 * Date: 18-10-6 10:23 PM
 * Description:
 * MapReduce job that counts how many times each (request method, requested resource)
 * pair occurs in a web-server access log.
 *
 * <p>Each output record has the request method and resource (separated by a space) as its
 * key and the number of matching log lines as its value.
 */
public class URLLog {

    /**
     * Emits {@code ("<METHOD> <resource>", 1)} for every log line whose request field is an
     * HTTP/1.1 GET or POST request. Lines that do not match are skipped.
     */
    public static class URLLogMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Only requests using this protocol token are counted (same filter as before). */
        private static final String PROTOCOL = "HTTP/1.1";

        /** Request methods that are counted. */
        private static final String[] METHODS = {"GET", "POST"};

        /** Constant count of one, reused for every emitted record. */
        private final LongWritable resultValue = new LongWritable(1);

        /** Reusable output key holding "<METHOD> <resource>". */
        private final Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String result = handleLine(value.toString());
            if (result != null && result.length() > 0) {
                text.set(result);
                context.write(text, resultValue);
            }
        }

        /**
         * Extracts the request method and resource from one access-log line.
         *
         * <p>The request is taken from the first double-quoted field of the line, for
         * example {@code "GET /a/b.jpg HTTP/1.1"}. Anchoring on that field, instead of
         * searching the whole line for "GET"/"POST", avoids two failures of a whole-line
         * search: picking up a method name that merely appears inside a URL, referer or
         * user agent, and throwing {@link StringIndexOutOfBoundsException} when such a
         * match is located after the protocol token.
         *
         * @param line one raw log line; may be {@code null} or empty
         * @return the method and resource separated by a space (e.g. {@code "GET /a/b.jpg"}),
         *         or an empty string when the line has no quoted request field, the request
         *         is not GET/POST, or the protocol is not HTTP/1.1
         */
        private String handleLine(String line) {
            if (line == null) {
                return "";
            }
            int open = line.indexOf('"');
            int close = open < 0 ? -1 : line.indexOf('"', open + 1);
            if (close < 0) {
                return "";
            }
            String request = line.substring(open + 1, close).trim();
            if (!request.endsWith(PROTOCOL) || !startsWithSupportedMethod(request)) {
                return "";
            }
            // Drop the trailing protocol token; what remains is "<METHOD> <resource>".
            return request.substring(0, request.length() - PROTOCOL.length()).trim();
        }

        /** Returns true when the request begins with a counted method followed by a space. */
        private static boolean startsWithSupportedMethod(String request) {
            for (String method : METHODS) {
                if (request.startsWith(method + " ")) {
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Sums the counts received for each key and writes {@code (key, total)}.
     *
     * <p>Input and output types are identical and the operation is a plain sum, so this
     * class is also registered as the job's combiner in {@link #main(String[])}.
     */
    public static class URLLogReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reusable output value holding the total for the current key. */
        private final LongWritable totalResult = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Accumulate in a long: the values are longs and an int would overflow silently.
            long total = 0L;
            for (LongWritable item : values) {
                total += item.get();
            }
            totalResult.set(total);
            context.write(key, totalResult);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by one or more input paths and, last,
     *             the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            // Usage errors belong on stderr so they are not mixed into redirected output.
            System.err.println("Usage: URLLog <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf);
        job.setJarByClass(URLLog.class);
        job.setJobName("URLLog");

        job.setMapperClass(URLLogMapper.class);
        // Pre-aggregate the (key, 1) pairs on the map side to reduce shuffle traffic.
        job.setCombinerClass(URLLogReducer.class);
        job.setReducerClass(URLLogReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Every argument except the last is an input path; the last is the output path.
        int lastIndex = otherArgs.length - 1;
        for (int i = 0; i < lastIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[lastIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

935

被折叠的 条评论
为什么被折叠?



