61.235.163.33 - - [12/Jul/2017:00:00:59 +0800] "GET /good?shopId=4&goodId=10 HTTP/1.1" 200 450 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" "-"
36.56.133.27 - - [12/Jul/2017:00:03:02 +0800] "GET /good?shopId=1&goodId=2 HTTP/1.1" 200 450 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" "-"
106.83.184.240 - - [12/Jul/2017:00:03:50 +0800] "GET /good?shopId=5&goodId=10 HTTP/1.1" 200 450 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" "-"
36.63.247.51 - - [12/Jul/2017:00:04:16 +0800] "GET /search?value=小茴香 HTTP/1.1" 200 450 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" "-"
日志格式如上:
获取日志内容,分割获取Key - value:
/**
 * Turns one access-log line into an (ip + timestamp) key and a URL value.
 *
 * <p>The line is split on single spaces. In the sample log format the tokens used are:
 * index 0 = client IP, indexes 3 and 4 = the bracketed timestamp and its zone offset
 * (e.g. {@code [12/Jul/2017:00:00:59} and {@code +0800]}), index 6 = the request URL.
 *
 * <p>Lines with fewer than seven tokens cannot supply those fields; they are counted
 * under the {@code ClickStream/MALFORMED_LINES} counter and skipped instead of failing
 * the task with an {@link ArrayIndexOutOfBoundsException}.
 *
 * @param key     input key supplied by the framework; not used here
 * @param value   one raw access-log line
 * @param context job context used to emit the pair and to record malformed lines
 * @throws IOException          if writing the output pair fails
 * @throws InterruptedException if the task is interrupted while writing
 */
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] fields = value.toString().split(" ");
    if (fields.length < 7) {
        // Blank or truncated line: nothing usable to emit.
        context.getCounter("ClickStream", "MALFORMED_LINES").increment(1);
        return;
    }

    String ip = fields[0];
    // The timestamp contains a space, so the split cut it in two; glue it back together.
    String time = fields[3] + " " + fields[4];
    String url = fields[6];

    outputKey.setIp(ip);
    outputKey.setTime(time);
    outputValue.set(url);
    context.write(outputKey, outputValue);
}
自定义排序规则:
首先以Ip为Key 进行排序,Ip相同时以Time 进行排序,时间格式转化使用SimpleDateFormat (在Hadoop中,如果用FastDateFormat 会有冲突)
/**
 * Composite key made of a client IP and the raw access-log timestamp.
 *
 * <p>Keys sort by IP first and, for the same IP, by the instant the timestamp denotes
 * (earliest first). Hashing uses the IP only, so that hash-based partitioning keeps all
 * records of one IP together.
 */
public static class IpTimeKey implements WritableComparable<IpTimeKey> {

    /** Layout of the bracketed log timestamp, e.g. {@code [12/Jul/2017:00:00:59 +0800]}. */
    private static final String TIME_PATTERN = "[dd/MMM/yyyy:HH:mm:ss Z]";

    private String ip;
    private String time;

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    /**
     * Orders by IP, then by parsed timestamp, then by the raw timestamp text.
     *
     * <p>Returns 0 only when both IP and timestamp text are identical, which keeps the
     * ordering antisymmetric (the previous version returned 1 for equal timestamps in
     * both directions). A timestamp that cannot be parsed sorts before every parseable
     * one, so the ordering stays total even with bad input.
     *
     * @param o the key to compare against
     * @return negative, zero or positive as this key sorts before, with, or after {@code o}
     */
    @Override
    public int compareTo(IpTimeKey o) {
        int byIp = ip.compareTo(o.getIp());
        if (byIp != 0) {
            return byIp;
        }
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per comparison.
        SimpleDateFormat format = new SimpleDateFormat(TIME_PATTERN, Locale.ENGLISH);
        int byInstant = Long.compare(toMillis(format, time), toMillis(format, o.getTime()));
        if (byInstant != 0) {
            return byInstant;
        }
        // Same instant (or both unparseable): fall back to the text for a deterministic order.
        return time.compareTo(o.getTime());
    }

    /** Parses a raw timestamp to epoch milliseconds; {@code Long.MIN_VALUE} if it cannot be parsed. */
    private static long toMillis(SimpleDateFormat format, String raw) {
        try {
            return format.parse(raw).getTime();
        } catch (ParseException ignored) {
            // Deliberately not rethrown: a bad timestamp must not abort the sort.
            return Long.MIN_VALUE;
        }
    }

    /** Two keys are equal when both the IP and the raw timestamp text match. */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof IpTimeKey)) {
            return false;
        }
        IpTimeKey that = (IpTimeKey) other;
        boolean sameIp = ip == null ? that.ip == null : ip.equals(that.ip);
        boolean sameTime = time == null ? that.time == null : time.equals(that.time);
        return sameIp && sameTime;
    }

    /**
     * Hashes on the IP alone. The identity hash inherited from {@code Object} differs per
     * instance, which would scatter one IP's records when keys are partitioned by hash.
     */
    @Override
    public int hashCode() {
        return ip == null ? 0 : ip.hashCode();
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(ip);
        dataOutput.writeUTF(time);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        ip = dataInput.readUTF();
        time = dataInput.readUTF();
    }
}
自定义聚合规则:
以Ip为Key 进行聚合
/**
 * Grouping comparator that treats two {@code IpTimeKey}s as the same group whenever
 * their IPs match, ignoring the timestamp part of the key.
 */
public static class ClickingGropingComparator extends WritableComparator {

    public ClickingGropingComparator() {
        // "true" asks the base class to create key instances for deserialized comparison.
        super(IpTimeKey.class, true);
    }

    /**
     * Compares only the IP component of the two keys.
     *
     * @param a first key; must be an {@code IpTimeKey}
     * @param b second key; must be an {@code IpTimeKey}
     * @return the lexicographic comparison of the two IP strings
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String leftIp = ((IpTimeKey) a).getIp();
        String rightIp = ((IpTimeKey) b).getIp();
        return leftIp.compareTo(rightIp);
    }
}
最后在 Reduce 中划分 session:同一Ip相邻两次访问的间隔达到30分钟及以上,或者Ip不同,都看作不同的人登录,用不同的session代表。输出字段依次为:Ip、session、time、Url、单页面访问时长、步长(每个session进行了多少步操作)
public static class ClickingReduce extends Reducer<IpTimeKey, Text, Text, NullWritable> { private Text outputKey = new Text(); String time = null; String lastIp = "0"; String session = UUID.randomUUID().toString(); SimpleDateFormat inputTime = new SimpleDateFormat("[dd/MMM/yyyy:HH:mm:ss Z]", Locale.ENGLISH); SimpleDateFormat outputTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); @Override protected void reduce(IpTimeKey key, Iterable<Text> values, Context context) throws IOException, InterruptedException { Integer by = 0; Long stayTime = 0L; String lastValue = null; String last = null; for (Text value : values) { if (last != null) { try { time = outputTime.format(inputTime.parse(last)); } catch (ParseException e) { e.printStackTrace(); } try { stayTime = inputTime.parse(key.getTime()).getTime() - inputTime.parse(last).getTime(); } catch (ParseException e) { e.printStackTrace(); } if (stayTime < 1800000) { outputKey.set(key.getIp() + " " + session + " " + time + " " + lastValue + " " + stayTime + " " + by); by = by + 1; } else { outputKey.set(key.getIp() + " " + session + " " + time + " " + lastValue + " " + 0 + " " + by); session = UUID.randomUUID().toString(); by = 0; } context.write(outputKey, NullWritable.get()); } last = key.getTime(); lastValue = value.toString(); } try { outputKey.set(key.getIp() + " " + session + " " + outputTime.format(inputTime.parse(last)) + " " + lastValue + " " + 0 + " " + by); } catch (ParseException e) { e.printStackTrace(); } context.write(outputKey, NullWritable.get()); } }
最后在Main函数输出,就OK啦!~
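在定义 lastValue 时,我一开始把它定义成了 Text 类型,输出结果时发现 value == lastValue,坑死了。虽然 Hadoop 中的 Text 与 String 看似相同,但是在存储格式和访问方式上是有区别的:Reduce 遍历 values 时,框架会复用同一个 Text 对象,每次迭代只是把新的内容填进去。如果 lastValue 是 Text 型,当 lastValue = value 时,保存的只是这个对象的引用,循环中是得不到想要的上一个 value 的;需要用 value.toString() 复制成 String,才能保留上一条记录的内容。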