一、需求分析
二、项目架构
三、具体实现
1.js埋点
0准备工作(准备两个页面a.jsp和b.jsp):
a.jsp
代码如下:
<%@ page language="java" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"%>
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>页面A</title>
<%-- Instrumentation: every tracked page loads the shared tracking script. --%>
<script type="text/javascript" src="<%=request.getContextPath()%>/js/tongji.js"></script>
</head>
<body>
<span>页面A</span>
AAAAAAAAAAAAA
<%-- Link to page B so a multi-page session can be exercised during testing. --%>
<a href="${pageContext.request.contextPath }/b.jsp">BBBB</a>
</body>
</html>
b.jsp
代码如下:
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>页面B</title>
<%-- Same tracking script as a.jsp; page B exists to generate a second page view. --%>
<script type="text/javascript" src="<%=request.getContextPath()%>/js/tongji.js"></script>
</head>
<body>
BBBBBBBBBBBBB
</body>
</html>
1编写js埋点代码
1.代码如下
/** Percent-encodes a string so it travels safely inside a URL. */
function ar_encode(str)
{
    // Hand off to the built-in encoder; reserved URI characters stay as-is.
    var encoded = encodeURI(str);
    return encoded;
}
/** Returns the screen resolution as "WIDTHxHEIGHT", or "" when no screen object exists. */
function ar_get_screen()
{
    // "self" is the window itself; guard for environments without a screen.
    return self.screen ? screen.width + "x" + screen.height : "";
}
/** Returns the colour depth, e.g. "24-bit", or "" when no screen object exists. */
function ar_get_color()
{
    // Same guard as ar_get_screen: some embedded environments lack "screen".
    return self.screen ? screen.colorDepth + "-bit" : "";
}
/** Returns the browser UI language, lower-cased; "" when it cannot be detected. */
function ar_get_language()
{
    var nav = navigator;
    // Standard browsers expose "language"; legacy IE exposed "browserLanguage".
    var lang = nav.language || nav.browserLanguage;
    return lang ? lang.toLowerCase() : "";
}
/** Returns the raw user-agent string (identifies IE, Firefox, ...); "" when absent. */
function ar_get_agent()
{
    return navigator.userAgent ? navigator.userAgent : "";
}
/** Returns 1 when the browser reports Java applet support enabled, otherwise 0. */
function ar_get_jvm_enabled()
{
    return navigator.javaEnabled() ? 1 : 0;
}
/** Returns 1 when cookies are enabled in the browser, otherwise 0. */
function ar_get_cookie_enabled()
{
    return navigator.cookieEnabled ? 1 : 0;
}
/**
 * Detects the installed Flash plug-in version.
 * Modern browsers: scan navigator.plugins for "Shockwave Flash" and read the
 * version text out of the plug-in description. Legacy IE: probe ActiveX
 * versions 10 down to 2. Returns the version string, or "" when Flash is absent.
 *
 * Improvement: the ActiveX probe used eval() on a built code string; the
 * constructor is now invoked directly - same probe, no string execution.
 */
function ar_get_flash_ver()
{
    var f = "", n = navigator;
    if (n.plugins && n.plugins.length) {
        for (var ii = 0; ii < n.plugins.length; ii++) {
            if (n.plugins[ii].name.indexOf('Shockwave Flash') != -1) {
                // The version text follows the literal prefix in the description.
                f = n.plugins[ii].description.split('Shockwave Flash ')[1];
                break;
            }
        }
    }
    else
    if (window.ActiveXObject) {
        for (var ii = 10; ii >= 2; ii--) {
            try {
                // Direct construction replaces eval("new ActiveXObject('...')").
                var fl = new window.ActiveXObject('ShockwaveFlash.ShockwaveFlash.' + ii);
                if (fl) {
                    f = ii + '.0';
                    break;
                }
            }
            catch (e) {}
        }
    }
    return f;
}
/**
 * Tests whether a host-name label is a known top-level / generic domain
 * (com, net, gov, org, ... plus Chinese province codes). Returns 1 on a
 * case-insensitive match, 0 otherwise.
 *
 * Bug fixed: the pattern used to be a plain STRING wrapped in "/.../i".
 * String.prototype.match() compiles a string argument with new RegExp(),
 * which treats the slashes and the "i" as literal characters - so the test
 * could never match. A real RegExp literal is used now.
 */
function ar_c_ctry_top_domain(str)
{
    var pattern = /^aero$|^cat$|^coop$|^int$|^museum$|^pro$|^travel$|^xxx$|^com$|^net$|^gov$|^org$|^mil$|^edu$|^biz$|^info$|^name$|^ac$|^mil$|^co$|^ed$|^gv$|^nt$|^bj$|^hz$|^sh$|^tj$|^cq$|^he$|^nm$|^ln$|^jl$|^hl$|^js$|^zj$|^ah$|^hb$|^hn$|^gd$|^gx$|^hi$|^sc$|^gz$|^yn$|^xz$|^sn$|^gs$|^qh$|^nx$|^xj$|^tw$|^hk$|^mo$|^fj$|^ha$|^jx$|^sd$|^sx$/i;
    if (str.match(pattern)) { return 1; }
    return 0;
}
/**
 * Normalises a host name down to its registrable domain:
 *  - strips a leading "www."
 *  - dotted-quad IPv4 addresses are returned unchanged
 *  - "xxx.yyy.zz" keeps all three labels only when yyy/zz look like a
 *    TLD + country pair, otherwise keeps the last two
 *  - longer hosts keep the last three or two labels by the same rule
 *
 * Bug fixed: the IP pattern was a STRING, in which "\." collapses to "."
 * and matches ANY character - hosts such as "a.b.c.d" were misdetected as
 * IP addresses. A real RegExp literal with "+" quantifiers is used, and the
 * check is hoisted so it applies regardless of label count.
 *
 * NOTE(review): ar_c_ctry_domain() is referenced but not defined anywhere
 * in this file - confirm it exists elsewhere before the 3-label paths run.
 */
function ar_get_domain(host)
{
    // Strip a leading "www." if present.
    var d = host.replace(/^www\./, "");
    // A literal IPv4 address is the whole "domain" - return it as-is.
    if (/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/.test(host)) {
        return d;
    }
    var ss = d.split(".");
    var l = ss.length;
    if (l == 3) {
        // Keep all three labels only for "name.tld.country" shapes.
        if (ar_c_ctry_top_domain(ss[1]) && ar_c_ctry_domain(ss[2])) {
            // keep d unchanged
        }
        // Otherwise keep only the last two labels.
        else {
            d = ss[1] + "." + ss[2];
        }
    }
    else if (l > 3) {
        // Last two labels look like TLD + country: keep the last three.
        if (ar_c_ctry_top_domain(ss[l-2]) && ar_c_ctry_domain(ss[l-1])) {
            d = ss[l-3] + "." + ss[l-2] + "." + ss[l-1];
        }
        // Otherwise keep the last two.
        else {
            d = ss[l-2] + "." + ss[l-1];
        }
    }
    return d;
}
/**
 * Reads the value of the named cookie from document.cookie.
 * An empty name returns the entire cookie string; a missing cookie returns "".
 * NOTE: a plain indexOf() search is used, so a name that is the suffix of
 * another cookie's name can match the wrong entry - behaviour kept as-is.
 */
function ar_get_cookie(name)
{
    var all = document.cookie;          // every cookie visible to this page
    var key = name + "=";
    if (key == "=") {
        // No name given: hand back the full cookie string.
        return all;
    }
    var start = all.indexOf(key);
    if (start < 0) {
        // Cookie not present.
        return "";
    }
    // The value begins one character past the "=".
    var valueStart = start + name.length + 1;
    var end = all.indexOf(";", start + name.length);
    return end < 0 ? all.substring(valueStart) : all.substring(valueStart, end);
}
/**
 * Writes a cookie scoped to the registrable domain (via ar_get_domain) and "/".
 * cotp selects the lifetime: 0 = session cookie (no expires), 1 = ten years,
 * 2 = one hour.
 *
 * Improvement: Date.prototype.setYear and toGMTString are deprecated Annex-B
 * legacy APIs; the standard setFullYear / toUTCString are used instead -
 * identical behaviour for 4-digit years.
 */
function ar_set_cookie(name, val, cotp)
{
    var date = new Date();
    var cookie = "";
    if (cotp == 0) {
        // Session cookie: dies when the browser closes.
        cookie = name + "=" + val + ";";
    }
    else if (cotp == 1) {
        // Persist for ten years (used for the visitor id).
        date.setFullYear(date.getFullYear() + 10);
        cookie = name + "=" + val + ";expires=" + date.toUTCString() + ";";
    }
    else if (cotp == 2) {
        // Persist for one hour.
        date.setHours(date.getHours() + 1);
        cookie = name + "=" + val + ";expires=" + date.toUTCString() + ";";
    }
    var d = ar_get_domain(document.domain);
    if (d != "") {
        cookie += "domain=" + d + ";";
    }
    cookie += "path=" + "/;";
    document.cookie = cookie;
}
/** Millisecond timestamp of "now" on the client clock. */
function ar_get_stm()
{
    return Date.now();
}
/**
 * Builds a string of n random decimal digits (used for visitor/session ids).
 *
 * Bug fixed: parseInt(Math.random() * 10) first converts the number to a
 * string; a tiny value such as 5e-7 stringifies in scientific notation and
 * parseInt() then yields 5 instead of 0, skewing the digit distribution.
 * Math.floor avoids the string round-trip entirely.
 */
function ar_get_random(n) {
    var str = "";
    for (var i = 0; i < n; i++) {
        str += String(Math.floor(Math.random() * 10));
    }
    return str;
}
/**
 * Main entry point of the tracking script.
 * Gathers the unique-visitor id (cookie, 10-year lifetime), the session id
 * and per-session page count (session cookie, 30-minute inactivity timeout),
 * plus page and browser metadata, then reports everything to the collection
 * servlet by appending an invisible 1x1 <img> whose src is the beacon URL.
 */
function ar_main() {
// Collection endpoint; every gathered field is appended as a query parameter.
var dest_path = "http://localhost/LogDemox/servlet/LogServlet?";
var expire_time = 30 * 60 * 1000;//session timeout: 30 minutes of inactivity
// ---- UV (unique visitor) handling ----
//--read the ar_stat_uv cookie
var uv_str = ar_get_cookie("ar_stat_uv");
var uv_id = "";
//--no cookie yet: this is a brand-new visitor
if (uv_str == ""){
//--allocate a new uv id: a 20-digit random number
uv_id = ar_get_random(20);
//--persist it in the ar_stat_uv cookie for 10 years
ar_set_cookie("ar_stat_uv", uv_id, 1);
}
//--cookie present: reuse the stored uv id
else{
uv_id = uv_str;
}
// ---- SS (session) handling ----
//--read the ar_stat_ss cookie
var ss_str = ar_get_cookie("ar_stat_ss");
var ss_id = ""; //session id
var ss_no = 0; //number of pages viewed within this session
//--no ar_stat_ss cookie: a brand-new session starts here
if (ss_str == ""){
//--allocate a 10-digit random session id
ss_id = ar_get_random(10);
//--first page of the session
ss_no = 0;
//--cookie value format: sessionId_pageCount_clientTime
//NOTE(review): "value" is never declared with var - it leaks as a global.
value = ss_id+"_"+ss_no+"_"+ar_get_stm();
//--store as a session cookie (cotp 0 = no expires attribute)
ar_set_cookie("ar_stat_ss", value, 0);
}
//--an ar_stat_ss cookie already exists
else {
//--unpack sessionId / pageCount / lastVisitTime
var items = ss_str.split("_");
//--session id
var cookie_ss_id = items[0];
//--page count so far
var cookie_ss_no = parseInt(items[1]);
//--client time of the previous page view
var cookie_ss_stm = items[2];
//More than 30 minutes since the last page view: the cookie survived but the
//session has effectively expired, so a fresh session must be started.
if (ar_get_stm() - cookie_ss_stm > expire_time) {
//--new session id
ss_id = ar_get_random(10);
//--page count restarts at 0
ss_no = 0;
}
//--session still alive
else{
//--keep the same session id
ss_id = cookie_ss_id;
//--one more page viewed within this session
ss_no = cookie_ss_no + 1;
}
//--rebuild and store the ar_stat_ss cookie value
value = ss_id+"_"+ss_no+"_"+ar_get_stm();
ar_set_cookie("ar_stat_ss", value, 0);
}
//URL of the page whose link led here (the referrer)
var ref = document.referrer;
ref = ar_encode(String(ref));
//current page URL
var url = document.URL;
url = ar_encode(String(url));
//current resource name (text after the last "/")
var urlname = document.URL.substring(document.URL.lastIndexOf("/")+1);
urlname = ar_encode(String(urlname));
//page title
var title = document.title;
title = ar_encode(String(title));
//page character set
var charset = document.charset;
charset = ar_encode(String(charset));
//screen resolution
var screen = ar_get_screen();
screen = ar_encode(String(screen));
//colour depth
var color =ar_get_color();
color =ar_encode(String(color));
//browser language
var language = ar_get_language();
language = ar_encode(String(language));
//browser user-agent string
var agent =ar_get_agent();
agent =ar_encode(String(agent));
//whether Java is supported and enabled
var jvm_enabled =ar_get_jvm_enabled();
jvm_enabled =ar_encode(String(jvm_enabled));
//whether cookies are supported and enabled
var cookie_enabled =ar_get_cookie_enabled();
cookie_enabled =ar_encode(String(cookie_enabled));
//Flash plug-in version
var flash_ver = ar_get_flash_ver();
flash_ver = ar_encode(String(flash_ver));
//current session state, formatted "sessionId_pageCount_clientTime"
var stat_ss = ss_id+"_"+ss_no+"_"+ar_get_stm();
//assemble the beacon URL carrying every collected field
//NOTE(review): "dest" is also an undeclared global, like "value" above.
dest=dest_path+"url="+url+"&urlname="+urlname+"&title="+title+"&chset="+charset+"&scr="+screen+"&col="+color+"&lg="+language+"&je="+jvm_enabled+"&ce="+cookie_enabled+"&fv="+flash_ver+"&cnv="+String(Math.random())+"&ref="+ref+"&uagent="+agent+"&stat_uv="+uv_id+"&stat_ss="+stat_ss;
//report by injecting an invisible 1x1 image that requests the beacon URL
document.getElementsByTagName("body")[0].innerHTML += "<img src=\""+dest+"\" border=\"0\" width=\"1\" height=\"1\" />";
}
window.onload = function(){
// Fire the collector once the page has fully loaded (the beacon needs <body>).
// NOTE(review): assigning window.onload overwrites any handler set earlier.
ar_main();
}
2.js阅读顺序
3.验证js埋点
访问方式
访问结果:
连续访问3次的结果
关闭浏览器,再打开后访问的结果
重新换一个浏览器访问的结果
证明js埋点收集到的数据正常
2编写收集日志的服务端
1.需要的字段
2.发现通过js收集到的日志信息有多余的信息并且缺少数据(公网ip)所以收集到的日志需要预处理如下
ip需要在收集日志的服务端收集这样收集的ip是公网ip,如果在js中收集的ip是内网ip不是我们需要的ip
因此服务端收集到数据后需要做的操作有:
(1)进行url转码
(2)将需要的信息用|分割
(3)收集公网ip
代码如下:
package cn.tedu.flux;
import java.io.IOException;
import java.net.URLDecoder;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.log4j.Logger;
/**
 * Collection endpoint for the JS tracking beacon.
 * Receives the tracked fields as query parameters, URL-decodes them,
 * re-shapes them into a single '|'-separated record, appends the caller's
 * public IP (only visible server-side), and hands the record to log4j.
 */
public class LogServlet extends HttpServlet {
    private static Logger logger = Logger.getLogger(LogServlet.class);
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // Raw query string; null when the request carried no parameters
        // (the original code threw a NullPointerException in that case).
        String str = request.getQueryString();
        if (str == null) {
            str = "";
        }
        // Undo the encodeURI() applied on the client.
        str = URLDecoder.decode(str, "utf-8");
        // Re-shape "k1=v1&k2=v2&..." into "v1|v2|...|" - downstream tools
        // (Flume interceptor, Hive table) split records on '|'.
        StringBuilder buffer = new StringBuilder();
        String[] kvs = str.split("&");
        for (String kv : kvs) {
            // Limit-2 split keeps values that themselves contain '='
            // (the client's encodeURI() does not escape '=').
            String[] parts = kv.split("=", 2);
            String value = parts.length >= 2 ? parts[1] : "";
            buffer.append(value).append('|');
        }
        // Append the public IP of the caller as the final field.
        String ip = request.getRemoteAddr();
        buffer.append(ip);
        str = buffer.toString();
        System.out.println(str);
        logger.info(str);
    }
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // POST beacons are handled identically to GET.
        doGet(request, response);
    }
}
3.服务端进行预处理后的数据,发现符合需求
log4j:ERROR Cannot Append to Appender! Appender either closed or not setup correctly!
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1||0.6847100276188913||Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0|58290320824172982441|5764225803_0_1524211141502|0:0:0:0:0:0:0:1
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1||0.6847100276188913||Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0|58290320824172982441|5764225803_0_1524211141502|0:0:0:0:0:0:0:1
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1||0.6818952847160782||Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0|58290320824172982441|5764225803_1_1524211151191|0:0:0:0:0:0:0:1
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1||0.6818952847160782||Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0|58290320824172982441|5764225803_1_1524211151191|0:0:0:0:0:0:0:1
log4j:ERROR Cannot Append to Appender! Appender either closed or not setup correctly!
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1|26.9 r9|0.07667920747010104||Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400|38627225991274809163|1282704282_0_1524211501931|0:0:0:0:0:0:0:1
http://localhost/demo/a.jsp|a.jsp|页面A|UTF-8|1366x768|24-bit|zh-cn|0|1|26.9 r9|0.07667920747010104||Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400|38627225991274809163|1282704282_0_1524211501931|0:0:0:0:0:0:0:1
log4j:ERROR Cannot Append to Appender! Appender either closed or not setup correctly!
4.进行预处理过的数据需要让flume收集日志
log4j可以和flume直接对接:官网资料
这种方式虽然快但有一个缺点就是数据可能会丢,所以需要布置两条线来收集日志:一条直接对接到flume中,一条落地到日志中
(1)导包:资源
(2)配置log4j
高可用配置log4j对接flume
文件中的日志:
3.配置flume,将log4j传来的数据存放到hdfs中
flume配置信息
#Name the components of agent a1
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Source: avro listener receiving events sent by log4j's FlumeAppender
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444
#Interceptor: extract the client timestamp from each record body.
#The regex skips the first 14 '|'-separated fields, then captures the third
#'_'-separated part of stat_ss (the millisecond client time) into the
#"timestamp" header, so the HDFS sink expands %Y-%m-%d from event time
#rather than arrival time.
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
a1.sources.r1.interceptors.i1.regex = ^(?:[^\\|]*\\|){14}\\d+_\\d+_(\\d+)\\|[^\\|]*$
a1.sources.r1.interceptors.i1.serializers = s1
a1.sources.r1.interceptors.i1.serializers.s1.name = timestamp
#Sink: stream events to HDFS, one directory per report date
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flux/reportTime=%Y-%m-%d
a1.sinks.k1.hdfs.fileType = DataStream
#Roll files every 30 seconds regardless of size or event count
a1.sinks.k1.hdfs.rollInterval=30
a1.sinks.k1.hdfs.rollSize=0
a1.sinks.k1.hdfs.rollCount=0
#In-memory channel buffering up to 1000 events
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
#Wire the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
访问a.jsp后查看hdfs中收集到了数据
4.通过hive进行处理
1.准备测试数据
ie浏览器:连续访问3次a.jsp,访问2次b.jsp,再访问1次a.jsp
关闭ie,再打开:访问2次a.jsp
换谷歌浏览器:访问2次a.jsp,访问1次b.jsp
人为可以计算出:pv-----11
uv-----2
vv-----3
2.检查埋点收集的数据是否正确
uv验证正确为2,pv验证正确为11
验证vv正确为3,并且每个会话的计数都是递增的
3.将hdfs中的数据存入到hive的表中
#建立数据库
create database flux;
#建立外部表
create EXTERNAL table flux (url String,urlname String,title String,chset String,scr String,col String,lg String,je String,ec String,fv String,cn String,ref String,uagent String,stat_uv String,stat_ss String,cip String) partitioned by (reportTime string) row format delimited fields terminated by '|' stored as textfile ;
#增加分区
ALTER TABLE flux add PARTITION (reportTime='2018-04-20') location '/flux/reportTime=2018-04-20';
4.数据已经到位,接下来进行业务计算
====================================================================================
time
'2018-04-20'
pv 访问的次数 日志的条数就是访问的次数 仅由这一个数据单方面来体现一个网站的访问受欢迎程度显然是不合理的
select count(*) as pv from flux where reportTime='2018-04-20'
uv stat_uv(里面的值是uv_id的值,每一个用户都有一个自己独立的uv_id,是20位的随机数。如果一个客户在30分钟内没有点击这个网页,认为和关闭浏览器的效果是一样的,即1次会话的结束)
select count(distinct stat_uv) as uv from flux where reportTime='2017-01-10'
vv 会话的次数,一个ss_id就是一个会话。cookie值的格式为:ss_id_浏览次数_会话当前的时间
根据session利用会话级别的session,这个session_id信息是一个10位的随机数,保存在浏览器的内存中,当浏览器关闭的时候,session_id消失。一个用户可以贡献多个session_id
select count(distinct split(stat_ss,'_')[0]) as vv from flux where reportTime='2017-01-10';
br 跳出率 反应一个网站是否受欢迎的程度,如果一个用户进入该网站了,但是没有点击任何其他的页面说明该用户对该网站的内容不感兴趣。关闭了该网页称为一次跳出。跳出的次数/会话的次数
按照会话分组,分组后的urlname去重后只有一个的作为a/会话ss_id去重
select round(a / b,2) from (select count(*) as a from (select count(distinct urlname) as pcount from flux where reportTime='2017-01-10' group by split(stat_ss,'_')[0]) as tbr11 where pcount == 1) as tbr1 left join (select count(distinct split(stat_ss,'_')[0]) as b from flux) as tbr2;
我的hql语句
select round(a/b,2) from
(select count(*) as a from
(select count(distinct urlname) as p from flux where reportTime='2017-01-10' group by split(stat_ss,'_')[0])as tab11 where p=1)(如果这个表不给一个别名的话,p就不知道从哪里来的) as tab1
left join
(select count(distinct split(stat_ss,'_')[0]) as b from flux where reportTime='2017-01-10' )as tab2;
newip 新增ip,就是在以前的cip记录中不存在的ip
select count(distinct cip) as newip from flux where cip not in (select distinct cip as history from (select * from flux) as tnewip where reportTime<>'2017-01-10') and reportTime='2017-01-10';
我的hql:新的cip就是在此之前的去重cip中没有的新的cip的数量
select count(distinct cip) as newip from flux where cip not in( select distinct cip as history from (select * from flux) as tab1 where reportTime<>'2017-01-10') and reportTime='2017-01-10';(绿色部分必须这样写,不能写成flux,因为这样很多的flux表都不知道是哪个了)
avgtime:平均访问时长,同一个会话中找到会话时间最大的值,减去会话时长最小的值,然后求平均
select round(avg(time),2) as avgtime from(select round((max(split(stat_ss,'_')[2]) - min(split(stat_ss,'_')[2]))/1000,0) time from flux group by split(stat_ss,'_')[0]) as tavgtime
select round(avg(time),2) as avgtime from (select round(max(split(stat_ss,'_')[2])-min(split(stat_ss,'_')[2])/1000,0) as time from flux group by split(stat_ss,'_')[0] ) as tab1;
newcust:新独立访客
select count(distinct stat_uv) as newcust from flux where stat_uv not in (select distinct stat_uv as history from (select * from flux) as tnewcust where reportTime<>'2017-01-10') and reportTime='2017-01-10';
select count(distinct stat_uv) as newcust from flux where stat_uv not in(
select distinct stat_uv as history from(select * from flux) as tab1 where reportTime<>'2017-01-10')and reportTime='2017-01-10';
viewdeep :平均访问深度 客户去重以后的平均访问页面的数量
select round(avg(deep),2) as viewdeep from (select count(distinct urlname) as deep from flux where reportTime='2017-01-10' group by split(stat_uv,'_')[0]) as tviewdeep;
select round(avg(deep),2) as viewdeep from (select count(distinct urlname) as deep from flux where reportTime='2017-01-10' group by split(stat_ss,'_')[0])as tab1;
====================================================================================
pv计算: