go pv uv 统计

最新推荐文章于 2022-06-14 16:51:00 发布

原创最新推荐文章于 2022-06-14 16:51:00 发布 · 3.5k 阅读

4 ·

CC 4.0 BY-SA版权

【GO】专栏收录该内容

20 篇文章

订阅专栏

go pv uv 统计

概述

当你网站访问量过大时第3方统计系统就不愿意了，这时就需要自己来设计个流量统计了。

流量分析系统是用来分析NGINX的日志数据的，NGINX的日志数据是用户网站访问后生成的，所以我们需要搭建一个网站，来产生一些用户行为数据。然后通过javascript快速上报到打点服务器，然后打点服务器再上报到nginx，nginx会生成用户行为日志，再通过我们用GO开发的流量统计系统分析处理后中，再由前台数据展示出来。

在一个比较大的WEB网站，几百万几千万流量的上报打点请求，如果我们用一个低性能的WEBSERVER做打点服务器，会非常卡，影响效率，所以最好用好一些的服务器。我们的打点服务器需要用到nginx的一个模块，ngx_http_empty_gif_module,这个模块，这个module可以生成1x1像素大小的透明的gif图片。非常小的

打点js 参照：

[js获取客户端time,cookie,url,ip,refer,user_agent信息](https://blog.youkuaiyun.com/qq_27384769/article/details/81009884)
[nginx空白图片访问打点](https://blog.youkuaiyun.com/qq_27384769/article/details/81012625)

第三方库

go get github.com/sirupsen/logrus

go get github.com/mgutz/str

go get github.com/mediocregopher/radix.v2

代码

package main

import (
	"flag"
	"github.com/sirupsen/logrus"
	"time"
	"os"
	"bufio"
	"io"
	"strings"
	"github.com/mgutz/str"
	"net/url"
	"crypto/md5"
	"encoding/hex"
	"github.com/mediocregopher/radix.v2/pool"
	"strconv"
)

const HANDLE_DIG = " /dig?"
const HANDLE_MOVIE = "/movie/"
const HANDLE_LIST = "/list/"
const HANDLE_HTML = ".html"


type cmdParams struct {
	logFilePath string
	routineNum int
}
type digData struct{
	time   string
	url    string
	refer  string
	ua        string
}
type urlData struct {
	data   digData
	uid    string
	unode  urlNode
}
type urlNode struct {
	unType     string // 详情页 或者 列表页 或者 首页
	unRid  int       // Resource ID 资源ID
	unUrl  string // 当前这个页面的url
	unTime  string // 当前访问这个页面的时间
}
type storageBlock struct {
	counterType       string
	storageModel   string
	unode        urlNode
}

var log = logrus.New()

func init() {
	log.Out = os.Stdout
	log.SetLevel( logrus.DebugLevel )
}

func main() {
	// 获取参数
	logFilePath := flag.String( "logFilePath", "/Users/pangee/Public/nginx/logs/dig.log", "log file path" )
	routineNum := flag.Int( "routineNum", 5, "consumer numble by goroutine" )
	l := flag.String( "l", "/tmp/log", "this programe runtime log target file path" )
	flag.Parse()

	params := cmdParams{ *logFilePath, *routineNum }

	// 打日志
	logFd, err := os.OpenFile( *l, os.O_CREATE|os.O_WRONLY, 0644 )
	if err == nil {
		log.Out = logFd
		defer logFd.Close()
	}
	log.Infof( "Exec start." )
	log.Infof( "Params: logFilePath=%s, routineNum=%d", params.logFilePath, params.routineNum )

	// 初始化一些channel，用于数据传递
	var logChannel = make(chan string, 3*params.routineNum)
	var pvChannel = make(chan urlData, params.routineNum)
	var uvChannel = make(chan urlData, params.routineNum)
	var storageChannel = make(chan storageBlock, params.routineNum)

	// Redis Pool
	redisPool, err := pool.New( "tcp", "localhost:6379", 2*params.routineNum );
	if err != nil{
		log.Fatalln( "Redis pool created failed." )
		panic(err)
	} else {
		go func(){
			for{
				redisPool.Cmd( "PING" )
				time.Sleep( 3*time.Second )
			}
		}()
	}


	// 日志消费者
	go readFileLinebyLine( params, logChannel )

	// 创建一组日志处理
	for i:=0; i<params.routineNum; i++ {
		go logConsumer( logChannel, pvChannel, uvChannel )
	}

	// 创建PV UV 统计器
	go pvCounter( pvChannel, storageChannel )
	go uvCounter( uvChannel, storageChannel, redisPool )
	// 可扩展的 xxxCounter

	// 创建 存储器
	go dataStorage( storageChannel, redisPool )

	time.Sleep( 1000*time.Second )
}

// HBase 劣势：列簇需要声明清楚
func dataStorage( storageChannel chan storageBlock, redisPool *pool.Pool) {
	for block := range storageChannel {
		prefix := block.counterType + "_"

		// 逐层添加，加洋葱皮的过程
		// 维度： 天-小时-分钟
		// 层级： 定级-大分类-小分类-终极页面
		// 存储模型： Redis  SortedSet
		setKeys := []string{
			prefix+"day_"+getTime(block.unode.unTime, "day"),
			prefix+"hour_"+getTime(block.unode.unTime, "hour"),
			prefix+"min_"+getTime(block.unode.unTime, "min"),
			prefix+block.unode.unType+"_day_"+getTime(block.unode.unTime, "day"),
			prefix+block.unode.unType+"_hour_"+getTime(block.unode.unTime, "hour"),
			prefix+block.unode.unType+"_min_"+getTime(block.unode.unTime, "min"),
		}

		rowId := block.unode.unRid

		for _,key := range setKeys {
			ret, err := redisPool.Cmd( block.storageModel, key, 1, rowId ).Int()
			if ret<=0 || err!=nil {
				log.Errorln( "DataStorage redis storage error.", block.storageModel, key, rowId )
			}
		}
	}
}

func pvCounter( pvChannel chan urlData, storageChannel chan storageBlock ) {
	for data := range pvChannel {
		sItem := storageBlock{ "pv", "ZINCRBY", data.unode }
		storageChannel <- sItem
	}
}
func uvCounter( uvChannel chan urlData, storageChannel chan storageBlock, redisPool *pool.Pool ) {
	for data := range uvChannel {
		//HyperLoglog redis
		hyperLogLogKey := "uv_hpll_"+getTime(data.data.time, "day")
		ret, err := redisPool.Cmd( "PFADD", hyperLogLogKey, data.uid, "EX", 86400 ).Int()
		if err!=nil {
			log.Warningln( "UvCounter check redis hyperloglog failed, ", err )
		}
		if ret!=1 {
			continue
		}

		sItem := storageBlock{ "uv", "ZINCRBY", data.unode }
		storageChannel <- sItem
	}
}

func logConsumer( logChannel chan string, pvChannel, uvChannel chan urlData ) error {
	for logStr := range logChannel {
		// 切割日志字符串，扣出打点上报的数据
		data := cutLogFetchData( logStr )

		// uid
		// 说明： 课程中模拟生成uid， md5(refer+ua)
		hasher := md5.New()
		hasher.Write( []byte( data.refer+data.ua ) )
		uid := hex.EncodeToString( hasher.Sum(nil) )

		// 很多解析的工作都可以放到这里完成
		// ...
		// ...

		uData := urlData{ data, uid, formatUrl( data.url, data.time ) }

		pvChannel <- uData
		uvChannel <- uData
	}
	return nil
}
func cutLogFetchData( logStr string ) digData {
	logStr = strings.TrimSpace( logStr )
	pos1 := str.IndexOf( logStr,  HANDLE_DIG, 0)
	if pos1==-1 {
		return digData{}
	}
	pos1 += len( HANDLE_DIG )
	pos2 := str.IndexOf( logStr, " HTTP/", pos1 )
	d := str.Substr( logStr, pos1, pos2-pos1 )

	urlInfo, err := url.Parse( "http://localhost/?"+d )
	if err != nil {
		return digData{}
	}
	data := urlInfo.Query()
	return digData{
		data.Get("time"),
		data.Get("refer"),
		data.Get("url"),
		data.Get("ua"),
	}
}
func readFileLinebyLine( params cmdParams, logChannel chan string ) error {
	fd, err := os.Open( params.logFilePath )
	if err != nil {
		log.Warningf( "ReadFileLinebyLine can't open file:%s", params.logFilePath )
		return err
	}
	defer fd.Close()

	count := 0
	bufferRead := bufio.NewReader( fd )
	for {
		line, err := bufferRead.ReadString( '\n' )
		logChannel <- line
		count++

		if count%(1000*params.routineNum) == 0 {
			log.Infof( "ReadFileLinebyLine line: %d", count )
		}
		if err != nil {
			if err == io.EOF {
				time.Sleep( 3*time.Second )
				log.Infof( "ReadFileLinebyLine wait, raedline:%d", count )
			} else {
				log.Warningf( "ReadFileLinebyLine read log error" )
			}
		}
	}
	return nil
}


func formatUrl( url, t string ) urlNode{
	// 一定从量大的着手,  详情页>列表页≥首页
	pos1 := str.IndexOf( url, HANDLE_MOVIE, 0)
	if pos1!=-1 {
		pos1 += len( HANDLE_MOVIE )
		pos2 := str.IndexOf( url, HANDLE_HTML, 0 )
		idStr := str.Substr( url , pos1, pos2-pos1 )
		id, _ := strconv.Atoi( idStr )
		return urlNode{ "movie", id, url, t }
	} else {
		pos1 = str.IndexOf( url, HANDLE_LIST, 0 )
		if pos1!=-1 {
			pos1 += len( HANDLE_LIST )
			pos2 := str.IndexOf( url, HANDLE_HTML, 0 )
			idStr := str.Substr( url , pos1, pos2-pos1 )
			id, _ := strconv.Atoi( idStr )
			return urlNode{ "list", id, url, t }
		} else {
			return urlNode{ "home", 1, url, t}
		} // 如果页面url有很多种，就不断在这里扩展
	}
}

func getTime( logTime, timeType string ) string {
	var item string
	switch timeType {
	case "day":
		item = "2018-01-02"
		break
	case "hour":
		item = "2018-01-02 15"
		break
	case "min":
		item = "2018-01-02 15:04"
		break
	}
	t, _ := time.Parse( item, time.Now().Format(item) )
	return strconv.FormatInt( t.Unix(), 10 )
}