Real-Time Project Statistics in Practice, Part 3


1. Overview

Sample code:

https://github.com/csy512889371/learndemo/tree/master/spark-streaming-kafka

2. Feature Development

  • Goal: for each category, the number of visits from the start of today up to now
  • Analysis dimensions: yyyyMMdd, categoryId

1. Spark Streaming writes the aggregated results into a database.
2. The database stores our statistics results.
3. The visualization front end queries the database by yyyyMMdd and categoryId and displays the results.

Which database should we choose to store the statistics results?

1. Relational databases (RDBMS): MySQL, Oracle

day      categoryId click_count
20171117 1          10
20171117 2          19
  • When the next batch arrives, we have to read out the existing value for 20171117, category 1 (here 10), add the new batch's count to it, and write it back, as sketched below.
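
For illustration only (the project does not actually use MySQL), a minimal JDBC sketch of that read-and-add step; the table schema, connection URL and credentials are assumptions, and MySQL's ON DUPLICATE KEY UPDATE is used so the addition happens server-side:

import java.sql.DriverManager

object MySqlCounterSketch {
    def main(args: Array[String]): Unit = {
        // placeholder connection settings, adjust to the actual environment
        val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/stats", "user", "password")
        // assumes a PRIMARY/UNIQUE key on (day, categoryId) so an existing row is updated in place
        val sql =
            """INSERT INTO category_clickcount (day, categoryId, click_count)
              |VALUES (?, ?, ?)
              |ON DUPLICATE KEY UPDATE click_count = click_count + VALUES(click_count)""".stripMargin
        val ps = conn.prepareStatement(sql)
        ps.setString(1, "20171117")
        ps.setInt(2, 1)
        ps.setLong(3, 10L)
        ps.executeUpdate()
        ps.close()
        conn.close()
    }
}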

2. Non-relational (NoSQL) databases: HBase, Redis

With HBase a single API call handles this, which is very convenient.
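
A minimal sketch of that single call, using the same old HTable client API as the utility class further below; the ZooKeeper quorum and the example rowkey are the ones used elsewhere in this article:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.util.Bytes

object HBaseCounterSketch {
    def main(args: Array[String]): Unit = {
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", "s201:2181")
        val table = new HTable(conf, "category_clickcount")
        // atomically adds 10 to info:click_count for rowkey 20171117_1,
        // with no read-modify-write on the client side
        table.incrementColumnValue(
            Bytes.toBytes("20171117_1"),
            Bytes.toBytes("info"),
            Bytes.toBytes("click_count"),
            10L)
        table.close()
    }
}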

3. Using HBase

./hbase shell

list    # list the existing tables

Create the table:

create 'category_clickcount','info'

Rowkey design: day_categoryid

desc 'category_clickcount'
scan 'category_clickcount'    # view the table contents

Utility class for operating on the HBase database:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;

public class HBaseUtils {
    HBaseAdmin admin = null;
    Configuration configration = null;
    /**
     * Private constructor (singleton)
     */
    private HBaseUtils(){
        configration = new Configuration();
        configration.set("hbase.zookeeper.quorum","s201:2181");
        configration.set("hbase.rootdir","hdfs://s201/hbase");
        try {
            admin = new HBaseAdmin(configration);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    private static HBaseUtils instance = null;
    public static synchronized HBaseUtils getInstance(){
        if(null == instance){
            instance = new HBaseUtils();
        }
        return instance;
    }
    /**
     * Get an HTable instance by table name
     */
    public HTable getTable(String tableName){
        HTable table = null;
        try {
            table = new HTable(configration,tableName);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return table;
    }
    /**
     * Insert a single record into an HBase table
     * @param tableName HBase table name
     * @param rowkey rowkey of the HBase table
     * @param cf column family of the HBase table
     * @param column column of the HBase table
     * @param value value to write into the HBase table
     */
    public void put(String tableName,String rowkey,String cf,String column,String value){
        HTable table = getTable(tableName);
        Put put = new Put(Bytes.toBytes(rowkey));
        put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
        try {
            table.put(put);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
		//HTable table = HBaseUtils.getInstance().getTable("category_clickcount");
		//System.out.println(table.getName().getNameAsString());
        String tableName = "category_clickcount";
        String rowkey = "20171111_88";
        String cf="info";
        String column ="click_count";
        String value = "2";
        HBaseUtils.getInstance().put(tableName,rowkey,cf,column,value);
    }
}

4. Data Cleaning with Spark Streaming

Date utility class. The raw log lines that need to be parsed look like this:

156.187.29.132 2017-11-20 00:39:26 "GET www/2 HTTP/1.0" - 200
30.187.124.132 2017-11-20 00:39:26 "GET www/2 HTTP/1.0" - 302

DataUtils.scala

import java.util.Date

import org.apache.commons.lang3.time.FastDateFormat


object DataUtils {
    val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
    val TAG_FORMAT = FastDateFormat.getInstance("yyyyMMdd")

    /**
      * Convert the time string into a Unix timestamp in milliseconds
      * @param time
      * @return
      */
    def getTime(time:String) ={
        YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
    }

    /**
      * Format the time string as yyyyMMdd (the day portion used in the rowkey)
      */
    def parseToMin(time:String) ={
        TAG_FORMAT.format(new Date(getTime(time)))
    }


    def main(args: Array[String]): Unit = {
        print(parseToMin("2017-11-20 00:39:26"))
    }


}
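
For the sample lines above, DataUtils.parseToMin("2017-11-20 00:39:26") returns 20171120, which later becomes the day portion of the HBase rowkey.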

Analyzing the data
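
The cleaning code below operates on a logs DStream that is not shown in this excerpt. A minimal sketch of how it could be created from the project's Kafka topic, using the spark-streaming-kafka 0.8 direct API of that era; the broker address, topic name and batch interval are assumptions:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

val sparkConf = new SparkConf().setAppName("StatStreamingApp")
val ssc = new StreamingContext(sparkConf, Seconds(60))

val kafkaParams = Map("metadata.broker.list" -> "s201:9092")   // assumption
val topics = Set("streaming_topic")                            // assumption

// each Kafka record is a (key, value) pair; the value is one raw access-log line
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc, kafkaParams, topics)
val logs = messages.map(_._2)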

Clean the data and filter out useless records:

val cleanData = logs.map(line => {
    val infos = line.split("\t")
    val url = infos(2).split(" ")(1)
    var categaryId = 0
    // extract the category id from the URL
    if (url.startsWith("/www")) {
        categaryId = url.split("/")(2).toInt
    }
    // fields: ip, day (yyyyMMdd), categoryId, referer, status code
    ClickLog(infos(0), DataUtils.parseToMin(infos(1)), categaryId, infos(3), infos(4).toInt)
}).filter(clickLog => clickLog.categaryId != 0)

cleanData.print()

The ClickLog case class:

case class ClickLog(ip:String,time:String,categaryId:Int,refer:String,statusCode:Int)

The resulting records:

ClickLog(187.124.156.143,20171122,6,-,404)
ClickLog(29.132.10.100,20171122,1,-,302)
ClickLog(100.167.30.124,20171122,3,-,404)
ClickLog(30.10.167.132,20171122,4,-,200)
ClickLog(132.124.100.143,20171122,4,-,200)
ClickLog(167.132.10.156,20171122,4,http://cn.bing.com/search?q=幸福满院,302)
ClickLog(29.143.10.187,20171122,3,-,302)

Operating HBase through the API

The domain class:

case class CategaryClickCount(categaryID: String, clickCount: Int)

The DAO:

import cn.ctoedu.spark.domain.CategaryClickCount
import cn.ctoedu.spark.utils.HBaseUtils
import org.apache.hadoop.hbase.client.{Get, HTable}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable.ListBuffer

/**
  *
  */
object CategaryClickCountDAO {

     val tableName = "category_clickcount"
     val cf = "info"
     val qualifer = "click_count"

    /**
      * Save the data
      * @param list
      */
    def save(list: ListBuffer[CategaryClickCount]): Unit = {
        val table = HBaseUtils.getInstance().getTable(tableName)
        for (els <- list) {
            // incrementColumnValue atomically adds this batch's count onto the stored total
            table.incrementColumnValue(Bytes.toBytes(els.categaryID), Bytes.toBytes(cf), Bytes.toBytes(qualifer), els.clickCount)
        }
    }

    def count(day_categary: String): Long = {
        val table = HBaseUtils.getInstance().getTable(tableName)
        val get = new Get(Bytes.toBytes(day_categary))
        val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
        if (value == null) {
            0L
        } else {
            // counters written by incrementColumnValue are stored as 8-byte longs
            Bytes.toLong(value)
        }
    }

    def main(args: Array[String]): Unit = {
       val list = new ListBuffer[CategaryClickCount]
        list.append(CategaryClickCount("20171122_1",300))
        //list.append(CategaryClickCount("20171122_9", 60))
        //list.append(CategaryClickCount("20171122_10", 160))
        save(list)

      //  print(count("20171122_10")+"---")
    }

}
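
Because save() goes through incrementColumnValue, the stored cell is an 8-byte binary long rather than a printable string; that is why count() reads it back with Bytes.toLong, and why a plain scan in the HBase shell shows these values as escaped bytes.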

Saving the aggregated counts into HBase:

cleanData.map(log => {
    (log.time.substring(0, 8) + "_" + log.categaryId, 1)
}).reduceByKey(_ + _).foreachRDD(rdd => {
    rdd.foreachPartition(partitionRecords => {
        val list = new ListBuffer[CategaryClickCount]
        partitionRecords.foreach(pair => {
            list.append(CategaryClickCount(pair._1, pair._2))
        })
        CategaryClickCountDAO.save(list)
    })
})
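
foreachPartition builds one ListBuffer (and therefore one DAO call) per partition instead of one per record, and only the per-batch deltas are sent; the running totals are maintained inside HBase by incrementColumnValue.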

Feature 2: the Feature 1 statistic, additionally broken down by the search engine that referred the traffic.

HBase table design:

create 'category_search_clickcount','info'

Rowkey design: day_search_categoryId (for example 20171122_www.sogou.com_2), again driven by the business requirement.

The domain (wrapper) class:

case class CategarySearchClickCount(day_search_categary: String, clickCount: Int)

The DAO class:

import cn.ctoedu.spark.domain.{CategaryClickCount, CategarySearchClickCount}
import cn.ctoedu.spark.utils.HBaseUtils
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable.ListBuffer

/**
  *
  */
object CategarySearchClickCountDAO {

     val tableName = "category_search_clickcount"
     val cf = "info"
     val qualifer = "click_count"

    /**
      * Save the data
      * @param list
      */
    def save(list: ListBuffer[CategarySearchClickCount]): Unit = {
        val table = HBaseUtils.getInstance().getTable(tableName)
        for (els <- list) {
            table.incrementColumnValue(Bytes.toBytes(els.day_search_categary), Bytes.toBytes(cf), Bytes.toBytes(qualifer), els.clickCount)
        }
    }

    def count(day_categary: String): Long = {
        val table = HBaseUtils.getInstance().getTable(tableName)
        val get = new Get(Bytes.toBytes(day_categary))
        val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
        if (value == null) {
            0L
        } else {
            Bytes.toLong(value)
        }
    }

    def main(args: Array[String]): Unit = {
       val list = new ListBuffer[CategarySearchClickCount]
        //list.append(CategarySearchClickCount("20171122_1_1",300))
        list.append(CategarySearchClickCount("20171122_2_1", 300))
        list.append(CategarySearchClickCount("20171122_1_2", 1600))
      //  save(list)

        print(count("20171122_www.sogou.com_2")+"---" )
    }

}

Implementing the business logic:


cleanData.map(log => {
    // pull the search-engine host out of the referer,
    // e.g. http://cn.bing.com/search?q=... becomes cn.bing.com
    val referer = log.refer.replace("//", "/")
    val splits = referer.split("/")
    var host = ""
    if (splits.length > 2) {
        host = splits(1)
    }
    (host, log.categaryId, log.time)
}).filter(_._1 != "").map(x => {
    // rowkey: day_searchEngineHost_categoryId, e.g. 20171122_cn.bing.com_6
    (x._3.substring(0, 8) + "_" + x._1 + "_" + x._2, 1)
}).reduceByKey(_ + _).foreachRDD(rdd => {
    rdd.foreachPartition(partitionRecords => {
        val list = new ListBuffer[CategarySearchClickCount]
        partitionRecords.foreach(pair => {
            list.append(CategarySearchClickCount(pair._1, pair._2))
        })
        CategarySearchClickCountDAO.save(list)
    })
})
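
All of the streaming logic above lives in one driver class, StatStreamingApp, which is the mainClass configured in the pom below and the class passed to spark-submit. A condensed sketch of how the pieces might be wired together, under the same Kafka assumptions as the sketch in section 4 (broker, topic, batch interval); the project-internal imports are abbreviated to a comment because their package names are not all shown in this article:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
// plus the project's own classes: ClickLog, DataUtils, CategaryClickCount,
// CategaryClickCountDAO, CategarySearchClickCount, CategarySearchClickCountDAO

object StatStreamingApp {
    def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setAppName("StatStreamingApp")
        val ssc = new StreamingContext(sparkConf, Seconds(60))

        // source: raw access-log lines from Kafka (broker and topic are assumptions)
        val kafkaParams = Map("metadata.broker.list" -> "s201:9092")
        val logs = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc, kafkaParams, Set("streaming_topic")).map(_._2)

        // step 1: clean the raw lines into ClickLog records (same logic as section 4)
        val cleanData = logs.map(line => {
            val infos = line.split("\t")
            val url = infos(2).split(" ")(1)
            var categaryId = 0
            if (url.startsWith("/www")) {
                categaryId = url.split("/")(2).toInt
            }
            ClickLog(infos(0), DataUtils.parseToMin(infos(1)), categaryId, infos(3), infos(4).toInt)
        }).filter(_.categaryId != 0)

        // step 2: per-day, per-category counts written to category_clickcount
        cleanData.map(log => (log.time.substring(0, 8) + "_" + log.categaryId, 1))
            .reduceByKey(_ + _)
            .foreachRDD(rdd => rdd.foreachPartition(partition => {
                val list = new ListBuffer[CategaryClickCount]
                partition.foreach(pair => list.append(CategaryClickCount(pair._1, pair._2)))
                CategaryClickCountDAO.save(list)
            }))

        // step 3: per-day, per-search-engine, per-category counts written to category_search_clickcount
        // (the referer-parsing foreachRDD block shown just above goes here)

        ssc.start()
        ssc.awaitTermination()
    }
}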

5. Running the Code in Production

1. Packaging

<build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- replace with the class that contains the jar's main method -->
                            <mainClass>cn.ctoedu.spark.project.StatStreamingApp</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!-- merge the dependencies into one jar during the package phase -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>
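
With the assembly plugin bound to the package phase, running mvn clean package produces an additional <artifactId>-<version>-jar-with-dependencies.jar (the SparkTrain-1.0-jar-with-dependencies.jar used in the submit script below).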

2. Upload the jar to the target server.

3. Run:

spark-submit --master spark://s201:7077 --class cn.ctoedu.spark.project.StatStreamingApp \
/home/centos/dowload/SparkTrain.jar

4. Start Kafka, Flume, HBase, Hadoop and Spark.

5. Clear the existing data in HBase: truncate 'table_name'

6. Run the log generator.

The submit script:

/soft/spark/bin/spark-submit \
--class cn.ctoedu.spark.project.StatStreamingApp \
/home/centos/SparkTrain-1.0-jar-with-dependencies.jar

6. Visualization

  • Build the web project with Spring Boot
  • Build a static data visualization with ECharts
  • Build a dynamic data visualization with ECharts

HBase utility class on the web side (the same singleton as before, plus a prefix-scan query method):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * HBase utility class
 */
public class HBaseUtils {
    HBaseAdmin admin = null;
    Configuration configration = null;
    /**
     * Private constructor (singleton)
     */
    private HBaseUtils(){
        configration = new Configuration();
        configration.set("hbase.zookeeper.quorum", "s201:2181");
        configration.set("hbase.rootdir", "hdfs://s201/hbase");
        try {
            admin = new HBaseAdmin(configration);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    private static HBaseUtils instance = null;
    /**
     * Get the singleton instance
     * @return
     */
    public static synchronized HBaseUtils getInstance(){
        if(null == instance){
            instance = new HBaseUtils();
        }
        return instance;
    }
    /**
     * Get an HTable instance by table name
     * @param tableName
     * @return
     */
    public HTable getTable(String tableName){
        HTable table = null;
        try {
            table = new HTable(configration,tableName);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return table;
    }
    /**
     * Insert a single record into an HBase table
     *
     * @param tableName HBase table name
     * @param rowkey rowkey of the HBase table
     * @param cf column family of the HBase table
     * @param column column of the HBase table
     * @param value value to write into the HBase table
     */
    public void put(String tableName,String rowkey,String cf,String column,String value){
        HTable table = getTable(tableName);
        Put put = new Put(Bytes.toBytes(rowkey));
        put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
        try {
            table.put(put);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Query click counts from HBase by table name and rowkey-prefix condition
     */
    public Map<String, Long> query(String tableName, String condition) throws IOException {
        Map<String, Long> map = new HashMap<>();
        HTable table = getTable(tableName);
        String cf = "info";
        String qualifier = "click_count";
        Scan scan = new Scan();
        Filter filter = new PrefixFilter(Bytes.toBytes(condition));
        scan.setFilter(filter);
        ResultScanner rs = table.getScanner(scan);
        for (Result result : rs) {
            String row = Bytes.toString(result.getRow());
            // the counts were written with incrementColumnValue, so each cell holds an 8-byte long
            long clickCount = Bytes.toLong(result.getValue(cf.getBytes(), qualifier.getBytes()));
            map.put(row, clickCount);
        }
        return map;
    }
    public static void main(String[] args) throws IOException {
        Map<String, Long> map = HBaseUtils.getInstance().query("category_clickcount",
                "20171122");
        for (Map.Entry<String, Long> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " : " + entry.getValue());
        }
    }
}

Domain and DAO development for category click counts

/**
 * Entity class for category click counts
 */
public class CategoryClickCount {
    private String name;
    private long value;
    public void setName(String name) {
        this.name = name;
    }
    public void setValue(long value) {
        this.value = value;
    }
    public long getValue() {
        return value;
    }
    public String getName() {
        return name;
    }
}

DAO implementation. Because the underlying HBase query uses a rowkey PrefixFilter, passing a prefix such as "2017" or "20171122" returns every category row whose rowkey starts with it.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class CategoryClickCountDAO {
    public List<CategoryClickCount> query(String day) throws IOException {
        List<CategoryClickCount> list = new ArrayList<>();
        Map<String,Long> map = HBaseUtils.getInstance().query("category_clickcount",day);
        for (Map.Entry<String, Long> entry : map.entrySet()) {
            CategoryClickCount categoryClickCount = new CategoryClickCount();
            categoryClickCount.setName(entry.getKey());
            categoryClickCount.setValue(entry.getValue());
            list.add(categoryClickCount);
        }
        return list;
    }
    public static void main(String[] args) throws IOException {
        CategoryClickCountDAO dao = new CategoryClickCountDAO();
        List<CategoryClickCount> list = dao.query("2017");
        for (CategoryClickCount c : list) {
            System.out.println(c.getValue());
        }
    }
}


The Spring Boot controller:

import com.studty.dao.CourseClickCountDAO;
import com.studty.domain.CourseClickCount;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.servlet.ModelAndView;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

@RestController
public class SparkStatApp {
    private static Map<String,String> courses = new HashMap<>();
    static {
        courses.put("112","偶像爱情");
        courses.put("128","宫斗谋权");
        courses.put("145","玄幻史诗");
        courses.put("146", "都市生活");
        courses.put("131", "罪案谍战");
        courses.put("130", "历险科幻");
    }
    @Autowired
    CourseClickCountDAO courseClickCountDAO;
    @RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.POST)
    @ResponseBody
    public List<CourseClickCount> courseClickCount() throws Exception {
        List<CourseClickCount> list = courseClickCountDAO.query("20171111");
        for(CourseClickCount model:list){
            model.setName(courses.get(model.getName().substring(9)));
        }
        return list;
    }
    @RequestMapping(value = "/echarts", method = RequestMethod.GET)
    public ModelAndView echarts(){
        return new ModelAndView("echarts");
    }
}
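
The /echarts endpoint serves the template page; from there the page's JavaScript can request /course_clickcount_dynamic (for example on a timer) and feed the returned name/value list directly into an ECharts series, which is how the dynamic visualization listed in the bullets above is driven.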
