Real-Time Project Statistics in Practice (Part 3)
I. Overview
Sample code for this part:
https://github.com/csy512889371/learndemo/tree/master/spark-streaming-kafka
II. Feature Development
- Goal: the number of visits per category, from the start of today up to now
- Analysis: aggregate by yyyyMMdd + categoryId
Processing flow:
1. Spark Streaming writes the aggregated results into a database.
2. That database stores our statistics.
3. The visualization front end queries the database by yyyyMMdd + categoryId and renders the results.
Which database should we choose to store the statistics?
1. Relational database (RDBMS): MySQL, Oracle
day       categoryId  click_count
20171117  1           10
20171117  2           19
- When the next batch arrives, we first have to read the current value for (20171117, 1), i.e. 10, add the new count, and write the sum back.
2. Non-relational database: HBase, Redis
With HBase a single API call handles that read-increment-write, which is very convenient (see the sketch below).
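A minimal sketch of that single call, assuming the same classic (pre-1.0) HTable client API that the utility class below uses; the table, rowkey and column names follow the design in the next section, and the demo object itself is hypothetical:
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.util.Bytes
object IncrementDemo {  // hypothetical demo object, not part of the project code
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "s201:2181")   // same quorum as the utility class below
    val table = new HTable(conf, "category_clickcount")
    // one call performs read + add + write atomically on the region server
    table.incrementColumnValue(
      Bytes.toBytes("20171117_1"),    // rowkey: day_categoryId
      Bytes.toBytes("info"),          // column family
      Bytes.toBytes("click_count"),   // qualifier
      10L)                            // amount to add
    table.close()
  }
}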
III. Using HBase
./hbase shell
list                                # list the existing tables
Create the table:
create 'category_clickcount','info'
Rowkey design: day_categoryId
desc 'category_clickcount'
scan 'category_clickcount'          # inspect the table
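You can also exercise the counter directly from the shell before wiring up any code; an illustrative session (the rowkey and amount are made up):
incr 'category_clickcount', '20171122_1', 'info:click_count', 1
get_counter 'category_clickcount', '20171122_1', 'info:click_count'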
A utility class for working with HBase:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
public class HBaseUtils {
HBaseAdmin admin = null;
Configuration configration = null;
/**
* Private constructor
*/
private HBaseUtils(){
configration = new Configuration();
configration.set("hbase.zookeeper.quorum","s201:2181");
configration.set("hbase.rootdir","hdfs://s201/hbase");
try {
admin = new HBaseAdmin(configration);
} catch (IOException e) {
e.printStackTrace();
}
}
private static HBaseUtils instance = null;
public static synchronized HBaseUtils getInstance(){
if(null == instance){
instance = new HBaseUtils();
}
return instance;
}
/**
* Get an HTable instance for the given table name
*/
public HTable getTable(String tableName){
HTable table = null;
try {
table = new HTable(configration,tableName);
} catch (IOException e) {
e.printStackTrace();
}
return table;
}
/**
* Add a record to an HBase table
* @param tableName HBase table name
* @param rowkey rowkey of the record
* @param cf column family
* @param column column (qualifier)
* @param value value to write
*/
public void put(String tableName,String rowkey,String cf,String column,String value){
HTable table = getTable(tableName);
Put put = new Put(Bytes.toBytes(rowkey));
put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
try {
table.put(put);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
//HTable table = HBaseUtils.getInstance().getTable("category_clickcount");
//System.out.println(table.getName().getNameAsString());
String tableName = "category_clickcount";
String rowkey = "20271111_88";
String cf="info";
String column ="click_count";
String value = "2";
HBaseUtils.getInstance().put(tableName,rowkey,cf,column,value);
}
}
IV. Data Cleansing with Spark Streaming
The date/time utility class. The raw log lines to be cleansed look like this (fields: ip, time, request, referer, status code):
156.187.29.132 2017-11-20 00:39:26 "GET /www/2 HTTP/1.0" - 200
30.187.124.132 2017-11-20 00:39:26 "GET /www/2 HTTP/1.0" - 302
DataUtils.scala
import java.util.Date
import org.apache.commons.lang3.time.FastDateFormat
object DataUtils {
val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
val TAG_FORMAT = FastDateFormat.getInstance("yyyyMMdd")
/**
* Convert a time string in yyyy-MM-dd HH:mm:ss format to a timestamp (milliseconds)
* @param time
* @return
*/
def getTime(time:String) ={
YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
}
def parseToMin(time:String) ={
TAG_FORMAT.format(new Date(getTime(time)))
}
def main(args: Array[String]): Unit = {
print(parseToMin("2017-11-20 00:39:26"))
}
}
Analyzing the data
Cleanse the log lines and filter out the records we don't need:
val cleanData = logs.map(line =>{
val infos = line.split("\t")
val url = infos(2).split(" ")(1)
var categaryId = 0
//extract the category id (the iQIYI-style category number) from /www/<id> URLs
if(url.startsWith("/www")){
categaryId = url.split("/")(2).toInt
}
ClickLog(infos(0),DataUtils.parseToMin(infos(1)),categaryId,infos(3),infos(4).toInt)
}).filter(clickLog=>clickLog.categaryId!=0)
cleanData.print()
The ClickLog case class (one cleansed click record):
case class ClickLog(ip:String,time:String,categaryId:Int,refer:String,statusCode:Int)
Sample cleansed records:
ClickLog(187.124.156.143,20171122,6,-,404)
ClickLog(29.132.10.100,20171122,1,-,302)
ClickLog(100.167.30.124,20171122,3,-,404)
ClickLog(30.10.167.132,20171122,4,-,200)
ClickLog(132.124.100.143,20171122,4,-,200)
ClickLog(167.132.10.156,20171122,4,http://cn.bing.com/search?q=幸福满院,302)
ClickLog(29.143.10.187,20171122,3,-,302)
Writing the counts to HBase through the API
case class CategaryClickCount(categaryID:String,clickCout:Int)
import cn.ctoedu.spark.domain.CategaryClickCount
import cn.ctoedu.spark.utils.HBaseUtils
import org.apache.hadoop.hbase.client.{Get, HTable}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable.ListBuffer
/**
* DAO for the per-category click counts stored in HBase
*/
object CategaryClickCountDAO {
val tableName = "category_clickcount"
val cf = "info"
val qualifer = "click_count"
/**
* Save the counts (one increment per entry)
* @param list
*/
def save(list:ListBuffer[CategaryClickCount]): Unit ={
val table = HBaseUtils.getInstance().getTable(tableName)
for(els <- list){
table.incrementColumnValue(Bytes.toBytes(els.categaryID),Bytes.toBytes(cf),Bytes.toBytes(qualifer),els.clickCout);
}
}
def count(day_categary:String) : Long={
val table = HBaseUtils.getInstance().getTable(tableName)
val get = new Get(Bytes.toBytes(day_categary))
val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
if(value == null){
0L
}else{
Bytes.toLong(value)
}
}
def main(args: Array[String]): Unit = {
val list = new ListBuffer[CategaryClickCount]
list.append(CategaryClickCount("20171122_1",300))
//list.append(CategaryClickCount("20171122_9", 60))
//list.append(CategaryClickCount("20171122_10", 160))
save(list)
// print(count("20171122_10")+"---")
}
}
Persist the aggregated counts to HBase from the streaming job:
cleanData.map(log=>{
(log.time.substring(0,8)+"_"+log.categaryId,1)
}).reduceByKey(_+_).foreachRDD(rdd=>{
rdd.foreachPartition(partitionRecords=>{
val list = new ListBuffer[CategaryClickCount]
partitionRecords.foreach(pair=>{
list.append(CategaryClickCount(pair._1,pair._2))
})
CategaryClickCountDAO.save(list)
})
})
Feature 2: the same statistic as Feature 1, but only for traffic referred from search engines.
HBase table design:
create 'category_search_clickcount','info'
Rowkey design: day_search_category (e.g. 20171122_www.sogou.com_2), again driven by the business requirements. The wrapper case class:
case class CategarySearchClickCount(day_search_categary:String, clickCount:Int)
The DAO class:
import cn.ctoedu.spark.domain.{CategaryClickCount, CategarySearchClickCount}
import cn.ctoedu.spark.utils.HBaseUtils
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable.ListBuffer
/**
* DAO for the per-search-engine, per-category click counts stored in HBase
*/
object CategarySearchClickCountDAO {
val tableName = "category_search_clickcount"
val cf = "info"
val qualifer = "click_count"
/**
* Save the counts (one increment per entry)
* @param list
*/
def save(list:ListBuffer[CategarySearchClickCount]): Unit ={
val table = HBaseUtils.getInstance().getTable(tableName)
for(els <- list){
table.incrementColumnValue(Bytes.toBytes(els.day_search_categary),Bytes.toBytes(cf),Bytes.toBytes(qualifer),els.clickCount)
}
}
def count(day_categary:String) : Long={
val table = HBaseUtils.getInstance().getTable(tableName)
val get = new Get(Bytes.toBytes(day_categary))
val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
if(value == null){
0L
}else{
Bytes.toLong(value)
}
}
def main(args: Array[String]): Unit = {
val list = new ListBuffer[CategarySearchClickCount]
//list.append(CategarySearchClickCount("20171122_1_1",300))
list.append(CategarySearchClickCount("20171122_2_1", 300))
list.append(CategarySearchClickCount("20171122_1_2", 1600))
// save(list)
print(count("20171122_www.sogou.com_2")+"---" )
}
}
Implementing this in the streaming job:
cleanData.map(log=>{
val referer = log.refer.replace("//","/")
val splits = referer.split("/")
var host = ""
if(splits.length > 2){
host = splits(1)
}
(host,log.categaryId,log.time)
}).filter(_._1!="").map(x =>{
(x._3.substring(0,8)+"_"+x._1+"_"+x._2,1)
}).reduceByKey(_+_).foreachRDD(rdd=>{
rdd.foreachPartition(partitionRecords=>{
val list = new ListBuffer[CategarySearchClickCount]
partitionRecords.foreach(pair=>{
list.append(CategarySearchClickCount(pair._1,pair._2))
})
CategarySearchClickCountDAO.save(list)
})
})
V. Running the Code in Production
1. Packaging
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<!-- replace this with the class that contains your jar's main method -->
<mainClass>cn.ctoedu.spark.project.StatStreamingApp</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for inheritance merges -->
<phase>package</phase> <!-- bind the jar-merging (assembly) step to the package phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
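With the assembly plugin bound to the package phase, building the fat jar is a single command; the exact artifact name depends on your artifactId and version, and this assumes the Scala compiler plugin is also configured in the build:
mvn clean package -DskipTests
# produces target/<artifactId>-<version>-jar-with-dependencies.jar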
2. Upload the jar to the target server.
3. Run:
spark-submit --master spark://s201:7077 --class com.study.spark.project.StatStreamingApp
/home/centos/dowload/SparkTrain.jar
4. Start kafka, flume, hbase, hadoop and spark (a hedged sketch of the start-up commands follows the submission script below).
5. Clear the old data in HBase: truncate 'table_name'
6. Run the log generator.
Submission script:
/soft/spark/bin/spark-submit \
--class com.study.spark.project.StatStreamingApp \
/home/centos/SparkTrain-1.0-jar-with-dependencies.jar
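A sketch of the start-up commands from step 4, assuming installs under /soft (the layout used elsewhere in this article); the Flume agent name and config file are placeholders for your own setup:
# HDFS
/soft/hadoop/sbin/start-dfs.sh
# HBase (expects ZooKeeper, here assumed at s201:2181)
/soft/hbase/bin/start-hbase.sh
# Kafka broker
/soft/kafka/bin/kafka-server-start.sh -daemon /soft/kafka/config/server.properties
# Flume agent that ships the access logs into Kafka (agent name and conf file are placeholders)
/soft/flume/bin/flume-ng agent --name a1 --conf /soft/flume/conf \
  --conf-file /soft/flume/conf/streaming.conf -Dflume.root.logger=INFO,console
# Spark standalone cluster
/soft/spark/sbin/start-all.sh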
VI. Visualization
- Build the web project with Spring Boot
- Build a static-data visualization with Echarts
- Build a dynamic-data visualization with Echarts
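The web side reuses the HBase utility class, extended with a prefix-scan query method: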
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* HBase utility class for the web project
*/
public class HBaseUtils {
HBaseAdmin admin = null;
Configuration configration = null;
/**
* Private constructor
*/
private HBaseUtils(){
configration = new Configuration();
configration.set("hbase.zookeeper.quorum", "s201:2181");
configration.set("hbase.rootdir", "hdfs://s201/hbase");
try {
admin = new HBaseAdmin(configration);
} catch (IOException e) {
e.printStackTrace();
}
}
private static HBaseUtils instance = null;
/**
* Get the singleton instance
* @return
*/
public static synchronized HBaseUtils getInstance(){
if(null == instance){
instance = new HBaseUtils();
}
return instance;
}
/**
* Get an HTable instance for the given table name
* @param tableName
* @return
*/
public HTable getTable(String tableName){
HTable table = null;
try {
table = new HTable(configration,tableName);
} catch (Exception e) {
e.printStackTrace();
}
return table;
}
/**
* Add a record to an HBase table
*
* @param tableName HBase table name
* @param rowkey rowkey of the record
* @param cf column family
* @param column column (qualifier)
* @param value value to write
*/
public void put(String tableName,String rowkey,String cf,String column,String value){
HTable table = getTable(tableName);
Put put = new Put(Bytes.toBytes(rowkey));
put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
try {
table.put(put);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Query the click counts from HBase by table name and rowkey-prefix condition
*/
public Map<String, Long> query(String tableName, String condition) throws IOException {
Map<String, Long> map = new HashMap<>();
HTable table = getTable(tableName);
String cf = "info";
String qualifier = "click_count";
Scan scan = new Scan();
Filter filter = new PrefixFilter(Bytes.toBytes(condition));
scan.setFilter(filter);
ResultScanner rs = table.getScanner(scan);
for (Result result : rs) {
String row = Bytes.toString(result.getRow());
// counters written via incrementColumnValue are stored as 8-byte longs, not strings
long clickCount = Bytes.toLong(result.getValue(cf.getBytes(), qualifier.getBytes()));
map.put(row, clickCount);
}
return map;
}
public static void main(String[] args) throws IOException {
Map<String, Long> map = HBaseUtils.getInstance().query("category_clickcount",
"20171022");
for (Map.Entry<String, Long> entry : map.entrySet()) {
System.out.println(entry.getKey() + " : " + entry.getValue());
}
}
}
Domain and DAO development for the category page views
/**
* Entity class for the per-category click counts
*/
public class CategoryClickCount {
private String name;
private long value;
public void setName(String name) {
this.name = name;
}
public void setValue(long value) {
this.value = value;
}
public long getValue() {
return value;
}
public String getName() {
return name;
}
}
DAO development:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class CategoryClickCountDAO {
public List<CategoryClickCount> query(String day) throws IOException {
List<CategoryClickCount> list = new ArrayList<>();
Map<String,Long> map = HBaseUtils.getInstance().query("category_clickcount",day);
for (Map.Entry<String, Long> entry : map.entrySet()) {
CategoryClickCount categoryClickCount = new CategoryClickCount();
categoryClickCount.setName(entry.getKey());
categoryClickCount.setValue(entry.getValue());
list.add(categoryClickCount);
}
return list;
}
public static void main(String[] args) throws IOException {
CategoryClickCountDAO dao = new CategoryClickCountDAO();
List<CategoryClickCount> list = dao.query("2017");
for (CategoryClickCount c : list) {
System.out.println(c.getValue());
}
}
}
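The Spring Boot REST controller that exposes the statistics to the front end (it maps the raw category ids to display names):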
import com.studty.dao.CategoryClickCountDAO;
import com.studty.domain.CategoryClickCount;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.servlet.ModelAndView;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@RestController
public class SparkStatApp {
private static Map<String,String> courses = new HashMap<>();
static {
courses.put("112","偶像爱情");
courses.put("128","宫斗谋权");
courses.put("145","玄幻史诗");
courses.put("146", "都市生活");
courses.put("131", "罪案谍战");
courses.put("130", "历险科幻");
}
@Autowired
CategoryClickCountDAO categoryClickCountDAO;
@RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.POST)
@ResponseBody
public List<CategoryClickCount> categoryClickCount() throws Exception {
List<CategoryClickCount> list = categoryClickCountDAO.query("20171111");
for(CategoryClickCount model:list){
model.setName(courses.get(model.getName().substring(9)));
}
return list;
}
@RequestMapping(value = "/echarts", method = RequestMethod.GET)
public ModelAndView echarts(){
return new ModelAndView("echarts");
}
}
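Because the entity exposes name/value getters, the JSON returned by /course_clickcount_dynamic can be fed straight into an Echarts pie or bar series; an illustrative response (the values are made up):
[
  {"name": "偶像爱情", "value": 128},
  {"name": "宫斗谋权", "value": 96}
]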