package demo
import java.text.SimpleDateFormat
import java.util.ArrayList
import java.util.Calendar
import scala.collection.JavaConversions._
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks
object CompareFlowDemo {
// Ids of history paths that were observed again today and accepted by newActivatePath;
// noAvailHisPath skips the history rows whose id is in this set.
var activateRepeatPath: Set[Int] = Set.empty[Int]
/**
 * Demo entry point. Builds a small in-memory history table and a list of paths
 * observed today, then runs the three processing steps on them:
 * duplicate tagging, similarity filtering and history consolidation.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val spark = SparkSession
    .builder()
    .appName("spark dataframe test")
    .master("local")
    .getOrCreate()
  try {
    // Today's date and the date three days ago, both formatted as yyyyMMdd.
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val nowDay: String = dateFormat.format(Calendar.getInstance().getTime())
    val cutoff = Calendar.getInstance()
    cutoff.add(Calendar.DATE, -3)
    val threeDayAgo: String = dateFormat.format(cutoff.getTime())

    // History paths:
    // (path id, path, step count, frequency, creation date, last active date, enabled flag, partition date)
    val repeatHisPathData = spark.createDataFrame(List(
      (1, "a->b->c->d->e->", 5, 182, "20200630", "20200716", 1, "20200717"),
      (2, "h->g->e->", 3, 22, "20200630", "20200710", 1, "20200717")
    )).toDF("flow_id", "paths", "step", "freq", "create_dt", "active_dt", "is_enable", "dt")

    // Paths newly observed today: (path, step count, frequency)
    val repeatNewPathData = new ArrayList[(String, Int, Int)]
    repeatNewPathData.add(("a->b->c->d->e->", 5, 2))
    repeatNewPathData.add(("h->g->e->i->", 4, 2))
    repeatNewPathData.add(("h->g->e->", 3, 2))

    // Step 1: tag each new path with whether it duplicates a history path.
    val afterRepeatCom = isInHistPath(repeatNewPathData, repeatHisPathData)
    println(afterRepeatCom)

    // Step 2: drop new paths that are similar to a stronger active history path.
    val afterSimilarCom = newActivatePath(afterRepeatCom, repeatHisPathData, threeDayAgo)

    // Step 3: consolidate the history paths with the accepted new paths.
    val maxId = repeatHisPathData.groupBy().max("flow_id").first().getInt(0)
    // The consolidated list is computed but not printed, as before. It can be turned
    // into a DataFrame with
    // spark.createDataFrame(completeData).toDF("flow_id","paths","step","freq","create_dt","active_dt","is_enable")
    val completeData = noAvailHisPath(afterSimilarCom, repeatHisPathData, maxId, nowDay, threeDayAgo)
  } finally {
    // Release the local Spark context even when one of the steps throws.
    spark.stop()
  }
}
/**
 * Tags every newly observed path with whether an identical path already exists
 * in the history table.
 *
 * @param inputPath   today's paths as (path, step count, frequency)
 * @param comparePath history table; it must have a `paths` column, and the path id
 *                    and creation date are read from column positions 0 and 4
 * @return one tuple per input path, in input order:
 *         (path, step count, frequency, duplicate flag (1 = duplicate, 0 = new),
 *         id of the matching history path or 0, its creation date or "")
 */
def isInHistPath(inputPath:ArrayList[(String,Int,Int)],comparePath:DataFrame):ArrayBuffer[(String,Int,Int,Int,Int,String)] ={
  val pathAddRepeatFlag = new ArrayBuffer[(String, Int, Int, Int, Int, String)]()
  // Walk the Java list through its own iterator so the loop does not depend on
  // implicit Java-to-Scala collection conversions.
  val newPaths = inputPath.iterator()
  while (newPaths.hasNext) {
    val (path, step, freq) = newPaths.next()
    // Compare through the Column API instead of splicing the path into a SQL string,
    // so a quote character inside the path cannot break the filter expression.
    // take(1) fetches the first match in a single Spark action.
    val firstMatch = comparePath.where(comparePath("paths") === path).take(1).headOption
    val tagged = firstMatch match {
      case Some(hit) => (path, step, freq, 1, hit.get(0).toString.toInt, hit.get(4).toString)
      case None      => (path, step, freq, 0, 0, "")
    }
    pathAddRepeatFlag += tagged
  }
  pathAddRepeatFlag
}
/**
 * Filters today's paths by similarity against the active history paths.
 *
 * A new path is dropped when some enabled history path that was active on or after
 * `noActDayNum` outranks it (more steps, or equal steps and a higher frequency) and
 * the new path is similar to that history path (isSimilar with factor 0.8).
 *
 * Side effect: for every accepted path that duplicates a history path (non-zero
 * history id in field 5), that id is added to `activateRepeatPath`.
 *
 * NOTE(review): a duplicated path is also compared against its own history row, so
 * it is dropped whenever its frequency today is lower than the stored one - confirm
 * that this is intended.
 *
 * @param inputPath      tagged new paths as produced by isInHistPath
 * @param hisComparePath history table; path, step count and frequency are read from
 *                       column positions 1, 2 and 3
 * @param noActDayNum    earliest last-active date (compared as a string against
 *                       `active_dt`) for a history path to take part in the comparison
 * @return the new paths that were accepted, in input order
 */
def newActivatePath(inputPath:ArrayBuffer[(String,Int,Int,Int,Int,String)],hisComparePath:DataFrame,noActDayNum:String):ArrayBuffer[(String,Int,Int,Int,Int,String)] ={
  val addPaths = new ArrayBuffer[(String, Int, Int, Int, Int, String)]
  // Column-based filter: same predicate as before, without string-built SQL.
  val rows = hisComparePath
    .where((hisComparePath("is_enable") === 1) && (hisComparePath("active_dt") >= noActDayNum))
    .collect()
  for (ele <- inputPath) {
    // exists stops at the first history path that both outranks and resembles the new path.
    val superseded = rows.exists { row =>
      val hisStep = row.get(2).toString.toInt
      val outranked = ele._2 < hisStep || (ele._2 == hisStep && ele._3 < row.get(3).toString.toInt)
      outranked && isSimilar(ele._1, row.get(1).toString, 0.8)
    }
    if (!superseded) {
      addPaths += ele
      // A non-zero id means the path duplicates a history path; remember that id.
      if (ele._5 != 0) activateRepeatPath += ele._5
    }
  }
  addPaths
}
/**
 * Decides whether one path is similar to another.
 *
 * Both paths are sequences of page ids separated by "->". The result is true when
 * the share of page ids of `beJudgedPath` that also occur as page ids of `comPath`
 * is at least `factor`.
 *
 * @param beJudgedPath path whose page ids are looked up
 * @param comPath      path the page ids are looked up in
 * @param factor       minimum share (0.0 to 1.0) of matching page ids
 * @return true if the paths count as similar; false when `beJudgedPath` contains
 *         no page id at all
 */
def isSimilar(beJudgedPath:String,comPath:String,factor:Double): Boolean ={
  // String.split already drops trailing empty tokens, so every element left after
  // removing empty ones is a real page id, including the last one.
  val pageIds = beJudgedPath.split("->").filter(_.nonEmpty)
  if (pageIds.isEmpty) {
    // Nothing to compare; this also avoids a division by zero.
    false
  } else {
    // Whole-token lookup, so an id such as "b" does not match inside "ab".
    val comparePageIds = comPath.split("->").toSet
    val matched = pageIds.count(comparePageIds.contains)
    // Floating-point division: an integer ratio would truncate to 0 or 1.
    matched.toDouble / pageIds.length >= factor
  }
}
/**
 * Builds the consolidated path list from the history table and today's accepted paths.
 *
 * History rows whose id is in `activateRepeatPath` are skipped here; they are
 * re-emitted from `inputPath` instead. Every other history row is emitted once,
 * with its enabled flag set to 1 only if all of the following hold:
 *  - the row is currently enabled (`is_enable` is not 0),
 *  - its last-active date is not before `threeDayAgo` (isLater),
 *  - no accepted new path outranks it (more steps, or equal steps and a higher
 *    frequency) while the history path is similar to that new path
 *    (isSimilar with factor 0.8).
 * Otherwise the flag is 0.
 *
 * After the history rows, every path of `inputPath` is appended as enabled with
 * `nowDay` as last-active date. A duplicate of a history path keeps that path's id
 * and creation date; a new path gets the next id after `maxIdValue` and `nowDay`
 * as creation date.
 *
 * @param inputPath   accepted new paths: (path, step count, frequency, duplicate flag,
 *                    history path id or 0, history creation date or "")
 * @param hisAllPath  history table; id, path, step count, frequency, creation date,
 *                    last-active date and enabled flag are read from column
 *                    positions 0 to 6
 * @param maxIdValue  highest path id already in use
 * @param nowDay      today's date as a string
 * @param threeDayAgo cut-off date; history paths last active before it are disabled
 * @return (path id, path, step count, frequency, creation date, last-active date, enabled flag)
 */
def noAvailHisPath(inputPath:ArrayBuffer[(String,Int,Int,Int,Int,String)],hisAllPath:DataFrame,maxIdValue:Int,nowDay:String,threeDayAgo:String): ArrayBuffer[(Int,String,Int,Int,String,String,Int)] ={
  val resultPaths = new ArrayBuffer[(Int, String, Int, Int, String, String, Int)]
  for (row <- hisAllPath.collect()) {
    val flowId = row.get(0).toString.toInt
    // Paths seen again today are handled with the new paths below.
    if (!activateRepeatPath.contains(flowId)) {
      val hisPath = row.get(1).toString
      val hisStep = row.get(2).toString.toInt
      val hisFreq = row.get(3).toString.toInt
      val createDt = row.get(4).toString
      val activeDt = row.get(5).toString
      val wasEnabled = row.get(6).toString.toInt != 0
      // Checks run in the same order as before: enabled flag, expiry, then similarity;
      // && short-circuits, so later checks are skipped once one fails.
      val staysEnabled =
        wasEnabled &&
          !isLater(activeDt, threeDayAgo) &&
          !inputPath.exists { newPath =>
            val outranks = newPath._2 > hisStep || (newPath._2 == hisStep && newPath._3 > hisFreq)
            outranks && isSimilar(hisPath, newPath._1, 0.8)
          }
      resultPaths += ((flowId, hisPath, hisStep, hisFreq, createDt, activeDt, if (staysEnabled) 1 else 0))
    }
  }
  // Append today's paths; brand-new ones get consecutive ids after maxIdValue.
  var nextId = maxIdValue
  for (newPath <- inputPath) {
    if (newPath._5 != 0) {
      resultPaths += ((newPath._5, newPath._1, newPath._2, newPath._3, newPath._6, nowDay, 1))
    } else {
      nextId += 1
      resultPaths += ((nextId, newPath._1, newPath._2, newPath._3, nowDay, nowDay, 1))
    }
  }
  resultPaths
}
/**
 * Tells whether a path has expired, i.e. whether its last-active date lies before
 * the cut-off date.
 *
 * The two values are compared as plain strings, so the result matches calendar
 * order only for sortable formats such as yyyyMMdd.
 *
 * @param dataDay    last-active date of the path
 * @param compareDay cut-off date to compare against
 * @return true if `dataDay` sorts strictly before `compareDay`
 */
def isLater(dataDay:String,compareDay:String) ={
  dataDay.compareTo(compareDay) < 0
}
}