Depending on the business requirements, you generally need a custom UDF to implement this kind of lookup.
package Test
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
 * Use Spark SQL to join the IPs in the access logs against the IP rule database.
 * Advantage: the full IP rule set does not have to be available up front; the same
 * custom-function approach can also look rules up in a relational database, a NoSQL
 * store, a third-party API, and so on.
 */
object SQLIpLocation2 {
  /**
   * Convert a dotted-quad IP string into its decimal (Long) representation.
   *
   * @param ip an IPv4 address string such as "1.2.3.4"
   * @return the address as an unsigned 32-bit value stored in a Long
   */
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }
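  // Worked example: ip2Long("1.2.3.4")
  //   == 1L << 24 | 2L << 16 | 3L << 8 | 4L
  //   == 16909060L (0x01020304)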
  /**
   * Binary search over the IP rules; assumes `lines` is sorted ascending by start IP.
   *
   * @param lines IP rules as (startIpNum, endIpNum, province) tuples
   * @param ip    the IP to locate, in decimal form
   * @return the index of the matching rule, or -1 if no range contains the IP
   */
  def binarySearch(lines: Array[(Long, Long, String)], ip: Long): Int = {
    // Start of the search window
    var low = 0
    // End of the search window
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1) && (ip <= lines(middle)._2))
        return middle
      if (ip < lines(middle)._1)
        high = middle - 1
      else {
        low = middle + 1
      }
    }
    -1
  }
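  // Hypothetical example, with a single rule covering 1.0.1.0/24
  // (the rule values and province name below are made up for illustration):
  //   val rules = Array((16777472L, 16777727L, "Fujian"))
  //   binarySearch(rules, ip2Long("1.0.1.5")) == 0
  //   binarySearch(rules, ip2Long("9.9.9.9")) == -1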
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SQLIpLocation2")
      .master("local[*]")
      .getOrCreate()
    // Read the IP rule data; args(0) is the path to the rule file
    val ipRulesLines: Dataset[String] = spark.read.textFile(args(0))
    // Import the implicit conversions
    import spark.implicits._
    // Parse the IP rule data
    val tpDs: Dataset[(Long, Long, String)] = ipRulesLines.map(line => {
      val fields: Array[String] = line.split("[|]")
      val startNum = fields(2).toLong
      val endNum = fields(3).toLong
      val province = fields(6)
      (startNum, endNum, province)
    })
    // Collect all of the IP rules to the Driver
    val ipRulesInDriver: Array[(Long, Long, String)] = tpDs.collect()
    // Broadcast the rules; this is a blocking call, so execution does not
    // continue until the broadcast has completed
    val broadCastRef: Broadcast[Array[(Long, Long, String)]] = spark.sparkContext.broadcast(ipRulesInDriver)
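    // (The broadcast ships one read-only copy of the rules to each Executor,
    // rather than serializing them into every task closure.)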
    // Read the access log data; args(1) is the path to the log file
    val accessLog: Dataset[String] = spark.read.textFile(args(1))
    // Parse the access logs: lines are '|'-delimited, with the client IP in the second field
    val ipLogs: DataFrame = accessLog.map(line => {
      val fields: Array[String] = line.split("[|]")
      val ip: String = fields(1)
      ip2Long(ip)
    }).toDF("ip_num")
    // Register the IP log DataFrame as a view
    ipLogs.createOrReplaceTempView("v_ip_logs")
    // Define and register a UDF
    // Where is the function defined? (Driver). Where does its logic run? (Executor)
    spark.udf.register("ip_num2Province", (ipNum: Long) => {
      // Get the full set of IP rules broadcast to this Executor
      val rulesInExecutor: Array[(Long, Long, String)] = broadCastRef.value
      val index = binarySearch(rulesInExecutor, ipNum)
      var province = "unknown province"
      if (index != -1)
        province = rulesInExecutor(index)._3
      province
    })
    val result: DataFrame = spark.sql(
      "SELECT ip_num2Province(ip_num) AS province, COUNT(1) AS counts " +
        "FROM v_ip_logs GROUP BY ip_num2Province(ip_num) ORDER BY counts DESC")
    result.show()
    spark.stop()
  }
}
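For reference, the same lookup can be written without registering a SQL function, by wrapping the closure with org.apache.spark.sql.functions.udf and using the DataFrame API. The snippet below is only a minimal sketch under the same assumptions as the job above (it would sit inside main, after broadCastRef and ipLogs have been created); it is not part of the original program:

import org.apache.spark.sql.functions.{col, count, lit, udf}

// Same lookup as ip_num2Province, but as a DataFrame-API UDF
val ipNum2Province = udf((ipNum: Long) => {
  val rules = broadCastRef.value
  val index = binarySearch(rules, ipNum)
  if (index != -1) rules(index)._3 else "unknown province"
})

val result2 = ipLogs
  .withColumn("province", ipNum2Province(col("ip_num")))
  .groupBy("province")
  .agg(count(lit(1)).as("counts"))
  .orderBy(col("counts").desc)

result2.show()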