package hive
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Requirement test: based on the user's choice, replace null values in a given column
 * with that column's median, mean, or mode, or with a user-specified value.
 */
object NullAdd {
def main(args: Array[String]): Unit = {
val spark: SparkSession = getSpark
spark.sparkContext.setLogLevel("WARN") // log level
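// Load the source Hive table and expose it as a temp view for the SQL below.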
val dataFrame: DataFrame = spark.sql("select * from test.freq3")
dataFrame.createOrReplaceTempView("temp")
val columns: Array[String] = dataFrame.columns
// Parameters: target column, fill strategy, and the fixed fallback value.
val column = "areabeamd"
val ntype = "mode"
val fixedvalue = "0"
// All columns except the target, kept for the projection list.
val others: Array[String] = columns.filter(_ != column)
var datatable: DataFrame = null
if (others.nonEmpty) {
val col: String = others.mkString(",")
// Mean: replace nulls with the column average, rounded up.
if (ntype.equals("avg")) {
val hql = s"select $col, case when $column is null then ceiling(avg($column) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Median: replace nulls with the 50th percentile, rounded up.
} else if (ntype.equals("percentile")) {
val hql = s"select $col, case when $column is null then ceiling(percentile($column, 0.5) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Mode: replace nulls with the most frequent non-null value.
} else if (ntype.equals("mode")) {
val hql = s"select $col, case when $column is null then (select $column from temp where $column is not null group by $column having count(*) >= (select max(num) from (select count($column) as num from temp where $column is not null group by $column)) limit 1) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Fixed value: replace nulls with the user-supplied constant.
} else if (ntype.equals("fixedvalue")) {
val hql = s"select $col, case when $column is null then $fixedvalue else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
}
} else {
// Mean
if (ntype.equals("avg")) {
val hql = s"select case when $column is null then ceiling(avg($column) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Median
} else if (ntype.equals("percentile")) {
val hql = s"select case when $column is null then ceiling(percentile($column, 0.5) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Mode (ordered by the value so ties resolve deterministically)
} else if (ntype.equals("mode")) {
val hql = s"select case when $column is null then (select $column from temp where $column is not null group by $column having count(*) >= (select max(num) from (select count($column) as num from temp where $column is not null group by $column)) order by $column limit 1) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Fixed value
} else if (ntype.equals("fixedvalue")) {
val hql = s"select case when $column is null then $fixedvalue else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
}
}
}
// Build the SparkSession (local mode, Hive support enabled).
def getSpark: SparkSession = {
val conf = new SparkConf()
.setMaster("local[*]")
val spark = SparkSession.builder()
.config(conf)
.config("spark.sql.warehouse.dir", "hdfs://hadoop102:9000/user/hive/warehouse")
.enableHiveSupport()
.getOrCreate()
spark
}
}
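For comparison, the same fill logic can be sketched with the DataFrame API instead of string-built SQL. This is a minimal sketch under assumptions not in the original post: the target column is numeric, it has at least one non-null value, and NullFillSketch/fillNulls are illustrative names.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

object NullFillSketch {
  // Hypothetical helper: compute the fill value once, then apply it with na.fill.
  // Assumes `column` is numeric with at least one non-null value.
  def fillNulls(df: DataFrame, column: String, ntype: String, fixedvalue: Double): DataFrame = {
    val fill: Double = ntype match {
      // Mean, rounded up like ceiling() in the SQL version.
      case "avg" => math.ceil(df.agg(avg(col(column))).first().getAs[Double](0))
      // Exact median: approxQuantile with relative error 0.0.
      case "percentile" => math.ceil(df.stat.approxQuantile(column, Array(0.5), 0.0)(0))
      // Mode: most frequent non-null value; ties broken by the smaller value.
      case "mode" =>
        df.filter(col(column).isNotNull)
          .groupBy(col(column).cast("double").as(column))
          .count()
          .orderBy(desc("count"), col(column))
          .first().getAs[Double](0)
      // Anything else falls back to the user-supplied constant.
      case _ => fixedvalue
    }
    df.na.fill(fill, Seq(column))
  }
}

Computing the statistic up front and filling with a literal keeps the query plans simple and avoids assembling SQL strings by hand.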