package hive

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable.ArrayBuffer
/**
 * Requirement test: depending on the user's choice, fill missing values in a
 * specified column with that column's mean, median, or mode, or with a fixed
 * value.
 */
object NullAdd {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = getSpark
    spark.sparkContext.setLogLevel("WARN") // log verbosity
    val dataFrame: DataFrame = spark.sql("select * from test.freq3")
    dataFrame.createOrReplaceTempView("temp")
    val columns: Array[String] = dataFrame.columns

    val column = "areabeamd" // target column whose nulls are filled
    val ntype = "mode"       // fill strategy: avg | percentile | mode | fixedvalue
    val fixedvalue = "0"     // replacement used when ntype == "fixedvalue"

    // collect every column except the target one
    val array = new ArrayBuffer[String]()
    for (i <- columns.indices) {
      if (columns(i) != column) {
        array.append(columns(i))
      }
    }
    var datatable: DataFrame = null
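    // The frame "rows between unbounded preceding and unbounded following"
    // in the SQL built below makes avg()/percentile() window aggregates over
    // the whole table (there is no partition by), so every row sees the same
    // mean/median; ceiling() then rounds the replacement up to an integer.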
    if (array.nonEmpty) {
      val col: String = array.mkString(",")
      // mean
      if (ntype.equals("avg")) {
        val hql =
          s"""select $col,
             |  case when $column is null
             |    then ceiling(avg($column) over(rows between unbounded preceding and unbounded following))
             |    else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      // median
      } else if (ntype.equals("percentile")) {
        val hql =
          s"""select $col,
             |  case when $column is null
             |    then ceiling(percentile($column, 0.5) over(rows between unbounded preceding and unbounded following))
             |    else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      // mode: the most frequent non-null value; order by ... limit 1 breaks
      // ties deterministically
      } else if (ntype.equals("mode")) {
        val hql =
          s"""select $col,
             |  case when $column is null then (
             |    select $column from temp
             |    where $column is not null
             |    group by $column
             |    having count(*) >= (
             |      select max(num) from (
             |        select count($column) as num from temp
             |        where $column is not null group by $column))
             |    order by $column limit 1)
             |  else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      // fixed value
      } else if (ntype.equals("fixedvalue")) {
        val hql =
          s"""select $col,
             |  case when $column is null then $fixedvalue
             |  else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      }
    } else {
      // the target column is the only column in the table
      if (ntype.equals("avg")) {
        val hql =
          s"""select case when $column is null
             |    then ceiling(avg($column) over(rows between unbounded preceding and unbounded following))
             |    else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      } else if (ntype.equals("percentile")) {
        val hql =
          s"""select case when $column is null
             |    then ceiling(percentile($column, 0.5) over(rows between unbounded preceding and unbounded following))
             |    else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      } else if (ntype.equals("mode")) {
        val hql =
          s"""select case when $column is null then (
             |    select $column from temp
             |    where $column is not null
             |    group by $column
             |    having count(*) >= (
             |      select max(num) from (
             |        select count($column) as num from temp
             |        where $column is not null group by $column))
             |    order by $column limit 1)
             |  else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      } else if (ntype.equals("fixedvalue")) {
        val hql =
          s"""select case when $column is null then $fixedvalue
             |  else $column end as $column
             |from temp""".stripMargin
        datatable = spark.sql(hql)
        datatable.show()
      }
    }
  }
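  // With the defaults above (column = "areabeamd", ntype = "mode") the
  // submitted statement looks like this (the other columns elided as ...):
  //   select ..., case when areabeamd is null then (
  //       select areabeamd from temp
  //       where areabeamd is not null
  //       group by areabeamd
  //       having count(*) >= (select max(num) from
  //         (select count(areabeamd) as num from temp
  //          where areabeamd is not null group by areabeamd))
  //       order by areabeamd limit 1)
  //     else areabeamd end as areabeamd
  //   from temp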
  // Build the Spark session: local mode with Hive support.
  def getSpark: SparkSession = {
    val conf = new SparkConf().setMaster("local[*]")
    val spark = SparkSession.builder()
      .config(conf)
      .config("spark.sql.warehouse.dir", "hdfs://hadoop102:9000/user/hive/warehouse")
      .enableHiveSupport()
      .getOrCreate()
    spark
  }
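  // Minimal alternative sketch, not part of the original flow: for the
  // "fixedvalue" strategy on a numeric column, Spark's built-in
  // DataFrameNaFunctions can replace the hand-built SQL. fillFixed is a
  // hypothetical helper added for illustration; na.fill only touches columns
  // whose type matches the supplied value.
  def fillFixed(df: DataFrame, column: String, value: Double): DataFrame =
    df.na.fill(value, Seq(column))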
}