Computing the mean, median, and mode in Scala (Spark SQL)

package hive

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
  * Requirement test: depending on the user's choice, fill the missing values of a
  * given column with its median, mean, or mode, or with a user-specified fixed value.
  */
object NullAdd {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = getSpark
    spark.sparkContext.setLogLevel("WARN") // log verbosity
    val dataFrame: DataFrame = spark.sql("select * from test.freq3")
    dataFrame.createOrReplaceTempView("temp")

    val column = "areabeamd" // column whose missing values are to be filled
    val ntype = "mode"       // fill strategy: avg / percentile / mode / fixedvalue
    val fixedvalue = "0"     // replacement used when ntype == "fixedvalue"

    // all columns except the target one, prepended to the select list
    val otherColumns: Array[String] = dataFrame.columns.filter(_ != column)
    val prefix: String = if (otherColumns.nonEmpty) otherColumns.mkString(",") + "," else ""

    // pattern matching replaces the original eq/equals chain; note that
    // eq is reference equality in Scala and must not be used to compare strings
    val hql: String = ntype match {
      // mean over the whole table, rounded up
      case "avg" =>
        s"select $prefix case when $column is null then ceiling(avg($column) over(rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
      // median: percentile 0.5 over the whole table
      case "percentile" =>
        s"select $prefix case when $column is null then ceiling(percentile($column,0.5) over(rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
      // mode: a non-null value whose group count reaches the maximum group count;
      // order by breaks ties deterministically, and the derived table needs an alias
      case "mode" =>
        s"select $prefix case when $column is null then (select $column from temp where $column is not null group by $column having count(*) >= (select max(num) from (select count($column) as num from temp where $column is not null group by $column) t) order by $column limit 1) else $column end as $column from temp"
      // user-specified constant
      case "fixedvalue" =>
        s"select $prefix case when $column is null then $fixedvalue else $column end as $column from temp"
      case other =>
        sys.error(s"unknown fill type: $other")
    }
    val datatable: DataFrame = spark.sql(hql)
    datatable.show()
  }
  // build the Spark session
  def getSpark: SparkSession = {
    val conf = new SparkConf().setMaster("local[*]")
    SparkSession.builder()
      .config(conf)
      .config("spark.sql.warehouse.dir", "hdfs://hadoop102:9000/user/hive/warehouse")
      .enableHiveSupport()
      .getOrCreate()
  }
}
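
The string-built SQL above is easy to get wrong (quoting, subquery aliases, the eq-vs-equals pitfall), so for comparison the same fill strategies can be sketched with the DataFrame API. This is a minimal sketch, not the original code: it assumes the target column is numeric, and `NullAddDfApi`, `fillNulls`, and the `strategy` parameter are illustrative names mirroring the `ntype` switch above.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

object NullAddDfApi {
  // Sketch of a helper (assumed, not in the original post): fill nulls in a
  // numeric column with its mean, median, or mode via the DataFrame API.
  def fillNulls(df: DataFrame, targetCol: String, strategy: String): DataFrame = {
    val replacement: Double = strategy match {
      case "avg" =>
        // mean of the non-null values, rounded up like ceiling() in the SQL version
        math.ceil(df.agg(avg(col(targetCol))).first().getDouble(0))
      case "percentile" =>
        // median; relativeError = 0.0 makes approxQuantile exact (at higher cost)
        df.stat.approxQuantile(targetCol, Array(0.5), 0.0).head
      case "mode" =>
        // most frequent non-null value; secondary sort breaks ties deterministically
        df.filter(col(targetCol).isNotNull)
          .groupBy(targetCol).count()
          .orderBy(desc("count"), col(targetCol))
          .first().getAs[Number](0).doubleValue()
      case other => sys.error(s"unknown fill type: $other")
    }
    df.na.fill(Map(targetCol -> replacement)) // fills nulls in targetCol only
  }
}

Usage would look like fillNulls(spark.table("test.freq3"), "areabeamd", "mode"). Computing the statistic in Scala and handing it to na.fill avoids the correlated-looking scalar subquery of the mode query entirely.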
