package hive
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Requirement test: based on the user's choice, replace null values in a given column
 * with that column's median, mean, or mode, or with a user-specified value.
 */
object NullAdd {
def main(args: Array[String]): Unit = {
val spark: SparkSession = getSpark
spark.sparkContext.setLogLevel("WARN") // log level
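// Load the source Hive table and expose it as a temp view for the SQL below.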
val dataFrame: DataFrame = spark.sql("select * from test.freq3")
dataFrame.createOrReplaceTempView("temp")
val columns: Array[String] = dataFrame.columns
// Parameters: target column, fill strategy, and the fixed fallback value.
val column = "areabeamd"
val ntype = "mode"
val fixedvalue = "0"
// All columns except the target, kept for the projection list.
val others: Array[String] = columns.filter(_ != column)
var datatable: DataFrame = null
if (others.nonEmpty) {
val col: String = others.mkString(",")
// Mean: replace nulls with the column average, rounded up.
if (ntype.equals("avg")) {
val hql = s"select $col, case when $column is null then ceiling(avg($column) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Median: replace nulls with the 50th percentile, rounded up.
} else if (ntype.equals("percentile")) {
val hql = s"select $col, case when $column is null then ceiling(percentile($column, 0.5) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Mode: replace nulls with the most frequent non-null value.
} else if (ntype.equals("mode")) {
val hql = s"select $col, case when $column is null then (select $column from temp where $column is not null group by $column having count(*) >= (select max(num) from (select count($column) as num from temp where $column is not null group by $column)) limit 1) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Fixed value: replace nulls with the user-supplied constant.
} else if (ntype.equals("fixedvalue")) {
val hql = s"select $col, case when $column is null then $fixedvalue else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
}
} else {
// Mean
if (ntype.equals("avg")) {
val hql = s"select case when $column is null then ceiling(avg($column) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Median
} else if (ntype.equals("percentile")) {
val hql = s"select case when $column is null then ceiling(percentile($column, 0.5) over (rows between unbounded preceding and unbounded following)) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Mode (ordered by the value so ties resolve deterministically)
} else if (ntype.equals("mode")) {
val hql = s"select case when $column is null then (select $column from temp where $column is not null group by $column having count(*) >= (select max(num) from (select count($column) as num from temp where $column is not null group by $column)) order by $column limit 1) else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
// Fixed value
} else if (ntype.equals("fixedvalue")) {
val hql = s"select case when $column is null then $fixedvalue else $column end as $column from temp"
datatable = spark.sql(hql)
datatable.show()
}
}
}
// Build the SparkSession (local mode, Hive support enabled).
def getSpark: SparkSession = {
val conf = new SparkConf()
.setMaster("local[*]")
val spark = SparkSession.builder()
.config(conf)
.config("spark.sql.warehouse.dir", "hdfs://hadoop102:9000/user/hive/warehouse")
.enableHiveSupport()
.getOrCreate()
spark
}
}
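For comparison, the same fill logic can be sketched with the DataFrame API instead of string-built SQL. This is a minimal sketch under assumptions not in the original post: the target column is numeric, it has at least one non-null value, and NullFillSketch/fillNulls are illustrative names.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

object NullFillSketch {
  // Hypothetical helper: compute the fill value once, then apply it with na.fill.
  // Assumes `column` is numeric with at least one non-null value.
  def fillNulls(df: DataFrame, column: String, ntype: String, fixedvalue: Double): DataFrame = {
    val fill: Double = ntype match {
      // Mean, rounded up like ceiling() in the SQL version.
      case "avg" => math.ceil(df.agg(avg(col(column))).first().getAs[Double](0))
      // Exact median: approxQuantile with relative error 0.0.
      case "percentile" => math.ceil(df.stat.approxQuantile(column, Array(0.5), 0.0)(0))
      // Mode: most frequent non-null value; ties broken by the smaller value.
      case "mode" =>
        df.filter(col(column).isNotNull)
          .groupBy(col(column).cast("double").as(column))
          .count()
          .orderBy(desc("count"), col(column))
          .first().getAs[Double](0)
      // Anything else falls back to the user-supplied constant.
      case _ => fixedvalue
    }
    df.na.fill(fill, Seq(column))
  }
}

Computing the statistic up front and filling with a literal keeps the query plans simple and avoids assembling SQL strings by hand.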