For the data source used below (the DataFrame named data), see my blog post: http://blog.youkuaiyun.com/hadoop_spark_storm/article/details/53412598
import org.apache.spark.sql.DataFrameStatFunctions
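The snippets in this post run in spark-shell, where spark.implicits._ and org.apache.spark.sql.functions._ are already in scope. As a minimal sketch of how data might be loaded, assuming a local CSV copy of the affairs dataset (the path and options here are hypothetical; see the linked post for the actual source):

// Hypothetical load; adjust the path and options to your copy of the dataset.
val data = spark.read
  .option("header", "true")       // first row contains the column names
  .option("inferSchema", "true")  // infer numeric types for columns such as age
  .csv("/path/to/affairs.csv")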
Viewing the set of frequent items in each column
val colArray1 = Array("affairs", "gender", "age", "yearsmarried")
data.stat.freqItems(colArray1).show(10, truncate = false)
+-------------------------------+----------------+------------------------------------------------------+-----------------------------------------------+
|affairs_freqItems |gender_freqItems|age_freqItems |yearsmarried_freqItems |
+-------------------------------+----------------+------------------------------------------------------+-----------------------------------------------+
|[2.0, 7.0, 1.0, 3.0, 12.0, 0.0]|[male, female] |[32.0, 47.0, 22.0, 52.0, 37.0, 17.5, 27.0, 57.0, 42.0]|[0.75, 0.125, 1.5, 0.417, 4.0, 7.0, 10.0, 15.0]|
+-------------------------------+----------------+------------------------------------------------------+-----------------------------------------------+
val colArray2 = Array("children", "religiousness", "education", "occupation", "rating")
data.stat.freqItems(colArray2).show(10, truncate = false)
+------------------+-------------------------+-----------------------------------------+-----------------------------------+-------------------------+
|children_freqItems|religiousness_freqItems |education_freqItems |occupation_freqItems |rating_freqItems |
+------------------+-------------------------+-----------------------------------------+-----------------------------------+-------------------------+
|[no, yes] |[2.0, 5.0, 4.0, 1.0, 3.0]|[17.0, 20.0, 14.0, 16.0, 9.0, 18.0, 12.0]|[2.0, 5.0, 4.0, 7.0, 1.0, 3.0, 6.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
+------------------+-------------------------+-----------------------------------------+-----------------------------------+-------------------------+
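Note that freqItems uses the approximate single-pass algorithm of Karp, Schenker, and Papadimitriou, so the result can contain false positives: every item more frequent than the support threshold is returned, but some rarer items may appear as well. The default support is 0.01; a stricter threshold can be passed explicitly. A small sketch against the same data, with 0.4 chosen purely for illustration:

// Only items occurring in roughly 40% of rows or more are kept (approximate,
// possibly with false positives). The support must be greater than 1e-4.
data.stat.freqItems(Array("gender", "children"), 0.4).show(truncate = false)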
Correlation coefficient
val df = Range(0, 10, step = 1).toDF("id").withColumn("rand1", rand(seed = 10)).withColumn("rand2", rand(seed = 27))
df: org.apache.spark.sql.DataFrame = [id: int, rand1: double ... 1 more field]
df.show
+---+-------------------+-------------------+
| id| rand1| rand2|
+---+-------------------+-------------------+
| 0|0.41371264720975787| 0.714105256846827|
| 1| 0.7311719281896606| 0.8143487574232506|
| 2| 0.9031701155118229| 0.5282207324381174|
| 3|0.09430205113458567| 0.4420100497826609|
| 4|0.38340505276222947| 0.9387162206758006|
| 5| 0.5569246135523511| 0.6398126862647711|
| 6| 0.4977441406613893| 0.9895498513115722|
| 7| 0.2076666106201438| 0.3398720242725498|
| 8| 0.9571919406508957|0.15042237695815963|
| 9| 0.7429395461204413| 0.7302723457066639|
+---+-------------------+-------------------+
df.stat.corr("rand1", "rand2", "pearson")
res24: Double = -0.10993962467082698
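"pearson" is currently the only method corr supports, and the two-argument overload defaults to it, so the call can be shortened. As a sanity check, the same coefficient can be recomputed from the sample covariance and standard deviations (a sketch against the same df):

// Equivalent call; Pearson is the default (and only supported) method.
df.stat.corr("rand1", "rand2")

// Pearson's r = cov(x, y) / (stddev(x) * stddev(y)); recompute it by hand.
import org.apache.spark.sql.functions.{covar_samp, stddev_samp}
df.select(covar_samp("rand1", "rand2") / (stddev_samp("rand1") * stddev_samp("rand2"))).show(false)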