import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demo of the chi-squared test from Spark MLlib, applied to a Vector and to Matrices.
 *
 * The chi-squared statistic expresses how far the observed values of a sample deviate
 * from the expected values. The larger the statistic, the larger the deviation; the
 * smaller it is, the closer the observations are to the expectation. A statistic of 0
 * means observed and expected values coincide exactly.
 *
 * Created by Administrator on 2017/2/8.
 */
object ChiSqlTest {

  /**
   * Runs the chi-squared test on one dense vector and two dense matrices and prints
   * each result summary to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      // The runtime class name of a Scala object ends in '$'; drop it for the app name.
      .setAppName(this.getClass.getSimpleName.filterNot(_ == '$'))
    val sc = new SparkContext(conf)
    try {
      // Only WARN and above from here on; start-up INFO messages are already emitted.
      Logger.getRootLogger.setLevel(Level.WARN)

      // Test on a vector of observed values.
      val vd = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0)
      val vResult = Statistics.chiSqTest(vd)
      println(s"向量卡方检测 :$vResult")

      // Test on a 3x2 matrix.
      val mtx = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
      val mtxResult = Statistics.chiSqTest(mtx)
      println(s"矩阵的卡方检测:$mtxResult")

      // Test on a 2x2 matrix, printed through the helper.
      val mtx2 = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
      printChiSqTest(mtx2)
    } finally {
      // Always release the SparkContext, even if one of the tests above throws.
      sc.stop()
    }
  }

  /**
   * Runs the chi-squared test on `matrix` and prints the result summary
   * (method, degrees of freedom, statistic, p-value).
   *
   * @param matrix the matrix to test
   */
  def printChiSqTest(matrix: Matrix): Unit = {
    val mtxResult = Statistics.chiSqTest(matrix)
    println(mtxResult)
  }
}
运行结果
17/04/01 18:55:46 INFO Utils: Successfully started service 'SparkUI' on port 4040.
17/04/01 18:55:46 INFO SparkUI: Started SparkUI at http://121.48.185.192:4040
17/04/01 18:55:46 INFO Executor: Starting executor ID driver on host localhost
17/04/01 18:55:46 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 56289.
17/04/01 18:55:46 INFO NettyBlockTransferService: Server created on 56289
17/04/01 18:55:46 INFO BlockManagerMaster: Trying to register BlockManager
17/04/01 18:55:46 INFO BlockManagerMasterEndpoint: Registering block manager localhost:56289 with 457.9 MB RAM, BlockManagerId(driver, localhost, 56289)
17/04/01 18:55:46 INFO BlockManagerMaster: Registered BlockManager
向量卡方检测 :Chi squared test summary:
method: pearson
degrees of freedom = 4
statistic = 3.333333333333333
pValue = 0.5036682742334986
No presumption against null hypothesis: observed follows the same distribution as expected..
矩阵的卡方检测:Chi squared test summary:
method: pearson
degrees of freedom = 2
statistic = 0.14141414141414144
pValue = 0.931734784568187
No presumption against null hypothesis: the occurrence of the outcomes is statistically independent..
Chi squared test summary:
method: pearson
degrees of freedom = 1
statistic = 0.07936507936507939
pValue = 0.7781596861761658
No presumption against null hypothesis: the occurrence of the outcomes is statistically independent..
Process finished with exit code 0