// 《Spark MLlib 机器学习》第三章代码 ("Spark MLlib Machine Learning", Chapter 3 sample code)
// 3.1 Breeze 介绍 (Introduction to Breeze)
package book_code
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import breeze.linalg._
import breeze.numerics._
import org.apache.spark.mllib.linalg.Vectors
/**
 * Walk-through of the Breeze linear-algebra API: creation, element access
 * and manipulation, numeric / summation / boolean / linear-algebra / rounding
 * functions. Most expressions are evaluated for their value only (run the
 * lines in a REPL to inspect results); a few mutate their operand in place
 * and are marked as such.
 */
object breeze_test01 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("breeze_test01")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    try {
      // 3.1.1 Breeze creation functions
      val m1 = DenseMatrix.zeros[Double](2, 3)      // 2x3 zero matrix
      val v1 = DenseVector.zeros[Double](3)         // zero vector of length 3
      val v2 = DenseVector.ones[Double](3)          // all-ones vector
      val v3 = DenseVector.fill(3) { 5.0 }          // constant-filled vector
      val v4 = DenseVector.range(1, 10, 2)          // 1, 3, 5, 7, 9
      val m2 = DenseMatrix.eye[Double](3)           // 3x3 identity matrix
      val v6 = diag(DenseVector(1.0, 2.0, 3.0))     // diagonal matrix from a vector
      val m3 = DenseMatrix((1.0, 2.0), (3.0, 4.0))  // matrix built from row tuples
      val v8 = DenseVector(1, 2, 3, 4)              // column vector from elements
      val v9 = DenseVector(1, 2, 3, 4).t            // transpose -> row vector
      val v10 = DenseVector.tabulate(3) { i => 2 * i }             // build from index
      val m4 = DenseMatrix.tabulate(3, 2) { case (i, j) => i + j } // build from (row, col)
      val v11 = new DenseVector(Array(1, 2, 3, 4))  // wrap an existing array
      val m5 = new DenseMatrix(2, 3, Array(11, 12, 13, 21, 22, 23)) // column-major fill
      val v12 = DenseVector.rand(4)                 // uniform random vector
      val m6 = DenseMatrix.rand(2, 3)               // uniform random matrix

      // 3.1.2 Breeze element access and manipulation
      // -- element access --
      val a = DenseVector(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
      a(0)              // single element
      a(1 to 4)         // slice
      a(5 to 0 by -1)   // reversed slice
      a(1 to -1)        // negative end index counts from the end
      a(-1)             // last element
      val m = DenseMatrix((1.0, 2.0, 3.0), (3.0, 4.0, 5.0))
      m(0, 1)           // element at row 0, column 1
      m(::, 1)          // entire column 1

      // -- element manipulation --
      val m_1 = DenseMatrix((1.0, 2.0, 3.0), (3.0, 4.0, 5.0))
      m_1.reshape(3, 2)   // same data, new shape (column-major order)
      m_1.toDenseVector   // flatten to a vector
      val m_3 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0))
      lowerTriangular(m_3)
      upperTriangular(m_3)
      m_3.copy
      diag(m_3)                      // extract the diagonal as a vector
      m_3(::, 2) := 5.0              // assign a whole column IN PLACE
      m_3
      m_3(1 to 2, 1 to 2) := 5.0     // assign a sub-matrix IN PLACE
      m_3
      val a_1 = DenseVector(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
      a_1(1 to 4) := 5                       // assign a scalar to a slice IN PLACE
      a_1(1 to 4) := DenseVector(1, 2, 3, 4) // assign a vector to a slice IN PLACE
      a_1
      val a1 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
      val a2 = DenseMatrix((1.0, 1.0, 1.0), (2.0, 2.0, 2.0))
      DenseMatrix.vertcat(a1, a2)   // stack rows
      DenseMatrix.horzcat(a1, a2)   // stack columns
      val b1 = DenseVector(1, 2, 3, 4)
      val b2 = DenseVector(1, 1, 1, 1)
      DenseVector.vertcat(b1, b2)   // concatenate vectors

      // 3.1.3 Breeze numeric functions
      val a_3 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0))
      val b_3 = DenseMatrix((1.0, 1.0, 1.0), (2.0, 2.0, 2.0))
      a_3 + b_3     // element-wise addition
      a_3 :* b_3    // element-wise multiplication
      a_3 :/ b_3    // element-wise division
      a_3 :< b_3    // element-wise less-than (boolean matrix)
      a_3 :== b_3   // element-wise equality (boolean matrix)
      a_3 :+= 1.0   // in-place add: MUTATES a_3
      a_3 :*= 2.0   // in-place multiply: MUTATES a_3
      max(a_3)
      argmax(a_3)   // (row, col) index of the maximum
      DenseVector(1, 2, 3, 4) dot DenseVector(1, 1, 1, 1)

      // 3.1.4 Breeze summation functions
      val a_4 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0))
      sum(a_4)
      sum(a_4, Axis._0)   // column sums
      sum(a_4, Axis._1)   // row sums
      trace(a_4)          // sum of the diagonal
      accumulate(DenseVector(1, 2, 3, 4))   // running (prefix) sums

      // 3.1.5 Breeze boolean functions
      val a_5 = DenseVector(true, false, true)
      val b_5 = DenseVector(false, true, true)
      a_5 :& b_5    // element-wise AND
      a_5 :| b_5    // element-wise OR
      !a_5          // element-wise NOT
      val a_5_2 = DenseVector(1.0, 0.0, -2.0)
      any(a_5_2)    // true if any element is non-zero
      all(a_5_2)    // true only if every element is non-zero

      // 3.1.6 Breeze linear-algebra functions
      val a_6 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0))
      val b_6 = DenseMatrix((1.0, 1.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0))
      a_6 \ b_6     // solve a_6 * X = b_6
      a_6.t         // transpose
      det(a_6)      // determinant
      inv(a_6)      // inverse (NOTE: this particular a_6 is singular, so the
                    // result is numerically meaningless — kept as in the book)
      val svd.SVD(u, s, v) = svd(a_6)   // singular value decomposition
      a_6.rows
      a_6.cols

      // 3.1.7 Breeze rounding functions
      val a_7 = DenseVector(1.2, 0.6, -2.3)
      round(a_7)
      ceil(a_7)
      floor(a_7)
      signum(a_7)
      abs(a_7)
    } finally {
      // Fix: the original never stopped the context; release Spark resources
      // even if one of the demo expressions throws.
      sc.stop()
    }
  }
}
// 3.6 MLlib 分布式矩阵 (MLlib distributed matrices)
// NOTE: this starts a second listing with its own package/import block;
// it belongs in a separate .scala file from the code above.
package book_code
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import breeze.linalg._
import breeze.numerics._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
/**
 * Demonstrates MLlib's distributed RowMatrix: column similarities,
 * per-column summary statistics, covariance, Gramian matrix, principal
 * components, and singular value decomposition.
 */
object rowmatri_test01 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("rowmatri_test01")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    try {
      // 3.6.2 RowMatrix: a row-oriented distributed matrix without
      // meaningful row indices, backed by an RDD of its rows.
      val rdd1 = sc
        .parallelize(Array(
          Array(1.0, 2.0, 3.0, 4.0),
          Array(2.0, 3.0, 4.0, 5.0),
          Array(3.0, 4.0, 5.0, 6.0)))
        .map(f => Vectors.dense(f))
      val RM = new RowMatrix(rdd1)

      // Cosine similarities between columns; the threshold argument trades
      // exactness for speed (DIMSUM sampling), 0.5 here vs. exact default.
      val simic1 = RM.columnSimilarities(0.5)
      val simic2 = RM.columnSimilarities()

      // Per-column summary statistics (max / min / mean, etc.)
      val simic3 = RM.computeColumnSummaryStatistics()
      simic3.max
      simic3.min
      simic3.mean

      val cc1 = RM.computeCovariance               // column covariance matrix
      val cc2 = RM.computeGramianMatrix            // A^T * A
      val pc1 = RM.computePrincipalComponents(3)   // top-3 principal components

      // SVD keeping 4 singular values; computeU = true also materializes U
      // (named argument documents what the bare `true` meant).
      val svd = RM.computeSVD(4, computeU = true)
      val U = svd.U
      U.rows.foreach(println)
      val s = svd.s
      val V = svd.V
    } finally {
      // Fix: the original never stopped the context; release cluster
      // resources even if a computation above fails.
      sc.stop()
    }
  }
}
// 代码和数据地址网盘 (code and data download, Baidu netdisk):
// http://pan.baidu.com/s/1c1J8ZN6