spark ml 聚类源码笔记一

最新推荐文章于 2022-04-21 00:07:32 发布

原创最新推荐文章于 2022-04-21 00:07:32 发布 · 1.3k 阅读

2 ·

CC 4.0 BY-SA版权

spark ml 机器学习源码专栏收录该内容

9 篇文章

订阅专栏

本文详细介绍了Spark MLlib中的KMeans聚类算法的源码实现，包括参数设置、重要方法如predict和computeCost，以及fit方法的调用过程。重点讨论了算法的初始化模式、迭代过程、聚类中心的计算和更新策略，展示了如何通过随机或K_MEANS_PARALLEL模式选择初始中心，并在数据集上进行迭代优化以达到最小化成本的目标。

首先是参数

k : 聚类数，默认2

initMode : 初始化算法的参数，可以是RANDOM或K_MEANS_PARALLEL，RANDOM是随机选择初始聚类中心，K_MEANS_PARALLEL是使用算法选择初始聚类中心，也是默认情况

initSteps : K_MEANS_PARALLEL方法迭代步数，默认5

接下来是一些重要的方法

// Predicts which cluster a single feature vector belongs to; simply delegates to parentModel.predict.
private[clustering] def predict(features: Vector): Int = parentModel.predict(features)

// Lists the final cluster centers, as held by parentModel.
def clusterCenters: Array[Vector] = parentModel.clusterCenters

/**
 * Computes the clustering cost of `dataset` by delegating to parentModel.computeCost
 * (described in the original note as the sum of squared distances).
 */
def computeCost(dataset: DataFrame): Double = {
// Checks the features column against VectorUDT (presumably fails if the type does not match).
SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
// Select the features column and unwrap each Row into its Vector.
val data = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point }
// The real work happens in the wrapped model's computeCost.
parentModel.computeCost(data)
}

下面是聚类算法默认的参数

// Default params: k = 2 clusters, at most 20 iterations, K_MEANS_PARALLEL initialization
// with 5 initialization steps, and a convergence tolerance of 1e-4.
setDefault(
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 5,
tol -> 1e-4)

调用fit,填充参数

/**
 * Fits a KMeansModel: extracts the feature vectors from `dataset`, configures an
 * MLlibKMeans from this estimator's params, runs it, and wraps the resulting model.
 */
override def fit(dataset: DataFrame): KMeansModel = {
// Select the features column and unwrap each Row into its Vector.
val rdd = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point }

// Forward k, initMode, initSteps, maxIter, seed and tol to the MLlib implementation.
// Note that no run count is set here.
val algo = new MLlibKMeans()
.setK($(k))
.setInitializationMode($(initMode))
.setInitializationSteps($(initSteps))
.setMaxIterations($(maxIter))
.setSeed($(seed))
.setEpsilon($(tol))
// The key step: algo is an MLlibKMeans; calling run on the vectors produces the underlying model.
val parentModel = algo.run(rdd)
val model = new KMeansModel(uid, parentModel)
// Presumably copies this estimator's param values onto the new model before returning it.
copyValues(model)
}

run方法

/**
 * Runs k-means on `data`: pairs every vector with its L2 norm, hands the pairs to
 * runAlgorithm, and returns the resulting model.
 */
def run(data: RDD[Vector]): KMeansModel = {

// Warn up front when the input RDD itself has no storage level set.
if (data.getStorageLevel == StorageLevel.NONE) {
logWarning("The input data is not directly cached, which may hurt performance if its"
+ " parent RDDs are also uncached.")
}

// Compute the L2 norm (square root of the sum of squares) of every vector and cache the
// result; per the original note it is used many times.
val norms = data.map(Vectors.norm(_, 2.0))
norms.persist()
// Pair each vector with its norm in a VectorWithNorm.
val zippedData = data.zip(norms).map { case (v, norm) =>
new VectorWithNorm(v, norm)
}
// Run the actual algorithm on the vectors-with-norms and get the model back.
val model = runAlgorithm(zippedData)
// Release the cached norms.
norms.unpersist()

// Warn at the end of the run as well, for increased visibility.
if (data.getStorageLevel == StorageLevel.NONE) {
logWarning("The input data was not directly cached, which may hurt performance if its"
+ " parent RDDs are also uncached.")
}
model
}

下面看这个模型是怎么获得的

private def runAlgorithm(data: RDD[VectorWithNorm]): KMeansModel = {

val sc = data.sparkContext

val initStartTime = System.nanoTime()

// Only one run is allowed when initialModel is given
val numRuns = if (initialModel.nonEmpty) {//如果指定了initialModel，它就是初始聚类中心，此时runs只能是1；注意numRuns来自runs参数（并行跑几组相互独立的聚类），并不是之前提到的initSteps——initSteps只是K_MEANS_PARALLEL初始化内部的步数。上面的fit没有设置runs，MLlib中runs的默认值是1
if (runs > 1) logWarning("Ignoring runs; one run is allowed when initialModel is given.")
1
} else {
runs
}

val centers = initialModel match {
case Some(kMeansCenters) => {//如果指定聚类中心
Array(kMeansCenters.clusterCenters.map(s => new VectorWithNorm(s)))//把中心弄成一个带有2范数的VectorWithNorm的数组
}
case None => {//如果没指定
if (initializationMode == KMeans.RANDOM) {//看initializationMode是随机还是用算法，不管用哪个，最后返回都是聚类中心的数组，后续我们分别看这两种方式
initRandom(data)
} else {
initKMeansParallel(data)
}
}
}
val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9
logInfo(s"Initialization with $initializationMode took " + "%.3f".format(initTimeInSeconds) +
" seconds.")

val active = Array.fill(numRuns)(true)//搞两个数组，长度为运行次数，一个用true填充，用来表示是否还需要运算，一个用0填充，用来标示成本
val costs = Array.fill(numRuns)(0.0)

var activeRuns = new ArrayBuffer[Int] ++ (0 until numRuns)//弄一个0到numRuns-1的可变数组，存放尚未收敛的run的编号
var iteration = 0

val iterationStartTime = System.nanoTime()

// Execute iterations of Lloyd's algorithm until all runs have converged
while (iteration < maxIterations && !activeRuns.isEmpty) {//迭代次数不超过maxIterations（按默认是20），并且还有未收敛的run时才继续迭代；后面会知道，收敛的run会被过滤掉，所以activeRuns越来越少
type WeightedPoint = (Vector, Long)//定义一个类型
def mergeContribs(x: WeightedPoint, y: WeightedPoint): WeightedPoint = {//合并贡献值
axpy(1.0, x._1, y._1)//把x的向量加到y的向量上
(y._1, x._2 + y._2)//返回累加后的向量y，以及x、y两边的点数之和
}//这个后面会用到

val activeCenters = activeRuns.map(r => centers(r)).toArray//取出每个步骤的中心数组
val costAccums = activeRuns.map(_ => sc.accumulator(0.0))//每步弄一个累加器，用来累加代价函数

val bcActiveCenters = sc.broadcast(activeCenters)//把中心广播出去，这俩广播和累加用的很经典

// Find the sum and count of points mapping to each center
val totalContribs = data.mapPartitions { points =>//分片算数据
val thisActiveCenters = bcActiveCenters.value//获取每步的聚类中心数组
val runs = thisActiveCenters.length//获取步数
val k = thisActiveCenters(0).length//获取每组聚类中心的个数，也就是要聚成几个类
val dims = thisActiveCenters(0)(0).vector.size//获取聚类中心点的向量长度，也就是原始数据的属性个数

val sums = Array.fill(runs, k)(Vectors.zeros(dims))//搞一个二维数组，行是运行步数，列是聚类中心个数，每个元素是原始数据维度的向量，也就是一条数据
val counts = Array.fill(runs, k)(0L)//同理，再搞一个二维数组，存放每个类别的数据条数

points.foreach { point =>//分片的每条数据
(0 until runs).foreach { i =>//遍历步数
val (bestCenter, cost) = KMeans.findClosest(thisActiveCenters(i), point)//传入每步的聚类中心和固定数据，计算最短距离，返回最好的中心和代价，我们先看findClosest()如何工作的

private[mllib] def findClosest(//返回最近中心的索引和平方距离，注意参数都是带2范数的
centers: TraversableOnce[VectorWithNorm],
point: VectorWithNorm): (Int, Double) = {
var bestDistance = Double.PositiveInfinity//距离弄成正无穷
var bestIndex = 0//索引弄成0
var i = 0
centers.foreach { center =>//每个中心点与给定点的距离
// Since `\|a - b\| \geq |\|a\| - \|b\||`, we can use this lower bound to avoid unnecessary
// distance computation.//通过不等式简化距离计算，有魄力
var lowerBoundOfSqDist = center.norm - point.norm//两个2范数之差，下一行平方后就是平方距离的一个下界（不是用它替代距离）
lowerBoundOfSqDist = lowerBoundOfSqDist * lowerBoundOfSqDist
if (lowerBoundOfSqDist < bestDistance) {//只有下界比当前最优距离还小，这个中心才可能更近，才需要算真实距离
val distance: Double = fastSquaredDistance(center, point)//又调了一个，下面是具体的实现代码

/**
 * Returns the squared Euclidean distance between `v1` and `v2`, given their precomputed
 * L2 norms `norm1` and `norm2`.
 *
 * The identity \|a - b\|_2^2 = \|a\|_2^2 + \|b\|_2^2 - 2 a^T b is mathematically exact; the
 * concern is floating-point cancellation when the two vectors are close. The identity is
 * therefore used only when a bound on its relative error is below `precision`; otherwise
 * the distance is computed directly with Vectors.sqdist.
 *
 * @param v1 first vector
 * @param norm1 L2 norm of `v1`; must be non-negative
 * @param v2 second vector; must have the same size as `v1`
 * @param norm2 L2 norm of `v2`; must be non-negative
 * @param precision acceptable relative error of the result (default 1e-6)
 * @return the squared distance between `v1` and `v2`
 */
private[mllib] def fastSquaredDistance(
    v1: Vector,
    norm1: Double,
    v2: Vector,
    norm2: Double,
    precision: Double = 1e-6): Double = {
  val n = v1.size
  require(v2.size == n)
  require(norm1 >= 0.0 && norm2 >= 0.0)
  val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
  val normDiff = norm1 - norm2
  var sqDist = 0.0
  /*
   * The relative error is
   * <pre>
   * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 + 2 |a^T b|) / ( \|a - b\|_2^2 ),
   * </pre>
   * which is bounded by
   * <pre>
   * 2.0 * EPSILON * ( \|a\|_2^2 + \|b\|_2^2 ) / ( (\|a\|_2 - \|b\|_2)^2 ).
   * </pre>
   * The bound doesn't need the inner product, so we can use it as a sufficient condition to
   * check quickly whether the inner product approach is accurate.
   *
   * NOTE(review): EPSILON is defined outside this snippet. In Spark's MLUtils it is the
   * machine epsilon (a tiny positive value, about 2.2e-16), not a number close to 1.
   */
  val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
  if (precisionBound1 < precision) {
    // The cheap bound already guarantees enough accuracy: use the inner-product identity.
    sqDist = sumSquaredNorm - 2.0 * dot(v1, v2)
  } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
    // At least one sparse vector: still try the identity, clamped at 0 against rounding.
    val dotValue = dot(v1, v2)
    sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
    // Relative-error estimate from the first formula above. The division by
    // (sqDist + EPSILON) is essential; without it the next line is a dead expression
    // and the bound is not a relative error at all.
    val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
      (sqDist + EPSILON)
    if (precisionBound2 > precision) {
      // Still not accurate enough: fall back to the direct computation.
      sqDist = Vectors.sqdist(v1, v2)
    }
  } else {
    // Neither vector is a SparseVector and the bound failed: compute the distance directly.
    sqDist = Vectors.sqdist(v1, v2)
  }
  sqDist
}

if (distance < bestDistance) {//回到findClosest，如果这个距离比当前最优距离更小，就更新最优距离并记录索引
bestDistance = distance
bestIndex = i
}
}
i += 1//索引自加
}
(bestIndex, bestDistance)//返回最优索引，和最优距离
}

costAccums(i) += cost//累加距离
val sum = sums(i)(bestCenter)//拿出最优索引的
axpy(1.0, point.vector, sum)//把点放到sum，为了更新中心点
counts(i)(bestCenter) += 1//最优距离对应的中心累加，留个权重
}
}

val contribs = for (i <- 0 until runs; j <- 0 until k) yield {//每个run的每个中心点都输出一条贡献；捋一下，为什么要循环runs：因为每拿到一个数据点，就同时为各个run（各组聚类）都算一次，每个run有自己的一套初始中心，最后从这些run里选出代价最小的一组
((i, j), (sums(i)(j), counts(i)(j)))
}
contribs.iterator
}.reduceByKey(mergeContribs).collectAsMap()//这里调用mergeContribs，把run相同且中心点相同的向量和与点数分别累加，最后收集成一个map

bcActiveCenters.unpersist(blocking = false)//释放缓存，这个东西其实不大，主要是清理广播变量

// Update the cluster centers and costs for each active run
for ((run, i) <- activeRuns.zipWithIndex) {
var changed = false
var j = 0
while (j < k) {//k是要聚几个类
val (sum, count) = totalContribs((i, j))//取出对应的贡献
if (count != 0) {//如果有贡献
scal(1.0 / count, sum)//数据除以数量，这就是新的聚类中心
val newCenter = new VectorWithNorm(sum)
if (KMeans.fastSquaredDistance(newCenter, centers(run)(j)) > epsilon * epsilon) {//看新旧距离变化是否明显，如果明显置为true
changed = true
}
centers(run)(j) = newCenter//旧的不去新的不来
}
j += 1
}
if (!changed) {//如果没改变
active(run) = false
logInfo("Run " + run + " finished in " + (iteration + 1) + " iterations")//就说运行的第几步运行了几次结束了
}
costs(run) = costAccums(i).value//取出代价函数
}

activeRuns = activeRuns.filter(active(_))//迭代好的过滤
iteration += 1//迭代次数自增
}

val iterationTimeInSeconds = (System.nanoTime() - iterationStartTime) / 1e9
logInfo(s"Iterations took " + "%.3f".format(iterationTimeInSeconds) + " seconds.")

if (iteration == maxIterations) {
logInfo(s"KMeans reached the max number of iterations: $maxIterations.")
} else {
logInfo(s"KMeans converged in $iteration iterations.")
}

val (minCost, bestRun) = costs.zipWithIndex.min//在所有run（共numRuns组）里取出代价最小的那一组

logInfo(s"The cost for the best run is $minCost.")

new KMeansModel(centers(bestRun).map(_.vector))//把最好的聚类中心弄成向量返回
}