本地运行K-means算法

最新推荐文章于 2023-08-01 17:13:06 发布

weixin_33728268

最新推荐文章于 2023-08-01 17:13:06 发布

阅读量139

点赞数

CC 4.0 BY-SA版权

文章标签：数据结构与算法、人工智能、大数据

原文链接：https://my.oschina.net/kyo4321/blog/3011750

本文介绍了一个使用Apache Spark MLlib库进行K-means聚类分析的例子。通过加载Iris数据集，预处理数据，应用特征缩放，并使用Pipeline API训练K-means模型。文章展示了如何评估聚类效果并打印出聚类中心。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

2019独角兽企业重金招聘Python工程师标准>>>

参考资料链接：
https://github.com/CraigCovey/spark-examples/blob/f8182a6736fd5293dfa03b023eb1423363ba6041/spark-1_6/scala/clustering/kmeans/kmeans_clustering_main.scala


package com.xx.Kmeans_sample

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}

/**
 * Local example that clusters a CSV data set with Spark ML K-means.
 *
 * The pipeline assembles four numeric columns into a single feature vector,
 * standardises that vector (centered on the mean, scaled to unit standard
 * deviation) and fits K-means on the scaled features. The predictions, the
 * clustering cost and the cluster centers are printed to standard output.
 */
object KmeansClusteringMain {

  /** CSV file that is read when no path is supplied on the command line. */
  private val DefaultInputPath = "D:/spark_data/iris.csv"

  /**
   * Runs the clustering job on a local Spark master.
   *
   * @param args optional command-line arguments; when present, `args(0)` is used
   *             as the input CSV path instead of the built-in default
   * @throws IllegalStateException if the last fitted pipeline stage is not a
   *                               [[KMeansModel]]
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("ReadData")
      .setMaster("local")
      .set("spark.sql.warehouse.dir", "file:///C:/Users/username/IdeaProjects/spark_demo/spark-warehouse")
    val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

    // Always stop the session, even when loading or training fails, so the
    // local Spark context and its resources are released.
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // The file is expected to carry a header row; column types are inferred.
      val data = sparkSession.read
        .format("csv")
        .option("sep", ",")
        .option("inferSchema", "true")
        .option("header", "true")
        .load(inputPath)

      val predictorVariables: Array[String] =
        Array("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width")

      // Stage 1: pack the predictor columns into one vector column.
      val assembler = new VectorAssembler()
        .setInputCols(predictorVariables)
        .setOutputCol("clusteringFeatures")

      // Stage 2: standardise the assembled vector.
      val scaler = new StandardScaler()
        .setInputCol("clusteringFeatures")
        .setOutputCol("scaledClusteringFeatures")
        .setWithMean(true)
        .setWithStd(true)

      // Stage 3: K-means on the scaled features.
      // NOTE(review): 10 clusters looks high if the input is the classic
      // three-species Iris data set -- confirm the intended value of k.
      val kmeansAlgorithm = new KMeans()
        .setK(10)          // number of clusters
        .setSeed(1024)     // fixed seed for reproducible initialisation
        .setMaxIter(20)    // hyperparameter
        .setTol(1.0e-05)   // hyperparameter
        .setFeaturesCol("scaledClusteringFeatures")
        .setPredictionCol("columnCategory") // name of the output cluster-id column

      val pipeline = new Pipeline().setStages(Array(assembler, scaler, kmeansAlgorithm))

      // Train the model.
      val pipelineModel = pipeline.fit(data)

      // Apply the model to the same dataframe it was trained on.
      val kmeansPrediction = pipelineModel.transform(data)
      kmeansPrediction.show()

      // The K-means estimator is the last pipeline stage, so its fitted model is
      // the last stage of the fitted pipeline; match instead of casting blindly.
      val kmeansModel = pipelineModel.stages.last match {
        case model: KMeansModel => model
        case other =>
          throw new IllegalStateException(
            s"Expected the last pipeline stage to be a KMeansModel but found ${other.getClass.getName}")
      }

      // Evaluate clustering by computing the Within Set Sum of Squared Errors.
      // NOTE(review): computeCost is deprecated in newer Spark 2.x releases and
      // absent from Spark 3 -- confirm the target Spark version before upgrading.
      val cost = kmeansModel.computeCost(kmeansPrediction)
      println(s"Clustering Cost: $cost")

      // Print cluster centers (expressed in the scaled feature space).
      println("Cluster Centers:")
      kmeansModel.clusterCenters.foreach(println)
    } finally {
      sparkSession.stop()
    }
  }

}

转载于:https://my.oschina.net/kyo4321/blog/3011750