# jieba and pandas are only needed for the text-preprocessing step that builds my_df (not shown here)
import jieba
import jieba.analyse
import jieba.posseg as pseg
import pandas as pd

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF, VectorAssembler
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName("cluster")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# my_df: load the data here -- a pandas DataFrame whose "words" column holds each document as a list of tokens
spark_df = sqlContext.createDataFrame(my_df)
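# A minimal sketch (an assumption, not from the original post) of how my_df above might have been built:
# segment each raw document with jieba so that "words" is a list of tokens.
# raw_texts = [...]                                                  # hypothetical list of raw documents
# my_df = pd.DataFrame({"words": [list(jieba.cut(t)) for t in raw_texts]})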
# Compute TF-IDF: term counts with CountVectorizer, then reweight them with IDF
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
cvmodel = cv.fit(spark_df)
cvResult = cvmodel.transform(spark_df)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(cvResult)
cvResult = idfModel.transform(cvResult)
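# Optional check (not in the original post): the learned vocabulary size is the dimensionality
# of the TF-IDF vectors before SVD reduces them to 60 dimensions.
print(len(cvmodel.vocabulary))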
# RowMatrix lives in the older mllib API, so convert the ml vectors to mllib vectors first
ddf = MLUtils.convertVectorColumnsFromML(cvResult, 'features')
ddf = ddf.select('features').rdd.map(lambda row: row[0])
mat = RowMatrix(ddf)
# Singular value decomposition: keep the top 60 singular values and the corresponding left singular vectors
svd = mat.computeSVD(k=60, computeU=True)
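# Optional check (not in the original post): the singular values show how much structure each
# retained dimension carries, which helps judge whether k=60 is enough.
print(svd.s)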
# Convert U (the documents in the reduced 60-dimensional space) back into a DataFrame, one column per dimension
svd_u = svd.U.rows.map(lambda row: row.toArray().tolist())
svd_df = sqlContext.createDataFrame(svd_u)
# KMeans clustering: assemble the 60 SVD columns into a single feature vector, then fit 60 clusters
kmeans = KMeans().setK(60).setSeed(1)
vecAssembler = VectorAssembler(inputCols=svd_df.schema.names, outputCol='features')
svd_df = vecAssembler.transform(svd_df)

# Clustering result: fit the model and attach each document's cluster assignment ("prediction" column)
c_result = svd_df.select('features')
model = kmeans.fit(c_result)
results = model.transform(svd_df)
Performing SVD dimensionality reduction and KMeans clustering with Spark
This post walks through SVD dimensionality reduction and KMeans clustering with Spark. SVD reduces the dimensionality of the data, lowering the computational cost; KMeans then partitions the data into clusters. With Spark's distributed computing, both steps can be carried out efficiently, which makes them useful in data-analysis workloads.
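As a quick sanity check on the result, one can look at the cluster sizes and the silhouette score. This is only a sketch under the assumption that results is the DataFrame produced by the listing above (with the default features and prediction columns); it is not part of the original post.

from pyspark.ml.evaluation import ClusteringEvaluator

# Number of documents assigned to each of the 60 clusters
results.groupBy('prediction').count().orderBy('prediction').show(60)

# Silhouette score of the clustering (closer to 1 means tighter, better-separated clusters)
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='prediction')
print(evaluator.evaluate(results))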
