Spark LDA 主题抽取

最新推荐文章于 2025-06-27 09:01:21 发布

selfpoised

最新推荐文章于 2025-06-27 09:01:21 发布

阅读量1w

点赞数 1

CC 4.0 BY-SA版权

分类专栏：程序技术　文章标签：spark LDA 主题抽取 scala

本文链接：https://blog.youkuaiyun.com/poised/article/details/50382107

本文主要对使用Spark MLlib LDA进行主题抽取时遇到的工程问题做一总结，列出其中的一些小坑，或可供读者借鉴。关于LDA的具体理论等可以自行google。主题预测请参考：Spark LDA 主题预测

开发环境：spark-1.5.2、hadoop-2.6.0（spark-1.5.2要求JDK 7+）。语料约有70万篇博客，总词数超过十亿，词典约有五万个词。

训练语料代码

代码来源：apache/spark/examples/mllib/LDAExample.scala

// scalastyle:off println
package org.apache.spark.examples.mllib
import java.text.BreakIterator
import scala.collection.mutable
import scopt.OptionParser
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{EMLDAOptimizer, OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
/**
 * An example Latent Dirichlet Allocation (LDA) app. Run with
 * {{{
 * ./bin/run-example mllib.LDAExample [options] <input>
 * }}}
 * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
 */
object LDAExample {
   
   
  /**
   * Command-line settings for this example. Every field has a default, and `main` fills the
   * fields in by copying parsed option values over a default instance.
   *
   * @param input input entries; empty by default (how they are consumed is not visible in this
   *              excerpt -- TODO confirm against the rest of `main`)
   * @param k number of topics
   * @param maxIterations number of iterations of learning
   * @param docConcentration amount of topic smoothing to use; -1 means "auto" per the option help
   * @param topicConcentration amount of term (word) smoothing to use; -1 means "auto" per the
   *                           option help
   * @param vocabSize number of distinct word types to use, chosen by frequency; -1 means "all"
   *                  per the option help
   * @param stopwordFile file path for a stopword list (the option help is truncated in this
   *                     excerpt); empty string by default
   * @param algorithm defaults to "em"; presumably selects the LDA optimizer -- confirm the
   *                  accepted values against the option definition
   * @param checkpointDir optional directory, None by default; presumably used for Spark
   *                      checkpointing -- TODO confirm
   * @param checkpointInterval defaults to 10; presumably iterations between checkpoints --
   *                           TODO confirm
   */
  private case class Params(
      input: Seq[String] = Seq.empty,
      k: Int = 20,
      maxIterations: Int = 10,
      docConcentration: Double = -1,
      topicConcentration: Double = -1,
      vocabSize: Int = 10000,
      stopwordFile: String = "",
      algorithm: String = "em",
      checkpointDir: Option[String] = None,
      checkpointInterval: Int = 10) extends AbstractParams[Params]
  def main(args: Array[String]) {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("LDAExample") {
      head("LDAExample: an example LDA app for plain text data.")
      opt[Int]("k")
        .text(s"number of topics. default: ${defaultParams.k}")
        .action((x, c) => c.copy(k = x))
      opt[Int]("maxIterations")
        .text(s"number of iterations of learning. default: ${defaultParams.maxIterations}")
        .action((x, c) => c.copy(maxIterations = x))
      opt[Double]("docConcentration")
        .text(s"amount of topic smoothing to use (> 1.0) (-1=auto)." +
        s"  default: ${defaultParams.docConcentration}")
        .action((x, c) => c.copy(docConcentration = x))
      opt[Double]("topicConcentration")
        .text(s"amount of term (word) smoothing to use (> 1.0) (-1=auto)." +
        s"  default: ${defaultParams.topicConcentration}")
        .action((x, c) => c.copy(topicConcentration = x))
      opt[Int]("vocabSize")
        .text(s"number of distinct word types to use, chosen by frequency. (-1=all)" +
          s"  default: ${defaultParams.vocabSize}")
        .action((x, c) => c.copy(vocabSize = x))
      opt[String]("stopwordFile")
        .text(s"filepath for a list of stopw