本文主要总结使用 Spark MLlib LDA 进行主题抽取时遇到的工程问题,并列出其中的一些小坑,供读者借鉴。关于 LDA 的具体理论,读者可自行查阅相关资料。主题预测请参考:Spark LDA 主题预测
开发环境:Spark 1.5.2、Hadoop 2.6.0(Spark 1.5.2 要求 JDK 7 及以上)。语料约有 70 万篇博客,词汇总量在十亿以上,词典约有五万个词。
训练语料代码
// scalastyle:off println
package org.apache.spark.examples.mllib
import java.text.BreakIterator
import scala.collection.mutable
import scopt.OptionParser
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{EMLDAOptimizer, OnlineLDAOptimizer, DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
/**
* An example Latent Dirichlet Allocation (LDA) app. Run with
* {
{
{
* ./bin/run-example mllib.LDAExample [options] <input>
* }}}
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
*/
object LDAExample {
/**
 * Command-line configuration for the LDA example; each field's default is the value used
 * when the corresponding option is not supplied.
 *
 * @param input              input locations (presumably paths to the plain-text corpus;
 *                           the option that fills this field is not visible here -- confirm)
 * @param k                  number of topics
 * @param maxIterations      number of iterations of learning
 * @param docConcentration   amount of topic smoothing to use; -1 selects an automatic value
 * @param topicConcentration amount of term (word) smoothing to use; -1 selects an automatic value
 * @param vocabSize          number of distinct word types to use, chosen by frequency; -1 keeps all
 * @param stopwordFile       file path for a word list (presumably stopwords to exclude; the
 *                           help text is truncated in this view -- confirm)
 * @param algorithm          name of the inference algorithm; defaults to "em"
 *                           (NOTE(review): accepted values are not visible here -- confirm)
 * @param checkpointDir      optional checkpoint directory; None by default
 *                           (NOTE(review): how it is consumed is not visible here)
 * @param checkpointInterval checkpoint interval, default 10 (presumably measured in
 *                           iterations -- confirm)
 */
private case class Params(
input: Seq[String] = Seq.empty,
k: Int = 20,
maxIterations: Int = 10,
docConcentration: Double = -1,
topicConcentration: Double = -1,
vocabSize: Int = 10000,
stopwordFile: String = "",
algorithm: String = "em",
checkpointDir: Option[String] = None,
checkpointInterval: Int = 10) extends AbstractParams[Params]
def main(args: Array[String]) {
val defaultParams = Params()
val parser = new OptionParser[Params]("LDAExample") {
head("LDAExample: an example LDA app for plain text data.")
opt[Int]("k")
.text(s"number of topics. default: ${defaultParams.k}")
.action((x, c) => c.copy(k = x))
opt[Int]("maxIterations")
.text(s"number of iterations of learning. default: ${defaultParams.maxIterations}")
.action((x, c) => c.copy(maxIterations = x))
opt[Double]("docConcentration")
.text(s"amount of topic smoothing to use (> 1.0) (-1=auto)." +
s" default: ${defaultParams.docConcentration}")
.action((x, c) => c.copy(docConcentration = x))
opt[Double]("topicConcentration")
.text(s"amount of term (word) smoothing to use (> 1.0) (-1=auto)." +
s" default: ${defaultParams.topicConcentration}")
.action((x, c) => c.copy(topicConcentration = x))
opt[Int]("vocabSize")
.text(s"number of distinct word types to use, chosen by frequency. (-1=all)" +
s" default: ${defaultParams.vocabSize}")
.action((x, c) => c.copy(vocabSize = x))
opt[String]("stopwordFile")
.text(s"filepath for a list of stopw