Natural language processing with Spark: Word2Vec

The pipeline below reads pre-segmented user reviews from Hive, tokenizes them, filters out stop words, English letters, digits, single-character tokens, and extreme frequencies, then trains an MLlib Word2Vec model and prints the nearest neighbours of "朋友" (friend).

```scala
import org.apache.spark.mllib.feature.Word2Vec
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object word2vec {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GBDT_xc")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    // Read reviews from Hive: rows in the "休闲娱乐" (leisure & entertainment)
    // business unit whose text mentions both "朋友" (friend) and "聚会" (party).
    def readData1(sc: SparkContext, sqlContext: HiveContext): DataFrame = {
      val table = "bi.dpmid_shop_review_summary_all"
      val sql = "select review_words from " + table +
        " where review_words like '%朋友%' and review_words like '%聚会%'" +
        " and ind_bu_name = '休闲娱乐' group by review_words"
      sqlContext.sql(sql)
    }

    // Extract the string column directly; Row.toString would wrap the text in
    // brackets. Cache because this RDD is reused below as the training input.
    val data = readData1(sc, sqlContext).map(_.getString(0)).cache()

    // Tokenize on whitespace and drop stop words.
    val stopWords = Set("的", "了", "得")
    val tokens = data.flatMap(_.split(" ")).filter(t => !stopWords.contains(t))

    // Drop tokens containing lowercase English letters, then tokens containing digits.
    val noEnglish = tokens.filter(t => """[^a-z]*""".r.pattern.matcher(t).matches)
    val noDigits = noEnglish.filter(t => """[^0-9]*""".r.pattern.matcher(t).matches)

    // Drop single-character tokens.
    val cleaned = noDigits.filter(_.length >= 2)

    // Count frequencies and keep tokens that are neither too rare (count <= 1)
    // nor too frequent (count >= 100).
    val tokenCounts = cleaned.map(t => (t, 1)).reduceByKey(_ + _)
    val vocab = tokenCounts
      .filter { case (_, count) => count > 1 && count < 100 }
      .map { case (token, _) => token }
      .collect()
      .toSet

    // Word2Vec expects RDD[Seq[String]], one Seq per sentence. Rebuild each review
    // from its surviving tokens so word order is preserved; training on single-word
    // sequences would give the model no context to learn from.
    val inputData = data.map(_.split(" ").filter(vocab.contains).toSeq)

    val word2vec = new Word2Vec()
    val model = word2vec.fit(inputData)

    // Print the 100 nearest neighbours of "朋友", keeping multi-character words only.
    val synonyms = model.findSynonyms("朋友", 100)
    for ((word, cosineSimilarity) <- synonyms if word.length >= 2) {
      println(s"$word $cosineSimilarity")
    }
  }
}
```
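If training is expensive, the fitted model can be persisted and reloaded with MLlib's save/load support, and individual word vectors inspected with transform. A minimal sketch continuing inside main from the pipeline above; the HDFS path is a placeholder, not a real location:

```scala
import org.apache.spark.mllib.feature.Word2VecModel

// Persist the fitted model (path is illustrative only).
model.save(sc, "/user/hadoop-generalshop/models/review_word2vec")

// Reload it in a later job and query it without retraining.
val reloaded = Word2VecModel.load(sc, "/user/hadoop-generalshop/models/review_word2vec")

// Look up the learned vector for one word; this throws an IllegalStateException
// if the word is not in the model's vocabulary.
val vec = reloaded.transform("朋友")
println(vec.size) // dimensionality equals the configured vector size (default 100)
```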
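The defaults (100-dimensional vectors, minimum count 5, one training iteration) are rarely ideal for short review text. The setters below are part of the MLlib Word2Vec API; the specific values are illustrative starting points, not tuned recommendations:

```scala
// A tuned trainer for the same inputData as above (values are only examples).
val tunedWord2Vec = new Word2Vec()
  .setVectorSize(200)      // dimensionality of the learned vectors
  .setMinCount(2)          // match the upstream low-frequency cutoff (count > 1)
  .setNumIterations(5)     // more passes over the corpus
  .setLearningRate(0.025)  // initial learning rate (this is the default)
  .setSeed(42L)            // fix the seed for reproducible vectors

val tunedModel = tunedWord2Vec.fit(inputData)
```

Because minCount is applied inside fit, setting it to 2 keeps the model consistent with the count > 1 frequency filter applied before training; with the default of 5, words seen two to four times would be silently dropped again.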