package com.org.de.tagGenerator
import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Tag generator.
 *
 * Reads a tab-separated text file whose first column is a business id and whose
 * second column is a JSON review record, e.g.
 * {{{
 * 86913510	{"reviewPics":null,"extInfoList":[{"title":"contentTags","values":["午餐","分量适中"],"desc":"","defineType":0},{"title":"tagIds","values":["684","240"],"desc":"","defineType":0}],"expenseList":null,"reviewIndexes":[2],"scoreList":null}
 * }}}
 * For every business it counts how often each tag in `extInfoList[0].values`
 * occurs, keeps the most frequent tags, orders the businesses by the count of
 * their most frequent tag (descending) and prints one `(busId,tag(count),...)`
 * tuple per business.
 */
object TGTest {

  /** Input used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///d:/temptags.txt"

  /** Number of most frequent tags kept per business. */
  private val TopTagCount = 5

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the input path. When absent the
   *             default path `file:///d:/temptags.txt` is used.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("WordCountScala")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // ((busId, tag), occurrences). A tuple key is used instead of a
      // "busId_tag" string so that tags containing '_' or ',' survive intact.
      val tagCounts = sc.textFile(inputPath)
        .flatMap(line => parseLine(line))
        .map(pair => (pair, 1))
        .reduceByKey(_ + _)

      // (busId, the TopTagCount most frequent (tag, count) pairs, highest first)
      val topTagsPerBusiness = tagCounts
        .map { case ((busId, tag), count) => (busId, (tag, count) :: Nil) }
        .reduceByKey(_ ++ _)
        .map { case (busId, tags) =>
          (busId, tags.sortBy(_._2).reverse.take(TopTagCount))
        }

      // Order businesses by the count of their most frequent tag, descending.
      // A single partition keeps the printed output in that global order.
      topTagsPerBusiness
        .sortBy(
          entry => entry._2.headOption.fold(0)(_._2),
          ascending = false,
          numPartitions = 1
        )
        .map { case (busId, tags) => (busId, formatTags(tags)) }
        .foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
   * Extracts the `(busId, tag)` pairs carried by one input line.
   *
   * Lines with fewer than two tab-separated fields, records whose JSON parses
   * to null, records without a non-empty `extInfoList`, and null or empty tag
   * values all contribute nothing. Syntactically invalid JSON is not caught
   * here; the parser's exception propagates.
   *
   * @param line one raw line of the input file
   * @return one pair per non-empty tag in `extInfoList[0].values`
   */
  private def parseLine(line: String): Seq[(String, String)] = {
    val fields = line.split("\t")
    if (fields.length < 2) Seq.empty
    else {
      val busId = fields(0)
      val tags = for {
        json    <- Option(JSON.parseObject(fields(1)))
        extInfo <- Option(json.getJSONArray("extInfoList")) if extInfo.size() > 0
        first   <- Option(extInfo.getJSONObject(0))
        values  <- Option(first.getJSONArray("values"))
      } yield (0 until values.size()).map(i => values.getString(i))

      tags
        .getOrElse(Seq.empty)
        .filter(tag => tag != null && tag.nonEmpty)
        .map(tag => (busId, tag))
    }
  }

  /**
   * Renders tag counts as `tag1(count1),tag2(count2),...`.
   *
   * @param tags (tag, count) pairs in the order they should appear
   * @return the comma-separated rendering; empty string for an empty list
   */
  private def formatTags(tags: List[(String, Int)]): String =
    tags.map { case (tag, count) => s"${tag}(${count})" }.mkString(",")
}
// Tag exercise
// (Source page footer: latest recommended article published 2024-07-11 17:51:07.)