需求:
如果定义一个单词的邻域为这个单词的前两个单词和后两个单词,要求计算:每个邻域单词在该单词整个邻域中所占的比重
如:
w01,w02,w03,w04,w05
邻域表:
单词 | 邻域 |
---|---|
w01 | w02,w03 |
w02 | w01,w03,w04 |
w03 | w01,w02,w04,w05 |
w04 | w02,w03,w05 |
w05 | w03,w04 |
那么对于 w01而言,w02和 w03所占的比重都是1/2
那么对于 w02而言,w01,w03,w04所占的比重都是1/3
以此类推…数据一般不会出现这种所有单词只出现一次的情况
数据:
w01,w02,w03,w04,w05,w06,w07,w08,w09,w10,w01,w02,w03,w04,w05,w06,w07,w08,w09,w10
代码:
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object RelativeFrequency {

  /**
   * Computes the relative frequency of each neighbour within every word's
   * neighbourhood, where the neighbourhood of an occurrence is the two words
   * before it and the two words after it.
   *
   * Demonstrates two equivalent approaches:
   *   1. the plain RDD API (reduceByKey + join),
   *   2. Spark SQL over a temporary view.
   *
   * @param args optional: args(0) is the input CSV path
   *             (defaults to "RelativeFrequency.csv" for backward compatibility)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("RelativeFrequency")
      .master("local")
      .config("spark.sql.shuffle.partitions", "5")
      .getOrCreate()
    val sc = spark.sparkContext

    // Neighbourhood radius: 2 words on each side of an occurrence.
    // (Fixed typo: was "brodcastWindow".)
    val broadcastWindow = sc.broadcast(2)

    // Generalized: input path may be supplied as the first program argument.
    val inputPath = if (args.nonEmpty) args(0) else "RelativeFrequency.csv"
    val rawData = sc.textFile(inputPath)

    // One (word, (neighbour, 1)) pair for every word/neighbour combination
    // within the window; a word is never its own neighbour (i != j).
    val pairs = rawData.flatMap { line =>
      val tokens = line.split(",")
      val window = broadcastWindow.value
      for {
        i <- 0 until tokens.length
        start = math.max(0, i - window)            // clamp window at line start
        end = math.min(tokens.length - 1, i + window) // clamp window at line end
        j <- start to end if i != j
      } yield (tokens(i), (tokens(j), 1))
    }

    // ---- Approach 1: RDD API ----

    // (word, total size of this word's neighbourhood across all occurrences)
    val totalByKey = pairs.mapValues(_._2).reduceByKey(_ + _)

    // (word, (neighbour, count)). Uses reduceByKey on a composite key instead
    // of the original groupByKey + in-memory groupBy: reduceByKey combines
    // map-side before the shuffle, so far less data crosses the network and
    // no per-key Iterable must fit in memory. The result is identical.
    val uniquePairs = pairs
      .map { case (word, (neighbour, count)) => ((word, neighbour), count) }
      .reduceByKey(_ + _)
      .map { case ((word, neighbour), count) => (word, (neighbour, count)) }

    // (word, ((neighbour, count), total))
    val joined = uniquePairs join totalByKey

    // (word, neighbour, count / total) — ratio formatted to two decimals,
    // matching the original output format.
    val relativeFrequency = joined.map {
      case (word, ((neighbour, count), total)) =>
        (word, neighbour, (count.toDouble / total.toDouble).formatted("%.2f"))
    }
    relativeFrequency.foreach(println)

    // ---- Approach 2: Spark SQL over a DataFrame ----
    val rfSchema = StructType(
      StructField("word", StringType, false) ::
        StructField("neighbour", StringType, false) ::
        StructField("frequency", IntegerType, false) :: Nil)

    spark
      .createDataFrame(
        pairs.map { case (word, (neighbour, count)) => Row(word, neighbour, count) },
        rfSchema)
      .createOrReplaceTempView("rfTable")

    // Same computation in SQL: per-(word, neighbour) sums divided by the
    // per-word neighbourhood totals.
    spark.sql(
      """
        |SELECT a.word,
        |       a.neighbour,
        |       (a.feq_total/b.total) rf
        |FROM
        |  (SELECT word,
        |          neighbour,
        |          SUM(frequency) feq_total
        |   FROM rfTable
        |   GROUP BY word,
        |            neighbour) a
        |INNER JOIN
        |  (SELECT word,
        |          SUM(frequency) AS total
        |   FROM rfTable
        |   GROUP BY word) b ON a.word = b.word
        |ORDER BY a.word, a.neighbour
      """.stripMargin).show()

    spark.stop()
  }
}
结果:
+----+---------+-------------------+
|word|neighbour| rf|
+----+---------+-------------------+
| w01| w02| 0.3333333333333333|
| w01| w03| 0.3333333333333333|
| w01| w09|0.16666666666666666|
| w01| w10|0.16666666666666666|
| w02| w01| 0.2857142857142857|
| w02| w03| 0.2857142857142857|
| w02| w04| 0.2857142857142857|
| w02| w10|0.14285714285714285|
| w03| w01| 0.25|
| w03| w02| 0.25|
| w03| w04| 0.25|
| w03| w05| 0.25|
| w04| w02| 0.25|
| w04| w03| 0.25|
| w04| w05| 0.25|
| w04| w06| 0.25|
| w05| w03| 0.25|
| w05| w04| 0.25|
| w05| w06| 0.25|
| w05| w07| 0.25|
+----+---------+-------------------+