import org.apache.spark.graphx.{Edge, Graph}
import utility.Helpers
import scala.collection.mutable
/**
* Created by fhqplzj on 2017/7/20.
*/
object SemanticNormalization {

  /**
    * Counts occurrences of each character in `s`.
    *
    * @param s input string (may be empty)
    * @return mutable map from character to count; absent keys default to 0,
    *         which `jaccard_similarity` relies on when looking up characters
    *         present in only one of the two strings
    */
  def word_count(s: String): mutable.Map[Char, Int] = {
    val counts = new mutable.HashMap[Char, Int]().withDefaultValue(0)
    s.foreach(c => counts(c) += 1)
    counts
  }

  /**
    * Generalized (multiset) Jaccard similarity between two strings:
    * sum of per-character minimum counts over sum of per-character maximum counts.
    *
    * @return similarity in [0, 1]; returns 1.0 when both strings are empty
    *         (identical inputs) instead of NaN from a 0/0 division
    */
  def jaccard_similarity(s1: String, s2: String): Double = {
    val m1 = word_count(s1)
    val m2 = word_count(s2)
    val ks1 = m1.keySet
    val ks2 = m2.keySet
    // withDefaultValue(0) in word_count makes m1(x)/m2(x) safe for keys
    // that appear in only one of the two maps.
    val numerator = (ks1 & ks2).toSeq.map(x => Math.min(m1(x), m2(x))).sum
    val denominator = (ks1 | ks2).toSeq.map(x => Math.max(m1(x), m2(x))).sum
    // Fix: guard the 0/0 case (both strings empty). Two empty strings are
    // identical, so their similarity is 1.0 rather than NaN.
    if (denominator == 0) 1.0 else 1.0 * numerator / denominator
  }

  def main(args: Array[String]): Unit = {
    val sc = Helpers.getSc
    // Each input line is "<col0>\t<col1>"; zipWithIndex().map(_.swap) turns the
    // RDD into (vertexId: Long, (col0, col1)) pairs.
    val vertices = sc.textFile("file:///tmp/graph.txt").map {
      line =>
        val parts = line.split("\t")
        (parts(0), parts(1))
    }.zipWithIndex().map(_.swap)
    // All-pairs similarity. NOTE(review): similarity is computed on parts(0)
    // of each line — confirm that column actually holds the text content.
    val similarities = vertices.cartesian(vertices).flatMap {
      case ((id1, (content1, _)), (id2, (content2, _))) =>
        val sim = jaccard_similarity(content1, content2)
        if (id1 < id2) {
          // Emit both directions so the graph behaves as undirected.
          Seq(Edge(id1, id2, sim), Edge(id2, id1, sim))
        } else if (id1 == id2) {
          // Self-loop with similarity 1.0.
          Seq(Edge(id1, id2, sim))
        } else {
          // Fix: was `None` (an Option), which only type-checked via the
          // option2Iterable widening; use an empty Seq so all branches agree.
          Seq.empty
        }
    }
    val graph = Graph.fromEdges(similarities, 0.0)
    // Graph is built; downstream analyses (e.g. connected components) go here.
    println(graph.vertices.count())
  }
}
// Building a graph with Spark GraphX (spark构建图graphx).