from pyspark.sql import SparkSession
from datasketch import MinHash
from pyspark.rdd import RDD
# Create (or reuse) the Spark session for the MinHash example.
spark = (
    SparkSession.builder
    .appName("MinHashExample")
    .getOrCreate()
)

# Sample documents whose pairwise similarity is estimated below.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The fast brown fox jumps over the lazy dog",
    "The quick brown dog jumped over the lazy fox",
    "A lazy dog jumps quickly over the brown fox",
]

# Load the documents into a Spark RDD, one string per element.
rdd: RDD = spark.sparkContext.parallelize(documents)
# MinHash signature builder
def minhash_document(doc: str, num_perm: int = 128) -> MinHash:
    """Return a MinHash signature built from the distinct words of ``doc``.

    Args:
        doc: Text to fingerprint. It is split on whitespace and the tokens
            are used as-is (case-sensitive, punctuation kept).
        num_perm: Number of permutation functions in the signature. The
            default of 128 matches the default of ``MinHash()``, so existing
            one-argument callers (e.g. ``rdd.map(minhash_document)``) get the
            same signature as before. Signatures that are compared with
            ``jaccard`` should be built with the same ``num_perm``.

    Returns:
        A ``MinHash`` updated once with the UTF-8 bytes of each distinct
        token.
    """
    signature = MinHash(num_perm=num_perm)
    # Deduplicate first: feeding the same word twice would not change the
    # signature, so the set only avoids redundant hashing.
    for token in set(doc.split()):
        signature.update(token.encode("utf8"))
    return signature
# Map every document in the RDD to its MinHash signature.
minhash_signatures = rdd.map(minhash_document)
# Jaccard similarity estimate for one pair of MinHash signatures
def compute_jaccard_similarity(pair):
    """Return the estimated Jaccard similarity of the two signatures in ``pair``.

    ``pair`` must unpack into exactly two items ``(first, second)``; the
    result is whatever ``first.jaccard(second)`` returns.
    """
    left, right = pair
    estimate = left.jaccard(right)
    return estimate
# Pair every signature with every signature. Because the RDD is crossed with
# itself, self-pairs and both orderings of each pair are included.
minhash_pairs = minhash_signatures.cartesian(minhash_signatures)
similarities = minhash_pairs.map(compute_jaccard_similarity)

# Bring the similarity values back to the driver.
similarity_results = similarities.collect()

# Print one similarity per line; the values carry no document labels.
print("MinHash Similarities:")
for similarity in similarity_results:
    print(similarity)

# Stop the Spark session.
spark.stop()
# 7.2. Computing SimHash with Spark (使用 Spark 进行 SimHash 计算)
from pyspark.sql import SparkSession
from simhash import Simhash
from pyspark.rdd import RDD
# Create (or reuse) the Spark session for the SimHash example.
spark = (
    SparkSession.builder
    .appName("SimHashExample")
    .getOrCreate()
)

# Sample documents whose pairwise distance is computed below.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The fast brown fox jumps over the lazy dog",
    "The quick brown dog jumped over the lazy fox",
    "A lazy dog jumps quickly over the brown fox",
]

# Load the documents into a Spark RDD, one string per element.
rdd: RDD = spark.sparkContext.parallelize(documents)
# SimHash fingerprint builder
def simhash_document(doc: str) -> Simhash:
    """Return the ``Simhash`` fingerprint of ``doc`` using the library defaults."""
    fingerprint = Simhash(doc)
    return fingerprint
# Map every document in the RDD to its SimHash fingerprint.
simhash_signatures = rdd.map(simhash_document)
# Hamming distance between one pair of SimHash fingerprints
def compute_hamming_distance(pair):
    """Return the Hamming distance between the two fingerprints in ``pair``.

    Args:
        pair: Two-item sequence ``(first, second)`` of ``Simhash`` objects.

    Returns:
        The number of differing bits, as reported by
        ``first.distance(second)``.
    """
    simhash1, simhash2 = pair
    # ``Simhash`` exposes the bit distance through ``distance()``; it does not
    # implement the ``-`` operator, so subtracting two fingerprints raises
    # ``TypeError`` instead of returning a distance.
    return simhash1.distance(simhash2)
# Pair every fingerprint with every fingerprint. Because the RDD is crossed
# with itself, self-pairs and both orderings of each pair are included.
simhash_pairs = simhash_signatures.cartesian(simhash_signatures)
hamming_distances = simhash_pairs.map(compute_hamming_distance)

# Bring the distances back to the driver.
hamming_results = hamming_distances.collect()

# Print one distance per line; the values carry no document labels.
print("SimHash Hamming Distances:")
for hamming_distance in hamming_results:
    print(hamming_distance)

# Stop the Spark session.
spark.stop()