Method 1:
# From the official Spark examples.
import random
partitions = 1000
n = 1000 * partitions
def f(_):
    x = random.random()
    y = random.random()
    return 1 if x ** 2 + y ** 2 < 1 else 0

count = sc.parallelize(range(1, n + 1), partitions) \
    .map(f).sum()
print("Pi is roughly", 4.0 * count / n)
Method 2:
# Correct version
import random
import time
partitions = 1000
n = 1000 * partitions
seed = time.time()
def f(index, it):
    random.seed(index + seed)
    for i in it:
        x = random.random()
        y = random.random()
        yield 1 if x ** 2 + y ** 2 < 1 else 0

count = sc.parallelize(range(1, n + 1), partitions) \
    .mapPartitionsWithIndex(f).sum()
print("Pi is roughly", 4.0 * count / n)
Difference:
In Method 1, every partition draws from random.random with the same seed, so the randomness is not strong enough.
In Method 2, the seed is derived from the current time plus the partition index, so its randomness is better than Method 1's.
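To see the difference locally, here is a minimal sketch that needs no Spark; the partition indices, sample counts, and the fixed seed 7 below are illustrative assumptions, not part of the examples above.

import random
import time

base_seed = time.time()

def sample_partition(index, num_samples):
    # Give each simulated "partition" its own generator, seeded from the
    # shared base seed plus the partition index, as Method 2 does.
    rng = random.Random(base_seed + index)
    return [rng.random() for _ in range(num_samples)]

# Distinct seeds -> distinct sequences, which is what Method 2 achieves.
print(sample_partition(0, 3))
print(sample_partition(1, 3))

# Identical seeds -> identical sequences, which is the risk in Method 1 when
# partitions end up sharing the same random state.
print(random.Random(7).random() == random.Random(7).random())  # prints True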