import re
from operator import add
def computeContribs(urls, rank):
    """Split a rank evenly among a collection of URLs.

    Args:
        urls: Sized iterable of URLs; it must support both ``len()`` and
            iteration.
        rank: Numeric rank to be divided among ``urls``.

    Yields:
        ``(url, rank / len(urls))`` tuples, one per element of ``urls``.
        Nothing is yielded when ``urls`` is empty.
    """
    num_urls = len(urls)
    # Guard before dividing: with no URLs there is nothing to yield, and the
    # hoisted division below would otherwise raise ZeroDivisionError.
    if num_urls == 0:
        return
    # Every URL receives the same share, so compute it once.
    share = rank / num_urls
    for url in urls:
        yield (url, share)
def parseNeighbors(urls):
    """Parse one "URL neighbor-URL" line into a ``(url, neighbor)`` pair.

    Fields may be separated by any run of whitespace (spaces or tabs), and
    leading/trailing whitespace is ignored. Fields after the second one are
    discarded.

    Args:
        urls: A single line of text holding at least two
            whitespace-separated fields.

    Returns:
        A 2-tuple of the first and second fields.

    Raises:
        IndexError: If the line contains fewer than two fields.
    """
    # split() with no argument collapses consecutive whitespace and strips the
    # ends, so "a\tb" and "a   b" parse the same as "a b"; split(' ') would
    # produce empty fields or a single unsplit token for those inputs.
    parts = urls.split()
    return parts[0], parts[1]
# Read the input file. Each line is expected to hold one link, written as:
#     URL neighbor URL
#     URL neighbor URL
#     URL neighbor URL
#     ...
# The data files can be downloaded at http://www.cse.ust.hk/msbd5003/data/*
# NOTE(review): `sc` is not defined in this part of the file; presumably it is
# an already-created SparkContext - confirm against the surrounding setup.
lines = sc.textFile("../data/pagerank_data.txt", 2)
# lines = sc.textFile("../data/dblp.in", 5)

# Not used within this part of the file; presumably the number of PageRank
# iterations run further down.
numOfIterations = 10

# Parse every line into a (URL, neighbor) pair, then group the pairs by URL so
# that each URL is associated with all of its neighbors.
links = (
    lines
    .map(parseNeighbors)
    .groupByKey()
)
# Loads all URLs that link to other URL(s) from the input file
# and initializes their ranks.