map函数如下:
import sys


def map_lines(lines):
    """Turn 'child parent' lines into tagged records for the self-join.

    For every input line whose first two whitespace-separated fields are
    ``child`` and ``parent``, two records are produced:

    * ``'<child> <parent> 1'`` -- keyed by the child; the value is its parent.
    * ``'<parent> <child> 2'`` -- keyed by the parent; the value is its child.

    Lines with fewer than two fields (e.g. blank lines) are skipped instead
    of raising IndexError. Fields beyond the second are ignored.

    NOTE(review): the records are space-separated and contain no tab. If this
    runs under Hadoop Streaming with the default tab key separator, the whole
    line is presumably treated as the key -- confirm the job's separator and
    partitioner settings when more than one reducer is used.

    Args:
        lines: iterable of text lines (trailing newlines are allowed).

    Yields:
        The output records as strings, without a trailing newline.
    """
    for line in lines:
        # split() with no argument also discards leading/trailing whitespace,
        # so no separate strip() is needed.
        words = line.split()
        if len(words) < 2:
            continue
        child, parent = words[0], words[1]
        yield '%s %s 1' % (child, parent)
        yield '%s %s 2' % (parent, child)


def main():
    """Read relations from stdin and write the tagged records to stdout."""
    for record in map_lines(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
reduce函数如下:
#!/usr/bin/env python
"""Streaming reducer for the grandchild/grandparent self-join.

Input records are whitespace-separated ``<person> <relative> <tag>`` lines.
Tag ``1`` means <relative> is a parent of <person>; any other tag means
<relative> is a child of <person>. Records for the same person must arrive
on consecutive lines (i.e. the input is sorted/grouped by the first field).
"""
import sys
from itertools import groupby


def _pairs(word_1, word_2):
    """Return 'grandchild grandparent' strings for one person's records."""
    # Outer loop over parents, inner loop over children: this keeps the
    # output order of the original nested index loops.
    return ['%s %s' % (child_rec[1], parent_rec[1])
            for parent_rec in word_1
            for child_rec in word_2]


def printf(word_1, word_2):
    """Print every grandchild/grandparent pair for one person.

    Args:
        word_1: list of ``[person, parent, tag]`` records (tag '1').
        word_2: list of ``[person, child, tag]`` records (any other tag).

    Nothing is printed when either list is empty: a person needs at least
    one parent and one child to link two generations.
    """
    for pair in _pairs(word_1, word_2):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(pair)


def _parse_records(lines):
    """Yield the field list of each line that has at least three fields."""
    for line in lines:
        fields = line.split()
        # Blank or truncated lines are skipped instead of raising IndexError.
        if len(fields) >= 3:
            yield fields


def reduce_lines(lines):
    """Join consecutive records sharing the same first field.

    Args:
        lines: iterable of text lines, grouped by their first field.

    Yields:
        'grandchild grandparent' strings, group by group, in input order.
    """
    for _, group in groupby(_parse_records(lines), key=lambda rec: rec[0]):
        word_1 = []  # records tagged '1': the person's parents
        word_2 = []  # all other records: the person's children
        for rec in group:
            if rec[2] == '1':
                word_1.append(rec)
            else:
                word_2.append(rec)
        for pair in _pairs(word_1, word_2):
            yield pair


def main():
    """Write the header line, then the joined pairs read from stdin."""
    print('grandChild grandParent')
    for pair in reduce_lines(sys.stdin):
        print(pair)


if __name__ == '__main__':
    main()
输入文件:
child parent %注:这一行是表头。HDFS 文件按行分布式存储,map 函数在各个节点上逐行处理数据,因此必须保证每一行的数据格式一致;把该文本作为程序输入时,请先删除这一行表头
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma