测试文本:
hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5||hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5|||hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx||heh|hhh|yiyi|12345
hello2||chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
各个字段的非空占比(非空记录数 / 该字段的总记录数):
mapper:
#!/usr/bin/env python
'''
求解各个字段的非占空比情况
'''
import sys
def read_input(file, separator):
    """Yield the fields of every non-blank line read from *file*.

    Each line has its surrounding whitespace stripped and is then split on
    *separator*.  Blank (whitespace-only) lines are skipped: they would
    otherwise be yielded as ``['']`` and make the caller fail when it indexes
    the expected columns.

    Args:
        file: An iterable of text lines, e.g. ``sys.stdin``.
        separator: The field delimiter of the input records.

    Yields:
        A list of field strings for each non-blank line.
    """
    for line in file:
        record = line.strip()
        if not record:
            continue
        yield record.split(separator)
def main(separator='|', num_fields=8):
    """Mapper: tag every field of every stdin record as empty or non-empty.

    For each record, one line is printed per field in the form
    ``<field index> TAB <tag> TAB`` where the tag is ``NULL`` when the field
    is empty after stripping whitespace and ``NONULL`` otherwise.

    Args:
        separator: The field delimiter of the input records.
        num_fields: How many leading fields of each record are inspected.
    """
    data = read_input(sys.stdin, separator)
    for words in data:
        # A record with fewer columns than expected is malformed; skip it
        # rather than aborting the whole task with an IndexError.
        if len(words) < num_fields:
            continue
        for i in range(num_fields):
            tag = "NONULL" if words[i].strip() else "NULL"
            # Parenthesised single-argument print works on Python 2 and 3.
            print("%s\t%s\t" % (i, tag))


if __name__ == "__main__":
    main()
reducer:
#!/usr/bin/env python
'''
求解各个字段的非占空比情形
'''
from __future__ import division
from operator import itemgetter
from itertools import groupby
import sys
def read_mapper_output(file, separator='\t'):
    """Yield ``[field index, tag]`` pairs parsed from the mapper output.

    Trailing whitespace (including the mapper's trailing separator) is
    removed from each line before it is split once on *separator*.  Blank
    lines are skipped so that downstream code never receives ``['']``.

    Args:
        file: An iterable of text lines, e.g. ``sys.stdin``.
        separator: The delimiter used by the mapper.

    Yields:
        A list with at most two strings for each non-blank line.
    """
    for line in file:
        record = line.rstrip()
        if not record:
            continue
        yield record.split(separator, 1)
def get_ff(data):
    """Count how often each ``<field index>-<tag>`` combination occurs.

    Args:
        data: An iterable of ``[field index, tag]`` string pairs.

    Returns:
        A dict mapping ``"<index>-<tag>"`` to its number of occurrences.
    """
    ff = {}
    for words in data:
        # Records without a tag cannot be counted; ignore them instead of
        # raising IndexError.
        if len(words) < 2:
            continue
        key = words[0] + "-" + words[1]
        # dict.get replaces the Python-2-only dict.has_key lookup.
        ff[key] = ff.get(key, 0) + 1
    return ff
def get_result(ff, num_fields=8):
    """Print and return the non-empty ratio of every field.

    The ratio of field ``i`` is ``NONULL count / (NULL count + NONULL count)``.
    One ``<index> TAB <ratio>`` line is printed per field.

    A field that was never seen empty (including a field with no counts at
    all) is reported as ``1``.  A field that was only ever seen empty is
    reported as ``0.0``; it used to be reported as ``1`` because the ratio
    was computed only when both counters existed.

    Args:
        ff: Dict of ``"<index>-NULL"`` / ``"<index>-NONULL"`` counters.
        num_fields: Number of fields (indexes ``0 .. num_fields - 1``).

    Returns:
        A dict mapping each field index to its ratio.
    """
    fff = {}
    for i in range(num_fields):
        null_count = ff.get(str(i) + "-" + "NULL", 0)
        nonnull_count = ff.get(str(i) + "-" + "NONULL", 0)
        if null_count == 0:
            fff[i] = 1
        else:
            # float() keeps this a true division even without
            # ``from __future__ import division``.
            fff[i] = float(nonnull_count) / (null_count + nonnull_count)
        print("%s\t%s" % (i, fff[i]))
    return fff
def get_count(ff):
    """Print every counter in *ff* as ``<key> TAB <count>``.

    Keys are printed in sorted order so the output is the same on every run.

    Args:
        ff: Dict mapping counter keys to their counts.
    """
    for key in sorted(ff):
        print("%s\t%s" % (key, ff[key]))
def main(separator='\t'):
    """Reducer: read the mapper output from stdin and report the results.

    The raw counters are printed first, followed by the non-empty ratio of
    every field.

    Args:
        separator: The delimiter used in the mapper output.
    """
    data = read_mapper_output(sys.stdin, separator=separator)
    ff = get_ff(data)
    get_count(ff)
    get_result(ff)


if __name__ == '__main__':
    main()
按关键字段分组,统计各个字段的非空占比(以下标为 2 的列,即 words[2],作为分组的关键字段):
mapper:
#!/usr/bin/env python
'''
求解特征字段关联的各个字段的非占空比情形
'''
import sys
def read_input(file, separator):
    """Yield the fields of every non-blank line read from *file*.

    Each line has its surrounding whitespace stripped and is then split on
    *separator*.  Blank (whitespace-only) lines are skipped: they would
    otherwise be yielded as ``['']`` and make the caller fail when it indexes
    the expected columns.

    Args:
        file: An iterable of text lines, e.g. ``sys.stdin``.
        separator: The field delimiter of the input records.

    Yields:
        A list of field strings for each non-blank line.
    """
    for line in file:
        record = line.strip()
        if not record:
            continue
        yield record.split(separator)
def main(separator='|', num_fields=8, key_index=2):
    """Mapper: tag every field as empty or non-empty, labelled with a key.

    For each stdin record, one line is printed per field in the form
    ``<field index> TAB <tag> TAB <key> TAB`` where the tag is ``NULL`` when
    the field is empty after stripping whitespace and ``NONULL`` otherwise,
    and the key is the record's value in column *key_index*.

    Args:
        separator: The field delimiter of the input records.
        num_fields: How many leading fields of each record are inspected.
        key_index: Index of the column whose value labels the record.
    """
    required = max(num_fields, key_index + 1)
    data = read_input(sys.stdin, separator)
    for words in data:
        # A record with fewer columns than expected is malformed; skip it
        # rather than aborting the whole task with an IndexError.
        if len(words) < required:
            continue
        req = words[key_index]
        for i in range(num_fields):
            # The tag records whether the field is empty.
            tag = "NONULL" if words[i].strip() else "NULL"
            # Parenthesised single-argument print works on Python 2 and 3.
            print("%s\t%s\t%s\t" % (i, tag, req))


if __name__ == "__main__":
    main()
对应的reducer:
#!/usr/bin/env python
'''
求解特征字段关联的各个字段的非占空比情形
'''
from __future__ import division
from operator import itemgetter
from itertools import groupby
import sys
def read_mapper_output(file, separator='\t'):
    """Yield ``[field index, tag, key]`` triples parsed from the mapper output.

    Trailing whitespace (including the mapper's trailing separator) is
    removed from each line before it is split at most twice on *separator*.
    Blank lines are skipped.

    When the key is empty, stripping the line also removes it and only two
    fields remain; an empty string is appended in that case so that callers
    can always read the key at index 2.

    Args:
        file: An iterable of text lines, e.g. ``sys.stdin``.
        separator: The delimiter used by the mapper.

    Yields:
        A list of field strings for each non-blank line.
    """
    for line in file:
        record = line.rstrip()
        if not record:
            continue
        fields = record.split(separator, 2)
        if len(fields) == 2:
            fields.append("")
        yield fields
def get_ff(data):
    """Count ``<field index>-<tag>-<key>`` combinations and collect the keys.

    Args:
        data: An iterable of ``[field index, tag, key]`` string lists.  A
            record without a key is counted under the empty key ``""``.

    Returns:
        A ``(ff, lis)`` tuple: ``ff`` maps ``"<index>-<tag>-<key>"`` to its
        number of occurrences and ``lis`` lists the distinct keys in the
        order they were first seen.
    """
    ff = {}
    # lis keeps the distinct keys in first-seen order; the set gives a
    # constant-time membership test instead of scanning the list each time.
    lis = []
    seen = set()
    for words in data:
        # Records without a tag cannot be counted; ignore them.
        if len(words) < 2:
            continue
        no = words[0]
        tag = words[1]
        # Value of the key column for this record.
        req = words[2] if len(words) > 2 else ""
        if req not in seen:
            seen.add(req)
            lis.append(req)
        key = no + "-" + tag + "-" + req
        # dict.get replaces the Python-2-only dict.has_key lookup.
        ff[key] = ff.get(key, 0) + 1
    return ff, lis
#写成字典形式
def get_result(ff, lis, num_fields=8, key_index=2):
    """Print and return the non-empty ratio of every field for every key.

    The list of keys is printed first.  Then, for each key in *lis* and each
    field index except *key_index*, one ``<index> TAB <key> TAB <ratio>``
    line is printed, where the ratio is
    ``NONULL count / (NULL count + NONULL count)``.

    A field that was never seen empty for a key (including one with no counts
    at all) is reported as ``1``.  A field that was only ever seen empty is
    reported as ``0.0``; it used to be reported as ``1`` because the ratio
    was computed only when both counters existed.

    Args:
        ff: Dict of ``"<index>-NULL-<key>"`` / ``"<index>-NONULL-<key>"``
            counters.
        lis: The key values to report on.
        num_fields: Number of fields (indexes ``0 .. num_fields - 1``).
        key_index: Index of the key column, which is left out of the report.

    Returns:
        A dict mapping ``(field index, key)`` to the ratio.
    """
    # Holds the computed ratios.
    fff = {}
    print(lis)
    # A list comprehension replaces ``range(...) + range(...)``, which only
    # works on Python 2 where range() returns a list.
    field_indexes = [i for i in range(num_fields) if i != key_index]
    for req in lis:
        for i in field_indexes:
            null_count = ff.get(str(i) + "-" + "NULL" + "-" + req, 0)
            nonnull_count = ff.get(str(i) + "-" + "NONULL" + "-" + req, 0)
            if null_count == 0:
                ratio = 1
            else:
                # float() keeps this a true division even without
                # ``from __future__ import division``.
                ratio = float(nonnull_count) / (null_count + nonnull_count)
            fff[(i, req)] = ratio
            print("%s\t%s\t%s" % (i, req, ratio))
    return fff
# Print the total count of every counter.
def get_count(ff):
    """Print every counter in *ff* as ``<key> TAB <count>``.

    Keys are printed in sorted order so the output is the same on every run.

    Args:
        ff: Dict mapping counter keys to their counts.
    """
    for key in sorted(ff):
        print("%s\t%s" % (key, ff[key]))
def main(separator='\t'):
    """Reducer: read the keyed mapper output from stdin and report results.

    The raw counters are printed first, followed by the non-empty ratio of
    every field for every key value.

    Args:
        separator: The delimiter used in the mapper output.
    """
    data = read_mapper_output(sys.stdin, separator=separator)
    ff, lis = get_ff(data)
    get_count(ff)
    get_result(ff, lis)


if __name__ == '__main__':
    main()
最后放入hadoop集群中,通过streaming运行~