字段占空比和特征字段占空比

本文深入探讨了大数据处理的关键技术,重点讲解了Hadoop集群的搭建、配置及其实战案例,从数据导入、数据处理到数据存储,全方位展示了Hadoop在大数据领域的强大能力。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

测试文本:

hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx|||hhh|yiyi|12345
hello2|nice2|chx2|5||hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5|||hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347
hello|nice|chx||heh|hhh|yiyi|12345
hello2||chx2|5|heh2|hhh2|yiyi2|12341
hello3|nice3|chx3||heh3|hhh3|yiyi3|12342
hello4|nice4|chx4|4|heh4|hhh|yiyi|12343
hello|nice5|chx5||heh5|hhh5|yiyi|12344
hello|nice4|chx3||heh2|hhh|yiyi|12345
hello|nice|chx|3|heh2|hhh1|yiyi|12346
hello|nice|chx|2|heh|hhh3|yiyi|12347

各个字段的非占空比:

mapper:

#!/usr/bin/env python
'''
	求解各个字段的非占空比情况
'''
import sys

def read_input(file, separator):
    """Lazily yield each line of *file* as a list of fields.

    Leading and trailing whitespace (including the newline) is removed from
    the line before it is split on *separator*.
    """
    for raw_line in file:
        trimmed = raw_line.strip()
        fields = trimmed.split(separator)
        yield fields

def main(separator='|', field_count=8):
    """Emit one ``index<TAB>tag<TAB>`` record per inspected field of every row.

    Rows are read from standard input and split on *separator* by
    ``read_input``. A field whose stripped text is empty is tagged ``NULL``;
    any other field is tagged ``NONULL``.

    Args:
        separator: Column delimiter of the input rows.
        field_count: Number of leading fields to inspect in each row.
    """
    for words in read_input(sys.stdin, separator):
        # A blank line splits into [''] and carries no fields at all; skip it
        # rather than counting it as a row full of NULLs.
        if words == ['']:
            continue
        for i in range(field_count):
            # Rows shorter than field_count used to raise IndexError; a
            # missing trailing column is treated as an empty (NULL) field.
            word = words[i].strip() if i < len(words) else ''
            tag = "NULL" if word == '' else "NONULL"
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s\t%s\t" % (i, tag))


# Run the mapper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
reducer:

#!/usr/bin/env python
'''
    求解各个字段的非占空比情形
'''
from __future__ import division
from operator import itemgetter
from itertools import groupby

import sys

def read_mapper_output(file, separator='\t'):
    """Lazily yield each mapper record as ``[field_index, rest]``.

    Trailing whitespace (the newline and any trailing separator tabs) is
    removed first, then the line is split on *separator* at most once.
    """
    for raw_line in file:
        trimmed = raw_line.rstrip()
        record = trimmed.split(separator, 1)
        yield record
        
def get_ff(data):
    """Count how many records carry each ``"<index>-<tag>"`` combination.

    Args:
        data: Iterable of ``[field_index, tag]`` string lists, as yielded by
            ``read_mapper_output``.

    Returns:
        dict mapping ``"<field_index>-<tag>"`` to its number of occurrences.
    """
    ff = {}
    for words in data:
        # A blank or truncated line yields fewer than two fields; skip it
        # instead of failing with IndexError.
        if len(words) < 2:
            continue
        key = words[0] + "-" + words[1]
        # dict.get replaces has_key(), which no longer exists in Python 3.
        ff[key] = ff.get(key, 0) + 1
    return ff

def get_result(ff, field_count=8):
    """Print and return the non-null ratio of every field.

    For each field index the ratio is ``NONULL / (NULL + NONULL)``. One
    ``index<TAB>ratio`` line is printed per field.

    Args:
        ff: dict mapping ``"<index>-NULL"`` / ``"<index>-NONULL"`` to counts,
            as built by ``get_ff``.
        field_count: Number of leading field indexes to report.

    Returns:
        dict mapping each field index (int) to its non-null ratio.
    """
    fff = {}
    for i in range(field_count):
        null_count = ff.get(str(i) + "-" + "NULL", 0)
        nonull_count = ff.get(str(i) + "-" + "NONULL", 0)
        if null_count == 0:
            # No NULL seen (or no data at all): keep the original integer 1
            # so existing output stays byte-identical.
            fff[i] = 1
        else:
            # Covers the all-NULL case too, which used to fall through to the
            # "1" branch and be reported as fully populated.
            # float() keeps the division exact even without the module's
            # "from __future__ import division".
            fff[i] = float(nonull_count) / (null_count + nonull_count)
        print("%s\t%s" % (i, fff[i]))
    return fff
        
def get_count(ff):
    """Print every ``key<TAB>count`` pair of *ff*, one per line.

    Args:
        ff: dict mapping a counter key to its number of occurrences.
    """
    for key, count in ff.items():
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s\t%s" % (key, count))

def main(separator = '\t'):
    """Read mapper records from stdin, then print the counts and the ratios."""
    counts = get_ff(read_mapper_output(sys.stdin, separator = separator))
    # Raw per-key counts first, followed by the per-field non-null ratios.
    get_count(counts)
    get_result(counts)
    

# Run the reducer only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()




关键字段关联的各字段的非占空比(以第三列,即下标为 2 的字段作为分类字段)

mapper:

#!/usr/bin/env python
'''
	求解特征字段关联的各个字段的非占空比情形
'''
import sys

def read_input(file, separator):
    """Yield the fields of each line in *file*, one list per line.

    Surrounding whitespace is removed from the line before it is split on
    *separator*.
    """
    for row in file:
        cleaned = row.strip()
        yield cleaned.split(separator)

def main(separator='|', field_count=8, key_index=2):
    """Emit ``index<TAB>tag<TAB>key<TAB>`` for each inspected field of every row.

    Rows are read from standard input and split on *separator* by
    ``read_input``. A field whose stripped text is empty is tagged ``NULL``;
    any other field is tagged ``NONULL``. The value of the key field is
    appended to every record of its row.

    Args:
        separator: Column delimiter of the input rows.
        field_count: Number of leading fields to inspect in each row.
        key_index: Index of the field whose value groups the records.
    """
    for words in read_input(sys.stdin, separator):
        # A blank line splits into [''] and carries no fields at all; skip it
        # rather than counting it as a row full of NULLs.
        if words == ['']:
            continue
        # Rows too short to contain the key field used to raise IndexError;
        # a missing key is emitted as an empty value.
        req = words[key_index] if key_index < len(words) else ''
        for i in range(field_count):
            # A missing trailing column is treated as an empty (NULL) field.
            word = words[i].strip() if i < len(words) else ''
            # tag records whether the field is empty.
            tag = "NULL" if word == '' else "NONULL"
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s\t%s\t%s\t" % (i, tag, req))


# Run the mapper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

对应的reducer:

#!/usr/bin/env python
'''
	求解特征字段关联的各个字段的非占空比情形
'''

from __future__ import division 
from operator import itemgetter
from itertools import groupby

import sys

def read_mapper_output(file, separator='\t'):
    """Lazily yield each mapper record as ``[field_index, tag, rest]``.

    Trailing whitespace (the newline and any trailing separator tabs) is
    removed first, then the line is split on *separator* at most twice.
    """
    for raw_line in file:
        trimmed = raw_line.rstrip()
        record = trimmed.split(separator, 2)
        yield record
		
def get_ff(data):
    """Count records per (field index, tag, key value) and collect key values.

    Args:
        data: Iterable of ``[field_index, tag, key_value]`` string lists, as
            yielded by ``read_mapper_output``.

    Returns:
        Tuple ``(ff, lis)``: ``ff`` maps ``"<index>-<tag>-<key_value>"`` to
        its number of occurrences, and ``lis`` lists the distinct key values
        in first-seen order.
    """
    ff = {}
    # lis keeps the distinct key values in first-seen order; seen gives a
    # constant-time membership test instead of scanning lis per record.
    lis = []
    seen = set()
    for words in data:
        # A blank or truncated line yields fewer than two fields; skip it
        # instead of failing with IndexError.
        if len(words) < 2:
            continue
        no = words[0]
        tag = words[1]
        # The reader's rstrip() removes trailing tabs, so an empty key value
        # arrives as a two-element record; treat it as the empty string.
        req = words[2] if len(words) > 2 else ''

        if req not in seen:
            seen.add(req)
            lis.append(req)

        key = no + "-" + tag + "-" + req
        # dict.get replaces has_key(), which no longer exists in Python 3.
        ff[key] = ff.get(key, 0) + 1
    return ff, lis

# Results are collected in a dict and also printed line by line.
def get_result(ff, lis, field_count=8, key_index=2):
    """Print and return the non-null ratio of every field per key value.

    The list of key values is printed first. Then, for each key value and
    each field index except the key field itself, one
    ``index<TAB>key_value<TAB>ratio`` line is printed, where the ratio is
    ``NONULL / (NULL + NONULL)``.

    Args:
        ff: dict mapping ``"<index>-<tag>-<key_value>"`` to counts, as built
            by ``get_ff``.
        lis: Key values to report, in output order.
        field_count: Number of leading field indexes to consider.
        key_index: Index of the key field, which is left out of the report.

    Returns:
        dict mapping ``(key_value, field_index)`` to the non-null ratio.
    """
    # Keyed by (key value, field index) so that results for one key value
    # are not overwritten by the next.
    fff = {}
    print(lis)
    for req in lis:
        for i in range(field_count):
            if i == key_index:
                continue
            null_count = ff.get(str(i) + "-" + "NULL" + "-" + req, 0)
            nonull_count = ff.get(str(i) + "-" + "NONULL" + "-" + req, 0)
            if null_count == 0:
                # No NULL seen (or no data at all): keep the original
                # integer 1 so existing output stays byte-identical.
                ratio = 1
            else:
                # Covers the all-NULL case too, which used to be reported as
                # fully populated. float() keeps the division exact even
                # without "from __future__ import division".
                ratio = float(nonull_count) / (null_count + nonull_count)
            fff[(req, i)] = ratio
            print("%s\t%s\t%s" % (i, req, ratio))
    return fff

# Total number of records for each counter key.
def get_count(ff):
    """Print every ``key<TAB>count`` pair of *ff*, one per line.

    Args:
        ff: dict mapping a counter key to its number of occurrences.
    """
    for key, count in ff.items():
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s\t%s" % (key, count))

def main(separator = '\t'):
    """Read mapper records from stdin, then print the counts and the ratios."""
    records = read_mapper_output(sys.stdin, separator = separator)
    counts, key_values = get_ff(records)
    # Raw per-key counts first, followed by the ratios per key value.
    get_count(counts)
    get_result(counts, key_values)
	

# Run the reducer only when this file is executed as a script, not on import.
if __name__=='__main__':
	main()



最后放入hadoop集群中,通过streaming运行~

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值