#!/usr/bin/env python
import sys
from collections import defaultdict


class Mapper(object):
    """
    Mapper
    """
    def __init__(self):
        return

    def mapper(self, arg_map):
        """
        mapper
        """
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            line_sps = line.split('\t')
            if len(line_sps) < 6:
                continue
            # Get the label field
            label = line_sps[2]
            if not label:
                continue
            # Extract the anchor_id value
            anchor_id_value = None
            for pair in label.split():
                if pair.startswith("anchor_id:"):
                    _, value = pair.split(":", 1)
                    anchor_id_value = value
                    break
            if anchor_id_value:
                # Emit the anchor_id, one line per occurrence
                print(f"{anchor_id_value}")

class Reducer(object):
    """
    Reducer
    """
    def __init__(self):
        return

    def reducer(self, arg_map):
        """
        reducer: count how often each anchor_id appears, then the distribution of those counts
        """
        anchor_count = defaultdict(int)
        # Aggregate the count per anchor_id
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            anchor_id = line
            anchor_count[anchor_id] += 1
        # Build the distribution of occurrence counts
        count_distribution = defaultdict(int)
        for count in anchor_count.values():
            count_distribution[count] += 1
        # Emit the results
        for count, num_anchor_ids in sorted(count_distribution.items()):
            print(f"{count}\t{num_anchor_ids}")

if __name__ == '__main__':
    action = sys.argv[1]
    arg_map = {}
    if len(sys.argv) > 2:
        for i in range(2, len(sys.argv)):
            arg_sps = sys.argv[i].split('=')
            if len(arg_sps) == 2:
                arg_map[arg_sps[0]] = arg_sps[1]
    if action == 'mapper':
        mapper = Mapper()
        mapper.mapper(arg_map)
    if action == 'reducer':
        reducer = Reducer()
        reducer.reducer(arg_map)
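Before wiring the script into a Hadoop Streaming job, it can be smoke-tested locally by simulating the shuffle with sort, since the mapper and reducer only talk to stdin/stdout. A minimal sketch, assuming the script above is saved as task.py and sample.txt is a small tab-separated extract of the real input (both file names are placeholders):
# mapper -> sort -> reducer mimics the map/shuffle/reduce stages of the streaming job
cat sample.txt \
    | python task.py mapper \
    | sort \
    | python task.py reducer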
Example
#!/bin/python
# coding=utf-8
"""
File: merge_active_up.py
Author: dongdoudou(dongdoudou@baidu.com)
Date: 2022/10/09 16:13:48
"""
from __future__ import print_function  # so the multi-argument print calls below behave consistently
import sys
import json
import re
import os
import random
import math
import base64
reload(sys)
sys.setdefaultencoding("utf-8")


class Mapper(object):
    """
    Mapper
    """
    def __init__(self, arg_map):
        self._load_his_itemid_(arg_map)
        return

    def mapper(self, arg_map):
        self.debug = int(arg_map.get('debug', '0'))
        # Read the input line by line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            # if 'uid_pos_itemID' in line:
            #     # Handle the uid_7d_pos_itemid file
            #     uid, uid_pos_itemID_str, pos_itemids = line.split("\t")
            #     if len(uid) < 5:
            #         continue
            #     print("{}\tPOS_ITEMS\t{}".format(uid, pos_itemids))
            if 'uid_itemids_scores' in line:
                # Handle the uid_top_itemid_score file
                uid, uid_itemids_scores_str, items_scores = line.split("\t")
                # items_scores = items_scores.split(",")[:20]  # keep only the first 20
                item_ids = [item.split(":")[0] for item in items_scores.split(',')]
                print("{}\tTOP_ITEMS\t{}".format(uid, ','.join(item_ids)))

    def _load_his_itemid_(self, arg_map):
        his_itemid_path = arg_map["history_itemid"]
        with open(his_itemid_path) as file_obj:
            lines = file_obj.readlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if 'uid_pos_itemID' in line:
                uid, uid_pos_itemID_str, pos_itemids = line.split("\t")
                if len(uid) < 5:
                    continue
                print("{}\tPOS_ITEMS\t{}".format(uid, pos_itemids))
        # end for

class Reducer(object):
    """
    Reducer
    """
    def __init__(self):
        return

    def proc_group(self, arg_map, now_uid, now_lines):
        """
        proc_group
        """
        return

    def reducer(self, arg_map):
        """
        reducer
        """
        self.debug = int(arg_map.get('debug', '0'))
        pro_uid = None
        top_itemids = []
        pos_itemids = []
        all_uid_pos_itemid_cnt = 0  # total number of positive item IDs
        all_uid_top_match_cnt = {i: 0 for i in range(10, 101, 10)}  # match counts for Top 10, 20, ..., 100
        all_uid_top_cover_rate = {i: 0 for i in range(10, 101, 10)}  # coverage rates for Top 10, 20, ..., 100
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            uid, data_type, values = line.split("\t")
            # When a new user ID appears, process the previous user's data first
            if uid != pro_uid:
                if pro_uid:
                    if pos_itemids and top_itemids:
                        if self.debug:
                            print("UID: ", pro_uid, " POS_ITEMS: ", pos_itemids, " TOP_ITEMS: ", top_itemids)
                            for item in pos_itemids:
                                if item in top_itemids:
                                    print("UID: ", pro_uid, "MATCH_ITEM: ", item)
                        # Accumulate match statistics for this user
                        all_uid_pos_itemid_cnt += len(pos_itemids)
                        for top_n in range(10, 101, 10):
                            all_uid_top_match_cnt[top_n] += sum(1 for item in pos_itemids if item in top_itemids[:top_n])
                # Reset state and start the new user
                pro_uid = uid
                top_itemids = []
                pos_itemids = []
            # Update the corresponding list according to the record type
            if data_type == "TOP_ITEMS":
                top_itemids = values.split(",")
            elif data_type == "POS_ITEMS":
                pos_itemids = values.split(",")
        # Process the last user's data
        if pro_uid:
            if pos_itemids and top_itemids:
                if self.debug:
                    print("UID: ", pro_uid, " POS_ITEMS: ", pos_itemids, " TOP_ITEMS: ", top_itemids)
                all_uid_pos_itemid_cnt += len(pos_itemids)
                for top_n in range(10, 101, 10):
                    all_uid_top_match_cnt[top_n] += sum(1 for item in pos_itemids if item in top_itemids[:top_n])
        # Emit the final results
        print("Total positive itemid count: {}".format(all_uid_pos_itemid_cnt))
        for top_n in range(10, 101, 10):
            match_cnt = all_uid_top_match_cnt[top_n]
            coverage = float(match_cnt) / float(max(1, all_uid_pos_itemid_cnt))
            print("Top {} hit count: {}".format(top_n, str(match_cnt)))
            print("Top {} coverage: {:.3f}".format(top_n, coverage))

if __name__ == '__main__':
    action = sys.argv[1]
    arg_map = {}
    if len(sys.argv) > 2:
        for i in range(2, len(sys.argv)):
            arg_sps = sys.argv[i].split('=')
            if len(arg_sps) == 2:
                arg_map[arg_sps[0]] = arg_sps[1]
    if action == 'mapper':
        mapper = Mapper(arg_map)
        mapper.mapper(arg_map)
    if action == 'reducer':
        reducer = Reducer()
        reducer.reducer(arg_map)
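This script can also be dry-run locally. The mapper needs the history_itemid argument pointing at the positive-item side file, and the reducer assumes its input is grouped by uid so that the TOP_ITEMS and POS_ITEMS records of the same user arrive together; a plain sort provides that grouping in a local pipeline. A sketch, assuming a Python 2 interpreter (matching the reload(sys) idiom above) and two placeholder sample files, top_scores.txt and pos_items.txt, that carry the uid_itemids_scores and uid_pos_itemID markers the code checks for; on the cluster the side file appears to be shipped to each mapper via -file in the submission script below:
# Local dry run; file names are placeholders
cat top_scores.txt \
    | python merge_active_up.py mapper history_itemid=pos_items.txt \
    | sort \
    | python merge_active_up.py reducer debug=0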
Job submission
#!/bin/bash
if [ -z "$1" ]; then
    date=$(date +%Y%m%d -d '-1 days')
else
    date=$1
fi
JOB_NAME=job_name_${date}
INPUT_PATH="afs://xxx.afs.xxx.com:xxx"
OUTPUT_PATH="afs://xxx.afs.xxx.com:xxx"
hadoop=~/.hmpclient/hadoop-client/hadoop/bin/hadoop
HADOOP_CONF=./conf/hadoop-site.xml
while true
do
    ${hadoop} fs -conf ./conf/hadoop-site.xml -test -e ${INPUT_PATH}/_SUCCESS
    if [ $? -ne 0 ]; then
        echo "event_day=${INPUT_PATH} not ready"
        sleep 3m
    else
        echo "event_day=${INPUT_PATH}/_SUCCESS exists"
        break
    fi
done
echo "INPUT_PATH: $INPUT_PATH"
echo "OUTPUT_PATH: $OUTPUT_PATH"
${hadoop} fs -conf ./conf/hadoop-site.xml -rmr ${OUTPUT_PATH}
${hadoop} streaming -D mapred.job.priority=VERY_HIGH \
-conf ./conf/hadoop-site.xml \
-inputformat org.apache.hadoop.mapred.TextInputFormat \
-jobconf mapred.combine.input.format.local.only=false \
-jobconf mapred.combine.input.format.dir.only=true \
-jobconf abaci.split.optimize.enable=false \
-jobconf mapred.max.map.failures.percent=10 \
-jobconf dfs.use.native.api=0 \
-jobconf mapred.job.queue.name=feed_qa_gzhl \
-jobconf mapred.max.split.size=30000000 \
-jobconf mapred.job.tracker=gzns-kunpeng-job.dmop.baidu.com:54311 \
-input "${INPUT_PROFILE_PATH}/part-*","${ACTIVE_USER_PATH}/part-*" \
-output ${OUTPUT_PATH} \
-mapper "./python/python/bin/python task.py mapper" \
-reducer "./python/python/bin/python task.py reducer log_date=${date}" \
-file ./scripts/*.py \
-file ./conf/*.conf \
-file ./infer_data/${yes_date}/* \
-jobconf mapred.reduce.tasks=1000 \
-jobconf mapred.job.reduce.capacity=1000 \
-jobconf mapred.job.map.capacity=4000 \
-jobconf mapreduce.reduce.memory.mb=4096 \
-jobconf mapreduce.reduce.java.opts="-Xmx3072m" \
-jobconf mapred.job.name="${JOB_NAME}" \
-jobconf abaci.split.remote=false \
-jobconf mapred.output.compress=true \
-jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
-cacheArchive afs://xingtian.afs.baidu.com:9902/user/feed/mlarch/feasign/sb_feed_live_small_video_v3/python_pb.tar.gz#python
if [[ $? -ne 0 ]]; then
    echo "[HADOOP ERROR]:job ${JOB_NAME} failed!"
    kill -TERM $PPID  # terminate the parent process
    exit 1
else
    ${hadoop} fs -conf ./conf/hadoop-site.xml -touchz "${OUTPUT_PATH}/to.hadoop.done"
    echo "[HADOOP INFO]:job ${JOB_NAME} succeeded!"
fi
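Because the job writes gzip-compressed output (mapred.output.compress=true), a quick sanity check after it finishes is to decompress a few records with the standard fs -text command, assuming the client used above supports it:
# Peek at the first records of the compressed job output
${hadoop} fs -conf ./conf/hadoop-site.xml -text ${OUTPUT_PATH}/part-* | head -n 20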
A final note: if this article helped you, please give it a like ٩(๑•̀ω•́๑)۶