#!/usr/bin/env python
import sys
from collections import defaultdict


class Mapper(object):
    """
    Mapper
    """
    def __init__(self):
        return

    def mapper(self, arg_map):
        """
        mapper
        """
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            line_sps = line.split('\t')
            if len(line_sps) < 6:
                continue
            # Get the label field
            label = line_sps[2]
            if not label:
                continue
            # Extract the anchor_id value
            anchor_id_value = None
            for pair in label.split():
                if pair.startswith("anchor_id:"):
                    _, value = pair.split(":", 1)
                    anchor_id_value = value
                    break
            if anchor_id_value:
                # Emit the anchor_id, one line per occurrence
                print(f"{anchor_id_value}")

class Reducer(object):
    """
    Reducer
    """
    def __init__(self):
        return

    def reducer(self, arg_map):
        """
        reducer: count how often each anchor_id appears, then the distribution of those counts
        """
        anchor_count = defaultdict(int)
        # Aggregate the count per anchor_id
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            anchor_id = line
            anchor_count[anchor_id] += 1
        # Build the distribution of occurrence counts
        count_distribution = defaultdict(int)
        for count in anchor_count.values():
            count_distribution[count] += 1
        # Emit the results
        for count, num_anchor_ids in sorted(count_distribution.items()):
            print(f"{count}\t{num_anchor_ids}")

if __name__ == '__main__':
    action = sys.argv[1]
    arg_map = {}
    if len(sys.argv) > 2:
        for i in range(2, len(sys.argv)):
            arg_sps = sys.argv[i].split('=')
            if len(arg_sps) == 2:
                arg_map[arg_sps[0]] = arg_sps[1]
    if action == 'mapper':
        mapper = Mapper()
        mapper.mapper(arg_map)
    if action == 'reducer':
        reducer = Reducer()
        reducer.reducer(arg_map)
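Before wiring the script into a Hadoop Streaming job, it can be smoke-tested locally by simulating the shuffle with sort, since the mapper and reducer only talk to stdin/stdout. A minimal sketch, assuming the script above is saved as task.py and sample.txt is a small tab-separated extract of the real input (both file names are placeholders):
# mapper -> sort -> reducer mimics the map/shuffle/reduce stages of the streaming job
cat sample.txt \
    | python task.py mapper \
    | sort \
    | python task.py reducer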
Example
#!/bin/python
# coding=utf-8
"""
File: merge_active_up.py
Author: dongdoudou(dongdoudou@baidu.com)
Date: 2022/10/09 16:13:48
"""
from __future__ import print_function  # so the multi-argument print calls below behave consistently
import sys
import json
import re
import os
import random
import math
import base64
reload(sys)
sys.setdefaultencoding("utf-8")


class Mapper(object):
    """
    Mapper
    """
    def __init__(self, arg_map):
        self._load_his_itemid_(arg_map)
        return

    def mapper(self, arg_map):
        self.debug = int(arg_map.get('debug', '0'))
        # Read the input line by line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            # if 'uid_pos_itemID' in line:
            #     # Handle the uid_7d_pos_itemid file
            #     uid, uid_pos_itemID_str, pos_itemids = line.split("\t")
            #     if len(uid) < 5:
            #         continue
            #     print("{}\tPOS_ITEMS\t{}".format(uid, pos_itemids))
            if 'uid_itemids_scores' in line:
                # Handle the uid_top_itemid_score file
                uid, uid_itemids_scores_str, items_scores = line.split("\t")
                # items_scores = items_scores.split(",")[:20]  # keep only the first 20
                item_ids = [item.split(":")[0] for item in items_scores.split(',')]
                print("{}\tTOP_ITEMS\t{}".format(uid, ','.join(item_ids)))

    def _load_his_itemid_(self, arg_map):
        his_itemid_path = arg_map["history_itemid"]
        with open(his_itemid_path) as file_obj:
            lines = file_obj.readlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if 'uid_pos_itemID' in line:
                uid, uid_pos_itemID_str, pos_itemids = line.split("\t")
                if len(uid) < 5:
                    continue
                print("{}\tPOS_ITEMS\t{}".format(uid, pos_itemids))
        # end for

class Reducer(object):
    """
    Reducer
    """
    def __init__(self):
        return

    def proc_group(self, arg_map, now_uid, now_lines):
        """
        proc_group
        """
        return

    def reducer(self, arg_map):
        """
        reducer
        """
        self.debug = int(arg_map.get('debug', '0'))
        pro_uid = None
        top_itemids = []
        pos_itemids = []
        all_uid_pos_itemid_cnt = 0  # total number of positive item IDs
        all_uid_top_match_cnt = {i: 0 for i in range(10, 101, 10)}  # match counts for Top 10, 20, ..., 100
        all_uid_top_cover_rate = {i: 0 for i in range(10, 101, 10)}  # coverage rates for Top 10, 20, ..., 100
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            uid, data_type, values = line.split("\t")
            # When a new user ID appears, process the previous user's data first
            if uid != pro_uid:
                if pro_uid:
                    if pos_itemids and top_itemids:
                        if self.debug:
                            print("UID: ", pro_uid, " POS_ITEMS: ", pos_itemids, " TOP_ITEMS: ", top_itemids)
                            for item in pos_itemids:
                                if item in top_itemids:
                                    print("UID: ", pro_uid, "MATCH_ITEM: ", item)
                        # Accumulate match statistics for this user
                        all_uid_pos_itemid_cnt += len(pos_itemids)
                        for top_n in range(10, 101, 10):
                            all_uid_top_match_cnt[top_n] += sum(1 for item in pos_itemids if item in top_itemids[:top_n])
                # Reset state and start the new user
                pro_uid = uid
                top_itemids = []
                pos_itemids = []
            # Update the corresponding list according to the record type
            if data_type == "TOP_ITEMS":
                top_itemids = values.split(",")
            elif data_type == "POS_ITEMS":
                pos_itemids = values.split(",")
        # Process the last user's data
        if pro_uid:
            if pos_itemids and top_itemids:
                if self.debug:
                    print("UID: ", pro_uid, " POS_ITEMS: ", pos_itemids, " TOP_ITEMS: ", top_itemids)
                all_uid_pos_itemid_cnt += len(pos_itemids)
                for top_n in range(10, 101, 10):
                    all_uid_top_match_cnt[top_n] += sum(1 for item in pos_itemids if item in top_itemids[:top_n])
        # Emit the final results
        print("Total positive itemid count: {}".format(all_uid_pos_itemid_cnt))
        for top_n in range(10, 101, 10):
            match_cnt = all_uid_top_match_cnt[top_n]
            coverage = float(match_cnt) / float(max(1, all_uid_pos_itemid_cnt))
            print("Top {} hit count: {}".format(top_n, str(match_cnt)))
            print("Top {} coverage: {:.3f}".format(top_n, coverage))

if __name__ == '__main__':
    action = sys.argv[1]
    arg_map = {}
    if len(sys.argv) > 2:
        for i in range(2, len(sys.argv)):
            arg_sps = sys.argv[i].split('=')
            if len(arg_sps) == 2:
                arg_map[arg_sps[0]] = arg_sps[1]
    if action == 'mapper':
        mapper = Mapper(arg_map)
        mapper.mapper(arg_map)
    if action == 'reducer':
        reducer = Reducer()
        reducer.reducer(arg_map)
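This script can also be dry-run locally. The mapper needs the history_itemid argument pointing at the positive-item side file, and the reducer assumes its input is grouped by uid so that the TOP_ITEMS and POS_ITEMS records of the same user arrive together; a plain sort provides that grouping in a local pipeline. A sketch, assuming a Python 2 interpreter (matching the reload(sys) idiom above) and two placeholder sample files, top_scores.txt and pos_items.txt, that carry the uid_itemids_scores and uid_pos_itemID markers the code checks for; on the cluster the side file appears to be shipped to each mapper via -file in the submission script below:
# Local dry run; file names are placeholders
cat top_scores.txt \
    | python merge_active_up.py mapper history_itemid=pos_items.txt \
    | sort \
    | python merge_active_up.py reducer debug=0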
Job submission
#!/bin/bash
if [ -z "$1" ]; then
    date=$(date +%Y%m%d -d '-1 days')
else
    date=$1
fi
JOB_NAME=job_name_${date}
INPUT_PATH="afs://xxx.afs.xxx.com:xxx"
OUTPUT_PATH="afs://xxx.afs.xxx.com:xxx"
hadoop=~/.hmpclient/hadoop-client/hadoop/bin/hadoop
HADOOP_CONF=./conf/hadoop-site.xml
while true
do
    ${hadoop} fs -conf ./conf/hadoop-site.xml -test -e ${INPUT_PATH}/_SUCCESS
    if [ $? -ne 0 ]; then
        echo "event_day=${INPUT_PATH} not ready"
        sleep 3m
    else
        echo "event_day=${INPUT_PATH}/_SUCCESS exists"
        break
    fi
done
echo "INPUT_PATH: $INPUT_PATH"
echo "OUTPUT_PATH: $OUTPUT_PATH"
${hadoop} fs -conf ./conf/hadoop-site.xml -rmr ${OUTPUT_PATH}
${hadoop} streaming -D mapred.job.priority=VERY_HIGH \
-conf ./conf/hadoop-site.xml \
-inputformat org.apache.hadoop.mapred.TextInputFormat \
-jobconf mapred.combine.input.format.local.only=false \
-jobconf mapred.combine.input.format.dir.only=true \
-jobconf abaci.split.optimize.enable=false \
-jobconf mapred.max.map.failures.percent=10 \
-jobconf dfs.use.native.api=0 \
-jobconf mapred.job.queue.name=feed_qa_gzhl \
-jobconf mapred.max.split.size=30000000 \
-jobconf mapred.job.tracker=gzns-kunpeng-job.dmop.baidu.com:54311 \
-input "${INPUT_PROFILE_PATH}/part-*","${ACTIVE_USER_PATH}/part-*" \
-output ${OUTPUT_PATH} \
-mapper "./python/python/bin/python task.py mapper" \
-reducer "./python/python/bin/python task.py reducer log_date=${date}" \
-file ./scripts/*.py \
-file ./conf/*.conf \
-file ./infer_data/${yes_date}/* \
-jobconf mapred.reduce.tasks=1000 \
-jobconf mapred.job.reduce.capacity=1000 \
-jobconf mapred.job.map.capacity=4000 \
-jobconf mapreduce.reduce.memory.mb=4096 \
-jobconf mapreduce.reduce.java.opts="-Xmx3072m" \
-jobconf mapred.job.name="${JOB_NAME}" \
-jobconf abaci.split.remote=false \
-jobconf mapred.output.compress=true \
-jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
-cacheArchive afs://xingtian.afs.baidu.com:9902/user/feed/mlarch/feasign/sb_feed_live_small_video_v3/python_pb.tar.gz#python
if [[ $? -ne 0 ]]; then
    echo "[HADOOP ERROR]:job ${JOB_NAME} failed!"
    kill -TERM $PPID  # terminate the parent process
    exit 1
else
    ${hadoop} fs -conf ./conf/hadoop-site.xml -touchz "${OUTPUT_PATH}/to.hadoop.done"
    echo "[HADOOP INFO]:job ${JOB_NAME} succeeded!"
fi
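Because the job writes gzip-compressed output (mapred.output.compress=true), a quick sanity check after it finishes is to decompress a few records with the standard fs -text command, assuming the client used above supports it:
# Peek at the first records of the compressed job output
${hadoop} fs -conf ./conf/hadoop-site.xml -text ${OUTPUT_PATH}/part-* | head -n 20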
A final note: if this article helped you, please give it a like ٩(๑•̀ω•́๑)۶