基于fork + pipe的数据快速打横处理

最新推荐文章于 2021-01-31 19:53:21 发布

xsqlx

最新推荐文章于 2021-01-31 19:53:21 发布

阅读量464

点赞数 1

分类专栏：互联网数据结构数据挖掘 C语言推荐文章标签： Linux fork pipe

本文链接：https://blog.youkuaiyun.com/xsqlx/article/details/84780416

版权

数据挖掘同时被 3 个专栏收录

4 篇文章

订阅专栏

互联网

3 篇文章

订阅专栏

C语言

3 篇文章

订阅专栏

本文介绍了如何在面临大量序列数据需要快速转换成特定模型输入格式时，利用Linux的fork和pipe机制进行高效处理。通过流程图和代码示例详细展示了将<user、item>对转换的方法，适用于大数据量且时间紧迫的情况。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

需求

序列数据（sequential data）是机器学习过程中经常会碰到的一类数据形态。在推荐系统中尤其如此，比如我们需要根据用户的点击历史来预测用户下一次可能会点击的 Item 是什么。我们一般会从日志系统中抽取用户的历史行为，进行过滤和预处理之后，形成模型所需要的数据格式。然而，日志数据是以单次行为为单位堆积在一起的，可以抽象地理解为一个个 <user, item> 对。因此我们需要做如下形式的数据转换：

原始格式	目标格式
A、B A、C A、D B、C B、A A、E B、X A、X B、D B、P	A： B,C,D,E,X B： C,A,X,D,P

看起来比较简单，但当数据量比较大、又需要在较短的时间内完成时，就会比较棘手。

方案

实现

流程图

代码

#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "hash.h"

#define MXU 300       // max actions per user; v2h() skips users with more records than this
#define TMC (1L<<24)  // max total number of actions (capacity of the record array in v2h())
#define UMC (1L<<23)  // max number of distinct users (capacity of the per-user arrays and user hash)
#define VMC (1L<<20)  // max number of distinct items (capacity passed to the item hash)
typedef unsigned long long ULL;  // shorthand used for the routing hash value in main()
// Split a string on delimiters (same contract as BSD strsep).
//
// Returns the token that starts at *stringp and ends at the first character
// found in `delim` (that character is overwritten with NUL), then advances
// *stringp to the byte after it. When no delimiter remains, the rest of the
// string is returned and *stringp is set to NULL. Returns NULL when *stringp
// is already NULL. Adjacent delimiters yield empty tokens.
char *strsep(char **stringp, const char *delim) {
    char *start = *stringp;
    char *cur;
    const char *d;

    if (start == NULL) {
        return NULL;
    }
    for (cur = start; *cur != '\0'; cur++) {
        for (d = delim; *d != '\0'; d++) {
            if (*d == *cur) {
                // Terminate the token in place and resume after the delimiter.
                *cur = '\0';
                *stringp = cur + 1;
                return start;
            }
        }
    }
    // Reached the end of the string: this was the last token.
    *stringp = NULL;
    return start;
}

// Build a 256-entry membership table from a character-set description.
//
// `input`/`len` describe the set; `mask` must have room for 256 bytes and is
// cleared first. Each plain byte sets mask[byte] = 1. A four-byte group
// "a..z" (low, '.', '.', high with high >= low) sets the whole inclusive
// range. A ".." that is not part of such a group is reported as an error: the
// first '.' is skipped, scanning continues, and the function returns -1.
// Returns 0 when no such stray ".." was seen.
static inline int charmask(unsigned char *input, int len, char *mask) {
    unsigned char *stop = input + len;
    int status = 0;

    memset(mask, 0, 256);
    while (input < stop) {
        unsigned char lo = *input;
        int left = (int)(stop - input);  // bytes remaining, including `lo`

        if (left > 3 && input[1] == '.' && input[2] == '.' && input[3] >= lo) {
            // Well-formed range "lo..hi": mark every byte in [lo, hi].
            memset(mask + lo, 1, input[3] - lo + 1);
            input += 4;
        } else if (left > 1 && lo == '.' && input[1] == '.') {
            // Stray "..": every sub-case is an error, so just flag it.
            status = -1;
            input++;
        } else {
            mask[lo] = 1;
            input++;
        }
    }
    return status;
}

// Returns non-zero when `ch` is one of the bytes trim() strips:
// space, '\n', '\r', '\t' or '\v'.
static int trim_is_blank(unsigned char ch) {
    return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v';
}

// Strip blanks from a NUL-terminated string in place.
//
// c:    string to trim; modified in place. NULL is passed through.
// mode: bit 0 (value 1) strips leading blanks, bit 1 (value 2) strips
//       trailing blanks; 3 strips both ends.
//
// Returns a pointer into the caller's buffer at the first kept character
// (c itself when nothing is stripped from the front), or NULL when c is NULL.
// An all-blank input yields a pointer to the string's terminator.
//
// The scan is bounded by strlen(c), so the terminator is never treated as a
// strippable character and no byte beyond it is ever written. The strip set
// is fixed, so it is tested directly instead of rebuilding a 256-entry table
// on every call.
static char * trim(char *c, int mode){
    size_t len;

    if (!c)
        return NULL;
    len = strlen(c);
    if (mode & 1) {
        while (len > 0 && trim_is_blank((unsigned char)*c)) {
            c++;
            len--;
        }
    }
    if (mode & 2) {
        while (len > 0 && trim_is_blank((unsigned char)c[len - 1])) {
            len--;
        }
    }
    // len <= original strlen, so this lands on or before the old terminator.
    c[len] = '\0';
    return c;
}
// Child-side "vertical to horizontal" pivot from the `in` stream to `out`.
//
// Reads lines of the form "<user>\t<item>" from `in` and writes one line per
// user to `out`: "<user>\t<item>\t<item>...\n".
//
// Each record is stored as {item id, index of that user's previous record},
// and user_st[uid] holds the index of the user's most recent record, so every
// user owns a singly linked list threaded through toks_st. Because records
// are prepended, a user's items are written newest-first. Users are written in
// id order as returned by hash_add(); users with more than MXU records are
// skipped entirely.
//
// Lines without a tab-separated item field are ignored, input beyond TMC
// records is ignored with a warning, and records whose ids fall outside the
// preallocated tables are dropped and counted. On allocation failure nothing
// is written and an error is reported on stderr.
static void v2h(FILE *in, FILE *out){
    Hash *uhs = hash_create(UMC, STRING);
    Hash *vhs = hash_create(VMC, STRING);
    int (*toks_st)[2] = calloc(TMC, sizeof *toks_st);  // [0] item id, [1] previous record or -1
    int *user_st = malloc(UMC * sizeof *user_st);      // head record per user, -1 when none
    int *user_ct = calloc(UMC, sizeof *user_ct);       // record count per user
    int user_ids_cnt = 0;
    int tk = 0;                                        // next free slot in toks_st
    long dropped = 0;
    char buf[1024] = {0};

    // NOTE(review): assumes hash_create() reports failure as NULL - confirm in hash.h.
    if (!uhs || !vhs || !toks_st || !user_st || !user_ct) {
        fprintf(stderr, "v2h: out of memory\n");
        goto cleanup;
    }
    // All-ones bytes give -1 in every int slot: "no record yet".
    memset(user_st, -1, UMC * sizeof *user_st);

    while (NULL != fgets(buf, sizeof buf, in)){
        char *rest = trim(buf, 3);
        char *user = strsep(&rest, "\t");
        char *item = strsep(&rest, "\t");
        int uid, iid;

        if (user == NULL || item == NULL) {
            // Blank line or no item field: nothing to record.
            continue;
        }
        if (tk >= TMC) {
            fprintf(stderr, "v2h: more than %ld records, ignoring the rest\n", (long)TMC);
            break;
        }
        uid = hash_add(uhs, user);
        iid = hash_add(vhs, item);
        if (uid < 0 || uid >= UMC || iid < 0) {
            // Id does not fit the preallocated per-user tables.
            dropped++;
            continue;
        }
        toks_st[tk][0] = iid;
        toks_st[tk][1] = user_st[uid];
        user_st[uid] = tk;
        user_ct[uid] += 1;
        tk += 1;
    }
    if (dropped > 0) {
        fprintf(stderr, "v2h: dropped %ld records with out-of-range ids\n", dropped);
    }

    user_ids_cnt = hash_cnt(uhs);
    for (int u = 0; u < user_ids_cnt && u < UMC; u++){
        if (user_ct[u] > MXU){
            continue;
        }
        fprintf(out, "%s", hash_keystr(uhs, u));
        for (int p = user_st[u]; p != -1; p = toks_st[p][1]){
            fprintf(out, "\t%s", hash_keystr(vhs, toks_st[p][0]));
        }
        fprintf(out, "\n");
    }

cleanup:
    // The hash tables are left to process exit: no destructor from hash.h is
    // visible from this file.
    free(user_ct);
    free(user_st);
    free(toks_st);
}
// Hash of a NUL-terminated key string.
//
// Starts from 5381 and, for every byte of the key, multiplies the running
// value by 33 and adds the byte (as a plain `char`, so the contribution of
// bytes >= 0x80 follows the platform's char signedness). Arithmetic wraps
// modulo 2^64. main() uses the result to pick the child that owns a user.
static unsigned long long hash_func(char *arKey)
{
    unsigned long long acc = 5381;
    int remaining = strlen(arKey);
    const char *p = arKey;

    while (remaining-- > 0) {
        acc = acc * 33 + *p++;
    }
    return acc;
}
// main function
int main(int argc, char *argv[]) {
    int i;
    int mpcnt = atoi(argv[1]);  // 子进程数
    int (*pipfds)[2] = (int(*)[2])calloc(mpcnt, sizeof(int[2]));
    pid_t *pids = (pid_t*)calloc(mpcnt, sizeof(pid_t));
    pid_t  pid;
    // create pipes
    for (i = 0; i < mpcnt; i++){
        if(pipe(pipfds[i]) == -1){
            fprintf(stderr, "pipe create failed\n");
            return -1;
        }
    }
    // fork 子进程
    for (i = 0; i < mpcnt; i++){
        pid = fork();
        if (pid == -1){
            break;
        }
        else if (pid == 0){
            pids[i] = getpid();
            break;
        }
        else{
            pids[i] = pid;
        }
    }
    if (pid == -1){
        fprintf(stderr, "process create failed\n");
        _exit(EXIT_FAILURE);
    }
    // 子进程处理逻辑
    else if (pid == 0) {
        char  buf[1024] = {0};
        char  outs[100] = {0};
        int   c = 0, p = -1;
        pid_t self = getpid();
        // 关闭除自身管道读端之外的所有读写端
        for (c = 0; c < mpcnt ; c++){
            close(pipfds[c][1]);
            if (pids[c] == self){
                p = c;
            }
            else{
                close(pipfds[c][0]);
            }
        }
        if (p == -1){
            _exit(EXIT_FAILURE);
        }
        sprintf(outs, "%d.txt", p);
        FILE * inf = fdopen(pipfds[p][0], "r");
        FILE * fp  = fopen(outs, "w");
        // transfroming...
        v2h(inf, fp);
        fclose(fp);
        fclose(inf);
        close(pipfds[p][0]);
        _exit(EXIT_SUCCESS);
    }
    else { // 主进程逻辑
        // 关闭所有管道读端
        for (int c = 0; c < mpcnt; c++){
            close(pipfds[c][0]);
        }
        char buf[1024] = {0};
        char out[1024] = {0};
        char *string, *token;
        while (NULL != fgets(buf, 1024, stdin)){
            string = trim(buf, 3);
            memmove(out, string, strlen(string) + 2);
            out[strlen(string)] = '\n';
            out[strlen(string) + 1] = 0;
            token       = strsep(&string, "\t");
            ULL sign    = hash_func(token);
            ULL m = sign % mpcnt;
            if (pids[m] == 0){
                fprintf(stderr, "subprocess create failed, just skip it\n");
            }
            else {
                // 分发数据到各子进程管道
                write(pipfds[m][1], out, strlen(out));
            }
        }
        // 关闭所有管道写端
        for (int c = 0; c < mpcnt; c++){
            close(pipfds[c][1]);
        }
        wait(NULL);
        exit(EXIT_SUCCESS);
    }
    return 0;
}

关于hash.h参见：hash.h