需求
序列数据(sequential data) 是机器学习过程中经常会碰到的一类数据形态。在推荐系统中尤其如此,比如我们需要根据用户的点击历史来预测用户下一次可能会点击的Item是什么。我们一般会从日志系统中去抽取用户的历史行为,进行过滤和预处理之后,形成我们模型所需要的数据格式。然而,日志数据是以单次行为为单位堆积在一起的,可以抽象地理解为 <user, item> 二元组。因此我们需要做如下形式的数据转换:
原始格式 | 目标格式 |
---|---|
A、B A、C A、D B、C B、A A、E B、X A、X B、D B、P | A: B,C,D,E,X B: C,A,X,D,P |
看起来比较简单,但当数据量比较大、又需要在较短的时间内完成时,就会比较棘手。
方案
实现
流程图
代码
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "hash.h"
#define MXU 300 // maximum number of behaviors per user
#define TMC (1L<<24) // maximum number of behavior records
#define UMC (1L<<23) // maximum number of users
#define VMC (1L<<20) // maximum number of items (resources)
typedef unsigned long long ULL;
/*
 * String splitting: detach the next token from *stringp.
 *
 * The token starts at *stringp and ends just before the first byte that
 * appears in `delim` (or at the end of the string). That delimiter byte is
 * overwritten with '\0' and *stringp is advanced past it. When no delimiter
 * is found, *stringp is set to NULL so the following call returns NULL.
 *
 * Returns the start of the token, or NULL when *stringp is already NULL.
 * Adjacent delimiters yield empty tokens. The input buffer is modified.
 */
char *strsep(char **stringp, const char *delim) {
    char *token = *stringp;
    if (token == NULL) {
        return NULL;
    }

    /* First byte that is a delimiter, or the terminating NUL. */
    char *stop = token + strcspn(token, delim);
    if (*stop == '\0') {
        /* Ran off the end: this was the last token. */
        *stringp = NULL;
    } else {
        *stop = '\0';
        *stringp = stop + 1;
    }
    return token;
}
/*
 * Build a 256-entry membership table from a character-set specification.
 *
 * input: the specification bytes (may contain embedded NULs).
 * len:   number of bytes in `input`.
 * mask:  output table of at least 256 bytes; cleared first, then mask[b]
 *        is set to 1 for every byte b that belongs to the set.
 *
 * A plain byte selects itself. A four-byte sequence "X..Y" with Y >= X
 * selects every byte from X through Y inclusive. A ".." pair that is not
 * part of such a range is treated as malformed: its first '.' is skipped
 * without being marked and the function will return -1, but scanning
 * continues with the following byte.
 *
 * Returns 0 when the whole specification was well formed, -1 otherwise.
 */
static inline int charmask(unsigned char *input, int len, char *mask) {
    unsigned char *const stop = input + len;
    int status = 0;

    memset(mask, 0, 256);
    while (input < stop) {
        const unsigned char lo = *input;

        if (input + 3 < stop && input[1] == '.' && input[2] == '.'
                && input[3] >= lo) {
            /* Well-formed range "lo..hi": mark the whole interval. */
            memset(mask + lo, 1, input[3] - lo + 1);
            input += 4;
        } else if (input + 1 < stop && lo == '.' && input[1] == '.') {
            /* Stray "..": flag the error and step over the first dot. */
            status = -1;
            input += 1;
        } else {
            mask[lo] = 1;
            input += 1;
        }
    }
    return status;
}
/* Returns nonzero when `ch` is a byte that trim() strips: space, \n, \r, \t or \v. */
static int trim_is_blank(unsigned char ch) {
    switch (ch) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
    case '\v':
        return 1;
    default:
        return 0;
    }
}

/*
 * Strip whitespace from a NUL-terminated string in place.
 *
 * c:    string to trim; may be NULL.
 * mode: bit 1 (value 1) strips leading whitespace, bit 2 (value 2) strips
 *       trailing whitespace; 3 strips both ends, 0 strips nothing.
 *
 * Returns a pointer into the original buffer at the first kept byte, or
 * NULL when `c` is NULL. Trailing whitespace is removed by writing a new
 * terminator; nothing outside the original string (including its
 * terminator) is ever read or written.
 *
 * The length deliberately excludes the terminator: counting it made the
 * final terminator store land one byte past the string, and let a blank
 * string advance the returned pointer beyond its own NUL.
 */
static char * trim(char *c, int mode){
    if (!c)
        return NULL;

    size_t len = strlen(c);

    if (mode & 1) {
        size_t lead = 0;
        while (lead < len && trim_is_blank((unsigned char)c[lead])) {
            lead++;
        }
        c += lead;
        len -= lead;
    }
    if (mode & 2) {
        while (len > 0 && trim_is_blank((unsigned char)c[len - 1])) {
            len--;
        }
    }
    c[len] = '\0';
    return c;
}
/*
 * Child-process "vertical to horizontal" transform from `in` to `out`.
 *
 * Input:  one record per line, "user<TAB>item" (extra fields are ignored).
 * Output: one line per user: the user key followed by its item keys, each
 *         preceded by a TAB.
 *
 * Records are stored in toks_st as a per-user singly linked list:
 *   toks_st[k][0] = item id of record k
 *   toks_st[k][1] = index of the same user's previous record, or -1
 * user_st[u] holds the index of user u's most recent record, so items are
 * emitted newest-first (reverse of input order). Users with more than MXU
 * records are not written at all.
 *
 * Records that cannot be stored are counted and reported on stderr instead
 * of corrupting memory: lines without an item field, records beyond the TMC
 * capacity, and user ids outside [0, UMC).
 *
 * NOTE(review): hash_add is presumed to return a dense integer id per
 * distinct key, hash_cnt the number of distinct keys, and hash_keystr the
 * key for an id; hash_create is presumed to return NULL on failure.
 * Confirm against hash.h.
 */
static void v2h(FILE *in, FILE *out){
    Hash * uhs = hash_create(UMC, STRING);
    Hash * vhs = hash_create(VMC, STRING);
    int (*toks_st)[2] = calloc(TMC, sizeof *toks_st);
    int *user_st = malloc(UMC * sizeof *user_st);
    int *user_ct = calloc(UMC, sizeof *user_ct);
    long dropped = 0;
    int tk = 0;
    char buf[1024] = {0};

    if (!uhs || !vhs || !toks_st || !user_st || !user_ct) {
        fprintf(stderr, "v2h: out of memory\n");
        goto cleanup;
    }
    /* All-ones bytes give every int the value -1, i.e. "no record yet". */
    memset(user_st, -1, UMC * sizeof *user_st);

    while (NULL != fgets(buf, sizeof buf, in)){
        char *cursor = trim(buf, 3);
        char *user = strsep(&cursor, "\t");
        char *item = strsep(&cursor, "\t");

        /* Keep draining the stream even when dropping, so the writer on
         * the other end is never left with a closed pipe. */
        if (user == NULL || item == NULL || tk >= TMC) {
            dropped++;
            continue;
        }
        int uid = hash_add(uhs, user);
        int iid = hash_add(vhs, item);
        if (uid < 0 || uid >= UMC) {
            dropped++;
            continue;
        }
        toks_st[tk][0] = iid;
        toks_st[tk][1] = user_st[uid];  /* link to the previous record */
        user_st[uid] = tk;              /* this record is the new head */
        user_ct[uid] += 1;
        tk += 1;
    }
    if (dropped > 0) {
        fprintf(stderr, "v2h: dropped %ld malformed or over-capacity records\n", dropped);
    }

    int user_ids_cnt = hash_cnt(uhs);
    if (user_ids_cnt > UMC) {
        user_ids_cnt = UMC;  /* never index past the per-user arrays */
    }
    for (int u = 0; u < user_ids_cnt; u++){
        if (user_ct[u] > MXU){
            continue;
        }
        fprintf(out, "%s", hash_keystr(uhs, u));
        for (int p = user_st[u]; p != -1; p = toks_st[p][1]){
            fprintf(out, "\t%s", hash_keystr(vhs, toks_st[p][0]));
        }
        fprintf(out, "\n");
    }

cleanup:
    /* NOTE(review): uhs/vhs are not released here because no destructor
     * from hash.h is visible in this file. */
    free(user_ct);
    free(user_st);
    free(toks_st);
}
/*
 * Hash a NUL-terminated key string.
 *
 * Starts from 5381 and, for each byte of the key in order, multiplies the
 * running value by 33 and adds the byte (arithmetic wraps modulo 2^64).
 * Bytes are added as plain `char`, so their signedness follows the
 * platform's `char` type.
 *
 * Returns the resulting 64-bit hash; the empty string hashes to 5381.
 */
static unsigned long long hash_func(char *arKey)
{
    unsigned long long acc = 5381;
    const int key_len = strlen(arKey);

    for (int i = 0; i < key_len; i++) {
        acc = acc * 33 + arKey[i];
    }
    return acc;
}
/*
 * Write all `len` bytes of `data` to `fd`, continuing after short writes
 * and retrying when interrupted by a signal.
 * Returns 0 on success, -1 on error (errno is left as set by write()).
 */
static int write_all(int fd, const char *data, size_t len) {
    while (len > 0) {
        ssize_t n = write(fd, data, len);
        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        data += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * Wait for the first `count` children listed in `pids`.
 * Returns EXIT_SUCCESS only when every one of them exited with status 0.
 */
static int reap_children(const pid_t *pids, int count) {
    int rc = EXIT_SUCCESS;
    for (int i = 0; i < count; i++) {
        int status = 0;
        if (waitpid(pids[i], &status, 0) == -1
                || !WIFEXITED(status)
                || WEXITSTATUS(status) != EXIT_SUCCESS) {
            rc = EXIT_FAILURE;
        }
    }
    return rc;
}

/*
 * Body of child number `slot`: read records from its own pipe, run v2h on
 * them and write the result to "<slot>.txt". Never returns.
 */
static void run_child(int (*pipfds)[2], int mpcnt, int slot) {
    /* Close every pipe end except the read end of this child's own pipe;
     * a stray open write end would keep the reader from ever seeing EOF. */
    for (int c = 0; c < mpcnt; c++) {
        close(pipfds[c][1]);
        if (c != slot) {
            close(pipfds[c][0]);
        }
    }

    char outs[100] = {0};
    snprintf(outs, sizeof outs, "%d.txt", slot);

    FILE *inf = fdopen(pipfds[slot][0], "r");
    if (inf == NULL) {
        perror("fdopen");
        _exit(EXIT_FAILURE);
    }
    FILE *fp = fopen(outs, "w");
    if (fp == NULL) {
        perror(outs);
        _exit(EXIT_FAILURE);
    }

    // transforming...
    v2h(inf, fp);

    /* fclose flushes the output; a failure here means lost data. */
    int rc = (fclose(fp) == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
    /* fclose also closes the underlying pipe descriptor, so no close(). */
    fclose(inf);
    _exit(rc);
}

/*
 * Entry point.
 *
 * Usage: prog <child-process-count> < input
 *
 * Forks the requested number of children, each connected through its own
 * pipe. Every stdin line is trimmed, its first TAB-separated field (the
 * user key) is hashed, and the line is sent to child (hash % count), so all
 * records of one user reach the same child. Child i writes "<i>.txt".
 *
 * Returns EXIT_SUCCESS when all input was dispatched and every child exited
 * successfully, EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <child-process-count>\n",
                argc > 0 ? argv[0] : "v2h");
        return EXIT_FAILURE;
    }

    char *endp = NULL;
    long requested = strtol(argv[1], &endp, 10);
    int mpcnt = (int)requested; // number of child processes
    if (endp == argv[1] || *endp != '\0' || requested <= 0
            || (long)mpcnt != requested) {
        /* A zero count would also make the modulo below divide by zero. */
        fprintf(stderr, "invalid child process count: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    int (*pipfds)[2] = calloc((size_t)mpcnt, sizeof *pipfds);
    pid_t *pids = calloc((size_t)mpcnt, sizeof *pids);
    if (pipfds == NULL || pids == NULL) {
        fprintf(stderr, "out of memory\n");
        free(pipfds);
        free(pids);
        return EXIT_FAILURE;
    }

    // create pipes
    for (int i = 0; i < mpcnt; i++) {
        if (pipe(pipfds[i]) == -1) {
            fprintf(stderr, "pipe create failed\n");
            for (int c = 0; c < i; c++) {
                close(pipfds[c][0]);
                close(pipfds[c][1]);
            }
            free(pipfds);
            free(pids);
            return EXIT_FAILURE;
        }
    }

    // fork the child processes
    for (int i = 0; i < mpcnt; i++) {
        pid_t pid = fork();
        if (pid == 0) {
            run_child(pipfds, mpcnt, i); /* does not return */
        }
        if (pid == -1) {
            fprintf(stderr, "process create failed\n");
            /* Closing every pipe gives the children already started an
             * immediate EOF so they can finish and be reaped. */
            for (int c = 0; c < mpcnt; c++) {
                close(pipfds[c][0]);
                close(pipfds[c][1]);
            }
            reap_children(pids, i);
            free(pipfds);
            free(pids);
            return EXIT_FAILURE;
        }
        pids[i] = pid;
    }

    // parent: close all pipe read ends
    for (int c = 0; c < mpcnt; c++) {
        close(pipfds[c][0]);
    }

    int rc = EXIT_SUCCESS;
    char buf[1024] = {0};
    /* One extra byte: a maximal 1023-character line still needs room for
     * the newline that is appended before forwarding. */
    char out[sizeof buf + 1];

    while (NULL != fgets(buf, sizeof buf, stdin)) {
        char *line = trim(buf, 3);
        size_t n = strlen(line);

        /* Copy before strsep() overwrites the first TAB with a NUL. */
        memcpy(out, line, n);
        out[n] = '\n';

        char *token = strsep(&line, "\t");
        unsigned long long sign = hash_func(token);
        unsigned long long m = sign % (unsigned long long)mpcnt;

        // dispatch the record to the selected child's pipe
        if (write_all(pipfds[m][1], out, n + 1) != 0) {
            perror("write to child pipe");
            rc = EXIT_FAILURE;
            break;
        }
    }

    // close all pipe write ends so the children see EOF
    for (int c = 0; c < mpcnt; c++) {
        close(pipfds[c][1]);
    }

    /* Wait for every child, not just the first one to finish. */
    if (reap_children(pids, mpcnt) != EXIT_SUCCESS) {
        rc = EXIT_FAILURE;
    }

    free(pipfds);
    free(pids);
    return rc;
}
关于hash.h参见:hash.h