C Code
Map: Mapper.c
#include <stdio.h>
#include <string.h>
#include <strings.h>   /* for index() */
#include <stdlib.h>

#define BUF_SIZE 2048
#define DELIM "\n"

int main(int argc, char *argv[]) {
    char buffer[BUF_SIZE];

    /* Read one line at a time from stdin. */
    while (fgets(buffer, BUF_SIZE - 1, stdin)) {
        int len = strlen(buffer);
        /* Strip the trailing newline. */
        if (len > 0 && buffer[len - 1] == '\n')
            buffer[len - 1] = 0;

        /* Skip lines that contain no space at all. */
        char *querys = index(buffer, ' ');
        char *query = NULL;
        if (querys == NULL)
            continue;
        querys += 1; /* not to include '\t' */

        /* Emit "<word>\t1" for every space-separated word. */
        query = strtok(buffer, " ");
        while (query) {
            printf("%s\t1\n", query);
            query = strtok(NULL, " ");
        }
    }
    return 0;
}
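Before hooking the mapper up to Hadoop streaming, it can be sanity-checked locally by piping a line of text into the compiled binary. This is just a quick sketch, assuming it has already been built as mapper.o with the gcc command shown further below:
$echo "hello world hello" | ./mapper.o
hello	1
world	1
hello	1
Each word comes out on its own line, followed by a tab and the count 1.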
Reduce: Reducer.c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define BUFFER_SIZE 1024
#define DELIM "\t"

int main(int argc, char *argv[]) {
    char strLastKey[BUFFER_SIZE];
    char strLine[BUFFER_SIZE];
    int count = 0;

    *strLastKey = '\0';
    *strLine = '\0';

    /* The streaming framework delivers map output sorted by key,
       so identical keys arrive on consecutive lines. */
    while (fgets(strLine, BUFFER_SIZE - 1, stdin)) {
        char *strCurrKey = NULL;
        char *strCurrNum = NULL;

        strCurrKey = strtok(strLine, DELIM);
        strCurrNum = strtok(NULL, DELIM);
        /* necessary to check error but.... */
        if (strCurrKey == NULL || strCurrNum == NULL)
            continue;

        /* First key seen: initialise the running key. */
        if (strLastKey[0] == '\0') {
            strcpy(strLastKey, strCurrKey);
        }

        if (strcmp(strCurrKey, strLastKey)) {
            /* Key changed: emit the finished count and start a new one. */
            printf("%s\t%d\n", strLastKey, count);
            count = atoi(strCurrNum);
        } else {
            /* Same key as before: keep accumulating. */
            count += atoi(strCurrNum);
        }
        strcpy(strLastKey, strCurrKey);
    }
    printf("%s\t%d\n", strLastKey, count); /* flush the last count */
    return 0;
}
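Because Hadoop streaming sorts the map output by key before it reaches the reducer, the whole word-count pipeline can be simulated locally with sort. Again just a sketch, assuming both binaries sit in the current directory:
$echo "hello world hello" | ./mapper.o | sort | ./reducer.o
hello	2
world	1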
First, compile the C code:
$gcc /home/user/mapred/Mapper.c -o mapper.o
$gcc /home/user/mapred/Reducer.c -o reducer.o
Create an input directory on HDFS:
$HADOOP_HOME/bin/hadoop fs -mkdir input
Then create a file /home/user/input/input.txt locally and upload it to HDFS:
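For example, the local file can be created with some sample text (the contents here are arbitrary):
$echo "hello world hello hadoop streaming" > /home/user/input/input.txt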
$HADOOP_HOME/bin/hadoop fs -put /home/user/input/input.txt input
The preparation is now complete; next, run the hadoop-streaming command:
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar -input input/* -output output -mapper "/home/user/mapred/mapper.o" -reducer "/home/user/mapred/reducer.o"
The log of a successful run is shown below:
packageJobJar: [/tmp/hadoop-cp/hadoop-unjar2910356701799592623/] [] /tmp/streamjob4550176904973722526.jar tmpDir=null
11/12/15 19:27:23 INFO mapred.FileInputFormat: Total input paths to process : 1
11/12/15 19:27:23 INFO streaming.StreamJob: getLocalDirs(): [/tmp/hadoop-cp/mapred/local]
11/12/15 19:27:23 INFO streaming.StreamJob: Running job: job_201112151707_0019
11/12/15 19:27:23 INFO streaming.StreamJob: To kill this job, run:
11/12/15 19:27:23 INFO streaming.StreamJob: /home//hadoop/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201112151707_0019
11/12/15 19:27:23 INFO streaming.StreamJob: Tracking URL: http://localhost:50030/jobdetails.jsp?jobid=job_201112151707_0019
11/12/15 19:27:24 INFO streaming.StreamJob: map 0% reduce 0%
11/12/15 19:27:34 INFO streaming.StreamJob: map 100% reduce 0%
11/12/15 19:27:46 INFO streaming.StreamJob: map 100% reduce 100%
11/12/15 19:27:49 INFO streaming.StreamJob: Job complete: job_201112151707_0019
11/12/15 19:27:49 INFO streaming.StreamJob: Output: output
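Once the job finishes, the word counts can be read back from HDFS (a quick check, assuming the default part-file naming in the output directory):
$HADOOP_HOME/bin/hadoop fs -cat output/part-*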