1. Write the program and save it as wordcount-simple.cpp:
#include "hadoop/Pipes.hh"
#include "hadoop/TemplateFactory.hh"
#include "hadoop/StringUtils.hh"

const std::string WORDCOUNT = "WORDCOUNT";
const std::string INPUT_WORDS = "INPUT_WORDS";
const std::string OUTPUT_WORDS = "OUTPUT_WORDS";

class WordCountMap: public HadoopPipes::Mapper {        // Mapper class
public:
  HadoopPipes::TaskContext::Counter* inputWords;

  WordCountMap(HadoopPipes::TaskContext& context) {
    inputWords = context.getCounter(WORDCOUNT, INPUT_WORDS);
  }

  void map(HadoopPipes::MapContext& context) {
    std::vector<std::string> words =
      HadoopUtils::splitString(context.getInputValue(), " "); // split the input line on spaces
    for (unsigned int i = 0; i < words.size(); ++i) {
      context.emit(words[i], "1");                     // emit each word as the key, with value "1"
    }
    context.incrementCounter(inputWords, words.size()); // bump the counter; this also reports progress to the framework
  }
};

class WordCountReduce: public HadoopPipes::Reducer {     // Reducer class
public:
  HadoopPipes::TaskContext::Counter* outputWords;

  WordCountReduce(HadoopPipes::TaskContext& context) {
    outputWords = context.getCounter(WORDCOUNT, OUTPUT_WORDS);
  }

  void reduce(HadoopPipes::ReduceContext& context) {
    int sum = 0;
    while (context.nextValue()) {
      sum += HadoopUtils::toInt(context.getInputValue()); // sum the occurrences of this word
    }
    context.emit(context.getInputKey(), HadoopUtils::toString(sum)); // emit the word and its total count
    context.incrementCounter(outputWords, 1);
  }
};

int main(int argc, char *argv[]) {
  return HadoopPipes::runTask(HadoopPipes::TemplateFactory<WordCountMap,
                                                           WordCountReduce>()); // run the task
}
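The Mapper and Reducer constructors receive a TaskContext, which also exposes the job configuration, so a task can read properties set in the job's XML file. The following is only a rough, untested sketch assuming the getJobConf()/hasKey()/get() accessors declared in Pipes.hh; the property name my.wordcount.separator is hypothetical and not part of the original example:

#include "hadoop/Pipes.hh"
#include "hadoop/StringUtils.hh"
#include <string>
#include <vector>

class ConfigurableWordCountMap: public HadoopPipes::Mapper {
public:
  std::string separator;

  ConfigurableWordCountMap(HadoopPipes::TaskContext& context) {
    // Read a custom property from the job configuration if it was set
    // (e.g. in word.xml); fall back to a single space otherwise.
    const HadoopPipes::JobConf* conf = context.getJobConf();
    if (conf != NULL && conf->hasKey("my.wordcount.separator")) {
      separator = conf->get("my.wordcount.separator");
    } else {
      separator = " ";
    }
  }

  void map(HadoopPipes::MapContext& context) {
    std::vector<std::string> words =
      HadoopUtils::splitString(context.getInputValue(), separator.c_str());
    for (unsigned int i = 0; i < words.size(); ++i) {
      context.emit(words[i], "1");
    }
  }
};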
2. Makefile:
CC = g++
HADOOP_INSTALL = $(HADOOP_HOME)
PLATFORM = Linux-i386-32
CPPFLAGS = -O2 -m32 -I$(HADOOP_INSTALL)/c++/$(PLATFORM)/include
LDFLAGS += -lcrypto -lhadooppipes -lhadooputils -lpthread
wordcount-simple: wordcount-simple.cpp
	$(CC) $(CPPFLAGS) $< -Wall -L$(HADOOP_INSTALL)/c++/$(PLATFORM)/lib $(LDFLAGS) -o $@
3. Run make to build the executable.
4. Upload the compiled executable to HDFS:
hadoop dfs -copyFromLocal ./wordcount-simple /home
5. Create the configuration file word.xml:
<?xml version="1.0"?>
<configuration>
  <property>
    <!-- Set the executable (binary) path on DFS -->
    <name>hadoop.pipes.executable</name>
    <value>/home/wordcount</value>
  </property>
  <property>
    <name>hadoop.pipes.java.recordreader</name>
    <value>true</value>
  </property>
  <property>
    <name>hadoop.pipes.java.recordwriter</name>
    <value>true</value>
  </property>
</configuration>
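The two java.recordreader/java.recordwriter properties keep record reading and writing on the Java side. If you wanted to read input records in C++ instead (setting hadoop.pipes.java.recordreader to false), you would subclass HadoopPipes::RecordReader and register it through TemplateFactory's additional template parameters (see TemplateFactory.hh). The skeleton below is only a rough, untested sketch assuming the next()/getProgress() interface declared in Pipes.hh; it is not part of the original example:

#include "hadoop/Pipes.hh"
#include <string>

class MyRecordReader: public HadoopPipes::RecordReader {
public:
  MyRecordReader(HadoopPipes::MapContext& context) {
    // context.getInputSplit() describes the split this reader should scan.
  }

  // Fill in the next key/value pair; return false when the split is exhausted.
  virtual bool next(std::string& key, std::string& value) {
    return false; // placeholder: no custom input format implemented here
  }

  // Fraction of the split consumed so far, used for progress reporting.
  virtual float getProgress() {
    return 1.0f;
  }
};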
6. Create a local file hello.txt whose content is: hello world
7. Upload hello.txt to DFS:
hadoop dfs -copyFromLocal ./hello.txt /home
8. Run the program:
hadoop pipes -conf ./word.xml -input /home/hello -output /home/result
(DFS will automatically create the result directory to hold the output.)

============ the blogger's dividing line ============

Below are the command-line steps I actually used. First, create a /home directory on the remote side:

hadoop dfs -mkdir /home

Note: this directory will not show up with a plain hadoop dfs -ls, because it sits directly under the remote root directory and ls does not default to the root; you need hadoop dfs -ls /home to list the files copied there.

Then the two copyFromLocal commands:

hadoop dfs -copyFromLocal ./wordcount-simple /home
hadoop dfs -copyFromLocal ./hello.txt /home

The word.xml from the original article may not work as-is, so I changed it slightly:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>hadoop.pipes.executable</name>
    <value>/home/wordcount</value>
  </property>
  <property>
    <name>hadoop.pipes.java.recordreader</name>
    <value>true</value>
  </property>
  <property>
    <name>hadoop.pipes.java.recordwriter</name>
    <value>true</value>
  </property>
</configuration>

This file needs to be placed in the hadoop/conf directory (not verified; in my setup there is a copy both in the current directory and in conf).
Command to run the program:
hadoop pipes -conf ./word.xml -input /home/hello.txt -output /home/result
Then it should run.
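To sanity-check the counting logic without a cluster, here is a small standalone sketch (plain STL only, not the Pipes API; not part of the original post) that mimics the map and reduce steps on the contents of hello.txt. For this input, each of hello and world should end up with a count of 1, which is what the job's output files (typically part-00000 under /home/result) should contain:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  std::string line = "hello world";   // contents of hello.txt

  // "map" step: split the line on whitespace and count each word once
  std::map<std::string, int> counts;
  std::istringstream words(line);
  std::string word;
  while (words >> word) {
    counts[word] += 1;                // "reduce" step: sum the 1s per word
  }

  // Print word and total count, one pair per line
  for (std::map<std::string, int>::const_iterator it = counts.begin();
       it != counts.end(); ++it) {
    std::cout << it->first << "\t" << it->second << std::endl;
  }
  return 0;
}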
This article presented a WordCount example built with the Hadoop Pipes API, covering the source code, the Makefile, building the executable and uploading it to HDFS, and running the job through a configuration file.