Prepare ENV
download mongo-java-driver from http://central.maven.org/maven2/org/mongodb/mongo-java-driver
compile mongo-hadoop-connector for hadoop2.3.0
alter build.gradle to change hadoop-version to 2.3 and delete related download dependencies task
#./gradlew jar
distribute the above jars to the hadoop cluster's nodes
#cp core/build/libs/mongo-hadoop-core-1.2.1-SNAPSHOT-hadoop_2.3.jar hadoop-2.3.0/share/hadoop/common/lib
#cp mongo-java-driver-2.12.2.jar hadoop-2.3.0/share/hadoop/common/lib
or add the above two jars into project lib dir and add them to build path
Note: the destination dir is not hadoop-2.3.0/lib
Run MongoDB Hadoop examples
#cd /path/to/mongodb-hadoop
#./gradlew historicalYield
The above command will download hadoop and install it. But I want to run the example on my existing hadoop cluster.
To see what happens with this command, first find the files which execute this command
#find . |xargs grep 'historicalYield' -sl
// Gradle task: imports the sample treasury-yield data into MongoDB and runs the
// example MapReduce job against it. Depends on configureCluster so the mongo
// jars and cluster config are in place first.
task historicalYield(dependsOn: 'configureCluster') << {
// Load the sample JSON into mongo_hadoop.yield_historical.in, dropping any old data.
exec() {
commandLine "mongoimport", "-d", "mongo_hadoop", "-c", "yield_historical.in", "--drop",
"examples/treasury_yield/src/main/resources/yield_historical_in.json"
}
// Submit the example jar via the hadoop() helper; input/output URIs are passed
// as -D properties (localhost mongod assumed).
hadoop("examples/treasury_yield/build/libs/treasury_yield-${project(':core').version}-hadoop_${hadoop_version}.jar",
"com.mongodb.hadoop.examples.treasury.TreasuryYieldXMLConfig", [
"mongo.input.uri=mongodb://localhost:27017/mongo_hadoop.yield_historical.in",
"mongo.output.uri=mongodb://localhost:27017/mongo_hadoop.yield_historical.out"
])
}
// Aggregator task: no work of its own, exists only to pull in copyFiles
// (which in turn installs hadoop/hive and distributes the mongo jars).
task configureCluster(dependsOn: ['copyFiles']) << {
}
// Gradle task: copies the freshly built mongo-hadoop jars and the MongoDB Java
// driver into the local hadoop/hive installations, then overwrites the cluster
// configuration. Runs after installHadoop/installHive have laid down the trees.
task copyFiles(dependsOn: ['installHadoop', 'installHive']) << {
def hadoopEtc
def hadoopLib
// Hadoop 1.x and 2.x use different directory layouts for libs and config.
if (hadoop_version.startsWith("1")) {
hadoopLib = "${hadoopHome}/lib"
hadoopEtc = "${hadoopHome}/conf"
} else {
hadoopLib = "${hadoopHome}/share/hadoop/common"
hadoopEtc = "${hadoopHome}/etc/hadoop"
}
println "Updating mongo jars"
// Install the core connector jar under a stable, version-less name.
copy {
from "core/build/libs/mongo-hadoop-core-${project(':core').version}-hadoop_${hadoop_version}.jar"
into hadoopLib
rename { "mongo-hadoop-core.jar" }
}
// Same for the hive connector, into hive's lib dir.
copy {
from "hive/build/libs/mongo-hadoop-hive-${project(':core').version}-hadoop_${hadoop_version}.jar"
into hiveHome + '/lib'
rename { "mongo-hadoop-hive.jar" }
}
// Fetch the matching MongoDB Java driver from Maven Central (skipped when
// the local copy is already up to date).
download {
src "http://central.maven.org/maven2/org/mongodb/mongo-java-driver/${javaDriverVersion}/mongo-java-driver-${javaDriverVersion}.jar"
dest "${hadoopLib}/mongo-java-driver.jar"
onlyIfNewer true
}
println "Updating cluster configuration"
// Push the project's clusterConfigs/* files over hadoop's config directory.
copy {
from 'clusterConfigs'
into hadoopEtc
}
}
// Helper: builds and runs a "hadoop jar <jar> <class>" invocation, turning
// each entry of args into a -D system property for the job.
def hadoop(jar, className, args) {
    def command = ["${hadoopHome}/bin/hadoop",
                   "jar", jar, className,
                   // Split settings
                   "-Dmongo.input.split_size=8",
                   "-Dmongo.job.verbose=true"]
    command.addAll(args.collect { "-D${it}" })
    println "Executing hadoop job:\n ${command.join(' \\\n\t')}"
    // CDH distributions keep the MR2 libraries in a non-default location,
    // so point MAPRED_DIR at them for those versions only.
    def hadoopEnv = [:]
    if (hadoop_version.startsWith("cdh")) {
        hadoopEnv.MAPRED_DIR = 'share/hadoop/mapreduce2'
    }
    exec() {
        environment << hadoopEnv
        commandLine command
    }
}
----------------------------
So, having understood the details behind the scenes, we can run the example by hand
1. load sample data into mongoDB
#mongoimport -d mongo_hadoop -c yield_historical.in --drop <examples/treasury_yield/src/main/resources/yield_historical_in.json
2. Run the example
#cd mongo-hadoop/examples/treasury_yield/build/libs
#hadoop jar treasury_yield-1.2.1-SNAPSHOT-hadoop_2.3.jar com.mongodb.hadoop.examples.treasury.TreasuryYieldXMLConfigV2 -Dmongo.input.uri=mongodb://localhost:27017/mongo_hadoop.yield_historical.in -Dmongo.output.uri=mongodb://localhost:27017/mongo_hadoop.yield_historical.out -Dmongo.input.split_size=8 -Dmongo.job.verbose=true
References
http://docs.mongodb.org/ecosystem/tutorial/getting-started-with-java-driver/
https://github.com/mongodb/mongo-hadoop/blob/master/CONFIG.md
https://github.com/mongodb/mongo-hadoop/blob/master/examples/README.md
http://docs.mongodb.org/ecosystem/tutorial/getting-started-with-hadoop/
http://mongodb-documentation.readthedocs.org/en/latest/ecosystem/tutorial/getting-started-with-hadoop.html
http://www.mongodb.com/press/integration-hadoop-and-mongodb-big-data%E2%80%99s-two-most-popular-technologies-gets-significant
http://blog.mortardata.com/post/43080668046/mongodb-hadoop-why-how
http://help.mortardata.com/data_apps/mongo_hadoop
http://www.severalnines.com/blog/big-data-integration-etl-clickstream-mongodb-hadoop-analytics
本文详细介绍了如何在现有的Hadoop集群上部署并运行MongoDB Hadoop示例,包括环境准备、下载所需依赖、配置集群、安装Hadoop和Hive等步骤,并提供了具体命令和操作细节。
119

被折叠的 条评论
为什么被折叠?



