#!/bin/bash
# Routine Hadoop job: reads binary route records (SequenceFile) and runs a
# bistreaming map/reduce over them, writing plain text output.
set -u

# INPUT_DIR holds a glob that must reach Hadoop literally (expanded on HDFS,
# not by the local shell) — keep it quoted at the call site below.
INPUT_DIR="/app/lbs/zz/route/*"
OUTPUT_DIR="/app/lbs/zz/result"
HADOOP_HOME="/home/user/zz/hadoop-navi/hadoop/bin"

# Remove any previous output; tolerate "directory does not exist".
"${HADOOP_HOME}/hadoop" fs -rmr "${OUTPUT_DIR}" || true

# NOTE(review): -cacheArchive usually takes "path#symlink" so the archive is
# addressable from mapper.sh — confirm whether a symlink suffix is needed.
"${HADOOP_HOME}/hadoop" bistreaming \
    -input "${INPUT_DIR}" \
    -output "${OUTPUT_DIR}" \
    -jobconf stream.tmpdir="./hadooptmp" \
    -jobconf mapred.job.name="routine_task_job" \
    -jobconf mapred.job.map.capacity=1000 \
    -jobconf mapred.reduce.tasks=10 \
    -jobconf stream.num.map.output.key.fields=3 \
    -jobconf mapred.max.map.failures.percent=5 \
    -jobconf num.key.fields.for.partition=1 \
    -jobconf mapred.job.priority=VERY_HIGH \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -inputformat org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat \
    -outputformat org.apache.hadoop.mapred.TextOutputFormat \
    -mapper "sh mapper.sh" \
    -reducer "cat" \
    -file ./mapper.sh \
    -file ./reducer.sh \
    -file ./proto/interface_pb2.py \
    -cacheArchive hdfs://xx/app/lbs/sklearn_env.tar.gz

# Options kept for reference. They were previously written as '#' lines in the
# MIDDLE of the backslash-continued command, which is a bug: the trailing '\'
# on the preceding option joins the comment onto the same logical line, so the
# '#' comments out the rest of the command and every later option is executed
# as a separate (failing) command instead of being passed to hadoop.
#   -D stream.map.output.field.separator='\t'
#   -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator
#   -inputformat org.apache.hadoop.mapred.CombineTextInputFormat
# --- Or, equivalently, a plain-text streaming job: ---
# Alternative: plain-text streaming job that joins several inputs, with
# gzip-compressed output, skip-mode enabled, and key-field sort/partition.
#
# NOTE(review): date_info, TRAJ_INPUT_DIR, PBLOG_INPUT_DIR, RESIDENT_INPUT_DIR
# and OUTPUT_DIR are not defined in this file — they must be exported by the
# caller or set earlier in the real script; verify before running.
# NOTE(review): -cacheArchive normally takes an HDFS "path#symlink"; this one
# is a bare local-looking path — confirm against the cluster setup.
"${HADOOP_HOME}/hadoop" streaming \
    -D stream.tmpdir="./hadooptmp" \
    -D mapred.map.tasks=10000 \
    -D mapred.job.map.capacity=10000 \
    -D mapred.reduce.tasks=500 \
    -D mapred.job.reduce.capacity=500 \
    -D mapred.job.name="relation_data_${date_info}_zz_[110]" \
    -D mapred.output.compress=true \
    -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
    -D stream.map.output.field.separator='\t' \
    -D stream.num.map.output.key.fields=2 \
    -D mapred.max.map.failures.percent=3 \
    -D mapred.map.max.attempts=2 \
    -D mapred.skip.map.max.skip.records=20 \
    -D mapred.skip.mode.enabled=true \
    -D num.key.fields.for.partition=1 \
    -D mapred.text.key.comparator.options="-k1 -k2n" \
    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -file ./t1.py \
    -file ./t2.py \
    -file ./mapper.sh \
    -file ./htasks_jar_libs.jar \
    -file ./util.so \
    -file ./util_gps.py \
    -file ./util_gps.pyc \
    -input "${TRAJ_INPUT_DIR}" \
    -input "${PBLOG_INPUT_DIR}" \
    -input "${RESIDENT_INPUT_DIR}" \
    -output "${OUTPUT_DIR}" \
    -mapper "sh mapper.sh" \
    -reducer "python t1.py" \
    -jobconf mapred.job.priority=VERY_HIGH \
    -cacheArchive "/app/lbs/zz/zz-python-env.tar.gz" \
    -inputformat org.apache.hadoop.mapred.CombineTextInputFormat \
    -outputformat baidu.lbs.MultipleTextOutputFormatByKey
# BUGFIX: the -cacheArchive argument previously opened a double quote that was
# never closed ("/app/...tar.gz \), so the shell treated everything after it
# as part of one string and the command could not parse.