Original reference:
http://shitouer.cn/2013/02/hbase-hfile-bulk-load/
Some dependency jars may be needed; they can be downloaded here: http://download.youkuaiyun.com/detail/q79969786/6933683
The main modifications are as follows:
package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HFileGenerator {

    public static class HFileMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is comma-separated: rowkey, column family,
            // qualifier, value, ...; any fields beyond the fourth are ignored.
            String line = value.toString();
            String[] items = line.split(",", -1);
            ImmutableBytesWritable rowkey = new ImmutableBytesWritable(
                    items[0].getBytes());
            KeyValue kv = new KeyValue(Bytes.toBytes(items[0]),
                    Bytes.toBytes(items[1]), Bytes.toBytes(items[2]),
                    System.currentTimeMillis(), Bytes.toBytes(items[3]));
            context.write(rowkey, kv);
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] dfsArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();

        Job job = new Job(conf, "HFile bulk load test");
        job.setJarByClass(HFileGenerator.class);

        job.setMapperClass(HFileMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);

        FileInputFormat.addInputPath(job, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(dfsArgs[1]));

        // configureIncrementalLoad wires up the HFile output format, sort
        // reducer and total-order partitioner to match the target table's
        // region boundaries.
        HFileOutputFormat.configureIncrementalLoad(job,
                ConnectionUtil.getTable());

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Added a ConnectionUtil class:
package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;

public class ConnectionUtil {

    public static Configuration getConfiguration() {
        Configuration config = HBaseConfiguration.create();
        return config;
    }

    public static HTable getTable() {
        try {
            return new HTable(getConfiguration(), "test_hfile");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}
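As an aside: on newer HBase clients (1.0 and later) the HTable constructor used above is deprecated in favor of ConnectionFactory. A rough sketch of an equivalent utility under that assumption (the class name NewApiConnectionUtil is mine, and this is not the version this post was tested against) would look like this:

package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;

// Sketch for the HBase 1.0+ client API only; the rest of this post uses the
// older HTable-based API. For brevity the Connection is never closed here.
public class NewApiConnectionUtil {

    public static Configuration getConfiguration() {
        return HBaseConfiguration.create();
    }

    public static Table getTable() throws IOException {
        Connection conn = ConnectionFactory.createConnection(getConfiguration());
        return conn.getTable(TableName.valueOf("test_hfile"));
    }
}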
The loader class is essentially unchanged from the original article; only its package was adjusted to match the classes above (and the run command below):
package com.upa.hbase;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.util.GenericOptionsParser;

public class HFileLoader {

    public static void main(String[] args) throws Exception {
        String[] dfsArgs = new GenericOptionsParser(
                ConnectionUtil.getConfiguration(), args).getRemainingArgs();
        // Moves the generated HFiles under dfsArgs[0] into the regions of test_hfile.
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(
                ConnectionUtil.getConfiguration());
        loader.doBulkLoad(new Path(dfsArgs[0]), ConnectionUtil.getTable());
    }
}
Next, create the HBase table, with a single column family 't' (the same family that appears as the second field of the test data):
hbase(main):025:0> create 'test_hfile', 't'
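If you would rather create the table from code, a minimal sketch using the same-era client API does the same thing as the shell command above (the class name CreateTestTable is mine; error handling and pre-splitting are omitted):

package com.upa.hbase;

import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

// Sketch: create 'test_hfile' with one column family 't',
// equivalent to: create 'test_hfile', 't'
public class CreateTestTable {

    public static void main(String[] args) throws Exception {
        HBaseAdmin admin = new HBaseAdmin(ConnectionUtil.getConfiguration());
        HTableDescriptor desc = new HTableDescriptor("test_hfile");
        desc.addFamily(new HColumnDescriptor("t"));
        admin.createTable(desc);
        admin.close();
    }
}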
Upload the test data to HDFS:
$ hadoop fs -put test_hfile.txt test_hbase/test_hfile
The format looks roughly like this:
1,t,45,wer,fg,
we5r,t,sdfsd,fsd,f
wesr,t,wrwer,sdfsdf,sdf,sd,tfg,sd,
werr,t,wer,wesdfdsf,sdfd,ret
rwer,t,werd546,fgh,g,df,ga,ds
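To make the mapping concrete, here is a small standalone sketch (not part of the job; the class name ParseSample is mine) showing how HFileMapper parses the first sample line; only the first four fields are used, the rest are ignored:

// Sketch: parse one sample line the same way HFileMapper does.
// "1,t,45,wer,fg," -> rowkey "1", family "t", qualifier "45", value "wer".
public class ParseSample {

    public static void main(String[] args) {
        String line = "1,t,45,wer,fg,";
        String[] items = line.split(",", -1);
        System.out.println("rowkey    = " + items[0]);   // 1
        System.out.println("family    = " + items[1]);   // t
        System.out.println("qualifier = " + items[2]);   // 45
        System.out.println("value     = " + items[3]);   // wer
        // HFileMapper then builds:
        // new KeyValue(Bytes.toBytes(items[0]), Bytes.toBytes(items[1]),
        //              Bytes.toBytes(items[2]), System.currentTimeMillis(),
        //              Bytes.toBytes(items[3]));
    }
}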
Next, run the two jobs: first generate the HFiles, then bulk load them into the table:
$ hadoop jar etl.jar com.upa.hbase.HFileGenerator /user/hadoop/test_hbase/test_hfile /user/hadoop/test_hbase/hfile
$ hadoop jar etl.jar com.upa.hbase.HFileLoader /user/hadoop/test_hbase/hfile
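To check that the data actually landed in the table, you can simply run scan 'test_hfile' in the HBase shell. A minimal read-back sketch in code (assuming the sample data above, so row "1", family "t", qualifier "45"; the class name BulkLoadCheck is mine) could look like this:

package com.upa.hbase;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

// Sketch: read back one row that the sample data should have created.
// Expects "wer" for row "1", family "t", qualifier "45".
public class BulkLoadCheck {

    public static void main(String[] args) throws Exception {
        HTable table = ConnectionUtil.getTable();
        Result result = table.get(new Get(Bytes.toBytes("1")));
        byte[] v = result.getValue(Bytes.toBytes("t"), Bytes.toBytes("45"));
        System.out.println(v == null ? "row not found" : Bytes.toString(v));
        table.close();
    }
}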