In day-to-day development you often need to join several data sources stored on HDFS on a common field, producing the cross product of the matching records from each source. Once you understand the basic idea behind multi-source joins in MapReduce, this turns out to be quite simple.
The basic idea: in the map phase, read the path of the current input split to tell which data source a record comes from; emit the join column as the key and tag the value with a marker that identifies its source. In the reduce phase, the reduce method receives all values that share the same key, so it can match records from the different sources and assemble the joined output. That completes a two-source join; joining more sources works the same way. A minimal sketch of this tagging pattern follows.
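To make the pattern concrete, here is a minimal sketch (class and field names are hypothetical, not part of the demo; it assumes two pipe-separated text sources whose file paths contain "DS_A" or "DS_B" and whose join column is the second field):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class JoinSketch {

    // map side: tag every record with the data source it came from
    public static class TagMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // the path of the current split tells us which data source this record belongs to
            String path = ((FileSplit) context.getInputSplit()).getPath().toString();
            String tag = path.contains("DS_A") ? "A" : "B";
            String[] cols = value.toString().split("\\|", -1);
            // join column as the key, tag + whole record as the value
            context.write(new Text(cols[1]), new Text(tag + "\t" + value.toString()));
        }
    }

    // reduce side: all records with the same join key arrive together
    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> aSide = new ArrayList<String>();
            List<String> bSide = new ArrayList<String>();
            for (Text v : values) {
                String[] tagged = v.toString().split("\t", 2);
                if ("A".equals(tagged[0])) {
                    aSide.add(tagged[1]);
                } else {
                    bSide.add(tagged[1]);
                }
            }
            // emit the cross product of the two sides for this key
            for (String a : aSide) {
                for (String b : bSide) {
                    context.write(key, new Text(a + "|" + b));
                }
            }
        }
    }
}

The demo below follows the same pattern, but also reshapes the joined records and bulk-loads them into HBase.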
Below is a full demo I wrote:
package com.mclaren.hadoop.mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringUtils;
import org.apache.directory.api.util.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mclaren.hadoop.config.ConfSource;
/**
*
* @ClassName: Sync2HBaseJob
* @Description: MapReduce jobs that join two HDFS data sources and load the result into HBase
* @author Mclaren.Pan
* @date 2014年11月5日 上午9:49:04
*
*/
public class Sync2HBaseJob {
private static final Logger LOG = LoggerFactory.getLogger(Sync2HBaseJob.class);
private static CommandLine cl = null;
private static ConfClz confClz = new ConfClz();
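/**
*
* @ClassName: Sync2HBaseMapper
* @Description: mapper for job 1: tags each record with its data source and emits the join column as the key
*
*/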
public static class Sync2HBaseMapper extends Mapper<Object, Text, Text, Text> {
private String[] headers = {};
private String familyCol = "";
private String rowKeyStr = "";
private String dataSourceID = "";
private String separator = "";
//path of the input split, used to tell which data source a record comes from
private String flag;
private Text k;
private Text v;
@Override
protected void setup(Context context){
String headerStr_ = context.getConfiguration().get("headerStr");
headers = headerStr_.split("\\|", -1);
separator = context.getConfiguration().get("separator");
familyCol = context.getConfiguration().get("familyCol");
rowKeyStr = context.getConfiguration().get("rowKeyStr");
dataSourceID = context.getConfiguration().get("dataSourceID");
FileSplit split = (FileSplit) context.getInputSplit();
flag = split.getPath().toString();
}
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
if (StringUtils.isNotBlank(value.toString())){
// data source A
if (flag.contains("DS_A")) {
String[] vals = splitLine(value, separator);
k = new Text(vals[1]);
StringBuilder sb = new StringBuilder();
for (String item : vals) {
sb.append(item).append(",");
}
v = new Text("DS_A@_@"
+ sb.delete(sb.length() - 1, sb.length())
.toString());
} else {
String[] vals = splitLine(value, "$");
k = new Text(vals[1]);
StringBuilder sb = new StringBuilder();
for (String item : vals) {
sb.append(item).append(",");
}
v = new Text("DS_B@_@"
+ sb.delete(sb.length() - 1, sb.length())
.toString());
}
context.write(k, v);
}
}
/**
*
* @MethodName: splitLine
* @Description: split a line on the given separator
* @param lineValue
* @param split
* @return String[]
* @throws
*/
private String[] splitLine(Text lineValue, String split) {
String line = Strings.trim(lineValue.toString());
String[] vals;
if ("|".equals(split) || StringUtils.isEmpty(split)) {
vals = line.split("\\|", -1);
} else if ("$".equals(split)) {
vals = line.split("\\$", -1);
} else {
vals = line.split(split, -1);
}
return vals;
}
}
/**
*
* @ClassName: Sync2HBaseReducer
* @Description: joins the records of the two data sources that share the same key
* @author Mclaren.Pan
* @date 2014年12月2日 下午5:01:17
*
*/
public static class Sync2HBaseReducer extends
Reducer<Text, Text, Text, Text> {
private String[] headers = {};
private String familyCol = "";
private String rowKeyStr = "";
private String dataSourceID = "";
private String separator = "";
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
String headerStr_ = context.getConfiguration().get("headerStr");
headers = headerStr_.split("\\|", -1);
separator = context.getConfiguration().get("separator");
familyCol = context.getConfiguration().get("familyCol");
rowKeyStr = context.getConfiguration().get("rowKeyStr");
dataSourceID = context.getConfiguration().get("dataSourceID");
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
}
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] finalLine = null;
List<String> accNbrs = new ArrayList<String>();
//values holds every record that shares this key, i.e. records from both data sources
for (Text val : values) {
String[] data = val.toString().split("@_@");
if (data[0].contains("DS_A")) {
finalLine = splitLine(new Text(data[1]), ",");
} else {
String[] vals = splitLine(new Text(data[1]), ",");
accNbrs.add(vals[4]);
}
}
//no DS_A record for this key, nothing to join
if (finalLine == null) {
return;
}
StringBuilder sb = new StringBuilder();
for (String col : finalLine) {
sb.append(col).append(",");
}
String temp = sb.delete(sb.length() - 1, sb.length()).toString();
//the join can be one-to-many, so emit one output record per matching DS_B record,
//regardless of the order in which the grouped values arrive
for (String accNbr : accNbrs) {
context.write(new Text(accNbr + "_" + finalLine[0]), new Text(temp));
}
}
private String[] splitLine(Text lineValue, String split) {
String line = Strings.trim(lineValue.toString());
String[] vals;
if ("|".equals(split) || StringUtils.isEmpty(split)) {
vals = line.split("\\|", -1);
} else {
vals = line.split(split, -1);
}
return vals;
}
}
/**
*
* @ClassName: ExportHBaseMapper
* @Description: reads the joined output and turns each line into an HBase Put for HFile generation
* @author Mclaren.Pan
* @date 2014年12月4日 上午12:35:09
*
*/
public static class ExportHBaseMapper extends Mapper<Text, Text, ImmutableBytesWritable, Put> {
private String[] headers = {};
private String separator = "";
private String familyCol = "";
private String rowKeyStr = "";
private String dataSourceID = "";
@Override
protected void setup(Context context){
String headerStr_ = context.getConfiguration().get("headerStr");
headers = headerStr_.split("\\|", -1);
separator = context.getConfiguration().get("separator");
familyCol = context.getConfiguration().get("familyCol");
rowKeyStr = context.getConfiguration().get("rowKeyStr");
dataSourceID = context.getConfiguration().get("dataSourceID");
}
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
//split the line into columns
String[] vals = splitLine(value);
//the key output by the join job is already the row key
byte[] rowKeyVal = key.copyBytes();
//build the Put and emit it for the HFile writer
try {
Put put = buildPutInstance(rowKeyVal, vals);
ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
immutableBytesWritable.set(rowKeyVal);
context.write(immutableBytesWritable, put);
} catch (Exception e) {
LOG.error("", e);
}
}
/**
*
* @MethodName: buildPutInstance
* @Description: build the Put instance for one line
* @param rowKeyVal
* @param vals
* @return Put
* @throws
*/
private Put buildPutInstance(byte[] rowKeyVal, String[] vals) {
String item = "";
String itemDataType = "";
byte[] familyCol_bytes = Bytes.toBytes(familyCol);
Put put = new Put(rowKeyVal);
for (int i = 0; i < vals.length; i++) {
item = Strings.trim(vals[i]);
itemDataType = ConfSource.getDataTypeFromConf(dataSourceID,
headers[i]);
if ("" != itemDataType) {
if (itemDataType.contains("NUMBER")) {
if (itemDataType.contains(",")) {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0.0));
}
else {
double itemVal = Double.parseDouble(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
} else {
Pattern pattern = Pattern.compile("(\\d+)");
Matcher matcher = pattern.matcher(itemDataType);
int precision = 0;
if (matcher.find()) {
precision = Integer.valueOf(matcher.group(1));
}
//a NUMBER with 10 or more digits of precision can exceed the int range
//(-2147483648 to 2147483647), so store such values as long instead
if (precision >= 10) {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0L));
}
else {
long itemVal = Long.parseLong(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
} else {
if ("".equals(item.trim())) {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(0));
}
else {
int itemVal = Integer.parseInt(item.trim());
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(itemVal));
}
}
}
} else {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]),
Bytes.toBytes(item));
}
} else {
put.add(familyCol_bytes, Bytes.toBytes(headers[i]), Bytes.toBytes(item));
}
}
return put;
}
/**
*
* @MethodName: splitLine
* @Description: split a line on the configured separator
* @param lineValue
* @return String[]
* @throws
*/
private String[] splitLine(Text lineValue) {
String line = lineValue.toString().trim();
String[] vals;
if ("|".equals(separator) || StringUtils.isEmpty(separator)) {
vals = line.split("\\|", -1);
}
else {
vals = line.split(separator, -1);
}
return vals;
}
/**
*
* @MethodName: getRowKeyVal
* @Description: build a composite row key from the configured row key columns
* @param headers
* @param colsPerLine
* @throws InterruptedException
* @return byte[]
* @throws
*/
private byte[] getRowKeyVal(String[] headers, String[] colsPerLine) throws InterruptedException {
String[] rowKeys = rowKeyStr.split(",");
StringBuilder rkBuf = new StringBuilder();
for (String rowKey : rowKeys) {
int rkIdx = Arrays.asList(headers).indexOf(rowKey);
rkBuf.append(colsPerLine[rkIdx]).append("_");
}
int rkBufLen = rkBuf.length();
byte[] rowKeyVal = Bytes.toBytes(rkBuf.delete(rkBufLen - 1, rkBufLen).toString());
return rowKeyVal;
}
}
/**
*
* @MethodName: sync2HBase
* @Description: runs the join job first, then a second job that writes the joined records as HFiles for HBase
* @throws IOException
*/
private void sync2HBase() throws IOException{
Configuration conf1 = ConfSource.getHBaseConf();
Job job1 = new Job(conf1, "join two data sources");
job1.setJarByClass(Sync2HBaseJob.class);
job1.setMapperClass(Sync2HBaseMapper.class);
job1.setReducerClass(Sync2HBaseReducer.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
FileInputFormat.addInputPaths(job1, confClz.getDownloadPath());
FileOutputFormat.setOutputPath(job1, new Path("/mr_temp"));
// pass the parameters on to the map/reduce tasks
job1.getConfiguration().set("headerStr", confClz.getHeaderStr());
job1.getConfiguration().set("separator", confClz.getSeparator());
job1.getConfiguration().set("familyCol", confClz.getFamilyCol());
job1.getConfiguration().set("rowKeyStr", confClz.getRowKeyStr());
job1.getConfiguration().set("dataSourceID", confClz.getDataSourceID());
try {
if (!job1.waitForCompletion(true)) {
LOG.error("join job failed, skipping the HBase import job");
return;
}
} catch (Exception e) {
LOG.error("join job failed", e);
return;
}
try {
Configuration conf2 = ConfSource.getHBaseConf();
Job job2 = new Job(conf2, "Import into hbase table "
+ confClz.getHbaseTable() + " from "
+ confClz.getDownloadPath());
job2.setJarByClass(Sync2HBaseJob.class);
job2.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.setInputPaths(job2, new Path("/mr_temp"));
job2.setMapperClass(ExportHBaseMapper.class);
HTable table = new HTable(conf2, confClz.getHbaseTable());
job2.setReducerClass(PutSortReducer.class);
Path outputDir = new Path(confClz.getHfilePath());
FileOutputFormat.setOutputPath(job2, outputDir);
job2.setMapOutputKeyClass(ImmutableBytesWritable.class);
job2.setMapOutputValueClass(Put.class);
HFileOutputFormat2.configureIncrementalLoad(job2, table);
TableMapReduceUtil.addDependencyJars(job2);
// pass the parameters on to the map tasks
job2.getConfiguration().set("headerStr", confClz.getHeaderStr());
job2.getConfiguration().set("separator", confClz.getSeparator());
job2.getConfiguration().set("familyCol", confClz.getFamilyCol());
job2.getConfiguration().set("rowKeyStr", confClz.getRowKeyStr());
job2.getConfiguration().set("dataSourceID", confClz.getDataSourceID());
if (!job2.waitForCompletion(true)) {
LOG.error("HBase import job failed");
}
} catch (Exception e) {
LOG.error("HBase import job failed", e);
}
}
/**
*
* @MethodName: getConf
* @Description: load and validate the job configuration
* @return void
* @throws
*/
private void getConf() {
//read the data source ID
String dataSourceID = cl.getOptionValue("dsid");
if (StringUtils.isEmpty(dataSourceID)) {
LOG.error("没有指定数据源ID");
System.exit(1);
}
//read the column headers
String headerStr = ConfSource.getProperty(dataSourceID + ".header");
String[] headers = headerStr.split("\\|", -1);
//read the field separator
String separator = ConfSource.getProperty(dataSourceID + ".separator");
//read the target hbase table name
String hbaseTable = cl.getOptionValue("tb");
if (StringUtils.isEmpty(hbaseTable)) {
LOG.error("没有指定导入的HBASE表名");
System.exit(1);
}
//read the column family
String familyCol = ConfSource.getProperty(dataSourceID + ".familyCol");
//row key; composite row keys are supported
String rowKeyStr = ConfSource.getProperty(dataSourceID + ".rowKey");
String[] rowKeys = rowKeyStr.split(",");
for (String rk : rowKeys) {
int rkIdx = Arrays.asList(headers).indexOf(rk);
if (-1 == rkIdx) {
LOG.error("指定RowKey在列数据中未找到!");
System.exit(1);
}
}
//read the hdfs input path
String downloadPath = "";
String inputPath = cl.getOptionValue("path");
if (StringUtils.isEmpty(inputPath)) {
String dataFilePath = ConfSource.getProperty(dataSourceID + ".hdfsPath");
if (StringUtils.isEmpty(dataFilePath)) {
LOG.error("没有指定数据文件地址,并且默认数据文件地址未来找到!");
System.exit(1);
} else {
downloadPath = dataFilePath;
}
} else {
downloadPath = inputPath;
}
// hdfs path where the hfiles will be written
String hfilePath = cl.getOptionValue("hfilePath");
if (StringUtils.isEmpty(hfilePath)) {
LOG.error("必须设置hfile存放的hdfs路径!");
System.exit(1);
}
confClz.setDataSourceID(dataSourceID);
confClz.setFamilyCol(familyCol);
confClz.setRowKeyStr(rowKeyStr);
confClz.setHbaseTable(hbaseTable);
confClz.setHeaderStr(headerStr);
confClz.setSeparator(separator);
confClz.setDownloadPath(downloadPath);
confClz.setHfilePath(hfilePath);
}
/**
*
* @MethodName: getCommandParam
* @Description: parse the command line arguments
* @param args
* @return void
* @throws
*/
private void getCommandParam(String[] args) {
Options opt = new Options();
opt.addOption("dsid", true, "data source identity");
opt.addOption("path", true, "hdfs absolute path");
opt.addOption("tb", true, "which hbase table to export");
opt.addOption("hfilePath", "hfile path", true, "hfile output hdfs path");
String formatStr = "hadoop jar [this jar path] [-dsid] [-path] [-tb] [-hfilePath]";
HelpFormatter formatter = new HelpFormatter();
CommandLineParser parser = new PosixParser();
try {
cl = parser.parse(opt, args);
} catch (Exception e) {
LOG.error("failed to parse command line arguments", e);
formatter.printHelp(formatStr, opt);
System.exit(1);
}
}
public static void main(String[] args) {
try {
Sync2HBaseJob job = new Sync2HBaseJob();
job.getCommandParam(args);
job.getConf();
job.sync2HBase();
}
catch (Exception e) {
e.printStackTrace();
}
}
}
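For reference, a hypothetical launch command (the jar name and all argument values below are placeholders; -path is optional and falls back to the <dsid>.hdfsPath property):

hadoop jar sync2hbase.jar com.mclaren.hadoop.mr.Sync2HBaseJob \
    -dsid DS_A \
    -tb target_table \
    -path /data/input \
    -hfilePath /data/hfile_out

Note that the second job only writes HFiles under -hfilePath; they still have to be loaded into the target table afterwards, for example with HBase's completebulkload tool.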