2017年03月01日 09:20:50 技术人的突破 阅读数:4117
1.使用Map+Reduce方式
-
public class MapReduceImport { -
/** -
* Mapper -
*/ -
static class HMapper extends Mapper<LongWritable, Text, LongWritable, Text> { -
Text v2 = new Text(); -
protected void map(LongWritable key, Text value, Context context) throws java.io.IOException, InterruptedException { -
String[] splited = value.toString().split(" "); -
if (splited.length != 6)//清洗不符合标准的数据 -
return; -
try { -
//GetRowKey.getRowKeyString方法是自己定义生成rowkey的方法 -
//rowkey设计为IP_TimeStamp这种方式 -
v2.set(GetRowKey.getRowKeyString(splited[2], splited[4]) + " " + value.toString()); -
context.write(key, v2); -
} catch (NumberFormatException e) { -
System.out.println("出错了" + e.getMessage()); -
} -
} -
} -
/** -
* Reducer -
*/ -
static class HReducer extends TableReducer<LongWritable, Text, NullWritable> { -
protected void reduce(LongWritable key, java.lang.Iterable<Text> values, Context context) throws java.io.IOException, InterruptedException { -
for (Text text : values) { -
String[] splited = text.toString().split(" "); -
Put put = new Put(Bytes.toBytes(splited[0])); -
for (int j = 1; j < splited.length; j++) { -
put.addColumn(Bytes.toBytes(HConfiguration.colFamily), Bytes.toBytes("log" + j), Bytes.toBytes(splited[j])); -
} -
context.write(NullWritable.get(), put); -
} -
} -
} -
/** -
* Main -
* -
* @param args -
* @throws Exception -
*/ -
public static void main(String[] args) throws Exception { -
Configuration configuration = new Configuration(); -
//设置zookeeper -
configuration.set("hbase.zookeeper.quorum", HConfiguration.hbase_zookeeper_quorum); -
configuration.set("hbase.zookeeper.property.clientPort", "2181"); -
//设置hbase表名称 -
configuration.set(TableOutputFormat.OUTPUT_TABLE, HConfiguration.tableName); -
//将该值改大,防止hbase超时退出 -
configuration.set("dfs.socket.timeout", "180000"); -
MRDriver myDriver = MRDriver.getInstance(); -
try { -
myDriver.createTableIfExistDelete(HConfiguration.tableName, HConfiguration.colFamily); -
} catch (Exception e) { -
e.printStackTrace(); -
} -
Job job = new Job(configuration, "Map+ReduceImport"); -
job.setMapperClass(HMapper.class); -
job.setReducerClass(HReducer.class); -
job.setMapOutputKeyClass(LongWritable.class); -
job.setMapOutputValueClass(Text.class); -
job.setInputFormatClass(TextInputFormat.class); -
//不再设置输出路径,而是设置输出格式类型TableOutputFormat -
job.setOutputFormatClass(TableOutputFormat.class); -
FileInputFormat.setInputPaths(job, HConfiguration.mapreduce_inputPath); -
job.waitForCompletion(true); -
} -
}
———————————— 分割线 ————————————
2.只使用Map的方式
-
public class OnlyMapImport { -
/** -
* Mapper -
*/ -
static class ImportMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> { -
@Override -
public void map(LongWritable offset, Text value, Context context) { -
String[] splited = value.toString().split(" "); -
if (splited.length != 6) -
return; -
try { -
//GetRowKey.getRowKeyString方法是自己定义生成rowkey的方法 -
//rowkey设计为IP_TimeStamp这种方式 -
byte[] rowkey = Bytes.toBytes(GetRowKey.getRowKeyString(splited[2], splited[4])); -
Put put = new Put(rowkey); -
for (int j = 0; j < splited.length; j++) { -
put.addColumn(Bytes.toBytes(HConfiguration.colFamily), Bytes.toBytes("log" + j), Bytes.toBytes(splited[j])); -
} -
context.write(new ImmutableBytesWritable(rowkey), put); -
} catch (NumberFormatException e) { -
System.out.println("出错了" + e.getMessage()); -
} catch (IOException e) { -
e.printStackTrace(); -
} catch (InterruptedException e) { -
e.printStackTrace(); -
} -
} -
} -
/** -
* Main -
* -
* @param args -
* @throws Exception -
*/ -
public static void main(String[] args) throws Exception { -
Configuration configuration = new Configuration(); -
//设置zookeeper -
configuration.set("hbase.zookeeper.quorum", HConfiguration.hbase_zookeeper_quorum); -
configuration.set("hbase.zookeeper.property.clientPort", "2181"); -
//设置hbase表名称 -
configuration.set(TableOutputFormat.OUTPUT_TABLE, HConfiguration.tableName); -
//将该值改大,防止hbase超时退出 -
configuration.set("dfs.socket.timeout", "180000"); -
MRDriver myDriver = MRDriver.getInstance(); -
try { -
myDriver.createTableIfExistDelete(HConfiguration.tableName, HConfiguration.colFamily); -
} catch (Exception e) { -
e.printStackTrace(); -
} -
Job job = new Job(configuration, "HBaseBatchImport"); -
job.setJarByClass(OnlyMapImport.class); -
job.setMapperClass(ImportMapper.class); -
//设置map的输出,不设置reduce的输出类型 -
job.setMapOutputKeyClass(ImmutableBytesWritable.class); -
job.setMapOutputValueClass(Writeable.class); -
job.setNumReduceTasks(0); -
job.setInputFormatClass(TextInputFormat.class); -
//不再设置输出路径,而是设置输出格式类型 -
job.setOutputFormatClass(TableOutputFormat.class); -
FileInputFormat.setInputPaths(job, HConfiguration.mapreduce_inputPath); -
job.waitForCompletion(true); -
} -
}
经过测试,导入时间明显减少。
本文对比了使用MapReduce和仅使用Map两种方式向HBase批量导入数据的效率,通过优化rowkey设计和调整HBase参数,实现了数据导入时间的显著减少。
652

被折叠的 条评论
为什么被折叠?



