1. Bulk loading with pre-generated HFiles
A detailed write-up of this approach is available at http://blog.youkuaiyun.com/dajuezhao/archive/2011/04/26/6365053.aspx
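The linked post has the full details; as a rough sketch (not taken from that post, and with placeholder table, column family, and path names), the usual shape of the driver is to let HFileOutputFormat.configureIncrementalLoad set up the job and then hand the finished HFiles to LoadIncrementalHFiles:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HFileLoadDriver {

    // Minimal mapper: assumes each input line is "rowkey \t value" (an assumed layout, not the article's format)
    public static class HFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] cols = value.toString().split("\t");
            byte[] row = Bytes.toBytes(cols[0]);
            Put put = new Put(row);
            put.add(Bytes.toBytes("ucvalue"), Bytes.toBytes("value"), Bytes.toBytes(cols[1]));
            context.write(new ImmutableBytesWritable(row), put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "usercontact_kang"); // placeholder table name

        Job job = new Job(conf, "generate-hfiles");
        job.setJarByClass(HFileLoadDriver.class);
        job.setMapperClass(HFileMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Sets HFileOutputFormat as the output format and installs a total-order
        // partitioner plus a sorting reducer keyed on the table's region boundaries,
        // so the HFiles come out sorted by row key per region.
        HFileOutputFormat.configureIncrementalLoad(job, table);

        if (job.waitForCompletion(true)) {
            // Hands the generated HFiles to HBase; this step is mostly an HDFS move,
            // which is why it is so fast.
            new LoadIncrementalHFiles(conf).doBulkLoad(new Path(args[1]), table);
        }
    }
}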
2. Loading with MapReduce
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HBaseImport extends Configured implements Tool {
    // column family and destination table are assumptions; the original snippet did not preserve them
    static final String FAMILY = "ucvalue";
    public static final String TABLE_NAME = "usercontact_kang";

    public static class Map extends Mapper<LongWritable, Text, NullWritable, NullWritable> {
        Configuration configuration = null;
        HTable xTable = null;
        private boolean wal = true;
        static long count = 0;

        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            // push any puts still sitting in the client write buffer, then release the table
            super.cleanup(context);
            xTable.flushCommits();
            xTable.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String all[] = value.toString().split("\t");
            if (all.length != 2) {
                return; // skip malformed lines
            }
            // assumed record layout: column 0 is the row key, column 1 the value
            Put put = new Put(Bytes.toBytes(all[0]));
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes("value"), Bytes.toBytes(all[1]));
            if (!wal) {
                put.setWriteToWAL(false);
            }
            xTable.put(put);
            if (++count % 100 == 0) {
                context.setStatus(count + " rows written");
                context.progress();
                System.out.println(count + " rows written");
            }
        }

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            // open the table once per map task and tune the client-side write buffer
            super.setup(context);
            configuration = context.getConfiguration();
            xTable = new HTable(configuration, TABLE_NAME);
            xTable.setAutoFlush(false);
            xTable.setWriteBufferSize(12 * 1024 * 1024);
            wal = false; // assumption: the WAL is skipped to speed up the bulk load
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String input = args[0];
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.master", "m0:60000");
        Job job = new Job(conf, "HBaseImport");
        job.setJarByClass(HBaseImport.class);
        job.setMapperClass(Map.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, input);
        job.setOutputFormatClass(NullOutputFormat.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        int res = 1;
        try {
            res = ToolRunner.run(conf, new HBaseImport(), otherArgs);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(res);
    }
}
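A note on the settings used above: setAutoFlush(false) plus the enlarged 12 MB write buffer lets each put() accumulate on the client side and ship to the region servers in batches, and cleanup() calls flushCommits() so the last partial batch is not lost when the task ends. Calling setWriteToWAL(false) skips the write-ahead log for additional speed, at the cost of possibly losing recently written rows if a region server crashes during the load.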
Note the methods of the Mapper class that can be overridden:
protected void cleanup(Mapper.Context context): called once at the end of the task
protected void map(KEYIN key, VALUEIN value, Mapper.Context context): called once for each key/value pair in the input split
void run(Mapper.Context context): drives the task by calling setup(), then map() for every record, then cleanup(); expert users can override it for full control
protected void setup(Mapper.Context context): called once at the beginning of the task
3. Loading with a Java client program
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
public class InsertContactJava {
    public static long startTime;
    public static long rowkey = 0; // starting rowkey
    public static final int lineCount = 100000; // rows loaded per commit
    public static String tableName = "usercontact_kang"; // destination table name
    public static int countLie = 8; // number of columns in the table

    public static void main(String[] args) throws IOException {
        startTime = System.currentTimeMillis() / 1000;
        System.out.println("start time = " + startTime);
        Thread t1 = new Thread() {
            @Override
            public void run() {
                try {
                    insert_one("/run/jar/123");
                    //loadByLieWithVector("/run/jar/123");
                    //loadByLieWithArrayList("/run/jar/123");
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };
        t1.start();
    }

    public static void insert_one(String path) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, tableName);
        File f = new File(path);
        ArrayList<Put> list = new ArrayList<Put>();
        BufferedReader br = new BufferedReader(new FileReader(f));
        String tmp = br.readLine();
        int count = 0;
        while (tmp != null) {
            // commit the buffered puts once the batch grows past 10000 rows
            if (list.size() > 10000) {
                table.put(list);
                table.flushCommits();
                list.clear();
            }
            String arr_value[] = tmp.split("\t", 10);
            String first[] = arr_value[0].split("~", 5);
            String second[] = arr_value[1].split("~", 5);
            String rowname = getIncreasRowKey(); // project helper (not shown): generates an increasing rowkey
            String firstaccount = first[0];
            String firstprotocolid = first[1];
            String firstdomain = first[2];
            String inserttime = Utils.getToday("yyyyMMdd"); // project helper (not shown): current date string
            String secondaccount = second[0];
            String secondprotocolid = second[1];
            String seconddomain = second[2];
            String timescount = Integer.valueOf(arr_value[2]).toString();
            Put p = new Put(rowname.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTACCOUNT".getBytes(), firstaccount.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTDOMAIN".getBytes(), firstdomain.getBytes());
            p.add("ucvalue".getBytes(), "FIRSTPROTOCOLID".getBytes(), firstprotocolid.getBytes());
            p.add("ucvalue".getBytes(), "INSERTTIME".getBytes(), inserttime.getBytes());
            p.add("ucvalue".getBytes(), "SECONDACCOUNT".getBytes(), secondaccount.getBytes());
            p.add("ucvalue".getBytes(), "SECONDDOMAIN".getBytes(), seconddomain.getBytes());
            p.add("ucvalue".getBytes(), "SECONDPROTOCOLID".getBytes(), secondprotocolid.getBytes());
            p.add("ucvalue".getBytes(), "TIMESCOUNT".getBytes(), timescount.getBytes());
            list.add(p);
            tmp = br.readLine();
            count++;
        }
        br.close();
        // commit whatever is left in the batch
        if (list.size() > 0) {
            table.put(list);
            table.flushCommits();
        }
        table.close();
        System.out.println("total = " + count);
        long endTime = System.currentTimeMillis() / 1000;
        long costTime = endTime - startTime;
        System.out.println("end time = " + endTime);
        System.out.println(path + ": cost time = " + costTime);
    }
}
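The key design choice here is batching: Puts are collected in a list and committed roughly every 10,000 rows with table.put(list) followed by flushCommits(), so the RPC cost is amortized over many rows instead of one round trip per row. The helpers getIncreasRowKey() and Utils.getToday() belong to the original project and are not shown in this excerpt.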
4. Comparison of the loading methods
- HFile generation:
Generating the HFiles is fairly slow, but loading the finished HFiles into HBase is very fast; it is essentially just a move operation on HDFS. One improvement to this approach is to sort the data first and then generate the HFiles.
The HFile approach is the fastest of all the loading schemes, with one precondition: the data is being imported for the first time and the table is empty. If the table already contains data, importing HFiles into it can trigger region splits, and in the worst case that can take an hour.
- MapReduce:
Fast at first, but because the MapReduce job and HBase compete for the same resources, throughput drops sharply after a certain point.
- Java client program:
Loading from multiple clients with multiple threads in parallel currently looks like the best approach: the clients and the region servers sit on separate machines, disk reads and writes are separated, and the bottleneck is only the network and memory. Most of the experienced people I asked recommend this approach, and they stress using multiple clients and multiple threads; a minimal sketch follows below.
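A minimal sketch of that multi-client, multi-threaded pattern (not from the original post; the table name, column layout, input paths, and thread count are placeholder assumptions): each worker thread opens its own HTable, because HTable instances are not thread-safe, and loads its own slice of the input.
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

public class MultiThreadLoader {
    public static void main(String[] args) throws Exception {
        final Configuration conf = HBaseConfiguration.create();
        // one input file per worker thread (placeholder paths)
        final String[] inputs = { "/run/jar/part-0", "/run/jar/part-1",
                "/run/jar/part-2", "/run/jar/part-3" };
        ExecutorService pool = Executors.newFixedThreadPool(inputs.length);
        for (final String path : inputs) {
            pool.submit(new Runnable() {
                public void run() {
                    try {
                        // HTable is not thread-safe, so every worker opens its own instance
                        HTable table = new HTable(conf, "usercontact_kang");
                        table.setAutoFlush(false);
                        table.setWriteBufferSize(12 * 1024 * 1024);
                        loadFile(table, path);
                        table.flushCommits();
                        table.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            });
        }
        pool.shutdown();
    }

    // Same line-parsing idea as insert_one() above; assumes each line is "rowkey \t value" (placeholder layout).
    static void loadFile(HTable table, String path) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader(path));
        List<Put> batch = new ArrayList<Put>();
        String line;
        while ((line = br.readLine()) != null) {
            String[] cols = line.split("\t");
            Put p = new Put(cols[0].getBytes());
            p.add("ucvalue".getBytes(), "VALUE".getBytes(), cols[1].getBytes());
            batch.add(p);
            if (batch.size() >= 10000) {
                table.put(batch);
                batch.clear();
            }
        }
        if (!batch.isEmpty()) {
            table.put(batch);
        }
        br.close();
    }
}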
Adapted from: http://blog.sina.com.cn/s/blog_6c994d8f01015fdr.html