Over the past couple of days I have looked at the basics of HBase and also revisited the basics of Hadoop MapReduce (though my earlier study of it was fairly shallow and my understanding is not deep). I had planned to finish two things last night: 1. use a map task to read a file on HDFS and import it into HBase; 2. use a map task to read the data in HBase and use a reduce task to write it out to a file.
The data I used is as follows (only a small amount of data):
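(Each line of the file is a comma-separated rowkey,family,qualifier,value record, with the column family f1 and the qualifiers name and age that JobTwo below reads back. Purely as a hypothetical illustration, not my actual data, a line might look like row1,f1,name,tom.)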
First the code, then the problems:
package org.fansy.date830;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class JobOne {
    /**
     * use map job to read file data and then import the data to HBase
     * start:22:44
     * test:OK
     * end: 22:59
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Wrong number of arguments: " + otherArgs.length);
            System.err.println("Usage: <input> <tablename>");
            System.exit(-1);
        }
        Job job = new Job(conf, "import data to hbase");
        job.setJarByClass(JobOne.class);
        job.setMapperClass(MapperClass.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        TableMapReduceUtil.initTableReducerJob(args[1], null, job);
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, args[0]);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class MapperClass extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        public void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
            String[] values = line.toString().split(",");
            if (values.length != 4) { // if there are not four fields, then return
                return;
            }
            byte[] row = Bytes.toBytes(values[0]);
            byte[] family = Bytes.toBytes(values[1]);
            byte[] qualifier = Bytes.toBytes(values[2]);
            byte[] value = Bytes.toBytes(values[3]);
            Put put = new Put(row);
            put.add(family, qualifier, value);
            // at first I did not add the next line, to test whether it is needed;
            // it turns out that it must be added
            context.write(new ImmutableBytesWritable(row), put);
        }
    }
}
I referred to an article online: http://www.cnblogs.com/liqizhou/archive/2012/05/17/2504279.html. That article reads the data in a map task and then imports it into HBase in the reduce phase; my feeling is that if the map task alone can do the job, there is no need for a reduce phase. One thing to note in the code above: the context.write(...) line must be present, otherwise nothing gets written into HBase. The reduce task in that article, however, has no context.write(...), and I am not sure whether that works (then again, my grasp of MapReduce is still fairly shallow).
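As far as I understand it, TableMapReduceUtil.initTableReducerJob(args[1], null, job) together with setNumReduceTasks(0) simply wires the job up to write its Puts through TableOutputFormat. The following is only a sketch of that manual setup, reusing the MapperClass above; the table name "testtable" and the input path are placeholders, not values from my run:

package org.fansy.date830;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class JobOneManualSetup {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "import data to hbase (map-only, manual setup)");
        job.setJarByClass(JobOneManualSetup.class);
        job.setMapperClass(JobOne.MapperClass.class); // reuse the mapper defined in JobOne above
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        // roughly what initTableReducerJob(tableName, null, job) configures internally:
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "testtable"); // placeholder table name
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Put.class);
        job.setNumReduceTasks(0); // map-only: each Put the mapper writes goes straight to HBase
        FileInputFormat.setInputPaths(job, "/path/to/input"); // placeholder input path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}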
Then I checked the imported data in the hbase shell:
Then came task two. I did not solve it last night; I worked until almost 12 o'clock without getting it done and was getting a bit groggy, so I went to sleep and got up early this morning to finish it.
The code first:
package org.fansy.date830;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
//import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
//import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
//import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
//import org.fansy.date830.JobOne.MapperClass;
public class JobTwo {
    /**
     * use map job to read from HBase, and use reduce job to output the data to a file
     * start:23:00
     *
     * end: 2012/08/31 09:30
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 1) {
            System.err.println("Wrong number of arguments: " + otherArgs.length);
            System.err.println("Usage: <output>");
            System.exit(-1);
        }
        Job job = new Job(conf, "read data from hbase and import it to a file");
        Scan scan = new Scan();
        job.setJarByClass(JobTwo.class);
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(Text.class);
        TableMapReduceUtil.initTableMapperJob("testtable".getBytes(), scan, MapperClass.class, Text.class, Text.class, job);
        job.setReducerClass(ReducerClass.class);
        // FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class MapperClass extends TableMapper<Text, Text> {
        public void map(ImmutableBytesWritable row, Result result, Context context) throws IOException, InterruptedException {
            // String newrowq = row.toString();
            String newrow = Bytes.toString(result.getRow());
            String newvalue = null;
            if (result.containsColumn("f1".getBytes(), "age".getBytes())) {
                newvalue = Bytes.toString(result.getValue("f1".getBytes(), "age".getBytes()));
                context.write(new Text(newrow), new Text(newvalue));
            }
            if (result.containsColumn("f1".getBytes(), "name".getBytes())) {
                newvalue = Bytes.toString(result.getValue("f1".getBytes(), "name".getBytes()));
                context.write(new Text(newrow), new Text(newvalue));
            }
        }
    }

    public static class ReducerClass extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuffer str = new StringBuffer();
            for (Text val : values) {
                str.append(val.toString());
            }
            context.write(key, new Text(str.toString()));
        }
    }
}
First, the problems I ran into.
One:
TableMapReduceUtil.initTableMapperJob("testtable".getBytes(), scan,MapperClass.class, Text.class,Text.class,job);
For this line, I did not know what many of the parameters meant last night, so the job kept complaining that the map output types were wrong. Only after reading the API docs carefully did I realize I had configured it incorrectly; it left me dazed all evening.
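For my own reference, this is how I now read the parameters (the comments are my understanding of the HBase API used above; the argument values are the ones from the code):

TableMapReduceUtil.initTableMapperJob(
        "testtable".getBytes(),  // the HBase table the mappers scan
        scan,                    // the Scan that decides which rows/columns the mappers see
        MapperClass.class,       // the TableMapper implementation
        Text.class,              // map output KEY class - must match the mapper's declared key type
        Text.class,              // map output VALUE class - must match the mapper's declared value type
        job);                    // the Job to configure (TableInputFormat is also set on it here)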
Two:
context.write(new Text(newrow), new Text(newvalue));
Originally this statement sat only at the very end of the map function, but when the job finished I saw that the map input and output record counts were both 8, which could not be right; the output should have been 16 records. After I changed it to the code above, the map input was still 8 but the output became 16. My guess is that cells with the same row key are handed to the map as one record: the table input format gives each map() call a single Result per row (hence 8 inputs), and since each row here has both an age and a name column, writing once per column produces 16 outputs.
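If one does not want to hard-code the f1:age and f1:name columns, a mapper along the following lines should behave the same way (just a sketch I have not run; it iterates over every KeyValue in the Result and needs an extra import of org.apache.hadoop.hbase.KeyValue):

    public static class GenericMapperClass extends TableMapper<Text, Text> {
        public void map(ImmutableBytesWritable row, Result result, Context context) throws IOException, InterruptedException {
            String newrow = Bytes.toString(result.getRow());
            // result.raw() returns every KeyValue (cell) of this row,
            // so each column becomes one output record, whatever its qualifier is
            for (KeyValue kv : result.raw()) {
                context.write(new Text(newrow), new Text(Bytes.toString(kv.getValue())));
            }
        }
    }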
Finally, the output file:
I am just getting started with HBase. I have worked with Hadoop MapReduce before, but not in depth, and there is still a lot I do not understand; I hope to understand it more deeply in the future.
Share, and grow.