把xls的数据导到Hbase

最新推荐文章于 2024-04-24 10:15:22 发布

原创最新推荐文章于 2024-04-24 10:15:22 发布 · 187 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#大数据 #java

专业知识同时被 2 个专栏收录

177 篇文章

订阅专栏

工具api

85 篇文章

订阅专栏

本文详细介绍了如何使用HBase进行数据导入及MapReduce操作，通过提供示例代码，展示了如何将CSV文件数据转换并存储到HBase中。重点强调了配置参数、数据格式解析以及MapReduce流程的基本应用。

这是HBase自带的一个例子，不过原例子有点问题，需要稍作修改。
其实我觉得HBase就是一种BigTable，它的表结构和xls表格真的很像。闲话不多说，上code才是王道。

Java代码

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/**
 * Sample Uploader MapReduce
 * <p>
 * This is EXAMPLE code. You will need to change it to work for your context.
 * <p>
 * Uses {@link TableReducer} to put the data into HBase. Change the InputFormat
 * to suit your data. In this example, we are importing a CSV file.
 * <p>
 * <pre>row,family,qualifier,value</pre>
 * <p>
 * The table and columnfamily we're to insert into must preexist.
 * <p>
 * There is no reducer in this example as it is not necessary and adds
 * significant overhead. If you need to do any massaging of data before
 * inserting into HBase, you can do this in the map as well.
 * <p>Do the following to start the MR job:
 * <pre>
 * ./bin/hadoop org.apache.hadoop.hbase.mapreduce.SampleUploader /tmp/input.csv TABLE_NAME
 * </pre>
 * <p>
 * This code was written against HBase 0.21 trunk.
 */
public class SampleUploader {

  // Logger shared through Wloger, which is declared outside this file
  // (presumably a project-wide log4j holder -- confirm).
  public static Logger loger = Wloger.loger;

  private static final String NAME = "SampleUploader";

  /** Name of the job counter that records input lines skipped as malformed. */
  private static final String MALFORMED_LINES = "MALFORMED_LINES";

  /**
   * Map-only task that turns each CSV line into a single-cell {@link Put}
   * and emits it keyed by the row.
   */
  static class Uploader
      extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    /** Number of comma-separated fields expected: row,family,qualifier,value. */
    private static final int FIELD_COUNT = 4;

    private long checkpoint = 100;
    private long count = 0;

    /**
     * Converts one CSV line into a {@link Put} and writes it to the context.
     *
     * @param key     position of the line in the input; with TextInputFormat
     *                this is the byte offset, not a line number
     * @param line    one comma-delimited line: row,family,qualifier,value
     * @param context task context used to emit the Put and report status
     * @throws IOException if the write fails or the task is interrupted
     *                     while writing
     */
    @Override
    public void map(LongWritable key, Text line, Context context)
        throws IOException {

      // Split CSV line; anything that is not exactly four fields is skipped,
      // but counted so that dropped input shows up in the job counters.
      String[] values = line.toString().split(",");
      if (values.length != FIELD_COUNT) {
        context.getCounter(NAME, MALFORMED_LINES).increment(1);
        return;
      }

      // Extract each value
      byte[] row = Bytes.toBytes(values[0]);
      byte[] family = Bytes.toBytes(values[1]);
      byte[] qualifier = Bytes.toBytes(values[2]);
      byte[] value = Bytes.toBytes(values[3]);

      // Per-record logging is only useful when debugging; the guard avoids
      // building the message for every input line.
      if (loger.isDebugEnabled()) {
        loger.debug(values[0] + ":" + values[1] + ":" + values[2] + ":" + values[3]);
      }

      // Create Put
      Put put = new Put(row);
      put.add(family, qualifier, value);

      // To trade durability for speed, the WAL can be disabled on the Put
      // (setWriteToWAL(false)); a RegionServer crash then loses data.

      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        // Do not swallow the interrupt: restore the flag and fail the task
        // so the record is not silently lost.
        Thread.currentThread().interrupt();
        loger.error("write到hbase 异常:", e);
        throw new IOException("Interrupted while writing row " + values[0], e);
      }

      // Set status every checkpoint lines
      if (++count % checkpoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    }
  }

  /**
   * Job configuration.
   *
   * @param conf configuration the job is created from
   * @param args {@code args[0]} is the input path, {@code args[1]} the
   *             name of the target table
   * @return the configured map-only job
   * @throws IOException if the job cannot be set up
   */
  public static Job configureJob(Configuration conf, String[] args)
      throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    loger.info("TableName:" + tableName);
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
  }

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Wrong number of arguments: " + otherArgs.length);
      System.err.println("Usage: " + NAME + " <input> <tablename>");
      System.exit(-1);
    }
    Job job = configureJob(conf, otherArgs);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;

/**
 * Sample Uploader MapReduce
 * <p>
 * This is EXAMPLE code.  You will need to change it to work for your context.
 * <p>
 * Uses {@link TableReducer} to put the data into HBase. Change the InputFormat
 * to suit your data.  In this example, we are importing a CSV file.
 * <p>
 * <pre>row,family,qualifier,value</pre>
 * <p>
 * The table and columnfamily we're to insert into must preexist.
 * <p>
 * There is no reducer in this example as it is not necessary and adds
 * significant overhead.  If you need to do any massaging of data before
 * inserting into HBase, you can do this in the map as well.
 * <p>Do the following to start the MR job:
 * <pre>
 * ./bin/hadoop org.apache.hadoop.hbase.mapreduce.SampleUploader /tmp/input.csv TABLE_NAME
 * </pre>
 * <p>
 * This code was written against HBase 0.21 trunk.
 */
public class SampleUploader {

  // Logger obtained from Wloger, a class not shown in this file
  // (NOTE(review): looks like a shared log4j holder -- verify).
  public static Logger loger = Wloger.loger;

  private static final String NAME = "SampleUploader";

  /** Job counter incremented for every input line rejected as malformed. */
  private static final String MALFORMED_LINES = "MALFORMED_LINES";

  /**
   * Mapper that parses CSV lines and emits one {@link Put} per valid line,
   * keyed by the row bytes. The job runs without reducers.
   */
  static class Uploader
  extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    /** A valid line has exactly these many fields: row,family,qualifier,value. */
    private static final int FIELD_COUNT = 4;

    private long checkpoint = 100;
    private long count = 0;

    /**
     * Parses a single CSV line and writes the resulting {@link Put}.
     *
     * @param key     input key supplied by the input format (a byte offset
     *                for TextInputFormat, not a line number); unused
     * @param line    comma-delimited text: row,family,qualifier,value
     * @param context task context receiving the Put and status updates
     * @throws IOException if writing fails, including when the write is
     *                     interrupted
     */
    @Override
    public void map(LongWritable key, Text line, Context context)
    throws IOException {

      // Input is a CSV file
      // Each map() call handles a single line
      // Each line is comma-delimited; row,family,qualifier,value

      // Split CSV line
      String [] values = line.toString().split(",");
      if(values.length != FIELD_COUNT) {
        // Make skipped input visible in the job counters instead of
        // dropping it without a trace.
        context.getCounter(NAME, MALFORMED_LINES).increment(1);
        return;
      }

      // Extract each value
      byte [] row = Bytes.toBytes(values[0]);
      byte [] family = Bytes.toBytes(values[1]);
      byte [] qualifier = Bytes.toBytes(values[2]);
      byte [] value = Bytes.toBytes(values[3]);

      // Logging every record at INFO floods task logs on real inputs;
      // keep it available at DEBUG only.
      if(loger.isDebugEnabled()) {
        loger.debug(values[0]+":"+values[1]+":"+values[2]+":"+values[3]);
      }

      // Create Put
      Put put = new Put(row);
      put.add(family, qualifier, value);

      // Disabling the WAL on the Put (setWriteToWAL(false)) improves
      // performance but means data loss if a RegionServer crashes.

      try {
        context.write(new ImmutableBytesWritable(row), put);
      } catch (InterruptedException e) {
        // Preserve the interrupt status and surface the failure rather
        // than continuing as if the record had been written.
        Thread.currentThread().interrupt();
        loger.error("write到hbase 异常:",e);
        throw new IOException("Interrupted while writing row " + values[0], e);
      }

      // Set status every checkpoint lines
      if(++count % checkpoint == 0) {
        context.setStatus("Emitting Put " + count);
      }
    }
  }

  /**
   * Job configuration.
   *
   * @param conf  configuration used to create the job
   * @param args  input path at index 0, target table name at index 1
   * @return the job, configured with the Uploader mapper and zero reducers
   * @throws IOException if job setup fails
   */
  public static Job configureJob(Configuration conf, String [] args)
  throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    loger.info("TableName:"+tableName);
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
  }

  /**
   * Main entry point.
   *
   * @param args  The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if(otherArgs.length != 2) {
      System.err.println("Wrong number of arguments: " + otherArgs.length);
      System.err.println("Usage: " + NAME + " <input> <tablename>");
      System.exit(-1);
    }
    Job job = configureJob(conf, otherArgs);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Map/Reduce的输入/输出就不说了，不懂的，可以看hadoop专栏去.
[这个任务调用和上一个IndexBuilder有些不同哦，具体的可以参照上一个例子，相同点：都只有map任务]
xls内容如下:

Java代码

key3,family1,column1,xls1
key3,family1,column2,xls11
key4,family1,column1,xls2
key4,family1,column2,xls12

key3,family1,column1,xls1
key3,family1,column2,xls11
key4,family1,column1,xls2
key4,family1,column2,xls12

这是csv格式的数据；如果手头是xls文件，可以先把它导出为csv格式，具体做法可以google一下。
运行命令如下:

Java代码

bin/hadoop jar SampleUploader.jar SampleUploader /tmp/input.csv 'table1'

bin/hadoop jar SampleUploader.jar SampleUploader /tmp/input.csv 'table1'

这里的'table1'是上一篇IndexBuilder的时候建的表，表就使用上一张表[懒]
注意，这里使用的文件需要提交到hdfs上，否则会提示找不到，因为map/reduce使用的是hdfs的文件系统.

http://www.iteye.com/topic/1117572