Using MapReduce to read txt files into HBase, with the file name as the key and the entire file content as the value

The goal is to take already-crawled public-opinion data, stored as txt files, load it into HBase, and then analyze it there.

Requirement:

Use the file name as the key and the entire file content as the value.

Approach:

First upload the txt files to HDFS, then use an HBase MapReduce job to write them into HBase. (A very straightforward approach.)
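The txt files can be uploaded with hadoop fs -put, and the target table must already exist in HBase with a column family matching the one the reducer writes to. Below is a minimal sketch of that table setup using the old HBase client API; the table name "tool", the column family "context", and the quorum host "hadoop00" are taken from the job code further down, while the class name CreateToolTable is just a placeholder.

package file;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateToolTable {

	public static void main(String[] args) throws IOException {
		Configuration conf = HBaseConfiguration.create();
		conf.set("hbase.zookeeper.quorum", "hadoop00");
		HBaseAdmin admin = new HBaseAdmin(conf);
		// Create table "tool" with column family "context" (the family the reducer writes to)
		if (!admin.tableExists("tool")) {
			HTableDescriptor desc = new HTableDescriptor("tool");
			desc.addFamily(new HColumnDescriptor("context"));
			admin.createTable(desc);
		}
		admin.close();
	}
}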

Problem analysis:

The first question is how to read and parse the txt files. TextInputFormat is the default input format, but it splits a file into line-by-line records, which clearly cannot satisfy the requirement here. So a custom input format is needed (extending FileInputFormat, of course).

This post presents WholeFileInputFormat, a custom FileInputFormat that reads an entire file as a single record, so that each file ends up in HBase with its name as the key and its full content as the value.

package file;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileInputFormatMake {

	public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
		Configuration conf = new Configuration();
		// ZooKeeper quorum used by the HBase client
		conf.set("hbase.zookeeper.quorum", "hadoop00");
		// Name of the target HBase table
		conf.set(TableOutputFormat.OUTPUT_TABLE, "tool");
		// Raise the socket timeout so HBase does not time out during the import
		conf.set("dfs.socket.timeout", "180000");
		Job job = new Job(conf, "HBaseBatchImport");
		job.setMapperClass(BatchImportMapper.class);
		job.setReducerClass(BatchImportReducer.class);
		job.setNumReduceTasks(1);
		// Only the map output types are set; TableOutputFormat takes care of the reduce output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setInputFormatClass(WholeFileInputFormat.class);
		// No output path is set, only the output format class
		job.setOutputFormatClass(TableOutputFormat.class);
		FileInputFormat.setInputPaths(job, "hdfs://hadoop00:9000/tool"); // the tool directory contains multiple txt files
		job.waitForCompletion(true);
	}
	
	
	static class BatchImportMapper extends Mapper<Text, Text, Text, Text> {
		@Override
		protected void map(Text key, Text value, Context context)
				throws IOException, InterruptedException {
			// key = file name, value = whole file content; pass both through unchanged
			context.write(key, value);
		}
	}
	
	
	static class BatchImportReducer extends TableReducer<Text, Text, NullWritable> {
		@Override
		protected void reduce(Text k2, Iterable<Text> v2, Context context)
				throws IOException, InterruptedException {
			for (Text text : v2) {
				// Row key = file name; the file content goes into column family "context", qualifier "value"
				Put put = new Put(Bytes.toBytes(k2.toString()));
				put.add(Bytes.toBytes("context"), Bytes.toBytes("value"), Bytes.toBytes(text.toString()));
				context.write(NullWritable.get(), put);
			}
		}
	}
	
	
	
	static class WholeFileInputFormat extends FileInputFormat<Text, Text> {
		@Override
		protected boolean isSplitable(JobContext context, Path filename) {
			// Each file must be handed to a mapper as a single record, so never split it
			return false;
		}

		@Override
		public RecordReader<Text, Text> createRecordReader(InputSplit split,
				TaskAttemptContext context) throws IOException, InterruptedException {
			return new WholeFileRecordReader();
		}
	}
	static class WholeFileRecordReader extends RecordReader<Text, Text> {
		private FileSplit fileSplit;
		private FSDataInputStream fis;
		private Text key = null;
		private Text value = null;
		private boolean processed = false;

		@Override
		public void initialize(InputSplit inputSplit, TaskAttemptContext context)
				throws IOException, InterruptedException {
			fileSplit = (FileSplit) inputSplit;
			Configuration job = context.getConfiguration();
			Path file = fileSplit.getPath();
			FileSystem fs = file.getFileSystem(job);
			fis = fs.open(file);
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (key == null) {
				key = new Text();
			}
			if (value == null) {
				value = new Text();
			}
			if (!processed) {
				// Read the entire file into memory in one shot
				byte[] content = new byte[(int) fileSplit.getLength()];
				Path file = fileSplit.getPath();
				key.set(file.getName());
				IOUtils.readFully(fis, content, 0, content.length);
				// Decode with ISO-8859-1 so every byte maps to a character
				String sendString = new String(content, "ISO-8859-1");
				value.set(sendString);
				processed = true;
				return true;
			}
			return false;
		}

		@Override
		public Text getCurrentKey() throws IOException, InterruptedException {
			return this.key;
		}

		@Override
		public Text getCurrentValue() throws IOException, InterruptedException {
			return this.value;
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			// There is only one record per file, so progress is either not started or done
			return processed ? 1.0f : 0.0f;
		}

		@Override
		public void close() throws IOException {
			IOUtils.closeStream(fis);
		}
	}
	
	

}
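Once the job completes, each file can be fetched back from HBase with its name as the row key. The following sketch uses the old HTable client API to read one cell; the row key "news1.txt" and the class name ReadToolTable are only hypothetical placeholders.

package file;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class ReadToolTable {

	public static void main(String[] args) throws IOException {
		Configuration conf = HBaseConfiguration.create();
		conf.set("hbase.zookeeper.quorum", "hadoop00");
		HTable table = new HTable(conf, "tool");
		// Row key is the original file name; "news1.txt" is only an example name
		Get get = new Get(Bytes.toBytes("news1.txt"));
		Result result = table.get(get);
		byte[] stored = result.getValue(Bytes.toBytes("context"), Bytes.toBytes("value"));
		if (stored != null) {
			// Bytes.toBytes in the reducer used UTF-8, so decode the cell the same way;
			// the raw file bytes can be recovered via getBytes("ISO-8859-1") if needed
			System.out.println(Bytes.toString(stored));
		}
		table.close();
	}
}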

