Store the already-crawled web public-opinion data (saved as txt files) in HBase, and then analyze it.
Requirements:
Use the file name as the Key and the entire file content as the Value.
Approach:
First upload the txt files to HDFS, then use an HBase MapReduce job to write them into HBase. (A very straightforward approach.)
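Before the job runs, the target table must already exist in HBase with the column family the Reducer writes to; the txt files themselves can be placed under /tool on HDFS with hadoop fs -put. Below is a minimal sketch of the table-creation step, assuming the table name "tool" and column family "context" used in the code that follows; CreateToolTable is just a hypothetical helper class, and the older HBaseAdmin API is used to match the rest of the code.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
//hypothetical helper class, not part of the original job
public class CreateToolTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "hadoop00");
        HBaseAdmin admin = new HBaseAdmin(conf);
        //create table "tool" with column family "context" if it does not exist yet
        if (!admin.tableExists("tool")) {
            HTableDescriptor desc = new HTableDescriptor("tool");
            desc.addFamily(new HColumnDescriptor("context"));
            admin.createTable(desc);
        }
        admin.close();
    }
}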
Problem analysis:
The first issue to sort out is how to read and parse the txt files. TextInputFormat is the default parser, but it splits a file into one record per line, which clearly cannot satisfy the requirement here. A custom input format class is therefore needed, naturally extending FileInputFormat.
This article presents WholeFileInputFormat, a custom FileInputFormat that reads an entire file as a single record, so that the file name becomes the Key and the whole file content becomes the Value stored in HBase.
package file;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class FileInputFormatMake {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
//set the ZooKeeper quorum
conf.set("hbase.zookeeper.quorum", "hadoop00");
//set the target HBase table name
conf.set(TableOutputFormat.OUTPUT_TABLE, "tool");
//raise this value to keep HBase from timing out
conf.set("dfs.socket.timeout", "180000");
Job job = new Job(conf,"HBaseBatchImport");
job.setMapperClass(BatchImportMapper.class);
job.setReducerClass(BatchImportReducer.class);
job.setNumReduceTasks(1);
//set the map output types; the reduce output types are handled by TableOutputFormat
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(WholeFileInputFormat.class);
//no output path is set, only the output format class
job.setOutputFormatClass(TableOutputFormat.class);
FileInputFormat.setInputPaths(job, "hdfs://hadoop00:9000/tool");//the tool directory contains multiple txt files
job.waitForCompletion(true);
}
static class BatchImportMapper extends Mapper<Text, Text, Text, Text> {
@Override
protected void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
//key is the file name, value is the whole file content; pass both straight through
context.write(key, value);
}
}
static class BatchImportReducer extends TableReducer<Text,Text,NullWritable>{
@Override
protected void reduce(Text k2, Iterable<Text> v2,Context context)
throws IOException, InterruptedException {
for (Text text : v2) {
//row key = file name; the file content goes into column family "context", qualifier "value"
Put put = new Put(Bytes.toBytes(k2.toString()));
put.add(Bytes.toBytes("context"), Bytes.toBytes("value"), Bytes.toBytes(text.toString()));
context.write(NullWritable.get(), put);
}
}
}
static class WholeFileInputFormat extends FileInputFormat<Text, Text> {
@Override
protected boolean isSplitable(JobContext context, Path filename) {
//never split: each file must be read as a single record
return false;
}
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException,
InterruptedException {
return new WholeFileRecordReader();
}
}
//emits exactly one record per file: key = file name, value = entire file content
static class WholeFileRecordReader extends RecordReader<Text, Text> {
private FileSplit fileSplit;
private FSDataInputStream fis;
private Text key = null;
private Text value = null;
private boolean processed = false;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
fileSplit = (FileSplit)inputSplit;
Configuration job = context.getConfiguration();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(job);
fis = fs.open(file);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if(key == null){
key = new Text();
}
if(value == null){
value = new Text();
}
if(!processed){
byte[] content = new byte[(int)fileSplit.getLength()];
Path file = fileSplit.getPath();
System.out.println(file.getName());
key.set(file.getName());
IOUtils.readFully(fis, content, 0, content.length);
//adjust the charset to match the encoding of the crawled txt files
String sendString = new String(content, "ISO-8859-1");
System.out.println(sendString);
value.set(sendString);
processed = true;
return true;
}
return false;
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return this.key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return this.value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
//report 0 before the single record has been consumed, 1 afterwards
return processed ? 1.0f : 0.0f;
}
@Override
public void close() throws IOException {
//release the underlying HDFS stream
IOUtils.closeStream(fis);
}
}
}
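After the job completes, each txt file should occupy one row in the "tool" table: the row key is the file name and the content sits under context:value. Below is a minimal read-back sketch for spot-checking the import, assuming a hypothetical file name "crawl-001.txt" and the same charset used in the RecordReader; ReadToolRow is just an illustrative class name.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
//illustrative verification class, not part of the original job
public class ReadToolRow {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "hadoop00");
        HTable table = new HTable(conf, "tool");
        //"crawl-001.txt" is a hypothetical row key; use the name of one of the uploaded files
        Get get = new Get(Bytes.toBytes("crawl-001.txt"));
        Result result = table.get(get);
        byte[] raw = result.getValue(Bytes.toBytes("context"), Bytes.toBytes("value"));
        //decode with the same charset the RecordReader used when reading the file
        System.out.println(new String(raw, "ISO-8859-1"));
        table.close();
    }
}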