自定义的 InputFormat 可以更好地读入数据。TextInputFormat 一次只能读取一行，本程序主要实现了一条记录跨多行的读取。
MboxFileFormat.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/**
* Used to read Mbox files
* @author Srinath Perera (hemapani@apache.org)
*/
public class MboxFileFormat extends FileInputFormat<Text, Text>{
private MBoxFileReader boxFileReader = null;
@Override
public RecordReader<Text, Text> createRecordReader(
InputSplit inputSplit, TaskAttemptContext attempt) throws IOException,
InterruptedException {
boxFileReader = new MBoxFileReader();
boxFileReader.initialize(inputSplit, attempt);
return boxFileReader;
}
}
MBoxFileReader.java
这个类是实现多行读入的关键,而主要的实现在nextKeyValue函数中,默认是一旦下一行的字符串中包含http则不把下一行读入,相当于http为分隔符读入一条条记录,是我用来分析爬虫爬取网页的代码。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
* Parse each mail line by line from MBox stream
* @author Srinath Perera (hemapani@apache.org)
*/
/**
 * Reads multi-line records from a text stream. Lines are accumulated into one
 * record until a line containing "http" is seen; that delimiter line becomes
 * the first line of the NEXT record. The key emitted for every record is the
 * fixed string "i love you" (kept from the original implementation).
 *
 * <p>NOTE(review): lines are concatenated without a separator, so line
 * boundaries inside a record are lost — this matches the original behavior;
 * confirm it is intended before changing it.
 */
public class MBoxFileReader extends RecordReader<Text, Text> {
    private BufferedReader reader;       // line reader over the split's file
    private FSDataInputStream fsStream;  // kept to report byte position for getProgress()
    private long splitLength;            // total bytes in the split, for progress reporting
    private Text key;
    private Text value;
    // Accumulates the lines of the record currently being built; set to null
    // once the trailing record has been emitted (end-of-input sentinel).
    private StringBuffer email = new StringBuffer();
    private String line = null;

    public MBoxFileReader() {
    }

    /**
     * Opens the split's file for reading.
     *
     * @param inputSplit must be a {@link FileSplit}
     * @param attempt    supplies the Hadoop configuration
     * @throws IOException if the file cannot be opened
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext attempt)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) inputSplit;
        Path path = split.getPath();
        splitLength = split.getLength();
        FileSystem fs = FileSystem.get(URI.create(path.toString()), attempt.getConfiguration());
        fsStream = fs.open(path);
        // Explicit UTF-8: decoding with the platform default charset makes the
        // job's output depend on the machine it happens to run on.
        reader = new BufferedReader(new InputStreamReader(fsStream, "UTF-8"));
    }

    /**
     * Advances to the next record.
     *
     * @return true if a record was produced (key/value are set); false at end
     *         of input
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (email == null) {
            // End of input already reached and the trailing record emitted.
            return false;
        }
        while ((line = reader.readLine()) != null) {
            if (line.contains("http")) {
                // Delimiter: emit the accumulated record; the delimiter line
                // starts the next one.
                key = new Text("i love you");
                value = new Text(email.toString());
                email = new StringBuffer();
                email.append(line);
                return true;
            }
            email.append(line);
        }
        // EOF: emit whatever is left, unless the input produced nothing at all
        // (the original emitted one empty record for an empty file).
        if (email.length() == 0) {
            email = null;
            return false;
        }
        key = new Text("i love you");
        value = new Text(email.toString());
        email = null;
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Fraction of the split consumed, in [0, 1] as the RecordReader contract
     * requires (the original returned an unbounded record count). Approximated
     * by the underlying stream position, which runs slightly ahead of the
     * records returned because of read-ahead buffering.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (splitLength == 0) {
            return email == null ? 1.0f : 0.0f;
        }
        return Math.min(1.0f, fsStream.getPos() / (float) splitLength);
    }

    @Override
    public void close() throws IOException {
        // Guard against initialize() never having run (or having failed).
        if (reader != null) {
            reader.close();
        }
    }
}
MLSendReplyProcessor.java
实现主类
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
* Find number of replies sent by each person
*
* @author Srinath Perera (hemapani@apache.org)
*/
public class MLSendReplyProcessor {
//public static SimpleDateFormat dateFormatter = new SimpleDateFormat("dd/MMMMM/yyyy:hh:mm:ss z");
//public static final Pattern httplogPattern = Pattern
// .compile("([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");
public static class AMapper extends Mapper<Text, Text, Text, Text> {
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
//String[] tokens = value.toString().split("");
//String from = tokens[0];
//String subject = tokens[1];
//System.out.println("from:"+from);
//String date = tokens[2];
//System.out.println(from + "=" + date);
System.out.println(value.toString());
context.write(new Text(key), new Text(value));
}
}
/**
* <p>
* Reduce function receives all the values that has the same key as the
* input, and it output the key and the number of occurrences of the key as
* the output.
* </p>
*/
public static class AReducer extends Reducer<Text, Text, Text, IntWritable> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (Text val : values) {
sum = sum + 1;
}
//System.out.println(key + "=" + sum);
System.out.println(key.toString());
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "LogProcessingHitsByLink");
job.setJarByClass(MLSendReplyProcessor.class);
job.setMapperClass(AMapper.class);
// Uncomment this to
// job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(AReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(MboxFileFormat.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}