package com.iminer.alg.review.movie.xinjian;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.Pair;
import com.iminer.alg.review.opinion.ClassifierModelPredict;
import com.iminer.tool.common.util.Tools;
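/**
 * MapReduce job that labels review text with a pre-trained
 * ClassifierModelPredict model: main() serializes the model, ships it to
 * HDFS and the DistributedCache, each mapper deserializes it in setup(),
 * and map() emits (predicted label, original text) pairs.
 */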
public class predictMR extends Configured implements Tool {
public static class MRTemplateNewMapper extends
Mapper<Text, Text, IntWritable, Text> {
ClassifierModelPredict cmp; // deserialized from the DistributedCache in setup()
@Override
// Skip this step if there are no resources to initialize.
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(conf);
try {
URI[] uriArr = DistributedCache.getCacheFiles(conf);
for (URI uri : uriArr) {
// Look for the serialized classifier model shipped via the distributed cache.
if (uri.toString().contains("predict.out")) {
FSDataInputStream input = fs.open(new Path(uri.toString()));
try {
cmp = (ClassifierModelPredict) Tools.getObjectFromBytes(input2byte(input));
System.out.println("Model loaded successfully!");
} catch (Exception e) {
e.printStackTrace();
} finally {
input.close();
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
protected void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
// Classify the input text and emit (predicted label, original text).
String content = value.toString();
try {
Pair<Integer, Double> pair = cmp.mygetPrediction(content, true);
context.write(new IntWritable(pair.getFirst().intValue()), value);
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
// Skip this step if there are no resources to release.
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
}
}
// Template reducer, currently unused (the setReducerClass call in run() is commented out).
public static class MRTemplateNewReducer extends
Reducer<Writable, Writable, Writable, Writable> {
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
}
@Override
protected void reduce(Writable key, Iterable<Writable> values,
Context context) throws IOException, InterruptedException {
for (Writable value : values) {
// ... process each value here ...
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
}
}
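/**
 * Configures and submits the prediction job: SequenceFile input, text
 * output, mapper only. args[0] is the input path, args[1] the output path.
 */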
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
// Set the job name.
job.setJobName("Data predict");
job.setJarByClass(predictMR.class);
// Set the mapper; no reducer is set, so Hadoop's default identity reducer runs.
job.setMapperClass(MRTemplateNewMapper.class);
// job.setReducerClass(MRTemplateNewReducer.class);
// Set the output key/value types.
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
// Set the input and output formats.
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths, deleting the output path first if it exists.
FileInputFormat.addInputPath(job, new Path(args[0]));
FileSystem fs = FileSystem.get(getConf());
Path outPutPath = new Path(args[1]);
if (fs.exists(outPutPath))
fs.delete(outPutPath, true);
FileOutputFormat.setOutputPath(job, outPutPath);
// Submit the job and wait for completion.
return job.waitForCompletion(true) ? 0 : 1;
}
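/**
 * Driver entry point: serializes the classifier to predict.out, uploads it
 * to HDFS, registers it in the DistributedCache, and launches the job via
 * ToolRunner so that generic options such as -libjars are parsed.
 */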
public static void main(String[] args) throws Exception {
// Construct the classifier and serialize it to a local file.
ClassifierModelPredict cmp = new ClassifierModelPredict("lexAftRemove", "modelk2Final");
FileOutputStream fos = new FileOutputStream("predict.out");
ObjectOutputStream oos = new ObjectOutputStream(fos);
oos.writeObject(cmp);
oos.close();
// Move the serialized model to HDFS and register it in the distributed cache.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
fs.copyFromLocalFile(true, new Path("predict.out"), new Path("/user/lvxinjian/" + "predict.out"));
DistributedCache.addCacheFile(new URI("/user/lvxinjian/" + "predict.out"), conf);
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
int result = ToolRunner.run(conf, new predictMR(), otherArgs);
System.exit(result);
}
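/** Reads an InputStream fully into a byte array. */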
public static final byte[] input2byte(InputStream inStream)
throws IOException {
ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
byte[] buff = new byte[4096];
int rc;
while ((rc = inStream.read(buff)) > 0) {
swapStream.write(buff, 0, rc);
}
return swapStream.toByteArray();
}
}
// Note: the map phase depends on several external jars; they must be shipped with the job via -libjars.
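// A sketch of a possible invocation (jar and path names are placeholders, not from the original build):
//   hadoop jar predict.jar com.iminer.alg.review.movie.xinjian.predictMR \
//       -libjars mahout-core.jar,iminer-alg.jar \
//       /user/lvxinjian/input /user/lvxinjian/output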