package com.iminer.alg.review.movie.xinjian;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.Pair;
import com.iminer.alg.review.opinion.ClassifierModelPredict;
import com.iminer.tool.common.util.Tools;
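/**
 * MapReduce job that labels review text with a pre-trained
 * ClassifierModelPredict model: main() serializes the model, ships it to
 * HDFS and the DistributedCache, each mapper deserializes it in setup(),
 * and map() emits (predicted label, original text) pairs.
 */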
public class predictMR extends Configured implements Tool {
public static class MRTemplateNewMapper extends
Mapper<Text, Text, IntWritable, Text> {
ClassifierModelPredict cmp; // deserialized from the DistributedCache in setup()
@Override
// Skip this step if there are no resources to initialize.
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(conf);
try {
URI[] uriArr = DistributedCache.getCacheFiles(conf);
for (URI uri : uriArr) {
// Look for the serialized classifier model shipped via the distributed cache.
if (uri.toString().contains("predict.out")) {
FSDataInputStream input = fs.open(new Path(uri.toString()));
try {
cmp = (ClassifierModelPredict) Tools.getObjectFromBytes(input2byte(input));
System.out.println("Model loaded successfully!");
} catch (Exception e) {
e.printStackTrace();
} finally {
input.close();
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
protected void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
// Classify the input text and emit (predicted label, original text).
String content = value.toString();
try {
Pair<Integer, Double> pair = cmp.mygetPrediction(content, true);
context.write(new IntWritable(pair.getFirst().intValue()), value);
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
// Skip this step if there are no resources to release.
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
}
}
// Template reducer, currently unused (the setReducerClass call in run() is commented out).
public static class MRTemplateNewReducer extends
Reducer<Writable, Writable, Writable, Writable> {
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
}
@Override
protected void reduce(Writable key, Iterable<Writable> values,
Context context) throws IOException, InterruptedException {
for (Writable value : values) {
// ... process each value here ...
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
super.cleanup(context);
}
}
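/**
 * Configures and submits the prediction job: SequenceFile input, text
 * output, mapper only. args[0] is the input path, args[1] the output path.
 */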
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
// Set the job name.
job.setJobName("Data predict");
job.setJarByClass(predictMR.class);
// Set the mapper; no reducer is set, so Hadoop's default identity reducer runs.
job.setMapperClass(MRTemplateNewMapper.class);
// job.setReducerClass(MRTemplateNewReducer.class);
// Set the output key/value types.
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
// Set the input and output formats.
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths, deleting the output path first if it exists.
FileInputFormat.addInputPath(job, new Path(args[0]));
FileSystem fs = FileSystem.get(getConf());
Path outPutPath = new Path(args[1]);
if (fs.exists(outPutPath))
fs.delete(outPutPath, true);
FileOutputFormat.setOutputPath(job, outPutPath);
// Submit the job and wait for completion.
return job.waitForCompletion(true) ? 0 : 1;
}
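/**
 * Driver entry point: serializes the classifier to predict.out, uploads it
 * to HDFS, registers it in the DistributedCache, and launches the job via
 * ToolRunner so that generic options such as -libjars are parsed.
 */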
public static void main(String[] args) throws Exception {
// Construct the classifier and serialize it to a local file.
ClassifierModelPredict cmp = new ClassifierModelPredict("lexAftRemove", "modelk2Final");
FileOutputStream fos = new FileOutputStream("predict.out");
ObjectOutputStream oos = new ObjectOutputStream(fos);
oos.writeObject(cmp);
oos.close();
// Move the serialized model to HDFS and register it in the distributed cache.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
fs.copyFromLocalFile(true, new Path("predict.out"), new Path("/user/lvxinjian/" + "predict.out"));
DistributedCache.addCacheFile(new URI("/user/lvxinjian/" + "predict.out"), conf);
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
int result = ToolRunner.run(conf, new predictMR(), otherArgs);
System.exit(result);
}
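/** Reads an InputStream fully into a byte array. */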
public static final byte[] input2byte(InputStream inStream)
throws IOException {
ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
byte[] buff = new byte[4096];
int rc;
while ((rc = inStream.read(buff)) > 0) {
swapStream.write(buff, 0, rc);
}
return swapStream.toByteArray();
}
}
// Note: the map phase depends on several external jars; they must be shipped with the job via -libjars.
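// A sketch of a possible invocation (jar and path names are placeholders, not from the original build):
//   hadoop jar predict.jar com.iminer.alg.review.movie.xinjian.predictMR \
//       -libjars mahout-core.jar,iminer-alg.jar \
//       /user/lvxinjian/input /user/lvxinjian/output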