MapReduce: counting grouped word totals in HDFS files and writing the results to a relational database

package com.zicontd.mifss.datawork.dboutput;

import java.io.DataInput;
import java.io.DataOutput;

/**
 * Title: DiseaseStatistics
 * Description:
 * Company: http://www.zicontd.com/
 *
 * @author WHQ
 * @date May 1, 2019
 */

/***********************
The Map step extends the Mapper class in the org.apache.hadoop.mapreduce package and overrides its map method. If you add two lines in map that print the key and value to the console, you can see that value holds one line of the text file (a line break marks the end of a line), while key is the offset of the first character of that line from the beginning of the file. A StringTokenizer then splits each line into individual words, which are emitted as the map output; everything else is handled by the MapReduce framework.

The Reduce step extends the Reducer class in the org.apache.hadoop.mapreduce package and overrides its reduce method. In the map output, each key is a single word and values is the list of counts recorded for that word. Since the map output is the reduce input, the reduce method only needs to iterate over values and sum them to obtain the total count of each word.

In MapReduce, a Job object manages and runs a computation and provides methods for configuring its parameters. Here the job uses TokenizerMapper for the Map step and IntSumReducer for the Reduce step, and sets the map output types: Text for the key and IntWritable for the value. In this example the input path is hardcoded and registered through FileInputFormat, while the output is written to the database through DBOutputFormat rather than to a file.

*********************/

import java.io.IOException;
import java.net.URI;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.log4j.Logger;

import com.zicontd.mifss.datawork.dbinput.KC86DATA_HDFS;
import com.zicontd.mifss.datawork.dboutput.KC86DATA_ORACLE.KC86Entity;
import com.zicontd.mifss.datawork.dboutput.DiseaseStatistics.Bt;

public class DiseaseStatistics {

/*private static final String input_path = "hdfs://IP:9000/home/kc86_intput";
private static final String driver = "com.mysql.jdbc.Driver";
private static final String url = "jdbc:mysql://localhost:3306/zicontd?useUnicode=true&characterEncoding=UTF-8";
private static final String username = "root";
private static final String password = "root";*/

private static final String input_path = "hdfs://IP:9000/home/kc86_intput";
private static final String driver = "oracle.jdbc.driver.OracleDriver";
private static final String url = "jdbc:oracle:thin:@IP:1521:orcl";
private static final String username = "USERNAME";
private static final String password = "PASSWORD";

private final static Logger log = Logger.getLogger(DiseaseStatistics.class);

public static void  main(String[] args) throws IOException, ClassNotFoundException, InterruptedException 
{
	// Propagate the HADOOP_USER_NAME configured in the current runtime environment, if any
	String hadoop_user_name = System.getenv("HADOOP_USER_NAME");
	if (hadoop_user_name != null) {
		System.setProperty("HADOOP_USER_NAME", hadoop_user_name);
	}
	
	Configuration conf = new Configuration();

	DBConfiguration.configureDB(conf, driver, url,username,password);
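	// configureDB stores the JDBC driver class, URL and credentials in the job Configuration,
	// so that DBOutputFormat can open database connections from the tasks at write time.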
	
	Job job = Job.getInstance(conf);
	
	job.setJarByClass(DiseaseStatistics.class);
	job.setMapperClass(TokenizerMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    
    
    // Reducer and output format configuration
	job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Bt.class);
    job.setOutputValueClass(NullWritable.class);
    
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(DBOutputFormat.class);  
	
	
    FileSystem fs = FileSystem.get(URI.create(input_path), conf,"root");
	Path p = new Path(input_path);
	if(!fs.exists(p)){
		//fs.delete(p,true);
		//System.out.println("Output path exists and has been deleted!");
		log.info("Input path does not exist: " + input_path);
	}
	
	// Read the files under the input path
	FileInputFormat.setInputPaths(job, p);
	// The field names given here must match, in number and order, the fields read and written by the Bt class
	String[] fieldNames = new String[] {"NAME","NUM"};
	DBOutputFormat.setOutput(job, "\"KC86_DiseaseStatistics\"", fieldNames);
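	// DBOutputFormat builds an INSERT statement of the form
	//   INSERT INTO "KC86_DiseaseStatistics" (NAME, NUM) VALUES (?, ?)
	// and each reduce output key (a Bt record) fills the placeholders via Bt.write(PreparedStatement).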
	System.exit(job.waitForCompletion(true) ? 0 : 1);
	
	
}




//Implement the map processing logic by extending the Mapper class
public static class TokenizerMapper extends Mapper<LongWritable,Text , Text,IntWritable> 
{
    private static final IntWritable one = new IntWritable(1);
    
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
    		throws IOException, InterruptedException {
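    	// The input lines are assumed to be the toString() form of a record, for example (hypothetical sample):
    	//   "KC86Entity [AAC001=..., AKC185=<disease name>, ...]"
    	// The code below keeps the text between '[' and ']' and pulls out the AKC185 value.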
    	
    	String newStr = value.toString().trim();
		String newStr1 = newStr.substring(newStr.indexOf("[")+1,newStr.indexOf("]")); 
		String[] sp = newStr1.split(",");
		StringBuffer AKC185sb = new StringBuffer();
		for(String s1:sp)
		{
			String tempS = s1;
			String[] s = tempS.split("=");
				
				//获取"AKC185"的值
				if(s[0].trim().equals("AKC185"))
				{
					AKC185sb.append(s[1].trim());
				}
		} 
		
     /*  StringTokenizer itr = new StringTokenizer(AKC185sb.toString());
       
     while(itr.hasMoreTokens()) 
        {
            this.word.set(itr.nextToken());
            context.write(word, one);
        }*/
		word.set(AKC185sb.toString().trim());
		context.write(word, one);
    }
}




    // Extend the Reducer class and override its reduce method
     public static class IntSumReducer extends Reducer<Text, IntWritable, Bt, NullWritable> 
    {
    	
    	 private LongWritable result = new LongWritable();
        
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        	// Accumulator for the total count of this key
            long sum = 0;
 
            // Iterate over all values in this key group and add them up
            for (IntWritable val : values) {
                sum += val.get();
            }
 
            this.result.set(sum);
            Bt bt = new Bt(key.toString(), result.get());
            context.write(bt, NullWritable.get());
        }
    }
     
     
     
     
     public static class Bt  implements Writable, DBWritable
     {

     	private String bname;
     	private Long num;
     	
     	
 		/**
 		 * @param bname
 		 * @param num
 		 */
 		public Bt(String bname, Long num) {
 			super();
 			this.bname = bname;
 			this.num = num;
 		}
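
 		/**
 		 * No-argument constructor: Hadoop generally instantiates Writable
 		 * implementations via reflection, which requires a default constructor.
 		 */
 		public Bt() {
 		}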

 		public String getBname() {
 			return bname;
 		}

 		public void setBname(String bname) {
 			this.bname = bname;
 		}

 		public Long getNum() {
 			return num;
 		}

 		public void setNum(Long num) {
 			this.num = num;
 		}

 		@Override
 		public void write(PreparedStatement statement) throws SQLException {
 			// Bind the record fields to the INSERT placeholders used by DBOutputFormat
 			statement.setString(1, bname);
 			statement.setLong(2, num);
 		}

 		@Override
 		public void readFields(ResultSet resultSet) throws SQLException {
 			// Populate the record from a JDBC ResultSet (used when reading from a database)
 			bname      = resultSet.getString(1); 
 			num      = resultSet.getLong(2);
 		}

 		@Override
 		public void write(DataOutput out) throws IOException {
 			// Hadoop serialization of the record fields
 			out.writeUTF(this.bname);
 			out.writeLong(this.num);
 		}

 		@Override
 		public void readFields(DataInput in) throws IOException {
 			// Hadoop deserialization of the record fields
 			this.bname		 =in.readUTF();
 			this.num		 =in.readLong();
 		}

     	
     }

}
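
The job writes into the Oracle table "KC86_DiseaseStatistics" named in DBOutputFormat.setOutput(), so that table has to exist before the job runs. Below is a minimal, self-contained sketch for creating it through plain JDBC; the class name CreateDiseaseStatisticsTable is only for illustration, and the column types (VARCHAR2 for the disease name, NUMBER for the count) are assumptions that should be adjusted to the real schema.

package com.zicontd.mifss.datawork.dboutput;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateDiseaseStatisticsTable {

	public static void main(String[] args) throws Exception {
		// Same driver, URL and credentials as the MapReduce job above
		Class.forName("oracle.jdbc.driver.OracleDriver");
		try (Connection conn = DriverManager.getConnection(
				"jdbc:oracle:thin:@IP:1521:orcl", "USERNAME", "PASSWORD");
			 Statement stmt = conn.createStatement()) {
			// Quoted identifier matches the quoted table name passed to DBOutputFormat.setOutput()
			stmt.execute("CREATE TABLE \"KC86_DiseaseStatistics\" ("
					+ "NAME VARCHAR2(200), "  // disease name (AKC185 value, the reduce key)
					+ "NUM NUMBER(19)"        // aggregated count per disease
					+ ")");
		}
	}
}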
