在项目开发中,经常需要实现两个“表”的 join 操作,其中一个表数据量小,另一个表很大。这种场景在实际中非常常见,例如“订单日志” join “产品信息”,此时可以采用 map 端连接。
原理:适用于“大表 + 小表”的场景,其中小表可以整体载入内存。
在 map() 执行之前(即 Mapper 的 setup() 方法中),把小表文件加载到内存,构建成一个 Map。
连接直接在 map 端完成,不需要 reduce 阶段,因此可以大大提高 join 操作的并发度,加快处理速度。
1、JoinMapper
package hadoop.join.map;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
/**
* Mapper
*/
public class JoinMapper extends Mapper<LongWritable,Text ,Text,NullWritable>{
private Map<String,String> customers ;
/**
 * Loads the customers file into memory and indexes it for the join.
 *
 * <p>Reads the path stored under the {@code customers.path} configuration key, opens it
 * through the {@link FileSystem} obtained from the job configuration, and stores every
 * line in {@code customers}, keyed by the text before the first comma (the whole line
 * when it contains no comma).
 *
 * @param context task context supplying the job configuration
 * @throws IOException if the customers file cannot be opened or read
 * @throws InterruptedException declared by the overridden {@code setup} method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    customers = new HashMap<String, String>();
    String path = context.getConfiguration().get("customers.path");
    // FileSystem.get may hand back a shared instance, so only the stream is closed here.
    FileSystem fs = FileSystem.get(context.getConfiguration());
    // try-with-resources: closing the reader also closes the underlying input stream.
    // The charset is explicit so decoding does not depend on the platform default.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
            fs.open(new Path(path)), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = br.readLine()) != null) {
            // Take the first field with indexOf rather than split(",")[0]:
            // split returns an empty array for a line made only of commas.
            int comma = line.indexOf(',');
            String key = comma < 0 ? line : line.substring(0, comma);
            customers.put(key, line);
        }
    }
}
2、App
package hadoop.join.map;
import com.it18zhang.hadoop.lean.key.DataLeanMapper1;
import com.it18zhang.hadoop.lean.key.DataLeanMapper2;
import com.it18zhang.hadoop.lean.key.DataLeanReducer1;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the map-side join job: configures a map-only job that runs
 * {@link JoinMapper} over the orders input.
 */
public class App {
    /**
     * Configures and runs the job.
     *
     * @param args {@code [0]} input path, {@code [1]} output path, {@code [2]} customers
     *             file path; when fewer than three arguments are supplied, the built-in
     *             local sample paths are used instead
     * @throws Exception if job setup or execution fails, or the job finishes unsuccessfully
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the local sample paths only when no full argument set was given,
        // so real command-line arguments are no longer overwritten.
        if (args == null || args.length < 3) {
            args = new String[]{"d:/java/mr/join/orders.txt", "d:/java/mr/out", "d:/java/mr/join/customers.txt"};
        }

        Configuration conf = new Configuration();
        // The mapper reads the customers file location from this key in setup().
        conf.set("customers.path", args[2]);

        // Remove any existing output directory before the job is submitted.
        Path outputPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("join-map");
        job.setJarByClass(App.class);
        job.setMapperClass(JoinMapper.class);

        // Add the input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path
        FileOutputFormat.setOutputPath(job, outputPath);

        // Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Map-only job: the join is done entirely in the mapper.
        job.setNumReduceTasks(0);

        // Run the job and surface a failure instead of silently ignoring the result.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job join-map failed");
        }
    }
}