------------------------------------ Reduce-side join -------------------------------
package demo06.reducejoin;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMap extends Mapper<LongWritable, Text, Text, Text> {

    Text k2 = new Text();

    /**
     * Tag each record: the product id becomes k2 and the whole line becomes v2.
     * The file name of the split feeding k1/v1 tells the two tables apart:
     * a name starting with "p" is the product file, otherwise it is the order file.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        Path path = inputSplit.getPath();
        String pathName = path.getName();
        String line = value.toString();

        if (pathName.startsWith("p")) {
            // Product table: the product id is the first field
            String[] splits = line.split(",");
            k2.set(splits[0]);
            context.write(k2, value);
        } else {
            // Order table: the product id is the third field
            String[] splits = line.split(",");
            k2.set(splits[2]);
            context.write(k2, value);
        }
    }
}
package demo06.reducejoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceJoinReduce extends Reducer<Text, Text, Text, NullWritable> {

    /**
     * The join relies on Hadoop grouping all values with the same key into one collection,
     * so the product record and the order records for one product id arrive together in a
     * single reduce call. Order data goes into orderLine, product data goes into productLine,
     * and concatenating the two completes a simple join.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String orderLine = "";
        String productLine = "";
        for (Text text : values) {
            // Product ids start with "p", so a value beginning with "p" is a product record.
            // Note: if several orders share one product id, this simple version keeps only the last order line.
            if (text.toString().startsWith("p")) {
                productLine = text.toString();
            } else {
                orderLine = text.toString();
            }
        }
        context.write(new Text(orderLine + "\t" + productLine), NullWritable.get());
    }
}
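The original post does not show the driver for the reduce-side join. Below is a minimal sketch of one, assuming a hypothetical class name ReduceJoinMain and hypothetical HDFS input/output paths; the map output key and value types must both be Text to match ReduceJoinMap.

package demo06.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class ReduceJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "reduceJoin");
        job.setJarByClass(ReduceJoinMain.class);

        // Product and order files sit under the same input directory (hypothetical path),
        // so the mapper can tell them apart by file name
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/reducejoin/input"));

        job.setMapperClass(ReduceJoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceJoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/reducejoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because both tables are shuffled to the reducers by product id, the reduce-side join carries all the data over the network, which is its main cost compared with the map-side join below.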
----------------------------------------------------- Map-side join -------------------------------------------
Add the cache file in the driver (main) program:
//Add our cache file to the distributed cache
DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);
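For context, here is a minimal driver sketch showing where that call sits. The class name MapJoinMain, the job name, and all paths except the cache file are assumptions; since the join completes in the mapper, the number of reduce tasks is set to zero.

package demo07.mapjoin;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MapJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Register the small product table as a cache file before the job is created
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);

        Job job = Job.getInstance(configuration, "mapJoin");
        job.setJarByClass(MapJoinMain.class);

        // Only the (large) order table is read as normal job input (hypothetical path)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/mapjoin/input"));

        job.setMapperClass(MapJoinMap.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // The join is finished in the mapper, so no reducers (and no shuffle) are needed
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/mapjoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The mapper below loads the cached product file in setup() and performs the lookup in map().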
package demo07.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    // In-memory map filled in setup(): product id -> product line
    HashMap<String, String> map;

    /**
     * Override setup() to fetch the cache file and load it into the map: the file is read
     * through the FileSystem API and the products are stored as an in-memory collection.
     * map() then matches each order record against that collection.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        map = new HashMap<String, String>();
        // Get the configuration from the context
        Configuration configuration = context.getConfiguration();
        // Only one cache file was registered in the driver
        URI[] caches = DistributedCache.getCacheFiles(configuration);
        URI cacheFile = caches[0];
        // cacheFile looks like hdfs://node01:8020/.....
        // Get a FileSystem for that URI
        FileSystem fileSystem = FileSystem.get(cacheFile, configuration);
        // Open an input stream on the cached file
        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile));
        // Read the stream as text, line by line
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fsDataInputStream));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            String[] lineArray = line.split(",");
            // Key the product table by its product id (first field)
            map.put(lineArray[0], line);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split(",");
        // Look up the product record by the order's product id (third field)
        String product = map.get(splits[2]);
        // Concatenate the order line with its matching product line
        context.write(new Text(value.toString() + "\t" + product), NullWritable.get());
    }
}
This article looked at the two join strategies in Hadoop MapReduce: the map-side join and the reduce-side join. It focused on how the map-side join uses cached files to combine the two datasets, as well as the uses, advantages, and drawbacks of both approaches in big-data processing.