左外连接
-- Template: left outer join of T1 with T2 on key k, grouped by field_1.
-- Every row of T1 is kept; T2 columns are NULL where no match exists.
-- Any selected column other than field_1 must be wrapped in an aggregate
-- (e.g. COUNT(DISTINCT field_2)) to satisfy the ANSI GROUP BY rule.
SELECT field_1, field_2, ...
FROM T1 LEFT OUTER JOIN T2
    ON T1.k = T2.k
GROUP BY field_1
MapReduce 实现分为两个阶段：第一阶段找出所有售出的商品及其关联的地址；第二阶段统计每个售出商品对应的不同地址个数。
public class LeftOuterJoin{
//读取输入参数
//创建javaSparkcontext对象
JavaSparkContext ctx = new JavaSparkContext();
//为用户创建RDD
JavaRDD<String> users=ctx.texFile(userInputfile,1);
JavaPairRDD<String, Tuple2<String,String>> userRDD=users.maptoPair(new PairFunction<
String, String, Tuple2<String,String>>(){
public Tuple2<String,Tuple2<String,String>> call (Strings){
String[] userRecord = s.split("/t");
Tuple2<String,String> location =new Tuple2<String,String>("L", userRecord[1]);
return new Tuple2<String,Tuple2<String,String>>(userRecord[0],location);
}
})
//为交易创建RDD
JavaRDD<String> transcations = ctx.textFile(transcationsInputFile,1);
JavaPairRDD<String,Tuple2<String,String>> transcationsRDD= transcations.mapToPair(
new PairFunction<String,String,Tuple2<String,String>>(){
public Tuple2<String,Tuple2<String,String>>call(String s){
String[] trascationRecord =s.split("/t");
Tuple2<String,String> product=new Tuple2<String,String>("P",transcationRecord[1]);
return new Tuple2<String,Tuple2<String,String>>(transcationRecord[2],product);
}
});
//为上述两个RDD创建并集
JavaPairRDD<String,Tuple2<String,String>> allRDD=transcationsRDD.union(usersRDD)
//调用groupByKey()创建一个javapairRDD
JavaPairRDD<String,Iterable<Tuple2<String,String>>> groupedRDD=allRDD.groupByKey();
//创建productlocationRDD
JavaPairRDD<String,String> productlocationRDD=groupedRDD.flatMapToPair(new PairFlatMapFunction<
Tuple2<String,Iterable<Tuple2<String,String>>>,
String,
String>(){
public Iterable<Tuple2<String,String>>
call(Tuple2<String,Iterable<Tuple2<String,String>>>s){
Iterable<Tuple2<String,String>> pairs= s._2;
String location= "UNKNOWN";
List<String> products =new ArrayList<String>();
for (Tuple2<String,String> t2:pairs){
if (t2._1.equals("L")){
location =t2._2;}
else {
products.add(t2._2);}
}
}
List<Tuple2<String,String>> kvlist= new ArrayList<Tuple2<String,String>>();
for (String product :products){
kvlist.add(new Tuple2<String,String>(product,location));
}
return kvlist;
});
//查找一个商品的所有地址生成javapairRDD
JavaPairRDD<String,Iterable<String>> productBylocations =productLocationRDD.groupByKeys();
List<Tuple2<String,List<String>>> debug3= productByLocations.collect();
//对输出做最终处理
JavaPairRDD<String,Tuple2<Set<String>,Integer>> prodcutByUniqueLocations=
productByLocations.mapValues(
new Function<Iterable<String>, Tuple2<Set<String>,Integer>>(){
public Tuple2<Set<String>,Integer> call (Iterable<String> s){
Set<String> uniqueLocations= new HashSet<String>();
for (String location :s){
uniqueLocations.add(location);}
return new Tuple2<Set<String>,Integer>(uniquelocations, uniquelocations.size());
}});
}
Spark 实现：使用 leftOuterJoin() 方法
public class SparkLeftOuterJoin{
public static void main (String[] args) throws Exception{
//读取输入参数
//创建spark上下文对象
JavaSparkContext ctx=new JavaSparkContext();
//为用户数据创建RDD
JavaRDD<String> user=ctx.textFile(userinputfile,1);
//为用户创建user location对
JavaPairRDD userRDD= user.mapToPair(new PairFunction<String,String,String>(){
public Tuple2<String,String>call(String s){
String[] userrecord=s.split("/t");
String userid= userRecord[0];
String location= userRecord[1];
return new Tuple2<String,String>(userID,location)}});
//为交易创建RDD
JavaRDD<String> transactions=ctx.textFile(transcationsinputfile,1);
//为交易创建 user product对
JavaPairRDD<String,String> transcationsRDD=transcations.mapToPair(
new PairFunction<String,String,String>(){
public Tuple2<String,String>call(String s){
String [] transcationrecord =s.split("/t");
String userID= transcationrecord[2];
String product= transcationrecord[1];
return new Tuple2<String,String>(userID,product);}});
//使用leftouterjoin方法
JavaPairRDD <String, Tuple2<String,Optional<String>>> joined=
transcationsRDD.leftOuterJoin(userRDD);
//创建product location对
JavaPairRDD<String,String> product=joined.mapToPair(
new PairFunction<Tuple2<String,Tuple2<String,Optional<String>>>,
String,String>(){
public Tuple2<String,String> call (Tuple2<<String,Tuple2<String,Optional<String>>>t){
Tuple2<String,Optional<String>> value =t._2;
return new Tuple2<String,String>(value._1,value._2)});
//按键对product location分组
JavaPairRDD<String,Iterable<String>> productByLocations= prodcuts.groupByKey();
// 按键创建product set<location>对
JavaPairRDD<String,Tuple2<Set<String>,Integer>> productByUniqueLocations=
productByLocations.mapValues(
new Function<Iterable<String, Tuple2<Set<String>,Integer>>(){
public Tuple2<Set<String>,Integer>call(Iterable<String>s){
Set<String> uniqueLocations= new Hashset<String>();
for (String location :s){
uniqueLocation.add(location);
}
return new Tuple2<String,Integer>(uniqueLocations, uniquelocations.size())}});
}}