Spark二次排序

 

 

package cn.spark.study.core.upgrade.applog;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Mobile app access-traffic log analysis example.
 *
 * <p>Reads a tab-separated access log, sums the upstream and downstream traffic of
 * every device ID while keeping its earliest timestamp, sorts the devices with
 * {@code sortByKey(false)} on an (upstream traffic, downstream traffic, timestamp)
 * key, and prints the first ten entries.
 *
 * @author Administrator
 */
public class AppLogSpark {

	/** Log file that is read when no path is given on the command line. */
	private static final String DEFAULT_ACCESS_LOG_PATH =
			"C://Users//Administrator//Desktop//access.log";

	/** Minimum number of tab-separated fields a log line must contain. */
	private static final int MIN_FIELD_COUNT = 4;

	/** Number of leading entries of the sorted result that are printed. */
	private static final int TOP_N = 10;

	/**
	 * Runs the analysis job.
	 *
	 * @param args optional; {@code args[0]} is the path of the access log. When it is
	 *             absent the built-in default path is used.
	 * @throws Exception if the job fails
	 */
	public static void main(String[] args) throws Exception {
		String accessLogPath = (args != null && args.length > 0)
				? args[0] : DEFAULT_ACCESS_LOG_PATH;

		// Create the Spark configuration and context
		SparkConf conf = new SparkConf()
				.setAppName("AppLogSpark")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// try/finally so that the context is closed even when a job step throws
		try {
			// Initial RDD holding every line of the log file
			JavaRDD<String> accessLogRDD = sc.textFile(accessLogPath);

			// Map to key-value form in preparation for reduceByKey
			JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
					mapAccessLogRDD2Pair(accessLogRDD);

			// Per device ID: total upstream traffic, total downstream traffic,
			// earliest timestamp
			JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
					aggregateByDeviceID(accessLogPairRDD);

			// Swap to (sort key, deviceID) pairs
			JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD =
					mapRDDKey2SortKey(aggrAccessLogPairRDD);

			// Secondary sort, descending
			JavaPairRDD<AccessLogSortKey, String> sortedAccessLogRDD =
					accessLogSortRDD.sortByKey(false);

			// Take and print the leading entries
			List<Tuple2<AccessLogSortKey, String>> topDataList =
					sortedAccessLogRDD.take(TOP_N);
			for (Tuple2<AccessLogSortKey, String> data : topDataList) {
				AccessLogSortKey key = data._1;
				// Print the key's fields explicitly instead of relying on its toString()
				System.out.println(data._2
						+ ": upTraffic=" + key.getUpTraffic()
						+ ", downTraffic=" + key.getDownTraffic()
						+ ", timestamp=" + key.getTimestamp());
			}
		} finally {
			// Close the Spark context
			sc.close();
		}
	}

	/**
	 * Maps the raw log RDD to key-value form.
	 *
	 * <p>Each line is split on tab characters; the fields are read in the order
	 * timestamp, deviceID, upstream traffic, downstream traffic.
	 *
	 * @param accessLogRDD RDD of raw log lines
	 * @return RDD keyed by deviceID whose value carries the three numeric fields
	 */
	private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
			JavaRDD<String> accessLogRDD) {
		return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Tuple2<String, AccessLogInfo> call(String accessLog)
					throws Exception {
				// Split the line on tabs
				String[] accessLogSplited = accessLog.split("\t");
				if (accessLogSplited.length < MIN_FIELD_COUNT) {
					// Fail with the offending line rather than a bare index error
					throw new IllegalArgumentException("Malformed access log line, expected at least "
							+ MIN_FIELD_COUNT + " tab-separated fields: " + accessLog);
				}

				// parseLong yields the primitive directly, without boxing
				long timestamp = Long.parseLong(accessLogSplited[0]);
				String deviceID = accessLogSplited[1];
				long upTraffic = Long.parseLong(accessLogSplited[2]);
				long downTraffic = Long.parseLong(accessLogSplited[3]);

				// Wrap timestamp and traffic figures in the serializable value object
				AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp,
						upTraffic, downTraffic);

				return new Tuple2<String, AccessLogInfo>(deviceID, accessLogInfo);
			}

		});
	}

	/**
	 * Aggregates the records by deviceID.
	 *
	 * <p>For every deviceID the result holds the sum of the upstream traffic, the sum
	 * of the downstream traffic and the smallest timestamp.
	 *
	 * @param accessLogPairRDD log RDD in key-value form
	 * @return RDD with one aggregated record per deviceID
	 */
	private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
			JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
		return accessLogPairRDD.reduceByKey(new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {

			private static final long serialVersionUID = 1L;

			@Override
			public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2)
					throws Exception {
				long timestamp = Math.min(accessLogInfo1.getTimestamp(),
						accessLogInfo2.getTimestamp());
				long upTraffic = accessLogInfo1.getUpTraffic() + accessLogInfo2.getUpTraffic();
				long downTraffic = accessLogInfo1.getDownTraffic() + accessLogInfo2.getDownTraffic();

				return new AccessLogInfo(timestamp, upTraffic, downTraffic);
			}

		});
	}

	/**
	 * Re-keys the aggregated RDD by the secondary-sort key.
	 *
	 * @param aggrAccessLogPairRDD RDD aggregated by deviceID
	 * @return RDD whose key is the sort key and whose value is the deviceID
	 */
	private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
			JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
		return aggrAccessLogPairRDD.mapToPair(

				new PairFunction<Tuple2<String,AccessLogInfo>, AccessLogSortKey, String>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<AccessLogSortKey, String> call(
							Tuple2<String, AccessLogInfo> tuple) throws Exception {
						String deviceID = tuple._1;
						AccessLogInfo accessLogInfo = tuple._2;

						// Populate the sort key through its setters
						AccessLogSortKey accessLogSortKey = new AccessLogSortKey();
						accessLogSortKey.setUpTraffic(accessLogInfo.getUpTraffic());
						accessLogSortKey.setDownTraffic(accessLogInfo.getDownTraffic());
						accessLogSortKey.setTimestamp(accessLogInfo.getTimestamp());

						return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
					}

				});
	}

}

 

 

package cn.spark.study.core.upgrade.applog;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Secondary-sort key for access-log records.
 *
 * <p>Keys are ordered by upstream traffic first, then by downstream traffic, then by
 * timestamp. All comparison methods, {@link #equals(Object)} and {@link #hashCode()}
 * are based on exactly these three fields.
 *
 * @author Administrator
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

	private static final long serialVersionUID = 3702442700882342403L;

	private long upTraffic;
	private long downTraffic;
	private long timestamp;

	/** Creates a key with all three fields set to zero. */
	public AccessLogSortKey() {
	}

	/**
	 * Creates a fully populated key.
	 *
	 * @param upTraffic   upstream traffic
	 * @param downTraffic downstream traffic
	 * @param timestamp   timestamp
	 */
	public AccessLogSortKey(long upTraffic, long downTraffic, long timestamp) {
		this.upTraffic = upTraffic;
		this.downTraffic = downTraffic;
		this.timestamp = timestamp;
	}

	/**
	 * Compares two long values without subtracting them, so the result cannot be
	 * distorted by overflow or by narrowing to int.
	 *
	 * @return -1, 0 or 1
	 */
	private static int compareLongs(long left, long right) {
		if (left < right) {
			return -1;
		}
		return left > right ? 1 : 0;
	}

	/** @return true if this key sorts strictly after {@code other} */
	@Override
	public boolean $greater(AccessLogSortKey other) {
		return compare(other) > 0;
	}

	/** @return true if this key sorts after {@code other} or is equal to it */
	@Override
	public boolean $greater$eq(AccessLogSortKey other) {
		return compare(other) >= 0;
	}

	/** @return true if this key sorts strictly before {@code other} */
	@Override
	public boolean $less(AccessLogSortKey other) {
		return compare(other) < 0;
	}

	/** @return true if this key sorts before {@code other} or is equal to it */
	@Override
	public boolean $less$eq(AccessLogSortKey other) {
		return compare(other) <= 0;
	}

	/**
	 * Compares this key with another one field by field: upstream traffic, then
	 * downstream traffic, then timestamp.
	 *
	 * @param other the key to compare with
	 * @return a negative value, zero or a positive value if this key is less than,
	 *         equal to or greater than {@code other}
	 */
	@Override
	public int compare(AccessLogSortKey other) {
		int byUpTraffic = compareLongs(upTraffic, other.upTraffic);
		if (byUpTraffic != 0) {
			return byUpTraffic;
		}
		int byDownTraffic = compareLongs(downTraffic, other.downTraffic);
		if (byDownTraffic != 0) {
			return byDownTraffic;
		}
		return compareLongs(timestamp, other.timestamp);
	}

	/** Same ordering as {@link #compare(AccessLogSortKey)}. */
	@Override
	public int compareTo(AccessLogSortKey other) {
		return compare(other);
	}

	public long getUpTraffic() {
		return upTraffic;
	}

	public void setUpTraffic(long upTraffic) {
		this.upTraffic = upTraffic;
	}

	public long getDownTraffic() {
		return downTraffic;
	}

	public void setDownTraffic(long downTraffic) {
		this.downTraffic = downTraffic;
	}

	public long getTimestamp() {
		return timestamp;
	}

	public void setTimestamp(long timestamp) {
		this.timestamp = timestamp;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
		result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
		result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (obj == null || getClass() != obj.getClass()) {
			return false;
		}
		AccessLogSortKey other = (AccessLogSortKey) obj;
		return downTraffic == other.downTraffic
				&& timestamp == other.timestamp
				&& upTraffic == other.upTraffic;
	}

	/** Lists the three key fields so that a printed key is human-readable. */
	@Override
	public String toString() {
		return "AccessLogSortKey [upTraffic=" + upTraffic
				+ ", downTraffic=" + downTraffic
				+ ", timestamp=" + timestamp + "]";
	}

}

 

 

package cn.spark.study.core.upgrade.applog;

import java.io.Serializable;

/**
 * Serializable value object holding the numeric fields of one access-log record:
 * a timestamp plus the upstream and downstream traffic figures.
 *
 * @author Administrator
 */
public class AccessLogInfo implements Serializable {

	private static final long serialVersionUID = 5749943279909593929L;

	/** Timestamp of the record. */
	private long timestamp;
	/** Upstream traffic. */
	private long upTraffic;
	/** Downstream traffic. */
	private long downTraffic;

	/** Creates a record with every field set to zero. */
	public AccessLogInfo() {
		this(0L, 0L, 0L);
	}

	/**
	 * Creates a fully populated record.
	 *
	 * @param timestamp   timestamp of the record
	 * @param upTraffic   upstream traffic
	 * @param downTraffic downstream traffic
	 */
	public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
		this.timestamp = timestamp;
		this.upTraffic = upTraffic;
		this.downTraffic = downTraffic;
	}

	// ---- timestamp ----

	public long getTimestamp() {
		return this.timestamp;
	}

	public void setTimestamp(long timestamp) {
		this.timestamp = timestamp;
	}

	// ---- upstream traffic ----

	public long getUpTraffic() {
		return this.upTraffic;
	}

	public void setUpTraffic(long upTraffic) {
		this.upTraffic = upTraffic;
	}

	// ---- downstream traffic ----

	public long getDownTraffic() {
		return this.downTraffic;
	}

	public void setDownTraffic(long downTraffic) {
		this.downTraffic = downTraffic;
	}

}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值