package cn.spark.study.core.upgrade.applog;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Mobile app access-traffic log analysis example.
 *
 * <p>Reads a tab-separated access log, sums the traffic per device ID, orders the
 * devices by (upstream traffic, downstream traffic, timestamp) in descending order
 * and prints the first ten entries.
 *
 * @author Administrator
 */
public class AppLogSpark {

    /** Log location used when no path is passed on the command line. */
    private static final String DEFAULT_ACCESS_LOG_PATH =
            "C://Users//Administrator//Desktop//access.log";

    /** Number of leading tab-separated fields read from every log line. */
    private static final int REQUIRED_FIELD_COUNT = 4;

    /** Number of sorted records that are printed. */
    private static final int TOP_N = 10;

    /**
     * Runs the analysis job.
     *
     * @param args optional; {@code args[0]} is the path of the access log. When it is
     *             absent the original hard-coded default path is used.
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        String accessLogPath = (args != null && args.length > 0)
                ? args[0]
                : DEFAULT_ACCESS_LOG_PATH;

        // Create the Spark configuration and context.
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // try/finally so the context is closed even when a stage throws.
        try {
            // Initial RDD holding every line of the log file.
            JavaRDD<String> accessLogRDD = sc.textFile(accessLogPath);

            // Map to key-value form in preparation for reduceByKey.
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
                    mapAccessLogRDD2Pair(accessLogRDD);

            // Per device ID: total upstream traffic, total downstream traffic,
            // earliest timestamp.
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
                    aggregateByDeviceID(accessLogPairRDD);

            // Swap to (sort key, device ID) so the records can be sorted by key.
            JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD =
                    mapRDDKey2SortKey(aggrAccessLogPairRDD);

            // Secondary sort, descending.
            JavaPairRDD<AccessLogSortKey, String> sortedAccessLogRDD =
                    accessLogSortRDD.sortByKey(false);

            // Fetch and print the leading records.
            List<Tuple2<AccessLogSortKey, String>> top10DataList =
                    sortedAccessLogRDD.take(TOP_N);
            for (Tuple2<AccessLogSortKey, String> data : top10DataList) {
                System.out.println(data._2 + ": " + data._1);
            }
        } finally {
            // Shut down the Spark context.
            sc.close();
        }
    }

    /**
     * Maps the raw log RDD to key-value form.
     *
     * <p>Each line is split on tab characters; the first four fields are read as
     * timestamp, device ID, upstream traffic and downstream traffic.
     *
     * @param accessLogRDD RDD of raw log lines
     * @return RDD keyed by device ID whose value carries the three numeric fields
     */
    private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
            JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, AccessLogInfo> call(String accessLog)
                    throws Exception {
                // Split the line on tabs.
                String[] accessLogSplited = accessLog.split("\t");
                if (accessLogSplited.length < REQUIRED_FIELD_COUNT) {
                    // Fail with the offending line instead of a bare index error.
                    throw new IllegalArgumentException(
                            "Malformed access log line, expected at least "
                            + REQUIRED_FIELD_COUNT
                            + " tab-separated fields: " + accessLog);
                }

                // parseLong yields the primitive directly, without boxing.
                long timestamp = Long.parseLong(accessLogSplited[0]);
                String deviceID = accessLogSplited[1];
                long upTraffic = Long.parseLong(accessLogSplited[2]);
                long downTraffic = Long.parseLong(accessLogSplited[3]);

                // Wrap the numeric fields in the serializable value object.
                AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp,
                        upTraffic, downTraffic);

                return new Tuple2<String, AccessLogInfo>(deviceID, accessLogInfo);
            }

        });
    }

    /**
     * Aggregates the records by device ID.
     *
     * <p>For every device ID the result holds the sum of upstream traffic, the sum
     * of downstream traffic and the smallest timestamp.
     *
     * @param accessLogPairRDD key-value RDD keyed by device ID
     * @return RDD with one aggregated record per device ID
     */
    private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
        return accessLogPairRDD.reduceByKey(
                new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {

            private static final long serialVersionUID = 1L;

            @Override
            public AccessLogInfo call(AccessLogInfo accessLogInfo1,
                    AccessLogInfo accessLogInfo2) throws Exception {
                long timestamp = Math.min(accessLogInfo1.getTimestamp(),
                        accessLogInfo2.getTimestamp());
                long upTraffic = accessLogInfo1.getUpTraffic()
                        + accessLogInfo2.getUpTraffic();
                long downTraffic = accessLogInfo1.getDownTraffic()
                        + accessLogInfo2.getDownTraffic();

                // A fresh object is returned; neither input is mutated.
                return new AccessLogInfo(timestamp, upTraffic, downTraffic);
            }

        });
    }

    /**
     * Replaces the device-ID key with a secondary-sort key.
     *
     * @param aggrAccessLogPairRDD RDD aggregated by device ID
     * @return RDD whose key is the sort key and whose value is the device ID
     */
    private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
        return aggrAccessLogPairRDD.mapToPair(
                new PairFunction<Tuple2<String, AccessLogInfo>, AccessLogSortKey, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<AccessLogSortKey, String> call(
                    Tuple2<String, AccessLogInfo> tuple) throws Exception {
                String deviceID = tuple._1;
                AccessLogInfo accessLogInfo = tuple._2;

                // Built through the no-arg constructor and setters, which the
                // sort-key class is known to provide.
                AccessLogSortKey accessLogSortKey = new AccessLogSortKey();
                accessLogSortKey.setUpTraffic(accessLogInfo.getUpTraffic());
                accessLogSortKey.setDownTraffic(accessLogInfo.getDownTraffic());
                accessLogSortKey.setTimestamp(accessLogInfo.getTimestamp());

                return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
            }

        });
    }

}
package cn.spark.study.core.upgrade.applog;
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Secondary-sort key for access-log records.
 *
 * <p>Keys are ordered by upstream traffic first, then downstream traffic, then
 * timestamp (natural ascending order; callers choose the sort direction).
 * All relational operators are derived from {@link #compare(AccessLogSortKey)},
 * so the ordering is consistent with {@link #equals(Object)}.
 *
 * @author Administrator
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

    private static final long serialVersionUID = 3702442700882342403L;

    private long upTraffic;
    private long downTraffic;
    private long timestamp;

    /** Creates a key with every field set to zero; populate it through the setters. */
    public AccessLogSortKey() {
    }

    /**
     * Creates a fully populated key.
     *
     * @param upTraffic   upstream traffic (first sort criterion)
     * @param downTraffic downstream traffic (second sort criterion)
     * @param timestamp   timestamp (third sort criterion)
     */
    public AccessLogSortKey(long upTraffic, long downTraffic, long timestamp) {
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
        this.timestamp = timestamp;
    }

    /**
     * Three-way comparison of two longs that cannot overflow.
     *
     * @return -1, 0 or 1 when {@code a} is less than, equal to or greater than {@code b}
     */
    private static int compareLongs(long a, long b) {
        if (a < b) {
            return -1;
        }
        return a > b ? 1 : 0;
    }

    /** @return {@code true} if this key sorts strictly after {@code other} */
    @Override
    public boolean $greater(AccessLogSortKey other) {
        return compare(other) > 0;
    }

    /** @return {@code true} if this key sorts after or equal to {@code other} */
    @Override
    public boolean $greater$eq(AccessLogSortKey other) {
        return compare(other) >= 0;
    }

    /** @return {@code true} if this key sorts strictly before {@code other} */
    @Override
    public boolean $less(AccessLogSortKey other) {
        return compare(other) < 0;
    }

    /** @return {@code true} if this key sorts before or equal to {@code other} */
    @Override
    public boolean $less$eq(AccessLogSortKey other) {
        return compare(other) <= 0;
    }

    /**
     * Compares by upstream traffic, then downstream traffic, then timestamp.
     *
     * <p>The fields are compared directly rather than by casting their difference
     * to {@code int}: such a cast can overflow, flip the sign, or turn a non-zero
     * difference into zero.
     *
     * @param other the key to compare against
     * @return a negative, zero or positive value when this key is less than,
     *         equal to or greater than {@code other}
     */
    @Override
    public int compare(AccessLogSortKey other) {
        int result = compareLongs(upTraffic, other.upTraffic);
        if (result == 0) {
            result = compareLongs(downTraffic, other.downTraffic);
        }
        if (result == 0) {
            result = compareLongs(timestamp, other.timestamp);
        }
        return result;
    }

    /** Same ordering as {@link #compare(AccessLogSortKey)}. */
    @Override
    public int compareTo(AccessLogSortKey other) {
        return compare(other);
    }

    public long getUpTraffic() {
        return upTraffic;
    }

    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    public long getDownTraffic() {
        return downTraffic;
    }

    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
        result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
        result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        AccessLogSortKey other = (AccessLogSortKey) obj;
        return upTraffic == other.upTraffic
                && downTraffic == other.downTraffic
                && timestamp == other.timestamp;
    }

    /** Readable form listing the three sort fields, for printing results. */
    @Override
    public String toString() {
        return "AccessLogSortKey{upTraffic=" + upTraffic
                + ", downTraffic=" + downTraffic
                + ", timestamp=" + timestamp + "}";
    }

}
package cn.spark.study.core.upgrade.applog;
import java.io.Serializable;
/**
 * Serializable holder for the numeric fields of an access-log record:
 * timestamp, upstream traffic and downstream traffic.
 *
 * @author Administrator
 */
public class AccessLogInfo implements Serializable {

    private static final long serialVersionUID = 5749943279909593929L;

    /** Timestamp of the access. */
    private long timestamp;

    /** Upstream traffic. */
    private long upTraffic;

    /** Downstream traffic. */
    private long downTraffic;

    /** Creates an instance whose three fields are all zero. */
    public AccessLogInfo() {
        this(0L, 0L, 0L);
    }

    /**
     * Creates a fully populated instance.
     *
     * @param timestamp   timestamp of the access
     * @param upTraffic   upstream traffic
     * @param downTraffic downstream traffic
     */
    public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
        this.timestamp = timestamp;
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
    }

    // ---- readers ----

    /** @return the timestamp */
    public long getTimestamp() {
        return this.timestamp;
    }

    /** @return the upstream traffic */
    public long getUpTraffic() {
        return this.upTraffic;
    }

    /** @return the downstream traffic */
    public long getDownTraffic() {
        return this.downTraffic;
    }

    // ---- writers ----

    /** @param timestamp the new timestamp */
    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** @param upTraffic the new upstream traffic */
    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    /** @param downTraffic the new downstream traffic */
    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

}