Tomcat 访问日志的分隔符使用 tab(在 XML 配置中用 tab 的字符引用 `&#09;` 表示),因为 tab 不容易出现在各种日志变量的内容中:
Tomcat 日志的格式化(用 `&#09;` 表示 tab,替换默认的空格分隔符;属性值内部的双引号需写成 `&quot;`): <Valve className="org.apache.catalina.valves.AccessLogValve" directory="logs" prefix="localhost_access_log" suffix=".txt" pattern="%h&#09;%l&#09;%u&#09;%t&#09;&quot;%r&quot;&#09;%s&#09;%b&#09;%D" /> |
日志的封装处理
package sparkcore

/**
 * One parsed line of a tab-separated access log.
 *
 * @param ip      column 0 of the line
 * @param time    column 3 of the line, kept verbatim (including the surrounding brackets)
 * @param request column 4 of the line, kept verbatim (including the surrounding quotes)
 * @param status  column 5 of the line parsed as an integer
 * @param length  column 6 of the line parsed as an integer; the "-" placeholder becomes 0
 */
case class LogField(
  ip: String,
  time: String,
  request: String,
  status: Int,
  length: Int)

object LogField {

  /** Number of leading columns that `parseLog` reads (indices 0 to 6). */
  private val RequiredFieldCount = 7

  /** Placeholder written in the byte-count column when the response carried zero bytes. */
  private val NoBytesMarker = "-"

  /**
   * Parses one tab-separated access-log line.
   *
   * Columns 1 and 2 are ignored, as is anything after column 6.
   *
   * @param log a single log line whose columns are separated by tab characters
   * @return the parsed record
   * @throws IllegalArgumentException if the line has fewer than seven columns
   * @throws NumberFormatException    if the status column, or a byte-count column other
   *                                  than "-", is not a valid integer
   */
  def parseLog(log: String): LogField = {
    val fields = log.split("\t")
    require(
      fields.length >= RequiredFieldCount,
      s"expected at least $RequiredFieldCount tab-separated fields but found ${fields.length}")

    // A response with zero bytes is logged as "-", so map that marker to 0.
    // The value must be an Int to match the `length` field of the case class.
    val length = if (fields(6) == NoBytesMarker) 0 else fields(6).toInt

    LogField(fields(0), fields(3), fields(4), fields(5).toInt, length)
  }

  /** Small demo: parses one sample line and prints its fields separated by tabs. */
  def main(args: Array[String]): Unit = {
    val log =
      "192.68.1.134\t-\t-\t[05/Oct/2017:17:36:41 +0800]\t\"GET /wget/finished/ HTTP/1.1\"\t200\t-"
    val parsed = parseLog(log)
    println(
      s"${parsed.ip}\t${parsed.time}\t${parsed.request}\t${parsed.status}\t${parsed.length}")
  }
}
注意日志字段的处理,特别是需要类型转换的。
package sparkcore

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
 * Local Spark job that prints a few statistics about a tab-separated access log:
 * response-size max/min/average, a count per response status, the top 3 client IPs
 * with more than 1000 requests, and the top 3 requested URIs.
 */
object LogAnalyer {

  /** Input used when no path is passed on the command line. */
  private val DefaultLogPath = "file:///D:/data/log.txt"

  /**
   * Runs every report against the log file.
   *
   * @param args optional; `args(0)` overrides the default input path
   */
  def main(args: Array[String]): Unit = {
    val logPath = args.headOption.getOrElse(DefaultLogPath)
    val conf = new SparkConf().setMaster("local").setAppName("LogAnalyer")
    val sc = new SparkContext(conf)
    try {
      // Cached because every report below re-reads the parsed records.
      val logsRDD = sc.textFile(logPath)
        .map(line => LogField.parseLog(line))
        .cache()
      try {
        printSizeStats(logsRDD)
        printStatusCounts(logsRDD)
        printTopIps(logsRDD)
        printTopUris(logsRDD)
      } finally {
        // Always release the cached blocks, even when a report fails.
        logsRDD.unpersist(blocking = true)
      }
    } finally {
      // Stop the context on every exit path.
      sc.stop()
    }
  }

  /** Prints the maximum, minimum and (integer) average response size over all records. */
  private def printSizeStats(logs: RDD[LogField]): Unit = {
    // Widen to Long before summing so the total cannot overflow Int.
    val sizes = logs.map(_.length.toLong)
    val count = sizes.count()
    if (count == 0) {
      // max/min/reduce all throw on an empty RDD, so report the situation instead.
      println("没有可统计的响应记录")
    } else {
      val maxSize = sizes.max()
      val minSize = sizes.min()
      val averageSize = sizes.reduce(_ + _) / count
      println("响应最大值:" + maxSize + " 最小值:" + minSize + " 平均值:" + averageSize)
    }
  }

  /** Prints how many times each response status occurs. */
  private def printStatusCounts(logs: RDD[LogField]): Unit = {
    logs.map(log => (log.status, 1))
      .reduceByKey(_ + _)
      // Collect first so the lines are printed by the driver, not inside executor tasks.
      .collect()
      .foreach(result => println(" 响应状态:" + result._1 + " 计数:" + result._2))
  }

  /** Prints the top 3 client IPs among those with more than 1000 requests. */
  private def printTopIps(logs: RDD[LogField]): Unit = {
    val topIps = logs.map(log => (log.ip, 1))
      .reduceByKey(_ + _)
      .filter(_._2 > 1000)
      // Swap to (count, ip) so that top() ranks by request count.
      .map { case (ip, hits) => (hits, ip) }
      .top(3)
    for ((hits, ip) <- topIps) {
      println("ip : " + ip + " 请求次数:" + hits)
    }
  }

  /** Prints the 3 most frequently requested URIs. */
  private def printTopUris(logs: RDD[LogField]): Unit = {
    val topUris = logs.map(_.request.split(" "))
      // The URI is the second space-separated token of the request column;
      // requests without one are skipped instead of throwing.
      .filter(_.length > 1)
      .map(parts => (parts(1), 1))
      .reduceByKey(_ + _)
      .map { case (uri, hits) => (hits, uri) }
      .sortByKey(ascending = false)
      .take(3)
    for ((hits, uri) <- topUris) {
      println("URI : " + uri + " 请求频次:" + hits)
    }
  }
}