参照文档加以修改和深化,mark一下
package mywork
import java.io.File
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/**
 * Spark word-count job.
 *
 * Reads the text file named by the first argument, counts the space-separated
 * words in it, saves the `(word, count)` pairs under the path named by the
 * second argument, and prints a short summary to standard output: a sample of
 * the counts plus the number of lines containing "ERROR" and the first such line.
 */
object WordCount {

  /** Maximum number of `(word, count)` pairs echoed to the console. */
  private val PreviewSize = 20

  /**
   * Job entry point.
   *
   * @param args `args(0)` is the input path, `args(1)` the output path;
   *             the process exits with status 1 when fewer than two are given
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.out.println("Usage: <input> <output>")
      System.exit(1)
    }
    val infile = args(0) // Should be some file on your system
    val outdir = args(1)

    println(" =======================")
    println(" || WordCount in Spark !||")
    println(" =======================")
    println(s"Input: $infile,size:${getFileSize(infile)}bytes")
    println("Onput: " + outdir)

    val conf = new SparkConf().setAppName("word count")
    val sc = new SparkContext(conf)
    try {
      // Cached because both the word count and the ERROR filter read it.
      val indata = sc.textFile(infile, 2).cache()

      // flatMap splits every line into words on single spaces,
      // map turns each word into a (word, 1) pair,
      // reduceByKey sums the 1s of identical words.
      val words = indata
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .reduceByKey((a, b) => a + b)

      // All lines that contain "ERROR".
      val errlineRDD = indata.filter(line => line.contains("ERROR"))

      words.saveAsTextFile(outdir)
      println()
      println("All words are counted!")

      // Print part of the result. The count stays a Long (no Int overflow), and
      // take() already returns fewer elements when fewer than PreviewSize exist.
      val distinctWords = words.count()
      val preview = words.take(PreviewSize)
      if (distinctWords > PreviewSize) {
        println(s"The first $PreviewSize words are ... ")
        preview.foreach(println)
        println(" ... ")
      } else {
        preview.foreach(println)
      }

      println()
      println(errlineRDD.count() + " lines with ERROR and the first line with ERROR is:")
      // first() throws on an empty RDD; take(1) is safe when no line matched.
      println(errlineRDD.take(1).headOption.getOrElse("(none)") + "\n")
    } finally {
      // Always release the context, even when one of the jobs above fails.
      sc.stop()
    }
  }

  /**
   * Returns the size in bytes of the file at `fname`.
   *
   * The path is inspected through `java.io.File`, i.e. on the file system of
   * the JVM running this method.
   *
   * @param fname path of the file to measure
   * @return the file length, or 0 when `fname` is null or does not denote an
   *         existing regular file (e.g. a directory or a missing path)
   */
  def getFileSize(fname: String): Long =
    Option(fname)
      .map(new File(_))
      .filter(_.isFile)
      .fold(0L)(_.length())
}
运行结果如下:
[root@sparkmaster bin]# ./spark-submit --class mywork.WordCount /opt/myjars/spark-wordcount-in-scala.jar /log /tmp/wordcount03
=======================
|| WordCount in Spark !||
=======================
Input: /log,size:102400015bytes
Onput: /tmp/wordcount03
16/02/28 04:47:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
All words are counted!
The first 20 words are ...
(15:50:27.516,78)
(15:50:26.205,78)
(15:50:22.326,76)
(15:50:26.586,77)
(15:50:25.056,76)
(15:50:19.248,73)
(15:50:21.324,78)
(15:50:20.304,76)
(15:50:25.119,77)
(15:50:22.506,77)
(15:50:24.870,78)
(15:50:26.007,77)
(15:50:19.116,74)
(15:50:20.475,78)
(15:50:20.559,80)
(15:50:21.483,78)
(15:50:24.249,67)
(15:50:19.032,73)
(15:50:22.401,78)
(15:50:25.122,77)
...
706207 lines with ERROR and the first line with ERROR is:
2015-11-15 15:50:18.864 GMT ERROR [28379:17590581916128] mrss.requesthandler - RequestHandler::svc failed to accept a new connection, error = 24
[root@sparkmaster bin]#