1. 求TOP值
两个文件均包含4个字段:orderid、userid、payment、productid,求payment值最大的前N条记录(Top N)。
file1.txt
1,1768,50,155
2,1218,600,211
3,2239,788,242
4,3101,28,599
5,4899,290,129
6,3110,54,1201
7,4436,259,877
8,2369,7890,27
file2.txt
100,4287,226,233
101,6562,489,124
102,1124,33,17
103,3267,159,179
104,4569,57,125
105,1438,37,116
import org.apache.spark.sql.SparkSession
/**
 * Prints the N largest `payment` values found in comma-separated order files.
 *
 * A valid input line has exactly four comma-separated fields
 * (orderid, userid, payment, productid); the third field is the payment.
 * Blank lines, lines with a different field count, and lines whose payment
 * is not an integer are skipped instead of failing the job.
 *
 * Optional command-line arguments:
 *   - `args(0)`: input directory or file (defaults to [[DefaultInputPath]])
 *   - `args(1)`: how many payments to print (defaults to [[DefaultTopN]];
 *     non-numeric or non-positive values fall back to the default)
 */
object Test {
  import scala.util.Try

  /** Input location read when no path is given on the command line. */
  private val DefaultInputPath = "/Users/zz/Desktop/input/"

  /** Number of payments printed when no count is given on the command line. */
  private val DefaultTopN = 5

  /**
   * Extracts the payment (third field) from one input line.
   *
   * @param line raw text line from an input file
   * @return the payment, or `None` when the line does not have exactly four
   *         fields or the payment field is not a valid integer
   */
  private def parsePayment(line: String): Option[Int] =
    line.split(",") match {
      // Trim so that padded values such as " 50" are still accepted.
      case Array(_, _, payment, _) => Try(payment.trim.toInt).toOption
      case _                       => None
    }

  /**
   * Entry point: reads the input files, selects the largest payments and
   * prints them as `rank<TAB>payment`, one per line, largest first.
   *
   * @param args optional `[inputPath [topN]]`, see the object documentation
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val topN = args
      .lift(1)
      .flatMap(raw => Try(raw.trim.toInt).toOption)
      .filter(_ > 0)
      .getOrElse(DefaultTopN)

    val spark = SparkSession
      .builder()
      .appName("wc")
      .master("local")
      .getOrCreate()

    // Stop the session even when the job fails, so the local context is released.
    try {
      spark.sparkContext.setLogLevel("ERROR")

      // Request at least two partitions (the sample input consists of two files).
      val lines = spark.sparkContext.textFile(inputPath, 2)

      lines
        .flatMap(line => parsePayment(line))
        // top() keeps only the N largest values per partition and returns them
        // in descending order, avoiding a full shuffle sort of the dataset.
        .top(topN)
        .zipWithIndex
        .foreach { case (payment, index) => println(s"${index + 1}\t$payment") }
    } finally {
      spark.stop()
    }
  }
}
输出
1 7890
2 788
3 600
4 489
5 290

最低0.47元/天 解锁文章
1064





