Average
val conf = new SparkConf().setMaster("local[*]").setAppName("avg")
// sc: the SparkContext entry point
val sc = new SparkContext(conf)
sc.textFile(args(0))
  .map(line => (line.split(" ")(1), line.split(" ")(2)))
  .groupByKey()
  .map(tuple => {
    var sum = 0.0
    val num = tuple._2.size
    for (score <- tuple._2) {
      sum += score.toDouble
    }
    val avg = sum / num
    val formatAvg = f"$avg%.2f"
    (tuple._1, formatAvg)
  }).coalesce(1)
  .foreach(println)
sc.stop()
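A hedged alternative: groupByKey ships every score for a key into a single group. The same per-key average can be built from partial (sum, count) pairs with aggregateByKey; a minimal sketch, assuming the same space-separated layout with the key in field 1 and the score in field 2:
val conf = new SparkConf().setMaster("local[*]").setAppName("avg-agg")
val sc = new SparkContext(conf)
sc.textFile(args(0))
  .map(line => {
    val fields = line.split(" ")
    (fields(1), fields(2).toDouble)
  })
  .aggregateByKey((0.0, 0))(                       // accumulator: (sum, count)
    (acc, score) => (acc._1 + score, acc._2 + 1),  // fold one score into the accumulator
    (a, b) => (a._1 + b._1, a._2 + b._2))          // merge accumulators across partitions
  .mapValues(t => f"${t._1 / t._2}%.2f")           // format the average to two decimals
  .foreach(println)
sc.stop()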
Max and min
val conf = new SparkConf().setMaster("local[*]").setAppName("MaxAndMin")
val sc = new SparkContext(conf)
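// Approach 1: put every value under a single key, so the whole dataset is scanned in one group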
// val lines = sc.textFile(args(0))
// lines.filter(line => line.trim.length > 0)
//   .map(line => ("key", line.trim.toInt))
//   .groupByKey()
//   .map(x => {
//     var min = Integer.MAX_VALUE
//     var max = Integer.MIN_VALUE
//     for (num <- x._2) {
//       if (num > max) {
//         max = num
//       }
//       if (num < min) {
//         min = num
//       }
//     }
//     (max, min)
//   }).foreach(x => {
//     println("max\t" + x._1)
//     println("min\t" + x._2)
//   })
// Approach 2: spread the data over several random keys, compute per-key (max, min), then combine
val lines = sc.textFile(args(0))
println(lines.getNumPartitions)
// salt the key with a random partition index (requires: import scala.util.Random)
lines.filter(line => line.trim.length > 0)
  .map(line => (s"key${Random.nextInt(lines.getNumPartitions)}", line.trim.toInt))
  .groupByKey()
  .map(x => {
    var min = Integer.MAX_VALUE
    var max = Integer.MIN_VALUE
    for (num <- x._2) {
      if (num > max) {
        max = num
      }
      if (num < min) {
        min = num
      }
    }
    (max, min)
  }).map(tuple => ("key", tuple)).groupByKey()
  .map(x => {
    var min = Integer.MAX_VALUE
    var max = Integer.MIN_VALUE
    for (num <- x._2) {
      if (num._1 > max) {
        max = num._1
      }
      if (num._2 < min) {
        min = num._2
      }
    }
    (max, min)
  })
  .foreach(x => {
    println("max\t" + x._1)
    println("min\t" + x._2)
  })
sc.stop()
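For a single global maximum and minimum, both grouping stages above can also be replaced by one reduce over (max, min) pairs, which never materializes the values in memory; a minimal sketch, assuming one integer per non-empty line:
val conf = new SparkConf().setMaster("local[*]").setAppName("MaxAndMinReduce")
val sc = new SparkContext(conf)
val (max, min) = sc.textFile(args(0))
  .filter(line => line.trim.length > 0)
  .map(line => { val n = line.trim.toInt; (n, n) })                // seed each record as (max, min)
  .reduce((a, b) => (math.max(a._1, b._1), math.min(a._2, b._2)))  // merge pairs
println("max\t" + max)
println("min\t" + min)
sc.stop()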
Sort
val conf = new SparkConf().setMaster("local[*]").setAppName("sort")
// sc: the SparkContext entry point
val sc = new SparkContext(conf)
// Sort approach 1: sortBy alone
// sc.textFile(args(0)).sortBy(num => num.toInt).foreach(x => println(x))
// Approach 2: attach a rank with zipWithIndex
sc.textFile(args(0))
  .sortBy(num => num.toInt)
  .zipWithIndex()
  .map(num => num.swap)
  .partitionBy(new HashPartitioner(1)) // repartition to a single partition (requires: import org.apache.spark.HashPartitioner)
  .foreach(x => println(x))
// shut down the context
sc.stop()
Deduplication
val conf = new SparkConf().setMaster("local[*]").setAppName("distinct")
// SparkContext entry point
val sc = new SparkContext(conf)
// Approach 1: read the file, call distinct directly, print
// sc.textFile(args(0)).distinct().foreach(x => println(x))
// Approach 2: read the file
sc.textFile(args(0))
  .map(line => (line, ""))   // map into key-value pairs
  .groupByKey()              // grouping by key deduplicates
  .coalesce(1)               // set partitions to 1; coalesce can only reduce partitions and avoids a shuffle
  // .repartition(1) can increase or decrease partitions, at the cost of a shuffle
  .map(x => x._1)            // keep only the tuple's key
  .foreach(x => println(x))  // print
// shut down the context
sc.stop()
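To see the coalesce/repartition difference noted in the comments, the partition count can be inspected before and after each call; a minimal sketch over any text input:
val conf = new SparkConf().setMaster("local[*]").setAppName("partitions")
val sc = new SparkContext(conf)
val raw = sc.textFile(args(0))
println(raw.getNumPartitions)                 // partition count as read
println(raw.coalesce(1).getNumPartitions)     // 1: coalesce shrinks without a shuffle
println(raw.repartition(4).getNumPartitions)  // 4: repartition can grow or shrink, with a shuffle
sc.stop()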
Dedupe and sort
val conf = new SparkConf().setMaster("local[*]").setAppName("distinct-sort")
val sc = new SparkContext(conf)
// Dedupe, then sort numerically
sc.textFile(args(0))
  .distinct()
  .sortBy(num => num.toInt)
  .coalesce(1)
  .foreach(x => println(x))
sc.stop()
Word count
val conf = new SparkConf().setMaster("local[*]").setAppName("wordcount")
val sc = new SparkContext(conf)
// Word count
sc.textFile(args(0))
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))
  .reduceByKey((x, y) => x + y)
  .foreach(println)
// shut down the context
sc.stop()
Maximum temperature
val conf = new SparkConf().setMaster("local[*]").setAppName("MaxTemp")
val sc = new SparkContext(conf)
val lineRDD = sc.textFile(args(0))
val pairRDD = lineRDD.map(line => {
  val year = line.split("\t")(0).substring(0, 4)
  (year, line)
})
val maxRDD = pairRDD.groupByKey().map(tuple => {
  val list = tuple._2
  var maxTemp = Double.MinValue
  var day = ""
  for (valueStr <- list) {
    // track each year's highest temperature and the day it occurred
    if (valueStr.split("\t")(1).toDouble > maxTemp) {
      maxTemp = valueStr.split("\t")(1).toDouble
      day = valueStr.split("\t")(0)
    }
  }
  (tuple._1, maxTemp + " " + day)
})
maxRDD.foreach(println)
sc.stop()
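The per-year maximum can also be kept as records stream through reduceByKey, which avoids holding a whole year's readings in one group; a minimal sketch, assuming the same tab-separated date-and-temperature layout:
val conf = new SparkConf().setMaster("local[*]").setAppName("MaxTempReduce")
val sc = new SparkContext(conf)
sc.textFile(args(0))
  .map(line => {
    val fields = line.split("\t")
    // key: year, value: (temperature, day)
    (fields(0).substring(0, 4), (fields(1).toDouble, fields(0)))
  })
  .reduceByKey((a, b) => if (a._1 >= b._1) a else b)  // keep the hotter record per year
  .map(t => (t._1, t._2._1 + " " + t._2._2))          // (year, "temp day"), as above
  .foreach(println)
sc.stop()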
Top N
val conf = new SparkConf().setMaster("local[*]").setAppName("topn")
val sc = new SparkContext(conf)
var idx = 0
val six = sc.textFile(args(0))
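// Approach 1: map to (score, "") pairs, sortByKey in descending order, then take(5)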
// six.filter(x => (x.trim().length > 0) && (x.split(",").length == 4))
//   .map(line => line.split(",")(2))
//   .map(x => (x.toInt, ""))
//   .sortByKey(false)
//   .map(x => x._1)
//   .take(5)
//   .foreach(x => {
//     idx = idx + 1
//     println(idx + "\t" + x)
//   })
// Approach 2: sortBy in descending order, then take(5)
// six.filter(x => (x.trim().length > 0) && (x.split(",").length == 4))
//   .map(line => line.split(",")(2))
//   .sortBy(line => line.toInt, false)
//   .take(5)
//   .foreach(x => {
//     idx = idx + 1
//     println(idx + "\t" + x)
//   })
// Approach 3: top(5) returns the five largest values directly
six.filter(x => (x.trim().length > 0) && (x.split(",").length == 4))
  .map(line => line.split(",")(2).toInt)
  .top(5)
  .foreach(x => {
    idx = idx + 1
    println(idx + "\t" + x)
  })
sc.stop()
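As an aside, top(5) returns the five largest values under the natural ordering; the five smallest come from takeOrdered(5), and both accept an explicit Ordering. A minimal sketch under the same comma-separated, four-field input assumption:
val conf = new SparkConf().setMaster("local[*]").setAppName("topn-ordered")
val sc = new SparkContext(conf)
val nums = sc.textFile(args(0))
  .filter(x => (x.trim().length > 0) && (x.split(",").length == 4))
  .map(line => line.split(",")(2).toInt)
nums.takeOrdered(5).foreach(println)                 // five smallest, ascending
nums.top(5)(Ordering[Int].reverse).foreach(println)  // equivalent to takeOrdered(5)
sc.stop()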
Grouped top N
val conf = new SparkConf().setMaster("local[*]").setAppName("group-topn")
val sc = new SparkContext(conf)
val lines = sc.textFile(args(0))
val pairs = lines.map(line => {
  val lineSplited = line.split(" ")
  (lineSplited(1), lineSplited(2).toInt)
})
val groupedPairs = pairs.groupByKey
// sortWith(_ > _): descending
// sortWith(_ < _): ascending
val top3Score = groupedPairs.map(groupedPair =>
  (groupedPair._1, groupedPair._2.toList.sortWith((x, y) => x > y).take(3)))
top3Score.foreach(pair => {
  println(pair._1 + ":")
  pair._2.foreach(println(_))
})
sc.stop()
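When a key has many scores, sorting the whole group just to keep three of them is wasteful; a per-key bounded buffer via aggregateByKey is a common alternative. A minimal sketch under the same space-separated layout (key in field 1, score in field 2):
val conf = new SparkConf().setMaster("local[*]").setAppName("group-topn-agg")
val sc = new SparkContext(conf)
sc.textFile(args(0))
  .map(line => {
    val fields = line.split(" ")
    (fields(1), fields(2).toInt)
  })
  .aggregateByKey(List.empty[Int])(
    (acc, score) => (score :: acc).sorted(Ordering[Int].reverse).take(3),  // keep at most 3 per partition
    (a, b) => (a ++ b).sorted(Ordering[Int].reverse).take(3))              // merge partition buffers
  .foreach { case (key, scores) =>
    println(key + ":")
    scores.foreach(println)
  }
sc.stop()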
Dependencies and build plugin
<dependencies>
  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.5</version>
  </dependency>
</dependencies>
<build>
  <plugins>
    <plugin>
      <groupId>org.scala-tools</groupId>
      <artifactId>maven-scala-plugin</artifactId>
      <version>2.15.2</version>
      <executions>
        <execution>
          <goals>
            <goal>compile</goal>
            <goal>testCompile</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>