Data (tab-separated EMP-style records; columns: EMPNO, ENAME, JOB, MGR, HIREDATE, SAL, COMM, DEPTNO, where MGR and COMM may be empty):
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
8888 HIVE PROGRAM 7839 1988-1-23 10300.00
Requirement: group the records by hire year and write all records of the same year into one file, using the year as the file name.
Implementation source code:
package com.ruoze.spark.FileByYear
import java.io.RandomAccessFile
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.{SparkConf, SparkContext}
object fileOutput {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("fileOutput").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile("emp.txt")
    // Key each record by its hire year: field 4 is the hire date, e.g. "1981-2-20" -> "1981".
    val rdd = lines.map(x => {
      val temp = x.split("\t")
      val year = temp(4).split("-")
      (year(0), x)
    })
    /**
     * Method 1: write with saveAsHadoopFile plus a custom MultipleTextOutputFormat;
     * this fails when run on Windows.
     */
    // rdd.saveAsHadoopFile("G:\\bigdata\\IDEA_workspace\\sparkCore\\year",
    //   classOf[String],
    //   classOf[String],
    //   classOf[RDDMultipleTextOutputFormat])
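    // Note (not from the original post): saveAsHadoopFile failures on Windows are
    // commonly caused by missing Hadoop native binaries (winutils.exe / hadoop.dll
    // under HADOOP_HOME\bin), which the Hadoop file committer needs for local I/O.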
    /**
     * Method 2: for each distinct year, filter out that year's records,
     * coalesce them into a single partition and append them to the year's file.
     */
    // val keys = rdd.keys.distinct().collect() // list of distinct years
    // keys.foreach(x => {
    //   rdd.filter(y => x == y._1).coalesce(1, true).foreach(z => {
    //     val path = "G:\\bigdata\\IDEA_workspace\\sparkCore\\" + z._1 + ".txt"
    //     writeToFile(path, z._2 + "\n")
    //   })
    // })
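    // Note (not from the original post): this variant launches one filter job per
    // distinct year, i.e. it rescans the whole RDD once per key; acceptable for
    // the five years here, wasteful when the number of keys grows.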
    /**
     * Method 3: group by key with a custom partitioner, then write each
     * partition's groups out to per-year files.
     */
    rdd.groupByKey(new MySparkPartition(5)).mapValues(_.toList)
      .foreachPartition(iter => {
        // A partition may be empty, or may hold more than one year if keys
        // collide, so write every (year, records) entry it contains.
        iter.foreach { case (year, records) =>
          val path = "G:\\bigdata\\IDEA_workspace\\sparkCore\\" + year + ".txt"
          records.foreach(record => writeToFile(path, record + "\n"))
        }
      })
    sc.stop()
  }
  // Append content to the end of outFile, creating the file if it does not exist.
  def writeToFile(outFile: String, content: String): Unit = {
    val randomFile = new RandomAccessFile(outFile, "rw")
    val fileLength = randomFile.length() // current file length
    randomFile.seek(fileLength)          // move the pointer to the end of the file
    randomFile.writeBytes(content)       // append the data
    randomFile.close()
  }
  // Used by method 1: route each key-value pair to <key>/<key>.txt under the output path.
  class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
      val rKey = key.asInstanceOf[String]
      rKey + "/" + rKey + ".txt"
    }
  }
  // Partitions records by year. "1987" and "1982" hash to the same bucket under
  // mod 5, so "1987" is pinned to partition 4 to keep each year in its own partition.
  class MySparkPartition(numParts: Int) extends org.apache.spark.Partitioner {
    override def numPartitions: Int = numParts

    override def getPartition(key: Any): Int = {
      var code = Math.abs(key.hashCode() % numPartitions)
      if (key.toString.equals("1987"))
        code = 4
      code
    }

    override def equals(other: Any): Boolean = other match {
      case mypartition: MySparkPartition =>
        mypartition.numPartitions == numPartitions
      case _ =>
        false
    }

    override def hashCode: Int = numPartitions
  }
}
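To see why 1987 is special-cased in MySparkPartition, here is a small standalone check (a hypothetical snippet; PartitionCheck is not part of the original program) that reproduces getPartition's arithmetic for the five years in the data:

object PartitionCheck {
  def main(args: Array[String]): Unit = {
    val years = Seq("1980", "1981", "1982", "1987", "1988")
    years.foreach { y =>
      // Same formula as MySparkPartition.getPartition, before the 1987 override.
      val bucket = Math.abs(y.hashCode % 5)
      val actual = if (y == "1987") 4 else bucket
      println(s"$y: hash bucket $bucket, actual partition $actual")
    }
  }
}

It shows that 1982 and 1987 both hash to bucket 2, so without the override they would share a partition; pinning 1987 to the otherwise unused partition 4 gives each year its own partition.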
Partial results: with the input above, five files are produced under the output directory: 1980.txt, 1981.txt, 1982.txt, 1987.txt and 1988.txt, each containing that year's records (1981.txt, for example, holds the ten employees hired in 1981).
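As an aside (not in the original post): on Spark 2.x the same requirement can also be met with the DataFrame writer's partitionBy, which creates one year=<value> subdirectory per key. A minimal, untested sketch; the object name and output path are illustrative:

import org.apache.spark.sql.SparkSession

object FileByYearSql {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("fileOutputSql").master("local[2]").getOrCreate()
    import spark.implicits._
    // Key each line by hire year, then let the writer split the output by that column.
    spark.read.textFile("emp.txt")
      .map(line => (line.split("\t")(4).split("-")(0), line))
      .toDF("year", "record")
      .write.partitionBy("year").text("G:\\bigdata\\IDEA_workspace\\sparkCore\\year_sql")
    spark.stop()
  }
}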