Using Spark to parse xxxxx.tar.gz archives. Each archive contains either plain JSON files or zip files, and the zip files in turn contain JSON files. The idea is to have Spark read the tar.gz path, open the stream, and hand it to new TarArchiveInputStream(new GZIPInputStream(file)) for processing. The rough code looks like this:
import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val root: String = args(0)   // path to the xxx.tar.gz files
  val stname = "test/"         // name of the folder/files to read inside the archive
  val startday = args(2)       // the archive holds several days of files; we pull one day's data at a time
  val endday = args(3)         //   and insert it into the Hive table partitioned by day
  val spark = SparkSession.builder()
    .appName(s"$stname")
    .enableHiveSupport()
    .getOrCreate()
  val sc = spark.sparkContext
  import spark.implicits._

  // enumerate every day in [startday, endday] and process that day's folder inside the archive
  val days: Array[String] = DataUtils.dataextent(startday, endday)
  for (daytemp <- days) {
    val filename: String = daytemp + "/" + stname
    // "yyyy-MM-dd" -> "yyyyMMdd", used as the Hive partition value
    val day: String = daytemp.substring(0, 4) + daytemp.substring(5, 7) + daytemp.substring(8, 10)
    // binaryFiles yields (path, PortableDataStream) pairs; each tar.gz is streamed within one task
    val filerdd: RDD[(String, PortableDataStream)] = sc.binaryFiles(s"${root}")
    filerdd.flatMap(file => TargzUtils.tarfile(file._2.open(), filename))
      .flatMap(json => JsonUtil.parseArray[testbean](json))
      .toDS()
      .createOrReplaceTempView("t1")
    spark.sql(
      s"""
         |insert into table default.test partition(day='$day')
         |select * from t1
         |""".stripMargin)
  }
}
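The insert statement assumes that default.test already exists as a table partitioned by day. A minimal DDL sketch, assuming the columns simply mirror the testbean case class defined below and parquet storage (both the column types and the storage format are assumptions, not part of the original job):

// Hypothetical DDL for the target table; run once before the job.
// Column names/types mirror the testbean case class; "stored as parquet" is an assumption.
spark.sql(
  """
    |create table if not exists default.test (
    |  `type` string,
    |  id     string,
    |  name   string
    |)
    |partitioned by (day string)
    |stored as parquet
    |""".stripMargin)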
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.{GZIPInputStream, ZipEntry, ZipException, ZipInputStream}
import org.apache.commons.compress.archivers.ArchiveInputStream
import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream}
import scala.collection.mutable.ListBuffer

// Walk a tar.gz stream and collect the contents of every matching JSON file,
// whether it sits directly in the tar or inside a zip entry.
def tarfile(file: InputStream, filename: String): List[String] = {
  var inputStream: ArchiveInputStream = null
  val lines = new ListBuffer[String]
  try {
    inputStream = new TarArchiveInputStream(new GZIPInputStream(file))
    var entry: TarArchiveEntry = null
    // iterate over the tar entries until getNextEntry returns null
    while ({
      entry = inputStream.getNextEntry.asInstanceOf[TarArchiveEntry]
      entry
    } != null) {
      if (!entry.isDirectory) {
        if (entry.getName.endsWith(".zip") && entry.getName.contains(filename)) {
          println(entry.getName)
          // the zip is read in place: wrap the tar stream, do not close it here
          val zipInputStream = new ZipInputStream(inputStream)
          var zipEntry: ZipEntry = null
          try {
            while ({
              zipEntry = zipInputStream.getNextEntry
              zipEntry
            } != null) {
              if (zipEntry.getName.endsWith(".json")) {
                println(zipEntry.getName)
                lines.append(streamToString(zipInputStream))
              }
            }
          } catch {
            case e: ZipException =>
              e.printStackTrace()
          }
        } else if (entry.getName.endsWith(".json") && entry.getName.contains(filename)) {
          println(entry.getName)
          lines.append(streamToString(inputStream).trim)
        }
      }
    }
    lines.toList
  } catch {
    // on any failure, return whatever has been collected so far
    case _: Exception => lines.toList
  } finally {
    if (inputStream != null) inputStream.close()
  }
}
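To sanity-check the extraction logic without Spark, tarfile can also be driven from a plain local stream. A small sketch, assuming a local archive at /tmp/sample.tar.gz containing a 2023-01-01/test/ folder (both values are made up for the test):

import java.io.FileInputStream

// Hypothetical local test: the path and the folder prefix are assumptions, not part of the original job.
val jsonStrings: List[String] = TargzUtils.tarfile(
  new FileInputStream("/tmp/sample.tar.gz"),
  "2023-01-01/test/")
jsonStrings.foreach(println)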
// Read the current entry to its end and return it as one string.
// Lines are concatenated without separators, which is fine for a JSON payload.
def streamToString(inputStream: InputStream): String = {
  val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))
  val result = new StringBuilder()
  var buffer: String = null
  while ({
    try {
      buffer = bufferedReader.readLine()
    } catch {
      case e: Exception =>
        e.printStackTrace()
        buffer = null
    }
    buffer
  } != null) {
    result.append(buffer.trim)
  }
  result.toString()
}
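If Commons IO happens to be on the classpath, the same read-to-string step can be written more compactly; this is only an alternative sketch, not what the job above uses:

import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils

// Alternative sketch (assumption: commons-io is available). Reads the current entry
// to its end, like streamToString, but keeps the original line breaks.
def streamToStringIo(inputStream: InputStream): String =
  IOUtils.toString(inputStream, StandardCharsets.UTF_8)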
import java.text.SimpleDateFormat
import java.util.{Calendar, Date}

// Given a start date and an end date ("yyyy-MM-dd"), return every date in between, inclusive.
def dataextent(startday: String, endday: String): Array[String] = {
  val df = new SimpleDateFormat("yyyy-MM-dd")
  val date1: Date = df.parse(startday)
  val date2: Date = df.parse(endday)
  val startcal: Calendar = Calendar.getInstance()
  val endcal: Calendar = Calendar.getInstance()
  startcal.setTime(date1)
  endcal.setTime(date2)
  val startmi: Long = startcal.getTimeInMillis
  val endmi: Long = endcal.getTimeInMillis
  val oneday: Long = 1000 * 60 * 60 * 24L
  var time: Long = startmi
  val listbuf = new ListBuffer[String]
  // step forward one day at a time and format each timestamp back to "yyyy-MM-dd"
  while (time <= endmi) {
    val date = new Date(time)
    val day: String = df.format(date)
    listbuf.append(day.trim)
    time += oneday
  }
  listbuf.toArray
}
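For reference, the same range can be produced with java.time, which avoids the manual millisecond arithmetic; a minimal sketch (the helper name dataextentLocal is made up):

import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Hypothetical java.time variant of dataextent; equivalent output, no millisecond math.
def dataextentLocal(startday: String, endday: String): Array[String] = {
  val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd")
  val start = LocalDate.parse(startday, fmt)
  val end = LocalDate.parse(endday, fmt)
  Iterator.iterate(start)(_.plusDays(1))
    .takeWhile(!_.isAfter(end))
    .map(_.format(fmt))
    .toArray
}

// dataextentLocal("2023-01-01", "2023-01-03")
// => Array("2023-01-01", "2023-01-02", "2023-01-03")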
import org.json4s.{DefaultFormats, Formats}
import org.json4s.jackson.JsonMethods.{parse => parseJson} // assumes the json4s-jackson backend

// Parse a single JSON object into T; field names are lower-cased so the mapping
// onto the case class is case-insensitive. Returns None on any failure.
def parse[T](json: String)(implicit m: Manifest[T]): Option[T] = {
  try {
    implicit val formats: Formats = DefaultFormats
    val objJValue = parseJson(json).transformField { case (field, value) => (field.toLowerCase, value) }
    Option(objJValue.extract[T])
  } catch {
    case e: Exception =>
      e.printStackTrace()
      None
  }
}

// Parse a JSON array string into Array[T]; throws if the payload is not an array of T.
def parseArray[T](json: String)(implicit m: Manifest[T]): Array[T] = {
  implicit val formats: Formats = DefaultFormats
  val objJValue = parseJson(json)
  objJValue.extract[Array[T]]
}
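A quick usage sketch of parseArray with the testbean case class (the sample payload is made up):

// Hypothetical sample payload; fields missing from the JSON end up as None.
val sample = """[{"type": "a", "id": "1", "name": "foo"}, {"id": "2"}]"""
val beans: Array[testbean] = JsonUtil.parseArray[testbean](sample)
beans.foreach(b => println(s"${b.`type`} ${b.id} ${b.name}"))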
case class testbean(
  `type`: Option[String],
  id: Option[String],
  name: Option[String]
)
This article describes how to process tar.gz archives with Spark: reading them, parsing the JSON and zip files inside, and loading the data into a Hive table partitioned by day. The sample code shows how to iterate over the folders for a given date range and how to handle each file format.