How to Remove the Header Row in Log Analysis
Removing the Header Row in Hive
- Method 1
create external table if not exists test(
    id string,
    name string,
    gender string
)
row format delimited fields terminated by ','
stored as textfile
location '/app/info'
tblproperties("skip.header.line.count"="1");
Key point: tblproperties("skip.header.line.count"="1") tells Hive to ignore the first line of each data file when the table is read. If you need to skip both ends of the file, say the first line and the last two lines, write:
tblproperties("skip.header.line.count"="1", "skip.footer.line.count"="2");
Removing the Header Row in Spark
- Method 1
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object sparkSQL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .enableHiveSupport()
      .getOrCreate()
    // read the raw file as an RDD of lines
    val rdd: RDD[String] = spark.sparkContext.textFile(".../***.csv")
    // grab the first line (the header) and keep only the lines that differ from it
    val header = rdd.first()
    val rdd2: RDD[String] = rdd.filter(_ != header)
  }
}
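Note that filter(_ != header) compares every line against the header, so it also drops any data line that happens to be identical to the header; for typical log files this is harmless, but it is worth keeping in mind.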
- Method 2
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object sparkSQL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .enableHiveSupport()
      .getOrCreate()
    val rdd: RDD[String] = spark.sparkContext.textFile(".../***.csv")
      // drop the first line of partition 0 only; return every other partition unchanged
      .mapPartitionsWithIndex((ix, it) => if (ix == 0) it.drop(1) else it)
  }
}
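Unlike Method 1, this touches only partition 0 and drops just its first element instead of comparing every line; the else branch is what returns the remaining partitions unchanged. It assumes the header sits in the first partition, which holds when textFile reads a single file.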
- Method 3
import org.apache.spark.sql.{DataFrame, SparkSession}

object sparkSQL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .enableHiveSupport()
      .getOrCreate()
    // the csv data source consumes the first line as column names when header=true
    val sparkDF: DataFrame = spark.read
      .format("csv")
      .option("header", true)
      .option("inferSchema", true)
      .load(".../***.csv")
  }
}
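A quick way to confirm that the header was consumed as column names rather than kept as a data row (a minimal usage sketch, to run after the load above):
sparkDF.printSchema() // column names come from the first line of the file
sparkDF.show(5)       // the header no longer shows up as a data row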
- Method 4
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object sparkSQL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[4]")
      .appName(this.getClass.getName)
      .enableHiveSupport()
      .getOrCreate()
    val rdd1: RDD[String] = spark.sparkContext.textFile(".../***.csv")
    // keep only the lines that do not start with the first header field name
    val rdd2: RDD[String] = rdd1.filter(x => !x.startsWith("***"))
  }
}
Note: *** stands for the name of the first field (index 0) in the header row.
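This approach is fragile: any data line that happens to start with that same field name is dropped as well, so Methods 1 to 3 are preferable when that cannot be ruled out.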