// Read the JSON file from HDFS with a minimum of 8 partitions.
val jsonRdd = CommonHelper.getSparkContext().textFile(validPath.toString, 8)
val result = jsonRdd.mapPartitions(iter => parseJsonStr(iter))
// Take the first 10 rows of the RDD and print them.
result.take(10).foreach(println)
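CommonHelper is not shown in the post. As a rough sketch, assuming it is just a small wrapper that lazily creates and caches a single SparkContext (the app name and master below are placeholders, not from the original):

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical helper reconstructed from the call site above; the real
// project version may configure the context differently.
object CommonHelper {
  private var sc: SparkContext = _

  def getSparkContext(): SparkContext = synchronized {
    if (sc == null) {
      // Placeholder settings; adjust appName/master to the actual cluster.
      val conf = new SparkConf()
        .setAppName("json-to-row")
        .setIfMissing("spark.master", "local[*]")
      sc = new SparkContext(conf)
    }
    sc
  }
}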
2. Converting each JSON line from HDFS into RDD[Row], partition by partition
Create the schema that matches the Row type:
import org.apache.spark.sql.types._

val schema = StructType(List(
  StructField("id", ArrayType(LongType, true), true),
  StructField("fea", ArrayType(StringType, true), true),
  StructField("width", IntegerType, true),
  StructField("height", IntegerType, true),
  StructField("date", StringType, true),
  StructField("format_type", StringType, true),
  StructField("source", StringType, true)))
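For reference, an input line that this schema and the parser below expect looks roughly like the following (the values are made up for illustration; note that the JSON keys ids and type feed the schema fields id and format_type):

// Hypothetical sample line; the field values are illustrative only.
val sampleLine = """{"ids":[101,102],"fea":["f1","f2"],"width":640,"height":480,"date":"2019-08-01","type":"jpg","source":"crawler"}"""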
The parseJsonStr method is defined as follows:
import scala.collection.mutable.ArrayBuffer
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema

def parseJsonStr(iter: Iterator[String]): Iterator[Row] = {
  val list = ArrayBuffer[Row]()
  while (iter.hasNext) {
    val array = new ArrayBuffer[Any]()
    // Parse one input line into a fastjson JSONObject.
    val json: JSONObject = JSON.parseObject(iter.next())
    // "ids" is a JSON array of numbers; convert every element to Long.
    val adIds = json.getJSONArray("ids").toArray
    var adIdsList: List[Long] = List()
    for (id <- adIds) {
      adIdsList = adIdsList.::(id.asInstanceOf[Number].longValue)
    }
    array += adIdsList.toArray
    val feas = json.getJSONArray("fea").toArray
    array += feas
    val width = json.getIntValue("width")
    array += width
    val height = json.getIntValue("height")
    array += height
    val date = json.getString("date")
    array += date
    // The JSON key is "type"; it maps to the schema field "format_type".
    val typeStr = json.getString("type")
    array += typeStr
    val source = json.getString("source")
    array += source
    // Wrap the collected values into a Row that carries the schema above.
    val row = new GenericRowWithSchema(array.toArray, schema)
    list += row
  }
  list.iterator
}
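With parseJsonStr in place, the resulting RDD[Row] can be bound to the schema to get a DataFrame. A minimal sketch, assuming a SparkSession can be obtained in the driver (the names spark and df are mine, not from the original):

import org.apache.spark.sql.SparkSession

// Reuse or create a SparkSession; createDataFrame(RDD[Row], StructType)
// binds each Row produced above to the schema.
val spark = SparkSession.builder().getOrCreate()
val df = spark.createDataFrame(result, schema)
df.printSchema()
df.show(10, truncate = false)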