1 Plain textFile read/write
Read a text file from HDFS:
scala> val rdd1 = sc.textFile("hdfs://hadoop1:9000/myone/input/a.txt")
Process the contents, treating each line of the file as one record:
scala> val rdd2 = rdd1.flatMap(line => line.split("\\s+")).map(d => (d, 1))
Write a textFile to HDFS:
scala> rdd2.saveAsTextFile("/myone/mout")
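To turn the (word, 1) pairs into actual counts, a reduceByKey step can be added before saving; a minimal sketch continuing the shell session above (the second output path is only illustrative):
scala> val counts = rdd2.reduceByKey(_ + _)        // sum the 1s per word
scala> counts.saveAsTextFile("/myone/mout2")       // illustrative path, distinct from /myone/mout above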
2 JSON format
JSON is a widely used semi-structured data format.
Create a file in JSON format and upload it to HDFS:
{"xname":"张三","xage":"8","address":{"addr":"和平区","zip":"1100"}}
Write the Scala program:
import org.apache.spark.{SparkConf, SparkContext}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}
/*
{"xname":"张三","xage":"8","address":{"addr":"和平区","zip":"1100"}}
*/
case class Student(xname: String, xage: String, address: Addr) {
  override def toString: String = s"Student(xname:$xname,xage:$xage,address:$address)"
}
case class Addr(addr: String, zip: String) {
  override def toString: String = s"Addr(addr:$addr,zip:$zip)"
}
object WordCount {
  def main(args: Array[String]): Unit = {
    val sconf = new SparkConf()
    sconf.setAppName("my word count")
    sconf.setMaster("spark://hadoop1:7077")
    // sconf.setMaster("local")
    val sc = new SparkContext(sconf)
    val file = sc.textFile("hdfs://hadoop1:9000/myone/input/a1.json")
    val rdd = file.map { line =>
      // json4s needs an implicit Formats in scope for extract
      implicit val mformat = DefaultFormats
      // parse one JSON record per line and map it onto the Student case class
      val obj = parse(line)
      obj.extract[Student]
    }
    rdd.saveAsTextFile("hdfs://hadoop1:9000/myone/mjsonout/")
    sc.stop()
  }
}
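The Serialization.write import above is only needed for the reverse direction; a minimal sketch of writing the case classes back out as JSON, to be placed inside main after rdd is built (the output path here is hypothetical):
// Serialize each Student back to a JSON string; write comes from org.json4s.jackson.Serialization.
val jsonRdd = rdd.map { st =>
  implicit val mformat = DefaultFormats
  write(st)
}
jsonRdd.saveAsTextFile("hdfs://hadoop1:9000/myone/mjsonout2/")   // hypothetical output path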
3 CSV
Note: this requires importing
au.com.bytecode.opencsv.CSVReader
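CSVReader and CSVWriter come from the opencsv library; if the project is built with sbt, a dependency along these lines is assumed (the 2.x releases use the au.com.bytecode.opencsv package, so adjust the version to whatever the project actually uses):
// build.sbt -- assumed coordinates for opencsv 2.x
libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.3"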
Read a CSV file:
import java.io.StringReader
import scala.collection.JavaConversions._
import org.apache.spark.{SparkConf, SparkContext}
import au.com.bytecode.opencsv.CSVReader

case class Data(index: String, title: String, content: String)
object WordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    // wholeTextFiles returns an RDD[(path, fileContent)], one pair per file
    val input = sc.wholeTextFiles("/home/common/coding/coding/Scala/word-count/sample_map.csv")
    val result = input.flatMap { case (_, txt) =>
      // readAll parses the whole file content and returns one Array[String] per row
      val reader = new CSVReader(new StringReader(txt))
      //reader.readAll().map(x => Data(x(0), x(1), x(2)))
      reader.readAll()
    }
    result.collect().foreach(x => {
      x.foreach(println); println("======")
    })
  }
}
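The commented-out line above shows how each row can be mapped onto the Data case class instead of being kept as a raw Array[String]; a short sketch of that variant, using the same input and imports as above:
// Every row is assumed to have at least three columns (index, title, content).
val records = input.flatMap { case (_, txt) =>
  val reader = new CSVReader(new StringReader(txt))
  reader.readAll().map(row => Data(row(0), row(1), row(2)))
}
records.collect().foreach(println)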
Write a CSV file:
import java.io.StringWriter
import scala.collection.JavaConversions._
import org.apache.spark.{SparkConf, SparkContext}
import au.com.bytecode.opencsv.{CSVReader, CSVWriter}

case class Data(index: String, title: String, content: String)
object WordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    val inputRDD = sc.parallelize(List(Data("index", "title", "content")))
    inputRDD.map(data => List(data.index, data.title, data.content).toArray)
      .mapPartitions { data =>
        // build one CSV string per partition and emit it as a single element
        val stringWriter = new StringWriter()
        val csvWriter = new CSVWriter(stringWriter)
        csvWriter.writeAll(data.toList)
        Iterator(stringWriter.toString)
      }.saveAsTextFile("/home/common/coding/coding/Scala/word-count/sample_map_out")
  }
}
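saveAsTextFile writes one part file per partition, so the CSV output above is split across part-NNNNN files; if a single CSV file is wanted, the RDD can be coalesced to one partition first. A sketch under that assumption (the output path is only illustrative):
// With coalesce(1) all rows end up in one partition, hence one part file.
inputRDD.map(data => List(data.index, data.title, data.content).toArray)
  .coalesce(1)
  .mapPartitions { data =>
    val stringWriter = new StringWriter()
    val csvWriter = new CSVWriter(stringWriter)
    csvWriter.writeAll(data.toList)
    Iterator(stringWriter.toString)
  }.saveAsTextFile("/home/common/coding/coding/Scala/word-count/sample_map_single")   // illustrative path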