spark读取文件和写入文件的API
path 参数不加协议前缀时默认走 file:///；如果文件在 HDFS 上，则可以写成 hdfs://master:port/path
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
/**
* Created by Frank on 2017/8/4.
*/
public class IODemo {
static SparkConf conf = new SparkConf().setAppName("app").setMaster("local");
static JavaSparkContext sc = new JavaSparkContext(conf);
public static void main(String[] a){
//准备好rdd和pairrdd
JavaRDD textRdd = sc.textFile("1.txt");
JavaPairRDD<Text,IntWritable> pairRDD = textRdd.mapToPair(new PairFunction() {
public Tuple2<Text,IntWritable> call(Object o) throws Exception {
return new Tuple2<Text,IntWritable>(new Text(o.toString()),new IntWritable(1));
}
});
//写入text文件
textRdd.saveAsTextFile("textFile");
//写入对象文件
textRdd.saveAsObjectFile("objFile");
//写入hadoop序列化文件
pairRDD.saveAsHadoopFile("hadoopFile", Text.class, IntWritable.class, SequenceFileOutputFormat.class);
//读取text文件
System.out.println("data from textFile");
eachPrint(sc.textFile("textFile"));
//读取obj文件
System.out.println("data from objFile");
eachPrint(sc.objectFile("objFile"));
//读取hadoop序列化文件
System.out.println("data from hadoopFile"); eachPrint(sc.sequenceFile("hadoopFile",Text.class,IntWritable.class));
}
public static void eachPrint(JavaPairRDD rdd){
System.out.println("-------------------------------------------");
rdd.foreach(new VoidFunction() {
public void call(Object o) throws Exception {
Tuple2 tuple2=(Tuple2)o;
System.out.println(tuple2._1+":"+tuple2._2);
}
});
}
public static void eachPrint(JavaRDD rdd){
System.out.println("-------------------------------------------");
rdd.foreach(new VoidFunction() {
public void call(Object s) throws Exception {
System.out.println(s);
}
});
}
}
1.txt
ewternjg
dfsgfd
sgfds
g
dfsg
dfs
g
dfs
gdfs
g
dfs
gdfsgdfsgdfs
gfds
g
dfsg
dfs
gfdsgdsfg
dfsg
dsfgdfsg
sdf
gds
fg
dfs
输出
data from textFile
-------------------------------------------
ewternjg
dfsgfd
sgfds
g
dfsg
dfs
g
dfs
gdfs
g
dfs
gdfsgdfsgdfs
gfds
g
dfsg
dfs
gfdsgdsfg
dfsg
dsfgdfsg
sdf
gds
fg
dfs
data from objFile
-------------------------------------------
ewternjg
dfsgfd
sgfds
g
dfsg
dfs
g
dfs
gdfs
g
dfs
gdfsgdfsgdfs
gfds
g
dfsg
dfs
gfdsgdsfg
dfsg
dsfgdfsg
sdf
gds
fg
dfs
data from hadoopFile
-------------------------------------------
ewternjg:1
dfsgfd:1
sgfds:1
g:1
dfsg:1
dfs:1
g:1
dfs:1
gdfs:1
g:1
dfs:1
gdfsgdfsgdfs:1
gfds:1
g:1
dfsg:1
dfs:1
gfdsgdsfg:1
dfsg:1
dsfgdfsg:1
sdf:1
gds:1
fg:1
dfs:1