package com.fei.simple_project;
import org.apache.spark.api.java.function.Function;
public class ContainsSomething implements Function<String, Boolean> {
private String query;
public ContainsSomething(String mquery){
this.query=mquery;
}
public Boolean call(String x){
return x.contains(query);
}
}
package com.fei.simple_project;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;
public class App
{
public static void main( String[] args )
{
String logFile = "README.md";
SparkConf conf = new SparkConf().setAppName("Simple Application");
JavaSparkContext sc = new JavaSparkContext(conf);
//从外部读取生成RDD
JavaRDD<String> logData = sc.textFile(logFile);
//持久化RDD,方便后面多次动作,无需重复计算
//不能与JavaRDD<String> logData = sc.textFile(logFile).cache();同用,因为cache已指定内存持久化
logData.persist(StorageLevel.MEMORY_AND_DISK());
long numAs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("a"); }
}).count();
long numBs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("b"); }
}).count();
System.out.println("1111111111Lines with a: " + numAs + ", lines with b: " + numBs);
//使用parallelize创建RDD
JavaRDD<String> lines1 = sc.parallelize(Arrays.asList("pandas", "like","i like pandas"));
lines1.persist(StorageLevel.MEMORY_ONLY());
System.out.println("222222222Count is:"+lines1.count());
JavaRDD<String> lines2 = sc.parallelize(Arrays.asList("pandas", "like","i like pandas"));
lines2.persist(StorageLevel.MEMORY_ONLY());
System.out.println("333333333333Count is:"+lines2.count());
//转化操作
//filter
//匿名内部类
JavaRDD<String> likeRDD = lines1.filter(new Function<String, Boolean>(){
public Boolean call(String x){
return x.contains("like");
}
});
System.out.println("44444444444likeRDD Count is:"+likeRDD.count());
//带参数的具名类
JavaRDD<String> pandaRDD = lines1.filter(new ContainsSomething("pandas"));
System.out.println("555555555555pandaRDD Count is:"+pandaRDD.count());
//union
JavaRDD<String> unionRDD = lines1.union(lines2);
System.out.println("666666666666unionRDD Count is:"+unionRDD.count());
//行动操作
//take
System.out.print("777777777777: ");
for(String it:lines1.take(2)){
System.out.print(it+" ");
}
//collect
System.out.print("888888888: ");
for(String it:lines1.collect()){
System.out.print(it+" ");
}
}
}
1111111111Lines with a: 58, lines with b: 26
222222222Count is:3
333333333333Count is:3
44444444444likeRDD Count is:2
555555555555pandaRDD Count is:2
666666666666unionRDD Count is:6
777777777777:
pandas
like
888888888:
pandas
like
i like pandas