RDD
val rdd = sc.makeRDD(Seq("value1", "value2", "value3"))
val rdd = sc.parallelize(Seq("value", "value1", "value2"))
val rdd = sc.textFile("file:///home/person1")
val rdd1 = sc.textFile("hdfs:///user/person")
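Once an RDD of lines exists, the usual transformations apply; a minimal word-count sketch over rdd1 from above:
// split each line into words, pair each word with 1, then sum per word
val counts = rdd1
  .flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
counts.take(10).foreach(println)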
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
val df: DataFrame = new HiveContext(sc).sql("select name, id from tmp")
val rdd = df.rdd
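Note that df.rdd yields an RDD[Row]; a short sketch of pulling fields out by position (assuming name is a string column, per the select above):
// Row fields come back in select order: name, id
val names = rdd.map(row => row.getString(0))
names.take(5).foreach(println)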
DataFrame
import spark.implicits._ // brings toDF into scope (sqlContext.implicits._ on Spark 1.x)
val df = Seq(
  (1, "First Value", java.sql.Date.valueOf("2018-05-01")),
  (2, "Second Value", java.sql.Date.valueOf("2018-08-01"))
).toDF("int_column", "string_column", "date_column")
val df = sc.makeRDD(Seq("xd", "das", "da")).map(x => (x, 1)).toDF("name", "count")
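With the pairs in a DataFrame, duplicates can be aggregated; a small sketch:
// sum the counts per distinct name
val totals = df.groupBy("name").sum("count")
totals.show()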
val hive = new HiveContext(sc)
val df = hive.sql("select * from person")
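The result supports ordinary DataFrame operations; a sketch assuming the person table has name and age columns:
// filter and project without writing more SQL
val adults = df.filter(df("age") > 30).select("name")
adults.show()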
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
val hive = new HiveContext(sc)
val schema = StructType(Array(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = false),
  StructField("code", StringType, nullable = true)
))
val rdd = sc.parallelize(Seq(
  Row("name", 32, "42332"),
  Row("name1", 42, "4234")
))
val df = hive.createDataFrame(rdd, schema)
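To confirm the schema was applied:
df.printSchema() // prints name, age, code with their types and nullability
df.show()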
val df = spark.read.json("/home/person.json")
val df = spark.read.parquet("hdfs:///user/file")
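Each reader has a matching writer; a minimal sketch (output paths are hypothetical):
df.write.mode("overwrite").parquet("hdfs:///user/file_out")
df.write.mode("overwrite").json("/home/person_out.json")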
val spark = org.apache.spark.sql.SparkSession.builder
  .master("local")
  .appName("Spark CSV Reader")
  .getOrCreate()

val df = spark.read
  .format("com.databricks.spark.csv")
  .option("header", "true")        // first row holds column names
  .option("mode", "DROPMALFORMED") // silently drop rows that fail to parse
  .load("csv/file/path")
df.show()
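On Spark 2.0+ the CSV reader is built in, so the Databricks package is not needed:
val df2 = spark.read
  .option("header", "true")
  .option("inferSchema", "true") // scan the data to infer column types
  .csv("csv/file/path")
df2.show()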