1. Create a new project; for details, refer to the Spark Windows development environment setup guide.
2. Add dependencies
(1) Scala dependencies
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
</dependency>
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-compiler</artifactId>
    <version>${scala.version}</version>
</dependency>
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-reflect</artifactId>
    <version>${scala.version}</version>
</dependency>

All three must be added; otherwise the build fails with errors about missing modules.
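Note that ${scala.version} is a Maven property that has to be declared in the POM. A minimal sketch, assuming Scala 2.10.4 to match the _2.10 Spark artifacts used below (the exact patch version is an assumption):

<properties>
    <!-- Assumed Scala version; must match the _2.10 suffix of the Spark artifact IDs -->
    <scala.version>2.10.4</scala.version>
</properties>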
(2) Spark dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>1.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.10</artifactId>
    <version>1.2.1</version>
</dependency>
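If the jar will be run with bin/spark-submit (step 5), the Spark dependencies can optionally be marked as provided so they are not bundled into the application jar, since the cluster already supplies the Spark classes at runtime. This is an optional refinement, not something the steps above require. For example:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.2.1</version>
    <!-- optional: keep Spark classes out of the packaged jar -->
    <scope>provided</scope>
</dependency>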
3. Code

HiveContext test
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext

/**
 * Created by hadoop on 2015/4/17.
 */
object TestSparkHive {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestSparkHive").setMaster("spark://192.168.246.107:7077")
    val sc = new SparkContext(sparkConf)
    val hiveContext = new HiveContext(sc)
    import hiveContext._

    println("Result of 'SELECT *': ")
    sql("SELECT * FROM src").collect().foreach(println)
  }
}
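The query above assumes a Hive table named src already exists. If it does not, a minimal sketch to create and load it through hiveContext (the data file path is illustrative; it mirrors the kv1.txt sample shipped with Spark):

// Hypothetical one-time setup for the src table used above
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")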
SQLContext test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// One method for defining the schema of an RDD is to make a case class with the desired column
// names and types.
case class Record(key: Int, value: String)

object RDDRelation {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("RDDRelation").setMaster("spark://192.168.246.107:7077")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // Importing the SQL context gives access to all the SQL functions and implicit conversions.
    import sqlContext._

    val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
    // Any RDD containing case classes can be registered as a table. The schema of the table is
    // automatically inferred using Scala reflection.
    rdd.registerTempTable("records")

    // Once tables have been registered, you can run SQL queries over them.
    println("Result of SELECT *:")
    sql("SELECT * FROM records").collect().foreach(println)

    // Aggregation queries are also supported.
    val count = sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
    println(s"COUNT(*): $count")

    // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
    // items in the RDD are of type Row, which allows you to access each column by ordinal.
    val rddFromSql = sql("SELECT key, value FROM records WHERE key < 10")

    println("Result of RDD.map:")
    rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)

    // Queries can also be written using a LINQ-like Scala DSL.
    rdd.where('key === 1).orderBy('value.asc).select('key).collect().foreach(println)

    // Write out an RDD as a Parquet file.
    rdd.saveAsParquetFile("pair.parquet")

    // Read in the Parquet file. Parquet files are self-describing so the schema is preserved.
    val parquetFile = sqlContext.parquetFile("pair.parquet")

    // Queries can be run using the DSL on Parquet files just like the original RDD.
    parquetFile.where('key === 1).select('value as 'a).collect().foreach(println)

    // These files can also be registered as tables.
    parquetFile.registerTempTable("parquetFile")
    sql("SELECT * FROM parquetFile").collect().foreach(println)

    sc.stop()
  }
}

4. Compile and package with Maven
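For step 4, the POM also needs a plugin that compiles the Scala sources. A minimal sketch of the build section, assuming the commonly used scala-maven-plugin (the plugin version is illustrative):

<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <!-- illustrative version -->
            <version>3.2.0</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

Packaging is then a plain mvn clean package.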
5. Submit with bin/spark-submit
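A sketch of the submit command for the RDDRelation example above; the jar name is hypothetical and depends on the artifactId and version in your POM:

bin/spark-submit \
  --class RDDRelation \
  --master spark://192.168.246.107:7077 \
  target/spark-test-1.0.jar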
Steps 4 and 5 can also follow the Spark Windows development environment setup guide.
Error:
Exception in thread "main" java.lang.NoSuchMethodError: scala.reflect.api.JavaUniverse.runtimeMirror(Ljava/lang/ClassLoader;)Lscala/reflect/api/JavaUniverse$JavaMirror;
    at org.rogach.scallop.package$.(package.scala:37)
    at
This means that scala-reflect.jar has not been added to the CLASSPATH environment variable.
To be on the safe side, you can also add it to the classpath configured in spark-env.sh.
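A minimal sketch of such an addition in conf/spark-env.sh, assuming SPARK_CLASSPATH is the variable used (the jar path is illustrative):

# conf/spark-env.sh -- the path to scala-reflect.jar is illustrative
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/scala/lib/scala-reflect.jar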