Customizing Spark, Part 6: the SQL version of start.scala

This post walks through data processing with Spark SQL from the Scala shell: how to read, parse, and store text files on both HDFS and the local filesystem, and how to run SQL queries in the Spark shell and save their results.


The previous version of start.scala used HiveContext; this one uses SQLContext, so no compilation is needed.

# cat testperson.txt    # fields are separated by tabs

zs 10 30.0
li 12 32.0

# spark-shell -i start.scala

scala> help

Then follow the prompts and run the examples step by step. The full start.scala is listed below.

import org.apache.spark.sql.SchemaRDD  
  
var FIELD_SEPERATOR = "\t"  
var RECORD_SEPERATOR = "\n"  
var lastrdd : SchemaRDD = null  
  
object MyFileUtil extends java.io.Serializable {  
    import org.apache.hadoop.fs.Path  
    import org.apache.hadoop.fs.FileSystem  
    import org.apache.hadoop.fs.FileStatus  
    import scala.collection.mutable.ListBuffer  
  
    def regularFile(filepath:String):String = {  
        if(filepath == "") {  
            filepath;  
        } else if(filepath.startsWith("hdfs:")) {  
            filepath  
        } else if(filepath.startsWith("file:")) {  
            filepath  
        } else if(filepath.startsWith("/")) {  
            "file://" + filepath  
        } else {  
            val workdir = System.getProperty("user.dir")  
            "file://" + workdir + "/" + filepath  
        }  
    }  
  
    var SAFEMINPATH_LENGTH : Int = 24  
  
    def getFileSystem(filepath:String) = {  
        if(filepath.startsWith("hdfs:")) {  
            FileSystem.get(new org.apache.hadoop.conf.Configuration());  
        } else if(filepath.startsWith("file:")) {  
            FileSystem.getLocal(new org.apache.hadoop.conf.Configuration());  
        } else {  
            throw new Exception("file path invalid")  
        }  
    }  
  
    def deletePath(filepath:String) = {  
        if(filepath.length < SAFEMINPATH_LENGTH)  
            throw new Exception("file path is too short")
        var fs : FileSystem = getFileSystem(filepath)  
        if (fs.exists(new Path(filepath))) {  
            fs.delete(new Path(filepath), true);  
        }  
    }  
  
    def listFile(fs:FileSystem, path:Path, pathlist:ListBuffer[Path], statuslist:ListBuffer[FileStatus]=null) {  
        if ( fs.exists(path) ) {  
            val substatuslist =  fs.listStatus(path);  
            for(substatus <- substatuslist){  
                if(statuslist != null)  
                    statuslist.append(substatus)  
                if(substatus.isDir()){  
                    listFile(fs,substatus.getPath(),pathlist);  
                }else{  
                    pathlist.append(substatus.getPath());  
                }  
            }  
        }  
    }  
  
    def hasContext(filepath:String) = {  
        val realpath = regularFile(filepath)  
        val fs = getFileSystem(realpath)   
        val pathlist = ListBuffer[Path]()  
        val statuslist = ListBuffer[FileStatus]()  
        listFile(fs,new Path(realpath),pathlist,statuslist)
        var length:Long = 0  
        for( status <- statuslist )  
            length += status.getLen()  
        length > 0  
    }  
}  
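// A quick sanity check of MyFileUtil.regularFile (hypothetical REPL calls, assuming this
// script has been loaded with spark-shell -i start.scala; the paths are illustrative only):
//   MyFileUtil.regularFile("testperson.txt")       // => "file://" + <working dir> + "/testperson.txt"
//   MyFileUtil.regularFile("/tmp/testperson.txt")  // => "file:///tmp/testperson.txt"
//   MyFileUtil.regularFile("hdfs:/basedir/data")   // => returned unchanged, scheme already present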
  
org.apache.spark.repl.Main.interp.command("""  
class MySchemaRDD(rdd:org.apache.spark.sql.SchemaRDD) extends java.io.Serializable {  
  
    def go() = {  
        var startstr = ""  
        var endstr = RECORD_SEPERATOR  
        val result = rdd.collect  
        result.foreach( x =>  
            print(x.mkString(startstr,FIELD_SEPERATOR,endstr))  
          )  
    }  
  
    def result() = {  
        rdd.collect  
    }  
  
    def saveto(output: String) = {  
        import org.apache.hadoop.io.{NullWritable,Text}  
        var startstr = ""  
        var endstr = RECORD_SEPERATOR  
        if(output.startsWith("hdfs:")) {  
            val outputpath = MyFileUtil.regularFile(output)  
            MyFileUtil.deletePath(outputpath)  
            rdd.map(x =>   
                  (NullWritable.get(), new Text(x.mkString(FIELD_SEPERATOR)))  
                ).saveAsHadoopFile[  
                  org.apache.hadoop.mapred.TextOutputFormat[NullWritable, Text]  
                ](outputpath)  
        } else {  
            val outputpath = MyFileUtil.regularFile(output)  
            MyFileUtil.deletePath(outputpath)  
            val result = rdd.collect()  
            val writer = new java.io.FileWriter(output)  
            result.foreach(x =>   
                writer.write(x.mkString(startstr,FIELD_SEPERATOR,endstr))  
              )  
            writer.close()  
        }  
    }  
}  
object MySchemaRDD {  
    implicit def toMySchemaRDD(rdd:org.apache.spark.sql.SchemaRDD) = new MySchemaRDD(rdd)  
}  
""")  
  
val ssc = new org.apache.spark.sql.SQLContext(sc)  
import ssc._  
import MySchemaRDD._  
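// With the implicit toMySchemaRDD in scope, every SchemaRDD returned by sql(...) picks up
// go/result/saveto. A hypothetical sketch, assuming a table named testperson has already
// been registered through the DSL defined further below:
//   sql("select * from testperson").go()                                // print rows to the console
//   sql("select * from testperson").saveto("testperson_copy.txt")       // local path: collect + FileWriter
//   sql("select * from testperson").saveto("hdfs:/tmp/testperson_copy") // HDFS path: saveAsHadoopFile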

def getRegisterString(rddname:String,classname:String,tablename:String,tabledef:String) : String = {  
    val members = tabledef.trim.split(",").map(_.trim.split(" ").filter(""!=)).map(x => (x(0).trim,x(1).trim.head.toString.toUpperCase+x(1).trim.tail))  
    val classmemberdef = members.map(x => (x._1+":"+x._2)).mkString(",")  
    val convertstr = members.map(x => x._2).zipWithIndex.map(x => "t("+x._2+").to"+x._1).mkString(",")  
    return s"""  
        case class ${classname}(${classmemberdef})  
        val schemardd = ${rddname}.map(_.split("${FIELD_SEPERATOR}")).map(t=>${classname}(${convertstr}))  
        ssc.registerRDDAsTable(schemardd,"${tablename}")  
    """  
}  
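// For the sample table, getRegisterString("testperson", "TESTPERSON", "testperson",
// "name string,age int,weight double") generates roughly the following code (a sketch;
// the real string embeds an actual tab character for FIELD_SEPERATOR), which is then
// compiled by interp.command:
//   case class TESTPERSON(name:String,age:Int,weight:Double)
//   val schemardd = testperson.map(_.split("\t")).map(t=>TESTPERSON(t(0).toString,t(1).toInt,t(2).toDouble))
//   ssc.registerRDDAsTable(schemardd,"testperson")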

org.apache.spark.repl.Main.interp.command("""  
class MyCommandTranslator(cmd:String) extends java.io.Serializable {  
  
    def go()(implicit f: SchemaRDD => MySchemaRDD) = {  
        lastrdd = sql(cmd)  
        lastrdd.go()  
    }  
  
    def saveto(output: String)(implicit f: SchemaRDD => MySchemaRDD) = {  
        lastrdd = sql(cmd)  
        lastrdd.saveto(output)  
    }  
  
    def result()(implicit f: SchemaRDD => MySchemaRDD) = {  
        lastrdd = sql(cmd)  
        lastrdd.result()  
    }  
  
//    def hqlgo()(implicit f: SchemaRDD => MySchemaRDD) = {  
//        lastrdd = hql(cmd)  
//        lastrdd.go()  
//    }  
//  
//    def hqlsaveto(output: String)(implicit f: SchemaRDD => MySchemaRDD) = {  
//        lastrdd = hql(cmd)  
//        lastrdd.saveto(output)  
//    }  
//  
//    def hqlresult()(implicit f: SchemaRDD => MySchemaRDD) = {  
//        lastrdd = hql(cmd)  
//        lastrdd.result()  
//    }  
  
    def defineas(tabledef:String) = {  
        if( tabledef != "" ) {  
            org.apache.spark.repl.Main.interp.command(   
                getRegisterString(cmd,cmd.toUpperCase,cmd,tabledef)  
            )  
        } else {  
            org.apache.spark.repl.Main.interp.command(
                s"ssc.registerRDDAsTable(${cmd},\"${cmd}\")"
            )
        }  
    }  
  
    def from(filepath:String) {  
        if( cmd.trim.startsWith("create table ") ) {  
            val tablename = cmd.trim.substring(13).trim().split(" ")(0)  
            val leftstr = cmd.trim.substring(13).trim().substring(tablename.length).trim()
            val tabledef = leftstr.substring(1,leftstr.length-1).trim()  
            val realfile = MyFileUtil.regularFile(filepath)  
            org.apache.spark.repl.Main.interp.command(  
                "val "+tablename+" = sc.textFile(\""+realfile+"\")"  
            )  
            new MyCommandTranslator(tablename).defineas(tabledef)  
        } else {  
            println("usage:")  
            println("\"create table sometablename (field1 string,field2 int...)\" from \"somefile or hdfs:somepath\"")  
        }  
    }  
  
    def isok() = {  
        if(cmd.contains(".") || cmd.contains("/")) {  
            MyFileUtil.hasContext(cmd)  
        } else {  
            val res = sql(s"select count(*) from ${cmd}").result()  
            val count = res(0).getLong(0)  
            count > 0  
        }  
    }  
}  
object MyCommandTranslator {  
    implicit def stringToTranslator(cmd:String) = new MyCommandTranslator(cmd)  
  
    def show(tabledata:Array[org.apache.spark.sql.Row]) = {  
        tabledata.foreach( x => println(x.mkString("\t")))  
    }  
}  
""")  
  
def to = MyCommandTranslator  
import MyCommandTranslator._  
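// The implicit stringToTranslator is what makes the DSL in help work: a bare string
// followed by a method name is rewritten by the compiler into a method call on a
// MyCommandTranslator. A sketch of the desugaring (illustrative only):
//   "select * from testperson" go
//     ==> stringToTranslator("select * from testperson").go()
//   "create table testperson (name string,age int,weight double)" from "testperson.txt"
//     ==> stringToTranslator("create table testperson (name string,age int,weight double)").from("testperson.txt")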
  
val onetable = sql("select 1 as id")  
ssc.registerRDDAsTable(onetable,"onetable")  
  
def help = {  
    println("""example:  
        "create table testperson (name string,age int,weight double)" from "testperson.txt"  
        "select * from testperson" go  
        "select * from testperson" saveto "somelocalfile.txt"  
        "select * from testperson" saveto "hdfs:/basedir/parentdir/testperson"  
        "testperson" isok  
        "somelocalfile.txt" isok  
        "hdfs:/basedir/parentdir/testperson" isok  
        val data = "select * from testperson" result  
        to show data  
        val somerdddata = sc.textFile("hdfs:/basedir/parentdir/testperson")
        "somerdddata" defineas "name string,age int,weight double"  
        "select * from somerdddata" go  
        if you want to see the help of the environment itself, please type :help
        """)  
}
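Against the two-row testperson.txt above, a session might look like the following. This is a sketch of the expected behavior rather than a captured log; the printed row follows from the sample data and the tab/newline separators defined at the top of the script.

"create table testperson (name string,age int,weight double)" from "testperson.txt"
"select name, weight from testperson where age >= 12" go
// should print one tab-separated row:
// li    32.0
"testperson" isok    // true: select count(*) from testperson returns 2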

