Spark On HBase

POM

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <spark.version>2.4.0</spark.version>
    <scala.version>2.11</scala.version>
    <hbase.version>1.4.9</hbase.version>
    <fastjson.version>1.2.58</fastjson.version>
</properties>

<repositories>
    <repository>
        <id>apache-releases</id>
        <url>https://repository.apache.org/content/groups/public</url>
    </repository>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
    </repository>
    <repository>
        <snapshots>
            <enabled>false</enabled>
        </snapshots>
        <id>central</id>
        <name>Central Repository</name>
        <url>https://repo.maven.apache.org/maven2</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-spark</artifactId>
        <version>1.2.0-cdh5.16.1</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>${fastjson.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>

Code

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

object DataHandleSync {
    def main(args: Array[String]): Unit = {
        val sparkSession: SparkSession = SparkSession.builder()
            .config(new SparkConf().setAppName("DataHandleSync"))
            .getOrCreate()
        val sqlContext: SQLContext = sparkSession.sqlContext
        val sparkContext: SparkContext = sparkSession.sparkContext

        // HBase connection settings: ZooKeeper quorum, znode parent, and client port
        val hbaseConfig = HBaseConfiguration.create()
        hbaseConfig.set("hbase.zookeeper.quorum", "127.0.0.1")
        hbaseConfig.set("zookeeper.znode.parent", "/hbase")
        hbaseConfig.set("hbase.zookeeper.property.clientPort", "2181")
        val hbaseContext = new HBaseContext(sparkContext, hbaseConfig)

        // Full scan of the COLLISION table; each element is (rowkey, Result)
        val gidRDD = hbaseContext.hbaseRDD(TableName.valueOf("COLLISION"), new Scan())
        val gidRow = gidRDD.map(tuple2 => {
            val md5Phone = Bytes.toString(tuple2._1.get())       // row key
            val cell = tuple2._2.listCells().get(0)              // first cell of the row
            val gid = Bytes.toString(CellUtil.cloneValue(cell))  // cell value
            Row(md5Phone, gid)
        })
    }
}
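The gidRow RDD above is built but never consumed. As a minimal follow-up sketch (continuing inside main; the schema, field names, and temp-view name are illustrative assumptions, not part of the original job), the rows can be given a schema and queried through Spark SQL:

import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Illustrative only: wrap the (md5Phone, gid) rows in a DataFrame and run SQL over them
val schema = StructType(Seq(
    StructField("md5_phone", StringType, nullable = false),
    StructField("gid", StringType, nullable = true)
))
val gidDF = sparkSession.createDataFrame(gidRow, schema)
gidDF.createOrReplaceTempView("collision")
sparkSession.sql("SELECT gid, COUNT(*) AS cnt FROM collision GROUP BY gid").show()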

Note: after upgrading to Spark 2.x the compiler reports that Logging cannot be found, because the upgrade moved the class (it now lives at org.apache.spark.internal.Logging and is private to Spark). As a workaround, create the package org.apache.spark under the project's source root and add a Logging file to it:

package org.apache.spark

import org.apache.log4j.{LogManager, PropertyConfigurator}
import org.slf4j.{Logger, LoggerFactory}
import org.slf4j.impl.StaticLoggerBinder

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

/**
 * :: DeveloperApi ::
 * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows
 * logging messages at different levels using methods that only evaluate parameters lazily if the
 * log level is enabled.
 *
 * NOTE: DO NOT USE this class outside of Spark. It is intended as an internal utility.
 *       This will likely be changed or removed in future releases.
 */
@DeveloperApi
trait Logging {
    // Make the log field transient so that objects with Logging can
    // be serialized and used on another machine
    @transient private var log_ : Logger = null

    // Method to get the logger name for this object
    protected def logName = {
        // Ignore trailing $'s in the class names for Scala objects
        this.getClass.getName.stripSuffix("$")
    }

    // Method to get or create the logger for this object
    protected def log: Logger = {
        if (log_ == null) {
            initializeIfNecessary()
            log_ = LoggerFactory.getLogger(logName)
        }
        log_
    }

    // Log methods that take only a String
    protected def logInfo(msg: => String) {
        if (log.isInfoEnabled) log.info(msg)
    }

    protected def logDebug(msg: => String) {
        if (log.isDebugEnabled) log.debug(msg)
    }

    protected def logTrace(msg: => String) {
        if (log.isTraceEnabled) log.trace(msg)
    }

    protected def logWarning(msg: => String) {
        if (log.isWarnEnabled) log.warn(msg)
    }

    protected def logError(msg: => String) {
        if (log.isErrorEnabled) log.error(msg)
    }

    // Log methods that take Throwables (Exceptions/Errors) too
    protected def logInfo(msg: => String, throwable: Throwable) {
        if (log.isInfoEnabled) log.info(msg, throwable)
    }

    protected def logDebug(msg: => String, throwable: Throwable) {
        if (log.isDebugEnabled) log.debug(msg, throwable)
    }

    protected def logTrace(msg: => String, throwable: Throwable) {
        if (log.isTraceEnabled) log.trace(msg, throwable)
    }

    protected def logWarning(msg: => String, throwable: Throwable) {
        if (log.isWarnEnabled) log.warn(msg, throwable)
    }

    protected def logError(msg: => String, throwable: Throwable) {
        if (log.isErrorEnabled) log.error(msg, throwable)
    }

    protected def isTraceEnabled(): Boolean = {
        log.isTraceEnabled
    }

    private def initializeIfNecessary() {
        if (!Logging.initialized) {
            Logging.initLock.synchronized {
                if (!Logging.initialized) {
                    initializeLogging()
                }
            }
        }
    }

    private def initializeLogging() {
        // Don't use a logger in here, as this is itself occurring during initialization of a logger
        // If Log4j 1.2 is being used, but is not initialized, load a default properties file
        val binderClass = StaticLoggerBinder.getSingleton.getLoggerFactoryClassStr
        // This distinguishes the log4j 1.2 binding, currently
        // org.slf4j.impl.Log4jLoggerFactory, from the log4j 2.0 binding, currently
        // org.apache.logging.slf4j.Log4jLoggerFactory
        val usingLog4j12 = "org.slf4j.impl.Log4jLoggerFactory".equals(binderClass)

        lazy val isInInterpreter: Boolean = {
            try {
                val interpClass = classForName("org.apache.spark.repl.Main")
                interpClass.getMethod("interp").invoke(null) != null
            } catch {
                case _: ClassNotFoundException => false
            }
        }
        def classForName(className: String): Class[_] = {
            Class.forName(className, true, getContextOrSparkClassLoader)
            // scalastyle:on classforname
        }
        def getContextOrSparkClassLoader: ClassLoader =
            Option(Thread.currentThread().getContextClassLoader).getOrElse(getSparkClassLoader)
        def getSparkClassLoader: ClassLoader = getClass.getClassLoader

        if (usingLog4j12) {
            val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
            if (!log4j12Initialized) {
                // scalastyle:off println
                if (isInInterpreter) {
                    val replDefaultLogProps = "org/apache/spark/log4j-defaults-repl.properties"
                    Option(Utils.getSparkClassLoader.getResource(replDefaultLogProps)) match {
                        case Some(url) =>
                            PropertyConfigurator.configure(url)
                            System.err.println(s"Using Spark's repl log4j profile: $replDefaultLogProps")
                            System.err.println("To adjust logging level use sc.setLogLevel(\"INFO\")")
                        case None =>
                            System.err.println(s"Spark was unable to load $replDefaultLogProps")
                    }
                } else {
                    val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
                    Option(Utils.getSparkClassLoader.getResource(defaultLogProps)) match {
                        case Some(url) =>
                            PropertyConfigurator.configure(url)
                            System.err.println(s"Using Spark's default log4j profile: $defaultLogProps")
                        case None =>
                            System.err.println(s"Spark was unable to load $defaultLogProps")
                    }
                }
                // scalastyle:on println
            }
        }
        Logging.initialized = true

        // Force a call into slf4j to initialize it. Avoids this happening from multiple threads
        // and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html
        log
    }
}

private object Logging {
    @volatile private var initialized = false
    val initLock = new Object()
    try {
        // We use reflection here to handle the case where users remove the
        // slf4j-to-jul bridge order to route their logs to JUL.
        val bridgeClass = Utils.classForName("org.slf4j.bridge.SLF4JBridgeHandler")
        bridgeClass.getMethod("removeHandlersForRootLogger").invoke(null)
        val installed = bridgeClass.getMethod("isInstalled").invoke(null).asInstanceOf[Boolean]
        if (!installed) {
            bridgeClass.getMethod("install").invoke(null)
        }
    } catch {
        case e: ClassNotFoundException => // can't log anything yet so just fail silently
    }
}
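With this file compiled into the project, application classes can mix the trait back in exactly as they did on Spark 1.x. A minimal usage sketch (the package, class, and method names are made up for illustration):

package com.jzh.job

import org.apache.spark.Logging

// Hypothetical example class mixing in the re-created Logging trait
class CollisionScanJob extends Logging {
    def run(): Unit = {
        logInfo("starting COLLISION scan")
        try {
            // ... job body ...
        } catch {
            case e: Exception => logError("COLLISION scan failed", e)
        }
    }
}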

 
