Spark Development Environment Setup
1. Download a development tool: Eclipse Luna or IntelliJ IDEA (if the Scala IDE for Eclipse downloaded from the official site does not work, use Eclipse Luna instead).
2. Install JDK 1.7 and configure its environment variables.
3. Install the Scala language package: download scala-2.10.4.msi and set SCALA_HOME. Also download Spark 1.6 and configure the Spark environment variables.
4. Install the Scala plugin in Eclipse:
Figure 1
Figure 2
5. Install Maven and configure the Maven environment variables:
Figure 3
Install m2e-scala from the update site: http://alchim31.free.fr/m2e-scala/update-site/
As the plugin names found in Figure 4 show, this update site also bundles m2e, the Maven integration that Eclipse itself needs. If Eclipse has no Maven plugin yet, select and install all of the entries; if it already has one, install only the third entry, Maven Integration for Scala IDE.
After Maven Integration for Scala IDE has been installed, entering the same URL again no longer shows Maven Integration for Scala IDE in the list of installable plugins.
Right-click -> New in Eclipse: if the Scala wizards shown below appear, the Scala plugin was installed successfully.
Building a Spark project with Maven + Scala:
1. Right-click -> New and create a Maven project.
2. Right-click the project name -> Configure -> Add Scala Nature to turn the Maven project into a Scala project.
3. A new project uses the Scala 2.11 library by default, while Scala 2.10 was installed above, so the pom file reports an error.
Fix: right-click the project -> Build Path -> Configure Build Path and change the Scala library container to 2.10; the pom should pin the matching versions, as sketched below.
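A minimal sketch of the corresponding pom.xml dependencies, assuming Scala 2.10.4 and Spark 1.6.0 (the version numbers are illustrative and must match the local installation):

<properties>
  <scala.version>2.10.4</scala.version>
  <spark.version>1.6.0</spark.version>
</properties>

<dependencies>
  <!-- Scala library matching the 2.10 container chosen in the build path -->
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
  </dependency>
  <!-- Spark artifacts built against Scala 2.10 -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>${spark.version}</version>
  </dependency>
</dependencies>

The rest of the article lists a KafkaManager class (and the customized KafkaCluster it relies on) that manages Kafka direct-stream offsets in ZooKeeper.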
// Imports needed by KafkaManager; the class is assumed to sit in the same package
// as the customized KafkaCluster defined further down.
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.log4j.LogManager
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{ HasOffsetRanges, KafkaUtils }

import scala.reflect.ClassTag

// LeaderOffset lives in the companion of the customized KafkaCluster below
import KafkaCluster.LeaderOffset

class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {

  @transient lazy val log = LogManager.getLogger(this.getClass)

  private val kc = new KafkaCluster(kafkaParams)

  /**
   * Create the direct stream.
   * @param ssc
   * @param kafkaParams
   * @param topics
   * @tparam K
   * @tparam V
   * @tparam KD
   * @tparam VD
   * @return
   */
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = kafkaParams.get("group.id").get
    // Before reading offsets from ZooKeeper, bring them up to date if necessary
    setOrUpdateOffsets(topics, groupId)

    // Read the offsets from ZooKeeper and start consuming messages from there
    val messages = {
      val partitionsE = kc.getPartitions(topics)
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get.mkString("\n")}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft)
        throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get.mkString("\n")}")
      val consumerOffsets = consumerOffsetsE.right.get
      KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
        ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
    }
    messages
  }
  /**
   * Before creating the stream, update the consumer offsets according to what the
   * group has actually consumed.
   * @param topics
   * @param groupId
   */
  private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
    topics.foreach(topic => {
      var hasConsumed = true
      val partitionsE = kc.getPartitions(Set(topic))
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get.mkString("\n")}")
      val partitions: Set[TopicAndPartition] = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft) hasConsumed = false
      log.info("consumerOffsetsE.isLeft: " + consumerOffsetsE.isLeft)
      if (hasConsumed) {
        log.warn("the group has consumed this topic before")
        /**
         * The offsets saved in ZK may be stale, i.e. Kafka's log-retention policy
         * has already deleted the segment files containing them. To handle this,
         * compare the ZK consumerOffsets with the earliestLeaderOffsets: if
         * consumerOffsets is smaller than earliestLeaderOffsets, it is stale, and
         * consumerOffsets is reset to earliestLeaderOffsets.
         */
        val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
        if (earliestLeaderOffsetsE.isLeft)
          throw new SparkException(s"get earliest offsets failed: ${earliestLeaderOffsetsE.left.get.mkString("\n")}")
        val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
        val consumerOffsets = consumerOffsetsE.right.get
        // Possibly only some partitions' consumerOffsets are stale, so update only
        // the stale partitions to earliestLeaderOffsets
        var offsets: Map[TopicAndPartition, Long] = Map()
        consumerOffsets.foreach({ case (tp, n) =>
          val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
          if (n < earliestLeaderOffset) {
            log.warn("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
              " offsets are stale, updating to " + earliestLeaderOffset)
            offsets += (tp -> earliestLeaderOffset)
          }
        })
        log.warn("offsets: " + consumerOffsets)
        if (offsets.nonEmpty) {
          kc.setConsumerOffsets(groupId, offsets)
        }
      } else {
        log.warn("the group has not consumed this topic yet")
        // Start from the earliest or latest leader offsets, per auto.offset.reset
        val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
        val leaderOffsets: Map[TopicAndPartition, LeaderOffset] =
          if (reset == Some("smallest"))
            kc.getEarliestLeaderOffsets(partitions).right.get
          else
            kc.getLatestLeaderOffsets(partitions).right.get
        val offsets = leaderOffsets.map { case (tp, offset) => (tp, offset.offset) }
        log.warn("offsets: " + offsets)
        kc.setConsumerOffsets(groupId, offsets)
      }
    })
  }
  /**
   * Update the consumer offsets stored in ZooKeeper.
   * @param rdd
   */
  def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams.get("group.id").get
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        log.error(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}
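Before the customized KafkaCluster that KafkaManager depends on, here is a minimal usage sketch of the class above. It is only a sketch: the broker list, group id, and topic name are placeholder assumptions, not values from this article.

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }

object DirectKafkaDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DirectKafkaDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))

    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "localhost:9092", // placeholder broker list
      "group.id" -> "demo-group",                 // placeholder consumer group
      "auto.offset.reset" -> "smallest")

    val km = new KafkaManager(kafkaParams)
    // Offsets are read from ZooKeeper (repaired or initialized first if needed)
    val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("test")) // placeholder topic

    messages.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        rdd.map(_._2).take(10).foreach(println) // stand-in for real batch processing
        // Commit offsets only after the batch has been processed, so a restart
        // resumes from the last fully processed offset
        km.updateZKOffsets(rdd)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

Committing after processing gives at-least-once semantics: if the driver fails between processing the batch and committing, that batch is replayed on restart.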
// The customized KafkaCluster: essentially Spark's private[spark] KafkaCluster with
// its visibility widened so the offset-management methods used above are callable.
import kafka.api._
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import org.apache.spark.annotation.DeveloperApi

import scala.util.Random

@DeveloperApi
class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
  import KafkaCluster.{ Err, LeaderOffset, SimpleConsumerConfig }

  @transient private var _config: SimpleConsumerConfig = null

  // Build the consumer config lazily and at most once
  def config: SimpleConsumerConfig = this.synchronized {
    if (_config == null) {
      _config = SimpleConsumerConfig(kafkaParams)
    }
    _config
  }

  def connect(host: String, port: Int): SimpleConsumer =
    new SimpleConsumer(host, port, config.socketTimeoutMs,
      config.socketReceiveBufferBytes, config.clientId)

  def connectLeader(topic: String, partition: Int): Either[Err, SimpleConsumer] =
    findLeader(topic, partition).right.map(hp => connect(hp._1, hp._2))

  def findLeader(topic: String, partition: Int): Either[Err, (String, Int)] = {
    val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion,
      0, config.clientId, Seq(topic))
    val errs = new Err
    // Ask the seed brokers, in random order, until one returns usable metadata
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp: TopicMetadataResponse = consumer.send(req)
      resp.topicsMetadata.find(_.topic == topic).flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.find(_.partitionId == partition)
      }.foreach { pm: PartitionMetadata =>
        pm.leader.foreach { leader =>
          return Right((leader.host, leader.port))
        }
      }
    }
    Left(errs)
  }

  def findLeaders(
      topicAndPartitions: Set[TopicAndPartition]): Either[Err, Map[TopicAndPartition, (String, Int)]] = {
    val topics = topicAndPartitions.map(_.topic)
    val response = getPartitionMetadata(topics).right
    val answer = response.flatMap { tms: Set[TopicMetadata] =>
      val leaderMap = tms.flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.flatMap { pm: PartitionMetadata =>
          val tp = TopicAndPartition(tm.topic, pm.partitionId)
          if (topicAndPartitions(tp)) {
            pm.leader.map { l =>
              tp -> (l.host -> l.port)
            }
          } else {
            None