In this session:
1. The write-ahead log (WAL) on the Executor side
2. Message replay
3. Other topics
StorageLevel.scala
Disk is used only when memory is insufficient:
class StorageLevel private(
    private var _useDisk: Boolean,
    private var _useMemory: Boolean,
    private var _useOffHeap: Boolean,
    private var _deserialized: Boolean,
    private var _replication: Int = 1)
  extends Externalizable {
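For context, receivers store their blocks with MEMORY_AND_DISK_SER_2 by default, which is exactly this "memory first, spill to disk" behavior plus a replica on a second executor. A minimal sketch of setting the storage level explicitly when creating an input stream (the host and port are placeholders):

import org.apache.spark.storage.StorageLevel

// The default receiver storage level made explicit: serialized, memory first
// with disk spill-over, replicated to 2 executors.
val lines = ssc.socketTextStream("localhost", 9999, // hypothetical endpoint
  StorageLevel.MEMORY_AND_DISK_SER_2)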
ReceiverSupervisorImpl.scala
/** Store the bytes of received data as a data block into Spark's memory. */
def pushBytes(
    bytes: ByteBuffer,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  pushAndReportBlock(ByteBufferBlock(bytes), metadataOption, blockIdOption)
}
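As a reminder of where this is called from: a user-defined Receiver hands data over via store(), which routes to pushBytes above. A minimal sketch (the receiver class and payload are hypothetical):

import java.nio.ByteBuffer
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class MyReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2) {
  def onStart(): Unit = {
    // store(ByteBuffer) ends up in ReceiverSupervisorImpl.pushBytes
    store(ByteBuffer.wrap("hello".getBytes("UTF-8")))
  }
  def onStop(): Unit = { }
}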
/** Store block and report it to driver */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
receivedBlockHandler
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
def enableReceiverLog(conf: SparkConf): Boolean = {
  conf.getBoolean(RECEIVER_WAL_ENABLE_CONF_KEY, false)
}
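Putting the two checks above together: turning on the receiver WAL requires both the config flag and a checkpoint directory, otherwise receivedBlockHandler throws at construction. A minimal sketch (the app name and checkpoint path are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("wal-demo") // hypothetical
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("hdfs://namenode:8020/checkpoint/wal-demo") // hypothetical path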
ReceivedBlockHandler
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
  var numRecords = None: Option[Long]
  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    case IteratorBlock(iterator) =>
      val countIterator = new CountingIterator(iterator)
      val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
        tellMaster = true)
      numRecords = countIterator.count
      putResult
    case ByteBufferBlock(byteBuffer) =>
      blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
    case o =>
      throw new SparkException(
        s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
  }
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}
def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  doPut(blockId, IteratorValues(values), level, tellMaster, effectiveStorageLevel)
}
FileBasedWriteAheadLog
write
/**
 * Write a byte buffer to the log file. This method synchronously writes the data in the
 * ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
 * to HDFS, and will be available for readers to read.
 */
def write(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
  var fileSegment: FileBasedWriteAheadLogSegment = null
  var failures = 0
  var lastException: Exception = null
  var succeeded = false
  while (!succeeded && failures < maxFailures) {
    try {
      fileSegment = getLogWriter(time).write(byteBuffer)
      if (closeFileAfterWrite) {
        resetWriter()
      }
      succeeded = true
    } catch {
      case ex: Exception =>
        lastException = ex
        logWarning("Failed to write to write ahead log")
        resetWriter()
        failures += 1
    }
  }
  if (fileSegment == null) {
    logError(s"Failed to write to write ahead log after $failures failures")
    throw lastException
  }
  fileSegment
}
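FileBasedWriteAheadLog is one implementation of Spark's pluggable WriteAheadLog contract (org.apache.spark.streaming.util.WriteAheadLog); write above returns a record handle that later allows reading exactly that segment back. A sketch of the abstract interface, for orientation:

import java.nio.ByteBuffer

// Opaque pointer to one written record; the file-based implementation is
// FileBasedWriteAheadLogSegment (path + offset + length).
abstract class WriteAheadLogRecordHandle extends Serializable

abstract class WriteAheadLog {
  // Durably write a record; the returned handle locates it for replay.
  def write(record: ByteBuffer, time: Long): WriteAheadLogRecordHandle
  // Read one record back by its handle.
  def read(handle: WriteAheadLogRecordHandle): ByteBuffer
  // Read every record, in order, across all log files (used on recovery).
  def readAll(): java.util.Iterator[ByteBuffer]
  // Drop log data older than threshTime.
  def clean(threshTime: Long, waitForCompletion: Boolean): Unit
  def close(): Unit
}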
/** Get the current log writer while taking care of rotation */
private def getLogWriter(currentTime: Long): FileBasedWriteAheadLogWriter = synchronized {
  if (currentLogWriter == null || currentTime > currentLogWriterStopTime) {
    resetWriter()
    currentLogPath.foreach {
      pastLogs += LogInfo(currentLogWriterStartTime, currentLogWriterStopTime, _)
    }
    currentLogWriterStartTime = currentTime
    currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000)
    val newLogPath = new Path(logDirectory,
      timeToLogFile(currentLogWriterStartTime, currentLogWriterStopTime))
    currentLogPath = Some(newLogPath.toString)
    currentLogWriter = new FileBasedWriteAheadLogWriter(currentLogPath.get, hadoopConf)
  }
  currentLogWriter
}
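So a new log file is opened whenever the rolling interval elapses, and the finished file is remembered in pastLogs. The file names encode the covered time range; a small worked sketch of that arithmetic (the timestamp is hypothetical, and the name follows FileBasedWriteAheadLog's log-<start>-<stop> convention):

// With a 60s rolling interval, a write at time t lands in a file covering
// [t, t + 60000) milliseconds.
val rollingIntervalSecs = 60
val startTime = 1462251190000L // hypothetical write time
val stopTime = startTime + rollingIntervalSecs * 1000
val logFile = s"log-$startTime-$stopTime" // e.g. log-1462251190000-1462251250000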
readAll
def readAll(): JIterator[ByteBuffer] = synchronized {
  val logFilesToRead = pastLogs.map { _.path } ++ currentLogPath
  logInfo("Reading from the logs:\n" + logFilesToRead.mkString("\n"))
  def readFile(file: String): Iterator[ByteBuffer] = {
    logDebug(s"Creating log reader with $file")
    val reader = new FileBasedWriteAheadLogReader(file, hadoopConf)
    CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _)
  }
  if (!closeFileAfterWrite) {
    logFilesToRead.iterator.map(readFile).flatten.asJava
  } else {
    // For performance gains, it makes sense to parallelize the recovery if
    // closeFileAfterWrite = true
    seqToParIterator(threadpool, logFilesToRead, readFile).asJava
  }
}
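readAll is what replays unprocessed data after a driver restart. On the user side, recovery is triggered through checkpoint-based context creation; a minimal sketch (conf is the SparkConf from the earlier sketch, the checkpoint path and batch interval are placeholders, and the stream definitions are elided):

import org.apache.spark.streaming.{Seconds, StreamingContext}

val checkpointDir = "hdfs://namenode:8020/checkpoint/wal-demo" // hypothetical

def createContext(): StreamingContext = {
  val ssc = new StreamingContext(conf, Seconds(5))
  ssc.checkpoint(checkpointDir)
  // define input streams and output operations here
  ssc
}

// Rebuilds the context from the checkpoint if one exists; blocks logged in
// the WAL are then re-read (via readAll) and fed back into the pipeline.
val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)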
FileBasedWriteAheadLogReader
private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration)
  extends Iterator[ByteBuffer] with Closeable with Logging {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream
  private var nextItem: Option[ByteBuffer] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      return false
    }
    if (nextItem.isDefined) {
      // handle the case where hasNext is called without calling next
      true
    } else {
      try {
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(ByteBuffer.wrap(buffer))
        logTrace("Read next item " + nextItem.get)
        true
      } catch {
        case e: EOFException =>
          logDebug("Error reading next item, EOF reached", e)
          close()
          false
        case e: IOException =>
          logWarning("Error while trying to read data. If the file was deleted, " +
            "this should be okay.", e)
          close()
          if (HdfsUtils.checkFileExists(path, conf)) {
            // If file exists, this could be a legitimate error
            throw e
          } else {
            // File was deleted. This can occur when the daemon cleanup thread takes time to
            // delete the file during recovery.
            false
          }
        case e: Exception =>
          logWarning("Error while trying to read data from HDFS.", e)
          close()
          throw e
      }
    }
  }
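hasNext implies the on-disk record format: a 4-byte length header followed by the payload. A writer-side sketch mirroring what FileBasedWriteAheadLogWriter must produce (the helper name is hypothetical):

import java.io.DataOutputStream

// Length-prefixed records: readInt()/readFully() on the reader side pair
// with writeInt()/write() here.
def writeRecord(out: DataOutputStream, payload: Array[Byte]): Unit = {
  out.writeInt(payload.length)
  out.write(payload)
}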
getInputStream
def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
  val dfsPath = new Path(path)
  val dfs = getFileSystemForPath(dfsPath, conf)
  if (dfs.isFile(dfsPath)) {
    try {
      dfs.open(dfsPath)
    } catch {
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  } else {
    null
  }
}
DirectKafkaInputDStream
@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
  val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
  // Either.fold would confuse @tailrec, do it manually
  if (o.isLeft) {
    val err = o.left.get.toString
    if (retries <= 0) {
      throw new SparkException(err)
    } else {
      log.error(err)
      Thread.sleep(kc.config.refreshLeaderBackoffMs)
      latestLeaderOffsets(retries - 1)
    }
  } else {
    o.right.get
  }
}
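This retry-with-backoff is part of the direct (receiver-less) Kafka approach, where each batch is pinned to explicit offset ranges and can therefore be replayed deterministically, with no receiver WAL needed. A minimal sketch of creating such a stream (the broker address and topic are placeholders):

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

val kafkaParams = Map("metadata.broker.list" -> "broker1:9092") // hypothetical
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc, kafkaParams, Set("events")) // hypothetical topic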
KafkaRDD
private[kafka] class KafkaRDD[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_]: ClassTag,
  T <: Decoder[_]: ClassTag,
  R: ClassTag] private[spark] (
    sc: SparkContext,
    kafkaParams: Map[String, String],
    val offsetRanges: Array[OffsetRange],
    leaders: Map[TopicAndPartition, (String, Int)],
    messageHandler: MessageAndMetadata[K, V] => R
  ) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {

  override def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (o, i) =>
      val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
      new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)
    }.toArray
  }
getPreferredLocations
override def getPreferredLocations(thePart: Partition): Seq[String] = {
  val part = thePart.asInstanceOf[KafkaRDDPartition]
  // TODO is additional hostname resolution necessary here
  Seq(part.host)
}
connectLeader
// The idea is to use the provided preferred host, except on task retry attempts,
// to minimize the number of kafka metadata requests
private def connectLeader: SimpleConsumer = {
  if (context.attemptNumber > 0) {
    kc.connectLeader(part.topic, part.partition).fold(
      errs => throw new SparkException(
        s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
          errs.mkString("\n")),
      consumer => consumer
    )
  } else {
    kc.connect(part.host, part.port)
  }
}
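Since KafkaRDD mixes in HasOffsetRanges, the exact range each batch covers is visible downstream, which is what makes targeted message replay possible. A short sketch of reading those ranges back (stream is the direct stream from the earlier sketch):

import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

stream.foreachRDD { rdd =>
  val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  ranges.foreach { r =>
    // Persist these, and the batch can later be recomputed from the same offsets.
    println(s"${r.topic} ${r.partition} ${r.fromOffset} -> ${r.untilOffset}")
  }
}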