1. Calling textFile()
def textFile(
path: String,
minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
assertNotStopped()
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString)
}
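As a quick sanity check, here is a minimal sketch of how minPartitions flows into the call (assuming a local[4] application and a hypothetical input file /tmp/data.txt; it only illustrates the two call forms above):
import org.apache.spark.{SparkConf, SparkContext}

object TextFileDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("textFile-demo"))

    // no second argument: minPartitions = defaultMinPartitions = min(defaultParallelism, 2)
    val rdd1 = sc.textFile("/tmp/data.txt")
    // explicit argument: minPartitions = 8, which is only a lower-bound hint passed to getSplits
    val rdd2 = sc.textFile("/tmp/data.txt", 8)

    println(s"default: ${rdd1.getNumPartitions}, hint 8: ${rdd2.getNumPartitions}")
    sc.stop()
  }
}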
2. Computing the minimum number of partitions (minPartitions)
def defaultMinPartitions: Int = math.min(defaultParallelism, 2)
2.1 Resolving defaultParallelism
def defaultParallelism: Int = {
assertNotStopped()
taskScheduler.defaultParallelism
}
The defaultParallelism called here is a method on the TaskScheduler trait, so we have to look at its implementation class, TaskSchedulerImpl:
override def defaultParallelism(): Int = backend.defaultParallelism()
defaultParallelism is in turn a method on the SchedulerBackend trait, whose implementations differ between cluster mode (YARN/Mesos) and local mode (local[N]):
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
org.apache.spark.scheduler.cluster.mesos.MesosSchedulerBackend
org.apache.spark.scheduler.local.LocalBackend
Cluster mode (YARN):
override def defaultParallelism(): Int = {
conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
}
Cluster mode (Mesos):
override def defaultParallelism(): Int = sc.conf.getInt("spark.default.parallelism", 8)
Local mode:
The value is taken from spark.default.parallelism, falling back to the number of local threads (totalCores):
override def defaultParallelism(): Int =
scheduler.conf.getInt("spark.default.parallelism", totalCores)
org.apache.spark.SparkConf#getInt
def getInt(key: String, defaultValue: Int): Int = {
getOption(key).map(_.toInt).getOrElse(defaultValue)
}
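A tiny sketch of getInt's fallback behavior, e.g. in spark-shell (nothing here beyond the SparkConf API shown above):
import org.apache.spark.SparkConf

val conf = new SparkConf()
conf.getInt("spark.default.parallelism", 4)   // key not set -> returns the default value 4
conf.set("spark.default.parallelism", "16")
conf.getInt("spark.default.parallelism", 4)   // key set -> returns 16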
2.2 Default value of minPartitions:
minPartitions = min(defaultParallelism, 2)
The default parallelism (defaultParallelism):
If spark.default.parallelism is set, its value is used; otherwise the backend-specific default applies:
conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
scheduler.conf.getInt("spark.default.parallelism", totalCores)
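A worked example (assuming a hypothetical local[4] application):
spark.default.parallelism unset: defaultParallelism = totalCores = 4, so minPartitions = min(4, 2) = 2
spark.default.parallelism = 1:   defaultParallelism = 1, so minPartitions = min(1, 2) = 1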
3. Analyzing hadoopFile
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString)
def hadoopFile[K, V](
path: String,
inputFormatClass: Class[_ <: InputFormat[K, V]],
keyClass: Class[K],
valueClass: Class[V],
minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
assertNotStopped()
// A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
new HadoopRDD(
this,
confBroadcast,
Some(setInputPathsFunc),
inputFormatClass,
keyClass,
valueClass,
minPartitions).setName(path)
}
3.1 The HadoopRDD class
The minPartitions member variable is documented as:
Minimum number of HadoopRDD partitions (Hadoop Splits) to generate
To get the number of partitions of an RDD, call partitions.length or getNumPartitions:
/**
* Get the array of partitions of this RDD, taking into account whether the
* RDD is checkpointed or not.
*/
final def partitions: Array[Partition] = {
checkpointRDD.map(_.partitions).getOrElse {
if (partitions_ == null) {
//getPartitions returns Array[HadoopPartition] for a HadoopRDD
partitions_ = getPartitions
}
partitions_
}
}
/**
* Returns the number of partitions of this RDD.
*/
@Since("1.6.0")
final def getNumPartitions: Int = partitions.length
The partitions method above calls getPartitions, which HadoopRDD overrides as follows:
override def getPartitions: Array[Partition] = {
val jobConf = getJobConf()
// add the credentials here as this can be called before SparkContext initialized
SparkHadoopUtil.get.addCredentials(jobConf)
val inputFormat = getInputFormat(jobConf)
//returns an InputSplit[] (FileSplit instances for FileInputFormat)
val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
val array = new Array[Partition](inputSplits.size)
for (i <- 0 until inputSplits.size) {
array(i) = new HadoopPartition(id, i, inputSplits(i))
}
array
}
Next, let's dig into inputSplits = inputFormat.getSplits(jobConf, minPartitions).
getSplits is declared on the org.apache.hadoop.mapred.InputFormat interface, so we need to look at the implementation used here: org.apache.hadoop.mapred.FileInputFormat#getSplits
public InputSplit[] getSplits(JobConf job, int numSplits)
throws IOException {
StopWatch sw = new StopWatch().start();
FileStatus[] files = listStatus(job);
// Save the number of input files for metrics/loadgen
//store the input file count (files.length) in the job configuration
job.setLong(NUM_INPUT_FILES, files.length);
long totalSize = 0; // compute total size
for (FileStatus file: files) { // check we have valid files
if (file.isDirectory()) {
throw new IOException("Not a file: "+ file.getPath());
}
totalSize += file.getLen(); //accumulate the size of each file under the path (in bytes)
}
//after the loop, totalSize holds the combined size (in bytes) of all input files
long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
//minSplitSize defaults to 1
long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
// generate splits
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
NetworkTopology clusterMap = new NetworkTopology();
//loop over files and generate splits for every file under the input path
for (FileStatus file: files) {
Path path = file.getPath(); //file path
long length = file.getLen(); //file size in bytes
if (length != 0) {
FileSystem fs = path.getFileSystem(job);
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
if (isSplitable(fs, path)) {
long blockSize = file.getBlockSize(); //HDFS block size of this file
//internally computes Math.max(minSize, Math.min(goalSize, blockSize))
long splitSize = computeSplitSize(goalSize, minSize, blockSize);
long bytesRemaining = length; //bytes of this file not yet assigned to a split
//SPLIT_SLOP = 1.1 (10% slop)
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
length-bytesRemaining, splitSize, clusterMap);
//makeSplit returns a FileSplit, which is appended to splits
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
splitHosts[0], splitHosts[1]));
bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length
- bytesRemaining, bytesRemaining, clusterMap);
//append the final split of this file to the ArrayList<FileSplit>
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
splitHosts[0], splitHosts[1]));
}
} else {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,0,length,clusterMap);
splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
}
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
sw.stop();
if (LOG.isDebugEnabled()) {
LOG.debug("Total # of splits generated by getSplits: " + splits.size()
+ ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
}
//return the splits as a FileSplit[] array
return splits.toArray(new FileSplit[splits.size()]);
}
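To make the arithmetic concrete, here is a small standalone Scala sketch that mirrors the split-size math above (computeSplitSize plus the SPLIT_SLOP loop) for a path containing a single hypothetical file, so totalSize equals that file's length; it only illustrates the formula and is not Hadoop's actual code:
object SplitMath {
  val SPLIT_SLOP = 1.1 // same 10% slop constant as FileInputFormat

  def computeSplitSize(goalSize: Long, minSize: Long, blockSize: Long): Long =
    math.max(minSize, math.min(goalSize, blockSize))

  // lengths of the splits FileInputFormat would generate for one splittable file
  def splitLengths(fileLen: Long, numSplits: Int, blockSize: Long, minSize: Long = 1L): Seq[Long] = {
    val goalSize = fileLen / (if (numSplits == 0) 1 else numSplits)
    val splitSize = computeSplitSize(goalSize, minSize, blockSize)
    val splits = scala.collection.mutable.ArrayBuffer.empty[Long]
    var bytesRemaining = fileLen
    while (bytesRemaining.toDouble / splitSize > SPLIT_SLOP) {
      splits += splitSize
      bytesRemaining -= splitSize
    }
    if (bytesRemaining != 0) splits += bytesRemaining
    splits.toSeq
  }

  def main(args: Array[String]): Unit = {
    val mb = 1024L * 1024L
    // 300 MB file, minPartitions (numSplits) = 2, 128 MB blocks:
    // goalSize = 150 MB, splitSize = max(1, min(150 MB, 128 MB)) = 128 MB
    // => splits of 128 MB, 128 MB and 44 MB, i.e. 3 partitions
    println(splitLengths(300 * mb, 2, 128 * mb).map(_ / mb))
  }
}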
The FileStatus class
encapsulates a file's metadata: path, length, replication factor, block size, modification time, permissions, and so on.
public class FileStatus implements Writable, Comparable {
private Path path;
private long length;
private boolean isdir;
private short block_replication;
private long blocksize;
private long modification_time;
private long access_time;
private FsPermission permission;
private String owner;
private String group;
private Path symlink;
...
}
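For reference, a minimal sketch of reading these fields through the FileSystem API, just as listStatus(job) does inside getSplits (assuming a default Hadoop Configuration and a hypothetical path /tmp/data):
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
for (status <- fs.listStatus(new Path("/tmp/data"))) {
  println(s"${status.getPath} len=${status.getLen} " +
    s"replication=${status.getReplication} blockSize=${status.getBlockSize}")
}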
4. Summary
4.1 First, compute the minimum number of partitions, either passed in explicitly or taken from the default:
minPartitions = math.min(defaultParallelism, 2)
defaultParallelism is determined by spark.default.parallelism or by the core count,
with priority spark.default.parallelism > cores
4.2 Compute goalSize and minSize
//totalSize is the combined size (in bytes) of all files under the input path
long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
//minSplitSize defaults to 1
long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.
FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
4.3 Compute the split size
//internally computes Math.max(minSize, Math.min(goalSize, blockSize))
long splitSize = computeSplitSize(goalSize, minSize, blockSize);
4.4 Keep cutting the file while bytesRemaining / splitSize > SPLIT_SLOP (1.1);
once the remaining bytes are at most 1.1 × splitSize, they form the final split.
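A concrete SPLIT_SLOP example (assuming splitSize = 128 MB): a 140 MB file gives 140 / 128 ≈ 1.09 ≤ 1.1, so it is not cut and becomes a single 140 MB split; a 142 MB file gives 142 / 128 ≈ 1.11 > 1.1, so it is cut into a 128 MB split plus a 14 MB remainder split.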