I added two partitioning algorithms. My original approach was to declare a loadMetisFile method in the PartitionStrategy trait and then, inside GraphImpl's

override def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] = {
  partitionBy(partitionStrategy, edges.partitions.size)
}

cast the strategy with asInstanceOf[PartitionStrategy] and call loadMetisFile to load the data. This only worked the first time, though. When the job later reached the getPartition call inside

val newEdges = edges.withPartitionsRDD(edges.map { e =>
  val part: PartitionID = partitionStrategy.getPartition(e.srcId, e.dstId, numPartitions)
  (part, (e.srcId, e.dstId, e.attr))
}

the lookup no longer worked. The reason is that the initialization, val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)) in Analytics, runs on the master. When the child tasks later execute in parallel, the partitionStrategy object initialized on the master has to be deserialized on each worker, but the map inside MetisPartition does not survive serialization, so it arrives empty and getPartition on the workers fails on the empty map.
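The failure can be reproduced outside Spark. For a case object, deserialization on a worker resolves back to that JVM's own singleton, whose map was never loaded there; the observable effect is the same as a field that does not survive serialization. A minimal standalone sketch (the class and field names are made up, and @transient stands in for the state that is lost):

import java.io._
import scala.collection.mutable.HashMap

// Hypothetical stand-in for a strategy whose map is filled on the driver.
class MapStrategy extends Serializable {
  // @transient mimics state that is not carried along with the object.
  @transient var metisMap: HashMap[Int, Int] = new HashMap[Int, Int]
}

object SerializationDemo {
  def main(args: Array[String]): Unit = {
    val s = new MapStrategy
    s.metisMap.put(1, 5) // "driver-side" load

    val buf = new ByteArrayOutputStream
    new ObjectOutputStream(buf).writeObject(s)
    val copy = new ObjectInputStream(new ByteArrayInputStream(buf.toByteArray))
      .readObject().asInstanceOf[MapStrategy]

    println(copy.metisMap) // null: the driver-side contents never arrive
  }
}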
I then tried changing it to

getPartition() {
  if (map.size == 0) loadMetisFile()
}

but because partitionStrategy is a singleton object, multiple task threads executed if (map.size == 0) loadMetisFile() at the same time, which caused a race.
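Had I kept the load-on-first-use approach, Scala's lazy val would have been one way to make it safe: its initializer runs under the object's monitor, so concurrent task threads block until a single load finishes. A sketch, with the file-reading body elided:

import scala.collection.mutable.HashMap

object LazyMetisSketch {
  // The compiler synchronizes lazy val initialization, so the file is read
  // exactly once per JVM even when many task threads arrive at once.
  lazy val metisMap: HashMap[Int, Int] = {
    val m = new HashMap[Int, Int]
    // ... read the METIS assignment file into m (as in loadMetisFile) ...
    m
  }
}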
In the end I moved the loadMetisFile code directly into the initialization body of the PartitionStrategy trait, so the file is read when the strategy object itself is constructed, no matter whether that happens on the master or on a worker node. The code follows:

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.graphx
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.conf.Configuration
import java.net.URI
import org.apache.hadoop.fs.FSDataInputStream
import java.io.InputStreamReader
import java.io.BufferedReader
import scala.collection.mutable.HashMap
/**
* Represents the way edges are assigned to edge partitions based on their source and destination
* vertex IDs.
*/
trait PartitionStrategy extends Serializable {
  /** Returns the partition number for a given edge. */
  def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID

  def loadMetisFile()

  // METIS partition assignments, keyed by 1-based vertex number. This
  // initializer runs whenever a concrete strategy object is constructed, so
  // the file is loaded on the master and on every worker JVM alike.
  val metisMap = new HashMap[Int, Int]
  try {
    val uri = "hdfs://192.168.0.100:9000/test/Web_metis_Final_Input.txt.part.6"
    val hdfs = FileSystem.get(URI.create(uri), new Configuration)
    val fp: FSDataInputStream = hdfs.open(new Path(uri))
    val isr = new InputStreamReader(fp)
    val bReader = new BufferedReader(isr)
    var id: Int = 1
    var line: String = bReader.readLine()
    while (line != null) {
      if (!"".equals(line)) {
        // Line i of the METIS output file holds the partition of vertex i.
        metisMap.put(id, line.toInt)
        id = id + 1
      }
      line = bReader.readLine()
    }
    bReader.close() // closing the outermost reader also closes isr and fp
    println("metisMap size: " + metisMap.size)
    // metisMap.foreach { case (e, i) => println(e, i) }
  } catch {
    case ex: Exception => // Handle missing file
      ex.printStackTrace()
  }
}
/**
* Collection of built-in [[PartitionStrategy]] implementations.
*/
object PartitionStrategy {
/**
* Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix,
* guaranteeing a `2 * sqrt(numParts)` bound on vertex replication.
*
* Suppose we have a graph with 12 vertices that we want to partition
* over 9 machines. We can use the following sparse matrix representation:
*
* <pre>
* __________________________________
* v0 | P0 * | P1 | P2 * |
* v1 | **** | * | |
* v2 | ******* | ** | **** |
* v3 | ***** | * * | * |
* ----------------------------------
* v4 | P3 * | P4 *** | P5 ** * |
* v5 | * * | * | |
* v6 | * | ** | **** |
* v7 | * * * | * * | * |
* ----------------------------------
* v8 | P6 * | P7 * | P8 * *|
* v9 | * | * * | |
* v10 | * | ** | * * |
* v11 | * <-E | *** | ** |
* ----------------------------------
* </pre>
*
* The edge denoted by `E` connects `v11` with `v1` and is assigned to processor `P6`. To get the
* processor number we divide the matrix into `sqrt(numParts)` by `sqrt(numParts)` blocks. Notice
* that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3,
* P6)` or the last
* row of blocks `(P6, P7, P8)`. As a consequence we can guarantee that `v11` will need to be
* replicated to at most `2 * sqrt(numParts)` machines.
*
* Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work
* balance. To improve balance we first multiply each vertex id by a large prime to shuffle the
* vertex locations.
*
* When the number of partitions requested is not a perfect square we use a slightly different
* method where the last column can have a different number of rows than the others while still
* maintaining the same size per block.
*/
case object EdgePartition2D extends PartitionStrategy {
def loadMetisFile() {
}
override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt
val mixingPrime: VertexId = 1125899906842597L
if (numParts == ceilSqrtNumParts * ceilSqrtNumParts) {
// Use the old method for perfect squares to ensure we get the same results
val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt
val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt
(col * ceilSqrtNumParts + row) % numParts
} else {
// Otherwise use new method
val cols = ceilSqrtNumParts
val rows = (numParts + cols - 1) / cols
val lastColRows = numParts - rows * (cols - 1)
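// e.g. numParts = 7: cols = 3, rows = 3, lastColRows = 1 (3 + 3 + 1 = 7 blocks)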
val col = (math.abs(src * mixingPrime) % numParts / rows).toInt
val row = (math.abs(dst * mixingPrime) % (if (col < cols - 1) rows else lastColRows)).toInt
col * rows + row
}
}
}
/**
* Assigns edges to partitions using only the source vertex ID, colocating edges with the same
* source.
*/
case object EdgePartition1D extends PartitionStrategy {
def loadMetisFile() {
}
override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
val mixingPrime: VertexId = 1125899906842597L
(math.abs(src * mixingPrime) % numParts).toInt
}
}
/**
* Assigns edges to partitions by hashing the source and destination vertex IDs, resulting in a
* random vertex cut that colocates all same-direction edges between two vertices.
*/
case object RandomVertexCut extends PartitionStrategy {
def loadMetisFile() {
}
override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
// println("RandomVertexCut src: " + src + ", src hashcode: "+ src.hashCode + ", result: " +
// + math.abs((src, dst).hashCode()) % numParts)
math.abs((src, dst).hashCode()) % numParts
}
}
/**
* Assigns edges to partitions by hashing the source and destination vertex IDs in a canonical
* direction, resulting in a random vertex cut that colocates all edges between two vertices,
* regardless of direction.
*/
case object CanonicalRandomVertexCut extends PartitionStrategy {
def loadMetisFile() {
}
override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
if (src < dst) {
math.abs((src, dst).hashCode()) % numParts
} else {
math.abs((dst, src).hashCode()) % numParts
}
}
}
/**
* An experimental range partitioner.
*/
case object RangePartition extends PartitionStrategy {
  def loadMetisFile() {
  }
  override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
    // max is hard-coded to the key range of the test graph; numParts must not
    // exceed it, or keyRange becomes 0 and the division below fails.
    val max = 6
    val keyRange = max / numParts
    val part = (src.hashCode() % max) / keyRange
    // println("liuqiang src: " + src + ", src hashCode: " + src.hashCode + ", partition: " + part +
    //   ", result: " + Math.max(0, Math.min(numParts - 1, part)))
    Math.max(0, Math.min(numParts - 1, part))
  }
}
case object MetisPartition extends PartitionStrategy {
/**
 * The body is empty because there was no obvious way to obtain a reference to the
 * SparkContext here: Spark allows only one SparkContext per application, a new one
 * cannot simply be created, and so sc.textFile("hdfs://XXX") could not be used.
 */
def loadMetisFile() {
}
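// A possible alternative (a sketch only, not tested here): read the file once on
// the driver and ship the map to the tasks with a broadcast variable instead of
// re-reading HDFS in every JVM. Assuming a driver-side sc: SparkContext and the
// already-loaded metisMap:
//
//   val metisBc = sc.broadcast(metisMap)                     // driver side
//   val part = metisBc.value.getOrElse(src.hashCode(), 0)    // task side
//
// Broadcast values are fetched once per executor and cached, which sidesteps both
// the serialization and the race problems described above.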
override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
  // Earlier attempt: load lazily on first use. Concurrent task threads all saw
  // an empty map and raced into loadMetisFile, so this was abandoned.
  // if (metisMap.size == 0) {
  //   loadMetisFile()
  //   println("loadMetisFile")
  // }
  val s = metisMap.get(src.hashCode())
  if (s.isDefined) {
    s.getOrElse(0)
  } else {
    println("src: " + src + ", hashCode: " + src.hashCode + ", partition: " +
      metisMap.get(src.hashCode()) + ", size: " + metisMap.size)
    throw new IllegalArgumentException("Metis can't find partition!")
  }
}
}
/** Returns the PartitionStrategy with the specified name. */
def fromString(s: String): PartitionStrategy = s match {
case "RandomVertexCut" => RandomVertexCut
case "EdgePartition1D" => EdgePartition1D
case "EdgePartition2D" => EdgePartition2D
case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut
case "RangePartition" => RangePartition
case "MetisPartition" => MetisPartition
case _ => throw new IllegalArgumentException("Invalid PartitionStrategy: " + s)
}
}
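For reference, a strategy is looked up by name and applied exactly as in the Analytics call quoted at the top. A minimal usage sketch, where graph stands for any already-loaded Graph[VD, ED]:

val strategy = PartitionStrategy.fromString("MetisPartition")
val partitioned = graph.partitionBy(strategy)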
Reference blog: http://www.cnblogs.com/HeQiangJava/p/6711527.html
This post covers the different partition strategies implemented in Apache Spark GraphX, including EdgePartition2D, EdgePartition1D, and RandomVertexCut, and discusses in detail the implementation of the MetisPartition strategy and the problems encountered along the way.