Spark SQL database write: source code analysis

This article digs into the internals of how Spark SQL exchanges data with MySQL, covering the write, read, and conversion paths. It walks through how Spark SQL performs these operations, how this differs from writing to MySQL with Spark Core and custom partitioning, what configuration parameters such as batchSize and isolationLevel do, and how to solve problems that come up in practice.

1. What is the difference between writing to MySQL with Spark SQL and with Spark Core? Let's look at the Spark SQL source:

Writing:

df.write.jdbc(url1, "TEST.TRUNCATETEST", properties)

First the configuration is converted:

def jdbc(url: String, table: String, connectionProperties: Properties): Unit = {
  assertNotPartitioned("jdbc")
  assertNotBucketed("jdbc")
  // connectionProperties should override settings in extraOptions.
  this.extraOptions = this.extraOptions ++ connectionProperties.asScala
  // explicit url and dbtable should override all
  this.extraOptions += ("url" -> url, "dbtable" -> table)
  format("jdbc").save()
}
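Unwinding that helper: jdbc() simply folds the Properties into extraOptions and delegates to format("jdbc").save(), so the same write can be expressed with explicit options (URL and credentials below are placeholders):

df.write
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/mydb")  // placeholder URL
  .option("dbtable", "TEST.TRUNCATETEST")
  .option("user", "root")                             // placeholder credentials
  .option("password", "secret")
  .save()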

Getting the DataSource

def save(): Unit = {
  assertNotBucketed("save")
  val dataSource = DataSource(
    df.sparkSession,
    className = source,
    partitionColumns = partitioningColumns.getOrElse(Nil),
    bucketSpec = getBucketSpec,
    options = extraOptions.toMap)

  dataSource.write(mode, df)
}

Choosing how to write the data out

def write(mode: SaveMode, data: DataFrame): Unit = {
  if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
    throw new AnalysisException("Cannot save interval data type into external storage.")
  }

  providingClass.newInstance() match {
    case dataSource: CreatableRelationProvider =>
      dataSource.createRelation(sparkSession.sqlContext, mode, caseInsensitiveOptions, data)
    case format: FileFormat =>
      writeInFileFormat(format, mode, data)
    case _ =>
      sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
  }
}

// Check the SaveMode first: before inserting, decide whether to create the table, append, truncate, drop and recreate, etc.

override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    df: DataFrame): BaseRelation = {
  val jdbcOptions = new JDBCOptions(parameters)
  val url = jdbcOptions.url
  val table = jdbcOptions.table
  val createTableOptions = jdbcOptions.createTableOptions
  val isTruncate = jdbcOptions.isTruncate

  val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
  try {
    val tableExists = JdbcUtils.tableExists(conn, url, table)
    if (tableExists) {
      mode match {
        case SaveMode.Overwrite =>
          if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
            // In this case, we should truncate table and then load.
            truncateTable(conn, table)
            saveTable(df, url, table, jdbcOptions)
          } else {
            // Otherwise, do not truncate the table, instead drop and recreate it
            dropTable(conn, table)
            createTable(df.schema, url, table, createTableOptions, conn)
            saveTable(df, url, table, jdbcOptions)
          }

        case SaveMode.Append =>
          saveTable(df, url, table, jdbcOptions)

        case SaveMode.ErrorIfExists =>
          throw new AnalysisException(
            s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

        case SaveMode.Ignore =>
          // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
          // to not save the contents of the DataFrame and to not change the existing data.
          // Therefore, it is okay to do nothing here and then just return the relation below.
      }
    } else {
      createTable(df.schema, url, table, createTableOptions, conn)
      saveTable(df, url, table, jdbcOptions)
    }
  } finally {
    conn.close()
  }
  createRelation(sqlContext, parameters)
}

// The core is this saveTable method:

def saveTable(
    df: DataFrame,
    url: String,
    table: String,
    options: JDBCOptions) {
  val dialect = JdbcDialects.get(url)
  val nullTypes: Array[Int] = df.schema.fields.map { field =>
    getJdbcType(field.dataType, dialect).jdbcNullType
  }

  val rddSchema = df.schema
  val getConnection: () => Connection = createConnectionFactory(options)
  val batchSize = options.batchSize
  val isolationLevel = options.isolationLevel
// The key line: in the end it is still just an operation on each partition, isn't it?
  df.foreachPartition(iterator => savePartition(
    getConnection, table, iterator, rddSchema, nullTypes, batchSize, dialect, isolationLevel)
  )
}

Inside the savePartition method, the INSERT statement is prepared by insertStatement:

/**
 * Returns a PreparedStatement that inserts a row into table via conn.
 */
def insertStatement(conn: Connection, table: String, rddSchema: StructType, dialect: JdbcDialect)
    : PreparedStatement = {
  val columns = rddSchema.fields.map(x => dialect.quoteIdentifier(x.name)).mkString(",")
  val placeholders = rddSchema.fields.map(_ => "?").mkString(",")
  val sql = s"INSERT INTO $table ($columns) VALUES ($placeholders)"
  conn.prepareStatement(sql)
}
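The body of savePartition itself is not reproduced above. As a rough, simplified sketch of what it does (names and error handling trimmed, not the exact Spark source): open a connection per partition, batch rows with addBatch, and flush every batchSize rows with executeBatch inside one transaction.

import java.sql.Connection
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

// Simplified sketch of the per-partition write loop (not the exact Spark source).
def savePartitionSketch(
    getConnection: () => Connection,
    table: String,
    iterator: Iterator[Row],
    rddSchema: StructType,
    batchSize: Int): Unit = {
  val conn = getConnection()
  var committed = false
  try {
    conn.setAutoCommit(false)  // one transaction per partition (real code checks support first)
    val columns = rddSchema.fieldNames.mkString(",")
    val placeholders = rddSchema.fields.map(_ => "?").mkString(",")
    val stmt = conn.prepareStatement(s"INSERT INTO $table ($columns) VALUES ($placeholders)")
    try {
      var rowCount = 0
      while (iterator.hasNext) {
        val row = iterator.next()
        var i = 0
        while (i < rddSchema.fields.length) {
          stmt.setObject(i + 1, row.get(i).asInstanceOf[AnyRef])  // real code dispatches on the column type
          i += 1
        }
        stmt.addBatch()
        rowCount += 1
        if (rowCount % batchSize == 0) {
          stmt.executeBatch()  // flush a full batch
        }
      }
      if (rowCount % batchSize != 0) {
        stmt.executeBatch()    // flush the remaining rows
      }
    } finally {
      stmt.close()
    }
    conn.commit()
    committed = true
  } finally {
    if (!committed) {
      conn.rollback()          // failure path: roll the partition back
    }
    conn.close()
  }
}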

That wraps up the analysis: writing to MySQL through Spark SQL is essentially the same as reading the partitions yourself with Spark Core and writing them to MySQL.
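For comparison, a minimal Spark Core sketch of the same idea, assuming an rdd of type RDD[(String, Int)] and hypothetical URL, credentials, table and column names (no batching or transaction handling):

import java.sql.DriverManager

rdd.foreachPartition { iter =>
  // Each partition opens its own connection and inserts its rows one by one.
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "secret")
  val stmt = conn.prepareStatement("INSERT INTO TEST.TRUNCATETEST (c1, c2) VALUES (?, ?)")
  try {
    iter.foreach { case (c1, c2) =>
      stmt.setString(1, c1)
      stmt.setInt(2, c2)
      stmt.executeUpdate()
    }
  } finally {
    stmt.close()
    conn.close()
  }
}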

How do you use it?

df.write.jdbc(url1, "TEST.TRUNCATETEST", properties)
df2.write.mode(SaveMode.Overwrite).option("truncate", true)
  .jdbc(url1, "TEST.TRUNCATETEST", properties)

properties holds the connection configuration. The keys include user, password, url, dbtable, batchsize, and isolationLevel; isolationLevel can be NONE, READ_COMMITTED, READ_UNCOMMITTED, REPEATABLE_READ, or SERIALIZABLE, and defaults to READ_UNCOMMITTED.
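For example, a Properties object might be filled in like this (URL and credentials are placeholders):

import java.util.Properties
import org.apache.spark.sql.SaveMode

val url1 = "jdbc:mysql://localhost:3306/mydb"  // placeholder URL
val properties = new Properties()
properties.setProperty("user", "root")         // placeholder credentials
properties.setProperty("password", "secret")
properties.setProperty("batchsize", "10000")   // rows per JDBC batch
properties.setProperty("isolationLevel", "NONE")

df.write.mode(SaveMode.Append).jdbc(url1, "TEST.TRUNCATETEST", properties)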


When a partition write fails, savePartition falls through to this cleanup path: roll back the transaction (if transactions are supported) and close the connection:

if (!committed) {
  // The stage must fail.  We got here through an exception path, so
  // let the exception through unless rollback() or close() want to
  // tell the user about another problem.
  if (supportsTransactions) {
    conn.rollback()
  }
  conn.close()
}

How to copy data from MySQL straight into another database:

The key is the ResultSet metadata: rs.getMetaData.getColumnCount gives the number of columns, so an entire row can be read with

Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))

and then written out through PreparedStatement.setObject.
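A minimal sketch of such a generic JDBC-to-JDBC copy, assuming hypothetical URLs, credentials, and table names (type mapping is left to the target driver):

import java.sql.DriverManager

val srcConn = DriverManager.getConnection("jdbc:mysql://localhost:3306/srcdb", "root", "secret")
val dstConn = DriverManager.getConnection("jdbc:postgresql://localhost:5432/dstdb", "root", "secret")
try {
  val rs = srcConn.createStatement().executeQuery("SELECT * FROM src_table")
  val columnCount = rs.getMetaData.getColumnCount
  val placeholders = Seq.fill(columnCount)("?").mkString(",")
  val insert = dstConn.prepareStatement(s"INSERT INTO dst_table VALUES ($placeholders)")
  while (rs.next()) {
    // Pull the whole row as plain Objects, exactly as described above.
    val row = Array.tabulate[Object](columnCount)(i => rs.getObject(i + 1))
    row.zipWithIndex.foreach { case (v, i) => insert.setObject(i + 1, v) }
    insert.executeUpdate()
  }
} finally {
  srcConn.close()
  dstConn.close()
}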

Problems encountered in practice:

    1. Why are inserts still slow even though batchsize is set?

          MySQL only batches the statements when it is enabled on the connection: jdbc:mysql://localhost:3306/mydb?rewriteBatchedStatements=true
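For example (same placeholder database as above), the flag goes into the JDBC URL passed to write.jdbc; without it, the driver sends the batched INSERTs one by one and batchsize has little effect:

val url1 = "jdbc:mysql://localhost:3306/mydb?rewriteBatchedStatements=true"
df.write.mode(SaveMode.Append).jdbc(url1, "TEST.TRUNCATETEST", properties)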