We use Spark to read data and write it into Hudi, but DataHub's built-in Spark lineage integration does not seem to be able to parse these jobs,
so we try to parse the lineage with spline-spark-agent and push it into DataHub.
Short version:
First, add the dependencies used for lineage parsing:
<dependency>
    <groupId>za.co.absa.spline.agent.spark</groupId>
    <artifactId>spark-3.3-spline-agent-bundle_2.12</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>io.acryl</groupId>
    <artifactId>datahub-client</artifactId>
    <version>0.10.2</version>
</dependency>
Then enable lineage tracking on the SparkSession:
SparkLineageInitializer.enableLineageTracking(session);
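A minimal sketch of how this fits into a job (the app name and the write itself are placeholders; only the enableLineageTracking call is required):

import org.apache.spark.sql.SparkSession
import za.co.absa.spline.harvester.SparkLineageInitializer

object HudiLineageJob {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("hudi-lineage-demo") // placeholder
      .enableHiveSupport()
      .getOrCreate()

    // Attach the Spline agent to this session; every write action executed
    // afterwards is harvested and handed to the configured lineageDispatcher.
    SparkLineageInitializer.enableLineageTracking(session)

    // ... read sources and write to Hudi as usual ...
  }
}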
1. Parse with spline-spark-agent first
The output format of spline-spark-agent's built-in dispatchers does not meet DataHub's needs, so we have to implement our own.
Various dispatch channels can be tried here: kafka, console, http, etc.; the full list is in spline's default config:
https://github.com/AbsaOSS/spline-spark-agent/blob/develop/core/src/main/resources/spline.default.yaml
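For reference, a stock dispatcher is selected through the spline.lineageDispatcher key; for example the built-in HTTP dispatcher, which posts plans to a Spline Producer endpoint, is configured roughly like this (the URL is a placeholder):
spline.lineageDispatcher=http
spline.lineageDispatcher.http.producer.url=http://localhost:8080/producer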
2. Custom implementation
2.1 Extend the AbstractJsonLineageDispatcher class and implement its name and send methods; in send, convert the parsed lineage into DataHub's format and push it to DataHub (full source below).
2.2 Add a spline.properties file under the resources directory and register the custom lineageDispatcher, for example:
spline.lineageDispatcher = xxjson
spline.lineageDispatcher.xxjson.className = Utils.xxxxxDispatcher
3. Notes on importing into DataHub
https://datahubproject.io/docs/api/tutorials/lineage
Because DataHub's lineage API tutorial only covers Python, not Java, we use plain HTTP requests here.
But DataHub's HTTP endpoint requires an access token, so metadata service authentication has to be switched on first.
https://datahubproject.io/docs/authentication/personal-access-tokens/#personal-access-tokens-setup-prerequisites-and-permissions
In plain terms: edit docker-compose-without-neo4j.quickstart.yml and add - METADATA_SERVICE_AUTH_ENABLED=true under the environment section of both datahub-frontend-react and datahub-gms.
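A sketch of the relevant fragment (service names as in the quickstart file; keep the existing variables and append the new one):
  datahub-frontend-react:
    environment:
      # ... existing variables ...
      - METADATA_SERVICE_AUTH_ENABLED=true
  datahub-gms:
    environment:
      # ... existing variables ...
      - METADATA_SERVICE_AUTH_ENABLED=true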
After that, generate an access token in the DataHub UI and add it to the request header of every call.
The custom dispatcher source:
/*
* Copyright 2021 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.spline.harvester.dispatcher
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.commons.lang.StringUtils
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import org.apache.http.protocol.HTTP
import org.apache.http.util.EntityUtils

import java.nio.charset.Charset
import scala.collection.mutable
class DatahubLineageDispatcher() extends AbstractJsonLineageDispatcher {

  override def name = "Datahub"

  override protected def send(data: String): Unit = {
    // Only execution plans carry the read/write information we need.
    if (data.startsWith("ExecutionPlan")) {
      val planJson = StringUtils.replace(data, "ExecutionPlan (apiVersion: 1.2):", "")
      // Build the lineage edges in the shape DataHub's updateLineage mutation expects.
      val edges = makeLine(planJson)
      // First remove any previously registered edges for these datasets,
      val splineRemove = "{\"query\": \"mutation updateLineage { updateLineage( input:{ edgesToAdd : [],edgesToRemove: [" + edges + "]})}\",\"variables\":{}}"
      // then add the freshly harvested ones.
      val splineAdd = "{\"query\": \"mutation updateLineage { updateLineage( input:{ edgesToAdd : [" + edges + "],edgesToRemove: []})}\",\"variables\":{}}"
      handleHttp(splineRemove, "http://xxxx.xxx.xx:9002/api/graphql")
      handleHttp(splineAdd, "http://xxxx.xxx.xx:9002/api/graphql")
    }
  }
  def makeLine(planJson: String): String = {
    var downstreamUrn: String = ""
    val operations: JSONObject = JSON.parseObject(planJson).getJSONObject("operations")
    val write: JSONObject = operations.getJSONObject("write")
    val destinationType: String = write.getJSONObject("extra").getString("destinationType")
    if ("hudi" == destinationType) {
      val params: JSONObject = write.getJSONObject("params")
      // Target database and table come from the Hudi Hive-sync options.
      val targetDatabase: String = params.getString("hoodie.datasource.hive_sync.database")
      val targetTableName: String = params.getString("hoodie.datasource.hive_sync.table")
      downstreamUrn = "{downstreamUrn: \\\"urn:li:dataset:(urn:li:dataPlatform:hive," + targetDatabase + "." + targetTableName + ",PROD)\\\","
    }
    // The "reads" array lists the source tables of the plan;
    // LinkedHashSet de-duplicates them while keeping insertion order.
    val readsArray: JSONArray = operations.getJSONArray("reads")
    val sources: mutable.LinkedHashSet[String] = new mutable.LinkedHashSet[String]
    for (i <- 0 until readsArray.size) {
      val identifier: JSONObject = readsArray.getJSONObject(i)
        .getJSONObject("params").getJSONObject("table").getJSONObject("identifier")
      sources.add(identifier.getString("database") + "." + identifier.getString("table"))
    }
    // One edge string per distinct source, joined with commas.
    sources.map { s =>
      downstreamUrn + "upstreamUrn : \\\"urn:li:dataset:(urn:li:dataPlatform:hive," + s + ",PROD)\\\"}"
    }.mkString(",")
  }
  def handleHttp(jsonParam: String, url: String): Unit = {
    try {
      val client = HttpClients.createDefault
      val request = new HttpPost(url)
      // Personal access token; each team generates its own in DataHub.
      val token = "x.x.x"
      request.addHeader(HTTP.CONTENT_TYPE, "application/json")
      request.addHeader("Authorization", "Bearer " + token)
      val entity = new StringEntity(jsonParam, Charset.forName("UTF-8"))
      entity.setContentEncoding("UTF-8")
      entity.setContentType("application/json;charset=UTF-8")
      request.setEntity(entity)
      val response = client.execute(request)
      try {
        val code = response.getStatusLine.getStatusCode
        // Read the whole response body.
        val body = EntityUtils.toString(response.getEntity, "UTF-8")
        if (code == 200) println("Request succeeded: " + body + " " + url)
        else if (code == 500) println("Server error: " + body + ", url: " + url)
        else println("Unexpected response, code=" + code + ", " + body + ", url: " + url)
      } finally {
        response.close()
        client.close()
      }
    } catch {
      case e: Exception =>
        println("Request failed: " + e.getMessage)
    }
  }
}
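For illustration, with a single source table the request body built above looks like this on the wire (dw.target_table and ods.source_table are made-up names):
{"query": "mutation updateLineage { updateLineage( input:{ edgesToAdd : [{downstreamUrn: \"urn:li:dataset:(urn:li:dataPlatform:hive,dw.target_table,PROD)\",upstreamUrn : \"urn:li:dataset:(urn:li:dataPlatform:hive,ods.source_table,PROD)\"}],edgesToRemove: []})}","variables":{}}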
Using sqllineage behind a small HTTP endpoint to parse SQL and send column-level lineage to DataHub
Reference: https://blog.youkuaiyun.com/LCriska/article/details/129329492
from flask import Flask, request
import json
from datahub.metadata.schema_classes import ChangeTypeClass
from sqllineage.runner import LineageRunner
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetLineageType,
FineGrainedLineage,
FineGrainedLineageDownstreamType,
FineGrainedLineageUpstreamType,
Upstream,
UpstreamLineage,
)
app = Flask(__name__)

@app.route('/', methods=['POST'])
def handle_post_request():
    sqldata = request.get_data().decode()
    try:
        data = json.loads(sqldata)
    except Exception:
        return "Invalid JSON", 400
    sql = data['sql']
    targedaBase = data['targedaBase']
    targeTable = data['targeTable']
    # Wrap the query in an INSERT so sqllineage can infer the target table.
    endsql = 'insert into ' + targedaBase + '.' + targeTable + ' ' + sql + ' '
    process_sql(endsql)
    return 'Received sql: ' + endsql + ' ; database: ' + targedaBase + ' ; table: ' + targeTable + ' '
def process_sql(endsql):
    if endsql:
        result = LineageRunner(endsql)
        targetTableName = str(result.target_tables()[0])  # downstream (target) table of the SQL
        lineage = result.get_column_lineage()  # column-level lineage tuples
        result.print_column_lineage()  # print the column lineage for debugging
        build_lineage(result, targetTableName, lineage)
# Dataset URN (platform = hive)
def datasetUrn(tableName):
    return builder.make_dataset_urn("hive", tableName)

# Schema-field URN for a table.column pair
def fieldUrn(tableName, fieldName):
    return builder.make_schema_field_urn(datasetUrn(tableName), fieldName)
def build_lineage(result, targetTableName, lineage):
    # Column-level lineage entries
    fineGrainedLineageList = []
    # Upstream datasets, used by DataHub for conflict checking
    upStreamsList = []
    # In each tuple the last element is the downstream table.field;
    # all preceding elements are upstream table.fields.
    for columnTuples in lineage:
        upStreamStrList = []
        downStreamStrList = []
        *upstreams, downstream = columnTuples
        downStreamFieldName = str(downstream.raw_name)
        downStreamTableName = str(downstream).replace('.' + downStreamFieldName, '')
        downStreamStrList.append(fieldUrn(downStreamTableName, downStreamFieldName))
        for column in upstreams:
            upStreamFieldName = str(column.raw_name)
            upStreamTableName = str(column).replace('.' + upStreamFieldName, '')
            upStreamStrList.append(fieldUrn(upStreamTableName, upStreamFieldName))
            upStreamsList.append(
                Upstream(dataset=datasetUrn(upStreamTableName), type=DatasetLineageType.TRANSFORMED))
        fineGrainedLineage = FineGrainedLineage(upstreamType=FineGrainedLineageUpstreamType.DATASET,
                                                upstreams=upStreamStrList,
                                                downstreamType=FineGrainedLineageDownstreamType.FIELD_SET,
                                                downstreams=downStreamStrList)
        fineGrainedLineageList.append(fineGrainedLineage)
    sendLine(targetTableName, upStreamsList, fineGrainedLineageList)
def sendLine(targetTableName, upStreamsList, fineGrainedLineageList):
    fieldLineages = UpstreamLineage(upstreams=upStreamsList, fineGrainedLineages=fineGrainedLineageList)
    lineageMcp = MetadataChangeProposalWrapper(
        entityUrn=datasetUrn(targetTableName),  # downstream table
        aspect=fieldLineages,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
    )
    # Emit through the DataHub REST API (GMS address and token are environment-specific)
    emitter = DatahubRestEmitter('http://172.18.1.54:18080', token="xxx")
    emitter.emit_mcp(lineageMcp)

if __name__ == '__main__':
    app.run(port=5620)
Request body format:
{
    "sql": " ",
    "targeTable": "targatest",
    "targedaBase": "test"
}
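A quick way to exercise the endpoint from the command line (the SQL here is a made-up example):
curl -X POST http://localhost:5620/ \
  -H 'Content-Type: application/json' \
  -d '{"sql": "select id, name from test.source_table", "targedaBase": "test", "targeTable": "targatest"}'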