We use Spark to read data and write it into Hudi, but DataHub's built-in Spark lineage integration does not seem to be able to parse these jobs,
so we try to parse the lineage with spline-spark-agent and push it into DataHub.
Short version:
First, add the dependencies used for lineage parsing:
<dependency>
    <groupId>za.co.absa.spline.agent.spark</groupId>
    <artifactId>spark-3.3-spline-agent-bundle_2.12</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>io.acryl</groupId>
    <artifactId>datahub-client</artifactId>
    <version>0.10.2</version>
</dependency>
Then enable lineage tracking on the SparkSession:
SparkLineageInitializer.enableLineageTracking(session);
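A minimal sketch of how this fits into a job (the app name and the write itself are placeholders; only the enableLineageTracking call is required):

import org.apache.spark.sql.SparkSession
import za.co.absa.spline.harvester.SparkLineageInitializer

object HudiLineageJob {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("hudi-lineage-demo") // placeholder
      .enableHiveSupport()
      .getOrCreate()

    // Attach the Spline agent to this session; every write action executed
    // afterwards is harvested and handed to the configured lineageDispatcher.
    SparkLineageInitializer.enableLineageTracking(session)

    // ... read sources and write to Hudi as usual ...
  }
}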
1. Parse with spline-spark-agent first
The output format of spline-spark-agent's built-in dispatchers does not meet DataHub's needs, so we have to implement our own.
Various dispatch channels can be tried here: kafka, console, http, etc.; the full list is in spline's default config:
https://github.com/AbsaOSS/spline-spark-agent/blob/develop/core/src/main/resources/spline.default.yaml
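For reference, a stock dispatcher is selected through the spline.lineageDispatcher key; for example the built-in HTTP dispatcher, which posts plans to a Spline Producer endpoint, is configured roughly like this (the URL is a placeholder):
spline.lineageDispatcher=http
spline.lineageDispatcher.http.producer.url=http://localhost:8080/producer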
2. Custom implementation
2.1 Extend the AbstractJsonLineageDispatcher class and implement its name and send methods; in send, convert the parsed lineage into DataHub's format and push it to DataHub (full source below).
2.2 Add a spline.properties file under the resources directory and register the custom lineageDispatcher, for example:
spline.lineageDispatcher = xxjson
spline.lineageDispatcher.xxjson.className = Utils.xxxxxDispatcher
3. Notes on importing into DataHub
https://datahubproject.io/docs/api/tutorials/lineage
Because DataHub's lineage API tutorial only covers Python, not Java, we use plain HTTP requests here.
But DataHub's HTTP endpoint requires an access token, so metadata service authentication has to be switched on first.
https://datahubproject.io/docs/authentication/personal-access-tokens/#personal-access-tokens-setup-prerequisites-and-permissions
In plain terms: edit docker-compose-without-neo4j.quickstart.yml and add - METADATA_SERVICE_AUTH_ENABLED=true under the environment section of both datahub-frontend-react and datahub-gms.
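A sketch of the relevant fragment (service names as in the quickstart file; keep the existing variables and append the new one):
  datahub-frontend-react:
    environment:
      # ... existing variables ...
      - METADATA_SERVICE_AUTH_ENABLED=true
  datahub-gms:
    environment:
      # ... existing variables ...
      - METADATA_SERVICE_AUTH_ENABLED=true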
After that, generate an access token in the DataHub UI and add it to the request header of every call.
The custom dispatcher source:
/*
* Copyright 2021 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package za.co.absa.spline.harvester.dispatcher
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.commons.lang.StringUtils
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients
import org.apache.http.protocol.HTTP
import org.apache.http.util.EntityUtils

import java.nio.charset.Charset
import scala.collection.mutable
class DatahubLineageDispatcher() extends AbstractJsonLineageDispatcher {

  override def name = "Datahub"

  override protected def send(data: String): Unit = {
    // Only execution plans carry the read/write information we need.
    if (data.startsWith("ExecutionPlan")) {
      val planJson = StringUtils.replace(data, "ExecutionPlan (apiVersion: 1.2):", "")
      // Build the lineage edges in the shape DataHub's updateLineage mutation expects.
      val edges = makeLine(planJson)
      // First remove any previously registered edges for these datasets,
      val splineRemove = "{\"query\": \"mutation updateLineage { updateLineage( input:{ edgesToAdd : [],edgesToRemove: [" + edges + "]})}\",\"variables\":{}}"
      // then add the freshly harvested ones.
      val splineAdd = "{\"query\": \"mutation updateLineage { updateLineage( input:{ edgesToAdd : [" + edges + "],edgesToRemove: []})}\",\"variables\":{}}"
      handleHttp(splineRemove, "http://xxxx.xxx.xx:9002/api/graphql")
      handleHttp(splineAdd, "http://xxxx.xxx.xx:9002/api/graphql")
    }
  }
  def makeLine(planJson: String): String = {
    var downstreamUrn: String = ""
    val operations: JSONObject = JSON.parseObject(planJson).getJSONObject("operations")
    val write: JSONObject = operations.getJSONObject("write")
    val destinationType: String = write.getJSONObject("extra").getString("destinationType")
    if ("hudi" == destinationType) {
      val params: JSONObject = write.getJSONObject("params")
      // Target database and table come from the Hudi Hive-sync options.
      val targetDatabase: String = params.getString("hoodie.datasource.hive_sync.database")
      val targetTableName: String = params.getString("hoodie.datasource.hive_sync.table")
      downstreamUrn = "{downstreamUrn: \\\"urn:li:dataset:(urn:li:dataPlatform:hive," + targetDatabase + "." + targetTableName + ",PROD)\\\","
    }
    // The "reads" array lists the source tables of the plan;
    // LinkedHashSet de-duplicates them while keeping insertion order.
    val readsArray: JSONArray = operations.getJSONArray("reads")
    val sources: mutable.LinkedHashSet[String] = new mutable.LinkedHashSet[String]
    for (i <- 0 until readsArray.size) {
      val identifier: JSONObject = readsArray.getJSONObject(i)
        .getJSONObject("params").getJSONObject("table").getJSONObject("identifier")
      sources.add(identifier.getString("database") + "." + identifier.getString("table"))
    }
    // One edge string per distinct source, joined with commas.
    sources.map { s =>
      downstreamUrn + "upstreamUrn : \\\"urn:li:dataset:(urn:li:dataPlatform:hive," + s + ",PROD)\\\"}"
    }.mkString(",")
  }
  def handleHttp(jsonParam: String, url: String): Unit = {
    try {
      val client = HttpClients.createDefault
      val request = new HttpPost(url)
      // Personal access token; each team generates its own in DataHub.
      val token = "x.x.x"
      request.addHeader(HTTP.CONTENT_TYPE, "application/json")
      request.addHeader("Authorization", "Bearer " + token)
      val entity = new StringEntity(jsonParam, Charset.forName("UTF-8"))
      entity.setContentEncoding("UTF-8")
      entity.setContentType("application/json;charset=UTF-8")
      request.setEntity(entity)
      val response = client.execute(request)
      try {
        val code = response.getStatusLine.getStatusCode
        // Read the whole response body.
        val body = EntityUtils.toString(response.getEntity, "UTF-8")
        if (code == 200) println("Request succeeded: " + body + " " + url)
        else if (code == 500) println("Server error: " + body + ", url: " + url)
        else println("Unexpected response, code=" + code + ", " + body + ", url: " + url)
      } finally {
        response.close()
        client.close()
      }
    } catch {
      case e: Exception =>
        println("Request failed: " + e.getMessage)
    }
  }
}
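For illustration, with a single source table the request body built above looks like this on the wire (dw.target_table and ods.source_table are made-up names):
{"query": "mutation updateLineage { updateLineage( input:{ edgesToAdd : [{downstreamUrn: \"urn:li:dataset:(urn:li:dataPlatform:hive,dw.target_table,PROD)\",upstreamUrn : \"urn:li:dataset:(urn:li:dataPlatform:hive,ods.source_table,PROD)\"}],edgesToRemove: []})}","variables":{}}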
Using sqllineage behind a small HTTP endpoint to parse SQL and send column-level lineage to DataHub
Reference: https://blog.youkuaiyun.com/LCriska/article/details/129329492
from flask import Flask, request
import json
from datahub.metadata.schema_classes import ChangeTypeClass
from sqllineage.runner import LineageRunner
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetLineageType,
FineGrainedLineage,
FineGrainedLineageDownstreamType,
FineGrainedLineageUpstreamType,
Upstream,
UpstreamLineage,
)
app = Flask(__name__)

@app.route('/', methods=['POST'])
def handle_post_request():
    sqldata = request.get_data().decode()
    try:
        data = json.loads(sqldata)
    except Exception:
        return "Invalid JSON", 400
    sql = data['sql']
    targedaBase = data['targedaBase']
    targeTable = data['targeTable']
    # Wrap the query in an INSERT so sqllineage can infer the target table.
    endsql = 'insert into ' + targedaBase + '.' + targeTable + ' ' + sql + ' '
    process_sql(endsql)
    return 'Received sql: ' + endsql + ' ; database: ' + targedaBase + ' ; table: ' + targeTable + ' '
def process_sql(endsql):
    if endsql:
        result = LineageRunner(endsql)
        targetTableName = str(result.target_tables()[0])  # downstream (target) table of the SQL
        lineage = result.get_column_lineage()  # column-level lineage tuples
        result.print_column_lineage()  # print the column lineage for debugging
        build_lineage(result, targetTableName, lineage)
# Dataset URN (platform = hive)
def datasetUrn(tableName):
    return builder.make_dataset_urn("hive", tableName)

# Schema-field URN for a table.column pair
def fieldUrn(tableName, fieldName):
    return builder.make_schema_field_urn(datasetUrn(tableName), fieldName)
def build_lineage(result, targetTableName, lineage):
    # Column-level lineage entries
    fineGrainedLineageList = []
    # Upstream datasets, used by DataHub for conflict checking
    upStreamsList = []
    # In each tuple the last element is the downstream table.field;
    # all preceding elements are upstream table.fields.
    for columnTuples in lineage:
        upStreamStrList = []
        downStreamStrList = []
        *upstreams, downstream = columnTuples
        downStreamFieldName = str(downstream.raw_name)
        downStreamTableName = str(downstream).replace('.' + downStreamFieldName, '')
        downStreamStrList.append(fieldUrn(downStreamTableName, downStreamFieldName))
        for column in upstreams:
            upStreamFieldName = str(column.raw_name)
            upStreamTableName = str(column).replace('.' + upStreamFieldName, '')
            upStreamStrList.append(fieldUrn(upStreamTableName, upStreamFieldName))
            upStreamsList.append(
                Upstream(dataset=datasetUrn(upStreamTableName), type=DatasetLineageType.TRANSFORMED))
        fineGrainedLineage = FineGrainedLineage(upstreamType=FineGrainedLineageUpstreamType.DATASET,
                                                upstreams=upStreamStrList,
                                                downstreamType=FineGrainedLineageDownstreamType.FIELD_SET,
                                                downstreams=downStreamStrList)
        fineGrainedLineageList.append(fineGrainedLineage)
    sendLine(targetTableName, upStreamsList, fineGrainedLineageList)
def sendLine(targetTableName, upStreamsList, fineGrainedLineageList):
    fieldLineages = UpstreamLineage(upstreams=upStreamsList, fineGrainedLineages=fineGrainedLineageList)
    lineageMcp = MetadataChangeProposalWrapper(
        entityUrn=datasetUrn(targetTableName),  # downstream table
        aspect=fieldLineages,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
    )
    # Emit through the DataHub REST API (GMS address and token are environment-specific)
    emitter = DatahubRestEmitter('http://172.18.1.54:18080', token="xxx")
    emitter.emit_mcp(lineageMcp)

if __name__ == '__main__':
    app.run(port=5620)
Request body format:
{
    "sql": " ",
    "targeTable": "targatest",
    "targedaBase": "test"
}
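A quick way to exercise the endpoint from the command line (the SQL here is a made-up example):
curl -X POST http://localhost:5620/ \
  -H 'Content-Type: application/json' \
  -d '{"sql": "select id, name from test.source_table", "targedaBase": "test", "targeTable": "targatest"}'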