# Design
# 1. Implement two base classes: BaseReader and BaseConverter.
# 2. BaseReader exposes abstract hooks for building queries: get_raw_sql for raw SQL and get_query for SQLAlchemy queries.
# 3. BaseReader reads data in batches via a cursor and yields each batch.
# 4. BaseRelationReader, a BaseReader subclass, receives the id values of the main-table batch and reads the related-table
#    data. A BaseRelationReader must be attached to a main reader, and a main reader can manage 0..N relation readers:
#    while reading the main table, each relation reader is invoked for the same batch so the related data is yielded
#    together with it.
# 5. The data a reader yields has the shape Dict[str, List]: the key is a reader's name, the value is the list of rows it read.
# 6. BaseConverter converts the batches yielded by a BaseReader and yields the converted batches.
# 7. BaseTransformer manages writes to the target table and one or more Reader/Converter pairs, and drives the core flow.
# 8. Readers and Converters come in pairs; each pair's converted batches are bulk-upserted into the target table.
#    (An illustrative usage sketch wiring concrete subclasses together appears at the end of this file.)
from abc import ABC, abstractmethod
from typing import Callable, Generator, List, Dict, Any, Optional, Set, Tuple, AsyncGenerator
import asyncio
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import MetaData, Table, text
from ddwh.config import setup_logging
from ddwh.utils.db_utils import DBUtils
import time
from functools import wraps

logger = setup_logging()

def timer_decorator(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        instance_name = args[0].name if hasattr(args[0], 'name') else func.__name__
        start = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - start
        # Determine the batch size if the result exposes one
        batch_size = 0
        if result:
            batch_size = len(result) if isinstance(result, list) else 1
        # Log the elapsed time together with the instance name and batch size
        logger.info(
            f"Converter {instance_name} - {func.__name__} took {duration:.4f}s | "
            f"batch size: {batch_size} | avg per record: {duration / max(1, batch_size):.6f}s"
        )
        return result
    return wrapper


class BaseReader(ABC):
    """Base class for data readers.

    Reads raw data from the source, supporting batched, cursor-based reads for
    large datasets. A reader may manage several BaseRelationReader instances so
    that related data is fetched alongside the main-table data.
    """

    def __init__(self, db_utils: DBUtils):
        """Initialize the reader.

        Args:
            db_utils: database utility instance
        """
        self.db_utils: DBUtils = db_utils
        self.relation_readers: List["BaseRelationReader"] = []

    @property
    def name(self) -> str:
        return self.__class__.__name__

    @property
    def batch_size(self) -> int:
        return 5000

    @property
    def order_fields(self) -> List[str]:
        return []

    @property
    @abstractmethod
    def get_query(self) -> Callable:
        """Accessing this attribute must yield a callable that builds and returns
        the SQLAlchemy query; the reader invokes it as ``self.get_query()``, so a
        plain method override satisfies the contract.
        """


class BaseMainReader(BaseReader):
    """Base class for main-table readers."""

    def __init__(self, db_utils: DBUtils):
        super().__init__(db_utils)

    def add_relation_reader(self, relation_reader: "BaseRelationReader"):
        """Attach a relation-table reader.

        Args:
            relation_reader: relation reader instance
        """
        self.relation_readers.append(relation_reader)
        logger.info(f"Added relation reader: {relation_reader.__class__.__name__}")

    def read(self) -> Generator[Dict[str, List], None, None]:
        """Read data in batches.

        Reads the main table with a cursor, batch by batch, fetching the related
        data for each batch at the same time.

        Yields:
            Dict[str, List]: data dict whose keys are reader names and whose values are row lists
        """
        try:
            query = self.get_query()
            logger.info(f"Starting batched read, batch size: {self.batch_size}")
            logger.debug(
                f"{self.name} query: {query.compile(compile_kwargs={'literal_binds': True})}"
            )
            for batch in self.db_utils.fetch_data_with_cursor(
                query, batch_size=self.batch_size
            ):
                if batch:
                    logger.info(f"Read {len(batch)} records from main table {self.name}")
                    # Build the result dict for this batch
                    result_data = {self.name: batch}
                    # Fetch the related data for this batch
                    for relation_reader in self.relation_readers:
                        try:
                            # Collect the relation id values from the current batch
                            relation_reader.extract_relation_ids(batch)
                            # Read the related rows
                            relation_batch = []
                            for relation_data_batch in relation_reader.read():
                                relation_batch.extend(relation_data_batch)
                            if relation_batch:
                                result_data[relation_reader.name] = relation_batch
                                logger.info(
                                    f"Read {len(relation_batch)} records from relation table {relation_reader.name}"
                                )
                        except Exception as e:
                            logger.exception(f"Failed to read relation data: {str(e)}")
                            # Keep processing the remaining relation readers
                            continue
                    yield result_data
        except Exception as e:
            logger.exception(f"Batched read failed: {str(e)}")
            raise
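

# Illustrative sketch only: a minimal concrete main reader. The table and column
# names (users, id, name) and the SQLAlchemy 1.4+ select() style are assumptions
# used purely for illustration; they are not part of the core framework.
from sqlalchemy import select
from sqlalchemy.sql import column, table


class UserReader(BaseMainReader):
    """Hypothetical main reader over an assumed users table."""

    def get_query(self):
        # A plain method satisfies the get_query contract: self.get_query() returns the query.
        users = table("users", column("id"), column("name"))
        return select(users)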


class BaseRelationReader(BaseReader):
    """Relation-table reader.

    Reads related-table data based on the id values collected from a main-table
    batch; it must be attached to a main reader.
    """

    def __init__(self, db_utils: DBUtils):
        """Initialize the relation reader.

        Args:
            db_utils: database utility instance
        """
        super().__init__(db_utils)
        self.relation_ids: Dict[str, Set[Any]] = {}

    @property
    @abstractmethod
    def relations(self) -> Dict[str, str]:
        """
        Return the join mapping: the key is the main table's column name, the value
        is the corresponding column name of the relation (current) table.
        """

    def extract_relation_ids(self, batch: List[Dict[str, Any]]):
        """Collect the relation id values from a main-table batch.

        Args:
            batch: the current batch of main-table records
        """
        for current_table_column in self.relations.values():
            self.relation_ids[current_table_column] = set()
        for record in batch:
            for main_table_column, current_table_column in self.relations.items():
                self.relation_ids[current_table_column].add(record[main_table_column])

    def read(self) -> Generator[List[Dict[str, Any]], None, None]:
        """Read the related data in batches.

        Yields:
            List[Dict[str, Any]]: one batch of related records
        """
        if not self.relation_ids:
            return
        try:
            query = self.get_query()
            id_count = sum(len(ids) for ids in self.relation_ids.values())
            logger.info(f"Reading relation data for {id_count} main-table id value(s)")
            logger.debug(
                f"{self.name} query: {query.compile(compile_kwargs={'literal_binds': True})}"
            )
            for batch in self.db_utils.fetch_data_with_cursor(
                query, batch_size=self.batch_size
            ):
                if batch:
                    logger.debug(f"Read {len(batch)} relation records")
                    yield batch
        except Exception as e:
            logger.exception(f"Batched read of relation data failed: {str(e)}")
            raise
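

# Illustrative sketch only: a minimal relation reader for the example main reader
# above. It assumes an orders table whose user_id column references users.id; the
# names are illustrative, and it reuses the select/table helpers imported above.
class UserOrderRelationReader(BaseRelationReader):
    """Hypothetical relation reader: orders belonging to the users in the current batch."""

    @property
    def relations(self) -> Dict[str, str]:
        # main-table column -> relation-table column
        return {"id": "user_id"}

    def get_query(self):
        orders = table("orders", column("id"), column("user_id"), column("amount"))
        user_ids = self.relation_ids.get("user_id", set())
        return select(orders).where(orders.c.user_id.in_(user_ids))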


class BaseConverter(ABC):
    """Base class for data converters.

    Converts raw source data into the target-table format.
    """

    def __init__(self, name: str = None):
        """Initialize the converter.

        Args:
            name: converter name; defaults to the class name
        """
        self.name = name or self.__class__.__name__

    @property
    @abstractmethod
    def upsert_keys(self) -> Optional[List[str]]:
        """
        Return the key column names used for upserts; a single column or a composite
        key is supported. If None, plain inserts are used instead.
        """

    @property
    @abstractmethod
    def table_name(self) -> str:
        """
        Return the target table name, including the schema, in the form schema.table_name.
        """

    @property
    def truncate_before_convert(self) -> bool:
        """
        Return whether the target table should be truncated before converting.
        """
        return False

    def batch_delete_conditions(
        self, converted_batch: List[Dict[str, Any]]
    ) -> Tuple[Optional[List[str]], Optional[List[List[Any]]]]:
        """
        Return the delete conditions applied before each batch is written, so that the
        rows the batch replaces are removed first.

        Args:
            converted_batch: the converted data batch

        Returns:
            Tuple[Optional[List[str]], Optional[List[List[Any]]]]: the condition column names and the condition value rows

        Example:
            converted_batch = [
                {"user_id": 1, "data_type": "A", "data_value": 100},
                {"user_id": 2, "data_type": "B", "data_value": 200}
            ]
            return ["user_id", "data_type"], [
                [1, "A"],
                [2, "B"]
            ]
        """
        return None, None

    @abstractmethod
    def convert_a_batch(self, data_batch: Dict[str, List]) -> List[Dict[str, Any]]:
        """Convert one data batch.

        Args:
            data_batch: raw data batch in the form Dict[str, List]

        Returns:
            List[Dict[str, Any]]: the converted records
        """

    def convert(
        self, data_batches: Generator[Dict[str, List], None, None]
    ) -> Generator[
        Tuple[List[Dict[str, Any]], Optional[List[str]], Optional[List[str]], Optional[List[List[Any]]]],
        None,
        None,
    ]:
        """Convert data in batches.

        Args:
            data_batches: generator of raw data batches

        Yields:
            Tuple: (converted batch, upsert keys, delete condition columns, delete condition values)
        """
        try:
            for data_batch in data_batches:
                try:
                    start_time = time.perf_counter()
                    converted_batch = self.convert_a_batch(data_batch)
                    elapsed = time.perf_counter() - start_time
                    logger.info(
                        f"Converter {self.name} converted a batch in {elapsed:.4f}s, batch size: {len(converted_batch) if converted_batch else 0}"
                    )
                    if converted_batch:
                        logger.info(f"Converted {len(converted_batch)} records")
                        delete_columns, delete_values = self.batch_delete_conditions(
                            converted_batch
                        )
                        yield converted_batch, self.upsert_keys, delete_columns, delete_values
                except Exception as e:
                    logger.exception(f"Failed to convert a data batch: {str(e)}")
                    # Keep processing the remaining batches instead of aborting the whole run
                    continue
        except Exception as e:
            logger.exception(f"Batched conversion failed: {str(e)}")
            raise
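

# Illustrative sketch only: a minimal converter for the example readers above. The
# target table dwh.user_order_summary and its columns are assumptions; the keys it
# looks up in data_batch are the names of the example readers defined earlier.
class UserOrderSummaryConverter(BaseConverter):
    """Hypothetical converter: aggregates each user's order total."""

    @property
    def upsert_keys(self) -> Optional[List[str]]:
        return ["user_id"]

    @property
    def table_name(self) -> str:
        return "dwh.user_order_summary"

    @timer_decorator  # optional: adds a per-batch timing log on top of convert()'s own timing
    def convert_a_batch(self, data_batch: Dict[str, List]) -> List[Dict[str, Any]]:
        users = data_batch.get("UserReader", [])
        orders = data_batch.get("UserOrderRelationReader", [])
        # Sum order amounts per user, then emit one summary row per user
        totals: Dict[Any, float] = {}
        for order in orders:
            totals[order["user_id"]] = totals.get(order["user_id"], 0) + order["amount"]
        return [
            {"user_id": user["id"], "name": user["name"], "order_total": totals.get(user["id"], 0)}
            for user in users
        ]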


class BaseTransformer:
    """Asynchronous transformation coordinator."""

    def __init__(self, db_utils: DBUtils, max_workers: int = 4):
        self.db_utils = db_utils
        self.reader_converter_pairs: List[Tuple[BaseReader, BaseConverter]] = []
        self.name = self.__class__.__name__
        self.max_workers = max_workers  # thread-pool size

    def register_reader_converter_pair(
        self, reader: BaseReader, converter: BaseConverter
    ):
        """Register a Reader/Converter pair.

        Args:
            reader: data reader instance
            converter: data converter instance
        """
        self.reader_converter_pairs.append((reader, converter))
        logger.info(f"Registered reader/converter pair: {reader.name} -> {converter.name}")

    async def transform_async(self) -> bool:
        """
        Run the whole transformation flow asynchronously.

        Returns:
            bool: whether the transformation succeeded
        """
        try:
            logger.info(f"{self.name} starting asynchronous transformation")
            loop = asyncio.get_running_loop()
            # Run the synchronous I/O work in a thread pool; the context manager
            # ensures the executor is shut down when all pairs are done
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Process every registered reader/converter pair
                tasks = [
                    self._process_pair_async(reader, converter, loop, executor)
                    for reader, converter in self.reader_converter_pairs
                ]
                # Wait for all pairs to finish
                await asyncio.gather(*tasks)
            logger.info(f"{self.name} asynchronous transformation finished")
            return True
        except Exception as e:
            logger.exception(f"Asynchronous transformation failed: {str(e)}")
            return False

    async def _process_pair_async(
        self,
        reader: BaseReader,
        converter: BaseConverter,
        loop: asyncio.AbstractEventLoop,
        executor: ThreadPoolExecutor,
    ) -> None:
        """
        Process a single reader/converter pair asynchronously.
        """
        logger.info(f"Processing pair: {reader.name} -> {converter.name}")
        # Prepare the target table
        target_table = converter.table_name
        schema, table_name = target_table.split(".", 1) if "." in target_table else (None, target_table)
        table_obj = await loop.run_in_executor(
            executor, self.db_utils.create_table_obj, table_name, schema
        )
        # Truncate the target table first if requested
        if converter.truncate_before_convert:
            logger.info(f"Truncating target table: {target_table}")
            await loop.run_in_executor(
                executor,
                self.db_utils.execute_ddl,
                text(f"TRUNCATE TABLE {target_table}")
            )
        # Build the async pipeline and write each converted batch
        async for batch_data in self._create_async_pipeline(reader, converter, loop, executor):
            await self._write_batch_async(
                converter, table_obj, batch_data, loop, executor
            )

    async def _create_async_pipeline(
        self,
        reader: BaseReader,
        converter: BaseConverter,
        loop: asyncio.AbstractEventLoop,
        executor: ThreadPoolExecutor,
    ) -> AsyncGenerator[Tuple[Any, ...], None]:
        """
        Create the asynchronous processing pipeline for one reader/converter pair.
        """
        logger.info(f"Starting read: {reader.name}")
        # Creating the generators does not run any query yet; the work happens
        # when batches are pulled from converter_gen below.
        data_gen = reader.read()
        logger.info(f"Starting conversion: {converter.name}")
        converter_gen = converter.convert(data_gen)
        # Pull converted batches one at a time in the thread pool so large datasets
        # are streamed batch by batch without blocking the event loop.
        sentinel = object()
        while True:
            batch = await loop.run_in_executor(
                executor, lambda: next(converter_gen, sentinel)
            )
            if batch is sentinel:
                break
            yield batch

    async def _write_batch_async(
        self,
        converter: BaseConverter,
        table_obj: Table,
        batch_data: Tuple[Any, ...],
        loop: asyncio.AbstractEventLoop,
        executor: ThreadPoolExecutor,
    ) -> None:
        """
        Write one converted batch asynchronously.
        """
        converted_batch, upsert_keys, delete_columns, delete_values = batch_data
        # Delete the rows this batch replaces, if delete conditions were provided
        if delete_columns and delete_values:
            await loop.run_in_executor(
                executor,
                self.db_utils.batch_delete,
                table_obj,
                delete_columns,
                delete_values
            )
        # Bulk write: upsert when upsert keys are defined, otherwise plain insert
        if upsert_keys:
            success = await loop.run_in_executor(
                executor,
                self.db_utils.batch_upsert,
                table_obj,
                converted_batch,
                upsert_keys
            )
        else:
            success = await loop.run_in_executor(
                executor,
                self.db_utils.batch_insert,
                table_obj,
                converted_batch
            )
        if not success:
            raise Exception(f"Asynchronous write to table {converter.table_name} failed")
        logger.debug(f"Wrote {len(converted_batch)} records")