import psycopg2
import pyodbc
import re
import logging
import time
import schedule
import concurrent.futures
from typing import List, Dict, Tuple, Any
from functools import lru_cache
from collections import deque
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("andon_etl.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Source tables to read (one per line/station)
TABLE_NAMES = ["line1block", "line1cam", "line1crank", "line1head", "line1hsg",
"line2block", "line2cam", "line2crank", "line2head"]
# Build the data-source configurations: every table is read from both hosts
SOURCE_HOSTS = ['192.168.154.11', '192.168.151.11']
DATA_SOURCES = []
for table_name in TABLE_NAMES:
    for host in SOURCE_HOSTS:
        DATA_SOURCES.append({
            'type': 'postgresql',
            'host': host,
            'port': '5432',
            'database': 'andondata',
            'user': 'powertrain',
            'password': 'andromeda',
            'table': table_name,
            'host_ip': host
        })
# SQL Server target configuration
TARGET_CONFIG = {
'server': '192.168.10.116',
'database': '现场DB',
'username': 'sa',
'password': 'gtec_6600',
'table': 'dwd_web安东生产线状态'
}
# List of target codes
TARGET_CODES = [
"GY100", "GY101", "GY102", "GY103", "GY104", "GY105", "GY106", "GY107",
"GY108", "GY109", "GY10A", "GY10B", "GY10C", "GY10D", "GY10E", "GY116",
"GY11F", "GY120", "GY121", "GY122", "GY124", "GY126", "GY128", "GY130",
"GY131", "GY132", "GY133"
]
# Build one pre-compiled regex per target code
def create_target_patterns(codes: List[str]) -> Dict[str, re.Pattern]:
    patterns = {}
    for code in codes:
        # re.escape guards against special characters in the code
        # Matches the format {CODE,xxxxxxxxxxxxxxxx}, where each x is 0 or 1 (16 bits)
        pattern = re.escape(code) + r',([01]{16})}'
        patterns[code] = re.compile(pattern)
    return patterns
# Pre-compile everything once for performance
TARGET_PATTERNS = create_target_patterns(TARGET_CODES)
# Combined alternation of all codes; extract_data() also passes this pattern string to PostgreSQL as a POSIX regex
escaped_codes = [re.escape(code) for code in TARGET_CODES]
ALL_CODES_PATTERN = re.compile(r'|'.join(escaped_codes))
# Connection cache and SQL Server connection string
POSTGRES_CONN_POOL = {}
SQL_SERVER_CONN_STR = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={TARGET_CONFIG['server']};DATABASE={TARGET_CONFIG['database']};UID={TARGET_CONFIG['username']};PWD={TARGET_CONFIG['password']}"
def get_postgres_connection(source_config: Dict) -> Any:
"""获取或创建PostgreSQL连接(带连接池)"""
key = f"{source_config['host']}_{source_config['database']}_{source_config['user']}"
if key not in POSTGRES_CONN_POOL or POSTGRES_CONN_POOL[key].closed:
try:
POSTGRES_CONN_POOL[key] = psycopg2.connect(
host=source_config['host'],
port=source_config['port'],
database=source_config['database'],
user=source_config['user'],
password=source_config['password'],
connect_timeout=5
)
logger.debug(f"创建新的PostgreSQL连接: {key}")
except Exception as e:
logger.error(f"创建PostgreSQL连接失败: {str(e)}")
return None
return POSTGRES_CONN_POOL[key]
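# Optional sketch, not wired in: POSTGRES_CONN_POOL hands the same psycopg2 connection to
# every thread that shares a host/database/user key, and psycopg2 connections are not safe
# for concurrent use. One way to make the parallel extraction thread-safe is a
# psycopg2.pool.ThreadedConnectionPool per key; the names _PG_POOLS and
# get_pooled_connection below are illustrative, not part of the original design.
import threading
from psycopg2 import pool as pg_pool
_PG_POOLS: Dict[str, Any] = {}
_PG_POOLS_LOCK = threading.Lock()
def get_pooled_connection(source_config: Dict) -> Any:
    """Illustrative thread-safe variant of get_postgres_connection()."""
    key = f"{source_config['host']}_{source_config['database']}_{source_config['user']}"
    with _PG_POOLS_LOCK:
        if key not in _PG_POOLS:
            _PG_POOLS[key] = pg_pool.ThreadedConnectionPool(
                minconn=1, maxconn=8,
                host=source_config['host'], port=source_config['port'],
                database=source_config['database'], user=source_config['user'],
                password=source_config['password'], connect_timeout=5
            )
    # Callers must hand the connection back with _PG_POOLS[key].putconn(conn) when done
    return _PG_POOLS[key].getconn()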
def get_sql_server_connection() -> Any:
"""获取SQL Server连接"""
try:
return pyodbc.connect(SQL_SERVER_CONN_STR, timeout=5)
except pyodbc.Error as e:
logger.error(f"创建SQL Server连接失败: {str(e)}")
return None
@lru_cache(maxsize=128)
def parse_station_info(table_name: str) -> Tuple[str, str]:
"""解析线体和工作站信息(带缓存)"""
line_match = re.match(r'line(\d+)(\w+)', table_name)
if line_match:
return line_match.group(1), line_match.group(2)
return "unknown", "unknown"
def flatten_nested_list(nested_list: Any) -> str:
"""
展平任意嵌套的列表或数组,返回单个字符串
Args:
nested_list: 可能是列表、元组或其他嵌套结构的数据
Returns:
展平后的字符串
"""
# 如果不是列表,直接转换为字符串
if not isinstance(nested_list, (list, tuple)):
return str(nested_list)
# 使用队列进行广度优先遍历
queue = deque(nested_list)
result = []
while queue:
item = queue.popleft()
        # Expand nested elements back onto the queue
if isinstance(item, (list, tuple)):
queue.extend(item)
else:
result.append(str(item))
return ''.join(result)
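# Note: the breadth-first expansion above appends the contents of nested sub-lists after the
# remaining top-level elements, so the concatenated string does not necessarily preserve the
# original nesting order. This is assumed to be harmless for the matching below, because each
# "{CODE,bits}" token is expected to sit inside a single array element.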
def extract_data(source_config: Dict) -> List[Tuple]:
"""从指定数据源抽取包含目标编码的数据(使用连接池)"""
logger.info(f"从 {source_config['host']}.{source_config['table']} 抽取数据...")
data = []
try:
conn = get_postgres_connection(source_config)
if not conn:
return []
cursor = conn.cursor()
        # Keep only rows whose data array contains an element matching any target code
query = f"""
SELECT data, "createdAt", "updatedAt"
FROM {source_config['table']}
WHERE EXISTS (
SELECT 1
FROM unnest(data) AS element
WHERE element ~ %s
)
ORDER BY "createdAt" DESC
LIMIT 1
"""
        # Pass the combined regex pattern as a parameter
        cursor.execute(query, (ALL_CODES_PATTERN.pattern,))
        data = cursor.fetchall()
        logger.info(f"Fetched {len(data)} record(s) from {source_config['table']}")
cursor.close()
    except psycopg2.Error as e:
        logger.error(f"PostgreSQL query failed: {str(e)}")
    except Exception as e:
        logger.error(f"Data extraction failed: {str(e)}")
return data
def process_record(record: Tuple, source_config: Dict) -> List[Dict]:
"""处理单条记录并返回转换后的数据列表"""
if not record:
return []
data_arr, created_at, updated_at = record
table_name = source_config['table']
line_number, station_name = parse_station_info(table_name)
transformed_records = []
processed_codes = set()
    # Flatten the (possibly nested) data array into a single string
    data_str = flatten_nested_list(data_arr)
    # Look for every matching target code
for code in TARGET_CODES:
if code not in data_str:
continue
        # Extract the 16-bit binary string with the pre-compiled pattern
match = TARGET_PATTERNS[code].search(data_str)
if not match:
logger.debug(f"未找到 {code} 的二进制数据")
continue
binary_str = match.group(1)
        # Evaluate the status bits
        if len(binary_str) == 16:
            pos8 = binary_str[7]   # 8th bit from the left (index 7)
            last = binary_str[15]  # last bit (index 15)
result = 1 if pos8 == '1' and last == '1' else 0
else:
result = 0
logger.warning(f"无效的二进制字符串长度: {binary_str}")
# 避免重复处理同一个编码
if code in processed_codes:
continue
processed_codes.add(code)
transformed_records.append({
'line_number': line_number,
'station_name': station_name,
'result': result,
'created_at': created_at,
'updated_at': updated_at,
'source_ip': source_config['host_ip'],
'source_table': table_name,
'target_code': code
})
return transformed_records
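# Illustrative example (made-up values; the format follows TARGET_PATTERNS): an array element
# such as "{GY100,0000000100000001}" produces result = 1 for code GY100, because both the
# 8th bit (index 7) and the last bit (index 15) are '1'; any other combination gives 0.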
def create_target_table_if_not_exists() -> bool:
"""确保目标表存在,创建表(如果不存在)"""
try:
conn = get_sql_server_connection()
if not conn:
return False
cursor = conn.cursor()
        # Check whether the table already exists
cursor.execute(f"""
SELECT COUNT(*)
FROM information_schema.tables
WHERE table_name = '{TARGET_CONFIG['table']}'
""")
table_exists = cursor.fetchone()[0] > 0
if not table_exists:
            # Create the target table
create_table_sql = f"""
CREATE TABLE {TARGET_CONFIG['table']} (
id INT IDENTITY(1,1) PRIMARY KEY,
line_number NVARCHAR(10) NOT NULL,
station_name NVARCHAR(50) NOT NULL,
result INT NOT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
source_ip NVARCHAR(20) NOT NULL,
source_table NVARCHAR(50) NOT NULL,
target_code NVARCHAR(20) NOT NULL,
insert_time DATETIME DEFAULT GETDATE()
)
"""
cursor.execute(create_table_sql)
logger.info(f"创建目标表 {TARGET_CONFIG['table']}")
        # Create an index on target_code if it does not already exist
        try:
            cursor.execute(f"""
                IF NOT EXISTS (
                    SELECT *
                    FROM sys.indexes
                    WHERE name = 'idx_target_code' AND object_id = OBJECT_ID('{TARGET_CONFIG['table']}')
                )
                CREATE INDEX idx_target_code ON {TARGET_CONFIG['table']} (target_code)
            """)
        except Exception as e:
            logger.warning(f"Failed to create index (it may already exist): {str(e)}")
        conn.commit()
return True
    except Exception as e:
        logger.error(f"Failed to create target table: {str(e)}")
        return False
    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals() and conn:
            conn.close()
def load_data_batch(records: List[Dict]):
"""批量加载数据到SQL Server"""
if not records:
return
try:
conn = get_sql_server_connection()
if not conn:
return
cursor = conn.cursor()
        # Parameterized insert statement
insert_sql = f"""
INSERT INTO {TARGET_CONFIG['table']} (
line_number, station_name, result,
created_at, updated_at, source_ip, source_table, target_code
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
        # Build the parameter rows for the batch insert
params = [
(
r['line_number'], r['station_name'], r['result'],
r['created_at'], r['updated_at'], r['source_ip'],
r['source_table'], r['target_code']
)
for r in records
]
        # Execute the batch insert
cursor.executemany(insert_sql, params)
conn.commit()
logger.info(f"成功批量插入 {len(records)} 条数据到SQL Server")
    except pyodbc.Error as e:
        logger.error(f"Database operation failed: {str(e)}")
    except Exception as e:
        logger.error(f"Failed to load data into SQL Server: {str(e)}")
    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals() and conn:
            conn.close()
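# Possible speed-up, not enabled above (assumes Microsoft ODBC Driver 17 is in use, as
# configured in SQL_SERVER_CONN_STR): pyodbc can send the executemany() parameters as a
# single bulk parameter array instead of row by row by setting
#     cursor.fast_executemany = True
# right after creating the cursor in load_data_batch(), which is usually much faster for
# large batches.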
def process_data_source(source_config: Dict) -> int:
"""处理单个数据源并返回处理记录数"""
processed_count = 0
try:
        # Extract
        data = extract_data(source_config)
        if not data:
            logger.debug(f"No rows containing target codes found in {source_config['table']}")
            return 0
        # Use only the most recent record
        latest_record = data[0]
        # Transform
        transformed_records = process_record(latest_record, source_config)
        # Load into SQL Server
if transformed_records:
load_data_batch(transformed_records)
processed_count = len(transformed_records)
except Exception as e:
logger.error(f"处理数据源 {source_config['table']} 失败: {str(e)}")
return processed_count
def extract_transform_load():
"""ETL主流程:抽取、转换、加载,支持多个目标编码"""
logger.info(f"开始执行ETL流程,目标编码数量: {len(TARGET_CODES)}")
total_processed = 0
    # Make sure the target table exists
    if not create_target_table_if_not_exists():
        logger.error("Target table creation failed, aborting ETL run")
return
    # Process all data sources in parallel with a thread pool
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
futures = {executor.submit(process_data_source, config): config for config in DATA_SOURCES}
for future in concurrent.futures.as_completed(futures):
source_config = futures[future]
try:
processed = future.result()
total_processed += processed
logger.debug(f"处理 {source_config['table']} 完成: {processed} 条记录")
except Exception as e:
logger.error(f"处理 {source_config['table']} 失败: {str(e)}")
logger.info(f"ETL流程执行完成,总计处理 {total_processed} 条记录")
return total_processed
def run_scheduler():
"""定时任务调度"""
logger.info(f"启动定时任务调度器,每分钟执行一次多编码数据ETL")
schedule.every(1).minutes.do(extract_transform_load)
    # Run once immediately
extract_transform_load()
while True:
try:
schedule.run_pending()
time.sleep(1)
except KeyboardInterrupt:
logger.info("接收到中断信号,停止调度器")
break
except Exception as e:
logger.error(f"调度器异常: {str(e)}")
time.sleep(10) # 发生错误后等待10秒再继续
def cleanup_connections():
"""清理所有数据库连接"""
logger.info("清理数据库连接...")
for key, conn in POSTGRES_CONN_POOL.items():
try:
if not conn.closed:
conn.close()
        except Exception:
            pass
POSTGRES_CONN_POOL.clear()
if __name__ == "__main__":
try:
run_scheduler()
finally:
cleanup_connections()