import psycopg2
import pyodbc
import re
import logging
import time
import schedule
import concurrent.futures
from typing import List, Dict, Tuple, Any, Set
from functools import lru_cache
from collections import deque
import threading
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("andon_etl_optimized.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Source table names
TABLE_NAMES = ["line1block", "line1cam", "line1crank", "line1head", "line1hsg",
"line2block", "line2cam", "line2crank", "line2head"]
# Build the data-source configuration (one entry per table per host IP)
DATA_SOURCES = []
for table_name in TABLE_NAMES:
DATA_SOURCES.append({
'type': 'postgresql',
'host': '192.168.154.11',
'port': '5432',
'database': 'andondata',
'user': 'powertrain',
'password': 'andromeda',
'table': table_name,
'host_ip': '192.168.154.11'
})
DATA_SOURCES.append({
'type': 'postgresql',
'host': '192.168.151.11',
'port': '5432',
'database': 'andondata',
'user': 'powertrain',
'password': 'andromeda',
'table': table_name,
'host_ip': '192.168.151.11'
})
# SQL Server target configuration
TARGET_CONFIG = {
'server': '192.168.10.116',
'database': '现场DB',
'username': 'sa',
'password': 'gtec_6600',
'table': 'dwd_web安东生产线状态'
}
# Target code list
TARGET_CODES = [
"GY100", "GY101", "GY102", "GY103", "GY104", "GY105", "GY106", "GY107",
"GY108", "GY109", "GY10A", "GY10B", "GY10C", "GY10D", "GY10E", "GY116",
"GY11F", "GY120", "GY121", "GY122", "GY124", "GY126", "GY128", "GY130",
"GY131", "GY132", "GY133"
]
# Build a set of target codes for fast membership checks
TARGET_CODES_SET = set(TARGET_CODES)
# Pre-compile a regular expression for every target code
def create_target_patterns(codes: List[str]) -> Dict[str, re.Pattern]:
patterns = {}
for code in codes:
patterns[code] = re.compile(re.escape(code) + r',([01]{16})}')
return patterns
TARGET_PATTERNS = create_target_patterns(TARGET_CODES)
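# Illustrative example (assumes the flattened data contains fragments shaped like
# "{<code>,<16 bits>}"): for a string containing "{GY100,0101010100000001}",
# TARGET_PATTERNS['GY100'].search(data_str) captures the 16-bit field "0101010100000001".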
# Connection pool configuration
POSTGRES_CONN_POOL = {}
SQL_SERVER_CONN_STR = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={TARGET_CONFIG['server']};DATABASE={TARGET_CONFIG['database']};UID={TARGET_CONFIG['username']};PWD={TARGET_CONFIG['password']}"
POOL_LOCK = threading.Lock()
def get_postgres_connection(source_config: Dict) -> Any:
"""获取或创建PostgreSQL连接(带连接池和线程安全)"""
key = f"{source_config['host']}_{source_config['database']}_{source_config['user']}"
with POOL_LOCK:
if key not in POSTGRES_CONN_POOL or POSTGRES_CONN_POOL[key].closed:
try:
POSTGRES_CONN_POOL[key] = psycopg2.connect(
host=source_config['host'],
port=source_config['port'],
database=source_config['database'],
user=source_config['user'],
password=source_config['password'],
connect_timeout=5
)
logger.debug(f"创建新的PostgreSQL连接: {key}")
except Exception as e:
logger.error(f"创建PostgreSQL连接失败: {str(e)}")
return None
return POSTGRES_CONN_POOL[key]
def get_sql_server_connection() -> Any:
"""获取SQL Server连接"""
try:
return pyodbc.connect(SQL_SERVER_CONN_STR, timeout=5)
except pyodbc.Error as e:
logger.error(f"创建SQL Server连接失败: {str(e)}")
return None
# Cache line and station information
@lru_cache(maxsize=128)
def parse_station_info(table_name: str) -> Tuple[str, str]:
"""解析线体和工作站信息(带缓存)"""
line_match = re.match(r'line(\d+)(\w+)', table_name)
if line_match:
return line_match.group(1), line_match.group(2)
return "unknown", "unknown"
def flatten_nested_list_optimized(nested_list: Any) -> str:
"""
优化后的展平嵌套列表函数,使用迭代代替递归
Args:
nested_list: 可能是列表、元组或其他嵌套结构的数据
Returns:
展平后的字符串
"""
stack = [nested_list]
result = []
while stack:
current = stack.pop()
if isinstance(current, (list, tuple)):
            # Push children in reverse so they pop in the original order
stack.extend(reversed(current))
else:
result.append(str(current))
return ''.join(result)
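# Example: flatten_nested_list_optimized([1, [2, [3, 4]], 5]) returns "12345",
# preserving element order without hitting Python's recursion limit.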
def extract_data_optimized(source_config: Dict) -> List[Tuple]:
"""优化后的数据抽取函数,减少数据库查询时间"""
logger.info(f"从 {source_config['host']}.{source_config['table']} 抽取数据...")
try:
conn = get_postgres_connection(source_config)
if not conn:
return []
cursor = conn.cursor()
        # Optimized query: fetch only the needed columns and use an array operator
query = f"""
SELECT data, "createdAt", "updatedAt"
FROM {source_config['table']}
        WHERE data && %s -- array overlap operator, faster filter
ORDER BY "createdAt" DESC
LIMIT 1
"""
        # Pass an array built from all target codes
cursor.execute(query, ([f"{{{code}," for code in TARGET_CODES],))
data = cursor.fetchall()
cursor.close()
return data
except psycopg2.Error as e:
logger.error(f"PostgreSQL查询失败: {str(e)}")
return []
except Exception as e:
logger.error(f"抽取数据失败: {str(e)}")
return []
def process_record_optimized(record: Tuple, source_config: Dict) -> List[Dict]:
"""优化后的记录处理函数,提高处理速度"""
if not record:
return []
data_arr, created_at, updated_at = record
table_name = source_config['table']
line_number, station_name = parse_station_info(table_name)
    # Flatten the data quickly
data_str = flatten_nested_list_optimized(data_arr)
transformed_records = []
processed_codes = set()
    # Look up all target codes in a single pass
for code in TARGET_CODES:
if code not in data_str or code in processed_codes:
continue
        # Extract the binary string with the pre-compiled regex
match = TARGET_PATTERNS[code].search(data_str)
if not match:
continue
binary_str = match.group(1)
if len(binary_str) == 16:
            # Index directly into the string to avoid repeated lookups
result = 1 if binary_str[7] == '1' and binary_str[15] == '1' else 0
else:
result = 0
transformed_records.append({
'line_number': line_number,
'station_name': station_name,
'result': result,
'created_at': created_at,
'updated_at': updated_at,
'source_ip': source_config['host_ip'],
'source_table': table_name,
'target_code': code
})
processed_codes.add(code)
return transformed_records
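# Illustrative output: for table "line1block" on host 192.168.154.11, a matched code yields a
# record like {'line_number': '1', 'station_name': 'block', 'result': 1,
# 'source_ip': '192.168.154.11', 'source_table': 'line1block', 'target_code': 'GY100'},
# with created_at/updated_at copied from the source row.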
def create_target_table_if_not_exists() -> bool:
"""确保目标表存在,创建表(如果不存在)"""
try:
conn = get_sql_server_connection()
if not conn:
return False
cursor = conn.cursor()
        # Check whether the table already exists
cursor.execute(f"""
SELECT COUNT(*)
FROM information_schema.tables
WHERE table_name = '{TARGET_CONFIG['table']}'
""")
table_exists = cursor.fetchone()[0] > 0
if not table_exists:
            # Create the target table
create_table_sql = f"""
CREATE TABLE {TARGET_CONFIG['table']} (
id INT IDENTITY(1,1) PRIMARY KEY,
line_number NVARCHAR(10) NOT NULL,
station_name NVARCHAR(50) NOT NULL,
result INT NOT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
source_ip NVARCHAR(20) NOT NULL,
source_table NVARCHAR(50) NOT NULL,
target_code NVARCHAR(20) NOT NULL,
insert_time DATETIME DEFAULT GETDATE()
)
"""
cursor.execute(create_table_sql)
logger.info(f"创建目标表 {TARGET_CONFIG['table']}")
        # Create the index (if it does not exist)
try:
cursor.execute(f"""
IF NOT EXISTS (
SELECT *
FROM sys.indexes
WHERE name = 'idx_target_code' AND object_id = OBJECT_ID('{TARGET_CONFIG['table']}')
)
CREATE INDEX idx_target_code ON {TARGET_CONFIG['table']} (target_code)
""")
except Exception as e:
logger.warning(f"创建索引失败(可能已存在): {str(e)}")
conn.commit()
return True
except Exception as e:
logger.error(f"创建目标表失败: {str(e)}")
return False
    finally:
        if 'cursor' in locals() and cursor is not None:
            cursor.close()
        if 'conn' in locals() and conn is not None:
            conn.close()
def load_data_batch_optimized(records: List[Dict]):
"""优化后的批量加载函数,使用更高效的插入方法"""
if not records:
return
try:
conn = get_sql_server_connection()
if not conn:
return
cursor = conn.cursor()
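        # Optional speed-up (assumption: the installed Microsoft ODBC driver supports it):
        # pyodbc's fast_executemany can accelerate the executemany() batches below.
        # cursor.fast_executemany = True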
        # Stage the rows in a local temp table to speed up the batch insert
        try:
            # Create the temp table
cursor.execute("""
CREATE TABLE #TempAndonData
(
line_number NVARCHAR(10),
station_name NVARCHAR(50),
result INT,
created_at DATETIME,
updated_at DATETIME,
source_ip NVARCHAR(20),
source_table NVARCHAR(50),
target_code NVARCHAR(20)
)
""")
            # Batch-insert the rows into the temp table
insert_temp = """
INSERT INTO #TempAndonData
(line_number, station_name, result, created_at, updated_at, source_ip, source_table, \
target_code)
VALUES (?, ?, ?, ?, ?, ?, ?, ?) \
"""
params = [
(r['line_number'], r['station_name'], r['result'],
r['created_at'], r['updated_at'], r['source_ip'],
r['source_table'], r['target_code'])
for r in records
]
cursor.executemany(insert_temp, params)
            # Copy from the temp table into the target table
cursor.execute(f"""
INSERT INTO {TARGET_CONFIG['table']}
(line_number, station_name, result, created_at, updated_at, source_ip, source_table, target_code)
SELECT
line_number, station_name, result, created_at, updated_at, source_ip, source_table, target_code
FROM #TempAndonData
""")
conn.commit()
logger.info(f"成功批量插入 {len(records)} 条数据到SQL Server (TVP)")
except pyodbc.Error as e:
            # Fall back to a plain batch insert if the temp-table path is unavailable
            logger.warning(f"Temp-table insert failed, falling back to a plain batch insert: {str(e)}")
insert_sql = f"""
INSERT INTO {TARGET_CONFIG['table']} (
line_number, station_name, result,
created_at, updated_at, source_ip, source_table, target_code
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
params = [
(
r['line_number'], r['station_name'], r['result'],
r['created_at'], r['updated_at'], r['source_ip'],
r['source_table'], r['target_code']
)
for r in records
]
cursor.executemany(insert_sql, params)
conn.commit()
logger.info(f"成功批量插入 {len(records)} 条数据到SQL Server (常规)")
except pyodbc.Error as e:
logger.error(f"数据库操作失败: {str(e)}")
conn.rollback()
except Exception as e:
logger.error(f"加载数据到SQL Server失败: {str(e)}")
    finally:
        if 'cursor' in locals() and cursor is not None:
            cursor.close()
        if 'conn' in locals() and conn is not None:
            conn.close()
def process_data_source(source_config: Dict) -> int:
"""处理单个数据源并返回处理记录数"""
try:
        # Extract the data
data = extract_data_optimized(source_config)
if not data:
return 0
        # Work with the latest record only
latest_record = data[0]
        # Transform the data
transformed_records = process_record_optimized(latest_record, source_config)
        # Load into SQL Server
if transformed_records:
load_data_batch_optimized(transformed_records)
return len(transformed_records)
return 0
except Exception as e:
logger.error(f"处理数据源 {source_config['table']} 失败: {str(e)}")
return 0
def extract_transform_load_optimized():
"""优化后的ETL主流程"""
logger.info(f"开始执行优化ETL流程,目标编码数量: {len(TARGET_CODES)}")
total_processed = 0
start_time = time.time()
    # Make sure the target table exists
if not create_target_table_if_not_exists():
logger.error("目标表创建失败,中止ETL流程")
return
    # Process the data sources in parallel with a thread pool sized by the number of sources
    num_workers = min(16, len(DATA_SOURCES) * 2)  # cap at 16 workers
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = {executor.submit(process_data_source, config): config for config in DATA_SOURCES}
for future in concurrent.futures.as_completed(futures):
source_config = futures[future]
try:
processed = future.result()
total_processed += processed
except Exception as e:
logger.error(f"处理 {source_config['table']} 失败: {str(e)}")
duration = time.time() - start_time
logger.info(f"优化ETL流程执行完成,总计处理 {total_processed} 条记录,耗时 {duration:.2f} 秒")
return total_processed
def run_scheduler():
"""定时任务调度"""
logger.info(f"启动定时任务调度器,每分钟执行一次优化版ETL")
schedule.every(1).minutes.do(extract_transform_load_optimized)
    # Run once immediately
extract_transform_load_optimized()
while True:
try:
schedule.run_pending()
time.sleep(1)
except KeyboardInterrupt:
logger.info("接收到中断信号,停止调度器")
break
except Exception as e:
logger.error(f"调度器异常: {str(e)}")
time.sleep(10)
def cleanup_connections():
"""清理所有数据库连接"""
logger.info("清理数据库连接...")
for key, conn in list(POSTGRES_CONN_POOL.items()):
try:
if not conn.closed:
conn.close()
del POSTGRES_CONN_POOL[key]
        except Exception:
            pass
if __name__ == "__main__":
try:
run_scheduler()
finally:
cleanup_connections()
Each data field in the source holds several thousand key-value pairs; optimize the Python script to improve data-processing speed.