Part 1: Overall Workflow
Data template produced by the queries: each row is a Simulation (simulation run) and Time (time step); the columns Var1-Var8 hold the variable values.
| Simulation | Time Index | Var1 | Var2 | Var3 | Var4 | Var5 | Var6 | Var7 | Var8 |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | ||||||||
| 1 | 1 | ||||||||
| 1 | 2 | ||||||||
| 1 | 3 | ||||||||
| … | … | … | … | … | … | … | … | … | … |
| 1 | 99 | ||||||||
| 2 | 0 | ||||||||
| … | … | … | … | … | … | … | … | … | … |
CSV data template: each row is a (Simulation, Var) combination, the columns are the time steps 0-99, and the values are the variable's value at that time step.
| Simulation | Var | 0 | 1 | 2 | … | 99 |
|---|---|---|---|---|---|---|
| 1 | Var1 | … | ||||
| 1 | Var2 | … | ||||
| 1 | Var3 | … | ||||
| 1 | Var4 | … | ||||
| 1 | Var5 | … | ||||
| 1 | Var6 | … | ||||
| 1 | Var7 | … | ||||
| 1 | Var8 | … | ||||
| 2 | Var1 | … | ||||
| 2 | Var2 | … | ||||
| 2 | Var3 | … | ||||
| 2 | Var4 | … | ||||
| 2 | Var5 | … | ||||
| … | … | … | … | … | … | … |
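Before diving into the implementation, here is a minimal, self-contained sketch of the long-to-wide reshape that turns the query layout into the CSV layout (toy data; the column names assume the two templates above):
import pandas as pd

# Toy data in the query layout: one row per (Simulation, Time), Var1/Var2 as columns.
query_df = pd.DataFrame({
    "Simulation": [1, 1, 2, 2],
    "Time": [0, 1, 0, 1],
    "Var1": [0.80, 0.81, 0.75, 0.76],
    "Var2": [1.10, 1.12, 1.05, 1.07],
})
# Long format: one row per (Simulation, Time, Var).
long_df = query_df.melt(id_vars=["Simulation", "Time"], var_name="Var", value_name="Value")
# CSV layout: (Simulation, Var) rows, one column per time step.
csv_df = (long_df
          .pivot_table(index=["Simulation", "Var"], columns="Time",
                       values="Value", aggfunc="first")
          .reset_index())
print(csv_df)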
Part 2: Implementing the Key Steps
Step 1: Copy the table and queries from the source Access file into the target files
Drive Access from Python to copy 1 table and 7 queries into 10 target .mdb files. Recommended libraries: pyodbc (to work with the database structure and data), pywin32 (Access COM automation, used in the alternative approach below), and shutil (to copy database files, optional).
Preparation
- Install the dependencies: pyodbc (Access via ODBC), pandas (data handling), pywin32 (Access COM interface):
pip install pywin32 pandas pyodbc
- Check the Access driver: on Windows the Microsoft Access Database Engine must be installed, and its 32/64-bit build must match the Python interpreter, as shown in the check below.
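To verify that the Access ODBC driver is installed and matches your Python bitness, you can list the installed drivers (pyodbc.drivers() is a standard pyodbc call):
import pyodbc

# Print all installed ODBC drivers; look for "Microsoft Access Driver (*.mdb, *.accdb)".
# If it is missing, install the Microsoft Access Database Engine that matches your
# Python interpreter (32-bit vs 64-bit).
for driver in pyodbc.drivers():
    print(driver)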
import os
import shutil
import pyodbc
from tqdm import tqdm  # progress bar (optional)

def copy_access_objects(source_db, target_dir, table_names, query_names,
                        num_targets=10, blank_template=None):
    """
    Copy tables (and, with limitations, queries) from a source Access file into several target files.
    :param source_db: path to the source Access file (.mdb)
    :param target_dir: directory in which the target files are stored
    :param table_names: list of table names to copy (e.g. ["Table1"])
    :param query_names: list of query names to copy (e.g. ["Query1", "Query2", ...])
    :param num_targets: number of target files (default 10)
    :param blank_template: path to an empty .mdb used to create missing targets
                           (the ODBC driver cannot create a brand-new .mdb by itself)
    """
    # Make sure the target directory exists
    os.makedirs(target_dir, exist_ok=True)
    # Connect to the source database
    driver = "{Microsoft Access Driver (*.mdb, *.accdb)}"
    source_conn = pyodbc.connect(f"DRIVER={driver};DBQ={source_db};")
    source_cursor = source_conn.cursor()
    # Loop over the 10 target files
    for i in tqdm(range(1, num_targets + 1), desc="copy progress"):
        target_db = os.path.join(target_dir, f"Target_{i}.mdb")
        # Create the target database file if needed. pyodbc/ODBC cannot create a new
        # .mdb, so an empty template file (assumed to exist) is copied instead.
        if not os.path.exists(target_db):
            if blank_template is None:
                print(f"{target_db} does not exist and no blank template was given - skipped")
                continue
            shutil.copyfile(blank_template, target_db)
        target_conn = pyodbc.connect(f"DRIVER={driver};DBQ={target_db};")
        target_cursor = target_conn.cursor()
        try:
            # Copy table structure + data
            for table in table_names:
                # Fetch the structure only (no rows)
                source_cursor.execute(f"SELECT * FROM [{table}] WHERE 1=0")
                columns = source_cursor.description
                col_defs = []
                for col in columns:
                    # Simplified: pyodbc's type_code (a Python type) has to be mapped to
                    # an Access SQL type (TEXT, INTEGER, DOUBLE, DATETIME, ...); adjust as needed.
                    col_type = "DOUBLE" if col[1] is float else "TEXT(255)"
                    col_defs.append(f"[{col[0]}] {col_type}")
                # Create the table in the target database
                target_cursor.execute(f"CREATE TABLE [{table}] ({', '.join(col_defs)})")
                # Copy the data (if the table is not empty)
                data = source_cursor.execute(f"SELECT * FROM [{table}]").fetchall()
                if data:
                    placeholders = ", ".join(["?"] * len(columns))
                    target_cursor.executemany(
                        f"INSERT INTO [{table}] VALUES ({placeholders})", data)
                target_conn.commit()
            # Copy queries - caveat: Access has no INFORMATION_SCHEMA and the ODBC driver
            # does not expose the SQL text of saved queries, so they cannot be read back here.
            # If you already have each query's SQL text, a plain SELECT query can be recreated
            # in the target with, e.g.
            #   target_cursor.execute(f"CREATE VIEW [{query}] AS {query_sql}")
            # For a faithful copy of all 7 queries, use the COM-based approach further below.
            for query in query_names:
                print(f"Query '{query}' not copied via ODBC; see the COM-based method below.")
        except Exception as e:
            print(f"Copy into {target_db} failed: {e}")
            continue
        finally:
            # Close the target connection
            target_conn.close()
    source_conn.close()

# Example call (adjust the parameters to your environment)
source_mdb = r"C:\SourceDB.mdb"
target_dir = r"C:\TargetDBs"
tables_to_copy = ["YourTable"]                          # replace with the real table name
queries_to_copy = ["Query1", "Query2", "Query3", ...]   # replace with the real 7 query names
# pass blank_template=<path to an empty .mdb> if the target files do not exist yet
copy_access_objects(source_mdb, target_dir, tables_to_copy, queries_to_copy, num_targets=10)
Notes
- Field type mapping: the type_code returned by pyodbc does not correspond directly to Access column types (TEXT, DATE, ...), so a manual mapping is needed (see https://pyodbc.readthedocs.io/en/latest/usage.html#data-types).
- Query dependencies: if a query depends on other tables or queries, make sure all dependent objects have already been copied into the target database.
- Large-table performance: executemany can be slow for large tables; read and insert in chunks instead (fetchmany(1000)), as in the sketch below.
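A minimal sketch of the chunked copy mentioned above; copy_table_in_chunks and its parameters are illustrative names, assuming the open source/target cursors from copy_access_objects:
def copy_table_in_chunks(source_cursor, target_cursor, target_conn,
                         table, n_columns, chunk_size=1000):
    """Copy one table in chunks of `chunk_size` rows to keep memory use bounded."""
    placeholders = ", ".join(["?"] * n_columns)
    insert_sql = f"INSERT INTO [{table}] VALUES ({placeholders})"
    source_cursor.execute(f"SELECT * FROM [{table}]")
    while True:
        rows = source_cursor.fetchmany(chunk_size)   # next chunk of rows
        if not rows:
            break
        target_cursor.executemany(insert_sql, rows)
        target_conn.commit()  # commit per chunk so a failure loses at most one chunk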
Possible improvements:
Database connection
import pyodbc
import pandas as pd

def connect_access(db_path):
    """Open a connection to an Access database."""
    driver = r"{Microsoft Access Driver (*.mdb, *.accdb)}"
    conn_str = f"DRIVER={driver};DBQ={db_path};"
    try:
        conn = pyodbc.connect(conn_str)
        return conn
    except pyodbc.Error as e:
        raise Exception(f"Failed to connect to the database: {e}")

def read_table_data(conn, table_name):
    """Read an entire table into a DataFrame (suitable for small data volumes)."""
    query = f"SELECT * FROM {table_name}"
    return pd.read_sql(query, conn)
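A short usage sketch of the two helpers above (the path and table name are placeholders):
# Connect, read one table and close the connection again.
conn = connect_access(r"C:\SourceDB.mdb")     # placeholder path
try:
    df = read_table_data(conn, "YourTable")   # placeholder table name
    print(df.head())
finally:
    conn.close()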
Configuration parameters
Manage the paths and object lists in a config.yaml file (easier to maintain):
source_db: "path/to/source.mdb"            # source Access file
target_dir: "path/to/target_dir/"          # directory for the target files
target_files: ["target1.mdb", ..., "target10.mdb"]  # the 10 target file names
tables_to_copy: ["SourceTable"]            # the 1 table to copy
queries_to_copy: ["Query1", "Query2", ..., "Query7"]  # the 7 queries to copy
output_csv_dir: "path/to/csv_output/"      # CSV output directory
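Loading and sanity-checking this configuration is a few lines of PyYAML; the key names follow the config.yaml sketch above (this helper is an illustration, not part of the original code):
import yaml

# Load config.yaml and fail early if a required key is missing.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

required_keys = ["source_db", "target_dir", "target_files",
                 "tables_to_copy", "queries_to_copy", "output_csv_dir"]
missing = [k for k in required_keys if k not in config]
if missing:
    raise KeyError(f"config.yaml is missing keys: {missing}")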
Use win32com to drive the Access COM interface and copy the tables and queries into the target file one by one.
import os
import win32com.client as win32

# DoCmd.CopyObject object-type constants: acTable = 0, acQuery = 1
AC_TABLE = 0
AC_QUERY = 1

def copy_objects_to_target(source_db_path, target_db_path, tables, queries):
    """Copy tables and queries from the source database into one target database."""
    access = None
    try:
        # Start Access in the background
        access = win32.Dispatch('Access.Application')
        access.Visible = False
        # If the target file does not exist yet, create it first, then close it again
        if not os.path.exists(target_db_path):
            access.NewCurrentDatabase(target_db_path)
            access.CloseCurrentDatabase()
        # Open the source database as the current database (not exclusive)
        access.OpenCurrentDatabase(source_db_path, False)
        source_db = access.CurrentDb()
        # Collect the object names that actually exist in the source
        existing_tables = {source_db.TableDefs(i).Name for i in range(source_db.TableDefs.Count)}
        existing_queries = {source_db.QueryDefs(i).Name for i in range(source_db.QueryDefs.Count)}
        # Copy tables: DoCmd.CopyObject(destination, new_name, object_type, source_name)
        for table in tables:
            if table in existing_tables:
                access.DoCmd.CopyObject(target_db_path, table, AC_TABLE, table)
                print(f"Table {table} copied to {target_db_path}")
        # Copy queries
        for query in queries:
            if query in existing_queries:
                access.DoCmd.CopyObject(target_db_path, query, AC_QUERY, query)
                print(f"Query {query} copied to {target_db_path}")
        return True
    except Exception as e:
        print(f"Copy failed: {str(e)}")
        return False
    finally:
        # Close the database and quit Access
        if access is not None:
            try:
                access.CloseCurrentDatabase()
            except Exception:
                pass
            access.Quit()
            del access
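A usage sketch for the function above (paths and object names are placeholders):
# Copy one table and seven queries into a single target file.
ok = copy_objects_to_target(
    source_db_path=r"C:\SourceDB.mdb",              # placeholder path
    target_db_path=r"C:\TargetDBs\Target_1.mdb",    # placeholder path
    tables=["YourTable"],
    queries=[f"Query{i}" for i in range(1, 8)],
)
print("copy succeeded" if ok else "copy failed")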
Complete example: an enterprise-grade object-copy tool
import os
import win32com.client as win32
from win32com.client import constants

class AccessObjectCopier:
    """Enterprise-grade Access object-copy tool."""

    def __init__(self, visible=False):
        # EnsureDispatch generates the type library so that win32com.client.constants
        # (acTable, acQuery, ...) is populated.
        self.access = win32.gencache.EnsureDispatch("Access.Application")
        self.access.Visible = visible
        self.source_db = None
        self.source_path = None

    def connect_to_source(self, db_path):
        """Connect to the source database."""
        try:
            self.access.OpenCurrentDatabase(db_path)
            self.source_db = self.access.CurrentDb()
            self.source_path = db_path
            return True
        except Exception as e:
            print(f"Failed to connect to the source database: {e}")
            return False

    def ensure_target_exists(self, target_path):
        """Make sure the target database exists."""
        if not os.path.exists(target_path):
            try:
                # Create the new database ...
                self.access.NewCurrentDatabase(target_path)
                self.access.CloseCurrentDatabase()
                # ... then reopen the source database and refresh the reference
                self.access.OpenCurrentDatabase(self.source_path)
                self.source_db = self.access.CurrentDb()
                return True
            except Exception as e:
                print(f"Failed to create the target database: {e}")
                return False
        return True

    def copy_objects(self, target_path, objects, overwrite=False):
        """
        Copy objects to the target database.
        Args:
            target_path: path of the target database
            objects: dict of objects to copy ({"tables": [...], "queries": [...]})
            overwrite: whether to overwrite objects that already exist
        """
        if not self.ensure_target_exists(target_path):
            return False
        results = {"success": [], "failed": []}
        # Copy tables
        for table_name in objects.get("tables", []):
            if self.copy_table(table_name, target_path, overwrite):
                results["success"].append(f"table: {table_name}")
            else:
                results["failed"].append(f"table: {table_name}")
        # Copy queries
        for query_name in objects.get("queries", []):
            if self.copy_query(query_name, target_path, overwrite):
                results["success"].append(f"query: {query_name}")
            else:
                results["failed"].append(f"query: {query_name}")
        return results
    def copy_table(self, table_name, target_path, overwrite=False):
        """Copy a table."""
        try:
            # If overwriting is allowed and the object already exists, delete it first
            if overwrite and self.object_exists_in_target(target_path, table_name, "table"):
                self.delete_object_in_target(target_path, table_name, constants.acTable)
            self.access.DoCmd.CopyObject(target_path, table_name, constants.acTable, table_name)
            return True
        except Exception as e:
            print(f"Failed to copy table '{table_name}': {e}")
            return False

    def copy_query(self, query_name, target_path, overwrite=False):
        """Copy a query."""
        try:
            # If overwriting is allowed and the object already exists, delete it first
            if overwrite and self.object_exists_in_target(target_path, query_name, "query"):
                self.delete_object_in_target(target_path, query_name, constants.acQuery)
            self.access.DoCmd.CopyObject(target_path, query_name, constants.acQuery, query_name)
            return True
        except Exception as e:
            print(f"Failed to copy query '{query_name}': {e}")
            return False
    def object_exists_in_target(self, target_path, object_name, object_type):
        """Check whether an object already exists in the target database."""
        # Remember which database is currently open
        current_db = self.access.CurrentDb().Name
        try:
            # Temporarily switch to the target database
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(target_path)
            target_db = self.access.CurrentDb()
            # Look for the object by name
            exists = False
            if object_type == "table":
                for i in range(target_db.TableDefs.Count):
                    if target_db.TableDefs(i).Name == object_name:
                        exists = True
                        break
            elif object_type == "query":
                for i in range(target_db.QueryDefs.Count):
                    if target_db.QueryDefs(i).Name == object_name:
                        exists = True
                        break
            # Switch back to the source database and refresh the reference
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(current_db)
            self.source_db = self.access.CurrentDb()
            return exists
        except Exception as e:
            print(f"Failed to check whether the target object exists: {e}")
            # Try to restore the connection to the source database
            try:
                self.access.OpenCurrentDatabase(current_db)
            except Exception:
                pass
            return False

    def delete_object_in_target(self, target_path, object_name, object_type):
        """Delete an object in the target database."""
        # Remember which database is currently open
        current_db = self.access.CurrentDb().Name
        try:
            # Switch to the target database
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(target_path)
            # Delete the object (object_type is an acObjectType constant)
            self.access.DoCmd.DeleteObject(object_type, object_name)
            # Switch back to the source database and refresh the reference
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(current_db)
            self.source_db = self.access.CurrentDb()
            return True
        except Exception as e:
            print(f"Failed to delete the target object: {e}")
            # Try to restore the connection to the source database
            try:
                self.access.OpenCurrentDatabase(current_db)
            except Exception:
                pass
            return False
    def close(self):
        """Quit Access and release the COM object."""
        try:
            self.access.Quit()
        except Exception:
            pass

# Usage example
copier = AccessObjectCopier(visible=False)
if copier.connect_to_source("C:\\Source.accdb"):
    objects_to_copy = {
        "tables": ["Customers", "Orders"],
        "queries": ["SalesReport", "CustomerList"]
    }
    results = copier.copy_objects("C:\\Backup.accdb", objects_to_copy, overwrite=True)
    print("Copy results:", results)
copier.close()
Step 2: Run the data-conversion function with multiple processes
Use multiprocessing.Pool to process the 10 target files in parallel; each process opens its own connection to one target Access file, reads the Query1 and Query2 data, and converts it to CSV.
Conversion logic: reshape the long-format data into wide format, using pandas.pivot_table or melt + pivot (see the small reshape example in Part 1).
Per-target-file processing function
import os
import pandas as pd
import pyodbc
from multiprocessing import Pool

def process_target_file(target_db_path):
    """Process one target Access file: read Query1 and Query2 and write a CSV."""
    conn = None
    try:
        # Connect to the target database
        driver = "{Microsoft Access Driver (*.mdb, *.accdb)}"
        conn = pyodbc.connect(f"DRIVER={driver};DBQ={target_db_path};")
        # Read Query1 and Query2 (Query1 is assumed to hold the raw data of the first template)
        query1_df = pd.read_sql("SELECT * FROM Query1", conn)
        query2_df = pd.read_sql("SELECT * FROM Query2", conn)  # handle Query2 separately if it serves another purpose (e.g. configuration)
        # Data cleaning: if the column names differ from the template
        # (Simulation, Time, Var1-Var8), rename them here, e.g.
        # query1_df = query1_df.rename(columns={"Sim": "Simulation", ...})
        # Extract the time step. The Time column may contain redundant information
        # (e.g. "0 0.80046"); here the first token is assumed to be the time step.
        query1_df[["TimeStep", "TimeRest"]] = query1_df["Time"].astype(str).str.split(" ", n=1, expand=True)
        query1_df["TimeStep"] = query1_df["TimeStep"].astype(int)  # integer time step (0-99)
        # Keep only valid time steps (0-99)
        query1_df = query1_df[query1_df["TimeStep"].between(0, 99)]
        # Reshape into the CSV template:
        # rows: Simulation + Var; columns: TimeStep (0-99); values: variable value
        var_cols = [f"Var{i}" for i in range(1, 9)]
        long_df = query1_df.melt(id_vars=["Simulation", "TimeStep"],
                                 value_vars=var_cols,
                                 var_name="Var", value_name="Value")
        pivot_df = long_df.pivot_table(
            index=["Simulation", "Var"],   # row labels
            columns="TimeStep",            # column labels (time steps)
            values="Value",                # variable values
            aggfunc="first"                # take the first value if duplicates exist
        ).reset_index()
        # Make sure the columns are Simulation, Var, 0 ... 99 (missing steps become NaN)
        pivot_df.columns.name = None
        pivot_df.columns = [str(c) for c in pivot_df.columns]
        time_columns = [str(ts) for ts in range(100)]
        pivot_df = pivot_df.reindex(columns=["Simulation", "Var"] + time_columns)
        # Write the CSV next to the target file (adjust the path if needed)
        output_csv = os.path.splitext(target_db_path)[0] + "_output.csv"
        pivot_df.to_csv(output_csv, index=False)
        return f"Processed: {target_db_path} -> {output_csv}"
    except Exception as e:
        return f"Failed: {target_db_path}, error: {e}"
    finally:
        if conn is not None:
            conn.close()

# Example call: process the 10 target files with multiple processes
if __name__ == "__main__":
    target_dir = r"C:\TargetDBs"
    target_files = [os.path.join(target_dir, f"Target_{i}.mdb") for i in range(1, 11)]
    # Use a process pool (number of processes = CPU cores, capped at 10)
    with Pool(processes=min(10, os.cpu_count())) as pool:
        results = pool.map(process_target_file, target_files)
    # Print the results
    for res in results:
        print(res)
Notes
- Time-step handling: the Time column of the first template may contain non-integer or redundant information (e.g. "0 0.80046"); extract the effective time step according to your actual data (here the first token is taken as the time point).
- Variable alignment: make sure the variables in Query1 (Var1-Var8) exactly match the row labels of the CSV template, to avoid wrong column names.
- Missing values: some time steps in the CSV template may be empty; keep them as empty values (NaN) or fill a default (such as "-") when converting, as in the snippet below.
- Multiprocess safety: each process opens its own database connection, which avoids file-lock conflicts (Access does not support concurrent writes from multiple processes, but here we only read).
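For the missing-value point above, pandas lets you choose how NaN is written to the CSV via the na_rep argument of to_csv (a small illustration with toy data, not part of the original function):
import pandas as pd

df = pd.DataFrame({"Simulation": [1], "Var": ["Var1"], "0": [0.8], "1": [None]})
df.to_csv("with_empty_cells.csv", index=False)               # NaN -> empty cell
df.to_csv("with_placeholder.csv", index=False, na_rep="-")   # NaN -> "-"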
An alternative solution for Step 2:
import os
import pandas as pd
import win32com.client as win32

def recordset_to_dataframe(rs):
    """Helper: turn a DAO Recordset into a pandas DataFrame."""
    columns = [field.Name for field in rs.Fields]
    if rs.EOF:
        return pd.DataFrame(columns=columns)
    # GetRows returns the data field-major (one tuple per field), so transpose it to rows
    data = rs.GetRows(1000000)  # fetch up to 1,000,000 rows, i.e. everything here
    return pd.DataFrame(list(zip(*data)), columns=columns)

def transform_query_to_csv(target_db_path, output_dir):
    """Read Query1 and Query2 from the target database and write them to CSV."""
    access = None
    try:
        # Connect to the target database (Access runs in the background)
        access = win32.gencache.EnsureDispatch('Access.Application')
        access.Visible = False
        access.OpenCurrentDatabase(target_db_path, False)
        db = access.CurrentDb()
        # Read Query1 and Query2 into DataFrames
        df1 = recordset_to_dataframe(db.OpenRecordset("SELECT * FROM Query1"))
        df2 = recordset_to_dataframe(db.OpenRecordset("SELECT * FROM Query2"))
        # Reshape (example: melt Var1-Var8 into Var/Value pairs and pivot back to one
        # row per (Simulation, Time) with columns Var0-Var99; use the pivot in the main
        # solution above if you need the (Simulation, Var) x time-step CSV layout)
        def reshape_df(df):
            # Assumed source columns: Simulation, Time, Var1-Var8 (adjust as needed)
            id_vars = ['Simulation', 'Time']
            value_vars = [f'Var{i}' for i in range(1, 9)]
            melted = df.melt(id_vars=id_vars, value_vars=value_vars,
                             var_name='Var', value_name='Value')
            # Pivot back to wide format, one column per variable
            pivoted = melted.pivot_table(
                index=['Simulation', 'Time'],
                columns='Var',
                values='Value',
                aggfunc='first'
            ).reset_index()
            # Fixed column order Simulation, Time, Var0...Var99; columns that do not
            # exist in the data (here Var0 and Var9-Var99) are filled with NaN
            cols = ['Simulation', 'Time'] + [f'Var{i}' for i in range(0, 100)]
            return pivoted.reindex(columns=cols)
        # Reshape and save as CSV
        df1_reshaped = reshape_df(df1)
        df2_reshaped = reshape_df(df2)
        # Output paths (example: Target_1.mdb_Query1.csv)
        csv1_path = os.path.join(output_dir, f"{os.path.basename(target_db_path)}_Query1.csv")
        csv2_path = os.path.join(output_dir, f"{os.path.basename(target_db_path)}_Query2.csv")
        df1_reshaped.to_csv(csv1_path, index=False)
        df2_reshaped.to_csv(csv2_path, index=False)
        print(f"Conversion finished: {csv1_path}, {csv2_path}")
        return True
    except Exception as e:
        print(f"Conversion failed: {str(e)}")
        return False
    finally:
        # Close the database and quit Access
        if access is not None:
            try:
                access.CloseCurrentDatabase()
            except Exception:
                pass
            access.Quit()
            del access
import multiprocessing
import os
import yaml

def process_single_target(target_file, config):
    """Full processing pipeline for one target file (copy + convert)."""
    source_db = config['source_db']
    target_dir = config['target_dir']
    output_csv_dir = config['output_csv_dir']
    # Full path of the target database
    target_db_path = os.path.join(target_dir, target_file)
    # Step 1: copy the table and queries into the target file
    tables = config['tables_to_copy']
    queries = config['queries_to_copy']
    copy_success = copy_objects_to_target(source_db, target_db_path, tables, queries)
    if not copy_success:
        return f"{target_file}: copy failed, conversion skipped"
    # Step 2: convert the data and write the CSV files
    transform_success = transform_query_to_csv(target_db_path, output_csv_dir)
    return f"{target_file}: done" if transform_success else f"{target_file}: conversion failed"

def main():
    # Load the configuration
    with open("config.yaml", "r") as f:
        config = yaml.safe_load(f)
    # Create the output directories if they do not exist
    os.makedirs(config['target_dir'], exist_ok=True)
    os.makedirs(config['output_csv_dir'], exist_ok=True)
    # Create a process pool (10 processes, one per target file)
    with multiprocessing.Pool(processes=10) as pool:
        # Build the task arguments (one per target file)
        tasks = [(file, config) for file in config['target_files']]
        # Run in parallel (starmap passes multiple arguments)
        results = pool.starmap(process_single_target, tasks)
    # Print a summary of the results
    print("\n===== Results =====")
    for res in results:
        print(res)

if __name__ == "__main__":
    main()
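One practical caveat when combining multiprocessing with COM automation: each worker starts its own Access.Application instance, so ten parallel processes can be resource-heavy. In addition, if a worker raises errors such as "CoInitialize has not been called", COM can be initialized explicitly at the start of the worker; pythoncom ships with pywin32, and the wrapper below is a hedged sketch, not part of the original code:
import pythoncom

def process_single_target_com_safe(target_file, config):
    """Wrapper that initializes/uninitializes COM around the real worker function."""
    pythoncom.CoInitialize()   # initialize COM for this worker process/thread
    try:
        return process_single_target(target_file, config)
    finally:
        pythoncom.CoUninitialize()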
