Python Access 处理示例

一、整体流程概览

源Access文件
复制表和查询到10个目标Access文件
多进程并行处理10个目标文件
每个进程读取Query1和Query2数据
数据转换为csv模板格式
输出CSV文件

Query生成的数据模板​:行是Simulation(模拟次数)和Time(时间步),列是Var1-Var8(变量值)。

SimulationTime IndexVar1Var2Var3Var4Var5Var6Var7Var8
10
11
12
13
199
20

csv数据模板​:行是(Simulation, Var)组合,列是Time(0-99),值是变量在该时间步的值。

SimulationVar01299
1Var1
1Var2
1Var3
1Var4
1Var5
1Var6
1Var7
1Var8
2Var1
2Var2
2Var3
2Var4
2Var5

二、关键步骤实现

步骤1:复制Access文件中的表和查询到目标文件

需通过Python操作Access数据库,复制1个表7个查询到10个目标.mdb文件。推荐使用pyodbc(操作数据库结构)和shutil(复制表数据文件,可选)。

准备工作

  • 安装依赖库:pyodbc(操作Access)、pandas(数据处理)。
 pip install pyodbc pandas
pip install pywin32 pandas pyodbc  # pywin32操作Access COM接口,pandas处理数据
  • 确认Access驱动:Windows系统需安装Microsoft Access Database Engine,32/64位需与Python环境匹配。
import pyodbc
import os
from tqdm import tqdm  # 进度条(可选)

def copy_access_objects(source_db, target_dir, table_names, query_names, num_targets=10):
    """
    将源Access文件中的表和查询复制到多个目标文件
    :param source_db: 源Access文件路径(.mdb)
    :param target_dir: 目标文件存储目录
    :param table_names: 需要复制的表名列表(如["Table1"])
    :param query_names: 需要复制的查询名列表(如["Query1", "Query2", ...])
    :param num_targets: 目标文件数量(默认10个)
    """
    # 确保目标目录存在
    os.makedirs(target_dir, exist_ok=True)
    
    # 连接源数据库
    source_conn_str = f"DRIVER={{Microsoft Access Driver (*.mdb)}};DBQ={source_db};"
    source_conn = pyodbc.connect(source_conn_str)
    source_cursor = source_conn.cursor()
    
    # 遍历生成10个目标文件
    for i in tqdm(range(1, num_targets+1), desc="复制进度"):
        target_db = os.path.join(target_dir, f"Target_{i}.mdb")
        
        # 创建目标数据库(空文件)
        target_conn_str = f"DRIVER={{Microsoft Access Driver (*.mdb)}};DBQ={target_db};"
        target_conn = pyodbc.connect(target_conn_str)
        target_cursor = target_conn.cursor()
        
        try:
            # 复制表结构+数据
            for table in table_names:
                # 获取表结构(字段名、类型、约束)
                source_cursor.execute(f"SELECT * FROM {table} WHERE 1=0")  # 仅获取结构
                columns = source_cursor.description
                col_defs = []
                for col in columns:
                    # 简化处理:仅获取字段名和类型(需根据实际调整)
                    col_type = str(col.type_code)  # 实际需映射到Access类型(如TEXT=10, INTEGER=3)
                    col_defs.append(f"[{col[0]}] {col_type}")
                
                # 在目标库创建表
                create_table_sql = f"CREATE TABLE [{table}] ({', '.join(col_defs)})"
                target_cursor.execute(create_table_sql)
                
                # 复制数据(若表非空)
                data = source_cursor.execute(f"SELECT * FROM {table}").fetchall()
                if data:
                    placeholders = ", ".join(["?"] * len(columns))
                    insert_sql = f"INSERT INTO [{table}] VALUES ({placeholders})"
                    target_cursor.executemany(insert_sql, data)
                target_conn.commit()
            
            # 复制查询(保存为目标库的查询对象)
            for query in query_names:
                # 获取源查询的SQL定义
                source_cursor.execute(f"SELECT SQL FROM INFORMATION_SCHEMA.QUERIES WHERE NAME='{query}'")
                query_sql = source_cursor.fetchone()[0]
                
                # 在目标库创建同名查询
                target_cursor.execute(f"CREATE QUERY [{query}] AS {query_sql}")
                target_conn.commit()
        
        except Exception as e:
            print(f"目标文件 {target_db} 复制失败: {e}")
            continue
        
        finally:
            # 关闭连接
            target_conn.close()
    source_conn.close()

# 示例调用(根据实际修改参数)
source_mdb = r"C:\SourceDB.mdb"
target_dir = r"C:\TargetDBs"
tables_to_copy = ["YourTable"]  # 替换为实际表名
queries_to_copy = ["Query1", "Query2", "Query3", ...]  # 替换为实际7个查询名
copy_access_objects(source_mdb, target_dir, tables_to_copy, queries_to_copy, num_targets=10)

注意事项

  • 字段类型映射pyodbc获取的type_code可能与Access实际类型(如TEXTDATE)不一致,需手动映射(参考https://pyodbc.readthedocs.io/en/latest/usage.html#data-types)。
  • 查询依赖:若查询依赖其他表或查询,需确保目标库中已复制所有依赖对象。
  • 大表性能:复制大表时,executemany可能较慢,可分块读取(fetchmany(1000))并插入。

优化可能的方法:

数据库连接
import pyodbc
import pandas as pd

def connect_access(db_path):
    """连接Access数据库"""
    driver = r"{Microsoft Access Driver (*.mdb, *.accdb)}"
    conn_str = f"DRIVER={driver};DBQ={db_path};"
    try:
        conn = pyodbc.connect(conn_str)
        return conn
    except pyodbc.Error as e:
        raise Exception(f"数据库连接失败: {e}")

def read_table_data(conn, table_name):
    """读取指定表的全部数据(小数据量适用)"""
    query = f"SELECT * FROM {table_name}"
    return pd.read_sql(query, conn)
配置参数定义

通过config.yaml管理路径和对象列表(更易维护):

source_db: "path/to/source.mdb"       # 源Access文件
target_dir: "path/to/target_dir/"     # 目标文件存储目录
target_files: ["target1.mdb", ..., "target10.mdb"]  # 10个目标文件名
tables_to_copy: ["SourceTable"]       # 需复制的1个表名
queries_to_copy: ["Query1", "Query2", ..., "Query7"]  # 需复制的7个查询名
output_csv_dir: "path/to/csv_output/" # CSV输出目录
使用win32com操作Access COM接口,逐个复制表和查询到目标文件。
import win32com.client as win32
import os
import yaml  # 读取配置

def copy_objects_to_target(source_db_path, target_db_path, tables, queries):
    """将源数据库的表和查询复制到目标数据库"""
    try:
        # 初始化Access应用(后台运行)
        access = win32.Dispatch('Access.Application')
        access.Visible = False
        access.OpenCurrentDatabase(source_db_path, False)
        # 打开源数据库(只读模式避免锁定)
        source_db = access.CurrentDb()

        # 检查目标数据库是否存在,不存在则创建
        try:
            # 尝试以独占方式打开现有数据库
            access.OpenCurrentDatabase(target_path, False)
        except:
            # 如果不存在,则创建新数据库
            access.NewCurrentDatabase(target_path)
        
        # 获取目标数据库对象引用
        target_db = access.CurrentDb()

        # 复制表
        for table in tables:
            if table in source_db.TableDefs:
                source_db.CopyObject(target_db.Name, table, "Table", table)
                print(f"表 {table} 复制成功到 {target_db_path}")

        # 复制查询
        for query in queries:
            if query in source_db.QueryDefs:
                source_db.CopyObject(target_db.Name, query, "Query", query)
                print(f"查询 {query} 复制成功到 {target_db_path}")

        return True
    except Exception as e:
        print(f"复制失败: {str(e)}")
        return False
    finally:
        # 关闭数据库和应用
        if 'source_db' in locals(): source_db.Close()
        if 'target_db' in locals(): target_db.Close()
        access.Quit()
        del source_db, target_db, access
完整示例:企业级对象复制工具
import win32com.client as win32
from win32com.client import constants
import os
import time

class AccessObjectCopier:
    """企业级Access对象复制工具"""
    
    def __init__(self, visible=False):
        self.access = win32.Dispatch("Access.Application")
        self.access.Visible = visible
        self.source_db = None
        
    def connect_to_source(self, db_path):
        """连接到源数据库"""
        try:
            self.access.OpenCurrentDatabase(db_path)
            self.source_db = self.access.CurrentDb()
            return True
        except Exception as e:
            print(f"连接源数据库失败: {e}")
            return False
    
    def ensure_target_exists(self, target_path):
        """确保目标数据库存在"""
        if not os.path.exists(target_path):
            try:
                # 创建新数据库
                self.access.NewCurrentDatabase(target_path)
                self.access.CloseCurrentDatabase()
                # 重新连接源数据库
                self.access.OpenCurrentDatabase(self.source_db.Name)
                return True
            except Exception as e:
                print(f"创建目标数据库失败: {e}")
                return False
        return True
    
    def copy_objects(self, target_path, objects, overwrite=False):
        """
        复制对象到目标数据库
        
        Args:
            target_path: 目标数据库路径
            objects: 要复制的对象字典
            overwrite: 是否覆盖已存在的对象
        """
        if not self.ensure_target_exists(target_path):
            return False
        
        results = {"success": [], "failed": []}
        
        # 复制表
        for table_name in objects.get("tables", []):
            if self.copy_table(table_name, target_path, overwrite):
                results["success"].append(f"表: {table_name}")
            else:
                results["failed"].append(f"表: {table_name}")
        
        # 复制查询
        for query_name in objects.get("queries", []):
            if self.copy_query(query_name, target_path, overwrite):
                results["success"].append(f"查询: {query_name}")
            else:
                results["failed"].append(f"查询: {query_name}")
        
        return results
    
    def copy_table(self, table_name, target_path, overwrite=False):
        """复制表"""
        try:
            # 如果允许覆盖且目标已存在,先删除
            if overwrite and self.object_exists_in_target(target_path, table_name, "table"):
                self.delete_object_in_target(target_path, table_name, constants.acTable)
            
            self.access.DoCmd.CopyObject(target_path, table_name, constants.acTable, table_name)
            return True
        except Exception as e:
            print(f"复制表 '{table_name}' 失败: {e}")
            return False
    
    def copy_query(self, query_name, target_path, overwrite=False):
        """复制查询"""
        try:
            # 如果允许覆盖且目标已存在,先删除
            if overwrite and self.object_exists_in_target(target_path, query_name, "query"):
                self.delete_object_in_target(target_path, query_name, constants.acQuery)
            
            self.access.DoCmd.CopyObject(target_path, query_name, constants.acQuery, query_name)
            return True
        except Exception as e:
            print(f"复制查询 '{query_name}' 失败: {e}")
            return False
    
    def object_exists_in_target(self, target_path, object_name, object_type):
        """检查目标数据库中对象是否存在"""
        # 保存当前状态
        current_db = self.access.CurrentDb().Name
        
        try:
            # 临时切换到目标数据库
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(target_path)
            target_db = self.access.CurrentDb()
            
            # 检查对象是否存在
            exists = False
            if object_type == "table":
                for i in range(target_db.TableDefs.Count):
                    if target_db.TableDefs(i).Name == object_name:
                        exists = True
                        break
            elif object_type == "query":
                for i in range(target_db.QueryDefs.Count):
                    if target_db.QueryDefs(i).Name == object_name:
                        exists = True
                        break
            
            # 切换回源数据库
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(current_db)
            
            return exists
            
        except Exception as e:
            print(f"检查目标对象存在性失败: {e}")
            # 尝试恢复源数据库连接
            try:
                self.access.OpenCurrentDatabase(current_db)
            except:
                pass
            return False
    
    def delete_object_in_target(self, target_path, object_name, object_type):
        """删除目标数据库中的对象"""
        # 保存当前状态
        current_db = self.access.CurrentDb().Name
        
        try:
            # 切换到目标数据库
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(target_path)
            
            # 删除对象
            self.access.DoCmd.DeleteObject(object_type, object_name)
            
            # 切换回源数据库
            self.access.CloseCurrentDatabase()
            self.access.OpenCurrentDatabase(current_db)
            
            return True
            
        except Exception as e:
            print(f"删除目标对象失败: {e}")
            # 尝试恢复源数据库连接
            try:
                self.access.OpenCurrentDatabase(current_db)
            except:
                pass
            return False
    
    def close(self):
        """关闭连接"""
        try:
            self.access.Quit()
        except:
            pass

# 使用示例
copier = AccessObjectCopier(visible=False)
if copier.connect_to_source("C:\\Source.accdb"):
    objects_to_copy = {
        "tables": ["Customers", "Orders"],
        "queries": ["SalesReport", "CustomerList"]
    }
    
    results = copier.copy_objects("C:\\Backup.accdb", objects_to_copy, overwrite=True)
    print("复制结果:", results)
    
    copier.close()

步骤2:多进程运行数据转换函数

需使用multiprocessing.Pool并行处理10个目标文件,每个进程独立连接目标Access文件,读取Query1Query2数据,转换为CSV。

转换逻辑:将长格式数据转换为宽格式,核心是用pandas.pivot_tablemelt+pivot

单目标文件处理函数

import pandas as pd
import pyodbc
from multiprocessing import Pool

def process_target_file(target_db_path):
    """处理单个目标Access文件:读取Query1和Query2,生成CSV"""
    try:
        # 连接目标数据库
        conn_str = f"DRIVER={{Microsoft Access Driver (*.mdb)}};DBQ={target_db_path};"
        conn = pyodbc.connect(conn_str)
        
        # 读取Query1和Query2数据(假设Query1对应图1的原始数据)
        query1_df = pd.read_sql("SELECT * FROM Query1", conn)
        query2_df = pd.read_sql("SELECT * FROM Query2", conn)  # 若Query2有其他用途(如配置),按需处理
        
        # 数据清洗(根据实际调整列名)
        query1_df = query1_df.rename(columns={
            "Simulation": "Simulation",
            "Time": "Time",
            "Var1": "Var1", "Var2": "Var2", ..., "Var8": "Var8"  # 确保列名与图1一致
        })
        
        # 提取时间步(Time列中的数值,如图1的"0 0.80046"需拆分为时间点0)
        # 假设Time列格式为"时间点 数值"(如"0 0.80046"),需提取时间点作为实际时间步
        query1_df[["TimeStep", " dummy"]] = query1_df["Time"].str.split(" ", expand=True)
        query1_df["TimeStep"] = query1_df["TimeStep"].astype(int)  # 转换为整数时间步(0-99)
        
        # 筛选有效时间步(0-99)
        query1_df = query1_df[query1_df["TimeStep"].between(0, 99)]
        
        # 转换为宽格式(图2模板)
        # 行:Simulation + Var;列:TimeStep(0-99);值:Var对应值
        pivot_df = query1_df.pivot_table(
            index=["Simulation", "Var"],  # 行标签
            columns="TimeStep",           # 列标签(时间步)
            values=query1_df.columns[-1], # 值(最后一列是变量值,需根据实际调整)
            aggfunc="first"               # 若有重复,取第一个值
        ).reset_index()
        
        # 重命名列(确保列名为0-99)
        pivot_df.columns.name = None  # 移除列索引的名称
        time_columns = [str(ts) for ts in range(100)]  # 0-99列
        pivot_df = pivot_df.reindex(columns=["Simulation", "Var"] + time_columns)
        
        # 输出CSV(路径可根据需求调整,如保存到目标文件同级目录)
        output_csv = os.path.splitext(target_db_path)[0] + "_output.csv"
        pivot_df.to_csv(output_csv, index=False)
        
        return f"成功处理:{target_db_path}{output_csv}"
    
    except Exception as e:
        return f"处理失败:{target_db_path},错误:{e}"
    finally:
        if 'conn' in locals():
            conn.close()

# 示例调用(多进程处理10个目标文件)
if __name__ == "__main__":
    target_dir = r"C:\TargetDBs"
    target_files = [os.path.join(target_dir, f"Target_{i}.mdb") for i in range(1, 11)]
    
    # 使用多进程(进程数=CPU核心数,建议不超过10)
    with Pool(processes=min(10, os.cpu_count())) as pool:
        results = pool.map(process_target_file, target_files)
    
    # 打印结果
    for res in results:
        print(res)

注意事项

  • 时间步处理:图1的Time列可能包含非整数或冗余信息(如"0 0.80046"),需根据实际规则提取有效时间步(如第一个数值作为时间点)。
  • 变量对齐:确保Query1中的变量(Var1-Var8)与图2的行标签完全匹配,避免列名错误。
  • 缺失值处理:图2中某些时间步(如列4、5)为空,转换时需保留空值(NaN)或填充默认值(如-)。
  • 多进程安全:每个进程独立连接数据库,避免文件锁冲突(Access不支持多进程同时写入,但此处仅为读取)。

步骤2 的另一种解法:

import pandas as pd
import win32com.client as win32

def transform_query_to_csv(target_db_path, output_dir):
    """读取目标数据库的Query1和Query2,转换为CSV"""
    try:
        # 连接目标数据库
        access = win32.gencache.EnsureDispatch('Access.Application')
        access.Visible = False
        db = access.OpenCurrentDatabase(target_db_path, False)

        # 读取Query1和Query2数据到DataFrame
        query1 = db.OpenRecordset("SELECT * FROM Query1")
        df1 = pd.DataFrame.from_records(query1.Fields)
        df1.columns = [f.Name for f in query1.Fields]  # 列名匹配

        query2 = db.OpenRecordset("SELECT * FROM Query2")
        df2 = pd.DataFrame.from_records(query2.Fields)
        df2.columns = [f.Name for f in query2.Fields]

        # 转换逻辑(示例:长格式转宽格式,匹配图1→图2)
        def reshape_df(df):
            # 假设原数据列:Simulation, Time, Var1-Var8(需根据实际调整)
            # 目标列:Simulation, Var0-Var99(缺失的Var用NaN填充)
            id_vars = ['Simulation', 'Time']
            value_vars = [f'Var{i}' for i in range(1, 9)]  # 假设原Var是Var1-Var8
            melted = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='Var', value_name='Value')
            
            # 提取Var编号(如Var1→1),补全0-99
            melted['Var_Num'] = melted['Var'].str.extract('(\d+)').astype(int)
            all_vars = pd.DataFrame({'Var_Num': range(0, 100)})  # Var0-Var99
            
            # 左连接补全缺失的Var
            merged = all_vars.merge(melted, on='Var_Num', how='left')
            merged['Var'] = merged['Var_Num'].astype(str).replace({'0': 'Var0'})  # 兼容Var0命名
            
            # 透视回宽格式(Simulation + Var0-Var99)
            pivoted = merged.pivot_table(
                index=['Simulation', 'Time'],
                columns='Var',
                values='Value',
                fill_value=None
            ).reset_index()
            
            # 重命名列(Simulation, Time, Var0, Var1,...,Var99)
            cols = ['Simulation', 'Time'] + [f'Var{i}' for i in range(0, 100)]
            return pivoted[cols]

        # 转换并保存CSV
        df1_reshaped = reshape_df(df1)
        df2_reshaped = reshape_df(df2)
        
        # 输出路径(示例:target1_Query1.csv)
        csv1_path = os.path.join(output_dir, f"{os.path.basename(target_db_path)}_Query1.csv")
        csv2_path = os.path.join(output_dir, f"{os.path.basename(target_db_path)}_Query2.csv")
        
        df1_reshaped.to_csv(csv1_path, index=False)
        df2_reshaped.to_csv(csv2_path, index=False)
        print(f"转换完成:{csv1_path}, {csv2_path}")

        return True
    except Exception as e:
        print(f"转换失败: {str(e)}")
        return False
    finally:
        if 'db' in locals(): db.Close()
        if 'access' in locals(): access.Quit()
        del access
import multiprocessing
import yaml
import os

def process_single_target(target_file, config):
    """单个目标文件的完整处理流程(复制+转换)"""
    source_db = config['source_db']
    target_dir = config['target_dir']
    output_csv_dir = config['output_csv_dir']
    
    # 目标数据库完整路径
    target_db_path = os.path.join(target_dir, target_file)
    
    # 步骤1:复制表和查询到目标文件
    tables = config['tables_to_copy']
    queries = config['queries_to_copy']
    copy_success = copy_objects_to_target(source_db, target_db_path, tables, queries)
    
    if not copy_success:
        return f"{target_file} 复制失败,跳过转换"
    
    # 步骤2:转换数据并生成CSV
    transform_success = transform_query_to_csv(target_db_path, output_csv_dir)
    return f"{target_file} 处理完成" if transform_success else f"{target_file} 转换失败"

def main():
    # 加载配置
    with open("config.yaml", "r") as f:
        config = yaml.safe_load(f)
    
    # 创建输出目录(若不存在)
    os.makedirs(config['target_dir'], exist_ok=True)
    os.makedirs(config['output_csv_dir'], exist_ok=True)
    
    # 创建进程池(10个进程,对应10个目标文件)
    with multiprocessing.Pool(processes=10) as pool:
        # 生成任务参数(目标文件列表)
        tasks = [(file, config) for file in config['target_files']]
        # 并行执行(starmap传递多参数)
        results = pool.starmap(process_single_target, tasks)
    
    # 输出结果汇总
    print("
===== 处理结果 =====")
    for res in results:
        print(res)

if __name__ == "__main__":
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值