目录
3、MySQL8.0的数据准备(初始数据、新数据、历史拉链表的建表语句)
1、问题背景
背景中,需要从一个数据库获取数据,插入到另一个数据库中,所以使用Python来编写一个调度脚本来操作,而不用纯粹的SQL语句。(此处为去敏,使用本地MySQL+Python调度演示)
2、历史拉链表的原理
关于拉链表的组成原理,此处引用《Greenplum 企业应用实战》by何勇、陈晓峰中的讲解。大意就是,设置两个日期字段,start_time、end_time,即可;
第一天,全量同步过来的数据,start_time设置为当天、end_time设置为9999-12-31;
第二天,将对于逻辑上删除的数据,做闭链处理,即end_time设置为当天;新增的数据,做开新链处理,即新增一条,start_time设置为当天、end_time设置为9999-12-31;修改的数据,比如同一个主键、其他属性被修改了,做以上两种处理。
以后每天,皆重复执行第二天的操作。
最后这张表的效果是,1、查历史上每天的数据都是ok的;2、查每天新增(+修改)的数据也是ok的
3、MySQL8.0的数据准备(初始数据、新数据、历史拉链表的建表语句)
第一天的源表数据,预计全量同步的数据:
/*
Navicat Premium Dump SQL
Target Server Type : MySQL
Target Server Version : 80040 (8.0.40)
File Encoding : 65001
Date: 11/11/2024 10:58:07
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tab_col_name_1021
-- ----------------------------
DROP TABLE IF EXISTS `tab_col_name_1021`;
CREATE TABLE `tab_col_name_1021` (
`id` int NOT NULL AUTO_INCREMENT,
`schema_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '库名',
`tab_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '表英文名',
`tab_comment` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '表中文名',
`col_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '字段英文名',
`col_type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '字段类型',
`col_comment` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL COMMENT '字段中文名',
`code_values` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '码值描述\r\n',
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 10 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of tab_col_name_1021
-- ----------------------------
INSERT INTO `tab_col_name_1021` VALUES (1, 'prd_rgd', 'A', '123', 'a', 'varchar(255)', 'a1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (2, 'prd_rgd', 'A', '123', 'b', 'int', 'b1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (3, 'prd_iww', 'B', '234', 'c', 'decimal(4,2)', 'c1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (4, 'prd_rgd', 'B', '234', 'd', 'varchar(255)', 'd1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (5, 'prd_htt', 'C', '345', 'e', 'date', 'e1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (6, 'prd_rgd', 'D', '456', 'd', 'varchar(255)', 'd1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (7, 'prd_rgd', 'E', '567', 'e', 'text', 'e1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (8, 'prd_iww', 'R', '269', 'r', 'text', 'r1', NULL);
INSERT INTO `tab_col_name_1021` VALUES (9, 'prd_iww', 'V', '636', 'i', 'varchar(255)', 'i1', NULL);
SET FOREIGN_KEY_CHECKS = 1;
第二天的源表数据(此处直接新增了一个表),预计将(与第一天对比下后的)增量同步到拉链表:
/*
Navicat Premium Dump SQL
Source Server Type : MySQL
Source Server Version : 80040 (8.0.40)
Target Server Type : MySQL
Target Server Version : 80040 (8.0.40)
File Encoding : 65001
Date: 11/11/2024 10:58:17
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tab_col_name_1022
-- ----------------------------
DROP TABLE IF EXISTS `tab_col_name_1022`;
CREATE TABLE `tab_col_name_1022` (
`id` int NOT NULL AUTO_INCREMENT,
`schema_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '库名',
`tab_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '表英文名',
`tab_comment` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '表中文名',
`col_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '字段英文名',
`col_type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '字段类型',
`col_comment` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL COMMENT '字段中文名',
`code_values` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL COMMENT '码值描述\r\n',
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 13 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of tab_col_name_1022
-- ----------------------------
INSERT INTO `tab_col_name_1022` VALUES (1, 'prd_ads', 'A', '123', 'a', 'varchar(255)', 'a1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (2, 'prd_ads', 'A', '123', 'b', 'int', 'b1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (3, 'prd_dws', 'B', '234', 'c', 'decimal(4,2)', 'c1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (6, 'prd_ads', 'D', '456', 'd', 'varchar(255)', 'd1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (7, 'prd_ads', 'E', '567', 'e', 'text', 'e1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (8, 'prd_aws', 'R', '269', 'r', 'text', 'r1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (9, 'prd_aws', 'V', '636-999', 'i', 'varchar(9999)', '789', NULL);
INSERT INTO `tab_col_name_1022` VALUES (10, 'prd_aws', 'W', '734', 'u', 'text', 'u1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (11, 'prd_aws', 'Z', '245', 'p', 'varchar(255)', 'p1', NULL);
INSERT INTO `tab_col_name_1022` VALUES (12, 'prd_ads', 'F', '466', 'o', 'varchar(255)', 'ui', NULL);
SET FOREIGN_KEY_CHECKS = 1;
历史拉链表的建表语句
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for new_tab_col_name
-- ----------------------------
DROP TABLE IF EXISTS `new_tab_col_name`;
CREATE TABLE `new_tab_col_name` (
`id` int NOT NULL AUTO_INCREMENT,
`schema_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`tab_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`tab_comment` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`col_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`col_type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`col_comment` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,
`code_values` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`start_time` date NULL DEFAULT NULL,
`end_time` date NULL DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 181 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
4、第一步、全量同步脚本(就是导入、导出)
from datetime import datetime
from pprint import pprint
import mysql.connector
from mysql.connector import Error
import time
try:
# 创建数据库连接
connection = mysql.connector.connect(
host='localhost', # 例如 'localhost'
database='test1', # 你的数据库名
user='root', # 你的用户名
password='123456' # 你的密码
)
if connection.is_connected():
# 创建游标对象
cursor = connection.cursor()
# 查询所有记录
cursor.execute("SELECT * FROM tab_col_name_1021")
results = cursor.fetchall()
# 排除结果集中的null值
for i in range(len(results)):
results[i]=list(results[i])
for j in range(len(results[i])):
if results[i][j]==None:
results[i][j]=''
# 执行插入
if results:
for row in results:
if row:
# 空白插入语句
insert_sql='INSERT INTO new_tab_col_name (schema_name,tab_name, tab_comment,col_name,col_type,col_comment,code_values,start_time,end_time) VALUES (\'{}\', \'{}\', \'{}\', \'{}\', \'{}\', \'{}\', \'{}\', \'{}\', \'{}\')'
# 放入值
# print(type(row[1]))
insert_sql=insert_sql.format(row[1],row[2],row[3],row[4],row[5],row[6],row[7],str(time.strftime("%Y-%m-%d", time.localtime())),'9999-12-31')
print(insert_sql)
# 执行
cursor.execute(insert_sql)
# 打印插入成功的条数
print('插入成功的条数:',cursor.rowcount)
# 提交事务
connection.commit()
print("Transaction completed successfully.")
except Error as e:
# 如果发生错误,回滚事务
if connection.is_connected():
connection.rollback()
print(f"Error: {e}")
print('发生错误,已经回滚')
finally:
# 关闭游标和连接
if connection.is_connected():
cursor.close()
connection.close()
print("MySQL connection is closed")
5、增量更新脚本
from datetime import datetime
from pprint import pprint
import mysql.connector
from mysql.connector import Error
# 异常处理
try:
# 1、创建数据库连接
connection = mysql.connector.connect(
host='localhost', # 例如 'localhost'
database='test1', # 你的数据库名
user='root', # 你的用户名
password='123456' # 你的密码
)
# 2、在连接中执行操作
if connection.is_connected():
# 3、创建游标对象
cursor = connection.cursor()
# 4、先执行更新增量到拉链表的操作
# 01、查询当天所有记录,得到的结果的格式为:一个列表装着很多元组,每个元组代表一行数据,各个列作为其中的最基础元素
# 转换每行数据的格式,由元组→→列表
# 遍历每行数据,改null值为空字符串''
cursor.execute("SELECT * FROM tab_col_name_1022")
now_results = cursor.fetchall()
for i in range(len(now_results)):
now_results[i]=list(now_results[i])
for j in range(len(now_results[i])):
if now_results[i][j]==None:
now_results[i][j]=''
# print(now_results)
# 02、查询拉链表中所有最新记录,得到的结果的格式为:一个列表装着很多元组,每个元组代表一行数据,各个列作为其中的最基础元素
# 转换每行数据的格式,由元组→→列表
# 遍历每行数据,改null值为空字符串''
cursor.execute("SELECT id,schema_name,tab_name, tab_comment,col_name,col_type,col_comment,code_values FROM new_tab_col_name where end_time='9999-12-31' ")
all_results = cursor.fetchall()
for i in range(len(all_results)):
all_results[i]=list(all_results[i])
for j in range(len(all_results[i])):
if all_results[i][j]==None:
all_results[i][j]=''
# 03、找到具有删除、修改差异的数据行
# 声明、初始化,待删除、被修改、相同的三个列表
del_rows,upd_rows,add_rows,same_rows=[],[],[],[]
# 循环当前拉链表中的数据,检查每一条数据是否还存在
for i in range(len(all_results)) :
all_row=all_results[i]
is_exists=False
#is_changes=None
# 循环当天的数据
for j in range(len(now_results)):
now_row=now_results[j]
# 假如以库名、表名、字段名相等,就证明此条数据仍然存在
if all_row[1].strip().lower() == now_row[1].strip().lower() and all_row[2].strip().lower() == now_row[2].strip().lower() and all_row[4].strip().lower() == now_row[4].strip().lower():
is_exists=True
# 如果该条数据在存在的基础上,结果集的第六列,即列的类型被修改了,即放入被修改列表
if all_row[5].strip().lower() != now_row[5].strip().lower():
#is_changes=False
#else:
#is_changes=True
now_row[7]=all_row[7]
upd_rows.append(now_row)
#print(upd_rows)
# 如果该条数据是两者皆有的,即放入重合列表,方便后续进行除外操作
same_rows.append(now_row)
# 如果该条数据不存在,即放入待删除列表
if is_exists==False:
del_rows.append(all_row)
# print(del_rows)
print('-----------------')
print(del_rows+upd_rows)
print('-----------------')
add_rows=[item for item in now_results if item not in same_rows]
print(add_rows)
# 04、将拉链表中对应数据的进行失效操作
# 001、需要删除的数据行
# 002、发生修改的数据行,此处先让他失效,后续再插入一条新的
if (del_rows+upd_rows):
for row in (del_rows+upd_rows):
if row:
# 空白更新语句
update_sql='update new_tab_col_name set end_time=\'{}\' where schema_name=\'{}\' and tab_name=\'{}\' and col_name =\'{}\''
# 放入值 str(time.strftime("%Y-%m-%d", time.localtime()))
update_sql=update_sql.format('2024-10-22',row[1],row[2],row[4])
print(update_sql)
# 执行
cursor.execute(update_sql)
# 打印更新成功的条数
print('打印更新成功(删除)的条数:',cursor.rowcount)
# 05、执行拉链表中对应数据的新增操作
# 001、对于发生修改差异的数据,这里去新增
# 002、对于新增加的数据,进行新增
if (upd_rows+add_rows):
for row in (upd_rows+add_rows):
if row:
# 空白插入语句
insert_sql='INSERT INTO new_tab_col_name (schema_name,tab_name, tab_comment,col_name,col_type,col_comment,code_values,start_time,end_time) VALUES (\'{}\', \'{}\', \'{}\', \'{}\',\'{}\', \'{}\', \'{}\', \'{}\', \'{}\')'
# 放入当前值 测试为'2024-10-22',实际应该为str(time.strftime("%Y-%m-%d", time.localtime()))
insert_sql=insert_sql.format(row[1],row[2],row[3],row[4],row[5],row[6],row[7],'2024-10-22','9999-12-31')
print(insert_sql)
# 执行
cursor.execute(insert_sql)
# 打印新增成功的条数
print('打印新增成功的条数:',cursor.rowcount)
# 06、提交事务
connection.commit()
print("Transaction completed successfully.\n本周更新增量数据到拉链表完成")
except Error as e:
# 如果发生错误,回滚事务
if connection.is_connected():
connection.rollback()
print(f"Error: {e}")
print('发生错误,已经回滚')
finally:
# 关闭游标和连接
if connection.is_connected():
cursor.close()
connection.close()
print("MySQL connection is closed")
6、执行两个脚本后的结果
10-21日的结果全量同步过去后,结果略;在第二个增量脚本执行完后,结果如下,很完美;删除的进行了闭链操作;修改的,执行了闭链、开链操作;新增的,执行了开新链操作