REF
http://hunkshi.github.io/2015/06/09/2015-06-09-python-huge-set-operation/
Scenario
Fetch the ids from MySQL and from MongoDB separately and compare the two. Ids that no longer exist in MySQL should be deleted from MongoDB.
Problem
Holding the ids in a plain set uses too much memory, and running np.setdiff1d(s, t, assume_unique=True) over the full id lists raises an "End of file" error and takes too long.
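For a sense of scale: merely materializing tens of millions of ids as a Python set is already expensive, before any comparison happens. A rough illustration (the 10-million count is an assumption, not a measurement from the real tables):

import sys

# build a set of 10 million integer ids (hypothetical scale)
ids = set(range(10_000_000))

# getsizeof reports only the set's hash table, not the int objects it
# holds, so the real footprint is considerably larger than this number
print(f"set table alone: {sys.getsizeof(ids) / 1024 ** 2:.0f} MiB")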
Solution
Code for step one:
import functools
import os
import time

import numpy as np
import pymongo
import pymysql
# some ref
# http://hunkshi.github.io/2015/06/09/2015-06-09-python-huge-set-operation/
def gen_sql_con(mysql_host, mysql_port, mysql_username, mysql_password, mysql_DBname):
    """Open a MySQL connection."""
    return pymysql.connect(
        host=mysql_host,
        port=mysql_port,
        user=mysql_username,
        password=mysql_password,
        charset='utf8mb4',
        db=mysql_DBname
    )
def select_all_ids(con, db_name, table_name):
    """Fetch every id from the given MySQL table."""
    query_sql = "select id from {}.{};".format(db_name, table_name)
    try:
        with con.cursor() as cursor:
            cursor.execute(query_sql)
            res = cursor.fetchall()
            ids = [j[0] for j in res]
    finally:
        # close the implicit transaction so later reads see fresh data
        con.commit()
    return ids
class NONESENSE:
    # module-level singleton holding the shared MongoDB client
    mongocli = pymongo.MongoClient(os.environ.get("JZDATAURI", "mongodb://127.0.0.1:27017"))

def DB() -> pymongo.MongoClient:
    return NONESENSE.mongocli
def select_all_mongo_ids(db_name, table_name):
    coll = DB()[db_name][table_name]
    ids = None
    # retry up to three times; the long-running cursor may drop midway
    for i in range(3):
        try:
            # coll.find().distinct("id") blows up memory, so avoid distinct;
            # a plain find() is slower, and converting the result to a set
            # can hit a timeout / disconnect partway through
            cursor = coll.find({}, {"id": 1, "_id": 0})
            ids = [res.get("id") for res in cursor]
        except Exception as e:
            print(e)
        if ids:
            break
    return ids
def delete_ids(db_name, table_name, ids):
    coll = DB()[db_name][table_name]
    try:
        coll.delete_many({"id": {"$in": list(ids)}})
    except Exception as e:
        print(e)
def method_check_time_usage(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        dt = time.time() - start
        if dt > 0.1:
            print(f"[TimeUsage] {func.__module__}.{func.__name__} usage: {dt}")
        return result
    return wrapped
@method_check_time_usage
def clean_data(table_name):
    # config
    host = "139.159.176.118"
    port = 3306
    user = "dcr"
    password = ""  # redacted in the original
    sqlDBname = "datacenter"
    mongoDBname = "JQdata"
    # mysql con
    conn = gen_sql_con(host, port, user, password, sqlDBname)
    sql_ids = select_all_ids(conn, sqlDBname, table_name)
    print(len(sql_ids))
    mongo_ids = select_all_mongo_ids(mongoDBname, table_name)
    if not mongo_ids:
        # every retry failed (or the collection is empty); nothing to do
        return
    # the plain set difference needs too much memory:
    # to_deletes = set(mongo_ids) - set(sql_ids)
    to_deletes = np.setdiff1d(mongo_ids, sql_ids, assume_unique=True)
    print("to_delete", to_deletes)
    # to_add = set(sql_ids) - set(mongo_ids)
    # delete from mongo
    delete_ids(mongoDBname, table_name, to_deletes)
if __name__ == "__main__":
    tables = [
        # financial-statement tables
        # "comcn_balancesheet",
        # "comcn_cashflowstatement",
        # "comcn_incomestatement",
        # "comcn_qcashflowstatement",
        # "comcn_qincomestatement",

        # trading-calendar table
        # "const_tradingday",

        # market table
        # "const_secumain",

        # daily delisted-stock table
        # "stk_liststatus",

        # raw index tables
        # "const_secumainall",
        "index_indexcomponentsweight",

        # raw industry table
        # "comcn_exgindustry",

        # raw concept tables
        # "comcn_conceptlist",  # concept-board constants
        # "comcn_coconcept",  # company-to-concept mapping

        # raw share-structure table
        # "comcn_sharestru",  # company share-structure changes
    ]
    for table in tables:
        clean_data(table)
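Before restructuring the algorithm, the timeout/disconnect noted in select_all_mongo_ids can also be softened at the cursor level. A sketch, assuming PyMongo against the same collection (the batch size is an arbitrary choice):

def iter_mongo_ids(coll):
    # no_cursor_timeout stops the server from killing an idle cursor;
    # batch_size bounds how much each network round trip carries
    cursor = coll.find({}, {"id": 1, "_id": 0}, no_cursor_timeout=True).batch_size(10000)
    try:
        for doc in cursor:
            yield doc.get("id")
    finally:
        # cursors opened with no_cursor_timeout must be closed explicitly
        cursor.close()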
Step two: yield the mongo ids in chunks from a generator
import functools
import os
import time

import numpy as np
import pymongo
import pymysql
def gen_sql_con(mysql_host, mysql_port, mysql_username, mysql_password, mysql_DBname):
    """Open a MySQL connection."""
    return pymysql.connect(
        host=mysql_host,
        port=mysql_port,
        user=mysql_username,
        password=mysql_password,
        charset='utf8mb4',
        db=mysql_DBname
    )
def select_all_ids(con, db_name, table_name):
    """Fetch every id from the given MySQL table."""
    query_sql = "select id from {}.{};".format(db_name, table_name)
    try:
        with con.cursor() as cursor:
            cursor.execute(query_sql)
            res = cursor.fetchall()
            ids = [j[0] for j in res]
    finally:
        con.commit()
    return ids
class NONESENSE:
    # module-level singleton holding the shared MongoDB client
    mongocli = pymongo.MongoClient(os.environ.get("JZDATAURI", "mongodb://127.0.0.1:27017"))

def DB() -> pymongo.MongoClient:
    return NONESENSE.mongocli
def select_all_mongo_ids(db_name, table_name):
    """A generator that yields the ids one id-range chunk at a time."""
    coll = DB()[db_name][table_name]
    start = 0
    num = 100000
    while True:
        # an unconditional distinct("id") blows up memory, and one big
        # find() is slow and may time out while being turned into a set,
        # so walk the id space in [start, start + num) windows instead
        ret = coll.find({"id": {"$gte": start, "$lt": start + num}}, {"id": 1, "_id": 0})
        start += num
        ids = [j.get("id") for j in ret]
        yield ids
def delete_ids(db_name, table_name, ids):
    coll = DB()[db_name][table_name]
    try:
        coll.delete_many({"id": {"$in": list(ids)}})
    except Exception as e:
        print(e)
def method_check_time_usage(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        dt = time.time() - start
        if dt > 0.1:
            print(f"[TimeUsage] {func.__module__}.{func.__name__} usage: {dt}")
        return result
    return wrapped
@method_check_time_usage
def clean_data(table_name):
    # config
    host = "139.159.176.118"
    port = 3306
    user = "dcr"
    password = ""  # redacted in the original
    sqlDBname = "datacenter"
    mongoDBname = "JQdata"
    # mysql con
    conn = gen_sql_con(host, port, user, password, sqlDBname)
    # the sql ids can still be loaded in full; for now that is not the bottleneck
    sql_ids = select_all_ids(conn, sqlDBname, table_name)
    print(len(sql_ids))
    counter = 0
    for some_ids in select_all_mongo_ids(mongoDBname, table_name):
        if not some_ids and counter <= 100:
            # after 100 consecutive empty windows (100 * 100000 ids with no
            # documents) treat the generator as exhausted and stop looping.
            # this is needed because all the mongo ids cannot be pulled out
            # in one go, and the id values have gaps.
            # FIXME
            counter += 1
            continue
        if not some_ids:
            break
        counter = 0
        print(len(some_ids))
        to_delete = np.setdiff1d(some_ids, sql_ids, assume_unique=True)
        delete_ids(mongoDBname, table_name, to_delete)
if __name__ == "__main__":
    tables = [
        # financial-statement tables
        # "comcn_balancesheet",
        # "comcn_cashflowstatement",
        # "comcn_incomestatement",
        # "comcn_qcashflowstatement",
        # "comcn_qincomestatement",

        # trading-calendar table
        # "const_tradingday",

        # market table
        # "const_secumain",

        # daily delisted-stock table
        # "stk_liststatus",

        # raw index tables
        # "const_secumainall",
        "index_indexcomponentsweight",

        # raw industry table
        # "comcn_exgindustry",

        # raw concept tables
        # "comcn_conceptlist",  # concept-board constants
        # "comcn_coconcept",  # company-to-concept mapping

        # raw share-structure table
        # "comcn_sharestru",  # company share-structure changes
    ]
    for table in tables:
        clean_data(table)
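Two refinements for the FIXME above: the empty-window counter can be replaced by reading the collection's maximum id up front, and the repeated $gte/$lt range scans only stay cheap if id is indexed. A sketch under those assumptions (iter_id_chunks is a hypothetical replacement for the generator, not code from the original):

def iter_id_chunks(coll, num=100000):
    """Yield ids in [start, start + num) windows, stopping at the max id."""
    # the range queries below degrade to full collection scans unless
    # the id field is indexed
    coll.create_index([("id", pymongo.ASCENDING)])
    # largest id currently stored; None means the collection is empty
    top = coll.find_one({}, {"id": 1, "_id": 0}, sort=[("id", pymongo.DESCENDING)])
    if top is None:
        return
    start = 0
    while start <= top["id"]:
        cursor = coll.find({"id": {"$gte": start, "$lt": start + num}},
                           {"id": 1, "_id": 0})
        start += num
        yield [doc.get("id") for doc in cursor]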
But computing the set difference per chunk is still rather slow …
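The likely culprit is that np.setdiff1d re-sorts sql_ids on every chunk. Sorting the MySQL ids once up front and probing each chunk with a binary search avoids the repeated work; a sketch, assuming unique integer ids on both sides:

def chunk_diff(sql_ids_sorted, chunk_ids):
    """Return the chunk ids that are absent from the pre-sorted sql ids."""
    chunk = np.asarray(chunk_ids)
    if sql_ids_sorted.size == 0:
        return chunk
    # binary-search each chunk id against the sorted sql ids
    idx = np.searchsorted(sql_ids_sorted, chunk)
    idx[idx == len(sql_ids_sorted)] = 0  # clamp out-of-range hits
    present = sql_ids_sorted[idx] == chunk
    return chunk[~present]

# one-time sort replaces the sort hidden inside every np.setdiff1d call:
# sql_arr = np.sort(np.asarray(sql_ids))
# for some_ids in select_all_mongo_ids(mongoDBname, table_name):
#     to_delete = chunk_diff(sql_arr, some_ids)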