def get_connections_key(item):
    """Build the MD5 dedup fingerprint for a connecting (two-segment) fare record.

    Falls back to ``costPrice`` when the record carries no ``native_price`` field.
    """
    native_price = item.get('native_price', item['costPrice'])
    # Identifying fields, joined in a fixed order so equal records hash equally.
    fields = (
        item['policyType'], item['dep'], item['transferAirport'], item['arr'],
        item['depDate'], item['depDate2'], item['flightNo'], item['flightNo2'],
        item['cabin'], item['costPrice'], native_price, item['seatNum'],
    )
    # MD5 hex digest of the concatenated field values.
    return hashlib.md5(''.join(fields).encode()).hexdigest()
def get_key(item):
    """Build the MD5 dedup fingerprint for a single-segment fare record.

    Falls back to ``costPrice`` when the record carries no ``native_price`` field.
    """
    native_price = item.get('native_price', item['costPrice'])
    # Concatenate the identifying fields in a fixed order, then hash.
    parts = (
        item['policyType'], item['dep'], item['arr'], item['depDate'],
        item['flightNo'], item['cabin'], item['costPrice'], native_price,
        item['seatNum'],
    )
    return hashlib.md5(''.join(parts).encode()).hexdigest()
def save(old_data, new_data, database_address, database_name):
    """Incrementally sync ``new_data`` into a Mongo collection against ``old_data``.

    Records are matched by the MD5 fingerprint from :func:`get_key`:
      * stored records absent from the new batch are flagged invalid (status=1);
      * new records absent from the store are inserted;
      * records present in both are left untouched.

    :param old_data: records currently stored (each carries an ``_id``)
    :param new_data: freshly scraped records
    :param database_address: Mongo client wrapper exposing insert_many/update_many
    :param database_name: target collection name
    :return: True when every database step succeeded, otherwise False
    """
    try:
        # Build fingerprint -> record maps for O(1) matching.
        # (The previously commented-out dedup loop, the unused unique_dict and
        # the need_update_time bookkeeping that fed a disabled timestamp-update
        # step were removed as dead code.)
        old_data_md5_map = {get_key(item): item for item in old_data}
        new_data_md5_map = {get_key(item): item for item in new_data}
        # Stored records missing from the new batch -> flag invalid.
        need_invalidate_ids = [
            item['_id']
            for md5, item in old_data_md5_map.items()
            if md5 not in new_data_md5_map
        ]
        # New-batch records not stored yet -> insert.
        to_insert_data = [
            new_item
            for md5, new_item in new_data_md5_map.items()
            if md5 not in old_data_md5_map
        ]
        success_flag = True
        # Insert the genuinely new records.
        if to_insert_data:
            try:
                result = database_address.insert_many(
                    collect_name=database_name,
                    documents=to_insert_data
                )
                logger.info(f"插入{len(result.inserted_ids)}条新数据")
            except Exception as e:
                logger.error(f"【{database_name}】插入数据失败: {str(e)}")
                success_flag = False
        # Flag stale records invalid — skipped if the insert step failed, so a
        # partial failure never wipes the still-valid stored data.
        if need_invalidate_ids and success_flag:
            try:
                filter_criteria = {'_id': {'$in': need_invalidate_ids}}
                update_result = database_address.update_many(
                    collect_name=database_name,
                    filter=filter_criteria,
                    update={
                        "$set": {
                            "status": 1,
                            "updateTime": datetime.now()
                        }
                    }
                )
                logger.info(f"标记{update_result.matched_count}条数据为无效")
            except Exception as e:
                logger.error(f"【{database_name}】标记无效数据失败: {str(e)}")
                success_flag = False
        return success_flag
    except Exception as e:
        logger.error(f"【{database_name}】增量保存异常: {str(e)}", exc_info=True)
        return False
def saveConnections(old_data, new_data, database_address, database_name):
    """Incrementally sync connecting-flight fare records into a collection.

    Records are matched via :func:`get_connections_key` fingerprints:
      * stored records missing from ``new_data`` are flagged invalid (status=1);
      * records seen again get their ``update_time`` refreshed;
      * genuinely new records are inserted.

    :return: True only when every database step succeeded, otherwise False.
    """
    try:
        # Fingerprint -> record lookup tables for constant-time matching.
        stored_by_key = {get_connections_key(rec): rec for rec in old_data}
        incoming_by_key = {get_connections_key(rec): rec for rec in new_data}
        # Stored records absent from the incoming batch: flag invalid below.
        need_invalidate_ids = [rec['_id'] for key, rec in stored_by_key.items()
                               if key not in incoming_by_key]
        # Incoming records split into brand-new (insert) vs re-seen (refresh).
        to_insert_data = [rec for key, rec in incoming_by_key.items()
                          if key not in stored_by_key]
        need_update_time_ids = [stored_by_key[key]['_id'] for key in incoming_by_key
                                if key in stored_by_key]
        success_flag = True
        # Step 1: insert the brand-new records.
        if to_insert_data:
            try:
                result = database_address.insert_many(
                    collect_name=database_name,
                    documents=to_insert_data
                )
                logger.info(f"插入{len(result.inserted_ids)}条新数据")
            except Exception as e:
                logger.error(f"【{database_name}】插入数据失败: {str(e)}")
                success_flag = False
        # Step 2: refresh update_time on re-seen records (skipped after a failure).
        if need_update_time_ids and success_flag:
            try:
                update_result = database_address.update_many(
                    collect_name=database_name,
                    filter={'_id': {'$in': need_update_time_ids}},
                    update={"$set": {"update_time": datetime.now()}}
                )
                logger.info(f"更新{update_result.matched_count}条数据时间戳")
            except Exception as e:
                logger.error(f"更新时间戳失败: {str(e)}")
                success_flag = False
        # Step 3: flag the stale records invalid (skipped after a failure).
        if need_invalidate_ids and success_flag:
            try:
                update_result = database_address.update_many(
                    collect_name=database_name,
                    filter={'_id': {'$in': need_invalidate_ids}},
                    update={"$set": {"status": 1, "update_time": datetime.now()}}
                )
                logger.info(f"标记{update_result.matched_count}条数据为无效")
            except Exception as e:
                logger.error(f"【{database_name}】标记无效数据失败: {str(e)}")
                success_flag = False
        return success_flag
    except Exception as e:
        logger.error(f"【{database_name}】增量保存异常: {str(e)}", exc_info=True)
        return False
def get_key_intel(item):
    """Build the MD5 dedup fingerprint for an international fare record
    (keys on price/totalTax rather than costPrice)."""
    parts = (
        item['policyType'], item['dep'], item['arr'], item['depDate'],
        item['flightNo'], item['cabin'], item['price'], item['totalTax'],
        item['seatNum'],
    )
    # Hash the concatenated field values with MD5.
    return hashlib.md5(''.join(parts).encode()).hexdigest()
def saveIntel(old_data, new_data, database_address, database_name):
    """Incrementally sync international fare records keyed by :func:`get_key_intel`.

    Duplicate stored records (same fingerprint) are flagged invalid, stored
    records missing from ``new_data`` are flagged invalid, and records new to
    the collection are inserted.

    :return: True when every database step succeeded, otherwise False.
    """
    try:
        # Fingerprint -> record maps for O(1) matching.
        old_data_md5_map = {get_key_intel(item): item for item in old_data}
        new_data_md5_map = {get_key_intel(item): item for item in new_data}
        need_invalidate_ids = []  # _ids to flag as invalid
        to_insert_data = []       # records to insert
        # Flag duplicate stored records: keep the first record seen for each
        # fingerprint, mark every later duplicate invalid.
        unique_dict = {}
        for item in old_data:
            key = get_key_intel(item)
            if key in unique_dict:
                need_invalidate_ids.append(item['_id'])
            else:
                unique_dict[key] = item
        # Stored records missing from the new batch -> invalidate.
        for md5, item in old_data_md5_map.items():
            if md5 not in new_data_md5_map:
                need_invalidate_ids.append(item['_id'])
        # New-batch records not stored yet -> insert.
        # (The need_update_time bookkeeping that fed a commented-out
        # timestamp-update step was removed as dead code.)
        for md5, new_item in new_data_md5_map.items():
            if md5 not in old_data_md5_map:
                to_insert_data.append(new_item)
        success_flag = True
        # Insert the genuinely new records.
        if to_insert_data:
            try:
                result = database_address.insert_many(
                    collect_name=database_name,
                    documents=to_insert_data
                )
                logger.info(f"插入{len(result.inserted_ids)}条新数据,{to_insert_data}")
            except Exception as e:
                logger.error(f"【{database_name}】插入数据失败: {str(e)}")
                success_flag = False
        # FIX: guard on success_flag like the sibling save functions — the old
        # code invalidated stored data even when the insert step had failed,
        # which could leave the collection with no valid records at all.
        if need_invalidate_ids and success_flag:
            try:
                filter_criteria = {'_id': {'$in': need_invalidate_ids}}
                update_result = database_address.update_many(
                    collect_name=database_name,
                    filter=filter_criteria,
                    update={
                        "$set": {
                            "status": 1,
                            "updateTime": datetime.now()
                        }
                    }
                )
                logger.info(f"标记{update_result.matched_count}条数据为无效")
            except Exception as e:
                logger.error(f"【{database_name}】标记无效数据失败: {str(e)}")
                success_flag = False
        return success_flag
    except Exception as e:
        logger.error(f"【{database_name}】增量保存异常: {str(e)}", exc_info=True)
        return False
def get_key_intel_new(item):
    """Build the MD5 dedup fingerprint for an international fare record,
    including its per-segment policy list so segment changes yield a new key."""
    # Stringify every policy segment and append them after the flat fields.
    segment_part = ''.join(map(str, item['policySegmentList']))
    parts = (
        item['policyType'], item['dep'], item['arr'], item['depDate'],
        item['flightNo'], item['cabin'], item['price'], item['totalTax'],
        item['seatNum'], segment_part,
    )
    return hashlib.md5(''.join(parts).encode()).hexdigest()
def get_key_new(item):
    """MD5 fingerprint identifying a flight/cabin (no price fields), so the same
    flight with a changed price still maps to the same key."""
    parts = (
        item['policyType'], item['dep'], item['arr'],
        item['depDate'], item['flightNo'], item['cabin'],
    )
    return hashlib.md5(''.join(parts).encode()).hexdigest()
def get_price_key(item):
    """MD5 fingerprint of a record's price fields only (price, tax, total)."""
    price_fields = (item['price'], item['totalTax'], item['totalPrice'])
    return hashlib.md5(''.join(price_fields).encode()).hexdigest()
def filter_upd_data(ins, item):
    """Return ``item`` when it matches ``ins`` on both the flight key and the
    price key — same flight/cabin with an unchanged price is treated as the
    same record needing only an update. Returns None otherwise (implicitly)."""
    same_flight = get_key_new(ins) == get_key_new(item)
    if same_flight and get_price_key(ins) == get_price_key(item):
        return item
def filter_invalid_data(ins, item):
    """Return ``item`` when it matches ``ins`` on the flight key but differs on
    the price key — same flight/cabin whose price changed is treated as data to
    invalidate. Returns None otherwise (implicitly)."""
    same_flight = get_key_new(ins) == get_key_new(item)
    if same_flight and get_price_key(ins) != get_price_key(item):
        return item
def update_mongo_data_new(mgClient, need_ins_list, HBGJ_PRICE_DATA, dep, arr, dep_date, task_tag):
    """Thread-safe incremental refresh of one route/date slice of price data.

    Re-reads the currently valid (status=0) stored records for the given
    policy/route/date, inserts records from ``need_ins_list`` that are not yet
    stored, and flags stored records absent from ``need_ins_list`` as invalid.
    Matching uses the :func:`get_key` MD5 fingerprint.

    :return: True when every database step succeeded; re-raises on unexpected
        errors (after logging), with the lock always released.
    """
    # Serialize concurrent writers; the context manager releases the lock on
    # every path (the old explicit acquire/try/finally/release, the unused
    # unique_dict and the dead need_update_time bookkeeping were removed).
    with mongo_update_lock:
        try:
            temp_res_data = list(mgClient.find(
                collect_name=HBGJ_PRICE_DATA,
                filter={'policyType': task_tag, 'dep_tag': dep,
                        'arr_tag': arr, 'depDate': dep_date,
                        'status': 0}))
            # Fingerprint -> record maps for O(1) matching.
            old_data_md5_map = {get_key(item): item for item in temp_res_data}
            new_data_md5_map = {get_key(item): item for item in need_ins_list}
            # Stored records missing from the new batch -> flag invalid.
            need_invalidate_ids = [
                item['_id']
                for md5, item in old_data_md5_map.items()
                if md5 not in new_data_md5_map
            ]
            # New-batch records not stored yet -> insert.
            to_insert_data = [
                new_item
                for md5, new_item in new_data_md5_map.items()
                if md5 not in old_data_md5_map
            ]
            success_flag = True
            # Insert the genuinely new records.
            if to_insert_data:
                try:
                    result = mgClient.insert_many(
                        collect_name=HBGJ_PRICE_DATA,
                        documents=to_insert_data
                    )
                    logger.info(f"插入{len(result.inserted_ids)}条新数据,{to_insert_data}")
                except Exception as e:
                    logger.error(f"【{HBGJ_PRICE_DATA}】插入数据失败: {str(e)}")
                    success_flag = False
            # Flag stale records invalid — skipped if the insert step failed.
            if need_invalidate_ids and success_flag:
                try:
                    filter_criteria = {'_id': {'$in': need_invalidate_ids}}
                    update_result = mgClient.update_many(
                        collect_name=HBGJ_PRICE_DATA,
                        filter=filter_criteria,
                        update={
                            "$set": {
                                "status": 1,
                                "updateTime": datetime.now()
                            }
                        }
                    )
                    logger.info(f"标记{update_result.matched_count}条数据为无效")
                except Exception as e:
                    logger.error(f"【{HBGJ_PRICE_DATA}】标记无效数据失败: {str(e)}")
                    success_flag = False
            return success_flag
        except Exception as e:
            logger.info(f"更新MongoDB数据时发生异常: {str(e)}")
            raise  # propagate so the caller sees the failure
def update_mongo_data(mgClient, need_ins_list, temp_res_data, HBGJ_PRICE_DATA):
    """Update MongoDB price data incrementally.

    Compares stored records (``temp_res_data``) with freshly scraped records
    (``need_ins_list``) by :func:`get_key` fingerprint: re-seen records are
    dropped from the insert list, stale stored records (and duplicate stored
    rows beyond the first per fingerprint) are flagged invalid (status=1), and
    the remaining new records are inserted.

    :return: 1 if any record was inserted, else 0.
    """
    # Precompute fingerprints once; the old code rescanned need_ins_list and
    # re-hashed every record per stored item (accidental O(n*m) MD5 work).
    keyed_new = [(get_key(ins), ins) for ins in need_ins_list]
    new_keys = {key for key, _ in keyed_new}
    matched_keys = set()  # stored fingerprints matched by the new batch (first hit only)
    need_upd_data = []    # stored records to flag invalid
    for item in temp_res_data:
        item_md5 = get_key(item)
        if item_md5 in new_keys and item_md5 not in matched_keys:
            # Record already stored and scraped again: keep it, skip re-insert.
            matched_keys.add(item_md5)
        else:
            # Stale record, or a duplicate stored row for an already-matched
            # key (preserves the original first-match-wins behaviour).
            need_upd_data.append(item)
    # Drop re-seen records from the insert list.
    need_ins_list = [ins for key, ins in keyed_new if key not in matched_keys]
    # Insert the genuinely new records.
    is_data_flag = 0
    if need_ins_list:
        is_data_flag = 1
        mgClient.insert_many(collect_name=HBGJ_PRICE_DATA, documents=need_ins_list)
        logger.info(f"【插入有效数据】: 长度{len(need_ins_list)}")
    # Flag the stale records invalid.
    upd_id_list = [data['_id'] for data in need_upd_data]
    if upd_id_list:
        filter_criteria = {'_id': {'$in': upd_id_list}}
        res = mgClient.update_many(
            collect_name=HBGJ_PRICE_DATA,
            filter=filter_criteria,
            update={"$set": {"status": 1, "update_time": datetime.now()}}
        )
        logger.info(f"【更新无效数据】: 长度{res.matched_count}")
    return is_data_flag
def saveIntelNew(old_data, new_data, database_address, database_name):
    """Incrementally sync international fare records keyed by
    :func:`get_key_intel_new` (the fingerprint includes the per-segment
    policy list).

    Duplicate stored records are flagged invalid, stored records missing from
    ``new_data`` are flagged invalid, and records new to the collection are
    inserted.

    :return: True when every database step succeeded, otherwise False.
    """
    try:
        # Fingerprint -> record maps for O(1) matching.
        old_data_md5_map = {get_key_intel_new(item): item for item in old_data}
        new_data_md5_map = {get_key_intel_new(item): item for item in new_data}
        need_invalidate_ids = []  # _ids to flag as invalid
        to_insert_data = []       # records to insert
        # Flag duplicate stored records: keep the first record per fingerprint.
        # FIX: dedupe with get_key_intel_new as well — the old code used
        # get_key_intel here, whose key ignores policySegmentList, so records
        # that differed only in their segment list were wrongly flagged as
        # duplicates and invalidated.
        unique_dict = {}
        for item in old_data:
            key = get_key_intel_new(item)
            if key in unique_dict:
                need_invalidate_ids.append(item['_id'])
            else:
                unique_dict[key] = item
        # Stored records missing from the new batch -> invalidate.
        for md5, item in old_data_md5_map.items():
            if md5 not in new_data_md5_map:
                need_invalidate_ids.append(item['_id'])
        # New-batch records not stored yet -> insert.
        # (The need_update_time bookkeeping that fed a commented-out
        # timestamp-update step was removed as dead code.)
        for md5, new_item in new_data_md5_map.items():
            if md5 not in old_data_md5_map:
                to_insert_data.append(new_item)
        success_flag = True
        # Insert the genuinely new records.
        if to_insert_data:
            try:
                result = database_address.insert_many(
                    collect_name=database_name,
                    documents=to_insert_data
                )
                logger.info(f"插入{len(result.inserted_ids)}条新数据")
            except Exception as e:
                logger.error(f"【{database_name}】插入数据失败: {str(e)}")
                success_flag = False
        # Flag stale/duplicate records invalid (skipped if the insert failed).
        if need_invalidate_ids and success_flag:
            try:
                filter_criteria = {'_id': {'$in': need_invalidate_ids}}
                update_result = database_address.update_many(
                    collect_name=database_name,
                    filter=filter_criteria,
                    update={
                        "$set": {
                            "status": 1,
                            "updateTime": datetime.now()
                        }
                    }
                )
                logger.info(f"标记{update_result.matched_count}条数据为无效")
            except Exception as e:
                logger.error(f"【{database_name}】标记无效数据失败: {str(e)}")
                success_flag = False
        return success_flag
    except Exception as e:
        logger.error(f"【{database_name}】增量保存异常: {str(e)}", exc_info=True)
        return False
def get_key_statistics(item):
    """MD5 dedup fingerprint for statistics records (like get_key but without
    the native_price fallback field)."""
    parts = (
        item['policyType'], item['dep'], item['arr'], item['depDate'],
        item['flightNo'], item['cabin'], item['costPrice'], item['seatNum'],
    )
    return hashlib.md5(''.join(parts).encode()).hexdigest()
def saveStatistics(old_data, new_data, database_address, database_name):
    """Sync statistics records: insert unseen records and refresh the
    update_time / run_count of stored records seen again (3-hour window).

    Matching uses :func:`get_key`.
    NOTE(review): get_key_statistics is defined just above but unused here —
    confirm whether this function was meant to key on it instead.

    :return: None (errors are logged, not raised).
    """
    try:
        # Stored records absent from the new batch.
        # NOTE(review): collected but never written back — confirm whether an
        # invalidation step was intended here.
        need_upd_data = []
        # Stored records re-seen in the new batch (their timestamps get refreshed).
        need_upd_time_data = []
        for item in old_data:
            item_md5 = get_key(item)
            temp_data = [ins for ins in new_data if get_key(ins) == item_md5]
            if len(temp_data) == 0:
                # Stored but missing from the new batch.
                need_upd_data.append(item)
            else:
                # Drop the re-seen records from the insert list.
                new_data = [ins for ins in new_data if get_key(ins) != item_md5]
                # FIX: remember the STORED record (which carries an _id). The
                # old code re-filtered the already-filtered new_data — always
                # yielding [] and overwriting earlier matches — so the
                # timestamp-refresh branch below could never run.
                need_upd_time_data.append(item)
        # Insert the genuinely new records.
        if len(new_data) > 0:
            database_address.insert_many(collect_name=database_name, documents=new_data)
            logger.info(f"【插入数据】: 长度{len(new_data)}")
        # Refresh update_time / run_count on re-seen records.
        if len(need_upd_time_data) > 0:
            upd_id_list = [data['_id'] for data in need_upd_time_data]
            # NOTE(review): utcnow()+3h compared against update_time with $lt —
            # confirm the stored timestamps' timezone; datetime.now() is used
            # for writes below.
            time_limit = datetime.utcnow() + timedelta(hours=3)
            filter_criteria = {
                '_id': {'$in': upd_id_list},
                'update_time': {'$lt': time_limit}
            }
            res = database_address.update_many(collect_name=database_name, filter=filter_criteria, update={
                "$set": {"update_time": datetime.now()},
                "$inc": {"run_count": 1}
            })
            logger.info(f"【更新插入时间数据】: 匹配{res.matched_count}条,更新{res.modified_count}条")
    except Exception as e:
        logger.error(f"【保存异常】: {e}")
    return None
def saveStatistics1(old_data, new_data, database_address, database_name):
    """Sync statistics records: insert unseen records and refresh the
    update_time / run_count of stored records seen again (24-hour window
    variant of saveStatistics).

    Matching uses :func:`get_key`.

    :return: None (errors are logged, not raised).
    """
    try:
        # Stored records absent from the new batch.
        # NOTE(review): collected but never written back — confirm whether an
        # invalidation step was intended here.
        need_upd_data = []
        # Stored records re-seen in the new batch (their timestamps get refreshed).
        need_upd_time_data = []
        for item in old_data:
            item_md5 = get_key(item)
            temp_data = [ins for ins in new_data if get_key(ins) == item_md5]
            if len(temp_data) == 0:
                # Stored but missing from the new batch.
                need_upd_data.append(item)
            else:
                # Drop the re-seen records from the insert list.
                new_data = [ins for ins in new_data if get_key(ins) != item_md5]
                # FIX: remember the STORED record (which carries an _id). The
                # old code re-filtered the already-filtered new_data — always
                # yielding [] and overwriting earlier matches — so the
                # timestamp-refresh branch below could never run.
                need_upd_time_data.append(item)
        # Insert the genuinely new records.
        if len(new_data) > 0:
            database_address.insert_many(collect_name=database_name, documents=new_data)
            logger.info(f"【插入数据】: 长度{len(new_data)}")
        # Refresh update_time / run_count on re-seen records.
        if len(need_upd_time_data) > 0:
            upd_id_list = [data['_id'] for data in need_upd_time_data]
            # NOTE(review): utcnow()+24h compared against update_time with $lt —
            # confirm the stored timestamps' timezone; datetime.now() is used
            # for writes below.
            time_limit = datetime.utcnow() + timedelta(hours=24)
            filter_criteria = {
                '_id': {'$in': upd_id_list},
                'update_time': {'$lt': time_limit}
            }
            res = database_address.update_many(collect_name=database_name, filter=filter_criteria, update={
                "$set": {"update_time": datetime.now()},
                "$inc": {"run_count": 1}
            })
            logger.info(f"【更新插入时间数据】: 匹配{res.matched_count}条,更新{res.modified_count}条")
    except Exception as e:
        logger.error(f"【保存异常】: {e}")
    return None
# NOTE(review): the two lines below are scraped web-page artifacts
# ("add Chinese comments to each line" / "latest release"), not code;
# commented out so the module parses.
# 每行代码添加中文注释
# 最新发布