import pandas as pd
import re
from tqdm import tqdm
# import warnings
# # warnings.filterwarnings("ignore")
# match: '3207882_python程序设计实验指导书.pdf'
# 物联网RFID原理与技术 第2版_高建良,贺建飚编著 .pdf
def match_book(original_book_name, original_book_name2, re_str):
    """Try a single filename pattern against a cleaned-up book file name.

    Args:
        original_book_name: The file name as it appears in the input list. It
            is echoed to stdout and stored in the result under the
            ``"original_book_name"`` key.
        original_book_name2: The normalised variant of the name; this is the
            text the pattern is actually searched in.
        re_str: Regular expression with named groups, handed to ``re.search``.

    Returns:
        A ``(matched, fields)`` tuple. On a hit, ``matched`` is ``True`` and
        ``fields`` holds the pattern's named groups plus
        ``"original_book_name"``. On a miss, ``(False, {})``.
    """
    # Trace output: which title is being processed and which pattern is tried.
    print(original_book_name)
    print(re_str)

    found = re.search(re_str, original_book_name2)
    if not found:
        return False, {}

    # Pattern matched: collect the named groups and remember the source name.
    fields = found.groupdict()
    fields["original_book_name"] = original_book_name
    return True, fields
# Check whether a string consists entirely of Chinese characters
def IsChinese(character: str) -> bool:
    """Return True when every character of *character* is a Chinese ideograph.

    A character counts as Chinese when it lies in the range U+4E00..U+9FA5
    (the core CJK Unified Ideographs range). The previous lower bound was
    U+0E00, the start of the Thai block, which wrongly accepted Thai, Hangul
    Jamo, kana, CJK punctuation and other non-Chinese characters.

    Args:
        character: The string to inspect.

    Returns:
        ``True`` if all characters are in the ideograph range (an empty
        string is vacuously ``True``), ``False`` as soon as one is not.
    """
    return all('\u4e00' <= cha <= '\u9fa5' for cha in character)
if __name__ == '__main__':
    # Filename patterns keyed by an id. They are tried in insertion order and
    # the first one that matches wins, so specific patterns come first and the
    # catch-all comes last. Each comment shows a sample file name the pattern
    # was written for.
    #
    # All patterns are raw strings so that regex escapes such as `\d` and `\[`
    # are not treated as (invalid) Python string escapes.
    #
    # NOTE(review): constructs such as `[译|著|编|译制]` are character classes
    # (one character out of the set, including a literal `|`), not
    # alternations, and the `.` before `pdf` is unescaped. They are left
    # exactly as they were so the produced results do not change.
    re_str_dict = {
        # e.g. '3207882_python程序设计实验指导书.pdf'
        "match1": r'^\d+_(?P<book_name>.*?)(?:.html)?.pdf',
        # e.g. '[图灵]《24小时365天不间断服务》[张毅译][人民邮电出版社][978-7-115-38024-1][2015.1][p361].pdf'
        "match2": r".*?《(?P<book_name>.*?)》.*?\[(?P<book_author>.*?)[译|著|编|译制]\].*?\[(?P<book_publish>.*?[社|电子书])\](?:.*?).pdf",
        # e.g. 《机床电气线路安装与维修工作页》.pdf
        "match3": r"《(?P<book_name>.*?)》.pdf",
        # e.g. 半导体器件可靠性技术_(日)安食恒雄主编;日本松下电子工业株式会社编;周南生等译_西安:西安电子科技大学出版社
        "match4": r"(?P<book_name>.*?)_(?P<book_author>.*?[编|译|著|])_(?P<book_publish>.*?[出版社|组|厂]).pdf",
        # e.g. 半导体集成电路 姚金生
        "match5": r"(?P<book_name>.*?) (?P<book_author>.*?).pdf",
        # e.g. 电路、信号与系统 [徐昌彪 主编] 2012年.pdf
        "match6": r'(?P<book_name>.*?) \[(?P<book_author>.*?)[主编|著]\](?:.*?)(?:.html)?.pdf',
        # e.g. 电工电子技术实验 [席建中,陈松柏,何勇 主编;刘建生,罗小华,彭名华,刘西成,张雪平,李兴红副主编] 2014年.pdf
        "match7": r'(?P<book_name>.*?) \[(?P<book_author>.*?)[著|编著|主编] (?:.*?)[副主编]\](?:.*?)(?:.html)?.pdf',
        # e.g. 电工电子技术基础教程 第2版 [陈新龙,胡国庆 著] 2013年.pdf
        "match8": r'(?P<book_name>.*?) (?:.*?版) \[(?P<book_author>.*?)[编|著|编著|主编]\](?:.*?)(?:.html)?.pdf',
        # e.g. [OpenCL.Programming.Guide(第1版)].pdf
        "match9": r"^\[(?P<book_name>.*?)\].pdf$",
        # e.g. [web开发CSS系列].WebDevelopmentSolutions.pdf
        "match10": r"^\[.*?\].(?P<book_name>.*?).pdf$",
        # e.g. 《android底层开发实战》[电子书][p48].pdf
        'match11': r"《(?P<book_name>.*?)》.*?\[(?P<book_author>.*?[书|编著|英文|译制|笔记])\](?:.*?).pdf",
        # e.g. 【课件】图象工程(上册)(清华大学)_章毓晋.pdf
        "match13": r".*?】(?P<book_name>.*?)\(.*?\)_(?P<book_author>.*?).pdf",
        # e.g. c程序设计语言(第二版,中文版,b.w.kernighan、d.m.ritchie 著)
        "match14": r"(?P<book_name>.*?)((?P<book_author>.*?著)).pdf",
        # Disabled patterns, kept for reference:
        # "match15": "(?P<book_name>.*?)_(?P<book_author>.*?)_(?P<book_publish>.*?).pdf",
        # e.g. 高等学校规划教材 电路与电子学 王文辉
        # "match16":"(?:>*?材) (?P<book_name>.*?) (?P<book_author>.*?).pdf",
        # e.g. 电气运行_潘龙德主编
        # "match17": "(?P<book_name>.*?)_(?P<book_author>.*?).pdf",
        # e.g. 网络并购:并购交易的电子商务化 武建永 李斌
        # "match18": "(?P<book_name>.*?) (?P<book_author>.*?)",
        # e.g. 编译原理 by 蒋宗礼,姜守旭编著(z - lib.org)
        "match19": r"(?P<book_name>.*?) by (?P<book_author>.*?)\(.*?",
        # e.g. 高效程序员的45个习惯:敏捷开发修炼之道(中文版).(苏帕拉马尼亚姆).钱安川等.pdf
        # "match20": "(?P<book_name>.*?)\((?P<book_author>.*?)\).pdf",
        # Catch-all: everything before the extension is taken as the title.
        # (The key spelling is historical; it is only used as an id.)
        'mathch12': r"(?P<book_name>.*?)(?:.html)?.pdf",
    }

    book_df = pd.read_excel(r"..\data\第二批提交书籍清单.xlsx")
    book_df = book_df.drop_duplicates()
    # Blank cells are read as NaN (a float), which has no string methods and
    # would crash the clean-up below, so drop them before parsing.
    book_list = book_df["书名"].dropna().unique().tolist()

    # Site watermarks stripped from the (lower-cased) name before matching.
    # Loop-invariant, so built once here.
    not_need_list = ["[大家网]", '[www.topsage.com]', '[www.TopSage.com]']

    result_list = []
    for original_book_name in book_list:
        # Debug aid: uncomment to process a single title only.
        # if original_book_name !="C程序设计语言(第二版,中文版,B.W.Kernighan、D.M.Ritchie 著).pdf":
        #     continue

        # Space-free variant, used only for the all-Chinese check below.
        original_book_name1 = original_book_name.replace(" ", "")
        # Lower-cased, watermark-free variant that the patterns run against.
        original_book_name2 = original_book_name.lower()
        for not_need in not_need_list:
            original_book_name2 = original_book_name2.replace(not_need, "")

        result_dict = {}
        for re_id, re_str in re_str_dict.items():
            # "match5" (title<space>author) is only tried when IsChinese
            # accepts the space-free name.
            if re_id == "match5" and not IsChinese(original_book_name1):
                continue
            match_flag, result_dict = match_book(original_book_name, original_book_name2, re_str)
            if match_flag:
                result_list.append(result_dict)
                break
        else:
            # No pattern matched: still emit a row so the title is not lost.
            result_dict["original_book_name"] = original_book_name
            result_list.append(result_dict)

    result_df = pd.DataFrame(result_list)
    result_df.to_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对.xlsx", index=False)
    print(result_df)
# result_df = pd.read_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对.xlsx")
# result_list = []
# for row_index, row_df in result_df.iterrows():
# # book_name original_book_name book_author book_publish
# result_list.append((row_df["book_name"], row_df["original_book_name"], row_df["book_author"],row_df["book_publish"]))
# book_ict_df = pd.read_excel(r"..\data\ICT通信中文书单(2).xlsx")
# book_ict_df= book_ict_df.drop_duplicates()
# target_result_list =[]
# for book_ict_index,book_ict_info_df in tqdm(book_ict_df.iterrows()):
# ict_book_name = book_ict_info_df["图书名称"]
# ict_book_author = book_ict_info_df["作者署名"]
# ict_book_publish = book_ict_info_df["出版社"]
# ishaving = book_ict_info_df["书籍是否已有"]
# if ishaving==1:
# target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
# else:
# for book_info in result_list:
# book_name = book_info[0]
# original_book_name = book_info[1]
# book_author = book_info[2]
# book_publish = book_info[3]
# if book_name != ict_book_name:
# continue
# if book_publish==book_publish and ":" in book_publish:
# book_publish = book_publish.split(":")[1]
# if book_publish !=ict_book_publish:
# continue
# book_ict_info_df["书籍是否已有"] =1
# book_ict_info_df["备注"] = "新增"
# target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
# break
# else:
# target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
# target_df = pd.concat(target_result_list, ignore_index=True)
# target_df.to_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对2.xlsx", index=False)