Help me optimize the following code:
import xbot
from xbot import web, print, sleep
from . import package
import os
import hashlib
import win32com.client  # used to handle the .doc format (requires pywin32)
from openpyxl import load_workbook
import requests
import csv
from pdfplumber import open as open_pdf
from docx import Document
# -------------------------- New: .doc -> .docx conversion helper --------------------------
def convert_doc_to_docx(doc_path):
    """Convert a .doc file to .docx (only once; reuse the result on later calls)."""
    docx_path = os.path.splitext(doc_path)[0] + ".docx"  # target path: same directory, new extension
    if os.path.exists(docx_path):
        return docx_path  # already converted, return the existing file
    word_app = None
    try:
        # Start a Word process (runs in the background)
        word_app = win32com.client.DispatchEx("Word.Application")
        word_app.Visible = False  # hide the window
        word_app.DisplayAlerts = False  # suppress alert dialogs
        # Open the .doc file (Word COM is most reliable with absolute Windows paths)
        doc = word_app.Documents.Open(os.path.abspath(doc_path))
        # Save as .docx (FileFormat=16 is Word's constant for the .docx format)
        doc.SaveAs2(os.path.abspath(docx_path), FileFormat=16)
        doc.Close(SaveChanges=False)  # close the document without saving changes
        print(f"已将.doc转换为.docx:{os.path.basename(docx_path)}")
        return docx_path
    except Exception as e:
        print(f".doc转换失败({os.path.basename(doc_path)}):{str(e)}")
        return doc_path  # on failure, fall back to the original path
    finally:
        if word_app:
            word_app.Quit()  # always quit Word so no orphaned process is left behind
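# Usage sketch (assumption: an existing .doc file under the download directory configured in
# main() below; the absolute path is illustrative only):
# converted_path = convert_doc_to_docx(r"D:\yiyao\2024-01-01\sample.doc")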
def crawl_list_pages(start_page, end_page, excel_path):
    """Crawl the list pages for titles, detail links and the list-page URL, then save to Excel."""
    all_data = []
    for page in range(start_page, end_page + 1):
        # Page 1 is index.html; page N (N > 1) is index_{N-1}.html
        list_page_url = "https://www.nmpa.gov.cn/yaopin/ypfgwj/index.html" if page == 1 else f"https://www.nmpa.gov.cn/yaopin/ypfgwj/index_{page-1}.html"
        print(f"正在爬取第{page}页数据(列表页URL:{list_page_url})")
        web_object = web.create(list_page_url, 'edge', load_timeout=30)
        a_elements = web_object.find_all_by_xpath("//div[@class='list']/ul[1]/li/a")
        print(f"第{page}页找到{len(a_elements)}条记录")
        current_page_data = []
        for element in a_elements:
            href = element.get_attribute("href")
            detail_url = href.replace("../..", "https://www.nmpa.gov.cn")  # relative -> absolute URL
            title = element.get_text().strip()
            current_page_data.append({"标题": title, "详情页URL": detail_url, "列表页URL": list_page_url})
        all_data.extend(current_page_data)
        web_object.close()
        save_to_excel(current_page_data, excel_path)
    return all_data
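# Usage sketch (the workbook path is the one configured in main() below; each page's rows are
# appended to the workbook as they are scraped and also returned as a list of dicts):
# rows = crawl_list_pages(1, 1, "F:/项目文档/作业/影刀医药/url.xlsx")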
def save_to_excel(data, excel_path):
    """Append the rows (including the list-page URL) to the Excel workbook."""
    if not os.path.exists(excel_path):
        print(f"错误:Excel文件不存在 → {excel_path}")
        return
    wb = load_workbook(excel_path)
    ws = wb["Sheet1"]
    # openpyxl reports max_row >= 1 even for an empty sheet, so check the header cell instead
    has_header = ws.cell(row=1, column=1).value is not None
    start_row = ws.max_row + 1 if has_header else 2
    if not has_header:
        ws.cell(row=1, column=1, value="标题")
        ws.cell(row=1, column=2, value="详情页URL")
        ws.cell(row=1, column=3, value="列表页URL")
    for i, item in enumerate(data):
        ws.cell(row=start_row + i, column=1, value=item["标题"])
        ws.cell(row=start_row + i, column=2, value=item["详情页URL"])
        ws.cell(row=start_row + i, column=3, value=item["列表页URL"])
    wb.save(excel_path)
    wb.close()
    print(f"已追加{len(data)}条数据到Excel")
def extract_page_data(web_object):
    """Extract the core fields from a detail page."""
    suoyin_elems = web_object.find_all_by_xpath("/html/body/div[4]/div[1]/table/tbody/tr[1]/td[2]")
    suoyin = suoyin_elems[0].get_text().strip() if (suoyin_elems and suoyin_elems[0].get_text().strip()) else ""
    fenlei_elems = web_object.find_all_by_xpath("/html/body/div[4]/div[1]/table/tbody/tr[1]/td[4]")
    fenlei = fenlei_elems[0].get_text().strip() if (fenlei_elems and fenlei_elems[0].get_text().strip()) else ""
    biaoti_elems = web_object.find_all_by_xpath("/html/body/div[4]/div[1]/table/tbody/tr[2]/td[2]")
    biaoti_web = biaoti_elems[0].get_text().strip() if (biaoti_elems and biaoti_elems[0].get_text().strip()) else ""
    day_elems = web_object.find_all_by_xpath("/html/body/div[4]/div[1]/table/tbody/tr[3]/td[2]")
    day = day_elems[0].get_text().strip() if (day_elems and day_elems[0].get_text().strip()) else ""
    neirong_elems = web_object.find_all_by_xpath("/html/body/div[4]/div[5]")
    page_content = neirong_elems[0].get_text().strip() if (neirong_elems and neirong_elems[0].get_text().strip()) else ""
    return {
        "suoyin": suoyin, "fenlei": fenlei, "biaoti_web": biaoti_web,
        "day": day, "page_content": page_content
    }
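# Note: the absolute XPaths above are tied to the current nmpa.gov.cn detail-page layout; if the
# site structure changes, these selectors (and the attachment selector below) must be updated.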
def url_to_md5(url):
    """Convert a URL to its MD5 digest (32-character lowercase hex)."""
    md5_hash = hashlib.md5()
    md5_hash.update(url.encode('utf-8'))
    return md5_hash.hexdigest()
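# Example: url_to_md5("https://www.nmpa.gov.cn/yaopin/ypfgwj/index.html") returns a deterministic
# 32-character lowercase hex digest, used below as a stable per-URL key in the CSV.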
# -------------------------- Old .doc extraction function, commented out (no longer used) --------------------------
# def extract_doc_content(file_path):
#     """Extract .doc content by driving the Windows Word application."""
#     word_app = None
#     try:
#         word_app = win32com.client.DispatchEx("Word.Application")
#         word_app.Visible = False
#         word_app.DisplayAlerts = False
#         doc = word_app.Documents.Open(file_path)
#         content = doc.Content.Text.strip()
#         doc.Close(SaveChanges=False)
#         return content
#     except Exception as e:
#         return f"【doc格式提取失败:{str(e)}(需确保本地安装Office/WPS)】"
#     finally:
#         if word_app:
#             word_app.Quit()
# -------------------------- Changed: file-content extraction (now converts .doc first) --------------------------
def extract_file_content(file_path):
    """Extract text from PDF, .docx and .doc files (.doc is converted to .docx first)."""
    file_ext = os.path.splitext(file_path)[1].lstrip(".").lower()
    content = ""
    try:
        # Key change: convert .doc to .docx, then extract it through the .docx branch
        if file_ext == "doc":
            file_path = convert_doc_to_docx(file_path)  # convert to .docx
            file_ext = "docx"  # switch to the .docx handling below
        if file_ext == "pdf":
            # PDF extraction (pdfplumber)
            with open_pdf(file_path) as pdf:
                for page in pdf.pages:
                    content += (page.extract_text() or "") + "\n\n"
        elif file_ext == "docx":
            # .docx extraction (python-docx)
            doc = Document(file_path)
            for para in doc.paragraphs:
                para_text = para.text.strip()
                if para_text:
                    content += para_text + "\n"
        else:
            content = f"【不支持的文件格式:.{file_ext}】"
    except Exception as e:
        content = f"【文件提取异常:{str(e)}】"
    return content.strip()
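# Usage sketch: extract_file_content(r"D:\yiyao\2024-01-01\notice.pdf") returns the extracted plain
# text, or a bracketed error-marker string when the format is unsupported or extraction fails
# (the caller writes that marker into the CSV as-is).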
def process_attachments(web_object, root_save_dir, day):
    """Download the attachments and extract their content (multiple formats)."""
    attach_info = []
    lianjie_elems = web_object.find_all_by_xpath('//*[contains(@class, "text")]//p//a')
    for elem in lianjie_elems:
        href = elem.get_attribute("href")
        if href and "/directory/" in href:
            full_link = href.replace("/directory/", "https://www.nmpa.gov.cn/directory/")
            chinese_name = elem.get_text().strip()
            if "." in chinese_name:
                chinese_name = chinese_name.rsplit(".", 1)[0]
            attach_info.append((full_link, chinese_name))
    if not attach_info:
        return "无附件", ""
    # Prepare the save directory (one sub-folder per publication date)
    day_dir = os.path.join(root_save_dir, day) if day else root_save_dir
    if not os.path.exists(day_dir):
        os.makedirs(day_dir)
        print(f"已创建文件夹:{day_dir}")
    attach_paths = []
    all_attach_content = ""
    for idx, (link, chinese_name) in enumerate(attach_info, 1):
        # Strip characters that are invalid in Windows file names
        valid_name = chinese_name
        for ch in '?*:"<>|/\\':
            valid_name = valid_name.replace(ch, "")
        # Take the extension from the URL and build the full file name
        url_suffix = link.split(".")[-1].lower() if "." in link else ""
        valid_suffixes = ["pdf", "doc", "docx", "xls", "xlsx", "zip", "rar"]
        full_file_name = f"{valid_name}.{url_suffix}" if url_suffix in valid_suffixes else valid_name
        local_save_path = os.path.join(day_dir, full_file_name)
        expected_path = f"{day}/{full_file_name}" if day else full_file_name
        # Download, then extract the content
        try:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            response = requests.get(link, headers=headers, timeout=30, stream=True)
            if response.status_code == 200:
                with open(local_save_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        if chunk:
                            f.write(chunk)
                print(f"附件{idx}下载成功:{local_save_path}")
                # Extract the content (PDF/.docx/.doc; .doc is converted to .docx first)
                attach_content = extract_file_content(local_save_path)
                all_attach_content += f"==== {full_file_name}\n{attach_content}\n\n"
                attach_paths.append(expected_path)
            else:
                print(f"附件{idx}下载失败(状态码{response.status_code})")
                attach_paths.append(f"{expected_path}(附件无法下载)")
                all_attach_content += f"==== {full_file_name}\n【下载失败,无法提取内容】\n\n"
        except Exception as e:
            print(f"附件{idx}处理异常:{str(e)}")
            attach_paths.append(f"{expected_path}(附件处理异常)")
            all_attach_content += f"==== {full_file_name}\n【处理异常:{str(e)}】\n\n"
    return ";".join(attach_paths), all_attach_content.rstrip("\n")
def main(params):
    """Main entry point: run the full pipeline (PDF/.docx/.doc extraction supported)."""
    # Configuration
    excel_path = "F:/项目文档/作业/影刀医药/url.xlsx"
    csv_path = "F:/项目文档/作业/影刀医药/医药信息.csv"
    root_save_dir = "d:/yiyao"
    start_page = 1
    end_page = 1  # 1 for testing; change to 75 for a full run
    batch_size = 10
    # 1. Crawl the list pages
    crawl_list_pages(start_page, end_page, excel_path)
    # 2. Load the data back from Excel
    wb = load_workbook(excel_path)
    ws = wb["Sheet1"]
    url_mapping = {}
    detail_urls = []
    for row in range(2, ws.max_row + 1):
        title = ws.cell(row=row, column=1).value
        detail_url = ws.cell(row=row, column=2).value
        list_page_url = ws.cell(row=row, column=3).value
        if title and detail_url and list_page_url:
            url_mapping[detail_url] = {"title_backup": title, "list_page_url": list_page_url}
            detail_urls.append(detail_url)
    wb.close()
    print(f"共加载{len(detail_urls)}个详情页链接")
    # 3. Initialize the CSV header (including the new columns)
    if not os.path.exists(csv_path):
        with open(csv_path, "w", encoding="utf-8-sig", newline="") as f:
            csv.writer(f).writerow([
                "标题", "分类", "索引", "日期", "内容", "保存附件",
                "列表页URL", "MD5值"
            ])
    # 4. Process the detail pages in batches
    batch_data = []
    for idx, detail_url in enumerate(detail_urls, 1):
        print(f"\n处理第{idx}个详情页:{detail_url}")
        title_backup = url_mapping[detail_url]["title_backup"]
        list_page_url = url_mapping[detail_url]["list_page_url"]
        detail_md5 = url_to_md5(detail_url)
        # Open the detail page
        web_object = web.create(detail_url, 'edge', load_timeout=2000)
        # Extract the page fields
        page_data = extract_page_data(web_object)
        final_title = page_data["biaoti_web"] if page_data["biaoti_web"] else title_backup
        # Handle attachments (multi-format extraction)
        final_attach, attach_content = process_attachments(web_object, root_save_dir, page_data["day"])
        # Assemble the final content
        final_content = page_data["page_content"]
        if attach_content:
            final_content = f"{final_content}\n\n{attach_content}" if final_content else attach_content
        # Build the CSV row
        current_row = [
            final_title, page_data["fenlei"], page_data["suoyin"], page_data["day"],
            final_content, final_attach, list_page_url, detail_md5
        ]
        batch_data.append(current_row)
        # Flush the batch to CSV
        if len(batch_data) >= batch_size or idx == len(detail_urls):
            with open(csv_path, "a", encoding="utf-8-sig", newline="") as f:
                csv.writer(f).writerows(batch_data)
            print(f"已写入{len(batch_data)}条数据到CSV(支持doc格式)")
            batch_data = []
        web_object.close()
    print(f"\n所有处理完成!共处理{len(detail_urls)}个详情页,CSV路径:{csv_path}")