import xbot
from xbot import web
from xbot import excel
from xbot import print, sleep
from . import package
from .package import variables as glv
import pymysql
import os, shutil
import pdfplumber
import docx
import time, hashlib
from pathlib import Path
import csv
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import queue
import requests
from urllib.parse import urljoin, urlparse
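# 脚本整体流程(依据下方各函数实现整理):
#   1. get_web()          并发抓取药智网政策列表页,得到标题与链接
#   2. url_insert_csv()   将标题/链接去重后写入 yaozhi_url.csv
#   3. get_content()      逐篇打开文章页,提取基本信息、正文与附件,每5篇批量入库并写CSV
#   4. download_new()     下载附件并提取PDF/Word文本
#   5. upload_to_mysql()/set_csv()  持久化到MySQL与本地CSV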
# 全局变量用于存储爬取结果
title_link_queue = queue.Queue()
content_queue = queue.Queue()
###############################循环抓取文章的标题和url##############################
def get_web():
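    """并发抓取药智网政策列表第1、2页,返回 (标题列表, 链接列表) 两个等长列表"""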
title_list = []
link_list = []
# 使用多线程同时抓取多页
def fetch_page(i):
url = f"https://db.yaozh.com/policies?p={i}"
try:
web_object = web.create(url, 'edge', load_timeout=200)
# 更新XPath以匹配实际页面结构
url_list = web_object.find_all_by_xpath('//div[@class="responsive-table"]/table/tbody/tr/th/a')
page_titles = []
page_links = []
for item in url_list:
title = item.get_text()
link = item.get_attribute('href')
# 处理相对链接
                if link and link.startswith('/'):
link = urljoin("https://db.yaozh.com", link)
# 过滤条件:标题不为空且链接有效
if title and link:
page_titles.append(title.strip())
page_links.append(link)
web_object.close()
return page_titles, page_links
except Exception as e:
print(f"获取页面 {i} 时出错: {str(e)}")
return [], []
# 使用线程池,最多同时运行2个线程(即同时打开2个网页)
with ThreadPoolExecutor(max_workers=2) as executor:
futures = [executor.submit(fetch_page, i) for i in range(1, 3)] # 提交第1、2页任务
for future in as_completed(futures):
page_titles, page_links = future.result()
title_list.extend(page_titles)
link_list.extend(page_links)
return title_list, link_list
###########################将文章的标题和url存储进入到CSV里#########################
def url_insert_csv(title_list, link_list):
csv_file_path = r'D:\作业\yaozhi_url.csv'
# 检查父目录是否存在
directory = os.path.dirname(csv_file_path)
if not os.path.exists(directory):
os.makedirs(directory) # 自动创建目录
existing_links = set()
# 尝试读取已有数据(若文件存在且可读)
try:
if os.path.exists(csv_file_path):
with open(csv_file_path, mode='r', encoding='utf-8-sig') as file:
reader = csv.DictReader(file)
for row in reader:
if '链接' in row:
existing_links.add(row['链接'])
except PermissionError:
print(f"警告:无法读取历史数据,文件可能被占用:{csv_file_path}")
return # 或抛出异常提示用户
# 过滤新数据
new_data = [[t, l] for t, l in zip(title_list, link_list) if l not in existing_links]
# 写入文件
try:
file_mode = 'a' if existing_links else 'w'
with open(csv_file_path, mode=file_mode, newline='', encoding='utf-8-sig') as file:
writer = csv.writer(file)
if file_mode == 'w':
writer.writerow(['标题', '链接'])
writer.writerows(new_data)
print(f"新增 {len(new_data)} 条记录。")
except PermissionError:
print(f"[错误] 无法写入文件,请检查:\n1. 是否用Excel打开了'{csv_file_path}'\n2. 是否有写入权限")
####################################获取单篇文章内容#################################
def get_single_article_content(item, article_title):
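    """抓取单篇文章页面。

    参数:
        item           文章URL
        article_title  列表页抓到的文章标题
    返回列表(与CSV/数据库字段对应):
        [学生, 文章标题, 发布部门, 字号, 效力级别, 发布日期, 有效时间, 正文+附件内容, 附件路径, 原文链接, 平台ID]
    失败时返回 None。
    """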
try:
web_object_content = web.create(item, 'edge', load_timeout=200)
# 尝试多种XPath来获取表格数据
# 尝试多个可能的XPath
table_elements = web_object_content.find_all_by_xpath('//*[contains(@class, "body")]//div[contains(@class, "manual")]//div[@class="content hbg" or @class="content"]')
dept = "未知"
text_sign = "未知"
level = "未知"
public_date = "未知"
keep_time = "未知"
title_content = "未知"
# 提取基本信息
if table_elements:
try:
if len(table_elements) > 5:
dept = table_elements[0].get_text()[6:].strip() if len(table_elements[0].get_text()) > 6 else "未知"
text_sign = table_elements[1].get_text()[6:].strip() if len(table_elements[1].get_text()) > 6 else "未知"
level = table_elements[2].get_text()[6:].strip() if len(table_elements[2].get_text()) > 6 else "未知"
public_date = table_elements[3].get_text()[-10:].strip() if len(table_elements[3].get_text()) >= 10 else "未知"
keep_time = table_elements[4].get_text()[6:].strip() if len(table_elements[4].get_text()) > 6 else "未知"
title_content = table_elements[5].get_text().strip()
else:
dept = table_elements[0].get_text()[6:].strip() if len(table_elements[0].get_text()) > 6 else "未知"
level = table_elements[1].get_text()[6:].strip() if len(table_elements[1].get_text()) > 6 else "未知"
public_date = table_elements[2].get_text()[-10:].strip() if len(table_elements[2].get_text()) >= 10 else "未知"
keep_time = table_elements[3].get_text()[6:].strip() if len(table_elements[3].get_text()) > 6 else "未知"
title_content = table_elements[4].get_text().strip()
except Exception as e:
print(f"提取基本信息时出错: {str(e)}")
# 获取正文内容
text_content = "未知"
try:
text_elements = web_object_content.find_all_by_xpath('//*[contains(@class, "content")]//div[contains(@class, "new_detail_content")]//p')
if text_elements:
text_content = ''.join(element.get_text().strip() for element in text_elements if element.get_text().strip())
else:
# 尝试其他可能的正文内容XPath
text_elements = web_object_content.find_all_by_xpath('//div[@class="content"]/p | //div[@class="content"]/div | //div[@class="article-content"]//p')
if text_elements:
text_content = '\n'.join(element.get_text().strip() for element in text_elements if element.get_text().strip())
except:
pass
# 如果正文内容为空,使用标题内容
if not text_content.strip():
text_content = title_content
# 下载附件
result_list = download_new(web_object_content, public_date)
path_file = result_list[0]
all_text_content = result_list[1]
target_url = item
target_id = url_to_md5(target_url)
web_object_content.close()
# 合并文章内容和附件内容
if all_text_content != "无附件内容" and all_text_content.strip() != "":
# 如果有附件内容,则合并文章内容和附件内容
combined_content = f"{text_content}\n\n附件内容:\n{all_text_content}"
else:
# 如果无附件内容,则只使用文章内容
combined_content = text_content
# 返回数据(按照数据库表结构,添加文章标题字段)
data = ["李耀铭", article_title, dept, text_sign, level, public_date, keep_time, combined_content, path_file, item, target_id]
return data
except Exception as e:
print(f"处理文章 {item} 时出错: {str(e)}")
try:
web_object_content.close()
except:
pass
return None
####################################获取文章内容#####################################
def get_content():
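    """读取 yaozhi_url.csv 中的链接,使用线程池逐篇抓取文章内容,每满5篇批量写入MySQL和CSV"""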
# 读取CSV文件中的URL和标题
csv_file_path = 'D:\\作业\\yaozhi_url.csv'
execl_url_list = []
execl_title_list = []
with open(csv_file_path, mode='r', encoding='utf-8-sig') as file:
reader = csv.reader(file)
next(reader) # 跳过表头
for row in reader:
execl_title_list.append(row[0]) # 标题在第一列
execl_url_list.append(row[1]) # 链接在第二列
print(f"开始处理 {len(execl_url_list)} 篇文章...")
data_lines = []
processed_count = 0
# 使用线程池并行处理文章
max_workers = 2 # 减少并发数以避免浏览器崩溃
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# 提交所有任务
future_to_url = {executor.submit(get_single_article_content, url, title): (url, title) for url, title in zip(execl_url_list, execl_title_list)}
# 收集结果
for future in as_completed(future_to_url):
url, title = future_to_url[future]
try:
result = future.result()
if result:
data_lines.append(result)
processed_count += 1
print(f"已处理第 {processed_count} 篇文章: {result[2][:20]}...")
# 每5篇文章上传到数据库和CSV
if len(data_lines) >= 5:
print(f"已收集 {len(data_lines)} 篇文章,正在批量上传到数据库和CSV...")
upload_to_mysql(data_lines)
set_csv(data_lines) # 保存批量数据到CSV
# 清空已上传的数据
data_lines = []
print(f"前 {processed_count} 篇文章已批量上传到数据库和CSV")
except Exception as e:
print(f"处理URL {url} 时出错: {str(e)}")
# 上传剩余的数据(如果不足5篇)
if data_lines:
print(f"正在上传剩余的 {len(data_lines)} 篇文章到数据库和CSV...")
upload_to_mysql(data_lines)
set_csv(data_lines) # 保存剩余数据到CSV
print("剩余文章已上传到数据库和CSV")
print(f"总共处理了 {processed_count} 篇文章")
return data_lines
#######################################附件下载#####################################
def download_new(web_object_content, public_date):
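    """下载当前文章页内的附件(pdf/doc/docx/xls/xlsx),按发布日期归档到本地文件夹。

    返回 [附件相对路径(逗号分隔,无附件时为"无附件"), 附件提取文本(无内容时为"无附件内容")]。
    """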
# 初始化结果变量
str_download_path = "无附件"
all_text = "无附件内容"
download_list = []
try:
# 获取页面中的所有下载链接(包含Word、PDF等)
# 尝试多种可能的附件链接XPath
download_files = web_object_content.find_all_by_xpath('//a[contains(@href, ".pdf") or contains(@href, ".doc") or contains(@href, ".docx") or contains(@href, ".xls") or contains(@href, ".xlsx")]')
if not download_files:
download_files = web_object_content.find_all_by_xpath('//span[@class="fileLink"]//a | //div[@class="attachment"]//a')
if not download_files:
print("未找到附件链接")
return [str_download_path, all_text]
# 创建根目录和日期文件夹
root_dir = r"D:\\作业\\yaozhi"
os.makedirs(root_dir, exist_ok=True) # 确保根目录存在
date_folder = os.path.join(root_dir, public_date if public_date != "未知" else "no_date")
os.makedirs(date_folder, exist_ok=True) # 自动创建文件夹(已存在则不报错)
for item in download_files:
try:
# 获取文件信息
file_title = item.get_text().strip() # 获取链接文本作为文件名
file_url = item.get_attribute('href').strip()
# 处理相对链接
if file_url.startswith('/'):
file_url = urljoin("https://db.yaozh.com", file_url)
elif not file_url.startswith(('http://', 'https://')):
current_url = web_object_content.get_current_url()
file_url = urljoin(current_url, file_url)
# 提取原始文件名(从URL中)和文件类型
origin_file_name = os.path.basename(file_url)
if '.' not in origin_file_name:
# 如果URL中没有扩展名,尝试从链接文本推断
if 'pdf' in file_title.lower():
origin_file_name += '.pdf'
elif 'doc' in file_title.lower():
origin_file_name += '.doc'
elif 'docx' in file_title.lower():
origin_file_name += '.docx'
else:
continue # 跳过无法识别的文件
file_type = origin_file_name.split('.')[-1].lower() if '.' in origin_file_name else 'unknown'
# 如果文件名为空,使用链接文本作为文件名
if not file_title or file_title == origin_file_name:
file_title = origin_file_name
elif not file_title.endswith(f'.{file_type}'):
file_title = f"{file_title}.{file_type}" # 确保标题带扩展名
print(f"准备下载: {file_title} from {file_url}")
# 下载文件
                response = requests.get(file_url, stream=True, timeout=60)  # 加超时,防止单个附件卡住整个流程
if response.status_code == 200:
# 目标路径(日期文件夹内)
target_path = os.path.join(date_folder, file_title)
# 处理重名:如果目标文件已存在,添加序号
counter = 1
original_target_path = target_path
while os.path.exists(target_path):
name, ext = os.path.splitext(file_title)
target_path = os.path.join(date_folder, f"{name}_{counter}{ext}")
counter += 1
# 保存文件
with open(target_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"文件下载成功:{target_path}")
# 记录相对路径
relative_path = os.path.relpath(target_path, root_dir)
download_list.append(relative_path)
# 提取文件内容(根据类型处理)
if file_type == 'pdf':
try:
with pdfplumber.open(target_path) as pdf:
page_texts = []
for page in pdf.pages:
text = page.extract_text()
if text:
page_texts.append(text.strip())
extracted_text = "\n\n".join(page_texts)
all_text = extracted_text if extracted_text else all_text
except Exception as e:
print(f"PDF提取错误: {str(e)}")
                    elif file_type in ['docx', 'doc']:  # 仅解析docx;旧版doc为二进制格式,python-docx无法读取,只下载不提取
try:
if file_type == 'docx':
doc = docx.Document(target_path)
content = []
# 提取段落
for para in doc.paragraphs:
para_text = para.text.strip()
if para_text:
content.append(para_text)
# 提取表格
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
cell_text = cell.text.strip()
if cell_text:
content.append(cell_text)
extracted_text = "\n".join(content)
all_text = extracted_text if extracted_text else all_text
except Exception as e:
print(f"Word文档提取错误: {str(e)}")
else:
print(f"下载失败,状态码: {response.status_code}")
except Exception as e:
print(f"处理下载链接时出错: {str(e)}")
continue
# 拼接所有下载路径
if download_list:
str_download_path = ",".join(download_list)
else:
str_download_path = "无附件" # 确保无附件时返回正确的值
except Exception as e:
print(f"下载附件时出错: {str(e)}")
str_download_path = "无附件" # 出错时也返回正确的值
return [str_download_path, all_text]
##############################platform_id字段的生成############################
def url_to_md5(url):
# 创建MD5哈希对象
md5_hash = hashlib.md5()
# 将URL编码为UTF-8字节流(哈希函数需要字节输入)
md5_hash.update(url.encode('utf-8'))
# 获取16进制表示的哈希结果(32位)
return md5_hash.hexdigest()
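# 用法示意:url_to_md5("https://db.yaozh.com/policies?p=1") 返回32位十六进制字符串,
# 作为 platform_id 写入数据库;若表上以该字段建唯一键,即可配合 ON DUPLICATE KEY UPDATE 实现去重更新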
###########################将所需数据插入到指定CSV里#########################
def set_csv(data_lines):
    # 定义CSV文件路径(注意:附件根目录为 D:\作业\yaozhi,这里使用独立的文件名以避免与该目录冲突;文件名为假定值,可按需调整)
    csv_file_path = r'D:\作业\yaozhi_content.csv'
# 检查文件是否存在,以决定是否写入表头
file_exists = os.path.isfile(csv_file_path)
# 以追加模式写入CSV文件
with open(csv_file_path, mode='a', newline='', encoding='utf-8-sig') as file:
writer = csv.writer(file)
# 如果文件不存在,写入表头
if not file_exists:
writer.writerow(['学生', '文章标题', '发布部门', '字号', '效力级别', '发布日期', '有效时间', '内容', '附件路径', '原文链接', '平台ID'])
# 写入数据
for data in data_lines:
writer.writerow(data)
def upload_to_mysql(data_lines):
    """批量上传文章数据到MySQL数据库(data_lines为get_single_article_content返回结果组成的列表)"""
    config = {
        "host": "192.168.100.18",
        "port": 3306,
        "user": "yiyaoqixie",
        "password": "123456",
        "database": "yiyaoqixie",
        "charset": "utf8mb4",
        "cursorclass": pymysql.cursors.DictCursor
    }
    sql = """
        INSERT INTO yaozh_policies (
            student, title, cat, code, public_date, content, attachments, url, platform_id
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            title = VALUES(title),
            cat = VALUES(cat),
            code = VALUES(code),
            public_date = VALUES(public_date),
            content = VALUES(content),
            attachments = VALUES(attachments),
            url = VALUES(url)
    """
    connection = None
    cursor = None
    try:
        connection = pymysql.connect(**config)
        cursor = connection.cursor()
        for data in data_lines:
            # data顺序: [学生, 文章标题, 发布部门, 字号, 效力级别, 发布日期, 有效时间, 内容, 附件路径, 原文链接, 平台ID]
            # 字段映射为假定:cat对应发布部门,code对应字号;效力级别与有效时间未入库,请按实际表结构调整
            cursor.execute(sql, (
                data[0],        # student
                data[1],        # title
                data[2] or "",  # cat(发布部门)
                data[3] or "",  # code(字号)
                data[5] or "",  # public_date(发布日期)
                data[7] or "",  # content(正文+附件内容)
                data[8] or "",  # attachments(附件相对路径)
                data[9],        # url(原文链接)
                data[10]        # platform_id(url的MD5)
            ))
            if cursor.rowcount == 1:
                print(f"✅ 数据入库成功:{data[1][:20]}")
            elif cursor.rowcount == 2:
                print(f"🔄 数据更新成功:{data[1][:20]}")
        connection.commit()
        return True
    except Exception as e:
        if connection:
            connection.rollback()
        print(f"❌ 数据入库失败: {str(e)[:60]}")
        return False
    finally:
        # 安全关闭资源
        if cursor:
            cursor.close()
        if connection:
            connection.close()
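# 参考建表语句(仅为示意草稿:字段类型与唯一键均为假定,需按实际库表调整;
# ON DUPLICATE KEY UPDATE 生效的前提是 platform_id 上存在唯一索引):
# CREATE TABLE IF NOT EXISTS yaozh_policies (
#     id          BIGINT AUTO_INCREMENT PRIMARY KEY,
#     student     VARCHAR(64),
#     title       VARCHAR(512),
#     cat         VARCHAR(255),
#     code        VARCHAR(255),
#     public_date VARCHAR(32),
#     content     LONGTEXT,
#     attachments TEXT,
#     url         VARCHAR(512),
#     platform_id CHAR(32),
#     UNIQUE KEY uk_platform_id (platform_id)
# ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;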
#########################################主函数#######################################
def main(args):
print("开始获取网页数据...")
result1, result2 = get_web()
print(f"获取到 {len(result1)} 个标题和 {len(result2)} 个链接")
if result1 and result2:
url_insert_csv(result1, result2)
print("开始获取文章内容...")
get_content()
print("爬取完成!")
else:
print("未能获取到数据,请检查网络连接和XPath选择器")
if __name__ == "__main__":
import sys
main(sys.argv)