import requests
from lxml import html
import csv
# --- Part 1: standalone script version ---
# Scrape one People's Daily opinion article: title, publish time, editor,
# and body text, then save the result as CSV and TXT.
url = "http://opinion.people.com.cn/n1/2025/1111/c1003-40600808.html"

# Fetch the page; timeout prevents hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
response.raise_for_status()  # fail fast on HTTP 4xx/5xx instead of parsing an error page
response.encoding = 'utf-8'  # force UTF-8 decoding of the body

# Parse the HTML into an lxml element tree for XPath queries.
tree = html.fromstring(response.text)

# Title: first <h1> on the page, with a fallback placeholder.
title_nodes = tree.xpath('//h1/text()')
title = title_nodes[0].strip() if title_nodes else "未找到标题"

# Publish time lives in the "col-1-1 fl" div, before the "来源:" marker.
# NOTE: renamed from `time` so the local does not shadow the stdlib module name.
time_nodes = tree.xpath('//div[@class="col-1-1 fl"]/text()')
if time_nodes:
    time_text = time_nodes[0].strip()
    pub_time = time_text.split('来源:')[0].strip() if '来源:' in time_text else time_text
else:
    pub_time = "未找到时间"

# Editor/author: "author cf" div, stripping the "编辑:" prefix.
author_nodes = tree.xpath('//div[@class="author cf"]/text()')
if author_nodes:
    author = author_nodes[0].replace('编辑:', '').strip()
else:
    author = "未找到作者"

# Body paragraphs: main article container plus the trailing editor block.
content_nodes = tree.xpath('//div[@class="rm_txt_con cf"]//p/text() | //div[@class="edit cf"]//p/text()')
if content_nodes:
    content = '\n'.join(p.strip() for p in content_nodes)
else:
    content = "未找到正文内容"

# Print a short summary to the console.
print("爬取结果:")
print(f"标题:{title}")
print(f"时间:{pub_time}")
print(f"作者:{author}")
print(f"内容预览:{content[:100]}...")

# Save to CSV; utf-8-sig adds a BOM so Excel auto-detects the encoding.
with open('人民网文章.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerow(['标题', '时间', '作者', '内容'])
    writer.writerow([title, pub_time, author, content])
print("\n数据已保存到 '人民网文章.csv' 文件")

# Also save a plain-text copy.
with open('人民网文章.txt', 'w', encoding='utf-8') as file:
    file.write(f"标题:{title}\n")
    file.write(f"时间:{pub_time}\n")
    file.write(f"作者:{author}\n")
    file.write(f"内容:\n{content}\n")
print("数据已保存到 '人民网文章.txt' 文件")
import tkinter as tk
from tkinter import messagebox, scrolledtext
import requests
from lxml import html
import csv
class SimpleSpider:
    """Minimal Tkinter GUI wrapper around the People's Daily article scraper.

    Crawls one article URL via XPath and lets the user export the result
    to CSV or TXT. The last successful crawl is cached in ``self.data``.
    """

    def __init__(self, root):
        self.root = root
        self.root.title("人民网爬虫")
        self.root.geometry("600x500")
        # Cached crawl result: keys 'title', 'time', 'author', 'content'.
        self.data = {}
        self.create_ui()

    def create_ui(self):
        """Build the URL entry, the action buttons, and the result area."""
        # URL input row, pre-filled with a sample article.
        tk.Label(self.root, text="文章URL:").pack(pady=5)
        self.url_entry = tk.Entry(self.root, width=70)
        self.url_entry.pack(pady=5)
        self.url_entry.insert(0, "http://opinion.people.com.cn/n1/2025/1111/c1003-40600808.html")
        # Action buttons.
        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="开始爬取", command=self.crawl).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="保存CSV", command=self.save_csv).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="保存TXT", command=self.save_txt).pack(side=tk.LEFT, padx=5)
        # Scrollable result display.
        tk.Label(self.root, text="爬取结果:").pack(anchor='w', pady=(10, 0))
        self.result_text = scrolledtext.ScrolledText(self.root, height=20)
        self.result_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)

    def crawl(self):
        """Fetch the entered URL, extract the article fields, and display them."""
        url = self.url_entry.get().strip()
        if not url:
            messagebox.showerror("错误", "请输入URL")
            return
        try:
            response = requests.get(url, timeout=10)
            # Surface HTTP 4xx/5xx as errors instead of parsing an error page.
            response.raise_for_status()
            response.encoding = 'utf-8'
            tree = html.fromstring(response.text)
            # XPath extraction with a fallback placeholder for each field.
            title = tree.xpath('//h1/text()')
            title = title[0].strip() if title else "未找到标题"
            time_elem = tree.xpath('//div[@class="col-1-1 fl"]/text()')
            time = time_elem[0].strip().split('来源:')[0].strip() if time_elem else "未找到时间"
            author_elem = tree.xpath('//div[@class="author cf"]/text()')
            author = author_elem[0].replace('编辑:', '').strip() if author_elem else "未找到作者"
            content_elems = tree.xpath('//div[@class="rm_txt_con cf"]//p/text()')
            content = '\n'.join(p.strip() for p in content_elems) if content_elems else "未找到正文"
            # Cache for the save buttons.
            self.data = {'title': title, 'time': time, 'author': author, 'content': content}
            # Replace any previous result in the text widget.
            self.result_text.delete(1.0, tk.END)
            self.result_text.insert(tk.END, f"标题:{title}\n\n时间:{time}\n\n作者:{author}\n\n内容:\n{content}")
            messagebox.showinfo("成功", "爬取完成!")
        except Exception as e:
            messagebox.showerror("错误", f"爬取失败: {str(e)}")

    def save_csv(self):
        """Write the cached article to CSV (utf-8-sig BOM so Excel opens it cleanly)."""
        if not self.data:
            messagebox.showerror("错误", "请先爬取文章")
            return
        try:
            with open('人民网文章.csv', 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerow(['标题', '时间', '作者', '内容'])
                writer.writerow([self.data['title'], self.data['time'], self.data['author'], self.data['content']])
            messagebox.showinfo("成功", "已保存为CSV文件")
        except Exception as e:
            messagebox.showerror("错误", f"保存失败: {str(e)}")

    def save_txt(self):
        """Write the cached article to a plain UTF-8 text file."""
        if not self.data:
            messagebox.showerror("错误", "请先爬取文章")
            return
        try:
            with open('人民网文章.txt', 'w', encoding='utf-8') as f:
                f.write(f"标题:{self.data['title']}\n")
                f.write(f"时间:{self.data['time']}\n")
                f.write(f"作者:{self.data['author']}\n")
                f.write(f"内容:\n{self.data['content']}\n")
            messagebox.showinfo("成功", "已保存为TXT文件")
        except Exception as e:
            messagebox.showerror("错误", f"保存失败: {str(e)}")
if __name__ == "__main__":
    # Launch the GUI: create the root window, mount the app, run the event loop.
    root = tk.Tk()
    app = SimpleSpider(root)
    root.mainloop()
# TODO: 对以上代码进行修改:包含py文档/下载内容,方法至少含正则表达式、xpath等基础方法,多线程、自动化等
# (原文为无效的 "//" 注释;"最新发布" 为网页复制残留文字)