import csv
import time

import requests
from lxml import etree
# --- Interactive configuration, gathered once from stdin at import time ---

# Base URL with a "%s" placeholder for the page number appended.
origin_url = input("请输入爬取的目标网站(去除页码后缀):") + "%s"

# Inclusive page range; last_page is pre-incremented so range() covers it.
first_page = int(input("请输入爬取的第一页:"))
last_page = int(input("请输入爬取的最后一页:")) + 1
pages = list(range(first_page, last_page))

# XPath expressions supplied by the operator; the first three run against the
# listing page, the last two against each linked article page.
xpath_dict = {
    "titles_xpath": input("请输入标题的xpath解析方法:"),
    "times_xpath": input("请输入时间的xpath解析方法:"),
    "urls_xpath": input("请输入URL的xpath解析方法:"),
    "contents_xpath": input("请输入网页内容的xpath解析方法:"),
    "sources_xpath": input("请输入新闻来源的xpath解析方法:"),
}
def Request(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for connect/read before giving up
            (new keyword with a default, so existing callers are unaffected).

    Returns:
        The page body as a str.

    Raises:
        requests.RequestException: on network errors or timeout.
    """
    headers = {
        # A desktop browser UA; some news sites refuse the default
        # python-requests user agent.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    }
    # Without a timeout, requests.get can block forever on a dead host and
    # stall the whole crawl.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8: the target sites are assumed to serve UTF-8 even when the
    # charset header is missing or wrong — TODO confirm against real targets.
    response.encoding = 'utf-8'
    return response.text
def ParsePage(text, xpath_dict):
    """Parse a listing page, then fetch and clean each linked article.

    Args:
        text: HTML of the listing page.
        xpath_dict: Mapping with keys "titles_xpath", "times_xpath",
            "urls_xpath", "contents_xpath", "sources_xpath".

    Returns:
        A list of five parallel lists: [titles, times, contents, urls,
        sources]. contents and sources are padded with "" for articles
        that fail to download/parse, so all five lists stay the same
        length and row-wise CSV writing stays aligned.
    """
    html = etree.HTML(text)
    titles = html.xpath(xpath_dict["titles_xpath"])
    times = html.xpath(xpath_dict["times_xpath"])
    urls = html.xpath(xpath_dict["urls_xpath"])
    contents = []
    sources = []
    for url in urls:
        try:
            article = etree.HTML(Request(url))
            contents.append(_join_and_clean(article.xpath(xpath_dict["contents_xpath"])))
            sources.append(_join_and_clean(article.xpath(xpath_dict["sources_xpath"])))
        # except Exception (not bare except:) so Ctrl-C / SystemExit still work.
        except Exception:
            print("%s解析失败!" % url)
            # Placeholders keep contents/sources aligned with titles/urls;
            # previously a failed article shifted every later row and made
            # index-based writing raise IndexError.
            contents.append("")
            sources.append("")
    return [titles, times, contents, urls, sources]


def _join_and_clean(fragments):
    # Join xpath text fragments into one string and strip characters that
    # would corrupt the CSV output (ASCII commas -> fullwidth, drop
    # spaces/CR/LF).
    joined = "".join(fragments)
    return joined.replace(",", ",").replace(" ", "")\
        .replace("\r", "").replace("\n", "").strip()
def SaveFiles(datas, page):
    """Write one page of scraped rows to '第<page>页内容.csv' (UTF-8).

    Args:
        datas: Five parallel lists — [titles, times, contents, urls,
            sources]; row i is built from element i of each list.
        page: Page number used in the output filename.
    """
    filename = '第%s页内容.csv' % page
    # newline='' lets the csv module manage line endings itself; the csv
    # writer also quotes fields, so a comma inside a title or time no longer
    # corrupts the column layout (the old manual writes did no quoting).
    with open(filename, "w", encoding="utf-8", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(["标题", "时间", "内容", "URL", "来源"])
        # zip stops at the shortest list, so a partially failed page writes
        # the rows it has instead of raising IndexError like the old
        # index-based loop. The `with` block closes the file; the old
        # explicit fp.close() was redundant.
        writer.writerows(zip(*datas))
def Main():
    """Crawl every configured page: download, parse, save, then pause."""
    for page in pages:
        page_url = origin_url % page
        page_text = Request(page_url)
        SaveFiles(ParsePage(page_text, xpath_dict), page)
        print("第%s页解析完毕!" % page)
        time.sleep(1)  # brief pause between pages to go easy on the server
    print("全部页面解析完成!")
# Run the crawler only when executed as a script (not when imported).
if __name__ == '__main__':
    Main()