思路
分析百度贴吧URL
发现，不同页面的URL唯一不同的是pn参数的值，每一页递增50。这样URL就搞定了。
编码思路
1. 构造URL
2. 根据URL,使用urllib.request发起网络请求,并获取HTML页面字符串
3. 将HTML页面字符串保存到本地磁盘中
源码
# !/usr/bin/env python
# -*- coding:utf-8 -*-
"""
使用urllib库下载百度贴吧页面
"""
import os
import random
import urllib.parse
import urllib.request
def build_urls():
    """
    Build the list of page URLs to crawl.

    :return: list of 10 fully-qualified page URLs, pn stepping by 50
    """
    urls = []
    begin_page = 1
    end_page = 10
    base_url = "https://tieba.baidu.com/f?"
    # Percent-encode the tieba name so the non-ASCII keyword is URL-safe.
    tieba_name = urllib.parse.urlencode({"kw": "诛仙"})
    # BUGFIX: the original used urllib.parse.urljoin(base_url, tieba_name),
    # which treats "kw=..." as a relative *path* and produces
    # "https://tieba.baidu.com/kw=..." — dropping "/f?" entirely.
    # Simple concatenation keeps the query string intact.
    base_url = base_url + tieba_name
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
        final_url = base_url + "&pn=" + str(pn)
        urls.append(final_url)
    return urls
def load_page(url, page):
    """
    Fetch one HTML page and hand the content to download_html for saving.

    :param url: URL to request
    :param page: 1-based page number (for logging and the output file name)
    :return: None
    """
    print("\n正在加载第" + str(page) + "页")
    # Rotate through a few desktop User-Agent strings to look less bot-like.
    user_agents = [
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    user_agent = random.choice(user_agents)
    # Pass the header through the Request constructor instead of the
    # keyword-argument add_header(key=..., val=...) call.
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    # Context manager guarantees the response/socket is closed (the
    # original leaked it), and the decode is pinned to UTF-8.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    # Persist the page to local disk
    download_html(html, page)
def download_html(html, page):
    """
    Save an HTML string under the local "html/" directory.

    :param html: page content to write
    :param page: 1-based page number, used in the file name
    :return: None
    """
    print("正在下载第" + str(page) + "页")
    html_name = "诛仙吧第" + str(page) + "页.html"
    # Create the target directory on first use instead of crashing with
    # FileNotFoundError when "html/" does not exist yet.
    os.makedirs("html", exist_ok=True)
    html_path = "html/" + html_name
    # Write as UTF-8 explicitly: the platform default encoding (e.g. GBK
    # on Chinese Windows) can fail on characters in the downloaded page.
    with open(html_path, mode='w', encoding='utf-8') as f:
        f.write(html)
def tieba_spider():
    """
    Scheduler: build all page URLs, then load each page in order.

    :return: None
    """
    # Build the URL list
    urls = build_urls()
    # enumerate yields the page number directly; the original used
    # urls.index(url), which is an O(n) scan per iteration and would
    # return the wrong page if the list ever contained duplicate URLs.
    for page, url in enumerate(urls, start=1):
        load_page(url, page)
if __name__ == '__main__':
    # Script entry point: kick off the crawl.
    tieba_spider()
运行结果