The urllib Module
import urllib.request
# Fetch the page into memory, ignoring errors during decoding
data = urllib.request.urlopen("http://www.jd.com").read().decode('utf-8', 'ignore')
# Confirm the fetch succeeded by checking the length of the data
len(data)
# Extract the page title
import re
pat = "<title>(.*?)</title>"
title = re.compile(pat, re.S).findall(data)
# ['京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!']
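Note that findall returns a list of all matches, so the title string itself is title[0]; the re.S flag lets . match newlines as well, in case the tag spans several lines:

print(title[0])  # first (and here only) match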
# Fetch the page straight to disk
urllib.request.urlretrieve("http://www.jd.com", filename="./jd.html")
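urlretrieve also accepts a reporthook callback for download progress; a minimal sketch (show_progress is a name chosen here, not part of the course):

def show_progress(block_num, block_size, total_size):
    # Called with (blocks transferred so far, block size in bytes, total file size)
    if total_size > 0:
        percent = min(100.0, block_num * block_size * 100.0 / total_size)
        print("%.1f%% downloaded" % percent)

urllib.request.urlretrieve("http://www.jd.com", filename="./jd.html", reporthook=show_progress)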
# Browser spoofing: send a browser User-Agent header
opener = urllib.request.build_opener()
UA = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
opener.addheaders = [UA]
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://www.qiushibaike.com").read().decode('utf-8', 'ignore')
# Use a User-Agent pool
uapools = [
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
]
import random
def UA():
    # Build an opener carrying a randomly chosen User-Agent and install it globally
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
for i in range(10):
    UA()  # each iteration re-installs the opener with a fresh random UA
    data = urllib.request.urlopen("https://www.qiushibaike.com").read().decode('utf-8', 'ignore')
Qiushibaike in Practice
Batch-crawl jokes, paging through the listing automatically.
import urllib.request
import re
import random
import time
uapools = [
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
]
def UA():
    # Install a global opener carrying a randomly chosen User-Agent
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
for i in range(0, 10):
    UA()
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 1) + '/?s=4948859'
    try:
        data = urllib.request.urlopen(thisurl).read().decode('utf-8', 'ignore')
        # Pattern for the old page layout:
        # pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
        pat = '<a class="recmd-content" .*?>(.*?)</a>'
        rst = re.compile(pat, re.S).findall(data)
        for j in range(0, len(rst)):
            print(rst[j])
            print("-----")
        time.sleep(1)  # pause between pages so we don't hammer the server
    except Exception as err:
        print(err)  # surface failures instead of silently swallowing them
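The same per-page logic reads more cleanly factored into a function, which also makes it easy to add a timeout; a sketch assuming the recmd-content layout above still holds (fetch_page is a name introduced here):

def fetch_page(page_num):
    # Fetch one listing page and return the captured joke snippets
    UA()
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num) + "/"
    data = urllib.request.urlopen(url, timeout=10).read().decode('utf-8', 'ignore')
    return re.compile('<a class="recmd-content" .*?>(.*?)</a>', re.S).findall(data)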
Notes for Other Sections
Alibaba Cloud Crawler Project Course Notes 【1】: Regular Expressions and XPath Expressions
Alibaba Cloud Crawler Project Course Notes 【3】: Tencent Video Comments in Practice
Alibaba Cloud Crawler Project Course Notes 【4】: The Requests Module, and a Yunqi Community Blog Crawler in Practice
Alibaba Cloud Crawler Project Course Notes 【5】: The Scrapy Module, and a Dangdang Crawler in Practice
Alibaba Cloud Crawler Project Course Notes 【6 - 8】: Job Listings, Taobao Product Listings, and Zhihu Crawlers in Practice
Alibaba Cloud Crawler Project Course Notes 【9 & 10】: Common Anti-Crawling Strategies and How to Beat Them, Tencent Comics Crawling in Practice, and Distributed Crawlers