import re
import urllib.error
import urllib.request
# Scraper script: download the landing page at ``url``, extract the article
# links matched by ``pat`` and save each linked page as a numbered .html file.

# Landing page whose article links are harvested below.
url = "https://blog.youkuaiyun.com/"

# Pose as a regular browser (Firefox 61 on Windows 10).
# The previous literal contained a "…" character copied from a truncated
# display of the header; HTTP header values have to be Latin-1 encodable, so
# sending that value would raise UnicodeEncodeError. The full User-Agent
# string is used instead.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) "
    "Gecko/20100101 Firefox/61.0",
)
opener = urllib.request.build_opener()
# The attribute the opener reads is ``addheaders`` (plural). Assigning to
# ``addheader`` only created an unused attribute, so the custom User-Agent
# was never sent.
opener.addheaders = [headers]
# Install the opener globally so that urlopen() and urlretrieve() use it.
urllib.request.install_opener(opener)

# Fetch the landing page; the ``with`` block closes the response afterwards.
with urllib.request.urlopen(url) as response:
    data = response.read()
# Undecodable bytes are dropped rather than aborting the run.
data = data.decode("utf-8", "ignore")
print(len(data))

# Regular expression capturing the href of each article heading link.
pat = '<h3 class="tracking-ad"data-mod="popu_254"><a href="(.*?)"'
result = re.compile(pat).findall(data)
print(len(result))

for i, link in enumerate(result):
    try:
        # One output file per link, numbered by its position in the page.
        file = r"C:\Users\Mr.Ma\Desktop\优快云" + str(i) + ".html"
        urllib.request.urlretrieve(link, filename=file)
        print("第" + str(i) + "次爬取成功!")
    except urllib.error.URLError as e:
        # URLError is the parent of HTTPError, so connection-level failures
        # (DNS, refused connection) are reported as well and the loop moves
        # on to the next link. Only HTTPError carries ``code``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)