# Qiushibaike (糗事百科) crawler: walks the first two listing pages of the
# recommendation feed, extracts the detail-page links, and saves each
# linked page as a local HTML file under d://26/.
import os
import re
import urllib.error
import urllib.request


def _extract_links(pagedata):
    """Return the relative detail-page hrefs found in a listing page.

    Matches the href directly following each <div class="recmd-right">
    container (the site's recommended-item markup). `re.S` lets `.*?`
    span newlines between the div and its anchor.
    """
    pat = '<div class="recmd-right">.*?href="(.*?)"'
    return re.compile(pat, re.S).findall(pagedata)


def main():
    """Crawl listing pages 1-2 and download every linked detail page.

    Side effects: installs a global urllib opener with a browser-like
    User-Agent (the site blocks the default Python UA), creates the
    output directory, and writes <page><index>.html files into it.
    """
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/76.0.3809.132 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

    # urlretrieve raises FileNotFoundError when the target directory is
    # missing, so make sure it exists before the download loop starts.
    os.makedirs("d://26/", exist_ok=True)

    for i in range(1, 3):
        url = "https://www.qiushibaike.com/8hr/page/" + str(i) + "/"
        # 'ignore' drops undecodable bytes rather than aborting the page.
        pagedata = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
        for j, thisdata in enumerate(_extract_links(pagedata)):
            try:
                thisdataurl = "https://www.qiushibaike.com" + thisdata
                file = "d://26/" + str(i) + str(j) + ".html"
                urllib.request.urlretrieve(thisdataurl, filename=file)
            except urllib.error.URLError as e:
                if hasattr(e, 'code'):
                    print(e.code)
                if hasattr(e, 'reason'):
                    print(e.reason)
            except Exception as e:
                # Report what actually failed instead of swallowing
                # every non-URL error behind a bare "异常".
                print("异常:", e)


if __name__ == "__main__":
    main()