基本和上次写的爬取不得姐视频差不太多，也是正则表达式方法，所以这次就直接贴代码了。
import http.client
import re
import sys
import urllib.error
import urllib.request
def download(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request could not be
        built, the transfer failed, or the body is not valid UTF-8. A short
        message describing the failure is written to stderr in that case.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    try:
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('UTF-8')
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError and socket-level failures;
        # ValueError covers malformed URLs and UnicodeDecodeError.
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so the
        # script can still be interrupted.
        print("download failed for {}: {}".format(url, exc), file=sys.stderr)
        return None
# Regular expressions for the fields scraped from each article-list page.
# They do not depend on the page, so they are defined once, outside the loop.
titlereg = r'<h3 class="list_c_t"><a href=".*?">(.*?)</a></h3>'
yearreg = r'<div class="date_t"><span>(.*?)</span>'
monthreg = r'<div class="date_t"><span>.*?</span><em>(.*?)</em></div>'
dayreg = r'<div class="date_b">(.*?)</div>'

# Walk list pages 1 and 2 and print one "year.month.day----title" line per
# matched article.
# NOTE(review): the host name below is kept exactly as found — confirm it is
# the intended blog host.
for page in range(1, 3):
    url = "http://blog.youkuaiyun.com/Joliph/article/list/" + str(page)
    html = download(url)
    if html is None:
        # download() yields None when the fetch fails; re.findall would raise
        # TypeError on it, so skip this page instead of crashing.
        continue
    titlelist = re.findall(titlereg, html)
    yearlist = re.findall(yearreg, html)
    monthlist = re.findall(monthreg, html)
    daylist = re.findall(dayreg, html)
    # zip stops at the shortest list, so a page where one pattern matched
    # fewer times than the titles cannot raise IndexError.
    for year, month, day, title in zip(yearlist, monthlist, daylist, titlelist):
        print(year + "." + month + "." + day + "----" + title)