本次案例针对的是电影天堂上的电影,爬取所有电影的链接,以及每个链接对应的详情页。
整体思路是:封装两个函数,一个用来从列表页获取每部电影详情页的url,另一个用来对详情页的url进行解析。
这个函数用来获取列表页中每部电影详情页的URL:
def get_detail_url(url):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of absolute URLs, built by prefixing ``BASE_DOMAIN`` to every
        ``href`` found inside ``<table class="tbspan">`` elements.
    """
    response = requests.get(url, headers=HEADERS)
    # Decode the raw bytes as GBK directly.  Going through
    # ``response.text.encode('ISO-8859-1')`` only works when requests happened
    # to pick ISO-8859-1 as the text encoding; with any other guess the
    # re-encode step raises UnicodeEncodeError.
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # Return a real list (not a single-use lazy ``map`` object) so callers can
    # iterate it more than once or take its length.
    return [BASE_DOMAIN + href for href in hrefs]
这个函数用来对每个电影的url进行解析:
def parse_detail_page(url):
    """Download one movie detail page and extract its metadata.

    Args:
        url: Address of the movie detail page.

    Returns:
        A dict with the keys ``title``, ``cover``, ``screenshot`` and
        ``download_url`` always present (``None`` when the page lacks the
        corresponding element), plus whichever of ``year``, ``country``,
        ``catagory``, ``douban_rating``, ``duration``, ``director``,
        ``actors`` (list of str) and ``profile`` (str) are found in the text
        of the ``<div id="Zoom">`` element.  The misspelled ``catagory`` key
        is kept as-is because callers may already read it.
    """
    response = requests.get(url, headers=HEADERS)
    # Decode the raw bytes as GBK directly; re-encoding ``response.text`` as
    # ISO-8859-1 fails whenever requests guessed a different text encoding.
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)

    movie = {}
    movie['title'] = _first(
        html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    )

    # An lxml element is falsy when it has no children, so compare to None.
    zoom = _first(html.xpath("//div[@id='Zoom']"))
    imgs = zoom.xpath(".//img/@src") if zoom is not None else []
    # First image is the cover, second (if any) is the screenshot.
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    infos = zoom.xpath(".//text()") if zoom is not None else []
    movie.update(_parse_info_lines(infos))

    movie['download_url'] = _first(html.xpath("//td[@bgcolor='#fdfddf']/a/@href"))
    return movie


# Label prefix -> key in the result dict, for fields whose value sits on the
# same line as the label.
_SIMPLE_FIELDS = (
    ("◎年 代", 'year'),
    ("◎产 地", 'country'),
    ("◎类 别", 'catagory'),
    ("◎豆瓣评分", 'douban_rating'),
    ("◎片 长", 'duration'),
    ("◎导 演", 'director'),
)


def _first(items):
    """Return the first element of *items*, or None when it is empty."""
    return items[0] if items else None


def _strip_label(line, label):
    """Return *line* without its leading *label*, trimmed of whitespace."""
    return line[len(label):].strip()


def _continuation(lines, start):
    """Yield the non-blank lines from *start* up to the next labelled line."""
    for line in lines[start:]:
        # A new "◎" field ends the current multi-line value.
        # NOTE(review): "【" is assumed to open a non-metadata section header
        # (e.g. the download section) -- confirm against live pages.
        if line.startswith("◎") or line.startswith("【"):
            break
        if line:
            yield line


def _parse_info_lines(infos):
    """Turn the text nodes of the Zoom div into a dict of movie fields."""
    lines = [info.strip() for info in infos]
    fields = {}
    for index, line in enumerate(lines):
        if line.startswith("◎主 演"):
            # The first actor shares the label line; the rest follow on
            # their own lines until the next labelled field.
            actors = [_strip_label(line, "◎主 演")]
            actors.extend(_continuation(lines, index + 1))
            fields['actors'] = [actor for actor in actors if actor]
        elif line.startswith("◎简 介"):
            # The synopsis text follows the label line.  Collect it up to the
            # next labelled line instead of overwriting the value with every
            # remaining text node.
            fields['profile'] = "\n".join(_continuation(lines, index + 1))
        else:
            for label, key in _SIMPLE_FIELDS:
                if line.startswith(label):
                    fields[key] = _strip_label(line, label)
                    break
    return fields