from lxml import etree
import requests
# Scheme and host of the target site.
BASE_DOMIN = "http://dytt8.net"

# Full URL of the list_23_1.html page under /html/gndy/dyzz/ on that host.
url = BASE_DOMIN + "/html/gndy/dyzz/list_23_1.html"

# HTTP headers passed to requests.get(); carries a desktop-Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.96 Safari/537.36"
    ),
}
def get_detail_urls(url):
    """Collect the absolute detail-page URLs linked from one list page.

    Args:
        url: Address of the list page to download.

    Returns:
        A list of strings: ``BASE_DOMIN`` prepended to every ``href`` found
        on ``<a>`` elements inside a ``<table class="tbspan">``.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)

    # requests decodes the body with an encoding it guesses itself and stores
    # the result on ``.text``.  For this site the guess is wrong, so non-ASCII
    # text comes out garbled.  Only the href attributes are read here
    # (presumably plain ASCII paths -- confirm), so ``.text`` is still used.
    html = etree.HTML(response.text)
    hrefs = html.xpath(".//table[@class='tbspan']//a/@href")

    # Return a real list rather than a one-shot ``map`` iterator so callers
    # can take its length or iterate over it more than once.
    return [BASE_DOMIN + href for href in hrefs]
def parse_detail_page(url):
movie = {}
response = requests.get(url, headers=headers)
text = response.content.decode("gbk")
html = etree.HTML(tex
电影天堂爬虫
最新推荐文章于 2025-06-23 10:16:45 发布