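'''
Scrape classic movies from the Douban website.

The original goal is to collect each movie's title and plot summary; the
script currently saves each title together with the link to the movie's
detail page, one text file per listing page.
'''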
from pathlib import Path

import requests
from lxml import etree
class DoubanSpider(object):
    """Scrape movie titles and detail-page links from Douban's Top 250 list.

    Each listing page is downloaded, parsed with XPath, and written to
    ``./豆瓣爬取/第N页.txt`` as alternating title / URL lines.
    """

    def __init__(self):
        # Initial URL only; run() overwrites it with the paginated Top 250
        # URL before every request.
        self.url = "https://movie.douban.com/explore#!type=movie&tag=%E7%BB%8F%E5%85%B8"
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }
        # Not read or written by any method of this class.
        self.data_list = []

    def get_response(self, url):
        """Download *url* and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page content as a ``str``.

        Raises:
            requests.RequestException: If the request times out, the
                connection fails, or the server answers with a 4xx/5xx
                status.
        """
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; raise_for_status() stops an error page from being
        # parsed as if it were a listing page.
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def parse_data(self, data, i):
        """Extract titles and links from one listing page and save them.

        Args:
            data: HTML source of the listing page.
            i: Zero-based page index, forwarded to :meth:`save`.
        """
        tree = etree.HTML(data)
        # First <span> inside each title anchor holds the movie name.
        name_list = tree.xpath('//div[@class="hd"]/a/span[1]/text()')
        url_list = tree.xpath('//div[@class="hd"]/a/@href')
        # Progress/debug output: what was found on this page.
        print(len(name_list))
        print(name_list)
        print(len(url_list))
        print(url_list)
        self.save(i, name_list, url_list)

    def save(self, i, name_list, url_list):
        """Write title/URL pairs for one page to ``./豆瓣爬取/第{i+1}页.txt``.

        The output directory is created when missing and the file is
        written as UTF-8. Each title is followed by its URL on the next
        line. If the two lists differ in length, only the pairs present in
        both are written.

        Args:
            i: Zero-based page index; the file name uses ``i + 1``.
            name_list: Movie titles.
            url_list: Detail-page URLs, in the same order as *name_list*.
        """
        out_dir = Path("./豆瓣爬取")
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "第{}页.txt".format(i + 1)
        # Explicit UTF-8 so Chinese titles are written identically on every
        # platform; the context manager closes the file even if a write fails.
        with out_path.open("w", encoding="utf-8") as f:
            f.writelines(
                "{}\n{}\n".format(name, url)
                for name, url in zip(name_list, url_list)
            )

    def run(self):
        """Prompt for a page count and crawl that many Top 250 pages.

        Raises:
            ValueError: If the entered text is not an integer.
        """
        page = input("请输入要爬取的页数:")
        for i in range(int(page)):
            # The list is paginated through the ``start`` offset, 25 per page.
            self.url = "https://movie.douban.com/top250?start={}".format(str(i * 25))
            print(self.url)
            data = self.get_response(self.url)
            self.parse_data(data, i)
# Start the crawl only when executed as a script, so importing this module
# does not prompt for input or hit the network.
if __name__ == "__main__":
    DoubanSpider().run()