##########################
#author :Conan_ft
#Email:634598660@qq.com
import requests
from lxml import etree
from lxml import html
#from lxml.html import fromstring, tostring
# Listing page that the spider downloads.
url = "https://maoyan.com/films"

# Browser-like request headers sent with every request.
# NOTE(review): the Cookie value looks like it was copied from a single
# browser session, so it has presumably expired - confirm and replace it
# with a fresh one if the site starts rejecting requests.
headers = {
    'Referer': 'https://maoyan.com/films',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Cookie': (
        'uuid_n_v=v1; '
        'uuid=340E2D802BED11EB82D08318B644F2A4A83B9D90FA1F48BDA4C833A58466F5A9; '
        'Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1605958336,1605959248; '
        '_lxsdk_cuid=175ea93803db-0165966f617246-4c3f2678-1fa400-175ea93803ec8; '
        '_lxsdk_s=175ea938040-f87-df9-32d%7C%7C9; '
        '_lxsdk=340E2D802BED11EB82D08318B644F2A4A83B9D90FA1F48BDA4C833A58466F5A9; '
        '__mta=209639085.1605958336700.1605959257465.1605960054434.4; '
        '_csrf=792cc35eeb4498786e5dd80b985d35ae4c0147ecfff3719d7a84a6fbd4f286b1; '
        'Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1605960054'
    ),
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
}
class Spider(object):
    """Download the film listing page and print one line per film.

    The request uses the module-level ``url`` and ``headers``.
    """

    def start_requests(self):
        """Fetch the listing page and print title, detail URL and score.

        For every film found on the page, prints the title, the absolute
        detail-page URL and the score text, separated by spaces.

        Returns:
            None.

        Raises:
            requests.exceptions.RequestException: if the HTTP request fails
                or does not complete within the timeout.
        """
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        # Decode the body as UTF-8 rather than the encoding requests guessed.
        response.encoding = 'utf-8'
        xml = etree.HTML(response.text)

        film_tit_list = xml.xpath('//div[@class="channel-detail movie-item-title"]/a/text()')
        film_src_list = xml.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')

        # Build one score string per score div by joining the text of its
        # <i> children (presumably the integer and fraction parts of the
        # rating - TODO confirm). Zipping the flat list of <i> fragments
        # with itself would glue every fragment to its own copy and drift
        # out of step with the titles.
        # NOTE(review): assumes one score div per film, in listing order;
        # a div without <i> children yields an empty score string.
        score_divs = xml.xpath('//div[@class="channel-detail channel-detail-orange"]')
        score_list = ["".join(div.xpath('./i/text()')) for div in score_divs]

        for title, score, src in zip(film_tit_list, score_list, film_src_list):
            # The hrefs are concatenated onto the site origin to form the
            # full detail-page URL.
            final_url = "https://maoyan.com" + src
            print(title, final_url, score)
# Script entry: build the spider and run one scrape as soon as this module
# is executed (these statements run at module level, so importing the
# module triggers the request as well).
spider = Spider()
spider.start_requests()
#### Maoyan has anti-scraping measures, so the request must always include headers.