Scraping Maoyan movie ratings with Python

```python
##########################
# author: Conan_ft
# Email: 634598660@qq.com
import requests
from lxml import etree

url = "https://maoyan.com/films"
headers = {
    'Referer': 'https://maoyan.com/films',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Cookie': 'uuid_n_v=v1; uuid=340E2D802BED11EB82D08318B644F2A4A83B9D90FA1F48BDA4C833A58466F5A9; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1605958336,1605959248; _lxsdk_cuid=175ea93803db-0165966f617246-4c3f2678-1fa400-175ea93803ec8; _lxsdk_s=175ea938040-f87-df9-32d%7C%7C9; _lxsdk=340E2D802BED11EB82D08318B644F2A4A83B9D90FA1F48BDA4C833A58466F5A9; __mta=209639085.1605958336700.1605959257465.1605960054434.4; _csrf=792cc35eeb4498786e5dd80b985d35ae4c0147ecfff3719d7a84a6fbd4f286b1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1605960054',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'
}

class Spider(object):
    def start_requests(self):
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        xml = etree.HTML(response.text)

        # Film titles and detail-page links
        film_tit_list = xml.xpath('//div[@class="channel-detail movie-item-title"]/a/text()')
        film_src_list = xml.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')

        # Maoyan splits each score into an integer part and a fraction part,
        # e.g. <i class="integer">9.</i><i class="fraction">0</i>.
        # (Films still marked "暂无评分" have no <i> children, which can
        # shift the zip alignment below.)
        a_score = xml.xpath('//div[@class="channel-detail channel-detail-orange"]/i[@class="integer"]/text()')
        b_score = xml.xpath('//div[@class="channel-detail channel-detail-orange"]/i[@class="fraction"]/text()')

        for tit_list, first_score, second_score, src_list in zip(film_tit_list, a_score, b_score, film_src_list):
            final_url = "https://maoyan.com" + src_list
            score = first_score + second_score  # e.g. "9." + "0" -> "9.0"
            print(tit_list, final_url, score)
            # self.next_file(tit_list, src_list)

spider = Spider()
spider.start_requests()
```
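The commented-out `self.next_file(tit_list, src_list)` call suggests a follow-up step that visits each film's detail page. A minimal sketch of what such a method could look like, assuming the same `headers` work on detail pages (only the name `next_file` comes from the original code; the body and file name are hypothetical):

```python
import time

def next_file(self, title, src):
    """Hypothetical helper (add to the Spider class): fetch one film's
    detail page and append its title and URL to a local file."""
    detail_url = "https://maoyan.com" + src
    response = requests.get(detail_url, headers=headers)
    response.encoding = 'utf-8'
    with open('films.txt', 'a', encoding='utf-8') as f:  # file name is illustrative
        f.write(title + '\t' + detail_url + '\n')
    time.sleep(1)  # pause briefly so detail-page requests stay polite
```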

**Note:** Maoyan has anti-scraping protection, so the request must include headers (in particular a valid Cookie and User-Agent).
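One way to tell whether the headers were accepted: a blocked client typically gets redirected to a verification page instead of the film list. A minimal sketch, assuming the redirect target URL contains `verify` (the exact behavior may vary):

```python
response = requests.get(url, headers=headers)
# If Maoyan rejects the client it usually redirects to a captcha/verification
# page; the "verify" substring check below is an assumption about that URL.
if response.status_code != 200 or "verify" in response.url:
    print("Blocked by anti-scraping check; refresh the Cookie in headers")
```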


Scraping Maoyan movie information with Python generally relies on web-crawling libraries such as BeautifulSoup or Scrapy. A simplified workflow:

1. **Install the required libraries**: you need `requests` and `beautifulsoup4`, and probably `lxml` as well, since these are the usual tools for fetching and parsing HTML.

```bash
pip install requests beautifulsoup4 lxml
```

2. **Write the crawler script**:
   - Use `requests.get()` to fetch the page source.
   - Parse the HTML and locate the sections containing each film's ranking, actors, genre, and score. This usually means finding specific HTML tags (such as `<div class="movie-info">...</div>`) and extracting the data inside them.

```python
import requests
from bs4 import BeautifulSoup

def get_movie_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Replace these placeholder CSS selectors with ones that match
    # the actual page structure
    rank_div = soup.select('.movie-ranking')              # ranking section
    actor_div = soup.select('.actor-info')                # actor information
    type_and_score_div = soup.select('.type-and-score')   # genre and score

    # Extract the data (further parsing may be needed)
    ranking = [div.text for div in rank_div]
    actors = [div.text for div in actor_div]
    types_and_scores = [(info['type'], info['score']) for info in type_and_score_div]

    return ranking, actors, types_and_scores

# Example URL; replace it with your actual target page
url = "https://maoyan.com/board/4"
rankings, actors, details = get_movie_info(url)
```

Note: in practice a site may throttle or block frequent requests, so follow the site's robots.txt rules and try to pace requests the way a human user would.
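To follow that pacing advice, one common pattern is a small wrapper that sleeps for a randomized interval before each fetch. A sketch (the delay bounds here are arbitrary choices, not values from this article):

```python
import random
import time

import requests

def polite_get(url, min_delay=1.0, max_delay=3.0, **kwargs):
    """Fetch a URL after a randomized pause, roughly mimicking human pacing."""
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, timeout=10, **kwargs)

# Usage: response = polite_get("https://maoyan.com/board/4", headers=headers)
```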