Python 爬取豆瓣电影Top250(一)

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Set up MongoDB storage: a client for the local server, the 'douban'
# database, and the two collections used by the scraper.
client = pymongo.MongoClient(host='localhost', port=27017)
douban = client.get_database('douban')
url_list = douban.get_collection('url_list')
# Note: the Python name is item_list, but the collection is 'item_info'.
item_list = douban.get_collection('item_info')


# One URL per index page: start offsets 0, 25, ..., 225 (10 pages).
start_url = [
    'https://movie.douban.com/top250?start={}&filter='.format(offset)
    for offset in range(0, 250, 25)
]
# URL of the first index page, used for a single-page trial run.
urlone = 'https://movie.douban.com/top250?start=0&filter='
# Collect the per-film info on one index page
# (title, link, score, rating count, one-line review).
def _text_of(parent, selector):
    """Return the text of the first match of *selector* under *parent*, or ''."""
    node = parent.select_one(selector)
    return node.get_text() if node is not None else ''


def _primary_title(info):
    """Return the first title under *info* that is not an irregular entry.

    An entry is treated as irregular when its second character is '/'
    (presumably an alternate title prefixed with a separator -- verify
    against the live markup). Slicing with ``[1:2]`` instead of indexing
    with ``[1]`` keeps a one-character title from raising IndexError.
    Returns None when no usable title is found.
    """
    for span in info.select('div.hd > a > span.title'):
        text = span.get_text()
        if text[1:2] != '/':
            return text
    return None


def get_index_url(url):
    """Fetch one Top250 index page and print one record per film.

    Each record is a dict with the keys ``title``, ``link``, ``score``,
    ``comments_count`` and ``review_one``. Every field is looked up inside
    the film's own ``div.info`` container, so a film that lacks an element
    (for example the one-line review) gets an empty string for that field
    instead of shifting the values of all following films, which is what
    zipping independently selected page-wide lists would do.

    Args:
        url: address of the index page to download.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-success HTTP status.
    """
    # Without a timeout requests may wait forever on a stalled connection.
    wb_data = requests.get(url, timeout=10)
    # Fail loudly on an error status instead of parsing an error page.
    wb_data.raise_for_status()
    soup = BeautifulSoup(wb_data.text, 'lxml')

    for info in soup.select('div.info'):
        link = info.select_one('div.hd > a')
        title = _primary_title(info)
        if link is None or title is None:
            # Not a film entry we can identify; skip it.
            continue
        data = {
            'title': title,
            'link': link.get('href'),
            'score': _text_of(info, 'div.bd > div > span.rating_num'),
            'comments_count': _text_of(info, 'div.bd > div > span:nth-of-type(4)'),
            'review_one': _text_of(info, 'p.quote > span'),
        }
        print(data)
# Trial run on the first index page only; as a top-level statement this
# executes whenever the module is run or imported.
get_index_url(urlone)
# Disabled: crawl every index page instead of just the first one.
# for i in start_url:
#     get_index_url(i)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值