
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# Set up the MongoDB handles: one client, the "douban" database and its two
# collections.
client = pymongo.MongoClient(host='localhost', port=27017)  # connect to the local server
douban = client['douban']
url_list, item_list = douban['url_list'], douban['item_info']

# First index page, followed by every index page (offsets 0, 25, ..., 225).
urlone = 'https://movie.douban.com/top250?start=0&filter='
start_url = [
    'https://movie.douban.com/top250?start={}&filter='.format(offset)
    for offset in range(0, 250, 25)
]
# Fetch one index page and collect each movie's details
# (title, link, score, rating count, one-line review).
def get_index_url(url):
    """Scrape one Top250 index page and print a record per movie.

    Each record is a dict with the keys ``title``, ``link``, ``score``,
    ``comments_count`` and ``review_one``.  Every field is looked up inside
    the movie's own ``div.info`` container, so a movie that lacks a one-line
    review gets an empty ``review_one`` instead of shifting every later
    review onto the wrong movie (which is what zipping page-wide selections
    did).

    Args:
        url: Address of the index page to download.

    Returns:
        The list of record dicts in page order; each one is also printed.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may wait on a stalled connection forever.
    wb_data = requests.get(url, timeout=10)
    # Fail loudly on an error page rather than silently parsing nothing.
    wb_data.raise_for_status()
    soup = BeautifulSoup(wb_data.text, 'lxml')

    def text_of(node):
        # A missing element becomes '' instead of raising AttributeError.
        return node.get_text() if node is not None else ''

    records = []
    for info in soup.select('div.info'):
        link = info.select_one('div.hd > a')
        if link is None:
            continue  # no title link: not a movie entry

        # The original filter dropped titles whose second character is '/'
        # (presumably alternate titles shaped like " / Name" -- confirm
        # against the live markup).  strip() + startswith() expresses the
        # same intent and also copes with empty or one-character titles,
        # where indexing [1] raised IndexError.
        title = next(
            (
                span.get_text()
                for span in link.select('span.title')
                if not span.get_text().strip().startswith('/')
            ),
            None,
        )
        if title is None:
            continue

        data = {
            'title': title,
            'link': link.get('href'),
            'score': text_of(info.select_one('div.bd > div > span.rating_num')),
            'comments_count': text_of(
                info.select_one('div.bd > div > span:nth-of-type(4)')
            ),
            'review_one': text_of(info.select_one('p.quote > span')),
        }
        print(data)
        records.append(data)
    return records
# Scrape only the first index page.
get_index_url(urlone)
# Disabled: scrape every index page listed in start_url instead.
# for i in start_url:
#     get_index_url(i)
Python 爬取豆瓣电影Top250(一)
最新推荐文章于 2025-08-15 09:35:14 发布