直接上代码。主要有两个函数:一个用于获取每部电影详情页的 URL,另一个用于处理电影详情页的数据。
import requests
from bs4 import BeautifulSoup
import time
# First page of the Douban Top 250 list; also used by get_url() as the base
# that each "next page" href is appended to.
start_url = 'https://movie.douban.com/top250'
# Module-level accumulator: get_url() appends every detail-page href it finds.
movie_url = []
# Too many connections get rejected, so the value is capped at 5.
# NOTE(review): the name assigned here is DEFAULT_RETRIES (a retry count), not
# a connection limit -- confirm this assignment actually has the intended
# effect on requests issued via requests.get().
requests.adapters.DEFAULT_RETRIES = 5
def get_url(url):
    """Collect detail-page links from one list page and return the next page URL.

    Fetches ``url``, appends the ``href`` of the first ``<a>`` inside every
    element with class ``info`` to the module-level ``movie_url`` list, then
    looks for an element carrying ``rel="next"`` to locate the following page.
    The resulting URL (or ``None``) is printed before being returned.

    Args:
        url: Address of the list page to fetch.

    Returns:
        The next page's URL resolved against the module-level ``start_url``,
        or ``None`` when the page has no ``rel="next"`` element or that
        element has no ``href`` attribute.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')

    for info in soup.find_all(class_='info'):
        anchor = info.find('a')
        if anchor is None:
            # An "info" block without a link has nothing to record; skip it
            # instead of failing on None.get(...).
            continue
        movie_url.append(anchor.get("href"))

    # Explicit None checks replace the former bare ``except:``, which hid
    # every error (including KeyboardInterrupt) just to detect the last page.
    next_tag = soup.find(attrs={'rel': 'next'})
    next_href = None if next_tag is None else next_tag.get("href")

    # urljoin yields the same result as plain concatenation for query-only
    # hrefs such as "?start=25", and also resolves path-relative or absolute
    # hrefs correctly.
    url = None if next_href is None else urljoin(start_url, next_href)

    print(url)
    return url
def get_movie_info(url):
try:
resp = requests.get(url)
except:
time.sleep(5)
resp = requests.get(url)
soup = BeautifulSoup(resp.text,'lxml')
score = soup.find(attrs={
'property':