import requests
from bs4 import BeautifulSoup
def get_page():
    """Scrape the Douban Top 250 titles, print them, and append them to a file.

    Requests the ten listing pages (selected with ``start=0, 25, ..., 225``),
    takes the stripped text of the first ``<span>`` inside the link of every
    ``<div class="hd">`` as a movie title, prints the collected list, and
    appends one ``rank:title`` line per movie to ``movies.txt`` (UTF-8) in the
    current working directory.

    Returns:
        None.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when a request exceeds the timeout.
        AttributeError: if a ``div.hd`` element lacks the expected
            ``<a><span>`` structure.
        OSError: if ``movies.txt`` cannot be opened or written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'
    }
    page_count = 10
    page_size = 25
    # requests applies no timeout by default; without one a stalled
    # connection would block the script indefinitely.
    timeout_seconds = 10

    movies = []
    for page in range(page_count):
        url = f'https://movie.douban.com/top250?start={page * page_size}'
        response = requests.get(url, headers=headers, timeout=timeout_seconds)
        soup = BeautifulSoup(response.text, 'lxml')
        movies.extend(
            item.a.span.text.strip()
            for item in soup.find_all('div', class_='hd')
        )
    print(movies)

    # Open the output once and let the context manager close it; opening it
    # per title leaked one file handle for every movie written.
    # Append mode: an existing movies.txt is extended, not overwritten.
    with open("movies.txt", 'a', encoding='utf-8') as file_handle:
        file_handle.writelines(
            f"{rank}:{title}\n" for rank, title in enumerate(movies, start=1)
        )
# Module-level entry point: the scrape runs whenever this file is executed
# (and also if it is imported, since the call is not guarded).
get_page()
# Python 爬虫入门:豆瓣 Top 250 的抓取
# 最新推荐文章于 2020-08-24 11:09:27 发布