爬取用的库是requests和beautifulsoup,代码编写不难,主要是个别的细节处理需要注意
1、电影得分的处理
右键审查元素,可以看到分数的整数部分和小数部分分别放在不同的标签里。在beautifulsoup中,可以用.strings或者.stripped_strings取出所有文本片段,但这样取出来的内容是一个可迭代的生成器,只有转换成列表(或者用join拼接成字符串)才能看到结果
到网上搜罗了一圈终于找到解决办法,说到底还是基础知识不扎实,解决的办法是用 ''.join(score.strings) 把各个文本片段拼接成完整的分数
2、多进程
并行抓取要用到multiprocessing库的进程池Pool(严格来说是多进程而不是多线程),具体方法参见代码
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import time
import pymongo
from multiprocessing import Pool
# MongoDB handles: scraped documents are written to the `maoyan` collection
# of the `movie` database on the server at localhost:27017.
# NOTE(review): this client is created at import time, i.e. before the worker
# pool in the __main__ section starts its processes. PyMongo documents
# MongoClient as not fork-safe -- confirm whether each worker should open its
# own client instead.
client = pymongo.MongoClient('localhost',27017)
movie = client['movie']
maoyan = movie['maoyan']
# Debugging aid (disabled): dump every document already stored.
# for item in maoyan.find():
#     print(item)
def get_one_page(url, timeout=10):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status or when the request raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # server would block this worker indefinitely.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie entries from one board page and store them in MongoDB.

    Every entry becomes a document with the keys ``rank``, ``title``,
    ``star``, ``time`` and ``score`` that is inserted into the module-level
    ``maoyan`` collection; ``yes`` is printed after each insert.

    Args:
        html: Markup of a board page. ``None`` or an empty string (a failed
            download) is accepted and results in no work being done.
    """
    if not html:
        # get_one_page returns None on failure; BeautifulSoup cannot parse it.
        return

    soup = BeautifulSoup(html, 'lxml')
    ranks = soup.select('#app > div > div > div.main > dl > dd > i')
    titles = soup.select('div.movie-item-info p.name a')
    stars = soup.select('div.movie-item-info p.star')
    times = soup.select('div.movie-item-info p.releasetime')
    scores = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p')

    # The loop variable is named `release` (not `time`) so the imported
    # `time` module is not shadowed inside this function.
    for rank, title, star, release, score in zip(ranks, titles, stars, times, scores):
        data = {
            'rank': rank.text.strip(),
            'title': title.text.strip(),
            'star': star.text.strip(),
            'time': release.text.strip(),
            # The integer and fractional parts of the score live in separate
            # child tags; joining all text fragments rebuilds the full value.
            'score': ''.join(score.strings),
        }
        # Collection.insert is deprecated and was removed in PyMongo 4;
        # insert_one is the supported single-document call.
        maoyan.insert_one(data)
        print('yes')
def main(offset):
    """Fetch and store the board page that starts at *offset*.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals every failure with None; skip this page
        # instead of handing None to the parser.
        print('failed to fetch', url)
        return
    parse_one_page(html)
if __name__ == '__main__':
    # One task per board page: offsets 0, 10, ..., 90.
    offsets = [i * 10 for i in range(10)]
    # map() blocks until every page is processed; the context manager then
    # shuts the worker processes down (the pool was previously never closed).
    with Pool() as pool:
        pool.map(main, offsets)