from urllib import request
from multiprocessing import Pool,Manager
import functools
from http import cookiejar
import json,re
# Use a CookieJar so cookies set by the site persist across requests
# (handles session cookies / expiry automatically).
cookie = cookiejar.CookieJar()
handle_cookie=request.HTTPCookieProcessor(cookie)
opener = request.build_opener(handle_cookie)
# Add a browser User-Agent header as a simple anti-anti-scraping measure.
opener.addheaders =[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0')]
def write_to_file(dic, filename='maoyan.txt'):
    """Append one record to a JSON-lines file.

    Args:
        dic: mapping describing a movie record; serialized as a single
            JSON object per line so the file is easy to re-parse.
        filename: target file path. Defaults to 'maoyan.txt' so existing
            callers are unaffected; parameterized for reuse/testing.

    Returns:
        None.
    """
    # ensure_ascii=False keeps Chinese titles human-readable in the file.
    line = json.dumps(dic, ensure_ascii=False)
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def get_file(html):
    """Parse a Maoyan board page and yield one dict per movie.

    Args:
        html: decoded page source text.

    Yields:
        dict with keys 'title', 'autor' (historical misspelling of the
        star/actor field — kept for backward compatibility, since files
        already written use this key) and 'time' (release time).
    """
    # Raw strings avoid invalid-escape SyntaxWarnings on modern Python;
    # the pattern's value is byte-identical to before.  Note the literal
    # space before <p class="star"> — the target markup contains it.
    pattern = re.compile(r'<p class="name">[\s\S]*?title="([\s\S]*?)"[\s\S]*? '
                         r'<p class="star">([\s\S]*?)</p>[\s\S]*?<p class="releasetime">([\s\S]*?)</p>')
    # Call findall on the compiled pattern directly instead of the
    # redundant re.findall(pattern, html).
    for title, star, release in pattern.findall(html):
        yield {'title': title,
               'autor': star.strip(),
               'time': release,
               }
def get_html(lock,offset):
    """Fetch one board page, parse it, and append its records to the file.

    Args:
        lock: a Manager lock shared across pool workers, serializing file
            writes so lines from different processes don't interleave.
        offset: paging offset appended to the board URL.

    Returns:
        None.  Returns early (writing nothing) when the HTTP status is
        not 200.
    """
    url = 'http://maoyan.com/board/4?offset='+str(offset)
    req = request.Request(url)
    # BUG FIX: the original called opener.open(req) twice — once for the
    # status check and again for the body — issuing every HTTP request
    # two times.  Open once and reuse the response; the with-block also
    # guarantees the connection is closed.
    with opener.open(req) as resp:
        if resp.code != 200:
            return None
        html = resp.read().decode()
    for record in get_file(html):
        # 'with lock' releases the lock even if write_to_file raises,
        # unlike the original bare acquire()/release() pair.
        with lock:
            write_to_file(record)
if __name__ == '__main__':
    # A Manager-backed lock is picklable, so it can be shared with the
    # worker processes spawned by the pool.
    manager = Manager()
    shared_lock = manager.Lock()
    # Pre-bind the lock as the first argument so pool.map only has to
    # supply the paging offsets.
    fetch_page = functools.partial(get_html, shared_lock)
    # Board pagination: offsets 0, 10, ..., 90 (ten pages of ten movies).
    offsets = [page * 10 for page in range(10)]
    # Fan the pages out across a pool of worker processes.
    worker_pool = Pool()
    worker_pool.map(fetch_page, offsets)
    # No more work to submit; wait for the workers to finish.
    worker_pool.close()
    worker_pool.join()
# Scrape Maoyan top-board data using a process pool.
# (Blog footer from the original paste: latest recommended article published 2023-03-13 12:02:05.)