抓取首页:http://maoyan.com/board/4?offset=0
代码:
# -*- coding:utf-8 -*-
import requests
from requests.exceptions import RequestException
import re # 正则需要的包
import json # json.dumps需要的包
from multiprocessing import Pool #多线程
import time
import os # os.path模块
'''
按页面抓取网页html内容并返回
'''
def get_one_page(url, timeout=10):
    """Fetch one page of the Maoyan top-100 board and return its HTML text.

    :param url: full page URL, e.g. http://maoyan.com/board/4?offset=0
    :param timeout: seconds before the request is aborted (a request without
        a timeout can block forever; default keeps the old call signature).
    :return: the HTML body as str on HTTP 200, otherwise None (including on
        any network/request error).
    """
    # Browser-like User-Agent: the site rejects the default requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Best-effort scraper: a failed page yields None instead of raising.
        return None
'''
正则表达式匹配
'''
def parse_one_page(html):
    """Parse one board page's HTML and yield a dict per movie.

    Each yielded dict has keys: index, image, title, actor, time, score
    (all values are strings).

    :param html: page HTML as returned by get_one_page.
    """
    # Raw strings: the original non-raw '\d' is an invalid escape sequence
    # (SyntaxWarning on modern Python). re.S lets '.' also match newlines,
    # since each <dd> entry spans several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name">'
        r'<a.*?>(.*?)</a>.*?"star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # finditer streams matches instead of materializing the whole list.
    for match in pattern.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': star.strip()[3:],    # drop the leading "主演:" label
            'time': release.strip()[5:],  # drop the leading "上映时间:" label
            'score': integer + fraction,  # score split across two <i> tags
        }
# print(items)
'''
保存电影封面图片
'''
def save_image_file(url, path, timeout=10):
    """Download a movie cover image and write it to *path*.

    Silently does nothing when the HTTP status is not 200 (best-effort,
    consistent with get_one_page).

    :param url: image URL (the data-src value scraped from the page).
    :param path: local file path to write the binary content to.
    :param timeout: seconds before the request is aborted.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        # 'wb' for binary content; the with-block closes the file — the
        # original's explicit f.close() inside `with` was redundant.
        with open(path, 'wb') as f:
            f.write(response.content)
'''
功能:将电影信息保存至文件
json.dumps 序列化时对中文默认使用的ascii编码.想输出真正的中文需要指定ensure_ascii=False
'''
def write_to_file(content):
    """Append one movie record to result.txt as a single JSON line.

    ensure_ascii=False keeps Chinese characters readable in the output
    instead of \\uXXXX escapes; json.dumps defaults to ASCII otherwise.

    :param content: dict produced by parse_one_page.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        # The with-block closes the file; no explicit close needed
        # (the original called f.close() redundantly inside `with`).
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Scrape one board page: print, persist, and download covers.

    :param offset: pagination offset (0, 10, 20, ... per the site's URL
        scheme http://maoyan.com/board/4?offset=N).
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Request failed — skip this page instead of crashing the parser
        # (the original passed None into parse_one_page, raising TypeError).
        return
    # exist_ok avoids the check-then-create race: several pool workers may
    # try to create the directory at the same time.
    os.makedirs('covers', exist_ok=True)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        # Cover filename: zero-padded rank + movie title, e.g. 001霸王别姬.jpg
        save_image_file(item['image'],
                        'covers/' + '%03d' % int(item['index']) + item['title'] + '.jpg')
if __name__ == '__main__':
    time_start = time.time()
    # One task per page: offsets 0, 10, ..., 90 (the site's paging scheme).
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for worker processes to exit cleanly
    time_end = time.time()
    time_cost = time_end - time_start  # total scraping time in seconds
    print(time_cost)
页面规律:
http://maoyan.com/board/4?offset=0
http://maoyan.com/board/4?offset=10
http://maoyan.com/board/4?offset=20
……
html文本:
不使用线程池耗时:
使用线程池耗时:
抓取到的信息如下:
参考资料:
崔庆才 Python3爬虫
参考博客:
https://www.jianshu.com/p/8fa68aee0581