# The code is as follows.
import re
import requests
import json
#from multiprocessing import Pool
# 多进程
#url = 'https://maoyan.com/board/4?offset=0'
#r = requests.get(url)
#r.encoding = r.apparent_encoding
#print(r.status_code)
#print(r.request.headers)
# Fetch a single page.
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The decoded page text, or ``None`` when the request fails (a
        failure notice is printed in that case).
    """
    # Send a browser-style User-Agent instead of the requests default.
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled server would block this call forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are swallowed; KeyboardInterrupt and
        # genuine programming errors still propagate to the caller.
        print("爬取失败")
        return None
    # Decode with the encoding detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text
# 提取需要的信息
def parse_one_page(html):
pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
items = re.findall(pattern, html)
for item in items:
yield {
'index':item[0],
'title':item[1],