# Parse web pages with regular expressions
# Maoyan Movies
import json
import re
from multiprocessing.pool import Pool
import requests
from requests.exceptions import RequestException
#请求一个页面返回响应内容
def get_one_page(url):
try:
response =requests.get(url)
if response.status_code==200:
return response.text
return None
except RequestException:
return None
#解析网页
def parse_one_page(html):
pattern=re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
+'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
+'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',re.S) #不要出现多余的空格
items = re.findall(pattern,html)
print(items)
for item in items:
yield{
'number':item[0],
'picture':item[1],
'title':item[2],
'actors':item[3].strip()[3:],
# 'time':item[4].strip()[5:],
'time': get_release_time(item[4].strip()[5:]),
'area': get_release_area(item[4].strip()[5:]),
'score':item[5]+item[6]
}
def main(offset):
url='http://maoyan.com/board/4?offset='+str(offset)
html=get_one_page(url)
for item in parse_one_page(html):
print(item)
if __name__=='__main__':
pool = Pool()
pool.map(main, [i * 10 for i in range(10)])