# Author: donghuiya
# Scrape the Maoyan movie board (10 pages, offsets 0-90) in four steps:
#   1. Fetch the page source.
#   2. Extract the wanted fields with a regular expression.
#   3. Append the results to a file.
#   4. Use multiple processes to go faster.
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

BOARD_URL = 'http://maoyan.com/board/4?offset='
RESULT_FILE = 'result1.txt'
# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block its worker process forever.
REQUEST_TIMEOUT = 10

# One match per <dd> entry. Groups, in order: board index, title, star line,
# release-time line, integer part of the score, fractional part of the score.
# Compiled once at import time instead of on every parse call.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def get_one_page(url, timeout=REQUEST_TIMEOUT):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page source as returned by ``get_one_page``. ``None`` or an
            empty string yields nothing instead of raising ``TypeError``.

    Yields:
        Dicts with the keys ``index``, ``title``, ``actor``, ``time`` and
        ``score`` (all string values).
    """
    if not html:
        return
    for match in _MOVIE_PATTERN.finditer(html):
        index, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'title': title,
            # Drops the first 3 characters after stripping whitespace --
            # presumably a fixed label prefix on the star line; confirm
            # against the live page markup.
            'actor': star.strip()[3:],
            # Same idea: drops a 5-character prefix from the release line.
            'time': release.strip()[5:],
            # The score is split across two tags; join the two halves.
            'score': integer + fraction,
        }


def write_to_file(content):
    """Append *content* to the result file as one JSON document per line.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the output
    rather than emitting ``\\uXXXX`` escapes.
    """
    with open(RESULT_FILE, 'a', encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Download, parse, print and store the board page starting at *offset*.

    Pages that cannot be fetched are reported and skipped so that a single
    failed request does not abort the whole pool run.
    """
    url = BOARD_URL + str(offset)
    html = get_one_page(url)
    if html is None:
        print('Failed to fetch ' + url + ', skipping')
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # map() blocks until every page is done, so leaving the ``with`` block
    # (which terminates the pool) cannot cut any work short.
    # NOTE: workers append to the same file concurrently, so the order of
    # lines in the output is not guaranteed to follow the board ranking.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])