# I used to write scrapers with BeautifulSoup; this time I learned regular expressions and tried them on Maoyan's Top-100 movie chart.
import re
import requests
import json
def get_one_page(url):
    """Fetch *url* and return its HTML text, or None on a non-200 response.

    A browser-like User-Agent header is sent (presumably the site rejects the
    default requests UA — confirm against the target server).  A timeout is
    set so a stalled connection cannot hang the whole scrape.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code == 200:
        return res.text
    return None
def get_information(html):
    """Yield one dict per movie entry found in a Maoyan board page's HTML.

    Each yielded dict has the keys: 'index' (rank), 'title', 'imgSrc'
    (poster URL), 'star' (cast, label prefix stripped) and 'releaseTime'
    (release date, label prefix stripped).
    """
    # Raw string so regex escapes such as \s are not treated as (invalid)
    # Python string escapes; re.S lets .*? span newlines between tags.
    pattern = re.compile(
        r'<dd>.*?<i.*?board-index.*?>(.*?)</i>.*?<a.*?title="(.*?)"'
        r'.*?img\sdata-src="(.*?)".*?</a>.*?<p.*?star.*?>(.*?)</p>'
        r'.*?<p.*?releasetime.*?>(.*?)</p>',
        re.S)
    for match in pattern.finditer(html):
        index, title, img_src, star, release = match.groups()
        star = star.strip()
        release = release.strip()
        yield {
            'index': index,
            'title': title,
            'imgSrc': img_src.strip(),
            # Slices drop the 3-char "主演:" and 5-char "上映时间:" labels
            # that precede the actual values on the page.
            'star': star[3:] if len(star) > 3 else '',
            'releaseTime': release[5:] if len(release) > 5 else ''
        }
def write_to_json(content, path='data.txt'):
    """Append *content* to *path* as one JSON object per line (JSON Lines).

    The original wrote records back-to-back with no separator, producing a
    file that no JSON parser can read back; the trailing newline makes every
    record independently parseable.  *path* defaults to the original
    'data.txt' so existing callers are unchanged.  ensure_ascii=False keeps
    Chinese titles human-readable in the file.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Scrape all ten pages (offset 0..90) of the Top-100 board to data.txt."""
    base_url = 'http://maoyan.com/board/4?offset='
    for offset in range(0, 100, 10):
        html = get_one_page(base_url + str(offset))
        if html is None:
            # get_one_page returns None on a non-200 response; skip the page
            # instead of crashing re.findall on a None argument.
            continue
        for item in get_information(html):
            write_to_json(item)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == '__main__':
    main()