#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
import re
from requests.exceptions import RequestException
from json import dumps
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        The response text when the server answers with HTTP 200; None for any
        other status code or when requests raises a RequestException
        (timeouts included, as they are RequestException subclasses).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry matched in *html*.

    Args:
        html: Page markup as a string. None or an empty string (for example
            when the download failed) produces no items instead of raising.

    Yields:
        Dicts with the keys 'index', 'picurl', 'title', 'actor', 'times' and
        'score', all holding strings captured by the regular expression.
        'actor' has newlines removed and surrounding whitespace stripped;
        'score' is the integer and fraction captures concatenated.
    """
    if not html:
        # Nothing to parse; re.findall would raise TypeError on None.
        return
    pattern = re.compile(
        r'<dd>.*?"board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?/>.*?name"><a.*?>(.*?)</a>'
        r'</p>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # let '.' span the newlines inside each <dd> block
    )
    for index, picurl, title, actor, times, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'picurl': picurl,
            'title': title,
            'actor': actor.replace('\n', '').strip(),
            'times': times,
            'score': integer + fraction,
        }
def write_to_file(content):
    """Append *content* to 'movie.txt' as one JSON-encoded line.

    Non-ASCII characters are written as-is (ensure_ascii=False) and the file
    is opened in append mode with UTF-8 encoding.
    """
    line = dumps(content, ensure_ascii=False) + '\n'
    with open('movie.txt', 'a', encoding='utf8') as out:
        out.write(line)
def main(offset):
    """Fetch the board page at *offset* and append every parsed item to file.

    Args:
        offset: Value substituted into the ``offset`` query parameter of the
            board URL.

    If the page could not be downloaded (get_one_page returned None), the
    page is skipped rather than handing None to the parser.
    """
    url = "http://maoyan.com/board/4?offset=%s" % offset
    html = get_one_page(url)
    if html is None:
        # Download failed or returned a non-200 status; nothing to parse.
        return
    for item in parse_one_page(html):
        write_to_file(item)
if __name__ == '__main__':
    # Process ten pages: offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)
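# requests爬猫眼电影 -- 记录 (Notes: scraping the Maoyan movie board with requests)
# 最新推荐文章于 2024-11-21 23:18:22 发布 (Latest recommended article published 2024-11-21 23:18:22)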