# HTMLParser
from html.parser import HTMLParser
class MyParser(HTMLParser):
    """Skeleton ``HTMLParser`` subclass that overrides every handler hook.

    All hooks are no-ops, so feeding markup to this parser produces no
    output and stores no state of its own. It serves as a template that
    lists the callbacks a concrete parser can fill in.
    """

    def __init__(self):
        # Zero-argument super() matches the call style already used in
        # handle_startendtag below.
        super().__init__()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<br/>``.

        Defers to the base-class implementation.
        """
        super().handle_startendtag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag; ``attrs`` is a list of (name, value) pairs."""

    def handle_endtag(self, tag):
        """Handle a closing tag."""

    def handle_charref(self, name):
        """Handle a numeric character reference."""

    def handle_data(self, data):
        """Handle text found between tags."""

    def handle_comment(self, data):
        """Handle the contents of an HTML comment."""

    def handle_decl(self, decl):
        """Handle a declaration such as a doctype."""

    def handle_entityref(self, name):
        """Handle a named character reference."""

    def handle_pi(self, data):
        """Handle a processing instruction."""
# Douban movie content scraper (Douban电影内容爬取)
import requests
from html.parser import HTMLParser
class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` tags that carry ``data-*`` attributes.

    Every ``<li>`` start tag with a non-empty ``data-title`` attribute adds
    one dict to ``self.moives`` with the keys ``title``, ``score``,
    ``director`` and ``actors`` (in that order).
    """

    def __init__(self):
        super().__init__()
        # The misspelled name is kept on purpose: code outside this class
        # reads ``moives`` directly. Use the ``movies`` alias in new code.
        self.moives = []

    @property
    def movies(self):
        """Correctly spelled read-only alias for ``self.moives``."""
        return self.moives

    @staticmethod
    def _first_attr(attr_list, attr_name):
        """Return the value of the first attribute named *attr_name*.

        :param attr_list: list of (name, value) pairs from the parser
        :param attr_name: attribute name to look up
        :return: the attribute value, or None when the name is absent
        """
        for name, value in attr_list:
            if name == attr_name:
                return value
        return None

    def handle_starttag(self, tag, attrs):
        """Record a movie when *tag* is an ``<li>`` with a ``data-title``."""
        # Ignore every tag that is not a titled <li>; an empty or
        # valueless data-title counts as missing.
        if tag != 'li' or not self._first_attr(attrs, 'data-title'):
            return

        score = self._first_attr(attrs, 'data-score')
        self.moives.append({
            'title': self._first_attr(attrs, 'data-title'),
            # A missing score is stored as the string "None", not the
            # None object.
            'score': "None" if score is None else score,
            'director': self._first_attr(attrs, 'data-director'),
            'actors': self._first_attr(attrs, 'data-actors'),
        })

    def error(self, message):
        """Ignore parser error notifications."""
        pass
def my_movies(url, timeout=10):
    """Download *url* and extract the movie records it contains.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up;
        without it a stalled connection would block forever
    :return: list of movie dicts collected by ``MovieParser``, or None
        when the HTTP request fails (a message is printed in that case)
    """
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; programming errors
        # and KeyboardInterrupt are no longer silently swallowed.
        print('发生异常')
        return None

    mp = MovieParser()
    mp.feed(response.text)
    # close() flushes any buffered input, so call it before reading results.
    mp.close()
    return mp.moives
def save_file(path, text):
    """Write *text* to *path* as UTF-8, truncating any existing file.

    :param path: destination file path
    :param text: string content to store
    :return: None
    """
    with open(path, 'w', encoding='UTF-8') as file:
        file.write(text)
if __name__ == '__main__':
    # Imported here so the script block is self-contained.
    import json

    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    movies = my_movies(url)
    # my_movies returns None when the download fails; skip the write in
    # that case so a previous good file is not overwritten.
    if movies is not None:
        # json.dumps escapes quotes inside values and emits null for
        # missing fields, which a str()/replace() round-trip cannot do.
        # ensure_ascii=False keeps Chinese text readable in the output.
        save_file("d:/upload/movies.json", json.dumps(movies, ensure_ascii=False))