"""Scrape the Maoyan movies TOP100 board and save each movie's title, starring actors, and release time to a MySQL database."""
import pymysql
import re
from urllib import request
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor
# Entry URL of the Maoyan TOP100 board (first page; later pages add ?offset=N).
url = 'http://maoyan.com/board/4'
# Shared accumulator of (title, actors, release-time) tuples, filled by worker threads.
films_info = []
def get_all_url(url, page):
    """Return the URLs of the first *page* board pages.

    The first entry is the bare board URL; every following entry appends
    an ``?offset=`` query that grows in steps of 10 (10, 20, ...).
    """
    return [url] + [url + '?offset=%d' % (n * 10) for n in range(1, page)]
def get_info(url):
    """Download *url* and return the response body decoded as UTF-8.

    A minimal User-Agent header is sent so the request is not rejected
    as a bare urllib client.
    """
    req = request.Request(url, headers={'User-agent': 'FireFox/12.0'})
    with urlopen(req) as resp:
        body = resp.read()
    return body.decode('utf-8')
def get_content(url):
    """Fetch one board page and collect its movie records.

    Extends the module-level ``films_info`` list with one
    (title, actors, release-time) tuple per movie found on the page.
    """
    page_html = get_info(url)
    movie_re = re.compile(
        r'<a href="\/films\/.*" title="(.*)" '
        r'data-act="boarditem-click" data-val="\{movieId:.*\}'
        r'">.*<\/a><\/p>\s*<p class="star">\s*(主演:.*)\s*<\/p>\s*'
        r'<p class="releasetime">(上映时间:.*)<\/p> '
    )
    films_info.extend(movie_re.findall(page_html))
def main():
    """Fetch all TOP100 pages concurrently, then persist the results to MySQL.

    Scraped rows come from the module-level ``films_info`` list, which the
    worker threads fill in before the database phase starts (the executor's
    ``with`` block joins all workers on exit).
    """
    page_urls = get_all_url(url, 10)
    # Network-bound work: threads overlap the blocking downloads.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(get_content, page_urls)

    # NOTE(review): credentials are hard-coded; move to config/env for real use.
    conn = pymysql.connect(host='localhost', user='root', passwd='wl1009',
                           db='homework', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # IF NOT EXISTS keeps repeated runs from failing on table creation.
            cur.execute('create table if not exists films '
                        '(movie varchar(20),actor varchar(50),time varchar(40))')
            insert_sql = 'insert into films values(%s,%s,%s)'
            cur.executemany(insert_sql, films_info)
            cur.execute('select * from films')
            print(cur.fetchall())
        except Exception as e:
            # Best-effort script: report the failure and undo the partial insert.
            conn.rollback()
            print(e)
        else:
            conn.commit()
            print('信息存储成功...')
        finally:
            cur.close()
    finally:
        # Always release the connection, even when an error was reported above
        # (the original closed it only on the success path, leaking on errors).
        conn.close()


if __name__ == '__main__':
    main()
