from urllib import request
from urllib import parse
import time
import re
import pymysql


class MaoyanSpider(object):
    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
        # counter for the number of pages crawled
        self.page = 1
        # create two objects: the database connection and the cursor
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='spider', charset='utf8'
        )
        self.cursor = self.db.cursor()

    # fetch a page
    def get_page(self, url):
        req = request.Request(url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # hand the HTML straight to the parser
        self.parse_page(html)

    # parse a page
    def parse_page(self, html):
        # regex-based extraction of title, cast and release time
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # r_list: [('霸王别姬', '主演：张国荣...', '上映时间：1993-01-01'), (), ()]
        self.write_page(r_list)

    # save the data to the MySQL database
    def write_page(self, r_list):
        ins = 'insert into film(name,star,time) \
               values(%s,%s,%s)'
        for rt in r_list:
            film_list = [
                rt[0].strip(),
                rt[1].strip(),
                # drop the '上映时间：' prefix and keep the 10-character date
                rt[2].strip()[5:15]
            ]
            self.cursor.execute(ins, film_list)
            # commit after each insert
            self.db.commit()

    # main entry point
    def main(self):
        # range() generates the values of the offset query parameter
        for offset in range(0, 41, 10):
            url = self.baseurl + str(offset)
            self.get_page(url)
            print('Page %d crawled successfully' % self.page)
            self.page += 1
            time.sleep(1)
        # close the cursor and connection only after all pages are crawled
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.main()
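Note that write_page() assumes a film table already exists in the spider database. A minimal one-off setup sketch is shown below; only the column names (name, star, time) come from the insert statement above, while the column types and lengths are assumptions you may want to adjust:

# one-off setup: create the table that write_page() inserts into
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='spider', charset='utf8')
cursor = db.cursor()
# column types/lengths are guesses, not taken from the original post
cursor.execute('''
    create table if not exists film(
        name varchar(100),
        star varchar(300),
        time varchar(50)
    ) character set utf8
''')
cursor.close()
db.close()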
I've been thinking about changing jobs lately but don't really know the market, so I did a simple crawl of the 51job site as well; next up is crawling the second-level (detail) pages and doing some data analysis!
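The 51job code itself isn't included here yet, but fetching a listing page would follow the same urllib pattern as the Maoyan spider. Below is a rough, non-authoritative sketch; the search URL format, the GBK decoding, and the get_job_page name are assumptions for illustration and should be checked against the live site:

# hedged sketch of a 51job listing-page fetch (URL pattern is an assumption)
from urllib import request, parse

def get_job_page(keyword, page):
    # hypothetical search URL -- verify the real one in your browser first
    url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
           '%s,2,%d.html' % (parse.quote(keyword), page))
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    # 51job pages have historically been GBK-encoded; adjust if that changes
    return res.read().decode('gbk', errors='ignore')

# usage sketch: fetch the first results page for the keyword 'python'
# html = get_job_page('python', 1)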