#-*- coding:utf-8 -*-
"""Interactive scraper for the article listing pages of neihanpa.com.

Each requested listing page is downloaded, the contents of every
``<div class="desc">`` element are extracted with a regular expression,
and each extracted snippet is printed and appended to ``dunzi.txt``.
"""
import re
import urllib.error
import urllib.request


class Spider:
    """Download listing pages and store the ``desc`` snippets they contain.

    Attributes:
        page: The page identifier most recently passed to ``loadPage``
            (``""`` selects the default listing page).
        switch: Loop flag for ``startWork``; the loop runs while it is true.
    """

    # Loop-invariant values are built once instead of on every request.
    _BASE_URL = "http://www.neihanpa.com/article/"
    _HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    # Raw string: "\s" in a plain literal is an invalid string escape.
    # re.S lets "." match newlines so multi-line <div> bodies are captured.
    _DESC_PATTERN = re.compile(r'<div\sclass="desc">(.*?)</div>', re.S)
    _OUTPUT_FILE = "dunzi.txt"

    def __init__(self, page=""):
        """Create a spider.

        Args:
            page: Initial page identifier; ``""`` means the default page.
        """
        self.page = page
        self.switch = True

    def loadPage(self, page):
        """Fetch one listing page and hand its snippets to ``writePage``.

        Args:
            page: Page identifier. ``""`` requests the article index itself;
                any other value requests ``index_<page>.html``.

        Raises:
            urllib.error.URLError: If the request fails (``HTTPError`` for
                non-success HTTP statuses is a subclass).
        """
        self.page = page
        if self.page == "":
            url = self._BASE_URL
        else:
            url = self._BASE_URL + "index_" + str(self.page) + ".html"

        request = urllib.request.Request(url, headers=self._HEADERS)
        # The context manager closes the HTTP response even if read/decode
        # raises, so the underlying connection is not leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode()

        conten_list = self._DESC_PATTERN.findall(html)
        self.writePage(conten_list)

    def writePage(self, conten_list):
        """Print every snippet and append it to ``dunzi.txt``, one per line.

        Args:
            conten_list: Iterable of strings to print and store.
        """
        # Nothing to write: return before opening so no empty file is created.
        if not conten_list:
            return
        # Open the output file once per batch rather than once per item.
        with open(self._OUTPUT_FILE, "a", encoding="utf-8") as f:
            for item in conten_list:
                print(item)
                f.write(item + "\n")

    def startWork(self):
        """Run the interactive loop: prompt for a page, scrape it, repeat.

        The loop ends when the user answers ``quit`` at the continue prompt.
        """
        while self.switch:
            # Strip stray whitespace so it does not end up inside the URL.
            page = input("请输入页码:").strip()
            try:
                self.loadPage(page)
            except urllib.error.URLError as err:
                # A failed request (e.g. 404 for a page that does not exist)
                # should not end the whole session; report it and carry on.
                print("Failed to load page:", err)
            command = input("如果继续爬取,请按回车(退出输入quit)")
            if command == "quit":
                self.switch = False
        print("Thanks for user")


if __name__ == "__main__":
    dz = Spider()
    dz.startWork()
爬虫初试
最新推荐文章于 2024-11-22 11:50:40 发布