First Attempt at a Web Crawler

 

#-*- coding:utf-8 -*-

import urllib.request
import re

class Spider:
    def __init__(self, page=""):
        self.page = page       # page number to crawl; "" means the landing page
        self.switch = True     # keeps the interactive crawl loop running


    def loadPage(self, page):
        """Download one listing page and extract the text of every <div class="desc"> block."""
        self.page = page
        if self.page == "":
            url = "http://www.neihanpa.com/article/"
        else:
            url = "http://www.neihanpa.com/article/index_" + str(self.page) + ".html"

        # Use a browser-style User-Agent so the request is not rejected as a bot
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode()
        # re.S lets "." match newlines, so descriptions spanning several lines are captured
        pattern = re.compile(r'<div\sclass="desc">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.writePage(content_list)

    def writePage(self, content_list):
        """Print each extracted item and append it to dunzi.txt."""
        for item in content_list:
            print(item)
            with open("dunzi.txt", "a", encoding="utf-8") as f:
                f.write(item + "\n")


    def startWork(self):
        """Interactive loop: ask for a page number, crawl it, and repeat until the user quits."""
        while self.switch:
            page = input("Enter a page number: ")
            self.loadPage(page)
            command = input("Press Enter to keep crawling, or type quit to exit: ")
            if command == "quit":
                self.switch = False
        print("Thanks for using the spider")




if __name__ == "__main__":
    dz = Spider()
    dz.startWork()
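
To sanity-check the extraction step without hitting the live site (neihanpa.com may no longer serve this markup), here is a minimal offline sketch that runs the same regular expression against a hand-written HTML snippet; the sample_html string below is invented purely for illustration.

import re

# sample_html is a made-up snippet for illustration only; it mimics the
# <div class="desc">...</div> blocks that loadPage() expects to find.
sample_html = """
<div class="desc">first joke text</div>
<div class="desc">second joke text,
spanning two lines</div>
"""

pattern = re.compile(r'<div\sclass="desc">(.*?)</div>', re.S)
for item in pattern.findall(sample_html):
    print(item.strip())

Running this prints both captured descriptions; the second one only matches because re.S allows "." to cross the line break inside the div.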