爬虫模板开发

做一个 PyQt5 图形界面来方便地爬取数据,页面内容用正则表达式(re 模块)解析。

项目上传git地址:https://github.com/sqhl/Spider

1.pyqt5界面:
 

class Ui_Form(object):
    """Crawler control panel: builds the widgets and wires the three buttons.

    Widgets created by ``setupUi`` (captions are set in ``retranslateUi``):

    * ``lineEdit``      - output folder, read-only, filled by the folder dialog
    * ``lineEdit_2``    - start URL typed by the user
    * ``checkBox``      - "forge" option
    * ``checkBox_2..5`` - article / image / link / custom crawl options
    * ``lineEdit_3..6`` - one free-text field under each crawl option
    * ``textBrowser``   - log output
    * ``pushButton``    - choose folder, ``pushButton_2`` - start,
      ``pushButton_3`` - stop (disabled until a crawl is running)
    """

    def setupUi(self, Form):
        """Create every child widget on ``Form`` and hook up the signals."""
        Form.setObjectName("Form")
        Form.setEnabled(True)
        Form.resize(762, 487)

        # Row 1: output folder (label, read-only field, chooser button, hint).
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(70, 20, 91, 31))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(Form)
        self.lineEdit.setEnabled(False)
        self.lineEdit.setGeometry(QtCore.QRect(170, 20, 271, 31))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(Form)
        self.pushButton.setGeometry(QtCore.QRect(450, 20, 71, 31))
        self.pushButton.setObjectName("pushButton")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(530, 30, 231, 16))
        self.label_2.setObjectName("label_2")

        # Row 2: start URL and the "forge" option.
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(80, 60, 81, 31))
        self.label_3.setObjectName("label_3")
        self.lineEdit_2 = QtWidgets.QLineEdit(Form)
        self.lineEdit_2.setGeometry(QtCore.QRect(170, 60, 271, 31))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.checkBox = QtWidgets.QCheckBox(Form)
        self.checkBox.setGeometry(QtCore.QRect(450, 70, 91, 19))
        self.checkBox.setObjectName("checkBox")

        # Crawl options: article / image / link / custom.
        self.checkBox_2 = QtWidgets.QCheckBox(Form)
        self.checkBox_2.setGeometry(QtCore.QRect(170, 110, 91, 19))
        self.checkBox_2.setObjectName("checkBox_2")
        self.checkBox_3 = QtWidgets.QCheckBox(Form)
        self.checkBox_3.setGeometry(QtCore.QRect(400, 110, 91, 19))
        self.checkBox_3.setObjectName("checkBox_3")
        self.checkBox_4 = QtWidgets.QCheckBox(Form)
        self.checkBox_4.setGeometry(QtCore.QRect(170, 180, 91, 19))
        self.checkBox_4.setObjectName("checkBox_4")
        self.checkBox_5 = QtWidgets.QCheckBox(Form)
        self.checkBox_5.setGeometry(QtCore.QRect(400, 180, 91, 19))
        self.checkBox_5.setObjectName("checkBox_5")

        # One free-text field under each crawl option.
        self.lineEdit_3 = QtWidgets.QLineEdit(Form)
        self.lineEdit_3.setGeometry(QtCore.QRect(170, 130, 211, 31))
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.lineEdit_4 = QtWidgets.QLineEdit(Form)
        self.lineEdit_4.setGeometry(QtCore.QRect(400, 130, 211, 31))
        self.lineEdit_4.setObjectName("lineEdit_4")
        self.lineEdit_5 = QtWidgets.QLineEdit(Form)
        self.lineEdit_5.setGeometry(QtCore.QRect(170, 200, 211, 31))
        self.lineEdit_5.setObjectName("lineEdit_5")
        self.lineEdit_6 = QtWidgets.QLineEdit(Form)
        self.lineEdit_6.setGeometry(QtCore.QRect(400, 200, 211, 31))
        self.lineEdit_6.setObjectName("lineEdit_6")

        # Log area and the start / stop buttons.
        self.textBrowser = QtWidgets.QTextBrowser(Form)
        self.textBrowser.setGeometry(QtCore.QRect(170, 240, 441, 192))
        self.textBrowser.setObjectName("textBrowser")
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(240, 440, 93, 28))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(Form)
        self.pushButton_3.setGeometry(QtCore.QRect(410, 440, 93, 28))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_3.setEnabled(False)

        self.retranslateUi(Form)
        self._connect_signals(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set all user-visible captions; safe to call more than once."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "Form"))
        self.label.setText(_translate("Form", "选择文件夹 :"))
        self.pushButton.setText(_translate("Form", "选择"))
        self.label_2.setText(_translate("Form", "(默认为当前目录下的Data文件夹)"))
        self.label_3.setText(_translate("Form", "输入网址 :"))
        self.checkBox.setText(_translate("Form", "伪造"))
        self.checkBox_2.setText(_translate("Form", "文章"))
        self.checkBox_3.setText(_translate("Form", "图片"))
        self.checkBox_4.setText(_translate("Form", "链接"))
        self.checkBox_5.setText(_translate("Form", "自定义"))
        self.pushButton_2.setText(_translate("Form", "start"))
        self.pushButton_3.setText(_translate("Form", "stop"))

    def _connect_signals(self, Form):
        """Connect the button signals exactly once.

        Kept out of ``retranslateUi`` so that re-translating the form does not
        stack duplicate connections (which would fire each slot several times).
        """
        self.pushButton.clicked.connect(lambda: self.msg(Form))
        self.pushButton_2.clicked.connect(lambda: self.start(Form))
        self.pushButton_3.clicked.connect(lambda: self.stop(Form))

    def msg(self, Form):
        """Let the user pick the output folder and show it in ``lineEdit``."""
        directory1 = QFileDialog.getExistingDirectory(Form, "选取文件夹", "./Data")
        # The dialog returns an empty string when cancelled; keep the previous
        # choice instead of wiping it.
        if directory1:
            self.lineEdit.setText(directory1)

    def start(self, Form):
        """Start a crawl in a background ``Spider`` thread."""
        previous = getattr(self, "Spider", None)
        if previous is not None and previous.isRunning():
            # Replacing the reference would drop a QThread that is still
            # running. Refuse, and make sure the user is able to stop it.
            self.textBrowser.append("爬虫正在运行,请先停止!")
            self.pushButton_3.setEnabled(True)
            return
        self.textBrowser.append("爬虫开始工作....")
        self.Spider = Spider(self)
        self.Spider.start()
        if self.checkBox_4.checkState():
            self.pushButton_3.setEnabled(True)

    def stop(self, Form):
        """Stop the running crawl, if any, and disable the stop button."""
        crawler = getattr(self, "Spider", None)
        if crawler is not None:
            crawler.stop()
            self.textBrowser.append("停止继续爬取!!!")
        self.pushButton_3.setEnabled(False)
class Spider(QThread):
    """Worker thread that keeps feeding queued URLs to a ``spider`` crawler.

    ``index`` is the UI object; it is handed to every crawler call, which reads
    the option widgets from it and appends log lines to its text browser.

    NOTE(review): those crawler calls touch ``index`` widgets from this worker
    thread. Qt widgets are normally meant to be used from the GUI thread only;
    consider reporting progress through signals instead.
    """

    def __init__(self, index):
        super().__init__()
        self.index = index
        self.spider = spider()
        self.spider.spider_init(index)
        self.flag = True  # cleared by stop() to end the loop in run()

    def run(self):
        """Crawl queued URLs until the queue is empty or ``stop`` is called."""
        while self.flag:
            # ``c`` is the crawler's pending-URL list. It is only refilled from
            # inside spider_next_href(), so once it is empty (or was never
            # created because the start URL was rejected) nothing is left to
            # do. Leave the loop instead of spinning at full CPU.
            if not getattr(self.spider, "c", None):
                break
            self.spider.spider_next_href(self.index)

    def stop(self):
        """Ask the crawl loop to finish, then save the collected links."""
        # Clear the flag first so the worker stops after its current page, then
        # give it up to two seconds to get there before saving.
        self.flag = False
        self.wait(2000)
        # No output directory exists when the start URL was rejected, so there
        # is nothing to save in that case.
        if getattr(self.spider, "dic", None):
            self.spider.save_href(self.index)

效果:

2.spider_next.py:
 

from urllib import  request
import re
import copy
import gevent
from gevent import monkey
import os
monkey.patch_all()
from datetime import datetime
from bs4 import BeautifulSoup
import random
class spider(object):
    """Regex-based page crawler driven by the options selected in the UI.

    Every method takes ``index``, the UI object. Only these members are used:
    ``lineEdit`` / ``lineEdit_2`` (``.text()``), ``checkBox`` ..
    ``checkBox_5`` (``.checkState()``) and ``textBrowser`` (``.append()``).

    State:
        b        all URLs known so far (start URL plus discovered links)
        c        URLs still waiting to be fetched (FIFO)
        image    image URLs already handled
        dic      dated output directory, or None when no valid URL was given
        new_dic  output directory of the page currently being processed
        html     decoded source of the page currently being processed
    """

    def __init__(self):
        # Defaults so every method is safe to call even when spider_init()
        # rejected the URL.
        self.b = []
        self.c = []
        self.image = []
        self.dic = None
        self.new_dic = None
        self.html = ""

    def spider_init(self, index):
        """Reset the crawl state from the UI and create the output directory."""
        url = index.lineEdit_2.text()
        self.b = []
        self.c = []
        self.image = []
        self.dic = None
        if url == "":
            index.textBrowser.append("url输入错误!")
            return
        self.b.append(url)
        self.c = list(self.b)
        self.dic = index.lineEdit.text()
        if not self.dic:
            self.dic = "./Data"
        self.dic += "/" + datetime.now().strftime('%Y-%m-%d')
        # makedirs also creates a missing parent such as ./Data on first run.
        os.makedirs(self.dic, exist_ok=True)

    def spider_next_href(self, index):
        """Fetch and process the next queued URL, if there is one."""
        if self.c:
            self.spider_html(self.c.pop(0), index)

    def spider_html(self, url, index):
        """Download ``url`` and run every enabled extractor on it."""
        try:
            index.textBrowser.append("开始爬取"+url)
            if index.checkBox.checkState():
                headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
                req = request.Request(url=url, headers=headers)
                req.add_header('Referer', 'http://www.mzitu.com/')
            else:
                req = request.Request(url)
            # Strip a trailing "/" so such URLs still get their own folder
            # instead of writing straight into the dated directory.
            self.new_dic = self.dic+"/"+url.rstrip("/").split("/")[-1]
            os.makedirs(self.new_dic, exist_ok=True)
            with request.urlopen(req) as resp:
                # Honour the charset announced by the server; fall back to
                # UTF-8 and never abort the page because of a stray byte.
                charset = resp.headers.get_content_charset() or "utf-8"
                self.html = resp.read().decode(charset, errors="replace")
            # Read each checkbox exactly once so a click during the crawl
            # cannot leave a spawned job un-joined.
            extractors = (
                (index.checkBox_2, self.spider_article),  # article
                (index.checkBox_3, self.spider_pic),      # images
                (index.checkBox_4, self.spider_href),     # links
                (index.checkBox_5, self.spider_custom),   # custom
            )
            jobs = [gevent.spawn(extract, index)
                    for box, extract in extractors if box.checkState()]
            gevent.joinall(jobs)
            index.textBrowser.append(url+"爬取结束......")
        except Exception as exc:
            index.textBrowser.append("url输入错误!")
            # Show the real cause as well; not every failure is a bad URL.
            index.textBrowser.append("错误详情: " + repr(exc))

    def spider_href(self, index): # links
        """Queue every not-yet-known link found in the current page."""
        req_href = r"(https://blog.youkuaiyun.com/by_side_with_sun/article/details/.+?)\""
        known = set(self.b)
        # dict.fromkeys de-duplicates while keeping page order, so the crawl
        # order is the same on every run.
        fresh = [href for href in dict.fromkeys(re.findall(req_href, self.html))
                 if href not in known]
        index.textBrowser.append("新增链接数:"+str(len(fresh)))
        self.b.extend(fresh)
        self.c.extend(fresh)
        index.textBrowser.append("剩余链接数:"+str(len(self.c)))

    def spider_pic(self, index): # images
        """Download every new ``<img src="http(s)://...">`` of the page."""
        index.textBrowser.append("爬取图片中....")
        # "https?" matches both schemes; the former "(http|https://...)" only
        # matched the bare word "http" or an https URL.
        req_pic = r'<img src="(https?://.*?)"'
        imgList = re.findall(req_pic, self.html)
        if imgList:
            img_dir = self.new_dic+"/img"
            os.makedirs(img_dir, exist_ok=True)
            x = 0
            for img in imgList:
                if img in self.image:
                    continue
                self.image.append(img)
                try:
                    # Download before opening the file so a failed request
                    # does not leave an empty .jpg behind.
                    with request.urlopen(img) as resp:
                        data = resp.read()
                    with open(img_dir+"/"+str(x)+".jpg", "wb") as f:
                        f.write(data)
                except Exception:
                    # Best effort: skip images that cannot be fetched or saved.
                    continue
                x += 1
            index.textBrowser.append("这一次砸门爬了:"+str(x)+"张图片.....")

    def spider_article(self, index): # article
        """Append the page title and tag-stripped ``<article>`` text to article.txt."""
        index.textBrowser.append("开始爬取文章....")
        req_title = r"<title>(.*?)</title>"
        titles = re.findall(req_title, self.html)
        # Pages without a <title> get an empty title instead of an IndexError.
        title = titles[0] if titles else ""
        req_text = r"<article>(.*?)</article>"
        text = "".join(re.findall(req_text, self.html, re.S))
        text = re.sub(r'<[^>]+>', '', text)
        with open(self.new_dic+"/article.txt","a", encoding="utf-8") as f:
            f.write((title+"\n"+text).replace(u'\xa0', u''))
        index.textBrowser.append("爬取--"+title+"--文章结束....")

    def spider_custom(self, index): # custom
        """Placeholder for a user-defined extractor."""
        print("custom")

    def save_href(self, index):
        """Append all known links to href.txt and log the totals."""
        if self.dic:
            with open(self.dic+"/href.txt","a", encoding="utf-8") as f:
                # One link per line, newline-terminated, so a later append
                # does not fuse with the last line of this one.
                f.writelines(href + "\n" for href in self.b)
        index.textBrowser.append("总计爬取"+str(len(self.b))+"条链接..........")
        index.textBrowser.append("总计爬取" + str(len(self.image)) + "张图片..........")
# Running this module directly only constructs a spider instance. No crawl is
# started here, because spider_init() and the other methods need the UI object
# passed in as ``index``.
if __name__ =="__main__":
    spider()

希望大家能在我的基础上做些修改,互相学习。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值