用 PyQt5 做了一个图形界面来方便爬取数据,页面解析用的是正则表达式(re 模块)。
项目已上传到 GitHub,地址:https://github.com/sqhl/Spider
1. PyQt5 界面:
class Ui_Form(object):
    """Widget layout and button handlers for the crawler form.

    ``setupUi`` builds the widgets on the given form, ``retranslateUi`` sets
    their visible texts, and ``msg`` / ``start`` / ``stop`` are the handlers
    of the three push buttons.
    """

    def setupUi(self, Form):
        """Create all child widgets on ``Form`` and wire up the buttons."""
        Form.setObjectName("Form")
        Form.setEnabled(True)
        Form.resize(762, 487)

        # --- output folder row -------------------------------------------
        self.label = QtWidgets.QLabel(Form)
        self.label.setGeometry(QtCore.QRect(70, 20, 91, 31))
        self.label.setObjectName("label")
        # Disabled: the folder is filled in through the "choose" button only.
        self.lineEdit = QtWidgets.QLineEdit(Form)
        self.lineEdit.setEnabled(False)
        self.lineEdit.setGeometry(QtCore.QRect(170, 20, 271, 31))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(Form)
        self.pushButton.setGeometry(QtCore.QRect(450, 20, 71, 31))
        self.pushButton.setObjectName("pushButton")
        self.label_2 = QtWidgets.QLabel(Form)
        self.label_2.setGeometry(QtCore.QRect(530, 30, 231, 16))
        self.label_2.setObjectName("label_2")

        # --- URL row -----------------------------------------------------
        self.label_3 = QtWidgets.QLabel(Form)
        self.label_3.setGeometry(QtCore.QRect(80, 60, 81, 31))
        self.label_3.setObjectName("label_3")
        self.lineEdit_2 = QtWidgets.QLineEdit(Form)
        self.lineEdit_2.setGeometry(QtCore.QRect(170, 60, 271, 31))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.checkBox = QtWidgets.QCheckBox(Form)
        self.checkBox.setGeometry(QtCore.QRect(450, 70, 91, 19))
        self.checkBox.setObjectName("checkBox")

        # --- crawl-mode check boxes --------------------------------------
        self.checkBox_2 = QtWidgets.QCheckBox(Form)
        self.checkBox_2.setGeometry(QtCore.QRect(170, 110, 91, 19))
        self.checkBox_2.setObjectName("checkBox_2")
        self.checkBox_3 = QtWidgets.QCheckBox(Form)
        self.checkBox_3.setGeometry(QtCore.QRect(400, 110, 91, 19))
        self.checkBox_3.setObjectName("checkBox_3")
        self.checkBox_4 = QtWidgets.QCheckBox(Form)
        self.checkBox_4.setGeometry(QtCore.QRect(170, 180, 91, 19))
        self.checkBox_4.setObjectName("checkBox_4")
        self.checkBox_5 = QtWidgets.QCheckBox(Form)
        self.checkBox_5.setGeometry(QtCore.QRect(400, 180, 91, 19))
        self.checkBox_5.setObjectName("checkBox_5")

        # --- one line edit below each crawl-mode check box ---------------
        self.lineEdit_3 = QtWidgets.QLineEdit(Form)
        self.lineEdit_3.setGeometry(QtCore.QRect(170, 130, 211, 31))
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.lineEdit_4 = QtWidgets.QLineEdit(Form)
        self.lineEdit_4.setGeometry(QtCore.QRect(400, 130, 211, 31))
        self.lineEdit_4.setObjectName("lineEdit_4")
        self.lineEdit_5 = QtWidgets.QLineEdit(Form)
        self.lineEdit_5.setGeometry(QtCore.QRect(170, 200, 211, 31))
        self.lineEdit_5.setObjectName("lineEdit_5")
        self.lineEdit_6 = QtWidgets.QLineEdit(Form)
        self.lineEdit_6.setGeometry(QtCore.QRect(400, 200, 211, 31))
        self.lineEdit_6.setObjectName("lineEdit_6")

        # --- log view and start/stop buttons -----------------------------
        self.textBrowser = QtWidgets.QTextBrowser(Form)
        self.textBrowser.setGeometry(QtCore.QRect(170, 240, 441, 192))
        self.textBrowser.setObjectName("textBrowser")
        self.pushButton_2 = QtWidgets.QPushButton(Form)
        self.pushButton_2.setGeometry(QtCore.QRect(240, 440, 93, 28))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(Form)
        self.pushButton_3.setGeometry(QtCore.QRect(410, 440, 93, 28))
        self.pushButton_3.setObjectName("pushButton_3")
        # "stop" stays disabled until start() enables it.
        self.pushButton_3.setEnabled(False)

        self.retranslateUi(Form)

        # Connect the buttons exactly once, here rather than in
        # retranslateUi(): a second retranslateUi() call would otherwise add
        # a second connection and every click would run its handler twice.
        self.pushButton.clicked.connect(lambda: self.msg(Form))
        self.pushButton_2.clicked.connect(lambda: self.start(Form))
        self.pushButton_3.clicked.connect(lambda: self.stop(Form))

        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the window title and every visible widget text."""
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "Form"))
        self.label.setText(_translate("Form", "选择文件夹 :"))
        self.pushButton.setText(_translate("Form", "选择"))
        self.label_2.setText(_translate("Form", "(默认为当前目录下的Data文件夹)"))
        self.label_3.setText(_translate("Form", "输入网址 :"))
        self.checkBox.setText(_translate("Form", "伪造"))
        self.checkBox_2.setText(_translate("Form", "文章"))
        self.checkBox_3.setText(_translate("Form", "图片"))
        self.checkBox_4.setText(_translate("Form", "链接"))
        self.checkBox_5.setText(_translate("Form", "自定义"))
        self.pushButton_2.setText(_translate("Form", "start"))
        self.pushButton_3.setText(_translate("Form", "stop"))

    def msg(self, Form):
        """Ask for an output folder and show the choice in ``lineEdit``."""
        directory1 = QFileDialog.getExistingDirectory(Form, "选取文件夹", "./Data")
        # The dialog returns an empty string when it is cancelled; keep the
        # previously chosen folder instead of wiping it.
        if directory1:
            self.lineEdit.setText(directory1)

    def start(self, Form):
        """Create a ``Spider`` worker for the current form state and run it."""
        self.textBrowser.append("爬虫开始工作....")
        # NOTE(review): each click builds a new worker and rebinds
        # self.Spider; a worker created by an earlier click is not stopped
        # here.
        self.Spider = Spider(self)
        self.Spider.start()
        # The stop button is only offered when the link check box is ticked.
        if self.checkBox_4.checkState():
            self.pushButton_3.setEnabled(True)

    def stop(self, Form):
        """Stop the running worker and disable the stop button again."""
        self.Spider.stop()
        self.textBrowser.append("停止继续爬取!!!")
        self.pushButton_3.setEnabled(False)
class Spider(QThread):
    """Background thread that feeds queued URLs to a ``spider`` instance.

    ``index`` is the UI object; it is handed to every ``spider`` call, which
    reads its widgets and appends log lines to its text browser.

    NOTE(review): the crawler therefore touches widgets from this worker
    thread; Qt documents widgets as GUI-thread-only, so forwarding the log
    lines through a signal would be safer - confirm before relying on it.
    """

    def __init__(self, index):
        super().__init__()
        self.index = index
        self.spider = spider()
        self.spider.spider_init(index)
        # Cleared by stop() to end the loop in run().
        self.flag = True

    def run(self):
        """Crawl queued URLs until the queue is empty or stop() is called."""
        # Leave the loop once the pending queue is drained instead of
        # spinning at full CPU on an empty queue; getattr() also covers a
        # crawler whose queue attribute was never created.
        while self.flag and getattr(self.spider, "c", None):
            self.spider.spider_next_href(self.index)

    def stop(self):
        """Ask the worker to finish, then write the collected links."""
        # Signal first, then give the page that is currently being crawled
        # up to two seconds to finish. The original slept for two seconds
        # *before* clearing the flag, which blocked the caller for the full
        # time and still let the worker start another page afterwards.
        self.flag = False
        self.wait(2000)
        self.spider.save_href(self.index)
效果:
2. spider_next.py(爬虫逻辑):
from urllib import request
import re
import copy
import gevent
from gevent import monkey
import os
monkey.patch_all()
from datetime import datetime
from bs4 import BeautifulSoup
import random
class spider(object):
    """Regex-based page crawler driven by the widgets of the UI object.

    Every public method receives ``index``, the UI object: its line edits
    supply the start URL and output folder, its check boxes select what to
    extract, and progress messages are appended to ``index.textBrowser``.

    State created by ``spider_init``:
        b      -- every link seen so far (written out by ``save_href``)
        c      -- links still waiting to be crawled
        image  -- image URLs already handled
        dic    -- dated output folder for this run
    ``spider_html`` additionally sets ``new_dic`` (folder of the current
    page) and ``html`` (decoded source of the current page).
    """

    # Compiled once; these patterns are applied to every fetched page.
    # NOTE: the link pattern is hard-coded to one blog's article URLs.
    _HREF_RE = re.compile(r"(https://blog.youkuaiyun.com/by_side_with_sun/article/details/.+?)\"")
    # "https?" accepts both schemes. The original "(http|https://.*?)" only
    # matched the bare word "http" or a full https URL.
    _IMG_RE = re.compile(r'<img src="(https?://.*?)"')
    _TITLE_RE = re.compile(r"<title>(.*?)</title>")
    _ARTICLE_RE = re.compile(r"<article>(.*?)</article>", re.S)
    _TAG_RE = re.compile(r'<[^>]+>', re.S)

    def spider_init(self, index):
        """Reset the crawl state from the form and create the output folder."""
        url = index.lineEdit_2.text()
        # Always create the containers so later calls never hit a missing
        # attribute, even when no URL was entered.
        self.b = []
        self.c = []
        self.image = []
        if url == "":
            index.textBrowser.append("url输入错误!")
        else:
            self.b.append(url)
            self.c.append(url)

        self.dic = index.lineEdit.text()
        if not self.dic:
            self.dic = "./Data"
        self.dic += "/" + datetime.now().strftime('%Y-%m-%d')
        # makedirs also creates a missing parent such as "./Data".
        os.makedirs(self.dic, exist_ok=True)

    def spider_next_href(self, index):
        """Crawl the next pending link, if there is one."""
        if self.c:
            self.spider_html(self.c.pop(0), index)

    def spider_html(self, url, index):
        """Download ``url`` and run every extractor whose box is checked."""
        try:
            index.textBrowser.append("开始爬取"+url)
            if index.checkBox.checkState():
                headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
                # url must be passed by keyword here.
                req = request.Request(url=url, headers=headers)
                req.add_header('Referer', 'http://www.mzitu.com/')
            else:
                req = request.Request(url)

            # One sub-folder per page, named after the last URL segment.
            self.new_dic = self.dic+"/"+url.split("/")[-1]
            os.makedirs(self.new_dic, exist_ok=True)

            with request.urlopen(req) as response:
                self.html = response.read().decode('utf-8')

            # Read each check box once and join exactly the greenlets that
            # were spawned, so toggling a box mid-crawl cannot leave a
            # greenlet unjoined or reference one that was never created.
            extractors = (
                (index.checkBox_2, self.spider_article),  # article
                (index.checkBox_3, self.spider_pic),      # pictures
                (index.checkBox_4, self.spider_href),     # links
                (index.checkBox_5, self.spider_custom),   # custom
            )
            jobs = [
                gevent.spawn(extract, index)
                for box, extract in extractors
                if box.checkState()
            ]
            gevent.joinall(jobs)
            index.textBrowser.append(url+"爬取结束......")
        except Exception as err:
            # Best effort: report the failure and let the crawl continue
            # with the next link, but show the real cause as well.
            index.textBrowser.append("url输入错误!")
            index.textBrowser.append(str(err))

    def spider_href(self, index):
        """Queue every link on the current page that was not seen before."""
        seen = set(self.b)
        # dict.fromkeys removes duplicates while keeping page order, so the
        # crawl order is deterministic.
        fresh = list(dict.fromkeys(
            href for href in self._HREF_RE.findall(self.html)
            if href not in seen
        ))
        index.textBrowser.append("新增链接数:"+str(len(fresh)))
        self.b.extend(fresh)
        self.c.extend(fresh)
        index.textBrowser.append("剩余链接数:"+str(len(self.c)))

    def spider_pic(self, index):
        """Download the not-yet-seen images of the current page."""
        index.textBrowser.append("爬取图片中....")
        saved = 0
        img_urls = self._IMG_RE.findall(self.html)
        if img_urls:
            img_dir = self.new_dic+"/img"
            os.makedirs(img_dir, exist_ok=True)
            for img in img_urls:
                if img in self.image:
                    continue
                self.image.append(img)
                try:
                    # Download before opening the file so a failed request
                    # does not leave an empty .jpg behind.
                    with request.urlopen(img) as response:
                        data = response.read()
                    with open(img_dir+'/'+str(saved)+".jpg", "wb") as f:
                        f.write(data)
                except Exception:
                    # Best effort: skip images that cannot be fetched/saved.
                    continue
                saved += 1
        index.textBrowser.append("这一次砸门爬了:"+str(saved)+"张图片.....")

    def spider_article(self, index):
        """Append the page title and tag-stripped <article> text to article.txt."""
        index.textBrowser.append("开始爬取文章....")
        titles = self._TITLE_RE.findall(self.html)
        # Pages without a <title> get an empty title instead of an IndexError.
        title = titles[0] if titles else ""
        text = "".join(self._ARTICLE_RE.findall(self.html))
        text = self._TAG_RE.sub('', text)
        with open(self.new_dic+"/article.txt", "a", encoding="utf-8") as f:
            # Drop non-breaking spaces from the saved text.
            f.write((title+"\n"+text).replace(u'\xa0', u''))
        index.textBrowser.append("爬取--"+title+"--文章结束....")

    def spider_custom(self, index):
        """Placeholder for the custom extraction mode."""
        print("custom")

    def save_href(self, index):
        """Append all collected links to href.txt and log the totals."""
        with open(self.dic+"/href.txt", "a", encoding="utf-8") as f:
            f.write("\n".join(self.b))
        index.textBrowser.append("总计爬取"+str(len(self.b))+"条链接..........")
        index.textBrowser.append("总计爬取" + str(len(self.image)) + "张图片..........")
# Script entry point: running this module directly only builds a crawler
# object; no crawl is started from here.
if __name__ == "__main__":
    spider()
希望大家能在我的基础上做些修改,互相学习。