A rough little crawler from this amateur, for a site I'd scraped before. It collects the official data with Python, wraps each step in try/except error handling so the whole run can finish, and writes the failure details to txt files so that anything interrupted or errored can be re-collected afterwards.
Target URL: https://www.goldenpin.org.tw/%E9%87%91%E9%BB%9E%E8%A8%AD%E8%A8%88%E7%8D%8E/?y=2020
Assuming there are no anti-scraping restrictions to get around, grabbing a site's data mostly comes down to handling interruptions properly: the server occasionally times out or errors out, and extracting a node from the page can fail too. All of that should be accounted for so the crawler doesn't die mid-run. For the collected data itself, it's still better to store it in a database, which makes later re-collection and any other processing much easier.
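The project below sticks to plain txt files, but following that advice, here is a minimal sketch of what storing each record in a database could look like with SQLite (the goldenpin.db file name and the works table schema are my own placeholders, not part of the original project):

import sqlite3

conn = sqlite3.connect('goldenpin.db')
conn.execute('CREATE TABLE IF NOT EXISTS works (href TEXT PRIMARY KEY, category TEXT, title TEXT, info TEXT)')
record = ('https://www.goldenpin.org.tw/project/sample/', 'Product Design', 'Sample work', 'description text')
# INSERT OR IGNORE skips rows that are already stored, which helps when resuming after an interruption
conn.execute('INSERT OR IGNORE INTO works (href, category, title, info) VALUES (?, ?, ?, ?)', record)
conn.commit()
conn.close()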
A few key points
Timeout and retry handling for requests
from requests.adapters import HTTPAdapter
self.s = requests.Session()
self.s.mount('http://', HTTPAdapter(max_retries=5))
self.s.mount('https://', HTTPAdapter(max_retries=5))
Change the number to set how many times a request is retried.
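Put together, the retry-enabled session is used with a per-request timeout and a try/except, so a flaky request is retried a few times before the error gets logged. A minimal sketch (the example.com URL is just a placeholder):

import requests
from requests.adapters import HTTPAdapter

s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=5))   # retry failed connections up to 5 times
s.mount('https://', HTTPAdapter(max_retries=5))

try:
    # timeout=8 limits each attempt to 8 seconds; max_retries only covers connection-level errors
    r = s.get('https://example.com/', timeout=8)
    print(r.status_code)
except requests.exceptions.RequestException as e:
    print('request failed after retries:', e)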
Formatting the data
for href, category in zip(hrefs, categorys):
print(href,category)
How for ... in zip() works is easy to look up; a quick illustration follows.
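With toy data (not taken from the site), zip() pairs the two lists element by element:

hrefs = ['https://a.example/1', 'https://a.example/2']
categorys = ['Product Design', 'Packaging Design']
for href, category in zip(hrefs, categorys):
    print(href, category)
# https://a.example/1 Product Design
# https://a.example/2 Packaging Design
# Note: zip() stops at the shorter list, so mismatched counts silently drop items,
# which is why the spider prints len(hrefs) and len(categorys) as a sanity check.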
Title formatting
title=req.xpath('//h1[@class="entry-title"]/text()')[0]
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", title)  # replace illegal characters with underscores
The title is used as the folder/file name when saving the data, so it has to be sanitized first: characters that are illegal in file names are replaced with underscores.
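For example, with a made-up title containing characters that Windows forbids in file names:

import re

title = 'Design: A/B "Test" Project?'
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", title)
print(h1)  # Design_ A_B _Test_ Project_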
Full source code for reference:
# -*- coding: UTF-8 -*-
# Golden Pin Design Award scraper
# 20201118 @author:WX:huguo00289
import requests, re, time
from fake_useragent import UserAgent
import json, os
from lxml import etree
from requests.adapters import HTTPAdapter


class Gd(object):
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            # Cookie: PHPSESSID=t9gp0d4ebv684u4miotr4edkba; devicePixelRatio=1; _ga=GA1.3.657336680.1605679856; _gid=GA1.3.569339568.1605679856; _gat_gtag_UA_55240876_38=1
        }
        # The award list is loaded through this WordPress admin-ajax endpoint
        self.url = "https://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
        self.s = requests.Session()
        # Retry failed connections up to 5 times on both schemes
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))

    def get_content(self, i):
        # POST payload for the ajax "load more" request; targetPage is the list page index
        data = {
            'action': 'presscore_template_ajax',
            'postID': '15317',
            'paged': 1,
            'targetPage': i,
            'term': '',
            'orderby': '',
            'order': '',
            'nonce': '004811f2a4',
            'contentType': 'portfolio',
            'pageData[type]': 'page',
            'pageData[template]': 'portfolio',
            'pageData[layout]': 'masonry',
            'sender': 'more',
        }
        response = self.s.post(self.url, timeout=8, headers=self.headers, data=data)
        resq = response.content.decode('utf-8')
        req = json.loads(resq)
        print(req['success'])
        if req['success']:
            html = req['html']
            # print(html)
            # Pull detail-page links and their categories out of the returned HTML fragment
            hrefs = re.findall(r'<a target="_blank" href="(.+?)" class="alignnone rollover layzr-bg"', html, re.S)
            print(len(hrefs))
            categorys = re.findall(r'<div class="entry-meta portfolio-categories"><span class="category-link"><a href=".+?" >(.+?)</span></div></div>', html, re.S)
            print(len(categorys))
            for href, category in zip(hrefs, categorys):
                print(href, category)
                try:
                    self.parse(href, category)
                except Exception as e:
                    print(f'Failed to scrape page {href}, error: {e}')
                    # Log the failed detail page so it can be re-collected later
                    with open('href_fail.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{href},{category}\n')

    def parse(self, href, category):
        print(f'>> Scraping page {href} ..')
        html = self.s.get(href, headers=self.headers, timeout=8).content.decode('utf-8')
        req = etree.HTML(html)
        title = req.xpath('//h1[@class="entry-title"]/text()')[0]
        # Replace characters that are illegal in file/folder names with underscores
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", title)
        path = f'{category}/{h1}/'
        infos = req.xpath('//div[@class="wpb_wrapper"]//text()')
        info = ''.join(infos)
        # Append the title and description to a per-category text file
        with open(f'{category}.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{title}\n{info}\n\n')
        imgs = req.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@src')
        print(h1, info, imgs)
        try:
            self.downs(imgs, path)
        except Exception as e:
            print(f'Failed to get images {imgs}, error: {e}')
            with open('imgs_fail.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{imgs},{path}\n')

    def downs(self, imgs, path):
        os.makedirs(path, exist_ok=True)
        for img in imgs:
            img_url = img
            img_name = img.split('/')[-1]
            print(img_url, img_name)
            try:
                self.dwon_img(img_url, img_name, path)
            except Exception as e:
                print(f'Failed to download image {img_name}, error: {e}')
                with open('img_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{img_url},{img_name},{path}\n')

    def dwon_img(self, img_url, img_name, path):
        print(f'>> Downloading image {img_name} ..')
        r = self.s.get(img_url, timeout=8, headers=self.headers)
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f'>> Image {img_name} downloaded!')

    def run(self):
        for i in range(1, 1000):
            print(f'>> Scraping list page {i} ..')
            try:
                self.get_content(i)
            except Exception as e:
                print(f'Failed to scrape list page {i}, error: {e}')
                with open('list_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{i}\n')


if __name__ == '__main__':
    spider = Gd()
    spider.run()
Source code for re-collecting the failed data, for reference:
# -*- coding: UTF-8 -*-
# Golden Pin Design Award scraper: re-collect failed items
# The main spider above is assumed to be saved as gdspider.py so its Gd class can be imported
from gdspider import Gd


# Re-download failed images
def get_bimg_fail():
    path = r'bimg_fail.txt'
    with open(path, 'r', encoding='utf-8') as f:
        img_fails = f.readlines()
    print(len(img_fails))
    spider = Gd()
    for img_fail in img_fails:
        # Each line is "img_url,img_name,path", as written by downs()
        img_fail = img_fail.strip()
        img_fail = img_fail.split(',')
        print(img_fail)
        spider.dwon_img(img_fail[0], img_fail[1], img_fail[2])


# Re-scrape failed detail pages
def get_href_fail():
    path = r'bhref_fail.txt'
    with open(path, 'r', encoding='utf-8') as f:
        href_fails = f.readlines()
    print(len(href_fails))
    spider = Gd()
    for href_fail in href_fails:
        href_fail = href_fail.strip()
        href_fail = href_fail.split(",")
        href = href_fail[0]
        # Some failure lines contain raw HTML instead of a clean category, so clean them up
        if "<a href=" in str(href_fail):
            category = href_fail[-1].split('/')[-1]
            category = category.strip()
            category = category.replace('" >', '')
        else:
            category = href_fail[1]
        href = href.replace(' https', 'https')
        print(href, category)
        spider.parse(href, category)


# Re-scrape failed list pages
def get_blist():
    path = r'blist_fail.txt'
    with open(path, 'r', encoding='utf-8') as f:
        blist_fails = f.readlines()
    print(len(blist_fails))
    spider = Gd()
    for blist_fail in blist_fails:
        blist_fail = blist_fail.strip()
        print(blist_fail)
        spider.get_content(blist_fail)


if __name__ == '__main__':
    # The b*-prefixed logs read above are presumably renamed copies of the fail logs
    # written by the main spider; uncomment whichever back-fill step you need
    get_bimg_fail()
    # get_href_fail()
    # get_blist()
Follow this amateur's WeChat official account 二爷记 and reply with the keyword "金点设计奖" to get the full project.