1. Image crawlers
(1) Scraping JD mobile-phone product images
import re
import urllib.request

def craw(url, page):
    # Fetch the page source and turn the bytes into a string
    html1 = urllib.request.urlopen(url).read()
    html1 = str(html1)
    # Step 1: narrow the source down to the product-list block
    pat1 = '<div id="plist".+? <div class="page clearfix">'
    result1 = re.compile(pat1).findall(html1)
    result1 = result1[0]
    # Step 2: extract every lazy-loaded image URL from that block
    pat2 = r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
    imagelist = re.compile(pat2).findall(result1)
    x = 1
    for imageurl in imagelist:
        # Local file name: <page number><image number>.jpg (E:/picture/ must already exist)
        imagename = "E:/picture/" + str(page) + str(x) + ".jpg"
        imageurl = "http://" + imageurl
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # On an HTTP error or an unreachable URL, skip this image
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1
    print(x)

for i in range(1, 79):
    url = "http://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    craw(url, i)
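The extraction above works in two stages: pat1 first narrows the page source down to the product-list block between <div id="plist"...> and <div class="page clearfix">, and pat2 then pulls every lazy-loaded image URL out of that block. A minimal sketch of the same two-step idea on a made-up HTML fragment (the fragment and the img.example.com URLs are invented for illustration; the real JD markup may have changed since this code was written):

import re

# A made-up fragment imitating the structure the two patterns expect
html = ('<div id="plist" class="foo"> '
        '<img width="220" height="220" data-img="1" data-lazy-img="//img.example.com/p1.jpg">'
        '<img width="220" height="220" data-img="1" data-lazy-img="//img.example.com/p2.jpg">'
        ' <div class="page clearfix">')

# Stage 1: isolate the product-list block
pat1 = '<div id="plist".+? <div class="page clearfix">'
block = re.compile(pat1).findall(html)[0]

# Stage 2: collect the image URLs inside that block
pat2 = r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
print(re.compile(pat2).findall(block))
# ['img.example.com/p1.jpg', 'img.example.com/p2.jpg']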
(2) Scraping girl photos (for reference)
#
# Target site: http://www.zjito.com/
# Crawls girl photos: a bs4 + re + gevent concurrent (multi-greenlet) crawler
import requests
from bs4 import BeautifulSoup
import urllib.request
import gevent
from gevent import Greenlet
import socket
import random

def cbk(a, b, c):
    """Progress callback for urlretrieve.
    @a: number of data blocks downloaded so far
    @b: size of each data block
    @c: total size of the remote file
    """
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
def photo_download(photo_thread, index_number, photo_number, number):
    # Each greenlet walks through up to 3564 consecutive page ids
    while number < 3564:
        try:
            i = 0
            number = number + 1
            # Target page of the crawl: start with the first category for this page id
            url = 'http://www.zjito.com/dqfl/' + categories[i] + '/' + str(index_number) + '.shtml?idx=1'
            headers = {'user-agent': 'my-app/0.0.1'}
            # Request the target page
            r = requests.get(url, headers=headers)
            print(r.status_code)
            print(url)
            while r.status_code == 404:
                # On a 404, move on to the next category and request the page again
                i = i + 1
                url = 'http://www.zjito.com/dqfl/' + categories[i] + '/' + str(index_number) + '.shtml?idx=1'
                print(url)
                r = requests.get(url, headers=headers)
            else:
                # Put the returned page into soup and parse its tags
                soup = BeautifulSoup(r.text, 'html.parser')
                # print(soup.prettify())  # uncomment to inspect the parsed markup
                for link in soup.find_all(class_="div-num"):
                    # Print the image address
                    print(link.get('data-src'))
                    # Set a download timeout
                    socket.setdefaulttimeout(3.0)
                    photo_number = photo_number + 1
                    # Download the image and report progress through cbk
                    urllib.request.urlretrieve(link.get('data-src'),
                                               save_dir + '/' + str(photo_thread) + '_' + str(photo_number) + '.jpg',
                                               cbk)
                    # Yield briefly so the other greenlet gets a turn
                    gevent.sleep(random.randint(0, 2) * 0.001)
        except Exception as e:
            # On any error, skip ahead to the next page id
            index_number = index_number + 1
        index_number = index_number + 1
if __name__ == '__main__':
    # Photo categories used in the page URLs
    categories = ['zgnd', 'tw', 'xg', 'rb', 'hg', 'mlxy', 'tg', 'om', 'hx']
    # Greenlet ids, used to keep the two downloaders' file names apart
    photo_thread = [1, 2]
    # Downloaded-image counter (maximum 50)
    photo_number = -1
    # Page-id counter: minimum 530273, maximum 544527
    # index_number = 530273
    # Directory the images are saved to
    save_dir = '../photo/'
    # Spawn two greenlets, each running photo_download with its own arguments
    thread1 = Greenlet.spawn(photo_download, photo_thread[0], 530273, photo_number, 0)
    thread2 = gevent.spawn(photo_download, photo_thread[1], 533836, photo_number, 0)
    # The two greenlets split the page range roughly in half:
    # 537400 - 530273 = 7127, half of that is 3563, and 530273 + 3563 = 533836,
    # so one starts crawling at page 530273 and the other at 533836
    threads = [thread1, thread2]
    # Block until both greenlets finish
    gevent.joinall(threads)
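One caveat with the script above: gevent greenlets only run concurrently if blocking I/O yields control, and plain requests/urllib/socket calls do not do that by themselves. A common way to get cooperative behaviour is gevent's monkey patching, applied before the network libraries are imported; a minimal sketch, assuming it is placed at the very top of the script with everything else unchanged:

# Must run before requests/urllib/socket are imported, so their blocking
# socket calls are replaced with gevent-aware versions.
from gevent import monkey
monkey.patch_all()

import requests          # now cooperates with other greenlets
import urllib.request
import socket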
2. Link crawler
(1) Crawling CSDN links
import re
import urllib.request

def getlink(url):
    # Pretend to be a browser
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # Regular expression built for the links we need
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    # Remove duplicate entries
    link = list(set(link))
    return link

# Page to crawl
url = "http://blog.youkuaiyun.com/"
# Collect the link addresses contained in that page
linklist = getlink(url)
# Loop over the collected links and print each one
# (findall returns tuples because the pattern has two groups, hence link[0])
for link in linklist:
    print(link[0])
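Because pat contains two capture groups, re.findall() returns a (full match, last inner group) tuple for every link, which is why the loop prints link[0]. A small self-contained illustration on a made-up string, including a variant with a non-capturing group that returns plain strings instead of tuples:

import re

sample = 'see http://blog.example.com/a.html and https://example.org/b'

pat = r'(https?://[^\s)";]+\.(\w|/)*)'
print(re.findall(pat, sample))
# [('http://blog.example.com/a.html', 'l'), ('https://example.org/b', 'b')]

# With a non-capturing group, findall returns just the matched URLs
pat2 = r'https?://[^\s)";]+\.(?:\w|/)*'
print(re.findall(pat2, sample))
# ['http://blog.example.com/a.html', 'https://example.org/b']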
3. Qiushibaike (joke site) crawler
import urllib.request
import re

def getcontent(url, page):
    # Pretend to be a browser
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8")
    # Regular expression that extracts the user names
    userpat = 'target="_blank" title="(.*?)">'
    # Regular expression that extracts the joke contents
    contentpat = '<div class="content">(.*?)</div>'
    # Find all users
    userlist = re.compile(userpat, re.S).findall(data)
    # Find all contents
    contentlist = re.compile(contentpat, re.S).findall(data)
    x = 1
    # Walk through the jokes and assign each one to its own variable
    for content in contentlist:
        content = content.replace("\n", "")
        # Build a variable name from a string ...
        name = "content" + str(x)
        # ... and use exec() to assign the joke to that variable
        exec(name + '=content')
        x += 1
    y = 1
    # Walk through the users and print the content belonging to each one
    for user in userlist:
        name = "content" + str(y)
        print("User " + str(page) + str(y) + " is: " + user)
        print("Content:")
        exec("print(" + name + ")")
        print("\n")
        y += 1

# Fetch the jokes page by page with a for loop
for i in range(1, 30):
    url = "http://www.qiushibaike.com/8hr/page/" + str(i)
    getcontent(url, i)
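The exec() trick above stores each joke in a dynamically named variable (content1, content2, ...) and reads it back with a second exec(). A simpler alternative, assuming the users and jokes extracted from one page line up one-to-one, is to pair the two lists with zip(). The sketch below runs on made-up data; inside getcontent() the real userlist, contentlist and the page argument would be used instead:

# Sketch of the zip() alternative to the two exec() loops, on invented sample data
userlist = ["userA", "userB"]
contentlist = ["first joke\ntext", "second joke"]
page = 1

for y, (user, content) in enumerate(zip(userlist, contentlist), start=1):
    content = content.replace("\n", "")
    print("User " + str(page) + str(y) + " is: " + user)
    print("Content:")
    print(content)
    print("\n")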