(2) Python Crawler Basics — Scraping Images from a Tieba Thread
https://blog.youkuaiyun.com/toyijiu/article/details/79335240
[START]
# Python 2 code: urllib.urlopen/urlretrieve and reload(sys) do not exist in Python 3
import re
import urllib
import sys

reload(sys)
sys.setdefaultencoding('utf8')  # avoid UnicodeDecodeError on the Chinese page content

def getHtml(url):
    # fetch the raw HTML of one page of the thread
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getImg(html, x):
    # match images hosted on Tieba's image server
    reg = r'src="(http://img.*?\.jpg)"'
    imgre = re.compile(reg)
    imList = imgre.findall(html)
    print(imList)
    for i in imList:
        print(i)
        print(x)
        # save as 1.jpg, 2.jpg, ... in the current directory
        urllib.urlretrieve(i, '%s.jpg' % x)
        x += 1
    return x

x = 1
url = "http://tieba.baidu.com/p/3466236659?pn="
for k in range(1, 28):  # walk all 27 pages of the thread
    ul = url + str(k)
    print(ul)
    html = getHtml(ul)
    x = getImg(html, x)
[END]
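The snippet above runs only under Python 2 (urllib.urlopen and reload(sys) are gone in Python 3). Below is a minimal Python 3 sketch of the same approach, assuming the thread URL and 27-page range from the original still apply; it is a plain port for illustration, not the original author's code.

# Minimal Python 3 port of the Tieba scraper above (a sketch, not the original
# author's code): urllib.request replaces urllib, and the page bytes are
# decoded explicitly instead of patching the interpreter's default encoding.
import re
import urllib.request

def get_html(url):
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8', errors='ignore')

def get_img(html, x):
    for src in re.findall(r'src="(http://img.*?\.jpg)"', html):
        urllib.request.urlretrieve(src, '%s.jpg' % x)
        x += 1
    return x

x = 1
base = "http://tieba.baidu.com/p/3466236659?pn="
for k in range(1, 28):
    x = get_img(get_html(base + str(k)), x)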
(3) Batch-Scraping Images from a Web Page with Python
https://blog.youkuaiyun.com/qq_38344751/article/details/99448271
[START]
#!/usr/bin/python3
# -*- encoding:utf-8 -*-
from urllib import request
import re

def getResponse(url):
    url_request = request.Request(url)
    url_response = request.urlopen(url_request)
    return url_response

def getJpg(data):
    # grab every src="...jpg" attribute; the dot before jpg is escaped so it
    # only matches a literal ".jpg"
    jpglist = re.findall(r'src="http.+?\.jpg"', data)
    return jpglist

def downLoad(jpgUrl, n):
    try:
        # the pic directory must already exist or urlretrieve will fail;
        # the success message sits inside try so failures are not reported as successes
        request.urlretrieve(jpgUrl, 'pic\\%s.jpg' % n)
        print('picture %s downloaded successfully' % n)
    except Exception as e:
        print(e)

http_response = getResponse("http://dzh.mop.com/")
data = http_response.read().decode('utf-8')
n = 1
jpglist = getJpg(data)
for info in jpglist:
    print(info)
    # strip the surrounding src="..." to get the bare URL
    s = re.findall(r'http.+?\.jpg', info)
    downLoad(s[0], n)
    n = n + 1

# A shorter variant from the same post:
# from urllib import request
# url = 'http://dzh.mop.com/'
# url_request = request.Request(url)
# url_response = request.urlopen(url_request)
# data = url_response.read().decode('utf-8')
# jpglist = re.findall(r'http.+?\.jpg', data)
# n = 1
# for each in jpglist:
#     print(each)
#     try:
#         request.urlretrieve(each, 'pic\\%s.jpg' % n)
#     except Exception as e:
#         print(e)
#     finally:
#         print('processed image %s' % n)
#     n += 1
[END]
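The script above needs a pre-existing pic folder and two regex passes (one to find src="...", one to strip the quotes). A hedged sketch of the same download loop with two small hardening steps, which are my additions rather than the original post's: the directory is created if missing, and the regex captures the bare URL in a group.

# Same loop, slightly hardened (a sketch; os.makedirs and the capture group
# are assumptions on top of the original post).
import os
import re
from urllib import request

os.makedirs('pic', exist_ok=True)
data = request.urlopen('http://dzh.mop.com/').read().decode('utf-8')
for n, jpg_url in enumerate(re.findall(r'src="(http.+?\.jpg)"', data), start=1):
    try:
        request.urlretrieve(jpg_url, os.path.join('pic', '%s.jpg' % n))
        print('picture %s downloaded' % n)
    except Exception as e:
        print(e)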
(4) Crawling Website Images with Python
https://blog.youkuaiyun.com/qq_38412868/article/details/82080260
[START]
from bs4 import BeautifulSoup
import requests

if __name__ == '__main__':
    url = 'http://www.27270.com/tag/649.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    req = requests.get(url=url, headers=headers)
    req.encoding = 'gb2312'  # the site serves GB2312-encoded pages
    html = req.text
    bf = BeautifulSoup(html, 'lxml')
    # every gallery link on the tag page opens in a new tab
    targets_url = bf.find('div', class_='w1200 oh').find_all('a', target='_blank')
    for each in targets_url:
        img_req = requests.get(url=each.get('href'), headers=headers)
        img_req.encoding = 'gb2312'
        html = img_req.text
        bf = BeautifulSoup(html, 'lxml')
        img_url = bf.find('div', class_='articleV4Body').find('img')['src']
        name = each.img.get('alt') + '.jpg'
        path = r'C:\Users\asus\Desktop\新建文件夹'  # the author's local folder; adjust as needed
        file_name = path + '\\' + name
        try:
            req1 = requests.get(img_url, headers=headers)
            f = open(file_name, 'wb')
            f.write(req1.content)
            f.close()
        except Exception:
            print("some error")
from bs4 import BeautifulSoup
import requests

def download(img_url, headers, n):
    req = requests.get(img_url, headers=headers)
    name = '%s' % n + '=' + img_url[-15:]
    path = r'C:\Users\asus\Desktop\火影壁纸1'  # the author's local folder; adjust as needed
    file_name = path + '\\' + name
    f = open(file_name, 'wb')
    f.write(req.content)
    f.close()  # the original wrote f.close without parentheses, so the file was never closed

def parses_picture(url, headers, n):
    url = r'http://desk.zol.com.cn/' + url
    img_req = requests.get(url, headers=headers)
    img_req.encoding = 'gb2312'
    html = img_req.text
    bf = BeautifulSoup(html, 'lxml')
    try:
        img_url = bf.find('div', class_='photo').find('img').get('src')
        download(img_url, headers, n)
        # follow the "next" link and recurse until the gallery runs out
        url1 = bf.find('div', id='photo-next').a.get('href')
        parses_picture(url1, headers, n)
    except Exception:
        print('Image set %s has reached its last page' % n)

if __name__ == '__main__':
    url = 'http://desk.zol.com.cn/dongman/huoyingrenzhe/'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    req = requests.get(url=url, headers=headers)
    req.encoding = 'gb2312'
    html = req.text
    bf = BeautifulSoup(html, 'lxml')
    targets_url = bf.find_all('li', class_='photo-list-padding')
    n = 1
    for each in targets_url:
        url = each.a.get('href')
        parses_picture(url, headers, n)
        n = n + 1
[END]
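The second script recurses once per wallpaper, so a long gallery can hit Python's default recursion limit. Here is a sketch of the same traversal written as a loop; the zol.com.cn selectors are carried over from the original (and may have changed since), while saving to the current directory instead of the author's Desktop path is my assumption.

# Iterative version of parses_picture above (a sketch, not the original
# author's code): walks the "next" links in a while loop instead of recursing.
import requests
from bs4 import BeautifulSoup

def walk_gallery(first_url, headers, n):
    url, i = first_url, 1
    while url:
        page = requests.get('http://desk.zol.com.cn/' + url, headers=headers)
        page.encoding = 'gb2312'
        bf = BeautifulSoup(page.text, 'lxml')
        photo = bf.find('div', class_='photo')
        if photo is None:
            break  # layout changed or the gallery ended
        img_url = photo.find('img').get('src')
        with open('%s-%s.jpg' % (n, i), 'wb') as f:  # saved locally (assumption)
            f.write(requests.get(img_url, headers=headers).content)
        i += 1
        nxt = bf.find('div', id='photo-next')
        url = nxt.a.get('href') if nxt and nxt.a else None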
(5) Fetching the Metadata of All Images on a Website with Python
https://cloud.tencent.com/developer/article/1477335
[START]
#!/usr/bin/python
# -*- encoding:utf-8 -*-
# Python 2 code: urllib2 and urlparse were merged into urllib in Python 3
import optparse
from PIL import Image
from PIL.ExifTags import TAGS
import urllib2
from bs4 import BeautifulSoup as BS
from os.path import basename
from urlparse import urlsplit

# use BeautifulSoup to find all <img> tags at the URL
def findImages(url):
    print '[+] Finding images on ' + url
    urlContent = urllib2.urlopen(url).read()
    soup = BS(urlContent, 'lxml')
    imgTags = soup.findAll('img')
    return imgTags

# get the image URL from the <img> tag's src attribute and download the image
def downloadImage(imgTag):
    try:
        print '[+] Downloading image...'
        imgSrc = imgTag['src']
        imgContent = urllib2.urlopen(imgSrc).read()
        imgFileName = basename(urlsplit(imgSrc)[2])
        imgFile = open(imgFileName, 'wb')
        imgFile.write(imgContent)
        imgFile.close()
        return imgFileName
    except:
        return ' '

# read the image file's Exif metadata and check for a 'GPSInfo' tag
def testForExif(imgFileName):
    try:
        exifData = {}
        imgFile = Image.open(imgFileName)
        info = imgFile._getexif()
        if info:
            for (tag, value) in info.items():
                decoded = TAGS.get(tag, tag)
                exifData[decoded] = value
            exifGPS = exifData['GPSInfo']
            if exifGPS:
                print '[*] ' + imgFileName + ' contains GPS MetaData'
    except:
        pass

def main():
    parser = optparse.OptionParser('[*]Usage: python Exif.py -u <target url>')
    parser.add_option('-u', dest='url', type='string', help='specify url address')
    (options, args) = parser.parse_args()
    url = options.url
    if url == None:
        print parser.usage
        exit(0)
    else:
        imgTags = findImages(url)
        for imgTag in imgTags:
            imgFileName = downloadImage(imgTag)
            testForExif(imgFileName)

if __name__ == '__main__':
    main()
[END]
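This script is Python 2 only. Below is a minimal Python 3 sketch of the same GPS-tag check, my port rather than the article's code: urllib.request/urllib.parse replace urllib2/urlparse, and Pillow's public getexif() replaces the private _getexif().

# Python 3 sketch of the Exif GPS check above (an illustrative port, not the
# article's code).
import urllib.request
from os.path import basename
from urllib.parse import urlsplit
from bs4 import BeautifulSoup
from PIL import Image
from PIL.ExifTags import TAGS

def check_gps(url):
    html = urllib.request.urlopen(url).read()
    for tag in BeautifulSoup(html, 'lxml').find_all('img'):
        src = tag.get('src')
        if not src:
            continue
        file_name = basename(urlsplit(src).path)
        try:
            urllib.request.urlretrieve(src, file_name)
            exif = Image.open(file_name).getexif()
            if any(TAGS.get(t, t) == 'GPSInfo' for t in exif):
                print('[*] %s contains GPS metadata' % file_name)
        except Exception as e:
            print(e)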
This article walks through scraping images from different kinds of sites with Python crawlers, including a Tieba thread, a news portal, and wallpaper sites, covering the basic crawler skeleton, parsing image links, downloading and saving images, and extracting image metadata.