0,python中关于下载的部分总结如下:
import urllib
if __name__ == "__main__":
    page = "http://www.baidu.com"
    # Read the raw HTML source at the URL.
    content = urllib.urlopen(page).read()
    # The page declares gb2312 (visible in its HTML source); re-encode the
    # bytes as UTF-8 so the Chinese text displays correctly.
    content = content.decode("gb2312").encode("utf-8")
    print(content)
1,处理A标签字符串:
#!/usr/bin/python
#encoding=utf-8
import htmllib,urllib,formatter,string
'''
import chardet,sys
type = sys.getdefaultencoding()
'''
class GetLinks(htmllib.HTMLParser):
    """Collect anchor-text -> href pairs while parsing HTML."""

    def __init__(self):
        # Maps the text between <a>...</a> to that anchor's href.
        self.links = {}
        # NullFormatter discards all formatting events; only tags matter here.
        sink = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, sink)

    def anchor_bgn(self, href, name, type):
        # An <a> tag opened: start buffering its text and remember the target.
        self.save_bgn()
        self.link = href

    def anchor_end(self):
        # The </a> tag: take the buffered text, stripped of whitespace.
        text = string.strip(self.save_end())
        if text and self.link:
            self.links[text] = self.link
#fp = urllib.urlopen("http://www.baidu.com") #打开指定的URL
#data = fp.read()
#fp.close()
data = '<html><head><title>test</title><body><a href="http: //www.163.com">链接到163</a><a href="http://www.focus.cn">焦点</a></body></html>'
linkdemo = GetLinks() #实例化一个LinkDemo对象
linkdemo.feed(data) #给HTMLParser喂食
linkdemo.close()
for href, link in linkdemo.links.items(): #打印相关的信息
print href, "=>", link
输出:
焦点 => http://www.focus.cn
链接到163 => http: //www.163.com
再如:
# -*- coding: utf-8 -*-
import htmllib, urllib, formatter, string
class GetLinks(htmllib.HTMLParser):
    """Collect anchor-text -> href pairs, keeping only absolute http links."""

    def __init__(self):
        self.links = {}  # anchor text -> href
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        # Discard relative targets: only hrefs starting with 'http' are kept.
        self.link = href if href[:4] == 'http' else None

    def anchor_end(self):
        text = string.strip(self.save_end())
        if text and self.link:
            self.links[text] = self.link
fp = urllib.urlopen("http://list.taobao.com/browse/cat-0.htm")
data = fp.read()
fp.close()
linkdemo = GetLinks()
linkdemo.feed(data)
linkdemo.close()
for href, link in linkdemo.links.items():
href = href.decode('gb2312').encode('utf-8')
print href, '-', link
pass
结果是下载到的淘宝“裤架 - http://ju.atpanel.com/?url=http://list.taobao.com/market/baihuo.htm?spm=1.47613.90750.”这样的列表
2,下载豆瓣图片【多线程】:
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import htmllib,urllib,formatter,string
import os,sys,time
import threading
'''
Created on 2012-10-09
@author: xing.gexing
'''
# Build one worker thread per URL, start them all, and wait for completion.
def parallel(urls):
    started = time.time()
    workers = [MyThread(downloadFromURL, (u,), downloadFromURL.__name__)
               for u in urls]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print('use time cost:%s'%(time.time()-started))
# Worker thread that runs a single callable with pre-bound arguments.
class MyThread(threading.Thread):
    def __init__(self, func, args, name=''):
        """Store `func` and `args` to run later; `name` labels the thread."""
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        # apply() has been deprecated since Python 2.3 (and removed in 3);
        # argument unpacking is the equivalent, portable call.
        self.func(*self.args)
# Fetch the page at `url`, parse out its image links, and download each one.
def downloadFromURL(url):
    page = urllib.urlopen(url)
    markup = page.read()
    page.close()
    parser = MyHTMLParser()
    parser.feed(markup)
    parser.close()
    for image_url in parser.links:
        print(image_url)
        downloadImage(image_url)
# Download the image at imageUrl into the local ./image_douban directory.
def downloadImage(imageUrl):
    save_dir = "./image_douban"
    try:
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
    except OSError:
        # mkdir can fail on permissions or races; report and stop.
        print("Failed to create directory in %s"%save_dir)
        exit()
    # File name is the last path component of the URL.
    image = imageUrl.split('/')[-1]
    path = save_dir + "/" + image
    data = urllib.urlopen(imageUrl).read()
    # file() is a Python-2-only builtin; open() plus a context manager
    # guarantees the handle is closed even if the write fails.
    with open(path, "wb") as f:
        f.write(data)
# HTML parser that records absolute .jpg image sources; the work happens
# in handle_starttag.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        # Collected image URLs, in document order.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record every src attribute that is an absolute http .jpg link."""
        # Iterating an empty attrs list is a no-op, so the original
        # `if len(attrs) == 0: pass / else:` guard was redundant.
        for (variable, value) in attrs:
            if variable=="src" and value[:4]== "http" and value[-4:]==".jpg":
                self.links.append(value)
if __name__ == "__main__":
    # Download Douban movie thumbnails for tag %E6%83%8A%E6%82%9A ("惊悚"):
    # each listing page holds `base` items, so request start=20,40,...,2000
    # and fetch the pages in parallel.
    # (Removed: an unused sample-HTML literal and dead commented-out URLs
    # that were never referenced.)
    base = 20
    urls = []
    for count in range(1, 101):
        urls.append("http://movie.douban.com/tag/%E6%83%8A%E6%82%9A?start=" + str(base * count) + "&type=T")
    parallel(urls)
3,下载百度图片【单线程】:
需要特别注意的是对于百度图片的处理:搜索的关键词是其中的word,注意替换。
百度图片搜索的第1页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=0
百度图片搜索的第2页(包含20张图片):http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=20
...
对于其中每一页,每张图片都有个这样的后缀:/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987,所以一共20个,查找i?ct进行匹配即可。
将这个后缀与百度图片地址http://image.baidu.com拼接即可得到该图片源的网页:http://image.baidu.com/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987
在该网页中匹配img src即可找到图片绝对路径。
# -*- coding: utf-8 -*-
import os,sys,urllib
# Usage banner printed when the script starts (see __main__ below).
docString = (
    "\n"
    "Created on 2012-10-10\n"
    "@author: xing.gexing\n"
)
def baidu(imgsum, findstr):
    """Search Baidu Images for `findstr` (UTF-8) and save up to `imgsum` JPEGs.

    Files are written as <dir>/<i>.jpg, where `dir` is a module-level
    variable set by the caller (see __main__). Returns 0 for an empty
    query, 1 once `imgsum` images have been saved (None if the result
    pages run out first).
    """
    # gb2312-encoded markers bracketing the hit count on the result page:
    # "找到相关图片约"/"找到相关图片" before the number, "张" after it.
    gbstr = ("找到相关图片约".decode("utf8")).encode("gb2312")
    gbstr2 = ("找到相关图片".decode("utf8")).encode("gb2312")
    gbstr3 = ("张".decode("utf8").encode("gb2312"))
    if findstr == "":
        return 0
    # Baidu expects the query gb2312-encoded, then URL-quoted.
    findstr = (findstr.decode("utf8")).encode("gb2312")
    findstr = urllib.quote(findstr)
    url = "http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%s&pn=" % findstr
    webfile = urllib.urlopen(url + "0").read()
    # Locate the count; the marker phrases are 14 and 12 bytes long in gb2312.
    start = webfile.find(gbstr)
    if start == -1:
        start = webfile.find(gbstr2)
        # NOTE(review): if neither marker matches, start is -1+12 and the
        # slice below is garbage — worth guarding before int() in a revisit.
        start = start + 12
    else:
        start = start + 14
    end = webfile.find(gbstr3, start)
    total = int(webfile[start:end].replace(",", ""))  # total matching images
    # `total` was named `sum`, shadowing the builtin; `//` keeps the Py2
    # integer floor division explicit and portable.
    sumpage = total // 20 + 1  # result pages, 20 thumbnails per page
    print("you have found %d pics in baiduImage" % total)
    i = 0  # number of images downloaded so far
    for page in range(sumpage):
        p_url = url + "%s" % (page * 20)  # URL of the current result page
        webfile = urllib.urlopen(p_url).read()
        i_start = 0
        i_end = 0
        while True:
            # Each thumbnail links to its detail page via <a href="/i?ct...
            i_start = webfile.find('<a href="/i?ct', i_end)
            if i_start < 0:
                break
            i_start += 10  # len('<a href="/'): skip to just past the slash
            i_end = webfile.find('"', i_start)
            i_url = "http://image.baidu.com/" + webfile[i_start:i_end]
            webstr = urllib.urlopen(i_url).read()
            start = 0
            end = 0
            while True:
                # The detail page embeds the picture as <img src="...">.
                start = webstr.find('<img src="', end)
                if start < 0:
                    break
                start += 10  # len('<img src="')
                end = webstr.find('"', start)
                imgurl = webstr[start:end]
                if imgurl[-4:] != ".jpg":
                    continue
                if imgurl.find("img-jg.gif") != -1:  # Baidu placeholder image
                    continue
                i = i + 1
                print("downloading pic %s from %s" % (i, imgurl))
                try:
                    data = urllib.urlopen(imgurl).read()
                except IOError:
                    # urllib raises IOError on fetch failure; move on rather
                    # than aborting the whole run (was a bare except).
                    print("lost 1 pic")
                    break
                # Bug fix: JPEG data is binary — mode "w" corrupts it on
                # platforms that translate line endings; use "wb".
                with open("%s/%d.jpg" % (dir, i), "wb") as f:
                    f.write(data)
                if i == int(imgsum):
                    print("finish download %s pics" % i)
                    return 1
if __name__ == "__main__":
    print(docString)
    print("config your downloading arguments:")
    # Empty answers fall back to defaults: query "汽车", 10 images.
    findstr = raw_input("search:")
    if not findstr:
        findstr = "汽车"
    imgsum = raw_input("num:")
    if not imgsum:
        imgsum = 10
    dir = "./baiduPic"  # download directory; read as a global by baidu()
    try:
        if not os.path.exists(dir):
            os.mkdir(dir)
    except:
        print("Failed to create directory in linux:")
        exit()
    print("config OK!")
    baidu(imgsum, findstr)