"""Download every image referenced by the <img> tags of one web page.

This is a Python 2 script: it relies on urllib2, urlparse and
urllib.urlretrieve. Running it fetches ``weblink`` and stores the images it
finds in ``mypath``.
"""
from bs4 import BeautifulSoup
import urllib2
import urllib
import os
import re
import socket
import sys
import time
import urlparse

# Characters that must not appear in a file name (Windows reserved set).
ILLEGAL_FILENAME_CHARS = re.compile(r'[/\\:\*\"?|<>]+')
TIMESTAMP_FORMAT = '%Y-%m-%d_%H:%M:%S'
USER_AGENT = 'Custom User-Agent'
SOCKET_TIMEOUT_SECONDS = 60


def build_request(link):
    """Return a urllib2.Request for ``link`` carrying a custom User-Agent.

    No request body is attached, so urllib2 issues a plain GET. (Passing
    ``data`` to ``urllib2.Request`` turns the request into a POST, which is
    not what a page fetch wants.)
    """
    return urllib2.Request(link, headers={'User-Agent': USER_AGENT})


def build_urllib2(link):
    """Configure urllib2 globally: 60 s socket timeout and no proxy.

    ``link`` is accepted for backward compatibility but is not used.
    A timestamp is printed before and after the set-up.
    """
    print(time.strftime(TIMESTAMP_FORMAT))
    # Applies to every socket created afterwards, including urllib's.
    socket.setdefaulttimeout(SOCKET_TIMEOUT_SECONDS)
    # An empty mapping disables proxies, including those from the environment.
    null_proxy_handler = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(null_proxy_handler)
    urllib2.install_opener(opener)
    print('after install opener')
    print(time.strftime(TIMESTAMP_FORMAT))


def _local_filename(img_url):
    """Return the file name to store ``img_url`` under, or None if unusable.

    The name is the last segment of the URL path (query string and fragment
    excluded). ``.jpg`` is appended when the name contains no dot. Empty
    names and names containing reserved characters yield None.
    """
    filename = urlparse.urlsplit(img_url).path.split('/')[-1]
    if not filename:
        return None
    if '.' not in filename:
        # Add a suffix to the file name if it does not have one.
        filename += '.jpg'
    if ILLEGAL_FILENAME_CHARS.search(filename):
        return None
    return filename


def get_img(link, path):
    """Download all images referenced by the page at ``link`` into ``path``.

    Images whose target file already exists are skipped, as are <img> tags
    without a ``src`` and images whose file name cannot be used. A failed
    download is reported and the loop moves on to the next image. An HTTP
    error while fetching the page itself is printed and ends the call.

    Returns None.
    """
    print('start to download ' + link)
    try:
        response = urllib2.urlopen(build_request(link))
    except urllib2.HTTPError as e:
        print(e.code)
        print(e.msg)
        print(e.headers)
        print(e.fp.read())
        return

    try:
        print('start to create soup')
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response, 'html.parser')
    finally:
        response.close()

    images = soup.find_all('img')
    print('ccccccccccc')
    if not images:
        print('no pic there')
        return

    print('ok, start to download')
    for img in images:
        src = img.get('src')
        if not src:
            # <img> without a usable src attribute: nothing to fetch.
            continue
        # src may be relative to the page; make it absolute before fetching.
        img_url = urlparse.urljoin(link, src)
        filename = _local_filename(img_url)
        if filename is None:
            print('continue')
            continue
        file_path = os.path.join(path, filename)
        if os.path.exists(file_path):
            continue
        print('downloading ' + filename)
        try:
            urllib.urlretrieve(img_url, file_path)
        except Exception:
            # Best effort: one broken image must not abort the whole run,
            # but Ctrl-C (KeyboardInterrupt) is still allowed through.
            print('T_T, Failed to download ' + filename)
            continue


weblink = "http://club.history.sina.com.cn/thread-5534627-1-1.html"
mypath = "G:\\python\\test\\"
build_urllib2(weblink)
get_img(weblink, mypath)
爬虫实例抓取并download with BeautifulSoup
最新推荐文章于 2020-03-15 00:12:30 发布
