# 一、单线程下载 (Part 1: single-threaded download)
import requests,os,bs4
# Single-threaded xkcd downloader: start at the site's front page, save the
# comic image found on each page into ./xkcd, then follow the page's "prev"
# link, until a page URL ending in "100" is reached.
from urllib.parse import urljoin

print(os.getcwd())
url = 'http://xkcd.com'
os.makedirs('xkcd', exist_ok=True)

# Ignore a trailing '/' when testing the stop marker, so a page URL such as
# '.../100/' ends the walk exactly like '.../100' does.
while not url.rstrip('/').endswith('100'):
    print('downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    comicElem = soup.select('#comic img')
    src = comicElem[0].get('src') if comicElem else None
    if not src:
        print('could not find comic image')
    else:
        # urljoin resolves protocol-relative ('//host/path'), site-relative
        # ('/path') and absolute sources correctly. The previous
        # str.strip("http://") removed any of the characters h, t, p, ':' and
        # '/' from BOTH ends of the string, which corrupts e.g. 'https://...'.
        comicUrl = urljoin('http://xkcd.com/', src)
        print('downloading image %s..' % (comicUrl))

        # download the image
        res = requests.get(comicUrl)
        res.raise_for_status()

        # save the image to ./xkcd/<last path component of the image URL>;
        # the with-block closes the file even if a write fails.
        with open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)

    # Move on to the previous comic; stop cleanly when the page offers no
    # usable "prev" link instead of failing with an IndexError.
    prevLink = soup.select('a[rel="prev"]')
    prevHref = prevLink[0].get('href') if prevLink else None
    if not prevHref:
        print('could not find previous-page link')
        break
    url = 'http://xkcd.com' + prevHref

print('done')
# 二、多线程下载 (Part 2: multi-threaded download)
import threading,requests,os,bs4
# Show the working directory: the 'XKCD1' output folder is created relative to it.
print(os.getcwd())
# Create the output folder for the threaded download; an existing folder is fine.
os.makedirs(name='XKCD1', exist_ok=True)
def downloadXkcd(startComic, endComic, step=30):
    """Download xkcd comic images for a range of comic numbers into 'XKCD1'.

    Requests http://xkcd.com/<n> for every n in
    range(startComic, endComic, step), selects the '#comic img' element on
    the page and saves the image it points to inside the 'XKCD1' folder,
    named after the last path component of the image URL. The folder must
    already exist. Pages without a usable comic image are reported and
    skipped.

    Args:
        startComic: First comic number to request (inclusive).
        endComic: Comic number at which to stop (exclusive).
        step: Stride between requested comic numbers. Defaults to 30, the
            value that used to be hard-coded.

    Raises:
        Whatever raise_for_status() raises when a page or image request
        comes back with an error status.
    """
    # Local import so this function does not rely on the file's import line.
    from urllib.parse import urljoin

    for urlnumber in range(startComic, endComic, step):
        print('downloading page http://xkcd.com/%s...' % (urlnumber))
        res = requests.get('http://xkcd.com/%s' % (urlnumber))
        res.raise_for_status()

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        comicElem = soup.select('#comic img')
        src = comicElem[0].get('src') if comicElem else None
        if not src:
            print('could not find comic image')
            continue

        # urljoin resolves protocol-relative ('//host/path'), site-relative
        # ('/path') and absolute sources correctly. The previous
        # str.strip("http://") removed any of the characters h, t, p, ':' and
        # '/' from BOTH ends of the string, which corrupts e.g. 'https://...'.
        comicUrl = urljoin('http://xkcd.com/', src)
        print('downloading image %s..' % (comicUrl))

        # download the image
        res = requests.get(comicUrl)
        res.raise_for_status()

        # save the image; the with-block closes the file even if a write fails
        with open(os.path.join('XKCD1', os.path.basename(comicUrl)), 'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)
# Create and start the worker threads: one per starting comic number in
# range(1, 1400, 100), each handed (start, start + 99) as its arguments.
downloadThreads = []
for firstComic in range(1, 1400, 100):
    bounds = (firstComic, firstComic + 99)
    worker = threading.Thread(target=downloadXkcd, args=bounds)
    downloadThreads.append(worker)
    # Printed before start(), so the newest thread shows up as not yet started.
    print(downloadThreads)
    worker.start()

# Wait for every worker to finish before reporting completion.
for worker in downloadThreads:
    worker.join()
print('done')