import urllib
import string
# Progress reporter; the script passes it to urllib.urlretrieve as the hook.
def callbackfunc(blocknum, blocksize, totalsize):
    """Print the download progress.

    @blocknum: number of data blocks downloaded so far
    @blocksize: size of one data block
    @totalsize: size of the remote file; urlretrieve may report -1 when the
                server does not announce a size

    When the total size is known, the progress is printed as a percentage
    capped at 100 (the last block usually overshoots the real file size).
    When it is unknown (zero or negative), a percentage cannot be computed,
    so the amount downloaded so far is printed instead.
    """
    downloaded = blocknum * blocksize
    if totalsize <= 0:
        # Avoid ZeroDivisionError / meaningless negative percentages.
        print("%d bytes" % downloaded)
        return
    percent = min(100.0, 100.0 * downloaded / totalsize)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%.2f%%" % percent)
# Page whose article list is scraped.
url = 'http://www.freebuf.com/articles'

# Download the listing page. urllib.urlopen is the Python 2 API; the response
# is closed explicitly so the connection is released even if read() fails.
response = urllib.urlopen(url)
try:
    globalcontent = response.read()
finally:
    response.close()

# The first article link sits between these two markers in the page source.
# NOTE(review): 'calss' looks like a typo for 'class'; it may mirror the
# page's own markup, so confirm against the live HTML before changing it.
link_prefix = '<div class="news-img"><a target="_blank" href='
link_suffix = '.html"><img calss='

new_inner01_h = globalcontent.find(link_prefix)
if new_inner01_h == -1:
    # find() returns -1 on a miss; slicing with it would yield garbage.
    raise SystemExit('article link prefix not found in %s' % url)
# Skip the prefix plus the opening quote of the href value (46 + 1 = 47).
link_start = new_inner01_h + len(link_prefix) + 1

# Look for the suffix only after the link start, so an earlier occurrence of
# the suffix elsewhere in the page cannot produce an empty or wrong slice.
new_inner01_l = globalcontent.find(link_suffix, link_start)
if new_inner01_l == -1:
    raise SystemExit('article link suffix not found in %s' % url)

# Keep the '.html' extension (5 characters) as part of the extracted URL.
news_inner01 = globalcontent[link_start:new_inner01_l + 5]
print(news_inner01)

# Save the article page locally, reporting progress through callbackfunc.
local = 'd:\\crawl\\1.html'
urllib.urlretrieve(news_inner01, local, callbackfunc)
# Crawler: simply download a web page
# (page footer) Latest recommended article published on 2023-03-23 22:21:21
