Simple Web Crawler Used Python

This post presents a simple Python web-crawler script. It downloads the page at a given URL, extracts every http link address it contains, and performs a basic one-level crawl: each discovered link is fetched and saved as a numbered .html file.

#!/usr/bin/python
# simple web crawler
import urllib2
import re
# download the page at url and save it under the given filename
def downURL(url, filename):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'download exception:', url
        return False
    op = open(filename, 'wb')
    # copy the response to disk in fixed-size chunks
    while True:
        s = fp.read(4096)
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return True
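
For readers on Python 3, where urllib2 was split into urllib.request, the download step translates almost line for line. A minimal sketch, assuming nothing beyond the standard library (the name down_url_py3 is illustrative, not part of the original script):

import shutil
import urllib.request

# Python 3 sketch of the same download step (illustrative name)
def down_url_py3(url, filename):
    try:
        fp = urllib.request.urlopen(url)
    except Exception:
        print('download exception:', url)
        return False
    # stream the response body straight to disk
    with open(filename, 'wb') as op:
        shutil.copyfileobj(fp, op)
    fp.close()
    return True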
# collect the http URLs that appear in a page
def getURLs(url):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'get url exception:', url
        return []
    # read the whole page, then match in one pass; matching chunk by
    # chunk as the original did keeps only the last chunk's links
    content = fp.read()
    fp.close()
    pattern = re.compile(r'http://[\w.]+')
    return pattern.findall(content)
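
The regex above only captures bare scheme-plus-host strings: it stops at the first character outside [\w.], so paths, query strings, https links, and relative hrefs are all missed. A more robust sketch parses anchor tags with the standard library's HTMLParser (Python 2 spelling; LinkParser is an illustrative name, not from the original post):

from HTMLParser import HTMLParser

class LinkParser(HTMLParser):
    # records the href attribute of every <a> tag it sees
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

Feeding the page source to an instance via p.feed(html) leaves the collected hrefs in p.links; relative links would still need to be joined against the page URL, e.g. with urlparse.urljoin.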
# crawl one level deep starting from startURL
def spider(startURL):
    urls = [startURL]
    # collect the links on the start page, skipping duplicates
    for url in getURLs(startURL):
        print url
        if url not in urls:
            urls.append(url)
    # save every collected page as 1.html, 2.html, ...
    i = 0
    while urls:
        url = urls.pop(0)
        i = i + 1
        downURL(url, str(i) + '.html')
    return True
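
The one-level spider generalizes to a bounded breadth-first crawl by tracking visited URLs and a depth counter. A hedged sketch reusing downURL and getURLs (spider_depth, max_depth, and visited are new names, not part of the original script):

# breadth-first crawl to a fixed depth (sketch, not in the original)
def spider_depth(startURL, max_depth=2):
    visited = set()
    queue = [(startURL, 0)]          # (url, depth) pairs
    i = 0
    while queue:
        url, depth = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        i = i + 1
        downURL(url, str(i) + '.html')
        if depth < max_depth:
            for link in getURLs(url):
                if link not in visited:
                    queue.append((link, depth + 1))
    return True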
# test
spider('http://www.baidu.com')
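
One practical caveat: many sites reject requests carrying urllib2's default User-Agent, so both functions may land in their exception branches. A minimal sketch of sending a custom header (the header string is illustrative):

req = urllib2.Request('http://www.baidu.com',
                      headers={'User-Agent': 'Mozilla/5.0 (simple-crawler)'})
fp = urllib2.urlopen(req)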