本来是编译原理的一个课程设计,没有限定语言,刚开始想着用C/C++吧,不过在网上看了别人的代码后就心生退却了。当时想着,这个课程设计还挺好玩的,想着把它“做大做好”,不过一星期下来,进展慢慢吞吞的。现在把代码贴出来,开帖激励自己。
不要忘记当初的决定,坚持下去。学好Python,最后做出一个搜索引擎!
爬取策略基本上是BFS(广度优先搜索),除此之外没有用到其他算法。
import re
import sys
import urllib2
import urlparse
def downURL(url, filename):
    """Download the resource at ``url`` and save it to ``filename``.

    The whole response body is read into memory first, so a failed
    download never creates or truncates the destination file.

    Args:
        url: Address to fetch; passed straight to ``urllib2.urlopen``.
        filename: Path of the file to (over)write, opened in binary mode.

    Returns:
        Always ``1``.

    Raises:
        Whatever ``urllib2.urlopen`` or ``open`` raise; nothing is caught.
    """
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
    # The context manager closes the file even when write() raises.
    with open(filename, "wb") as target:
        target.write(payload)
    return 1
# Captures the quoted href value of an anchor tag.  The attributes that may
# precede href are optional and confined to the same tag ([^>]), so both
# <a href="..."> and <a class="x" href="..."> are recognised.
_HREF_PATTERN = re.compile(
    r"<a\s+(?:[^>]*?\s+)?href\s*=\s*['\"](.*?)['\"]",
    re.IGNORECASE)


def getURL(url):
    """Fetch ``url`` and return the links found in its anchor tags.

    Every quoted ``href`` value of an ``<a>`` tag is collected, except
    values containing ``"javascript"`` or ``"@"`` (script pseudo-links and
    mail addresses).  Each remaining value is resolved against ``url`` with
    ``urlparse.urljoin``, so relative links come back absolute.

    Args:
        url: Address of the page to scan; also the base for relative links.

    Returns:
        A list of URLs in document order; duplicates are kept.

    Raises:
        Whatever ``urllib2.urlopen`` raises; nothing is caught.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
    # read() yields a byte string; where that type is not ``str`` it has to
    # be decoded before a text pattern can be applied to it.
    if not isinstance(content, str):
        content = content.decode("utf-8", "replace")

    links = []
    for href in _HREF_PATTERN.findall(content):
        if "javascript" in href or "@" in href:
            continue
        links.append(urlparse.urljoin(url, href))
    return links
# An address must start with a letter or digit; the rest of the local part
# may also contain "_", "." and "-".  The domain is one or more dot-separated
# alphanumeric labels followed by an alphabetic top-level label.
_MAIL_PATTERN = re.compile(
    r"[A-Za-z0-9][\w.-]*@(?:[A-Za-z0-9]+\.)+[A-Za-z]+")


def getMail(url, filemail):
    """Fetch ``url`` and record every e-mail address found in the page.

    Each address is written to ``filemail`` on its own line and echoed to
    standard output.

    Args:
        url: Address of the page to scan.
        filemail: Writable, already opened file-like object; this function
            only calls ``write`` on it and never closes it.

    Raises:
        Whatever ``urllib2.urlopen`` or ``filemail.write`` raise; nothing
        is caught.
    """
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
    # read() yields a byte string; where that type is not ``str`` it has to
    # be decoded before a text pattern can be applied to it.
    if not isinstance(page, str):
        page = page.decode("utf-8", "replace")

    for address in _MAIL_PATTERN.findall(page):
        # One address per line so consecutive entries do not run together.
        filemail.write(address + "\n")
        print(address)
def spider(time, count, starturl):
    """Crawl breadth-first from ``starturl``, then harvest e-mail addresses.

    Phase 1 repeatedly takes the oldest queued URL, fetches it with
    ``getURL`` and queues the links it has not queued before.  It stops
    when the queue is empty or after ``time + 1`` pages have been fetched;
    links from a page stop being added once the queue holds more than
    ``time`` entries.

    Phase 2 takes up to ``count`` URLs still waiting in the queue and passes
    each one to ``getMail``, appending the results to ``mail.txt`` in the
    current working directory.

    Args:
        time: Bound on the number of phase-1 fetches and on the queue size.
        count: Maximum number of queued pages to scan for addresses.
        starturl: URL the crawl starts from.

    Returns:
        Always ``1``.

    Raises:
        Whatever ``getURL``, ``getMail`` or ``open`` raise; a page that
        fails to load aborts the crawl.
    """
    # Local import so this function does not depend on the file's import
    # block; deque gives O(1) removal from the front of the queue.
    from collections import deque

    fetched = 0
    queue = deque([starturl])
    # Every URL ever queued, so a page linked from many places is fetched
    # only once.
    seen = set([starturl])

    # Phase 1: breadth-first link discovery.
    while queue and fetched <= time:
        page = queue.popleft()
        fetched += 1
        for link in getURL(page):
            if link in seen:
                continue
            seen.add(link)
            queue.append(link)
            if len(queue) > time:
                break

    # Phase 2: scan the pages still queued.  The context manager closes
    # mail.txt even if a fetch raises part-way through.
    with open('mail.txt', "a") as filemail:
        while count and queue:
            count -= 1
            getMail(queue.popleft(), filemail)
    return 1
if __name__ == '__main__':
    # Script entry point: crawl from the seed site with a bound of 500 for
    # link discovery, scan up to 100 queued pages for e-mail addresses,
    # then report completion.
    spider(time=500, count=100, starturl='http://www.scu.edu.cn')
    print('over')
一些参考链接:http://blog.youkuaiyun.com/leoowen19/article/details/6260363
http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html