## spider.py
import urllib2,httplib
import os,sys,re,time,random
#[OK]
def removeBR(site):
    """Return *site* with every newline character ('\\n') removed.

    Strings that contain no newline come back unchanged.
    """
    return site.replace('\n', '')
#[OK]
def getRequestList(fname):
    """Read the text file *fname* and return its lines as a list.

    Every line is passed through removeBR(), so the returned entries carry
    no newline characters; a blank line in the file yields an empty string.

    Raises whatever open() raises when *fname* cannot be opened.
    """
    # The context manager closes the handle on exit, including when reading
    # fails part-way, so the file descriptor is never leaked.
    with open(fname, "r") as handle:
        return [removeBR(line) for line in handle]
#[OK]
def inOldSet(url,set):
    """Return True when *url* is a member of the container *set*, else False."""
    # The `in` operator already yields a bool, so no branching is needed.
    return url in set
#[]
#[OK]
def isEndWithJavascript(site):
    """Return True when *site* contains the text "/javascript:".

    Despite the name, the marker may appear anywhere in the string, not
    only at its end.
    """
    # The marker has no regex metacharacters, so a plain substring test
    # finds exactly the same strings as an unanchored regex search.
    return "/javascript:" in site
#[OK]
def ignoreJavascript(site):
    """Return *site* with its trailing "/javascript:..." part cut off.

    When isEndWithJavascript() reports no "/javascript:" marker, *site* is
    returned unchanged. Otherwise everything from the last occurrence of
    the marker onwards is dropped.
    """
    if not isEndWithJavascript(site):
        return site
    # Splitting once from the right keeps everything before the last marker.
    return site.rsplit("/javascript:", 1)[0]
#[50%_OK]
# Extensions that mark a URL as pointing at a file rather than a directory.
_FILE_EXTENSIONS = (
    ".html", ".htm", ".xml", ".txt", ".css", ".js", ".avi", ".flv",
    ".jpg", ".gif", ".bmp", ".png", ".xhtml", ".dat", ".doc", ".xls",
    ".php", ".jsp", ".asp",
)


def isFile(url):
    """Return True when *url* ends with one of the known file extensions.

    The comparison is case-sensitive and looks at the very end of the
    string, so a URL followed by a query string or fragment is not
    recognised as a file.
    """
    # A literal suffix check requires the real "." before the extension.
    # An unescaped "." in a regex would match any character, so a URL such
    # as "http://aa.com/ajs" would wrongly count as a ".js" file.
    return url.endswith(_FILE_EXTENSIONS)
#[OK] http://aa.com/index.txt -> http://aa.com
def getBaseUrl(url):
    """Return *url* reduced to its directory part, without a trailing "/".

    - If isFile() accepts *url*, the last path segment is removed,
      e.g. http://aa.com/index.txt -> http://aa.com
    - Otherwise a single trailing "/" is removed when present.
    - Any other URL is returned unchanged.
    """
    if isFile(url):
        # Two-name unpacking is deliberate: a file URL with no "/" at all
        # raises ValueError here.
        parent, _leaf = url.rsplit("/", 1)
        return parent
    if url.endswith("/"):
        return url[:-1]
    return url
#[OK]
def addIndexForSite(site):
if isFile(site):
return site
bs = getBaseUrl(site)
return "%s/index