import os
import re
import urllib
# get page code by url
def getPageCode(url):
    """Download the resource at *url* and return its raw content.

    Args:
        url: Address of the page to fetch.

    Returns:
        The complete response body exactly as returned by ``read()``.

    Any error raised while opening or reading the URL propagates to the
    caller; the connection is closed in either case.
    """
    # ``urlopen`` lives in ``urllib.request`` on Python 3 and directly in
    # ``urllib`` on Python 2; resolve it here so both interpreters work.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    urlItem = urlopen(url)
    try:
        return urlItem.read()
    finally:
        # Release the connection even when read() fails part-way through.
        urlItem.close()
# save code
def savePage(filePath, pageCode):
    """Write *pageCode* to *filePath*, replacing any existing content.

    Args:
        filePath: Path of the file to create or overwrite.
        pageCode: Content to store, either text or a byte string.
    """
    # On Python 3 a ``bytes`` payload cannot be written to a text-mode file,
    # so binary mode is used for it. On Python 2 ``bytes`` is ``str``, which
    # keeps the original text-mode behaviour there.
    isRawBytes = isinstance(pageCode, bytes) and not isinstance(pageCode, str)
    mode = 'wb' if isRawBytes else 'w'
    # ``with`` closes the file even if write() raises.
    with open(filePath, mode) as fileWrite:
        fileWrite.write(pageCode)
# analyze domain
def analyzeCode(filePath, rsPath):
    """Copy every line of *filePath* that is a short domain name to *rsPath*.

    A line qualifies when, ignoring its line ending, it consists solely of
    one to five word characters followed by ``.com``, ``.net`` or ``.org``.
    Each qualifying name is written on its own line. Every line read is
    also echoed to standard output, matching or not.

    Args:
        filePath: Path of the text file to scan.
        rsPath: Path of the result file; it is created or overwritten.
    """
    # set the rule (compiled once, outside the loop)
    regex = re.compile(r'^[\w]{1,5}\.(com|net|org)$')
    # The input is opened first, so a missing input file raises before the
    # result file is touched; ``with`` closes both files even on error.
    with open(filePath, 'r') as fileRead, open(rsPath, 'w') as fileWrite:
        # get each line
        for eachLine in fileRead:
            # Single-argument form prints the same on Python 2 and 3.
            print(eachLine)
            # Drop the line ending before matching: ``$`` tolerates a
            # trailing "\n" but not "\r\n", so CRLF lines would never match.
            domain = eachLine.rstrip('\r\n')
            if regex.match(domain):
                fileWrite.write(domain + '\n')
def main(url="http://domain.webmasterhome.cn/tomorrowDel_domain.asp",
         fileDir="d:/"):
    """Download a page, save it, and extract the short domain names from it.

    Progress messages are printed for each of the three steps.

    Args:
        url: Page to download. Defaults to the address the script was
            written for.
        fileDir: Directory that receives ``code.txt`` (the saved page) and
            ``rs.txt`` (the lines kept by ``analyzeCode``).
    """
    # os.path.join also copes with a directory given without a trailing
    # separator; for the default "d:/" the resulting paths are unchanged.
    filePath = os.path.join(fileDir, "code.txt")
    rsPath = os.path.join(fileDir, "rs.txt")
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("The result saved in : " + rsPath + "\n----------------")
    print("1 get code...")
    pageCode = getPageCode(url)
    print("2 save code...")
    savePage(filePath, pageCode)
    print("3 analyze code...")
    analyzeCode(filePath, rsPath)
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# 今天抽空把找域名的脚本重写了一次,可自动联网抓取即将过期的短域名。
# 如需查找其它日期的短域名,修改 main() 中的 url 即可。
# 短域名的定义可通过修改 analyzeCode() 中的正则表达式自行调整。