import os
import sys
import time
import urllib
url = ['']*350
page = 1
link = 1
while page <= 7:
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
i = 0
title = con.find(r'<a title=')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
while title != -1 and href != -1 and html != -1 and i < 50:
url[link] = con[href + 6:html + 4]
#print link, ' : ', url[i]
title = con.find(r'<a title=',html)
href = con.find(r'href=',title)
html = con.find(r'html',href)
i = i + 1
link = link + 1
else:
print 'page', page, ' find end!'
page = page + 1
else:
print 'all page finished!'
j = 0
while j < link:
if url[j].strip() != '':
filename = 'hanhan/'+url[j][-26:]
print j,' downloading ', url[j]
try:
content = urllib.urlopen(url[j]).read()
open(filename, 'w+').write(content)
except:
print 'exception:',sys.exc_info()
else:
print j,'empty url!'
#except:
# print 'exception: ', sys.exc_info()[0]
j = j + 1
time.sleep(1)
else:
print 'download article finished'