其实界面真的很丑,而且使用的是递归的方式,所以要等所有文章下载完了才会显示结果。有点郁闷:更多的改进值得期待。

代码:
baidublog.py: 这个在前一篇文章的基础上,修改了下一篇文章地址的查找:
def findNextBlogHtml(user,htmlContent):
htmlBlogContent = unicode(htmlContent,'gb2312','ignore').encode('utf-8','ignore')
# parser the html content
htmlsoup = BeautifulSoup(htmlBlogContent)
nextBlogUrlZero = htmlsoup.findAll("div",{"class":"opt"})
urlRe = re.compile('/.*?.html')
urls = urlRe.findall(str(nextBlogUrlZero[0]))
if(len(urls)>=1):
blogUrl = re.findall(r"\w*.html",urls[0],re.I)
if (len(blogUrl[0]) >6 ):
htmlAddr = blogUrl[0]
else:
htmlAddr ="None"
else:
htmlAddr ="None"
print htmlAddr
接下来是图形界面:
#-*- coding: utf-8 -*-
from Tkinter import *
from baidublog import *
class GridDemo(Frame):
    """Tkinter front-end for backing up every post of a Baidu blog.

    Layout: one row of inputs (user name + first post's filename), a Text
    widget that logs each saved post, and a Backup button that walks the
    next-post chain until findNextBlogHtml returns the "None" sentinel.
    """

    def __init__(self):
        Frame.__init__(self)
        self.master.title("Baidu Blog Backup")
        self.grid(sticky=W + E + N + S)
        # Baidu user name input, pre-filled with a sample account.
        self.label1 = Label(self, text="百度用户名:", width=5)
        self.label1.grid(row=0, column=1, sticky=W + E + N + S)
        self.entry1 = Entry(self, width=20)
        self.entry1.grid(row=0, column=2)
        self.entry1.insert(INSERT, "codedeveloper")
        # Filename of the first (oldest) post to start walking from.
        self.label2 = Label(self, text="第一篇博文地址:", width=8)
        self.label2.grid(row=0, column=3, sticky=W + E + N + S)
        self.entry2 = Entry(self, width=40)
        self.entry2.grid(row=0, column=4, sticky=W + E + N + S)
        self.entry2.insert(INSERT, "977f3010ab7e17dcf7039e99.html")
        # Progress log: one line per saved post.
        self.text = Text(self)
        self.text.grid(row=1, columnspan=5, sticky=W + E + N + S)
        self.button = Button(self, text='Backup', width=30,
                             command=self.startBackupBlog)
        self.button.grid(row=2, columnspan=5)

    def startBackupBlog(self):
        """Button callback: read the two input fields and run the backup."""
        user = self.entry1.get()
        firstBlogUrl = self.entry2.get()
        self.backupAction(user, firstBlogUrl)

    def backupAction(self, user, firstBlogUrl):
        """Download and save posts starting at firstBlogUrl, following
        next-post links until the "None" sentinel is returned.

        Iterative rather than recursive (the original recursed once per
        post, so a long blog could overflow Python's recursion limit).
        """
        blogUrl = firstBlogUrl
        while blogUrl != "None":
            # Fetch title/content and the raw HTML used to find the next link.
            blogContent, blogTitle, htmlContent = getBlogContentAndTitle(user, blogUrl)
            saveToFile(user, blogContent, blogTitle, blogUrl)
            self.text.insert(INSERT, str(blogTitle) + "-" + str(blogUrl))
            # Refresh the Text widget now, so progress is visible while
            # downloading instead of only after everything finishes.
            self.text.update_idletasks()
            blogUrl = findNextBlogHtml(user, htmlContent)
        self.text.insert(INSERT, "\n恭喜全部备份完毕!")
def main():
    """Build the backup window and enter the Tk event loop."""
    app = GridDemo()
    app.mainloop()


if __name__ == "__main__":
    main()
个人觉得:还是先读取文章分类中所有文章的链接列表,然后采用多线程等并发下载方式,可以提高效率。
本文介绍了一个基于递归的Python脚本,用于下载指定百度用户名下的所有博客文章。脚本通过解析HTML内容,找到下一篇文章的链接,并持续调用自身直至所有文章下载完成。此外,提供了一个图形界面用于输入用户名和起始博客地址,便于用户操作。
1280

被折叠的 条评论
为什么被折叠?



