Backing up a CSDN blog with Python:
What it does: backs up all posts of the given user into a folder named after that user in the current directory, in HTML format.
Shortcomings: no multithreading yet, and hotlink protection is not handled (reposted articles usually only save the images that are not hotlink-protected anyway, but it is still a weakness). The code could also be tightened up; the filename sanitization, for instance, should be doable with a single loop/match lookup...
Thanks to 菜鱼, xiao and the other seniors in the Python group for their guidance; the encoding issues had me stuck for quite a while. Criticism is welcome.
————————
Added multithreading; now I know about join(), heh.
An exe build is available on the free (0-point) download page: http://download.youkuaiyun.com/detail/betabin/4377512
————————
The exe resource above was deleted by CSDN. That I can accept, but not even getting a notice afterwards is a bit much...
It seems to have been restored again... well...
————————
Replaced the filename filtering with re.sub, cutting out redundant code.
If you know Python, just run the script yourself.
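(Note: the script below is Python 2 code (urllib2, raw_input), so it needs a Python 2.x interpreter; it prompts for the blog user name and saves everything into a folder named after that user in the current directory.)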
# -*- coding: cp936 -*-
'''
Author: BetaBin
Date: 2012/06/16
Function: Backup the csdn blog.
'''
import urllib
import urllib2
import os
import re
import traceback
import threading
import datetime
#Global data
# Map of post links: relative URL -> post title
blogurl = {}
savednum = 1
threadnum = 5
bloghost = 'http://blog.youkuaiyun.com'
bloguser = 'BetaBin'
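# Note: the bloguser value above is only a default; it is overwritten by the
# raw_input() prompt in the main section at the bottom of the script.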
# Fetch the HTML source of infourl and decode it as UTF-8
def getinfo(infourl, hostsite = ''):
postdata = urllib.urlencode({})
headers = {
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
if hostsite != '':
print "hostsite: ", hostsite
headers['Referer'] = hostsite
req = urllib2.Request(
url = infourl,
data = postdata,
headers = headers
)
#return urllib2.urlopen(req).read().decode('utf8')
urlfile = urllib2.urlopen(req)
result = urlfile.read().decode('utf8')
urlfile.close()
return result
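# Example: getinfo(bloghost + '/' + bloguser) returns the decoded HTML of the
# blog's first list page. The optional hostsite argument sets a Referer header
# (it is not used elsewhere in this script).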
# Collect all post links on the list page at url, then recurse into the following pages
def backup(url):
    # Fetch the page content
result = getinfo(url)
    # Find the post links on this page
titlepatt = """class="link_title"><a href="(.*?)">[\\s]+(.*?)[\\s]+</a>"""
retitle = re.compile(titlepatt)
blogs = retitle.findall(result)
#print "Got: ", blogs
for blog in blogs:
blogurl[blog[0]] = blog[1]
#print blog[0] + '\t\t' + blog[1]
    # Find the link to the next page ("下一页"); since the non-greedy match can start
    # at an earlier <a href= on the same line, take the last href inside the captured
    # text when there is one.
    nextpagepatt = u"""<a href="(.*?)">下一页</a>"""
    renextpage = re.compile(nextpagepatt)
    pagelink = renextpage.search(result)
    if pagelink is not None:
        nexturl = pagelink.group(1)
        truelink = nexturl.rfind('href')
        if truelink != -1:
            nexturl = nexturl[truelink + 6:]
        backup(bloghost + nexturl)
# Print the contents of the blogurl map (debug helper; not called in the main flow)
def printbloglink():
for item in blogurl:
print item, "\t\t", blogurl[item]
# Extract the post body from the full page HTML
def getcontent(inforesult):
contentpatt = """<div id="article_content" class="article_content"[\\s\\S]*?<div class="share_buttons\""""
recontent = re.compile(contentpatt)
result = recontent.search(inforesult)
if result is not None:
        # By the pattern above, the last 26 matched characters belong to the CSDN
        # share buttons and are not wanted in the backup.
        return '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' + result.group()[:-26] + '</body></html>'
    else:
        print "Failed to extract the post body..."
return None
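# Note: the article_content / share_buttons markers (and the 26-character slice
# above) depend on the CSDN page markup at the time of writing; if CSDN changes
# the layout, this extraction has to be adjusted.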
# Download each post by its relative link
blogurllock = threading.Lock()
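# Worker function run by each download thread: it repeatedly pops one
# (relative URL, title) pair from blogurl under blogurllock and saves that post,
# returning once the dict is empty.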
def downloadblog():
while True:
        # Pop one post link (under the lock)
blogurllock.acquire()
if (len(blogurl) > 0):
item = blogurl.popitem()
url = bloghost + item[0]
title = item[1]
global savednum
blognum = savednum
savednum += 1
else:
blogurllock.release()
return
blogurllock.release()
        # Settle on a file name for the post; rename automatically on collisions
if not os.path.exists(bloguser):
os.makedirs(bloguser)
title = ReplaceBadCharOfFileName(title)
filename = title
file_no = 1
while os.path.isfile(bloguser + '/' + filename + '.html'):
filename = title + '(' + str(file_no) + ')'
file_no += 1
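        # Produces "Title.html", then "Title(1).html", "Title(2).html", ... on collisions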
        # Fetch the post page and extract the body HTML
        content = getcontent(getinfo(url))
        if content is None:
            # Extraction failed; skip this post instead of crashing on None below
            continue
        # Find image links in the post body
picturepatt = """<img src="(http:.*?)"[\\s\\S]*?alt"""
repicture = re.compile(picturepatt)
pictures = repicture.findall(content)
        # Download the images into a folder named after the post
for pictureurl in pictures:
            # Create the image folder if needed
folder = bloguser + '/' + filename + '/'
picturename = pictureurl[pictureurl.rfind('/') + 1:]
if not os.path.exists(folder):
os.makedirs(folder)
try:
path = os.path.join(os.getcwd(), bloguser, filename, picturename)
urllib.urlretrieve(pictureurl, path)
except:
                print 'Failed to save image, skipping it:', pictureurl
                traceback.print_exc()
                # Failures are usually caused by hotlink protection on reposted images,
                # so once one image fails, the rest of this post's images are skipped too.
break
            else:
                # Point the img src in the saved HTML at the local copy just downloaded
                content = content.replace(pictureurl, filename + '/' + picturename, 1)
bloghtml = open(bloguser + '/' + filename + '.html', 'wb')
bloghtml.write(content.encode('utf8'))
bloghtml.close()
        print "Post", blognum, ":", title, "saved."
# Strip characters that are not allowed in file names
def ReplaceBadCharOfFileName(filename):
filename=filename.replace(" ","")
filename=filename.replace("\\", "")
    # Use re.sub to clear out the redundant per-character replaces (the two above are kept as-is, though...)
badpatt = """[\*\?;|<>&/:]"""
rebad = re.compile(badpatt)
return rebad.sub("", filename)
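# Example: ReplaceBadCharOfFileName(u'Notes: C/C++ tips?') returns u'NotesCC++tips'
# (spaces, backslashes and the characters * ? ; | < > & / : are all stripped).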
# Check whether the blog account exists
def isvaliduser():
#forbiddenpatt = """<head><title>403 Forbidden</title></head>
#reforbidden = re.compile(forbiddenpatt)
try:
getinfo(bloghost + '/' + bloguser)
except:
return False
else:
return True
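# Note: this relies on urllib2.urlopen() raising an exception (e.g. urllib2.HTTPError)
# for a non-existent user page; any exception from getinfo() is treated as an invalid user.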
#main function
bloguser = raw_input('Please enter your CSDN blog account (e.g. the "betabin" in http://blog.youkuaiyun.com/betabin): ')
if not isvaliduser():
    print "This account is invalid"
else:
starttime = datetime.datetime.now()
backup(bloghost + "/" + bloguser)
#downloadblog()
    # Spawn the download threads
threads = []
for threadid in range(threadnum):
downloadthread = threading.Thread(None, downloadblog)
threads.append(downloadthread)
downloadthread.start()
for downloadthread in threads:
downloadthread.join()
endtime = datetime.datetime.now()
    print "Total time:", endtime - starttime
    print "Backup finished"