备份一下优快云空间的收藏模块,顺便检测一下 python2 的学习成果,简单地使用 urllib2 和 re,轻拍。
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import re
import os
import time
import pdb
# Account whose favorites list is backed up.
FAVORITE_USER = 'summer__xt'
# Base address of the favorites pages; a page number is appended per request.
# NOTE: "ULR" is a misspelling of "URL"; the name is kept because
# loop_favorite() refers to it.
# NOTE(review): verify that this host is still the intended endpoint.
FAVORITE_ULR = 'http://my.youkuaiyun.com/%s/favorite' % FAVORITE_USER
# Value sent as the User-Agent request header.
FAVORITE_HEAD__USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0;)'
# Indentation unit used when building the XML output.
STR_TAB = ' '

# Crawl state: the known number of pages and the page being processed.
g_favorite_page_count = g_favorite_page_index = 1
# Parsed favorites, filled page by page.
g_favorite_items = list()
def loop_favorite():
    """Download every favorites page, collect its items and save the backup.

    Starting at page 1, requests ``FAVORITE_ULR/<index>`` while the index
    does not exceed ``g_favorite_page_count``.  After each page the count is
    refreshed from the number returned by ``parse_favorite``; once the loop
    ends ``save_favorite`` is called.

    Both ``g_favorite_page_count`` and ``g_favorite_page_index`` are updated
    at module level, because ``parse_favorite`` reads them for its page-size
    sanity check.
    """
    # Without declaring the index global, the assignments below would create
    # a function-local counter and the module-level index would stay at 1.
    global g_favorite_page_count, g_favorite_page_index

    g_favorite_page_index = 1
    while g_favorite_page_index <= g_favorite_page_count:
        url = '%s/%d' % (FAVORITE_ULR, g_favorite_page_index)
        print('%s%s' % (url, os.linesep))

        html = get_favorite(url)
        page_count = int(parse_favorite(html))
        # The "last page" link may be missing; keep the previous count then.
        if page_count != 0:
            g_favorite_page_count = page_count

        print('{index: %d, parse: %d, count: %d}' % (
            g_favorite_page_index, page_count, g_favorite_page_count))
        g_favorite_page_index += 1

    save_favorite()
    print('DONE!')
def get_favorite(url):
    """Fetch *url* and return the raw response body.

    The request carries the ``User-Agent`` header configured in
    ``FAVORITE_HEAD__USER_AGENT``.  Errors raised by ``urllib2`` are not
    caught here and propagate to the caller.
    """
    import urllib2

    request = urllib2.Request(
        url, headers={'User-Agent': FAVORITE_HEAD__USER_AGENT})
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
def parse_favorite(html):
    """Extract the favorites listed in *html* and append them to the global list.

    Each favorite is stored in ``g_favorite_items`` as a five-element list
    ``[time, url, title, tags, description]``: ``tags`` is a list of tag
    names, the other fields are the strings captured from the page markup
    (``description`` is an empty string when the entry has none).

    Returns whatever ``parse_footer(html)`` returns for the same page.
    """
    container = re.search(
        r'<div[^>]+class="fav_list">(?:.|\r|\n)+?</div>',
        html
    )
    if container is None:
        # A page without the favorites container yields no items instead of
        # failing with AttributeError on ``None.group()``.
        print('WARNING: favorites list not found in page')
        items_html = ''
    else:
        items_html = container.group()

    lst_item = re.findall(
        r'<dl>[^<]+<dt>[^<]*<span class="time">(.+)</span>[^<]*<span class="title">[^<]*<a href="([^"]+)">([^<]+)</a>[^<]*</span>[^<]*</dt>[^<]*(?:<dd>[^<]*<strong>标签:</strong>[^<]*((?:<a[^>]+href="[^"]*"[^>]*>[^<]*</a>[^<]*)+)</dd>)?[^<]*(?:[^<]*<dd>[^<]*<strong>描述:</strong>((?:.|\r|\n)+?)</dd>[^<]*)?</dl>',
        items_html
    )

    # Sanity check: a page that is neither the first one nor the one whose
    # index equals the known page count is expected to hold 20 entries.
    # Report the anomaly instead of dropping into the debugger, which would
    # block an unattended run.
    if (20 != len(lst_item)
            and 1 != g_favorite_page_index
            and g_favorite_page_count != g_favorite_page_index):
        print('WARNING: page %d has %d item(s) instead of 20' % (
            g_favorite_page_index, len(lst_item)))

    for it_item in lst_item:
        entry = list(it_item)
        # Replace the raw tag markup with the list of tag names.  The pattern
        # accepts the same anchor forms as the item pattern above, so anchors
        # with extra attributes are not lost.
        entry[3] = re.findall(
            r'<a[^>]+href="[^"]*"[^>]*>([^<]*)</a>',
            it_item[3]
        )
        g_favorite_items.append(entry)

    # NOTE(review): the labels look swapped (the first value is this page's
    # item count, the second the running total); output is kept unchanged.
    print('{globe_item: %d, current_item: %d}' % (
        len(lst_item), len(g_favorite_items)))
    return parse_footer(html)
def parse_footer(html):
    """Return the page number carried by the "last page" pager link in *html*.

    The number is returned as the string captured from the link; the integer
    ``0`` is returned when the page contains no such link.
    """
    last_page_links = re.findall(
        # Escape the user name so it is always matched literally.
        r'<a class="pageliststy" href="/' + re.escape(FAVORITE_USER)
        + r'/favorite/(\d+)">尾页</a>',
        html
    )
    # Take the first hit so that a pager rendered more than once on the page
    # is not mistaken for "no pager at all".
    return last_page_links[0] if last_page_links else 0
def save_favorite():
    """Write the collected favorites to a timestamped XML file.

    The file is created in the current working directory; its name contains
    ``FAVORITE_USER`` and the local time formatted as ``%Y%m%d_%H%M%S``.
    The full path is printed before writing.  Any exception raised while
    building or writing the content is printed and not re-raised.
    """
    lt = time.localtime()
    build_file_date = time.strftime("%Y%m%d_%H%M%S", lt)
    build_date = time.strftime("%Y-%m-%d %H:%M:%S", lt)

    # os.path.join supplies the platform's separator instead of a
    # hard-coded backslash.
    favorite_file_name = os.path.join(
        os.getcwd(),
        '[%s]优快云FavoriteBackup_%s.xml' % (FAVORITE_USER, build_file_date)
    )
    print(favorite_file_name)

    try:
        # Build the content first so a serialization failure does not leave
        # an empty file behind.
        content = favorite_tostring(build_date)
        # The context manager closes the file on every path and, unlike a
        # ``finally: fo.close()``, cannot fail when open() itself raised.
        with open(favorite_file_name, 'w') as fo:
            fo.write(content)
    except Exception as e:
        print(e)
def favorite_tostring(build_date):
    """Serialize ``g_favorite_items`` into the backup XML text.

    *build_date* is written into the ``build_date`` attribute of the root
    ``<favorites>`` element, next to the number of items.  Every item becomes
    an ``<item>`` element holding its time, URL, title, tags and, when it is
    not empty, its description.  Returns the whole document as one string.
    """
    indent_item = STR_TAB
    indent_field = STR_TAB * 2
    indent_tag = STR_TAB * 3
    eol = os.linesep

    # Collect the fragments and join once at the end.
    pieces = [
        '<favorites count="%d" build_date="%s">' % (
            len(g_favorite_items), build_date)
    ]
    for entry in g_favorite_items:
        tag_names = entry[3]
        description = entry[4]

        pieces.append(indent_item + '<item>')
        pieces.append(indent_field + '<time>%s</time>' % entry[0])
        pieces.append(
            indent_field + '<url><![CDATA[%s%s%s]]></url>' % (eol, entry[1], eol))
        pieces.append(
            indent_field + '<title><![CDATA[%s%s%s]]></title>' % (eol, entry[2], eol))

        pieces.append(indent_field + '<tags count="%d">' % len(tag_names))
        for tag_name in tag_names:
            pieces.append(indent_tag + '<tag>%s</tag>' % tag_name)
        pieces.append(indent_field + '</tags>')

        # The description element is only written when there is text for it.
        if len(description) > 0:
            pieces.append(
                indent_field
                + '<description><![CDATA[%s%s%s]]></description>' % (eol, description, eol))
        pieces.append(indent_item + '</item>')

    pieces.append("</favorites>")
    return ''.join(pieces)
# Start the backup only when the file is executed as a script.
if __name__ == '__main__':
    loop_favorite()